mirror of
https://github.com/enso-org/enso.git
synced 2025-01-09 03:57:54 +03:00
Restore Encoding.Default. (#10567)
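Following the fix to Input Stream, restore the encoding behaviour that had been temporarily disabled: `Encoding.default` returns `Encoding.Default` again instead of the `Encoding.utf_8` performance workaround, and the tests that were marked pending with "Encoding.default turned off temporarily" are re-enabled. There is no significant performance impact on reading the client test data.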
This commit is contained in:
parent
06274a2c5b
commit
2442ebc52e
@ -71,8 +71,7 @@ type Encoding
|
||||
default -> Encoding =
|
||||
# This factory method is used to publicly expose the `Default` constructor.
|
||||
# The constructor itself has to be private, because we want to make `Value` constructor private, but all constructors must have the same privacy.
|
||||
# ToDo: This is a workaround for performance issue.
|
||||
Encoding.utf_8
|
||||
Encoding.Default
|
||||
|
||||
## PRIVATE
|
||||
A default encoding that will try to guess the encoding based on some heuristics.
|
||||
|
@ -117,7 +117,7 @@ add_specs suite_builder =
|
||||
'{"type":"Date_Time","constructor":"new","year":2023,"month":9,"day":29,"hour":11,"second":52}'.should_parse_as (JS_Object.from_pairs [["type", "Date_Time"], ["constructor", "new"], ["year", 2023], ["month", 9], ["day", 29], ["hour", 11], ["second", 52]])
|
||||
'{"type":"Date_Time","constructor":"new","year":2023,"month":9,"day":29,"hour":11,"minute":52,"nanosecond":572104300}'.should_parse_as (JS_Object.from_pairs [["type", "Date_Time"], ["constructor", "new"], ["year", 2023], ["month", 9], ["day", 29], ["hour", 11], ["minute", 52], ["nanosecond", 572104300]])
|
||||
|
||||
group_builder.specify "should be able to read a JSON file with a BOM indicating UTF-16 encoding" pending="Encoding.default turned off temporarily" <|
|
||||
group_builder.specify "should be able to read a JSON file with a BOM indicating UTF-16 encoding" <|
|
||||
utf_16_le_bom = [-1, -2]
|
||||
bytes = utf_16_le_bom + ("{}".bytes Encoding.utf_16_le)
|
||||
f = File.create_temporary_file "json-with-bom" ".json"
|
||||
|
@ -68,7 +68,7 @@ add_specs suite_builder =
|
||||
default_warning.should_equal invalid_ascii_out
|
||||
Problems.get_attached_warnings default_warning . should_contain_the_same_elements_as problems
|
||||
|
||||
suite_builder.group "Default Encoding" pending="Encoding.default turned off temporarily" group_builder->
|
||||
suite_builder.group "Default Encoding" group_builder->
|
||||
group_builder.specify "should try reading as UTF-8 by default" <|
|
||||
bytes = [65, -60, -123, -60, -103]
|
||||
# A ą ę
|
||||
|
@ -475,7 +475,7 @@ add_specs suite_builder =
|
||||
Delimited_Format.Delimited ',' . with_line_endings Line_Ending_Style.Unix . should_equal (Delimited_Format.Delimited ',' line_endings=Line_Ending_Style.Unix)
|
||||
|
||||
utf_16_le_bom = [-1, -2]
|
||||
group_builder.specify "(in default mode) should detect UTF-16 encoding if BOM is present" pending="Encoding.default turned off temporarily" <|
|
||||
group_builder.specify "(in default mode) should detect UTF-16 encoding if BOM is present" <|
|
||||
bytes = utf_16_le_bom + ('a,b\n1,2'.bytes Encoding.utf_16_le)
|
||||
f = File.create_temporary_file "delimited-utf-16-bom" ".csv"
|
||||
bytes.write_bytes f . should_succeed
|
||||
@ -485,7 +485,7 @@ add_specs suite_builder =
|
||||
# No hidden BOM in the column name
|
||||
table.column_names.first.utf_8 . should_equal [97]
|
||||
|
||||
group_builder.specify "(in default mode) should skip UTF-8 BOM if it was present" pending="Encoding.default turned off temporarily" <|
|
||||
group_builder.specify "(in default mode) should skip UTF-8 BOM if it was present" <|
|
||||
utf_8_bom = [-17, -69, -65]
|
||||
bytes = utf_8_bom + ('a,b\n1,2'.bytes Encoding.utf_8)
|
||||
f = File.create_temporary_file "delimited-utf-8-bom" ".csv"
|
||||
@ -506,9 +506,10 @@ add_specs suite_builder =
|
||||
# The first column name now contains this invalid character, because it wasn't a BOM
|
||||
r.column_names.first . should_equal "a"
|
||||
|
||||
group_builder.specify "if UTF-16 encoding was selected but an inverted BOM is detected, a warning is issued (pt 2)" pending="Encoding.default turned off temporarily" <|
|
||||
group_builder.specify "if UTF-16 encoding was selected but an inverted BOM is detected, a warning is issued (pt 2)" <|
|
||||
bytes = utf_16_le_bom + ('a,b\n1,2'.bytes Encoding.utf_16_be)
|
||||
f = File.create_temporary_file "delimited-utf-16-inverted-bom" ".csv"
|
||||
bytes.write_bytes f . should_succeed
|
||||
|
||||
# If we read without specifying the encoding, we will infer UTF-16 LE encoding because of the BOM and get garbage:
|
||||
r2 = f.read
|
||||
@ -527,7 +528,7 @@ add_specs suite_builder =
|
||||
r.first_column.to_vector . should_equal ['\uFFFD']
|
||||
Problems.expect_only_warning Encoding_Error r
|
||||
|
||||
group_builder.specify "should fall back to Windows-1252 encoding if invalid UTF-8 characters are encountered in Default encoding" pending="Encoding.default turned off temporarily" <|
|
||||
group_builder.specify "should fall back to Windows-1252 encoding if invalid UTF-8 characters are encountered in Default encoding" <|
|
||||
f = File.create_temporary_file "delimited-invalid-utf-8" ".csv"
|
||||
# On the simple characters all three encodings (ASCII, UTF-8 and Win-1252) agree, so we can use ASCII bytes.
|
||||
bytes = ('A,B\n1,y'.bytes Encoding.ascii) + [-1] + ('z\n2,-'.bytes Encoding.ascii)
|
||||
|
@ -569,7 +569,7 @@ add_specs suite_builder =
|
||||
|
||||
## If the Delimited config has Encoding.default, the encoding for read will be determined by BOM and Win-1252 fallback heuristics.
|
||||
The same encoding should be used for writing, to ensure that when the resulting file is read, all content is correctly decoded.
|
||||
group_builder.specify "should use the same effective encoding for writing as the one that would be used for reading" pending="Encoding.default turned off temporarily" <|
|
||||
group_builder.specify "should use the same effective encoding for writing as the one that would be used for reading" <|
|
||||
f = File.create_temporary_file "append-detect" ".csv"
|
||||
Test.with_clue "UTF-16 detected by BOM: " <|
|
||||
bom = [-1, -2]
|
||||
|
Loading…
Reference in New Issue
Block a user