mirror of
https://github.com/enso-org/enso.git
synced 2024-12-23 07:51:56 +03:00
Restore Encoding.Default. (#10567)
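Following the Input Stream fix, restore `Encoding.Default`: `Encoding.default` returns `Encoding.Default` again instead of the temporary `Encoding.utf_8` performance workaround, and the tests that were marked pending while it was turned off are re-enabled. No significant performance impact was observed when reading the client test data.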
This commit is contained in:
parent
06274a2c5b
commit
2442ebc52e
@ -71,8 +71,7 @@ type Encoding
|
|||||||
default -> Encoding =
|
default -> Encoding =
|
||||||
# This factory method is used to publicly expose the `Default` constructor.
|
# This factory method is used to publicly expose the `Default` constructor.
|
||||||
# The constructor itself has to be private, because we want to make `Value` constructor private, but all constructors must have the same privacy.
|
# The constructor itself has to be private, because we want to make `Value` constructor private, but all constructors must have the same privacy.
|
||||||
# ToDo: This is a workaround for performance issue.
|
Encoding.Default
|
||||||
Encoding.utf_8
|
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
A default encoding that will try to guess the encoding based on some heuristics.
|
A default encoding that will try to guess the encoding based on some heuristics.
|
||||||
|
@ -117,7 +117,7 @@ add_specs suite_builder =
|
|||||||
'{"type":"Date_Time","constructor":"new","year":2023,"month":9,"day":29,"hour":11,"second":52}'.should_parse_as (JS_Object.from_pairs [["type", "Date_Time"], ["constructor", "new"], ["year", 2023], ["month", 9], ["day", 29], ["hour", 11], ["second", 52]])
|
'{"type":"Date_Time","constructor":"new","year":2023,"month":9,"day":29,"hour":11,"second":52}'.should_parse_as (JS_Object.from_pairs [["type", "Date_Time"], ["constructor", "new"], ["year", 2023], ["month", 9], ["day", 29], ["hour", 11], ["second", 52]])
|
||||||
'{"type":"Date_Time","constructor":"new","year":2023,"month":9,"day":29,"hour":11,"minute":52,"nanosecond":572104300}'.should_parse_as (JS_Object.from_pairs [["type", "Date_Time"], ["constructor", "new"], ["year", 2023], ["month", 9], ["day", 29], ["hour", 11], ["minute", 52], ["nanosecond", 572104300]])
|
'{"type":"Date_Time","constructor":"new","year":2023,"month":9,"day":29,"hour":11,"minute":52,"nanosecond":572104300}'.should_parse_as (JS_Object.from_pairs [["type", "Date_Time"], ["constructor", "new"], ["year", 2023], ["month", 9], ["day", 29], ["hour", 11], ["minute", 52], ["nanosecond", 572104300]])
|
||||||
|
|
||||||
group_builder.specify "should be able to read a JSON file with a BOM indicating UTF-16 encoding" pending="Encoding.default turned off temporarily" <|
|
group_builder.specify "should be able to read a JSON file with a BOM indicating UTF-16 encoding" <|
|
||||||
utf_16_le_bom = [-1, -2]
|
utf_16_le_bom = [-1, -2]
|
||||||
bytes = utf_16_le_bom + ("{}".bytes Encoding.utf_16_le)
|
bytes = utf_16_le_bom + ("{}".bytes Encoding.utf_16_le)
|
||||||
f = File.create_temporary_file "json-with-bom" ".json"
|
f = File.create_temporary_file "json-with-bom" ".json"
|
||||||
|
@ -68,7 +68,7 @@ add_specs suite_builder =
|
|||||||
default_warning.should_equal invalid_ascii_out
|
default_warning.should_equal invalid_ascii_out
|
||||||
Problems.get_attached_warnings default_warning . should_contain_the_same_elements_as problems
|
Problems.get_attached_warnings default_warning . should_contain_the_same_elements_as problems
|
||||||
|
|
||||||
suite_builder.group "Default Encoding" pending="Encoding.default turned off temporarily" group_builder->
|
suite_builder.group "Default Encoding" group_builder->
|
||||||
group_builder.specify "should try reading as UTF-8 by default" <|
|
group_builder.specify "should try reading as UTF-8 by default" <|
|
||||||
bytes = [65, -60, -123, -60, -103]
|
bytes = [65, -60, -123, -60, -103]
|
||||||
# A ą ę
|
# A ą ę
|
||||||
|
@ -475,7 +475,7 @@ add_specs suite_builder =
|
|||||||
Delimited_Format.Delimited ',' . with_line_endings Line_Ending_Style.Unix . should_equal (Delimited_Format.Delimited ',' line_endings=Line_Ending_Style.Unix)
|
Delimited_Format.Delimited ',' . with_line_endings Line_Ending_Style.Unix . should_equal (Delimited_Format.Delimited ',' line_endings=Line_Ending_Style.Unix)
|
||||||
|
|
||||||
utf_16_le_bom = [-1, -2]
|
utf_16_le_bom = [-1, -2]
|
||||||
group_builder.specify "(in default mode) should detect UTF-16 encoding if BOM is present" pending="Encoding.default turned off temporarily" <|
|
group_builder.specify "(in default mode) should detect UTF-16 encoding if BOM is present" <|
|
||||||
bytes = utf_16_le_bom + ('a,b\n1,2'.bytes Encoding.utf_16_le)
|
bytes = utf_16_le_bom + ('a,b\n1,2'.bytes Encoding.utf_16_le)
|
||||||
f = File.create_temporary_file "delimited-utf-16-bom" ".csv"
|
f = File.create_temporary_file "delimited-utf-16-bom" ".csv"
|
||||||
bytes.write_bytes f . should_succeed
|
bytes.write_bytes f . should_succeed
|
||||||
@ -485,7 +485,7 @@ add_specs suite_builder =
|
|||||||
# No hidden BOM in the column name
|
# No hidden BOM in the column name
|
||||||
table.column_names.first.utf_8 . should_equal [97]
|
table.column_names.first.utf_8 . should_equal [97]
|
||||||
|
|
||||||
group_builder.specify "(in default mode) should skip UTF-8 BOM if it was present" pending="Encoding.default turned off temporarily" <|
|
group_builder.specify "(in default mode) should skip UTF-8 BOM if it was present" <|
|
||||||
utf_8_bom = [-17, -69, -65]
|
utf_8_bom = [-17, -69, -65]
|
||||||
bytes = utf_8_bom + ('a,b\n1,2'.bytes Encoding.utf_8)
|
bytes = utf_8_bom + ('a,b\n1,2'.bytes Encoding.utf_8)
|
||||||
f = File.create_temporary_file "delimited-utf-8-bom" ".csv"
|
f = File.create_temporary_file "delimited-utf-8-bom" ".csv"
|
||||||
@ -506,9 +506,10 @@ add_specs suite_builder =
|
|||||||
# The first column name now contains this invalid character, because it wasn't a BOM
|
# The first column name now contains this invalid character, because it wasn't a BOM
|
||||||
r.column_names.first . should_equal "a"
|
r.column_names.first . should_equal "a"
|
||||||
|
|
||||||
group_builder.specify "if UTF-16 encoding was selected but an inverted BOM is detected, a warning is issued (pt 2)" pending="Encoding.default turned off temporarily" <|
|
group_builder.specify "if UTF-16 encoding was selected but an inverted BOM is detected, a warning is issued (pt 2)" <|
|
||||||
bytes = utf_16_le_bom + ('a,b\n1,2'.bytes Encoding.utf_16_be)
|
bytes = utf_16_le_bom + ('a,b\n1,2'.bytes Encoding.utf_16_be)
|
||||||
f = File.create_temporary_file "delimited-utf-16-inverted-bom" ".csv"
|
f = File.create_temporary_file "delimited-utf-16-inverted-bom" ".csv"
|
||||||
|
bytes.write_bytes f . should_succeed
|
||||||
|
|
||||||
# If we read without specifying the encoding, we will infer UTF-16 LE encoding because of the BOM and get garbage:
|
# If we read without specifying the encoding, we will infer UTF-16 LE encoding because of the BOM and get garbage:
|
||||||
r2 = f.read
|
r2 = f.read
|
||||||
@ -527,7 +528,7 @@ add_specs suite_builder =
|
|||||||
r.first_column.to_vector . should_equal ['\uFFFD']
|
r.first_column.to_vector . should_equal ['\uFFFD']
|
||||||
Problems.expect_only_warning Encoding_Error r
|
Problems.expect_only_warning Encoding_Error r
|
||||||
|
|
||||||
group_builder.specify "should fall back to Windows-1252 encoding if invalid UTF-8 characters are encountered in Default encoding" pending="Encoding.default turned off temporarily" <|
|
group_builder.specify "should fall back to Windows-1252 encoding if invalid UTF-8 characters are encountered in Default encoding" <|
|
||||||
f = File.create_temporary_file "delimited-invalid-utf-8" ".csv"
|
f = File.create_temporary_file "delimited-invalid-utf-8" ".csv"
|
||||||
# On the simple characters all three encodings (ASCII, UTF-8 and Win-1252) agree, so we can use ASCII bytes.
|
# On the simple characters all three encodings (ASCII, UTF-8 and Win-1252) agree, so we can use ASCII bytes.
|
||||||
bytes = ('A,B\n1,y'.bytes Encoding.ascii) + [-1] + ('z\n2,-'.bytes Encoding.ascii)
|
bytes = ('A,B\n1,y'.bytes Encoding.ascii) + [-1] + ('z\n2,-'.bytes Encoding.ascii)
|
||||||
|
@ -569,7 +569,7 @@ add_specs suite_builder =
|
|||||||
|
|
||||||
## If the Delimited config has Encoding.default, the encoding for read will be determined by BOM and Win-1252 fallback heuristics.
|
## If the Delimited config has Encoding.default, the encoding for read will be determined by BOM and Win-1252 fallback heuristics.
|
||||||
The same encoding should be used for writing, to ensure that when the resulting file is read, all content is correctly decoded.
|
The same encoding should be used for writing, to ensure that when the resulting file is read, all content is correctly decoded.
|
||||||
group_builder.specify "should use the same effective encoding for writing as the one that would be used for reading" pending="Encoding.default turned off temporarily" <|
|
group_builder.specify "should use the same effective encoding for writing as the one that would be used for reading" <|
|
||||||
f = File.create_temporary_file "append-detect" ".csv"
|
f = File.create_temporary_file "append-detect" ".csv"
|
||||||
Test.with_clue "UTF-16 detected by BOM: " <|
|
Test.with_clue "UTF-16 detected by BOM: " <|
|
||||||
bom = [-1, -2]
|
bom = [-1, -2]
|
||||||
|
Loading…
Reference in New Issue
Block a user