Restore Encoding.Default. (#10567)

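Following the Input Stream fix, make `Encoding.default` return `Encoding.Default` again (instead of the temporary `Encoding.utf_8` workaround) and re-enable the tests that were marked pending while it was turned off.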

This change has no significant performance impact when reading the client test data.
This commit is contained in:
James Dunkerley 2024-07-16 17:49:46 +01:00 committed by GitHub
parent 06274a2c5b
commit 2442ebc52e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 9 additions and 9 deletions

View File

@ -71,8 +71,7 @@ type Encoding
default -> Encoding =
# This factory method is used to publicly expose the `Default` constructor.
# The constructor itself has to be private, because we want to make `Value` constructor private, but all constructors must have the same privacy.
# ToDo: This is a workaround for performance issue.
Encoding.utf_8
Encoding.Default
## PRIVATE
A default encoding that will try to guess the encoding based on some heuristics.

View File

@ -117,7 +117,7 @@ add_specs suite_builder =
'{"type":"Date_Time","constructor":"new","year":2023,"month":9,"day":29,"hour":11,"second":52}'.should_parse_as (JS_Object.from_pairs [["type", "Date_Time"], ["constructor", "new"], ["year", 2023], ["month", 9], ["day", 29], ["hour", 11], ["second", 52]])
'{"type":"Date_Time","constructor":"new","year":2023,"month":9,"day":29,"hour":11,"minute":52,"nanosecond":572104300}'.should_parse_as (JS_Object.from_pairs [["type", "Date_Time"], ["constructor", "new"], ["year", 2023], ["month", 9], ["day", 29], ["hour", 11], ["minute", 52], ["nanosecond", 572104300]])
group_builder.specify "should be able to read a JSON file with a BOM indicating UTF-16 encoding" pending="Encoding.default turned off temporarily" <|
group_builder.specify "should be able to read a JSON file with a BOM indicating UTF-16 encoding" <|
utf_16_le_bom = [-1, -2]
bytes = utf_16_le_bom + ("{}".bytes Encoding.utf_16_le)
f = File.create_temporary_file "json-with-bom" ".json"

View File

@ -68,7 +68,7 @@ add_specs suite_builder =
default_warning.should_equal invalid_ascii_out
Problems.get_attached_warnings default_warning . should_contain_the_same_elements_as problems
suite_builder.group "Default Encoding" pending="Encoding.default turned off temporarily" group_builder->
suite_builder.group "Default Encoding" group_builder->
group_builder.specify "should try reading as UTF-8 by default" <|
bytes = [65, -60, -123, -60, -103]
# A ą ę

View File

@ -475,7 +475,7 @@ add_specs suite_builder =
Delimited_Format.Delimited ',' . with_line_endings Line_Ending_Style.Unix . should_equal (Delimited_Format.Delimited ',' line_endings=Line_Ending_Style.Unix)
utf_16_le_bom = [-1, -2]
group_builder.specify "(in default mode) should detect UTF-16 encoding if BOM is present" pending="Encoding.default turned off temporarily" <|
group_builder.specify "(in default mode) should detect UTF-16 encoding if BOM is present" <|
bytes = utf_16_le_bom + ('a,b\n1,2'.bytes Encoding.utf_16_le)
f = File.create_temporary_file "delimited-utf-16-bom" ".csv"
bytes.write_bytes f . should_succeed
@ -485,7 +485,7 @@ add_specs suite_builder =
# No hidden BOM in the column name
table.column_names.first.utf_8 . should_equal [97]
group_builder.specify "(in default mode) should skip UTF-8 BOM if it was present" pending="Encoding.default turned off temporarily" <|
group_builder.specify "(in default mode) should skip UTF-8 BOM if it was present" <|
utf_8_bom = [-17, -69, -65]
bytes = utf_8_bom + ('a,b\n1,2'.bytes Encoding.utf_8)
f = File.create_temporary_file "delimited-utf-8-bom" ".csv"
@ -506,9 +506,10 @@ add_specs suite_builder =
# The first column name now contains this invalid character, because it wasn't a BOM
r.column_names.first . should_equal "￾a"
group_builder.specify "if UTF-16 encoding was selected but an inverted BOM is detected, a warning is issued (pt 2)" pending="Encoding.default turned off temporarily" <|
group_builder.specify "if UTF-16 encoding was selected but an inverted BOM is detected, a warning is issued (pt 2)" <|
bytes = utf_16_le_bom + ('a,b\n1,2'.bytes Encoding.utf_16_be)
f = File.create_temporary_file "delimited-utf-16-inverted-bom" ".csv"
bytes.write_bytes f . should_succeed
# If we read without specifying the encoding, we will infer UTF-16 LE encoding because of the BOM and get garbage:
r2 = f.read
@ -527,7 +528,7 @@ add_specs suite_builder =
r.first_column.to_vector . should_equal ['\uFFFD']
Problems.expect_only_warning Encoding_Error r
group_builder.specify "should fall back to Windows-1252 encoding if invalid UTF-8 characters are encountered in Default encoding" pending="Encoding.default turned off temporarily" <|
group_builder.specify "should fall back to Windows-1252 encoding if invalid UTF-8 characters are encountered in Default encoding" <|
f = File.create_temporary_file "delimited-invalid-utf-8" ".csv"
# On the simple characters all three encodings (ASCII, UTF-8 and Win-1252) agree, so we can use ASCII bytes.
bytes = ('A,B\n1,y'.bytes Encoding.ascii) + [-1] + ('z\n2,-'.bytes Encoding.ascii)

View File

@ -569,7 +569,7 @@ add_specs suite_builder =
## If the Delimited config has Encoding.default, the encoding for read will be determined by BOM and Win-1252 fallback heuristics.
The same encoding should be used for writing, to ensure that when the resulting file is read, all content is correctly decoded.
group_builder.specify "should use the same effective encoding for writing as the one that would be used for reading" pending="Encoding.default turned off temporarily" <|
group_builder.specify "should use the same effective encoding for writing as the one that would be used for reading" <|
f = File.create_temporary_file "append-detect" ".csv"
Test.with_clue "UTF-16 detected by BOM: " <|
bom = [-1, -2]