Restore Encoding.Default. (#10567)

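Following the input stream fix, restore the encoding parts: `Encoding.default` returns `Encoding.Default` again instead of the temporary `Encoding.utf_8` workaround, and the tests that were marked pending while it was turned off are re-enabled.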

No significant performance impact on reading the client test data.
This commit is contained in:
James Dunkerley 2024-07-16 17:49:46 +01:00 committed by GitHub
parent 06274a2c5b
commit 2442ebc52e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 9 additions and 9 deletions

View File

@ -71,8 +71,7 @@ type Encoding
default -> Encoding = default -> Encoding =
# This factory method is used to publicly expose the `Default` constructor. # This factory method is used to publicly expose the `Default` constructor.
# The constructor itself has to be private, because we want to make `Value` constructor private, but all constructors must have the same privacy. # The constructor itself has to be private, because we want to make `Value` constructor private, but all constructors must have the same privacy.
# ToDo: This is a workaround for performance issue. Encoding.Default
Encoding.utf_8
## PRIVATE ## PRIVATE
A default encoding that will try to guess the encoding based on some heuristics. A default encoding that will try to guess the encoding based on some heuristics.

View File

@ -117,7 +117,7 @@ add_specs suite_builder =
'{"type":"Date_Time","constructor":"new","year":2023,"month":9,"day":29,"hour":11,"second":52}'.should_parse_as (JS_Object.from_pairs [["type", "Date_Time"], ["constructor", "new"], ["year", 2023], ["month", 9], ["day", 29], ["hour", 11], ["second", 52]]) '{"type":"Date_Time","constructor":"new","year":2023,"month":9,"day":29,"hour":11,"second":52}'.should_parse_as (JS_Object.from_pairs [["type", "Date_Time"], ["constructor", "new"], ["year", 2023], ["month", 9], ["day", 29], ["hour", 11], ["second", 52]])
'{"type":"Date_Time","constructor":"new","year":2023,"month":9,"day":29,"hour":11,"minute":52,"nanosecond":572104300}'.should_parse_as (JS_Object.from_pairs [["type", "Date_Time"], ["constructor", "new"], ["year", 2023], ["month", 9], ["day", 29], ["hour", 11], ["minute", 52], ["nanosecond", 572104300]]) '{"type":"Date_Time","constructor":"new","year":2023,"month":9,"day":29,"hour":11,"minute":52,"nanosecond":572104300}'.should_parse_as (JS_Object.from_pairs [["type", "Date_Time"], ["constructor", "new"], ["year", 2023], ["month", 9], ["day", 29], ["hour", 11], ["minute", 52], ["nanosecond", 572104300]])
group_builder.specify "should be able to read a JSON file with a BOM indicating UTF-16 encoding" pending="Encoding.default turned off temporarily" <| group_builder.specify "should be able to read a JSON file with a BOM indicating UTF-16 encoding" <|
utf_16_le_bom = [-1, -2] utf_16_le_bom = [-1, -2]
bytes = utf_16_le_bom + ("{}".bytes Encoding.utf_16_le) bytes = utf_16_le_bom + ("{}".bytes Encoding.utf_16_le)
f = File.create_temporary_file "json-with-bom" ".json" f = File.create_temporary_file "json-with-bom" ".json"

View File

@ -68,7 +68,7 @@ add_specs suite_builder =
default_warning.should_equal invalid_ascii_out default_warning.should_equal invalid_ascii_out
Problems.get_attached_warnings default_warning . should_contain_the_same_elements_as problems Problems.get_attached_warnings default_warning . should_contain_the_same_elements_as problems
suite_builder.group "Default Encoding" pending="Encoding.default turned off temporarily" group_builder-> suite_builder.group "Default Encoding" group_builder->
group_builder.specify "should try reading as UTF-8 by default" <| group_builder.specify "should try reading as UTF-8 by default" <|
bytes = [65, -60, -123, -60, -103] bytes = [65, -60, -123, -60, -103]
# A ą ę # A ą ę

View File

@ -475,7 +475,7 @@ add_specs suite_builder =
Delimited_Format.Delimited ',' . with_line_endings Line_Ending_Style.Unix . should_equal (Delimited_Format.Delimited ',' line_endings=Line_Ending_Style.Unix) Delimited_Format.Delimited ',' . with_line_endings Line_Ending_Style.Unix . should_equal (Delimited_Format.Delimited ',' line_endings=Line_Ending_Style.Unix)
utf_16_le_bom = [-1, -2] utf_16_le_bom = [-1, -2]
group_builder.specify "(in default mode) should detect UTF-16 encoding if BOM is present" pending="Encoding.default turned off temporarily" <| group_builder.specify "(in default mode) should detect UTF-16 encoding if BOM is present" <|
bytes = utf_16_le_bom + ('a,b\n1,2'.bytes Encoding.utf_16_le) bytes = utf_16_le_bom + ('a,b\n1,2'.bytes Encoding.utf_16_le)
f = File.create_temporary_file "delimited-utf-16-bom" ".csv" f = File.create_temporary_file "delimited-utf-16-bom" ".csv"
bytes.write_bytes f . should_succeed bytes.write_bytes f . should_succeed
@ -485,7 +485,7 @@ add_specs suite_builder =
# No hidden BOM in the column name # No hidden BOM in the column name
table.column_names.first.utf_8 . should_equal [97] table.column_names.first.utf_8 . should_equal [97]
group_builder.specify "(in default mode) should skip UTF-8 BOM if it was present" pending="Encoding.default turned off temporarily" <| group_builder.specify "(in default mode) should skip UTF-8 BOM if it was present" <|
utf_8_bom = [-17, -69, -65] utf_8_bom = [-17, -69, -65]
bytes = utf_8_bom + ('a,b\n1,2'.bytes Encoding.utf_8) bytes = utf_8_bom + ('a,b\n1,2'.bytes Encoding.utf_8)
f = File.create_temporary_file "delimited-utf-8-bom" ".csv" f = File.create_temporary_file "delimited-utf-8-bom" ".csv"
@ -506,9 +506,10 @@ add_specs suite_builder =
# The first column name now contains this invalid character, because it wasn't a BOM # The first column name now contains this invalid character, because it wasn't a BOM
r.column_names.first . should_equal "￾a" r.column_names.first . should_equal "￾a"
group_builder.specify "if UTF-16 encoding was selected but an inverted BOM is detected, a warning is issued (pt 2)" pending="Encoding.default turned off temporarily" <| group_builder.specify "if UTF-16 encoding was selected but an inverted BOM is detected, a warning is issued (pt 2)" <|
bytes = utf_16_le_bom + ('a,b\n1,2'.bytes Encoding.utf_16_be) bytes = utf_16_le_bom + ('a,b\n1,2'.bytes Encoding.utf_16_be)
f = File.create_temporary_file "delimited-utf-16-inverted-bom" ".csv" f = File.create_temporary_file "delimited-utf-16-inverted-bom" ".csv"
bytes.write_bytes f . should_succeed
# If we read without specifying the encoding, we will infer UTF-16 LE encoding because of the BOM and get garbage: # If we read without specifying the encoding, we will infer UTF-16 LE encoding because of the BOM and get garbage:
r2 = f.read r2 = f.read
@ -527,7 +528,7 @@ add_specs suite_builder =
r.first_column.to_vector . should_equal ['\uFFFD'] r.first_column.to_vector . should_equal ['\uFFFD']
Problems.expect_only_warning Encoding_Error r Problems.expect_only_warning Encoding_Error r
group_builder.specify "should fall back to Windows-1252 encoding if invalid UTF-8 characters are encountered in Default encoding" pending="Encoding.default turned off temporarily" <| group_builder.specify "should fall back to Windows-1252 encoding if invalid UTF-8 characters are encountered in Default encoding" <|
f = File.create_temporary_file "delimited-invalid-utf-8" ".csv" f = File.create_temporary_file "delimited-invalid-utf-8" ".csv"
# On the simple characters all three encodings (ASCII, UTF-8 and Win-1252) agree, so we can use ASCII bytes. # On the simple characters all three encodings (ASCII, UTF-8 and Win-1252) agree, so we can use ASCII bytes.
bytes = ('A,B\n1,y'.bytes Encoding.ascii) + [-1] + ('z\n2,-'.bytes Encoding.ascii) bytes = ('A,B\n1,y'.bytes Encoding.ascii) + [-1] + ('z\n2,-'.bytes Encoding.ascii)

View File

@ -569,7 +569,7 @@ add_specs suite_builder =
## If the Delimited config has Encoding.default, the encoding for read will be determined by BOM and Win-1252 fallback heuristics. ## If the Delimited config has Encoding.default, the encoding for read will be determined by BOM and Win-1252 fallback heuristics.
The same encoding should be used for writing, to ensure that when the resulting file is read, all content is correctly decoded. The same encoding should be used for writing, to ensure that when the resulting file is read, all content is correctly decoded.
group_builder.specify "should use the same effective encoding for writing as the one that would be used for reading" pending="Encoding.default turned off temporarily" <| group_builder.specify "should use the same effective encoding for writing as the one that would be used for reading" <|
f = File.create_temporary_file "append-detect" ".csv" f = File.create_temporary_file "append-detect" ".csv"
Test.with_clue "UTF-16 detected by BOM: " <| Test.with_clue "UTF-16 detected by BOM: " <|
bom = [-1, -2] bom = [-1, -2]