Restore Encoding.Default. (#10567)

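Now that the input stream issue has been fixed, `Encoding.default` once again returns `Encoding.Default` instead of the temporary `Encoding.utf_8` performance workaround, and the tests that were marked pending while the default encoding was turned off are re-enabled. This also adds the missing `write_bytes` call in the inverted-BOM delimited read test. No significant performance impact was observed when reading the client test data.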
2024-12-23 02:21:54 +03:00 · 2024-07-16 17:49:46 +01:00 · 2024-07-16 17:49:46 +01:00 · 2442ebc52e
commit 2442ebc52e
parent 06274a2c5b
5 changed files with 9 additions and 9 deletions
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Encoding.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Encoding.enso
@ -71,8 +71,7 @@ type Encoding
    default -> Encoding =
        # This factory method is used to publicly expose the `Default` constructor.
        # The constructor itself has to be private, because we want to make `Value` constructor private, but all constructors must have the same privacy.
-        # ToDo: This is a workaround for performance issue.
-        Encoding.utf_8
+        Encoding.Default

    ## PRIVATE
       A default encoding that will try to guess the encoding based on some heuristics.
--- a/test/Base_Tests/src/Data/Json_Spec.enso
+++ b/test/Base_Tests/src/Data/Json_Spec.enso
@ -117,7 +117,7 @@ add_specs suite_builder =
            '{"type":"Date_Time","constructor":"new","year":2023,"month":9,"day":29,"hour":11,"second":52}'.should_parse_as (JS_Object.from_pairs [["type", "Date_Time"], ["constructor", "new"], ["year", 2023], ["month", 9], ["day", 29], ["hour", 11], ["second", 52]])
            '{"type":"Date_Time","constructor":"new","year":2023,"month":9,"day":29,"hour":11,"minute":52,"nanosecond":572104300}'.should_parse_as (JS_Object.from_pairs [["type", "Date_Time"], ["constructor", "new"], ["year", 2023], ["month", 9], ["day", 29], ["hour", 11], ["minute", 52], ["nanosecond", 572104300]])

-        group_builder.specify "should be able to read a JSON file with a BOM indicating UTF-16 encoding" pending="Encoding.default turned off temporarily" <|
+        group_builder.specify "should be able to read a JSON file with a BOM indicating UTF-16 encoding" <|
            utf_16_le_bom = [-1, -2]
            bytes = utf_16_le_bom + ("{}".bytes Encoding.utf_16_le)
            f = File.create_temporary_file "json-with-bom" ".json"
--- a/test/Base_Tests/src/Data/Text/Encoding_Spec.enso
+++ b/test/Base_Tests/src/Data/Text/Encoding_Spec.enso
@ -68,7 +68,7 @@ add_specs suite_builder =
            default_warning.should_equal invalid_ascii_out
            Problems.get_attached_warnings default_warning . should_contain_the_same_elements_as problems

-    suite_builder.group "Default Encoding" pending="Encoding.default turned off temporarily" group_builder->
+    suite_builder.group "Default Encoding" group_builder->
        group_builder.specify "should try reading as UTF-8 by default" <|
            bytes = [65, -60, -123, -60, -103]
            #        A   ą          ę
--- a/test/Table_Tests/src/IO/Delimited_Read_Spec.enso
+++ b/test/Table_Tests/src/IO/Delimited_Read_Spec.enso
@ -475,7 +475,7 @@ add_specs suite_builder =
            Delimited_Format.Delimited ',' . with_line_endings Line_Ending_Style.Unix . should_equal (Delimited_Format.Delimited ',' line_endings=Line_Ending_Style.Unix)

        utf_16_le_bom = [-1, -2]
-        group_builder.specify "(in default mode) should detect UTF-16 encoding if BOM is present" pending="Encoding.default turned off temporarily" <|
+        group_builder.specify "(in default mode) should detect UTF-16 encoding if BOM is present" <|
            bytes = utf_16_le_bom + ('a,b\n1,2'.bytes Encoding.utf_16_le)
            f = File.create_temporary_file "delimited-utf-16-bom" ".csv"
            bytes.write_bytes f . should_succeed
@ -485,7 +485,7 @@ add_specs suite_builder =
            # No hidden BOM in the column name
            table.column_names.first.utf_8 . should_equal [97]

-        group_builder.specify "(in default mode) should skip UTF-8 BOM if it was present" pending="Encoding.default turned off temporarily" <|
+        group_builder.specify "(in default mode) should skip UTF-8 BOM if it was present" <|
            utf_8_bom = [-17, -69, -65]
            bytes = utf_8_bom + ('a,b\n1,2'.bytes Encoding.utf_8)
            f = File.create_temporary_file "delimited-utf-8-bom" ".csv"
@ -506,9 +506,10 @@ add_specs suite_builder =
            # The first column name now contains this invalid character, because it wasn't a BOM
            r.column_names.first . should_equal "a"

-        group_builder.specify "if UTF-16 encoding was selected but an inverted BOM is detected, a warning is issued (pt 2)" pending="Encoding.default turned off temporarily" <|
+        group_builder.specify "if UTF-16 encoding was selected but an inverted BOM is detected, a warning is issued (pt 2)" <|
            bytes = utf_16_le_bom + ('a,b\n1,2'.bytes Encoding.utf_16_be)
            f = File.create_temporary_file "delimited-utf-16-inverted-bom" ".csv"
+            bytes.write_bytes f . should_succeed

            # If we read without specifying the encoding, we will infer UTF-16 LE encoding because of the BOM and get garbage:
            r2 = f.read
@ -527,7 +528,7 @@ add_specs suite_builder =
            r.first_column.to_vector . should_equal ['\uFFFD']
            Problems.expect_only_warning Encoding_Error r

-        group_builder.specify "should fall back to Windows-1252 encoding if invalid UTF-8 characters are encountered in Default encoding" pending="Encoding.default turned off temporarily" <|
+        group_builder.specify "should fall back to Windows-1252 encoding if invalid UTF-8 characters are encountered in Default encoding" <|
            f = File.create_temporary_file "delimited-invalid-utf-8" ".csv"
            # On the simple characters all three encodings (ASCII, UTF-8 and Win-1252) agree, so we can use ASCII bytes.
            bytes = ('A,B\n1,y'.bytes Encoding.ascii) + [-1] + ('z\n2,-'.bytes Encoding.ascii)
--- a/test/Table_Tests/src/IO/Delimited_Write_Spec.enso
+++ b/test/Table_Tests/src/IO/Delimited_Write_Spec.enso
@ -569,7 +569,7 @@ add_specs suite_builder =

        ## If the Delimited config has Encoding.default, the encoding for read will be determined by BOM and Win-1252 fallback heuristics.
           The same encoding should be used for writing, to ensure that when the resulting file is read, all content is correctly decoded.
-        group_builder.specify "should use the same effective encoding for writing as the one that would be used for reading" pending="Encoding.default turned off temporarily" <|
+        group_builder.specify "should use the same effective encoding for writing as the one that would be used for reading" <|
            f = File.create_temporary_file "append-detect" ".csv"
            Test.with_clue "UTF-16 detected by BOM: " <|
                bom = [-1, -2]