From 7c94fa6a777ed0557eb2bb5ccd834b7e638f5c1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 7 Jul 2022 02:20:00 +0200 Subject: [PATCH] Custom Encoding support when writing Delimited files (#3564) Implements https://www.pivotaltracker.com/story/show/182545847 --- CHANGELOG.md | 2 + build.sbt | 13 +- .../Base/0.0.0-dev/src/System/File.enso | 29 ++- .../src/Internal/Delimited_Reader.enso | 16 +- .../src/Internal/Delimited_Writer.enso | 17 +- .../java/org/enso/base/Encoding_Utils.java | 61 +++++- .../main/java/org/enso/base/WithProblems.java | 5 + .../base/encoding/ReportingStreamEncoder.java | 195 ++++++++++++++++++ .../Table_Tests/src/Delimited_Write_Spec.enso | 6 +- .../System/Reporting_Stream_Decoder_Spec.enso | 160 +++++++------- .../System/Reporting_Stream_Encoder_Spec.enso | 91 ++++++++ 11 files changed, 478 insertions(+), 117 deletions(-) create mode 100644 std-bits/base/src/main/java/org/enso/base/WithProblems.java create mode 100644 std-bits/base/src/main/java/org/enso/base/encoding/ReportingStreamEncoder.java create mode 100644 test/Tests/src/System/Reporting_Stream_Encoder_Spec.enso diff --git a/CHANGELOG.md b/CHANGELOG.md index d3e45251dc9..4a1473ab02b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -147,6 +147,7 @@ - [Added `File_Format.Delimited` support to `Table.write` for new files.][3528] - [Adjusted `Database.connect` API to new design.][3542] - [Added `File_Format.Excel` support to `Table.write` for new files.][3551] +- [Added support for custom encodings in `File_Format.Delimited` writing.][3564] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -232,6 +233,7 @@ [3528]: https://github.com/enso-org/enso/pull/3528 [3542]: https://github.com/enso-org/enso/pull/3542 [3551]: https://github.com/enso-org/enso/pull/3551 +[3564]: https://github.com/enso-org/enso/pull/3564 [3552]: https://github.com/enso-org/enso/pull/3552 #### Enso Compiler diff --git a/build.sbt 
b/build.sbt index fb624750f63..b55d843021a 100644 --- a/build.sbt +++ b/build.sbt @@ -5,11 +5,8 @@ import sbt.Keys.{libraryDependencies, scalacOptions} import sbt.addCompilerPlugin import sbt.complete.DefaultParsers._ import sbt.complete.Parser -import sbtcrossproject.CrossPlugin.autoImport.{crossProject, CrossType} -import src.main.scala.licenses.{ - DistributionDescription, - SBTDistributionComponent -} +import sbtcrossproject.CrossPlugin.autoImport.{CrossType, crossProject} +import src.main.scala.licenses.{DistributionDescription, SBTDistributionComponent} import java.io.File @@ -17,9 +14,9 @@ import java.io.File // === Global Configuration =================================================== // ============================================================================ -val scalacVersion = "2.13.7" -val graalVersion = "21.3.0" -val javaVersion = "11" +val scalacVersion = "2.13.7" +val graalVersion = "21.3.0" +val javaVersion = "11" val defaultDevEnsoVersion = "0.0.0-dev" val ensoVersion = sys.env.getOrElse( "ENSO_VERSION", diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/System/File.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/System/File.enso index 5df0a184352..55a8ff65a64 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/System/File.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/System/File.enso @@ -2,14 +2,16 @@ from Standard.Base import all import Standard.Base.System.File.Option import Standard.Base.System.File.Existing_File_Behavior +import Standard.Base.Error.Problem_Behavior import Standard.Base.Data.Text.Matching_Mode import Standard.Base.Data.Text.Text_Sub_Range -from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding +from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Report_Warning from Standard.Base.Runtime.Resource import all export Standard.Base.System.File.Option 
+polyglot java import org.enso.base.Encoding_Utils polyglot java import java.io.InputStream as Java_Input_Stream polyglot java import java.io.OutputStream as Java_Output_Stream polyglot java import java.io.IOException @@ -781,6 +783,22 @@ type Output_Stream with_java_stream : (Java_Output_Stream -> Any) -> Any with_java_stream f = self.stream_resource . with f + ## PRIVATE + Runs an action with a `ReportingStreamEncoder` encoding data to the + output stream with the specified encoding. + with_stream_encoder : Encoding -> Problem_Behavior -> (ReportingStreamEncoder -> Any) -> Any + with_stream_encoder encoding on_problems action = self.with_java_stream java_stream-> + ## We ignore any warnings raised by the `bytes` method, because if the + original Unicode replacement character failed to encode, the `bytes` + method will have replaced it with the simple `?` sign which should be + available in all encodings. And this is exactly the behavior we want: + if available, we use the `�` character and otherwise we fallback to + the `?` character. + replacement_sequence = Encoding_Utils.INVALID_CHARACTER.bytes encoding on_problems=Problem_Behavior.Ignore + java_charset = encoding.to_java_charset + results = Encoding_Utils.with_stream_encoder java_stream java_charset replacement_sequence.to_array action + problems = Vector.Vector results.problems . map Encoding_Error + on_problems.attach_problems_after results.result problems ## An input stream, allowing for interactive reading of contents from an open file. @@ -906,6 +924,15 @@ type Input_Stream with_java_stream : (Java_Input_Stream -> Any) -> Any with_java_stream f = self.stream_resource . with f + ## PRIVATE + Runs an action with a `ReportingStreamDecoder` decoding data from the + input stream with the specified encoding. + with_stream_decoder : Encoding -> Problem_Behavior -> (ReportingStreamDecoder -> Any) -> Any + with_stream_decoder encoding on_problems action = self.stream_resource . 
with java_stream-> + java_charset = encoding.to_java_charset + results = Encoding_Utils.with_stream_decoder java_stream java_charset action + problems = Vector.Vector results.problems . map Encoding_Error + on_problems.attach_problems_after results.result problems ## PRIVATE diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso index c0fc1ba0197..771831e5272 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso @@ -44,8 +44,7 @@ read_file format file on_problems = exceptions), we can catch the exception indicating the limit has been reached and restart parsing with an increased limit. file.with_input_stream [File.Option.Read] stream-> - stream.with_java_stream java_stream-> - here.read_stream format java_stream on_problems related_file=file + here.read_stream format stream on_problems related_file=file read_text : Text -> Delimited -> Problem_Behavior -> Table read_text text format on_problems = @@ -57,7 +56,7 @@ read_text text format on_problems = Arguments: - format: The specification of the delimited file format. - - java_stream: A Java `InputStream` used as the data source. + - stream: An `Input_Stream` to be used as the data source. - on_problems: Specifies the behavior when a problem occurs during the operation. By default, a warning is issued, but the operation proceeds. If set to `Report_Error`, the operation fails with a dataflow error. @@ -67,17 +66,14 @@ read_text text format on_problems = integer. - related_file: The file related to the provided `java_stream`, if available, or `Nothing`. It is used for more detailed error reporting. 
-read_stream : Delimited -> InputStream -> Problem_Behavior -> Integer -> File | Nothing -> Any -read_stream format java_stream on_problems max_columns=4096 related_file=Nothing = +read_stream : Delimited -> Input_Stream -> Problem_Behavior -> Integer -> File | Nothing -> Any +read_stream format stream on_problems max_columns=4096 related_file=Nothing = handle_io_exception ~action = Panic.catch IOException action caught_panic-> Error.throw (File.wrap_io_exception related_file caught_panic.payload.cause) - java_charset = format.encoding.to_java_charset handle_io_exception <| - Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder-> - result = here.read_from_reader format reporting_stream_decoder on_problems max_columns - decoding_problems = Vector.Vector reporting_stream_decoder.getReportedProblems . map Encoding_Error - on_problems.attach_problems_after result decoding_problems + stream.with_stream_decoder format.encoding on_problems reporting_stream_decoder-> + here.read_from_reader format reporting_stream_decoder on_problems max_columns ## PRIVATE Reads data from the provided `Reader` according to the provided format. diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso index 244df4a1bb4..71dced143ef 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso @@ -36,8 +36,7 @@ write_file table format file on_existing_file on_problems = Errors.unimplemented "Appending to an existing File_Format.Delimited file is not implemented yet." 
_ -> on_existing_file.write file stream-> - stream.with_java_stream java_stream-> - here.write_to_stream table format java_stream on_problems related_file=file + here.write_to_stream table format stream on_problems related_file=file ## PRIVATE Returns a Text value representing the table in the delimited format. @@ -53,25 +52,21 @@ write_text table format = Arguments: - table: The table to serialize. - format: The specification of the delimited file format. - - java_stream: A Java `OutputStream` used as the data destination. + - stream: An `Output_Stream` used as the data destination. - on_problems: Specifies the behavior when a problem occurs during the operation. By default, a warning is issued, but the operation proceeds. If set to `Report_Error`, the operation fails with a dataflow error. If set to `Ignore`, the operation proceeds without errors or warnings. - related_file: The file related to the provided `java_stream`, if available, or `Nothing`. It is used for more detailed error reporting. 
-write_to_stream : Table -> File_Format.Delimited -> OutputStream -> Problem_Behavior -> File | Nothing -> Any -write_to_stream table format java_stream on_problems related_file=Nothing = +write_to_stream : Table -> File_Format.Delimited -> Output_Stream -> Problem_Behavior -> File | Nothing -> Any +write_to_stream table format stream on_problems related_file=Nothing = handle_io_exception ~action = Panic.catch IOException action caught_panic-> Error.throw (File.wrap_io_exception related_file caught_panic.payload.cause) - # TODO handling encoding - #java_charset = format.encoding.to_java_charset - _ = on_problems handle_io_exception <| - # TODO create a writer that will use the appropriate encoding and handle mismatches - writer = PrintWriter.new java_stream - here.write_to_writer table format writer + stream.with_stream_encoder format.encoding on_problems reporting_stream_encoder-> + here.write_to_writer table format reporting_stream_encoder ## PRIVATE Writes data to the provided `Writer` according to the provided format. 
diff --git a/std-bits/base/src/main/java/org/enso/base/Encoding_Utils.java b/std-bits/base/src/main/java/org/enso/base/Encoding_Utils.java index 2f8e972536b..88096de2b10 100644 --- a/std-bits/base/src/main/java/org/enso/base/Encoding_Utils.java +++ b/std-bits/base/src/main/java/org/enso/base/Encoding_Utils.java @@ -1,21 +1,20 @@ package org.enso.base; +import org.enso.base.encoding.ReportingStreamDecoder; +import org.enso.base.encoding.ReportingStreamEncoder; +import org.enso.base.text.ResultWithWarnings; + import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.nio.Buffer; import java.nio.ByteBuffer; import java.nio.CharBuffer; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CharsetEncoder; -import java.nio.charset.CoderResult; -import java.nio.charset.CodingErrorAction; +import java.nio.charset.*; import java.util.Arrays; import java.util.function.BiConsumer; import java.util.function.Function; import java.util.function.IntFunction; -import org.enso.base.encoding.ReportingStreamDecoder; -import org.enso.base.text.ResultWithWarnings; public class Encoding_Utils { /** The replacement character used for characters that could not have been decoded. */ @@ -164,13 +163,55 @@ public class Encoding_Utils { /** * A helper function which runs an action with a created stream decoder and closes it afterwards. + * + *

It returns the result returned from the executed action and any encoding problems that + * occurred when processing it. */ - public static R with_stream_decoder( + public static WithProblems with_stream_decoder( InputStream stream, Charset charset, Function action) throws IOException { - try (ReportingStreamDecoder decoder = create_stream_decoder(stream, charset)) { - return action.apply(decoder); + R result; + ReportingStreamDecoder decoder = create_stream_decoder(stream, charset); + try { + result = action.apply(decoder); + } finally { + decoder.close(); } + return new WithProblems<>(result, decoder.getReportedProblems()); + } + + /** Creates a new instance of {@code ReportingStreamEncoder} encoding a given charset. */ + private static ReportingStreamEncoder create_stream_encoder( + OutputStream stream, Charset charset, byte[] replacementSequence) { + CharsetEncoder encoder = + charset + .newEncoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT) + .reset(); + return new ReportingStreamEncoder(stream, encoder, replacementSequence); + } + + /** + * A helper function which runs an action with a created stream encoder and closes it afterwards. + * + *

It returns the result returned from the executed action and any encoding problems that + * occurred when processing it. + */ + public static WithProblems with_stream_encoder( + OutputStream stream, + Charset charset, + byte[] replacementSequence, + Function action) + throws IOException { + R result; + ReportingStreamEncoder encoder = create_stream_encoder(stream, charset, replacementSequence); + try { + result = action.apply(encoder); + } finally { + encoder.close(); + } + return new WithProblems<>(result, encoder.getReportedProblems()); } /** diff --git a/std-bits/base/src/main/java/org/enso/base/WithProblems.java b/std-bits/base/src/main/java/org/enso/base/WithProblems.java new file mode 100644 index 00000000000..9e0419a8abe --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/WithProblems.java @@ -0,0 +1,5 @@ +package org.enso.base; + +import java.util.List; + +public record WithProblems(ResultType result, List problems) {} diff --git a/std-bits/base/src/main/java/org/enso/base/encoding/ReportingStreamEncoder.java b/std-bits/base/src/main/java/org/enso/base/encoding/ReportingStreamEncoder.java new file mode 100644 index 00000000000..7260d40002a --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/encoding/ReportingStreamEncoder.java @@ -0,0 +1,195 @@ +package org.enso.base.encoding; + +import org.enso.base.Encoding_Utils; + +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.io.Writer; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +/** + * A {@code Writer} which encodes any characters provided to itself using the provided {@code + * CharsetEncoder} and passes the encoded data to the provided {@code OutputStream}. + * + *

/**
 * A {@code Writer} which encodes any characters provided to itself using the provided {@code
 * CharsetEncoder} and passes the encoded data to the provided {@code OutputStream}.
 *
 * <p>Functionally, it should be equivalent to {@code java.io.OutputStreamWriter}. The major
 * difference is that this class allows more granular reporting of encoding issues - instead of
 * just replacing malformed characters with a replacement or failing at the first error, it
 * performs the replacements, remembers the positions at which the problems occurred and allows to
 * retrieve a bulk report of the places where the issues have been encountered.
 */
public class ReportingStreamEncoder extends Writer {

  /**
   * Creates a writer which encodes characters and writes them to the provided output stream.
   *
   * <p>The encoder reports any malformed or unmappable characters as problems and replaces them
   * with the provided replacement sequence.
   *
   * <p>The encoder must be closed at the end of the encoding process to indicate that no further
   * data will be processed so that it can properly handle the finalization of encoding.
   *
   * @param outputStream the destination of the encoded bytes; it is wrapped in a {@code
   *     BufferedOutputStream} and closed together with this writer
   * @param encoder the encoder used to translate characters into bytes; the replacement logic
   *     below relies on it returning malformed/unmappable results instead of handling them itself
   * @param replacementSequence the bytes written in place of every character (or surrogate pair)
   *     that could not be encoded
   */
  public ReportingStreamEncoder(
      OutputStream outputStream, CharsetEncoder encoder, byte[] replacementSequence) {
    this.encoder = encoder;
    bufferedOutputStream = new BufferedOutputStream(outputStream);
    this.replacementSequence = replacementSequence;
  }

  private final BufferedOutputStream bufferedOutputStream;
  private final CharsetEncoder encoder;

  /**
   * The buffer keeping any input that has already been written but not encoded yet.
   *
   * <p>Between the calls to write, it satisfies the invariant that it is in 'reading' mode - to be
   * able to write to it, it needs to be reallocated, compacted or flipped.
   */
  private CharBuffer inputBuffer = CharBuffer.allocate(0);

  /** Number of characters that were dropped from the front of {@code inputBuffer} so far. */
  private int inputCharactersConsumedBeforeCurrentBuffer = 0;

  private final byte[] replacementSequence;

  private boolean wasClosed = false;

  /**
   * The buffer re-used for storing encoded output before writing it to the output stream.
   *
   * <p>It is cleared after each call to write, so that it can be freshly re-used in the following
   * call. It is preserved only to avoid re-allocating a big buffer upon each call.
   */
  private ByteBuffer outputBuffer = ByteBuffer.allocate(0);

  /**
   * A list of positions containing encoding issues like malformed characters.
   *
   * <p>Used for reporting warnings.
   */
  final List<Integer> encodingIssuePositions = new ArrayList<>();

  /**
   * Switches the input buffer to 'writing' mode, making sure that at least {@code charsToAppend}
   * characters can be appended after the input that has not been encoded yet.
   */
  private void ensureInputBufferHasEnoughFreeSpace(int charsToAppend) {
    int freeSpaceInInputBuffer = inputBuffer.capacity() - inputBuffer.remaining();

    // After either compacting the buffer or reallocating it, any remaining input is shifted to
    // the beginning of the buffer. Thus the characters that preceded the current position are
    // lost (because they already have been processed), so we increase the counter to keep the
    // global position in the input.
    inputCharactersConsumedBeforeCurrentBuffer += inputBuffer.position();

    if (freeSpaceInInputBuffer < charsToAppend) {
      CharBuffer old = inputBuffer;
      inputBuffer = CharBuffer.allocate(old.remaining() + charsToAppend);
      inputBuffer.put(old);
    } else {
      inputBuffer.compact();
    }
  }

  /** Returns the amount of characters that have already been consumed by the encoder. */
  private int getCurrentInputPosition() {
    return inputCharactersConsumedBeforeCurrentBuffer + inputBuffer.position();
  }

  /**
   * Encodes the given characters and passes the resulting bytes to the underlying stream.
   *
   * <p>Input that cannot be encoded yet (for example the first half of a surrogate pair) is kept
   * until the next call to {@code write} or until {@code close}.
   *
   * @throws IndexOutOfBoundsException if {@code off} and {@code len} do not describe a valid
   *     range of {@code cbuf}
   * @throws IOException if the writer has already been closed or the underlying stream fails
   */
  @Override
  public void write(char[] cbuf, int off, int len) throws IOException {
    // Validate before touching any buffer, so that a bad call cannot leave the input buffer in
    // 'writing' mode and break the invariant described on the field.
    if (off < 0 || len < 0 || len > cbuf.length - off) {
      throw new IndexOutOfBoundsException();
    }

    // The Writer contract requires an IOException here; without this check the finalized
    // CharsetEncoder would fail with an unchecked IllegalStateException instead.
    if (wasClosed) {
      throw new IOException("Stream closed");
    }

    ensureInputBufferHasEnoughFreeSpace(len);
    inputBuffer.put(cbuf, off, len);

    // We flip the input buffer back to reading mode, to be able to pass it to the encoder.
    inputBuffer.flip();

    if (outputBuffer.capacity() == 0) {
      outputBuffer =
          ByteBuffer.allocate((int) (inputBuffer.remaining() * encoder.averageBytesPerChar()));
    }
    encodeInputBuffer(false);

    bufferedOutputStream.write(outputBuffer.array(), 0, outputBuffer.position());
    outputBuffer.clear();
  }

  /**
   * Runs the encoder on the pending input until it reports an underflow.
   *
   * <p>Every malformed or unmappable fragment is recorded as a problem, replaced with the
   * replacement sequence and skipped. The output buffer is grown whenever it is too small.
   *
   * @param endOfInput whether no more input will follow; if true, input that is still incomplete
   *     (like a dangling high surrogate) is reported by the encoder as malformed and therefore
   *     gets replaced as well, instead of being kept for later
   */
  private void encodeInputBuffer(boolean endOfInput) {
    while (true) {
      CoderResult cr = encoder.encode(inputBuffer, outputBuffer, endOfInput);

      if (cr.isUnderflow()) {
        return;
      }

      if (cr.isOverflow()) {
        growOutputBuffer();
        continue;
      }

      // The only remaining possibilities are a malformed or an unmappable fragment.
      reportEncodingProblem();

      while (outputBuffer.remaining() < replacementSequence.length) {
        growOutputBuffer();
      }

      outputBuffer.put(replacementSequence);
      inputBuffer.position(inputBuffer.position() + cr.length());
    }
  }

  private void reportEncodingProblem() {
    encodingIssuePositions.add(getCurrentInputPosition());
  }

  /**
   * Returns a summary of the encoding problems encountered so far.
   *
   * @return an empty list if there were no problems, otherwise a single-element list with a
   *     message enumerating the character positions at which the problems occurred
   */
  public List<String> getReportedProblems() {
    if (encodingIssuePositions.isEmpty()) {
      return List.of();
    }

    if (encodingIssuePositions.size() == 1) {
      return List.of("Encoding issues at character " + encodingIssuePositions.get(0) + ".");
    }

    String issues =
        encodingIssuePositions.stream()
            .map(String::valueOf)
            .collect(Collectors.joining(", ", "Encoding issues at characters ", "."));
    return List.of(issues);
  }

  /**
   * Replaces the output buffer with a bigger one, keeping the bytes written to it so far.
   *
   * <p>The new capacity is {@code 2 * capacity + 1}, so the buffer grows even if it was empty.
   */
  private void growOutputBuffer() {
    int newCapacity = Math.addExact(Math.multiplyExact(2, outputBuffer.capacity()), 1);
    ByteBuffer grown = ByteBuffer.allocate(newCapacity);
    outputBuffer.flip();
    grown.put(outputBuffer);
    outputBuffer = grown;
  }

  @Override
  public void flush() throws IOException {
    // We don't flush the encoder here, because the flush operation for the encoder is supposed to
    // be run at the very end, and for a Writer the flush may be called whenever and further write
    // operations may follow it. So we do the best we can - flush the underlying stream and keep
    // the encoder intact, ready for possible writes.
    bufferedOutputStream.flush();
  }

  /**
   * Finalizes the encoding, writes any outstanding bytes and closes the underlying stream.
   *
   * <p>Input that is still pending and cannot be encoded (for example a dangling high surrogate)
   * is reported as a problem and replaced, like any other invalid input. Closing an already closed
   * writer has no effect.
   */
  @Override
  public void close() throws IOException {
    if (wasClosed) {
      return;
    }
    wasClosed = true;

    // try-with-resources guarantees that the underlying stream is closed even if finalizing the
    // encoding fails, without masking the original exception.
    try (bufferedOutputStream) {
      encodeInputBuffer(true);

      while (encoder.flush(outputBuffer).isOverflow()) {
        growOutputBuffer();
      }

      bufferedOutputStream.write(outputBuffer.array(), 0, outputBuffer.position());
      outputBuffer.clear();
      bufferedOutputStream.flush();
    }
  }
}
should_equal [Encoding_Error "Encoding issues at characters 7, 8, 15."] Test.specify "should allow only text columns if no formatter is specified" <| format = File_Format.Delimited "," value_formatter=Nothing diff --git a/test/Tests/src/System/Reporting_Stream_Decoder_Spec.enso b/test/Tests/src/System/Reporting_Stream_Decoder_Spec.enso index 9a6844bf572..e3fcbe9a621 100644 --- a/test/Tests/src/System/Reporting_Stream_Decoder_Spec.enso +++ b/test/Tests/src/System/Reporting_Stream_Decoder_Spec.enso @@ -1,8 +1,8 @@ from Standard.Base import all from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error +import Standard.Base.Error.Problem_Behavior -polyglot java import org.enso.base.Encoding_Utils polyglot java import java.nio.CharBuffer import Standard.Test @@ -11,33 +11,18 @@ import Standard.Test.Problems spec = windows_file = Enso_Project.data / "windows.txt" - read_file_one_by_one file java_charset expected_size expected_problems=[] = - file.with_input_stream [File.Option.Read] stream-> - stream.with_java_stream java_stream-> - Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder-> - codepoints = 0.up_to expected_size . 
map _-> - reporting_stream_decoder.read - reporting_stream_decoder.read.should_equal -1 - - problems = Vector.Vector reporting_stream_decoder.getReportedProblems - problems.should_equal expected_problems - - Text.from_codepoints codepoints - Test.group "ReportingStreamDecoder" <| Test.specify "should allow reading a file character by character" <| f = Enso_Project.data / "short.txt" f.delete_if_exists f.exists.should_be_false "Cup".write f - java_charset = Encoding.utf_8.to_java_charset f.with_input_stream [File.Option.Read] stream-> - stream.with_java_stream java_stream-> - Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder-> - reporting_stream_decoder.read.should_equal 67 - reporting_stream_decoder.read.should_equal 117 - reporting_stream_decoder.read.should_equal 112 - reporting_stream_decoder.read.should_equal -1 + stream.with_stream_decoder Encoding.utf_8 reporting_stream_decoder-> + reporting_stream_decoder.read.should_equal 67 + reporting_stream_decoder.read.should_equal 117 + reporting_stream_decoder.read.should_equal 112 + reporting_stream_decoder.read.should_equal -1 f.delete f.exists.should_be_false @@ -46,82 +31,109 @@ spec = fragment = 'Hello 😎🚀🚧!' contents = 1.up_to 1000 . map _->fragment . 
join '\n' contents.write f - java_charset = Encoding.utf_8.to_java_charset - all_codepoints = Vector.new_builder read_chars decoder n = - buffer = CharBuffer.allocate n - chars_read = decoder.read buffer - if chars_read == -1 then Nothing else - buffer.flip - v = Vector.new_builder - transfer_codepoints _ = - if buffer.hasRemaining.not then Nothing else - char = buffer.get - v.append char - all_codepoints.append char - @Tail_Call transfer_codepoints Nothing - transfer_codepoints Nothing - v.to_vector + case here.read_characters decoder n of + Nothing -> Nothing + chars -> + chars.each all_codepoints.append + chars - f.with_input_stream [File.Option.Read] stream-> - stream.with_java_stream java_stream-> - Encoding_Utils.with_stream_decoder java_stream java_charset decoder-> - read_chars decoder 1 . should_equal "H".codepoints - read_chars decoder 2 . should_equal "el".codepoints - read_chars decoder 3 . should_equal "lo ".codepoints - v1 = read_chars decoder 6 - Text.from_codepoints v1 . should_equal '😎🚀🚧' + result = f.with_input_stream [File.Option.Read] stream-> + stream.with_stream_decoder Encoding.utf_8 Problem_Behavior.Report_Error decoder-> + read_chars decoder 1 . should_equal "H".codepoints + read_chars decoder 2 . should_equal "el".codepoints + read_chars decoder 3 . should_equal "lo ".codepoints + v1 = read_chars decoder 6 + Text.from_codepoints v1 . should_equal '😎🚀🚧' - v2 = read_chars decoder 200 - ## Here we show that while the decoder is trying to read - 200 codepoints, some codepoints require more than one - byte in UTF-8 to represent, so the actual result - should be slightly smaller. - (v2.length < 200) . should_be_true + v2 = read_chars decoder 200 + ## Here we show that while the decoder is trying to read + 200 codepoints, some codepoints require more than one + byte in UTF-8 to represent, so the actual result + should be slightly smaller. + (v2.length < 200) . 
should_be_true - ## Now we read increasingly larger amounts, to trigger - and test all paths of the input buffer resizing - mechanism. - read_chars decoder 40 - read_chars decoder 500 - read_chars decoder 1000 - read_chars decoder 1 - read_chars decoder 2 - read_chars decoder 10 + ## Now we read increasingly larger amounts, to trigger + and test all paths of the input buffer resizing + mechanism. + read_chars decoder 40 + read_chars decoder 500 + read_chars decoder 1000 + read_chars decoder 1 + read_chars decoder 2 + read_chars decoder 10 - ## Finally read all the remaining contents of the file - to verify they were decoded correctly as a whole. - read_rest _ = - case read_chars decoder 100 of - Nothing -> Nothing - _ -> @Tail_Call read_rest Nothing - read_rest Nothing + ## Finally read all the remaining contents of the file + to verify they were decoded correctly as a whole. + read_rest _ = + case read_chars decoder 100 of + Nothing -> Nothing + _ -> @Tail_Call read_rest Nothing + read_rest Nothing Text.from_codepoints all_codepoints.to_vector . should_equal contents + result . should_equal Nothing f.delete Test.specify "should allow reading a UTF-8 file" <| f = Enso_Project.data / "transient" / "utf8.txt" encoding = Encoding.utf_8 - java_charset = encoding.to_java_charset ((0.up_to 100).map _->'Hello World!' . join '\n').write f expected_contents = f.read_text - contents = read_file_one_by_one f java_charset expected_contents.length + contents = here.read_file_one_by_one f encoding expected_contents.length contents.should_equal expected_contents Test.specify "should allow reading a Windows file" <| encoding = Encoding.windows_1252 - java_charset = encoding.to_java_charset expected_contents = "Hello World! 
$¢¤¥" - contents = read_file_one_by_one windows_file java_charset expected_contents.length + contents = here.read_file_one_by_one windows_file encoding expected_contents.length contents.should_equal expected_contents Test.specify "should raise warnings when reading invalid characters" <| encoding = Encoding.ascii - java_charset = encoding.to_java_charset expected_contents = 'Hello World! $\uFFFD\uFFFD\uFFFD' - expected_problems = ["Encoding issues at bytes 14, 15, 16."] - contents = read_file_one_by_one windows_file java_charset expected_contents.length expected_problems=expected_problems - contents.should_equal expected_contents + expected_problems = [Encoding_Error "Encoding issues at bytes 14, 15, 16."] + contents_1 = here.read_file_one_by_one windows_file encoding expected_contents.length on_problems=Problem_Behavior.Report_Warning + contents_1.should_equal expected_contents + Warning.get_all contents_1 . map .value . should_equal expected_problems + + contents_2 = windows_file.with_input_stream [File.Option.Read] stream-> + stream.with_stream_decoder encoding Problem_Behavior.Report_Warning reporting_stream_decoder-> + codepoint_1 = reporting_stream_decoder.read + codepoints_1 = here.read_characters reporting_stream_decoder 5 + codepoints_2 = here.read_characters reporting_stream_decoder 3 + codepoints_3 = here.read_characters reporting_stream_decoder 100 + reporting_stream_decoder.read.should_equal -1 + Text.from_codepoints <| [codepoint_1]+codepoints_1+codepoints_2+codepoints_3 + contents_2.should_equal expected_contents + Warning.get_all contents_2 . map .value . 
should_equal expected_problems + + Test.specify "should work correctly if no data is read from it" <| + result = windows_file.with_input_stream [File.Option.Read] stream-> + stream.with_stream_decoder Encoding.ascii Problem_Behavior.Report_Error _->Nothing + result.should_equal Nothing + +read_file_one_by_one file encoding expected_size on_problems=Problem_Behavior.Report_Error = + file.with_input_stream [File.Option.Read] stream-> + stream.with_stream_decoder encoding on_problems reporting_stream_decoder-> + codepoints = 0.up_to expected_size . map _-> + reporting_stream_decoder.read + reporting_stream_decoder.read.should_equal -1 + + Text.from_codepoints codepoints + +read_characters decoder n = + buffer = CharBuffer.allocate n + chars_read = decoder.read buffer + if chars_read == -1 then Nothing else + buffer.flip + v = Vector.new_builder + transfer_codepoints _ = + if buffer.hasRemaining.not then Nothing else + char = buffer.get + v.append char + @Tail_Call transfer_codepoints Nothing + transfer_codepoints Nothing + v.to_vector main = Test.Suite.run_main here.spec diff --git a/test/Tests/src/System/Reporting_Stream_Encoder_Spec.enso b/test/Tests/src/System/Reporting_Stream_Encoder_Spec.enso new file mode 100644 index 00000000000..4f8a3a64658 --- /dev/null +++ b/test/Tests/src/System/Reporting_Stream_Encoder_Spec.enso @@ -0,0 +1,91 @@ +from Standard.Base import all + +from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error +import Standard.Base.Error.Problem_Behavior + +polyglot java import org.enso.base.Encoding_Utils +polyglot java import java.nio.CharBuffer + +import Standard.Test +import Standard.Test.Problems + +spec = + Test.group "ReportingStreamEncoder" <| + Test.specify "should allow writing a file codepoint by codepoint" <| + f = Enso_Project.data / "transient" / "char-by-char.txt" + f.delete_if_exists + f.exists.should_be_false + contents = 1.up_to 7 . map _->'Cześc\u0301 😎🚀🚧!' . 
join '\n' + f.with_output_stream [File.Option.Write, File.Option.Create_New] stream-> + stream.with_stream_encoder Encoding.utf_8 Problem_Behavior.Report_Error reporting_stream_encoder-> + contents.char_vector.each char-> + reporting_stream_encoder.write char + f.read_text.should_equal contents + + Test.specify "should work correctly when writing chunks of varying sizes" <| + f = Enso_Project.data / "transient" / "varying-utf16.txt" + f.delete_if_exists + f.exists.should_be_false + encoding = Encoding.utf_16_be + big = 1.up_to 7 . map _->'Cześc\u0301 😎🚀🚧!' . join '\n' + f.with_output_stream [File.Option.Write, File.Option.Create_New] stream-> + stream.with_stream_encoder encoding Problem_Behavior.Report_Error reporting_stream_encoder-> + reporting_stream_encoder.write "A" + reporting_stream_encoder.write "Abc" + reporting_stream_encoder.write "Defghi" + reporting_stream_encoder.write 'O\u0301' + reporting_stream_encoder.write "X" + reporting_stream_encoder.write big + reporting_stream_encoder.write "Y" + reporting_stream_encoder.write "Ź" + + contents = 'AAbcDefghiO\u0301X' + big + "YŹ" + f.read_text encoding . should_equal contents + + Test.specify "should allow writing a Windows file" <| + f = Enso_Project.data / "transient" / "windows.txt" + encoding = Encoding.windows_1252 + contents = "Hello World! $¢¤¥" + + f.delete_if_exists + f.with_output_stream [File.Option.Write, File.Option.Create_New] stream-> + stream.with_stream_encoder encoding Problem_Behavior.Report_Error reporting_stream_encoder-> + reporting_stream_encoder.write contents + + f.read_text encoding . should_equal contents + + Test.specify "should raise warnings when writing characters that cannot be encoded and replace them with the Unicode replacement character or a question mark" <| + f = Enso_Project.data / "transient" / "ascii.txt" + encoding = Encoding.ascii + contents = 'Sło\u0301wka!' 
+ f.delete_if_exists + result = f.with_output_stream [File.Option.Write, File.Option.Create_New] stream-> + stream.with_stream_encoder encoding Problem_Behavior.Report_Warning reporting_stream_encoder-> + reporting_stream_encoder.write contents + result . should_equal Nothing + Warning.get_all result . map .value . should_equal [Encoding_Error "Encoding issues at characters 1, 3."] + f.read_text encoding . should_equal "S?o?wka!" + + f.delete_if_exists + result_2 = f.with_output_stream [File.Option.Write, File.Option.Create_New] stream-> + stream.with_stream_encoder encoding Problem_Behavior.Report_Warning reporting_stream_encoder-> + reporting_stream_encoder.write "ABC" + reporting_stream_encoder.write "ą" + reporting_stream_encoder.write "foo" + reporting_stream_encoder.write " -🚧- " + reporting_stream_encoder.write "bar" + + result_2 . should_equal Nothing + Warning.get_all result_2 . map .value . should_equal [Encoding_Error "Encoding issues at characters 3, 9."] + f.read_text encoding . should_equal "ABC?foo -?- bar" + + Test.specify "should work correctly if no data is written to it" <| + f = Enso_Project.data / "transient" / "empty.txt" + encoding = Encoding.ascii + f.delete_if_exists + result = f.with_output_stream [File.Option.Write, File.Option.Create_New] stream-> + stream.with_stream_encoder encoding Problem_Behavior.Report_Error _->Nothing + result . should_equal Nothing + f.read_text encoding . should_equal "" + +main = Test.Suite.run_main here.spec