mirror of
https://github.com/enso-org/enso.git
synced 2024-12-27 21:12:48 +03:00
Custom Encoding support when writing Delimited files (#3564)
Implements https://www.pivotaltracker.com/story/show/182545847
This commit is contained in:
parent
d950499a90
commit
7c94fa6a77
@ -147,6 +147,7 @@
|
||||
- [Added `File_Format.Delimited` support to `Table.write` for new files.][3528]
|
||||
- [Adjusted `Database.connect` API to new design.][3542]
|
||||
- [Added `File_Format.Excel` support to `Table.write` for new files.][3551]
|
||||
- [Added support for custom encodings in `File_Format.Delimited` writing.][3564]
|
||||
|
||||
[debug-shortcuts]:
|
||||
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
|
||||
@ -232,6 +233,7 @@
|
||||
[3528]: https://github.com/enso-org/enso/pull/3528
|
||||
[3542]: https://github.com/enso-org/enso/pull/3542
|
||||
[3551]: https://github.com/enso-org/enso/pull/3551
|
||||
[3564]: https://github.com/enso-org/enso/pull/3564
|
||||
[3552]: https://github.com/enso-org/enso/pull/3552
|
||||
|
||||
#### Enso Compiler
|
||||
|
13
build.sbt
13
build.sbt
@ -5,11 +5,8 @@ import sbt.Keys.{libraryDependencies, scalacOptions}
|
||||
import sbt.addCompilerPlugin
|
||||
import sbt.complete.DefaultParsers._
|
||||
import sbt.complete.Parser
|
||||
import sbtcrossproject.CrossPlugin.autoImport.{crossProject, CrossType}
|
||||
import src.main.scala.licenses.{
|
||||
DistributionDescription,
|
||||
SBTDistributionComponent
|
||||
}
|
||||
import sbtcrossproject.CrossPlugin.autoImport.{CrossType, crossProject}
|
||||
import src.main.scala.licenses.{DistributionDescription, SBTDistributionComponent}
|
||||
|
||||
import java.io.File
|
||||
|
||||
@ -17,9 +14,9 @@ import java.io.File
|
||||
// === Global Configuration ===================================================
|
||||
// ============================================================================
|
||||
|
||||
val scalacVersion = "2.13.7"
|
||||
val graalVersion = "21.3.0"
|
||||
val javaVersion = "11"
|
||||
val scalacVersion = "2.13.7"
|
||||
val graalVersion = "21.3.0"
|
||||
val javaVersion = "11"
|
||||
val defaultDevEnsoVersion = "0.0.0-dev"
|
||||
val ensoVersion = sys.env.getOrElse(
|
||||
"ENSO_VERSION",
|
||||
|
@ -2,14 +2,16 @@ from Standard.Base import all
|
||||
|
||||
import Standard.Base.System.File.Option
|
||||
import Standard.Base.System.File.Existing_File_Behavior
|
||||
import Standard.Base.Error.Problem_Behavior
|
||||
import Standard.Base.Data.Text.Matching_Mode
|
||||
import Standard.Base.Data.Text.Text_Sub_Range
|
||||
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding
|
||||
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
|
||||
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Report_Warning
|
||||
from Standard.Base.Runtime.Resource import all
|
||||
|
||||
export Standard.Base.System.File.Option
|
||||
|
||||
polyglot java import org.enso.base.Encoding_Utils
|
||||
polyglot java import java.io.InputStream as Java_Input_Stream
|
||||
polyglot java import java.io.OutputStream as Java_Output_Stream
|
||||
polyglot java import java.io.IOException
|
||||
@ -781,6 +783,22 @@ type Output_Stream
|
||||
with_java_stream : (Java_Output_Stream -> Any) -> Any
|
||||
with_java_stream f = self.stream_resource . with f
|
||||
|
||||
## PRIVATE
|
||||
Runs an action with a `ReportingStreamEncoder` encoding data to the
|
||||
output stream with the specified encoding.
|
||||
with_stream_encoder : Encoding -> Problem_Behavior -> (ReportingStreamEncoder -> Any) -> Any
|
||||
with_stream_encoder encoding on_problems action = self.with_java_stream java_stream->
|
||||
## We ignore any warnings raised by the `bytes` method, because if the
|
||||
original Unicode replacement character failed to encode, the `bytes`
|
||||
method will have replaced it with the simple `?` sign which should be
|
||||
available in all encodings. And this is exactly the behavior we want:
|
||||
if available, we use the `<60>` character and otherwise we fallback to
|
||||
the `?` character.
|
||||
replacement_sequence = Encoding_Utils.INVALID_CHARACTER.bytes encoding on_problems=Problem_Behavior.Ignore
|
||||
java_charset = encoding.to_java_charset
|
||||
results = Encoding_Utils.with_stream_encoder java_stream java_charset replacement_sequence.to_array action
|
||||
problems = Vector.Vector results.problems . map Encoding_Error
|
||||
on_problems.attach_problems_after results.result problems
|
||||
|
||||
## An input stream, allowing for interactive reading of contents from an open
|
||||
file.
|
||||
@ -906,6 +924,15 @@ type Input_Stream
|
||||
with_java_stream : (Java_Input_Stream -> Any) -> Any
|
||||
with_java_stream f = self.stream_resource . with f
|
||||
|
||||
## PRIVATE
|
||||
Runs an action with a `ReportingStreamDecoder` decoding data from the
|
||||
input stream with the specified encoding.
|
||||
with_stream_decoder : Encoding -> Problem_Behavior -> (ReportingStreamDecoder -> Any) -> Any
|
||||
with_stream_decoder encoding on_problems action = self.stream_resource . with java_stream->
|
||||
java_charset = encoding.to_java_charset
|
||||
results = Encoding_Utils.with_stream_decoder java_stream java_charset action
|
||||
problems = Vector.Vector results.problems . map Encoding_Error
|
||||
on_problems.attach_problems_after results.result problems
|
||||
|
||||
## PRIVATE
|
||||
|
||||
|
@ -44,8 +44,7 @@ read_file format file on_problems =
|
||||
exceptions), we can catch the exception indicating the limit has been
|
||||
reached and restart parsing with an increased limit.
|
||||
file.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_java_stream java_stream->
|
||||
here.read_stream format java_stream on_problems related_file=file
|
||||
here.read_stream format stream on_problems related_file=file
|
||||
|
||||
read_text : Text -> Delimited -> Problem_Behavior -> Table
|
||||
read_text text format on_problems =
|
||||
@ -57,7 +56,7 @@ read_text text format on_problems =
|
||||
|
||||
Arguments:
|
||||
- format: The specification of the delimited file format.
|
||||
- java_stream: A Java `InputStream` used as the data source.
|
||||
- stream: An `Input_Stream` to be used as the data source.
|
||||
- on_problems: Specifies the behavior when a problem occurs during the
|
||||
operation. By default, a warning is issued, but the operation proceeds.
|
||||
If set to `Report_Error`, the operation fails with a dataflow error.
|
||||
@ -67,17 +66,14 @@ read_text text format on_problems =
|
||||
integer.
|
||||
- related_file: The file related to the provided `java_stream`, if available,
|
||||
or `Nothing`. It is used for more detailed error reporting.
|
||||
read_stream : Delimited -> InputStream -> Problem_Behavior -> Integer -> File | Nothing -> Any
|
||||
read_stream format java_stream on_problems max_columns=4096 related_file=Nothing =
|
||||
read_stream : Delimited -> Input_Stream -> Problem_Behavior -> Integer -> File | Nothing -> Any
|
||||
read_stream format stream on_problems max_columns=4096 related_file=Nothing =
|
||||
handle_io_exception ~action = Panic.catch IOException action caught_panic->
|
||||
Error.throw (File.wrap_io_exception related_file caught_panic.payload.cause)
|
||||
|
||||
java_charset = format.encoding.to_java_charset
|
||||
handle_io_exception <|
|
||||
Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder->
|
||||
result = here.read_from_reader format reporting_stream_decoder on_problems max_columns
|
||||
decoding_problems = Vector.Vector reporting_stream_decoder.getReportedProblems . map Encoding_Error
|
||||
on_problems.attach_problems_after result decoding_problems
|
||||
stream.with_stream_decoder format.encoding on_problems reporting_stream_decoder->
|
||||
here.read_from_reader format reporting_stream_decoder on_problems max_columns
|
||||
|
||||
## PRIVATE
|
||||
Reads data from the provided `Reader` according to the provided format.
|
||||
|
@ -36,8 +36,7 @@ write_file table format file on_existing_file on_problems =
|
||||
Errors.unimplemented "Appending to an existing File_Format.Delimited file is not implemented yet."
|
||||
_ ->
|
||||
on_existing_file.write file stream->
|
||||
stream.with_java_stream java_stream->
|
||||
here.write_to_stream table format java_stream on_problems related_file=file
|
||||
here.write_to_stream table format stream on_problems related_file=file
|
||||
|
||||
## PRIVATE
|
||||
Returns a Text value representing the table in the delimited format.
|
||||
@ -53,25 +52,21 @@ write_text table format =
|
||||
Arguments:
|
||||
- table: The table to serialize.
|
||||
- format: The specification of the delimited file format.
|
||||
- java_stream: A Java `OutputStream` used as the data destination.
|
||||
- stream: An `Output_Stream` used as the data destination.
|
||||
- on_problems: Specifies the behavior when a problem occurs during the
|
||||
operation. By default, a warning is issued, but the operation proceeds.
|
||||
If set to `Report_Error`, the operation fails with a dataflow error.
|
||||
If set to `Ignore`, the operation proceeds without errors or warnings.
|
||||
- related_file: The file related to the provided `java_stream`, if available,
|
||||
or `Nothing`. It is used for more detailed error reporting.
|
||||
write_to_stream : Table -> File_Format.Delimited -> OutputStream -> Problem_Behavior -> File | Nothing -> Any
|
||||
write_to_stream table format java_stream on_problems related_file=Nothing =
|
||||
write_to_stream : Table -> File_Format.Delimited -> Output_Stream -> Problem_Behavior -> File | Nothing -> Any
|
||||
write_to_stream table format stream on_problems related_file=Nothing =
|
||||
handle_io_exception ~action = Panic.catch IOException action caught_panic->
|
||||
Error.throw (File.wrap_io_exception related_file caught_panic.payload.cause)
|
||||
|
||||
# TODO handling encoding
|
||||
#java_charset = format.encoding.to_java_charset
|
||||
_ = on_problems
|
||||
handle_io_exception <|
|
||||
# TODO create a writer that will use the appropriate encoding and handle mismatches
|
||||
writer = PrintWriter.new java_stream
|
||||
here.write_to_writer table format writer
|
||||
stream.with_stream_encoder format.encoding on_problems reporting_stream_encoder->
|
||||
here.write_to_writer table format reporting_stream_encoder
|
||||
|
||||
## PRIVATE
|
||||
Writes data to the provided `Writer` according to the provided format.
|
||||
|
@ -1,21 +1,20 @@
|
||||
package org.enso.base;
|
||||
|
||||
import org.enso.base.encoding.ReportingStreamDecoder;
|
||||
import org.enso.base.encoding.ReportingStreamEncoder;
|
||||
import org.enso.base.text.ResultWithWarnings;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.Buffer;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.nio.charset.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.IntFunction;
|
||||
import org.enso.base.encoding.ReportingStreamDecoder;
|
||||
import org.enso.base.text.ResultWithWarnings;
|
||||
|
||||
public class Encoding_Utils {
|
||||
/** The replacement character used for characters that could not have been decoded. */
|
||||
@ -164,13 +163,55 @@ public class Encoding_Utils {
|
||||
|
||||
/**
|
||||
* A helper function which runs an action with a created stream decoder and closes it afterwards.
|
||||
*
|
||||
* <p>It returns the result returned from the executed action and any encoding problems that
|
||||
* occurred when processing it.
|
||||
*/
|
||||
public static <R> R with_stream_decoder(
|
||||
public static <R> WithProblems<R, String> with_stream_decoder(
|
||||
InputStream stream, Charset charset, Function<ReportingStreamDecoder, R> action)
|
||||
throws IOException {
|
||||
try (ReportingStreamDecoder decoder = create_stream_decoder(stream, charset)) {
|
||||
return action.apply(decoder);
|
||||
R result;
|
||||
ReportingStreamDecoder decoder = create_stream_decoder(stream, charset);
|
||||
try {
|
||||
result = action.apply(decoder);
|
||||
} finally {
|
||||
decoder.close();
|
||||
}
|
||||
return new WithProblems<>(result, decoder.getReportedProblems());
|
||||
}
|
||||
|
||||
/** Creates a new instance of {@code ReportingStreamEncoder} encoding a given charset. */
|
||||
private static ReportingStreamEncoder create_stream_encoder(
|
||||
OutputStream stream, Charset charset, byte[] replacementSequence) {
|
||||
CharsetEncoder encoder =
|
||||
charset
|
||||
.newEncoder()
|
||||
.onMalformedInput(CodingErrorAction.REPORT)
|
||||
.onUnmappableCharacter(CodingErrorAction.REPORT)
|
||||
.reset();
|
||||
return new ReportingStreamEncoder(stream, encoder, replacementSequence);
|
||||
}
|
||||
|
||||
/**
|
||||
* A helper function which runs an action with a created stream encoder and closes it afterwards.
|
||||
*
|
||||
* <p>It returns the result returned from the executed action and any encoding problems that
|
||||
* occurred when processing it.
|
||||
*/
|
||||
public static <R> WithProblems<R, String> with_stream_encoder(
|
||||
OutputStream stream,
|
||||
Charset charset,
|
||||
byte[] replacementSequence,
|
||||
Function<ReportingStreamEncoder, R> action)
|
||||
throws IOException {
|
||||
R result;
|
||||
ReportingStreamEncoder encoder = create_stream_encoder(stream, charset, replacementSequence);
|
||||
try {
|
||||
result = action.apply(encoder);
|
||||
} finally {
|
||||
encoder.close();
|
||||
}
|
||||
return new WithProblems<>(result, encoder.getReportedProblems());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -0,0 +1,5 @@
|
||||
package org.enso.base;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public record WithProblems<ResultType, ProblemType>(ResultType result, List<ProblemType> problems) {}
|
@ -0,0 +1,195 @@
|
||||
package org.enso.base.encoding;
|
||||
|
||||
import org.enso.base.Encoding_Utils;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.io.Writer;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* A {@code Writer} which encodes any characters provided to itself using the provided {@code
|
||||
* CharsetEncoder} and passes the encoded data to the provided {@code OutputStream}.
|
||||
*
|
||||
* <p>Functionally, it should be equivalent to {@code java.io.OutputStreamWriter}. The major
|
||||
* difference is that this class allows more granular reporting of encoding issues - instead of just
|
||||
* replacing malformed characters with a replacement or failing at the first error, it allows to
|
||||
* both perform the replacements but also remember the positions at which the problems occurred and
|
||||
* then return a bulk report of places where the issues have been encountered.
|
||||
*/
|
||||
public class ReportingStreamEncoder extends Writer {
|
||||
|
||||
/**
|
||||
* Creates a writer which encodes characters and writes them to the provided output stream.
|
||||
*
|
||||
* <p>The encoder reports any malformed or unmappable characters as problems and replaces them
|
||||
* with the provided replacement sequence.
|
||||
*
|
||||
* <p>The encoder must be closed at the end of the encoding process to indicate that no further
|
||||
* data will be processed so that it can properly handle the finalization of encoding.
|
||||
*/
|
||||
public ReportingStreamEncoder(
|
||||
OutputStream outputStream, CharsetEncoder encoder, byte[] replacementSequence) {
|
||||
this.encoder = encoder;
|
||||
bufferedOutputStream = new BufferedOutputStream(outputStream);
|
||||
this.replacementSequence = replacementSequence;
|
||||
}
|
||||
|
||||
private final BufferedOutputStream bufferedOutputStream;
|
||||
private final CharsetEncoder encoder;
|
||||
|
||||
/**
|
||||
* The buffer keeping any input that has already been written but not encoded yet.
|
||||
*
|
||||
* <p>Between the calls to write, it satisfies the invariant that it is in 'reading' mode - to be
|
||||
* able to write to it, it needs to be reallocated, compacted or flipped.
|
||||
*/
|
||||
private CharBuffer inputBuffer = CharBuffer.allocate(0);
|
||||
|
||||
private int inputCharactersConsumedBeforeCurrentBuffer = 0;
|
||||
|
||||
private final byte[] replacementSequence;
|
||||
|
||||
private boolean wasClosed = false;
|
||||
|
||||
/**
|
||||
* The buffer re-used for storing encoded output before writing it to the output stream.
|
||||
*
|
||||
* <p>It is cleared after each call to write, so that it can be freshly re-used in the following
|
||||
* call. It is preserved only to avoid re-allocating a big buffer upon each call.
|
||||
*/
|
||||
private ByteBuffer outputBuffer = ByteBuffer.allocate(0);
|
||||
|
||||
private void ensureInputBufferHasEnoughFreeSpace(int bytesToAppend) {
|
||||
int freeSpaceInInputBuffer = inputBuffer.capacity() - inputBuffer.remaining();
|
||||
|
||||
// After either compacting the buffer or reallocating it, any remaining input is shifted to
|
||||
// the beginning of the buffer. Thus the bytes that preceded the current position are lost
|
||||
// (because they already have been processed), so we increase the counter to keep the global
|
||||
// position in the input.
|
||||
inputCharactersConsumedBeforeCurrentBuffer += inputBuffer.position();
|
||||
|
||||
if (freeSpaceInInputBuffer < bytesToAppend) {
|
||||
var old = inputBuffer;
|
||||
inputBuffer = CharBuffer.allocate(old.remaining() + bytesToAppend);
|
||||
inputBuffer.put(old);
|
||||
} else {
|
||||
inputBuffer.compact();
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the amount of characters that have already been consumed by the encoder. */
|
||||
private int getCurrentInputPosition() {
|
||||
return inputCharactersConsumedBeforeCurrentBuffer + inputBuffer.position();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(char[] cbuf, int off, int len) throws IOException {
|
||||
if (len < 0) {
|
||||
throw new IndexOutOfBoundsException();
|
||||
}
|
||||
|
||||
ensureInputBufferHasEnoughFreeSpace(len);
|
||||
inputBuffer.put(cbuf, off, len);
|
||||
|
||||
// We flip the input buffer back to reading mode, to be able to pass it to the encoder.
|
||||
inputBuffer.flip();
|
||||
|
||||
if (outputBuffer.capacity() == 0) {
|
||||
outputBuffer =
|
||||
ByteBuffer.allocate((int) (inputBuffer.remaining() * encoder.averageBytesPerChar()));
|
||||
}
|
||||
runEncoderOnInputBuffer();
|
||||
|
||||
bufferedOutputStream.write(outputBuffer.array(), 0, outputBuffer.position());
|
||||
outputBuffer.clear();
|
||||
}
|
||||
|
||||
private void runEncoderOnInputBuffer() {
|
||||
while (inputBuffer.hasRemaining()) {
|
||||
CoderResult cr = encoder.encode(inputBuffer, outputBuffer, false);
|
||||
|
||||
if (cr.isMalformed() || cr.isUnmappable()) {
|
||||
reportEncodingProblem();
|
||||
|
||||
while (outputBuffer.remaining() < replacementSequence.length) {
|
||||
growOutputBuffer();
|
||||
}
|
||||
|
||||
outputBuffer.put(replacementSequence);
|
||||
inputBuffer.position(inputBuffer.position() + cr.length());
|
||||
} else if (cr.isUnderflow()) {
|
||||
break;
|
||||
} else if (cr.isOverflow()) {
|
||||
growOutputBuffer();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A list of positions containing encoding issues like malformed characters.
|
||||
*
|
||||
* <p>Used for reporting warnings.
|
||||
*/
|
||||
List<Integer> encodingIssuePositions = new ArrayList<>();
|
||||
|
||||
private void reportEncodingProblem() {
|
||||
encodingIssuePositions.add(getCurrentInputPosition());
|
||||
}
|
||||
|
||||
public List<String> getReportedProblems() {
|
||||
if (encodingIssuePositions.isEmpty()) {
|
||||
return List.of();
|
||||
} else {
|
||||
if (encodingIssuePositions.size() == 1) {
|
||||
return List.of("Encoding issues at character " + encodingIssuePositions.get(0) + ".");
|
||||
}
|
||||
|
||||
String issues =
|
||||
encodingIssuePositions.stream()
|
||||
.map(String::valueOf)
|
||||
.collect(Collectors.joining(", ", "Encoding issues at characters ", "."));
|
||||
return List.of(issues);
|
||||
}
|
||||
}
|
||||
|
||||
private void growOutputBuffer() {
|
||||
outputBuffer = Encoding_Utils.resize(outputBuffer, ByteBuffer::allocate, ByteBuffer::put);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void flush() throws IOException {
|
||||
// We don't flush the encoder here, because the flush operation for the encoder is supposed to
|
||||
// be run at the very end, and for a Writer the flush may be called whenever and further write
|
||||
// operations may follow it. So we do the best we can - flush the underlying stream and keep the
|
||||
// encoder intact, ready for possible writes.
|
||||
bufferedOutputStream.flush();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
if (wasClosed) {
|
||||
return;
|
||||
}
|
||||
|
||||
while (encoder.encode(inputBuffer, outputBuffer, true).isOverflow()) {
|
||||
growOutputBuffer();
|
||||
}
|
||||
|
||||
while (encoder.flush(outputBuffer).isOverflow()) {
|
||||
growOutputBuffer();
|
||||
}
|
||||
|
||||
bufferedOutputStream.write(outputBuffer.array(), 0, outputBuffer.position());
|
||||
bufferedOutputStream.flush();
|
||||
bufferedOutputStream.close();
|
||||
wasClosed = true;
|
||||
}
|
||||
}
|
@ -129,7 +129,7 @@ spec =
|
||||
text = File.read_text file
|
||||
text.should_equal expected_text+'\n'
|
||||
|
||||
Test.specify "should correctly handle alternative encodings" pending="TODO: will be implemented in the next PR" <|
|
||||
Test.specify "should correctly handle alternative encodings" <|
|
||||
table = Table.new [["ąęćś", [0]], ["ß", ["żółw 🐢"]]]
|
||||
file = (Enso_Project.data / "transient" / "utf16.csv")
|
||||
file.delete_if_exists
|
||||
@ -140,7 +140,7 @@ spec =
|
||||
text = File.read_text file encoding=Encoding.utf_16_be
|
||||
text.should_equal expected_text+'\n'
|
||||
|
||||
Test.specify "should correctly handle encoding errors" pending="TODO: will be implemented in the next PR" <|
|
||||
Test.specify "should correctly handle encoding errors" <|
|
||||
table = Table.new [["A", [0, 1]], ["B", ["słówka", "🐢"]]]
|
||||
file = (Enso_Project.data / "transient" / "ascii.csv")
|
||||
file.delete_if_exists
|
||||
@ -152,7 +152,7 @@ spec =
|
||||
text = File.read_text file encoding=Encoding.ascii
|
||||
text.should_equal expected_text+'\n'
|
||||
result . should_equal Nothing
|
||||
Warning.get_all result . map .value . should_equal [Encoding_Error "Encoding issues at 7, 8, 15."]
|
||||
Warning.get_all result . map .value . should_equal [Encoding_Error "Encoding issues at characters 7, 8, 15."]
|
||||
|
||||
Test.specify "should allow only text columns if no formatter is specified" <|
|
||||
format = File_Format.Delimited "," value_formatter=Nothing
|
||||
|
@ -1,8 +1,8 @@
|
||||
from Standard.Base import all
|
||||
|
||||
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
|
||||
import Standard.Base.Error.Problem_Behavior
|
||||
|
||||
polyglot java import org.enso.base.Encoding_Utils
|
||||
polyglot java import java.nio.CharBuffer
|
||||
|
||||
import Standard.Test
|
||||
@ -11,33 +11,18 @@ import Standard.Test.Problems
|
||||
spec =
|
||||
windows_file = Enso_Project.data / "windows.txt"
|
||||
|
||||
read_file_one_by_one file java_charset expected_size expected_problems=[] =
|
||||
file.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_java_stream java_stream->
|
||||
Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder->
|
||||
codepoints = 0.up_to expected_size . map _->
|
||||
reporting_stream_decoder.read
|
||||
reporting_stream_decoder.read.should_equal -1
|
||||
|
||||
problems = Vector.Vector reporting_stream_decoder.getReportedProblems
|
||||
problems.should_equal expected_problems
|
||||
|
||||
Text.from_codepoints codepoints
|
||||
|
||||
Test.group "ReportingStreamDecoder" <|
|
||||
Test.specify "should allow reading a file character by character" <|
|
||||
f = Enso_Project.data / "short.txt"
|
||||
f.delete_if_exists
|
||||
f.exists.should_be_false
|
||||
"Cup".write f
|
||||
java_charset = Encoding.utf_8.to_java_charset
|
||||
f.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_java_stream java_stream->
|
||||
Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder->
|
||||
reporting_stream_decoder.read.should_equal 67
|
||||
reporting_stream_decoder.read.should_equal 117
|
||||
reporting_stream_decoder.read.should_equal 112
|
||||
reporting_stream_decoder.read.should_equal -1
|
||||
stream.with_stream_decoder Encoding.utf_8 reporting_stream_decoder->
|
||||
reporting_stream_decoder.read.should_equal 67
|
||||
reporting_stream_decoder.read.should_equal 117
|
||||
reporting_stream_decoder.read.should_equal 112
|
||||
reporting_stream_decoder.read.should_equal -1
|
||||
f.delete
|
||||
f.exists.should_be_false
|
||||
|
||||
@ -46,82 +31,109 @@ spec =
|
||||
fragment = 'Hello 😎🚀🚧!'
|
||||
contents = 1.up_to 1000 . map _->fragment . join '\n'
|
||||
contents.write f
|
||||
java_charset = Encoding.utf_8.to_java_charset
|
||||
|
||||
all_codepoints = Vector.new_builder
|
||||
read_chars decoder n =
|
||||
buffer = CharBuffer.allocate n
|
||||
chars_read = decoder.read buffer
|
||||
if chars_read == -1 then Nothing else
|
||||
buffer.flip
|
||||
v = Vector.new_builder
|
||||
transfer_codepoints _ =
|
||||
if buffer.hasRemaining.not then Nothing else
|
||||
char = buffer.get
|
||||
v.append char
|
||||
all_codepoints.append char
|
||||
@Tail_Call transfer_codepoints Nothing
|
||||
transfer_codepoints Nothing
|
||||
v.to_vector
|
||||
case here.read_characters decoder n of
|
||||
Nothing -> Nothing
|
||||
chars ->
|
||||
chars.each all_codepoints.append
|
||||
chars
|
||||
|
||||
f.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_java_stream java_stream->
|
||||
Encoding_Utils.with_stream_decoder java_stream java_charset decoder->
|
||||
read_chars decoder 1 . should_equal "H".codepoints
|
||||
read_chars decoder 2 . should_equal "el".codepoints
|
||||
read_chars decoder 3 . should_equal "lo ".codepoints
|
||||
v1 = read_chars decoder 6
|
||||
Text.from_codepoints v1 . should_equal '😎🚀🚧'
|
||||
result = f.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_stream_decoder Encoding.utf_8 Problem_Behavior.Report_Error decoder->
|
||||
read_chars decoder 1 . should_equal "H".codepoints
|
||||
read_chars decoder 2 . should_equal "el".codepoints
|
||||
read_chars decoder 3 . should_equal "lo ".codepoints
|
||||
v1 = read_chars decoder 6
|
||||
Text.from_codepoints v1 . should_equal '😎🚀🚧'
|
||||
|
||||
v2 = read_chars decoder 200
|
||||
## Here we show that while the decoder is trying to read
|
||||
200 codepoints, some codepoints require more than one
|
||||
byte in UTF-8 to represent, so the actual result
|
||||
should be slightly smaller.
|
||||
(v2.length < 200) . should_be_true
|
||||
v2 = read_chars decoder 200
|
||||
## Here we show that while the decoder is trying to read
|
||||
200 codepoints, some codepoints require more than one
|
||||
byte in UTF-8 to represent, so the actual result
|
||||
should be slightly smaller.
|
||||
(v2.length < 200) . should_be_true
|
||||
|
||||
## Now we read increasingly larger amounts, to trigger
|
||||
and test all paths of the input buffer resizing
|
||||
mechanism.
|
||||
read_chars decoder 40
|
||||
read_chars decoder 500
|
||||
read_chars decoder 1000
|
||||
read_chars decoder 1
|
||||
read_chars decoder 2
|
||||
read_chars decoder 10
|
||||
## Now we read increasingly larger amounts, to trigger
|
||||
and test all paths of the input buffer resizing
|
||||
mechanism.
|
||||
read_chars decoder 40
|
||||
read_chars decoder 500
|
||||
read_chars decoder 1000
|
||||
read_chars decoder 1
|
||||
read_chars decoder 2
|
||||
read_chars decoder 10
|
||||
|
||||
## Finally read all the remaining contents of the file
|
||||
to verify they were decoded correctly as a whole.
|
||||
read_rest _ =
|
||||
case read_chars decoder 100 of
|
||||
Nothing -> Nothing
|
||||
_ -> @Tail_Call read_rest Nothing
|
||||
read_rest Nothing
|
||||
## Finally read all the remaining contents of the file
|
||||
to verify they were decoded correctly as a whole.
|
||||
read_rest _ =
|
||||
case read_chars decoder 100 of
|
||||
Nothing -> Nothing
|
||||
_ -> @Tail_Call read_rest Nothing
|
||||
read_rest Nothing
|
||||
Text.from_codepoints all_codepoints.to_vector . should_equal contents
|
||||
result . should_equal Nothing
|
||||
f.delete
|
||||
|
||||
Test.specify "should allow reading a UTF-8 file" <|
|
||||
f = Enso_Project.data / "transient" / "utf8.txt"
|
||||
encoding = Encoding.utf_8
|
||||
java_charset = encoding.to_java_charset
|
||||
((0.up_to 100).map _->'Hello World!' . join '\n').write f
|
||||
expected_contents = f.read_text
|
||||
contents = read_file_one_by_one f java_charset expected_contents.length
|
||||
contents = here.read_file_one_by_one f encoding expected_contents.length
|
||||
contents.should_equal expected_contents
|
||||
|
||||
Test.specify "should allow reading a Windows file" <|
|
||||
encoding = Encoding.windows_1252
|
||||
java_charset = encoding.to_java_charset
|
||||
expected_contents = "Hello World! $¢¤¥"
|
||||
contents = read_file_one_by_one windows_file java_charset expected_contents.length
|
||||
contents = here.read_file_one_by_one windows_file encoding expected_contents.length
|
||||
contents.should_equal expected_contents
|
||||
|
||||
Test.specify "should raise warnings when reading invalid characters" <|
|
||||
encoding = Encoding.ascii
|
||||
java_charset = encoding.to_java_charset
|
||||
expected_contents = 'Hello World! $\uFFFD\uFFFD\uFFFD'
|
||||
expected_problems = ["Encoding issues at bytes 14, 15, 16."]
|
||||
contents = read_file_one_by_one windows_file java_charset expected_contents.length expected_problems=expected_problems
|
||||
contents.should_equal expected_contents
|
||||
expected_problems = [Encoding_Error "Encoding issues at bytes 14, 15, 16."]
|
||||
contents_1 = here.read_file_one_by_one windows_file encoding expected_contents.length on_problems=Problem_Behavior.Report_Warning
|
||||
contents_1.should_equal expected_contents
|
||||
Warning.get_all contents_1 . map .value . should_equal expected_problems
|
||||
|
||||
contents_2 = windows_file.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_stream_decoder encoding Problem_Behavior.Report_Warning reporting_stream_decoder->
|
||||
codepoint_1 = reporting_stream_decoder.read
|
||||
codepoints_1 = here.read_characters reporting_stream_decoder 5
|
||||
codepoints_2 = here.read_characters reporting_stream_decoder 3
|
||||
codepoints_3 = here.read_characters reporting_stream_decoder 100
|
||||
reporting_stream_decoder.read.should_equal -1
|
||||
Text.from_codepoints <| [codepoint_1]+codepoints_1+codepoints_2+codepoints_3
|
||||
contents_2.should_equal expected_contents
|
||||
Warning.get_all contents_2 . map .value . should_equal expected_problems
|
||||
|
||||
Test.specify "should work correctly if no data is read from it" <|
|
||||
result = windows_file.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_stream_decoder Encoding.ascii Problem_Behavior.Report_Error _->Nothing
|
||||
result.should_equal Nothing
|
||||
|
||||
read_file_one_by_one file encoding expected_size on_problems=Problem_Behavior.Report_Error =
|
||||
file.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_stream_decoder encoding on_problems reporting_stream_decoder->
|
||||
codepoints = 0.up_to expected_size . map _->
|
||||
reporting_stream_decoder.read
|
||||
reporting_stream_decoder.read.should_equal -1
|
||||
|
||||
Text.from_codepoints codepoints
|
||||
|
||||
## Test helper: reads up to `n` characters from `decoder` into a
   `CharBuffer` and returns them as a vector of codepoints, or `Nothing`
   if the decoder has already reached end of stream.

   Arguments:
   - decoder: the stream decoder to read from.
   - n: the maximum number of characters to read in one call.
read_characters decoder n =
    buffer = CharBuffer.allocate n
    chars_read = decoder.read buffer
    if chars_read == -1 then Nothing else
        # Switch the buffer from write mode to read mode so the characters
        # just decoded can be drained from it.
        buffer.flip
        v = Vector.new_builder
        # Tail-recursively moves every remaining character from the buffer
        # into the vector builder.
        transfer_codepoints _ =
            if buffer.hasRemaining.not then Nothing else
                char = buffer.get
                v.append char
                @Tail_Call transfer_codepoints Nothing
        transfer_codepoints Nothing
        v.to_vector
|
||||
|
||||
# Entry point allowing this spec to be run as a standalone test suite.
main = Test.Suite.run_main here.spec
|
||||
|
91
test/Tests/src/System/Reporting_Stream_Encoder_Spec.enso
Normal file
91
test/Tests/src/System/Reporting_Stream_Encoder_Spec.enso
Normal file
@ -0,0 +1,91 @@
|
||||
from Standard.Base import all
|
||||
|
||||
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
|
||||
import Standard.Base.Error.Problem_Behavior
|
||||
|
||||
polyglot java import org.enso.base.Encoding_Utils
|
||||
polyglot java import java.nio.CharBuffer
|
||||
|
||||
import Standard.Test
|
||||
import Standard.Test.Problems
|
||||
|
||||
## Test suite for `ReportingStreamEncoder`: verifies that text written
   through the encoder is stored correctly for various encodings and
   write-chunk sizes, and that unencodable characters are replaced and
   reported as `Encoding_Error` warnings.
spec =
    Test.group "ReportingStreamEncoder" <|
        Test.specify "should allow writing a file codepoint by codepoint" <|
            f = Enso_Project.data / "transient" / "char-by-char.txt"
            f.delete_if_exists
            f.exists.should_be_false
            contents = 1.up_to 7 . map _->'Cześc\u0301 😎🚀🚧!' . join '\n'
            f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
                stream.with_stream_encoder Encoding.utf_8 Problem_Behavior.Report_Error reporting_stream_encoder->
                    # Writing one character at a time exercises the encoder's
                    # handling of minimal partial inputs (including combining
                    # marks and emoji from the test string).
                    contents.char_vector.each char->
                        reporting_stream_encoder.write char
            f.read_text.should_equal contents

        Test.specify "should work correctly when writing chunks of varying sizes" <|
            f = Enso_Project.data / "transient" / "varying-utf16.txt"
            f.delete_if_exists
            f.exists.should_be_false
            encoding = Encoding.utf_16_be
            big = 1.up_to 7 . map _->'Cześc\u0301 😎🚀🚧!' . join '\n'
            f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
                stream.with_stream_encoder encoding Problem_Behavior.Report_Error reporting_stream_encoder->
                    # Mix of 1-char, multi-char and large writes to exercise
                    # internal buffering boundaries of the encoder.
                    reporting_stream_encoder.write "A"
                    reporting_stream_encoder.write "Abc"
                    reporting_stream_encoder.write "Defghi"
                    reporting_stream_encoder.write 'O\u0301'
                    reporting_stream_encoder.write "X"
                    reporting_stream_encoder.write big
                    reporting_stream_encoder.write "Y"
                    reporting_stream_encoder.write "Ź"

            contents = 'AAbcDefghiO\u0301X' + big + "YŹ"
            f.read_text encoding . should_equal contents

        Test.specify "should allow writing a Windows file" <|
            f = Enso_Project.data / "transient" / "windows.txt"
            encoding = Encoding.windows_1252
            # Characters chosen to be representable in Windows-1252.
            contents = "Hello World! $¢¤¥"

            f.delete_if_exists
            f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
                stream.with_stream_encoder encoding Problem_Behavior.Report_Error reporting_stream_encoder->
                    reporting_stream_encoder.write contents

            f.read_text encoding . should_equal contents

        Test.specify "should raise warnings when writing characters that cannot be encoded and replace them with the Unicode replacement character or a question mark" <|
            f = Enso_Project.data / "transient" / "ascii.txt"
            encoding = Encoding.ascii
            # 'ł' and the combining acute cannot be represented in ASCII.
            contents = 'Sło\u0301wka!'
            f.delete_if_exists
            result = f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
                stream.with_stream_encoder encoding Problem_Behavior.Report_Warning reporting_stream_encoder->
                    reporting_stream_encoder.write contents
            result . should_equal Nothing
            # The warning reports the positions of the unencodable characters.
            Warning.get_all result . map .value . should_equal [Encoding_Error "Encoding issues at characters 1, 3."]
            f.read_text encoding . should_equal "S?o?wka!"

            f.delete_if_exists
            result_2 = f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
                stream.with_stream_encoder encoding Problem_Behavior.Report_Warning reporting_stream_encoder->
                    # Unencodable characters split across separate writes should
                    # still be reported with positions relative to the whole
                    # output, not to the individual chunks.
                    reporting_stream_encoder.write "ABC"
                    reporting_stream_encoder.write "ą"
                    reporting_stream_encoder.write "foo"
                    reporting_stream_encoder.write " -🚧- "
                    reporting_stream_encoder.write "bar"

            result_2 . should_equal Nothing
            Warning.get_all result_2 . map .value . should_equal [Encoding_Error "Encoding issues at characters 3, 9."]
            f.read_text encoding . should_equal "ABC?foo -?- bar"

        Test.specify "should work correctly if no data is written to it" <|
            f = Enso_Project.data / "transient" / "empty.txt"
            encoding = Encoding.ascii
            f.delete_if_exists
            result = f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
                stream.with_stream_encoder encoding Problem_Behavior.Report_Error _->Nothing
            result . should_equal Nothing
            f.read_text encoding . should_equal ""
|
||||
|
||||
# Entry point allowing this spec to be run as a standalone test suite.
main = Test.Suite.run_main here.spec
|
Loading…
Reference in New Issue
Block a user