Custom Encoding support when writing Delimited files (#3564)

Implements https://www.pivotaltracker.com/story/show/182545847
This commit is contained in:
Radosław Waśko 2022-07-07 02:20:00 +02:00 committed by GitHub
parent d950499a90
commit 7c94fa6a77
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 478 additions and 117 deletions

View File

@ -147,6 +147,7 @@
- [Added `File_Format.Delimited` support to `Table.write` for new files.][3528]
- [Adjusted `Database.connect` API to new design.][3542]
- [Added `File_Format.Excel` support to `Table.write` for new files.][3551]
- [Added support for custom encodings in `File_Format.Delimited` writing.][3564]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -232,6 +233,7 @@
[3528]: https://github.com/enso-org/enso/pull/3528
[3542]: https://github.com/enso-org/enso/pull/3542
[3551]: https://github.com/enso-org/enso/pull/3551
[3564]: https://github.com/enso-org/enso/pull/3564
[3552]: https://github.com/enso-org/enso/pull/3552
#### Enso Compiler

View File

@ -5,11 +5,8 @@ import sbt.Keys.{libraryDependencies, scalacOptions}
import sbt.addCompilerPlugin
import sbt.complete.DefaultParsers._
import sbt.complete.Parser
import sbtcrossproject.CrossPlugin.autoImport.{crossProject, CrossType}
import src.main.scala.licenses.{
DistributionDescription,
SBTDistributionComponent
}
import sbtcrossproject.CrossPlugin.autoImport.{CrossType, crossProject}
import src.main.scala.licenses.{DistributionDescription, SBTDistributionComponent}
import java.io.File
@ -17,9 +14,9 @@ import java.io.File
// === Global Configuration ===================================================
// ============================================================================
val scalacVersion = "2.13.7"
val graalVersion = "21.3.0"
val javaVersion = "11"
val scalacVersion = "2.13.7"
val graalVersion = "21.3.0"
val javaVersion = "11"
val defaultDevEnsoVersion = "0.0.0-dev"
val ensoVersion = sys.env.getOrElse(
"ENSO_VERSION",

View File

@ -2,14 +2,16 @@ from Standard.Base import all
import Standard.Base.System.File.Option
import Standard.Base.System.File.Existing_File_Behavior
import Standard.Base.Error.Problem_Behavior
import Standard.Base.Data.Text.Matching_Mode
import Standard.Base.Data.Text.Text_Sub_Range
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Report_Warning
from Standard.Base.Runtime.Resource import all
export Standard.Base.System.File.Option
polyglot java import org.enso.base.Encoding_Utils
polyglot java import java.io.InputStream as Java_Input_Stream
polyglot java import java.io.OutputStream as Java_Output_Stream
polyglot java import java.io.IOException
@ -781,6 +783,22 @@ type Output_Stream
with_java_stream : (Java_Output_Stream -> Any) -> Any
with_java_stream f = self.stream_resource . with f
## PRIVATE
   Runs an action with a `ReportingStreamEncoder` encoding data to the
   output stream with the specified encoding.

   Any encoding problems reported by the encoder are converted to
   `Encoding_Error` values and attached to the result according to
   `on_problems`.
with_stream_encoder : Encoding -> Problem_Behavior -> (ReportingStreamEncoder -> Any) -> Any
with_stream_encoder encoding on_problems action = self.with_java_stream java_stream->
    ## We ignore any warnings raised by the `bytes` method, because if the
       original Unicode replacement character failed to encode, the `bytes`
       method will have replaced it with the simple `?` sign which should be
       available in all encodings. And this is exactly the behavior we want:
       if available, we use the `\uFFFD` replacement character and otherwise
       we fallback to the `?` character.
    replacement_sequence = Encoding_Utils.INVALID_CHARACTER.bytes encoding on_problems=Problem_Behavior.Ignore
    java_charset = encoding.to_java_charset
    results = Encoding_Utils.with_stream_encoder java_stream java_charset replacement_sequence.to_array action
    problems = Vector.Vector results.problems . map Encoding_Error
    on_problems.attach_problems_after results.result problems
## An input stream, allowing for interactive reading of contents from an open
file.
@ -906,6 +924,15 @@ type Input_Stream
with_java_stream : (Java_Input_Stream -> Any) -> Any
with_java_stream f = self.stream_resource . with f
## PRIVATE
   Runs an action with a `ReportingStreamDecoder` decoding data from the
   input stream with the specified encoding.

   Any decoding problems reported by the decoder are converted to
   `Encoding_Error` values and attached to the result according to
   `on_problems`.
with_stream_decoder : Encoding -> Problem_Behavior -> (ReportingStreamDecoder -> Any) -> Any
with_stream_decoder encoding on_problems action = self.stream_resource . with java_stream->
    java_charset = encoding.to_java_charset
    results = Encoding_Utils.with_stream_decoder java_stream java_charset action
    problems = Vector.Vector results.problems . map Encoding_Error
    on_problems.attach_problems_after results.result problems
## PRIVATE

View File

@ -44,8 +44,7 @@ read_file format file on_problems =
exceptions), we can catch the exception indicating the limit has been
reached and restart parsing with an increased limit.
file.with_input_stream [File.Option.Read] stream->
stream.with_java_stream java_stream->
here.read_stream format java_stream on_problems related_file=file
here.read_stream format stream on_problems related_file=file
read_text : Text -> Delimited -> Problem_Behavior -> Table
read_text text format on_problems =
@ -57,7 +56,7 @@ read_text text format on_problems =
Arguments:
- format: The specification of the delimited file format.
- java_stream: A Java `InputStream` used as the data source.
- stream: An `Input_Stream` to be used as the data source.
- on_problems: Specifies the behavior when a problem occurs during the
operation. By default, a warning is issued, but the operation proceeds.
If set to `Report_Error`, the operation fails with a dataflow error.
@ -67,17 +66,14 @@ read_text text format on_problems =
integer.
- related_file: The file related to the provided `java_stream`, if available,
or `Nothing`. It is used for more detailed error reporting.
read_stream : Delimited -> InputStream -> Problem_Behavior -> Integer -> File | Nothing -> Any
read_stream format java_stream on_problems max_columns=4096 related_file=Nothing =
read_stream : Delimited -> Input_Stream -> Problem_Behavior -> Integer -> File | Nothing -> Any
read_stream format stream on_problems max_columns=4096 related_file=Nothing =
handle_io_exception ~action = Panic.catch IOException action caught_panic->
Error.throw (File.wrap_io_exception related_file caught_panic.payload.cause)
java_charset = format.encoding.to_java_charset
handle_io_exception <|
Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder->
result = here.read_from_reader format reporting_stream_decoder on_problems max_columns
decoding_problems = Vector.Vector reporting_stream_decoder.getReportedProblems . map Encoding_Error
on_problems.attach_problems_after result decoding_problems
stream.with_stream_decoder format.encoding on_problems reporting_stream_decoder->
here.read_from_reader format reporting_stream_decoder on_problems max_columns
## PRIVATE
Reads data from the provided `Reader` according to the provided format.

View File

@ -36,8 +36,7 @@ write_file table format file on_existing_file on_problems =
Errors.unimplemented "Appending to an existing File_Format.Delimited file is not implemented yet."
_ ->
on_existing_file.write file stream->
stream.with_java_stream java_stream->
here.write_to_stream table format java_stream on_problems related_file=file
here.write_to_stream table format stream on_problems related_file=file
## PRIVATE
Returns a Text value representing the table in the delimited format.
@ -53,25 +52,21 @@ write_text table format =
Arguments:
- table: The table to serialize.
- format: The specification of the delimited file format.
- java_stream: A Java `OutputStream` used as the data destination.
- stream: An `Output_Stream` used as the data destination.
- on_problems: Specifies the behavior when a problem occurs during the
operation. By default, a warning is issued, but the operation proceeds.
If set to `Report_Error`, the operation fails with a dataflow error.
If set to `Ignore`, the operation proceeds without errors or warnings.
- related_file: The file related to the provided `java_stream`, if available,
or `Nothing`. It is used for more detailed error reporting.
write_to_stream : Table -> File_Format.Delimited -> OutputStream -> Problem_Behavior -> File | Nothing -> Any
write_to_stream table format java_stream on_problems related_file=Nothing =
write_to_stream : Table -> File_Format.Delimited -> Output_Stream -> Problem_Behavior -> File | Nothing -> Any
write_to_stream table format stream on_problems related_file=Nothing =
handle_io_exception ~action = Panic.catch IOException action caught_panic->
Error.throw (File.wrap_io_exception related_file caught_panic.payload.cause)
# TODO handling encoding
#java_charset = format.encoding.to_java_charset
_ = on_problems
handle_io_exception <|
# TODO create a writer that will use the appropriate encoding and handle mismatches
writer = PrintWriter.new java_stream
here.write_to_writer table format writer
stream.with_stream_encoder format.encoding on_problems reporting_stream_encoder->
here.write_to_writer table format reporting_stream_encoder
## PRIVATE
Writes data to the provided `Writer` according to the provided format.

View File

@ -1,21 +1,20 @@
package org.enso.base;
import org.enso.base.encoding.ReportingStreamDecoder;
import org.enso.base.encoding.ReportingStreamEncoder;
import org.enso.base.text.ResultWithWarnings;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.*;
import java.util.Arrays;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.function.IntFunction;
import org.enso.base.encoding.ReportingStreamDecoder;
import org.enso.base.text.ResultWithWarnings;
public class Encoding_Utils {
/** The replacement character used for characters that could not have been decoded. */
@ -164,13 +163,55 @@ public class Encoding_Utils {
/**
* A helper function which runs an action with a created stream decoder and closes it afterwards.
*
* <p>It returns the result returned from the executed action and any encoding problems that
* occurred when processing it.
*/
public static <R> R with_stream_decoder(
public static <R> WithProblems<R, String> with_stream_decoder(
InputStream stream, Charset charset, Function<ReportingStreamDecoder, R> action)
throws IOException {
try (ReportingStreamDecoder decoder = create_stream_decoder(stream, charset)) {
return action.apply(decoder);
R result;
ReportingStreamDecoder decoder = create_stream_decoder(stream, charset);
try {
result = action.apply(decoder);
} finally {
decoder.close();
}
return new WithProblems<>(result, decoder.getReportedProblems());
}
/** Creates a new instance of {@code ReportingStreamEncoder} encoding a given charset. */
private static ReportingStreamEncoder create_stream_encoder(
    OutputStream stream, Charset charset, byte[] replacementSequence) {
  // Report (rather than silently substitute) both malformed input and unmappable
  // characters, so that the encoder can record their positions as problems.
  CharsetEncoder charsetEncoder = charset.newEncoder();
  charsetEncoder.onMalformedInput(CodingErrorAction.REPORT);
  charsetEncoder.onUnmappableCharacter(CodingErrorAction.REPORT);
  charsetEncoder.reset();
  return new ReportingStreamEncoder(stream, charsetEncoder, replacementSequence);
}
/**
 * A helper function which runs an action with a created stream encoder and closes it afterwards.
 *
 * <p>It returns the result returned from the executed action and any encoding problems that
 * occurred when processing it.
 *
 * @param stream the destination for the encoded bytes
 * @param charset the charset used to encode characters
 * @param replacementSequence bytes substituted for characters that cannot be encoded
 * @param action the action receiving the encoder; its result is wrapped together with problems
 * @return the action's result paired with any encoding problems reported by the encoder
 */
public static <R> WithProblems<R, String> with_stream_encoder(
    OutputStream stream,
    Charset charset,
    byte[] replacementSequence,
    Function<ReportingStreamEncoder, R> action)
    throws IOException {
  ReportingStreamEncoder encoder = create_stream_encoder(stream, charset, replacementSequence);
  R result;
  // try-with-resources (Java 9+) so that an exception thrown by `action` is not masked
  // by a secondary exception from `close()`; the latter is attached as suppressed.
  try (encoder) {
    result = action.apply(encoder);
  }
  return new WithProblems<>(result, encoder.getReportedProblems());
}
/**

View File

@ -0,0 +1,5 @@
package org.enso.base;
import java.util.List;
/**
 * A pair of an operation's result and the list of problems encountered while computing it.
 *
 * @param result the value produced by the operation
 * @param problems the problems (for example, encoding issues) reported during the operation
 */
public record WithProblems<ResultType, ProblemType>(ResultType result, List<ProblemType> problems) {}

View File

@ -0,0 +1,195 @@
package org.enso.base.encoding;
import org.enso.base.Encoding_Utils;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
/**
* A {@code Writer} which encodes any characters provided to itself using the provided {@code
* CharsetEncoder} and passes the encoded data to the provided {@code OutputStream}.
*
* <p>Functionally, it should be equivalent to {@code java.io.OutputStreamWriter}. The major
* difference is that this class allows more granular reporting of encoding issues - instead of just
* replacing malformed characters with a replacement or failing at the first error, it allows to
* both perform the replacements but also remember the positions at which the problems occurred and
* then return a bulk report of places where the issues have been encountered.
*/
public class ReportingStreamEncoder extends Writer {
  /**
   * Creates a writer which encodes characters and writes them to the provided output stream.
   *
   * <p>The encoder reports any malformed or unmappable characters as problems and replaces them
   * with the provided replacement sequence.
   *
   * <p>The encoder must be closed at the end of the encoding process to indicate that no further
   * data will be processed so that it can properly handle the finalization of encoding.
   */
  public ReportingStreamEncoder(
      OutputStream outputStream, CharsetEncoder encoder, byte[] replacementSequence) {
    this.encoder = encoder;
    bufferedOutputStream = new BufferedOutputStream(outputStream);
    this.replacementSequence = replacementSequence;
  }

  private final BufferedOutputStream bufferedOutputStream;
  private final CharsetEncoder encoder;

  /**
   * The buffer keeping any input that has already been written but not encoded yet.
   *
   * <p>Between the calls to write, it satisfies the invariant that it is in 'reading' mode - to be
   * able to write to it, it needs to be reallocated, compacted or flipped.
   */
  private CharBuffer inputBuffer = CharBuffer.allocate(0);

  // Characters consumed in earlier incarnations of `inputBuffer` (before it was compacted or
  // reallocated); added to the current buffer position to report problem positions in global
  // input coordinates.
  private int inputCharactersConsumedBeforeCurrentBuffer = 0;

  // The byte sequence substituted for each character that could not be encoded.
  private final byte[] replacementSequence;

  // Ensures the end-of-input finalization in `close()` runs at most once.
  private boolean wasClosed = false;

  /**
   * The buffer re-used for storing encoded output before writing it to the output stream.
   *
   * <p>It is cleared after each call to write, so that it can be freshly re-used in the following
   * call. It is preserved only to avoid re-allocating a big buffer upon each call.
   */
  private ByteBuffer outputBuffer = ByteBuffer.allocate(0);

  // Makes sure at least `bytesToAppend` more characters can be appended to `inputBuffer`, either
  // by compacting it or by reallocating a larger buffer. Afterwards the buffer is in 'writing'
  // mode.
  private void ensureInputBufferHasEnoughFreeSpace(int bytesToAppend) {
    int freeSpaceInInputBuffer = inputBuffer.capacity() - inputBuffer.remaining();
    // After either compacting the buffer or reallocating it, any remaining input is shifted to
    // the beginning of the buffer. Thus the bytes that preceded the current position are lost
    // (because they already have been processed), so we increase the counter to keep the global
    // position in the input.
    inputCharactersConsumedBeforeCurrentBuffer += inputBuffer.position();
    if (freeSpaceInInputBuffer < bytesToAppend) {
      var old = inputBuffer;
      inputBuffer = CharBuffer.allocate(old.remaining() + bytesToAppend);
      inputBuffer.put(old);
    } else {
      inputBuffer.compact();
    }
  }

  /** Returns the amount of characters that have already been consumed by the encoder. */
  private int getCurrentInputPosition() {
    return inputCharactersConsumedBeforeCurrentBuffer + inputBuffer.position();
  }

  @Override
  public void write(char[] cbuf, int off, int len) throws IOException {
    if (len < 0) {
      throw new IndexOutOfBoundsException();
    }
    ensureInputBufferHasEnoughFreeSpace(len);
    inputBuffer.put(cbuf, off, len);
    // We flip the input buffer back to reading mode, to be able to pass it to the encoder.
    inputBuffer.flip();

    // Lazily size the output buffer on first use, based on the encoder's average
    // bytes-per-character estimate; it is grown later if the estimate turns out too small.
    if (outputBuffer.capacity() == 0) {
      outputBuffer =
          ByteBuffer.allocate((int) (inputBuffer.remaining() * encoder.averageBytesPerChar()));
    }

    runEncoderOnInputBuffer();
    bufferedOutputStream.write(outputBuffer.array(), 0, outputBuffer.position());
    outputBuffer.clear();
  }

  // Encodes as much of `inputBuffer` as possible into `outputBuffer`, substituting the
  // replacement sequence for malformed or unmappable characters and recording their positions.
  private void runEncoderOnInputBuffer() {
    while (inputBuffer.hasRemaining()) {
      CoderResult cr = encoder.encode(inputBuffer, outputBuffer, false);
      if (cr.isMalformed() || cr.isUnmappable()) {
        reportEncodingProblem();
        while (outputBuffer.remaining() < replacementSequence.length) {
          growOutputBuffer();
        }
        outputBuffer.put(replacementSequence);
        // Skip over the offending input characters; `cr.length()` is how many were involved.
        inputBuffer.position(inputBuffer.position() + cr.length());
      } else if (cr.isUnderflow()) {
        // Not enough input left to encode a complete character - wait for the next write
        // (or for `close()` to finalize).
        break;
      } else if (cr.isOverflow()) {
        growOutputBuffer();
      }
    }
  }

  /**
   * A list of positions containing encoding issues like malformed characters.
   *
   * <p>Used for reporting warnings.
   */
  List<Integer> encodingIssuePositions = new ArrayList<>();

  private void reportEncodingProblem() {
    encodingIssuePositions.add(getCurrentInputPosition());
  }

  // Summarizes all recorded problem positions as human-readable warning messages.
  public List<String> getReportedProblems() {
    if (encodingIssuePositions.isEmpty()) {
      return List.of();
    } else {
      if (encodingIssuePositions.size() == 1) {
        return List.of("Encoding issues at character " + encodingIssuePositions.get(0) + ".");
      }

      String issues =
          encodingIssuePositions.stream()
              .map(String::valueOf)
              .collect(Collectors.joining(", ", "Encoding issues at characters ", "."));
      return List.of(issues);
    }
  }

  private void growOutputBuffer() {
    outputBuffer = Encoding_Utils.resize(outputBuffer, ByteBuffer::allocate, ByteBuffer::put);
  }

  @Override
  public void flush() throws IOException {
    // We don't flush the encoder here, because the flush operation for the encoder is supposed to
    // be run at the very end, and for a Writer the flush may be called whenever and further write
    // operations may follow it. So we do the best we can - flush the underlying stream and keep the
    // encoder intact, ready for possible writes.
    bufferedOutputStream.flush();
  }

  @Override
  public void close() throws IOException {
    if (wasClosed) {
      return;
    }
    // Signal end-of-input to the encoder and flush its internal state, growing the output
    // buffer as needed, then write out any remaining bytes and close the underlying stream.
    while (encoder.encode(inputBuffer, outputBuffer, true).isOverflow()) {
      growOutputBuffer();
    }
    while (encoder.flush(outputBuffer).isOverflow()) {
      growOutputBuffer();
    }
    bufferedOutputStream.write(outputBuffer.array(), 0, outputBuffer.position());
    bufferedOutputStream.flush();
    bufferedOutputStream.close();
    wasClosed = true;
  }
}

View File

@ -129,7 +129,7 @@ spec =
text = File.read_text file
text.should_equal expected_text+'\n'
Test.specify "should correctly handle alternative encodings" pending="TODO: will be implemented in the next PR" <|
Test.specify "should correctly handle alternative encodings" <|
table = Table.new [["ąęćś", [0]], ["ß", ["żółw 🐢"]]]
file = (Enso_Project.data / "transient" / "utf16.csv")
file.delete_if_exists
@ -140,7 +140,7 @@ spec =
text = File.read_text file encoding=Encoding.utf_16_be
text.should_equal expected_text+'\n'
Test.specify "should correctly handle encoding errors" pending="TODO: will be implemented in the next PR" <|
Test.specify "should correctly handle encoding errors" <|
table = Table.new [["A", [0, 1]], ["B", ["słówka", "🐢"]]]
file = (Enso_Project.data / "transient" / "ascii.csv")
file.delete_if_exists
@ -152,7 +152,7 @@ spec =
text = File.read_text file encoding=Encoding.ascii
text.should_equal expected_text+'\n'
result . should_equal Nothing
Warning.get_all result . map .value . should_equal [Encoding_Error "Encoding issues at 7, 8, 15."]
Warning.get_all result . map .value . should_equal [Encoding_Error "Encoding issues at characters 7, 8, 15."]
Test.specify "should allow only text columns if no formatter is specified" <|
format = File_Format.Delimited "," value_formatter=Nothing

View File

@ -1,8 +1,8 @@
from Standard.Base import all
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
import Standard.Base.Error.Problem_Behavior
polyglot java import org.enso.base.Encoding_Utils
polyglot java import java.nio.CharBuffer
import Standard.Test
@ -11,33 +11,18 @@ import Standard.Test.Problems
spec =
windows_file = Enso_Project.data / "windows.txt"
read_file_one_by_one file java_charset expected_size expected_problems=[] =
file.with_input_stream [File.Option.Read] stream->
stream.with_java_stream java_stream->
Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder->
codepoints = 0.up_to expected_size . map _->
reporting_stream_decoder.read
reporting_stream_decoder.read.should_equal -1
problems = Vector.Vector reporting_stream_decoder.getReportedProblems
problems.should_equal expected_problems
Text.from_codepoints codepoints
Test.group "ReportingStreamDecoder" <|
Test.specify "should allow reading a file character by character" <|
f = Enso_Project.data / "short.txt"
f.delete_if_exists
f.exists.should_be_false
"Cup".write f
java_charset = Encoding.utf_8.to_java_charset
f.with_input_stream [File.Option.Read] stream->
stream.with_java_stream java_stream->
Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder->
reporting_stream_decoder.read.should_equal 67
reporting_stream_decoder.read.should_equal 117
reporting_stream_decoder.read.should_equal 112
reporting_stream_decoder.read.should_equal -1
stream.with_stream_decoder Encoding.utf_8 reporting_stream_decoder->
reporting_stream_decoder.read.should_equal 67
reporting_stream_decoder.read.should_equal 117
reporting_stream_decoder.read.should_equal 112
reporting_stream_decoder.read.should_equal -1
f.delete
f.exists.should_be_false
@ -46,82 +31,109 @@ spec =
fragment = 'Hello 😎🚀🚧!'
contents = 1.up_to 1000 . map _->fragment . join '\n'
contents.write f
java_charset = Encoding.utf_8.to_java_charset
all_codepoints = Vector.new_builder
read_chars decoder n =
buffer = CharBuffer.allocate n
chars_read = decoder.read buffer
if chars_read == -1 then Nothing else
buffer.flip
v = Vector.new_builder
transfer_codepoints _ =
if buffer.hasRemaining.not then Nothing else
char = buffer.get
v.append char
all_codepoints.append char
@Tail_Call transfer_codepoints Nothing
transfer_codepoints Nothing
v.to_vector
case here.read_characters decoder n of
Nothing -> Nothing
chars ->
chars.each all_codepoints.append
chars
f.with_input_stream [File.Option.Read] stream->
stream.with_java_stream java_stream->
Encoding_Utils.with_stream_decoder java_stream java_charset decoder->
read_chars decoder 1 . should_equal "H".codepoints
read_chars decoder 2 . should_equal "el".codepoints
read_chars decoder 3 . should_equal "lo ".codepoints
v1 = read_chars decoder 6
Text.from_codepoints v1 . should_equal '😎🚀🚧'
result = f.with_input_stream [File.Option.Read] stream->
stream.with_stream_decoder Encoding.utf_8 Problem_Behavior.Report_Error decoder->
read_chars decoder 1 . should_equal "H".codepoints
read_chars decoder 2 . should_equal "el".codepoints
read_chars decoder 3 . should_equal "lo ".codepoints
v1 = read_chars decoder 6
Text.from_codepoints v1 . should_equal '😎🚀🚧'
v2 = read_chars decoder 200
## Here we show that while the decoder is trying to read
200 codepoints, some codepoints require more than one
byte in UTF-8 to represent, so the actual result
should be slightly smaller.
(v2.length < 200) . should_be_true
v2 = read_chars decoder 200
## Here we show that while the decoder is trying to read
200 codepoints, some codepoints require more than one
byte in UTF-8 to represent, so the actual result
should be slightly smaller.
(v2.length < 200) . should_be_true
## Now we read increasingly larger amounts, to trigger
and test all paths of the input buffer resizing
mechanism.
read_chars decoder 40
read_chars decoder 500
read_chars decoder 1000
read_chars decoder 1
read_chars decoder 2
read_chars decoder 10
## Now we read increasingly larger amounts, to trigger
and test all paths of the input buffer resizing
mechanism.
read_chars decoder 40
read_chars decoder 500
read_chars decoder 1000
read_chars decoder 1
read_chars decoder 2
read_chars decoder 10
## Finally read all the remaining contents of the file
to verify they were decoded correctly as a whole.
read_rest _ =
case read_chars decoder 100 of
Nothing -> Nothing
_ -> @Tail_Call read_rest Nothing
read_rest Nothing
## Finally read all the remaining contents of the file
to verify they were decoded correctly as a whole.
read_rest _ =
case read_chars decoder 100 of
Nothing -> Nothing
_ -> @Tail_Call read_rest Nothing
read_rest Nothing
Text.from_codepoints all_codepoints.to_vector . should_equal contents
result . should_equal Nothing
f.delete
Test.specify "should allow reading a UTF-8 file" <|
f = Enso_Project.data / "transient" / "utf8.txt"
encoding = Encoding.utf_8
java_charset = encoding.to_java_charset
((0.up_to 100).map _->'Hello World!' . join '\n').write f
expected_contents = f.read_text
contents = read_file_one_by_one f java_charset expected_contents.length
contents = here.read_file_one_by_one f encoding expected_contents.length
contents.should_equal expected_contents
Test.specify "should allow reading a Windows file" <|
encoding = Encoding.windows_1252
java_charset = encoding.to_java_charset
expected_contents = "Hello World! $¢¤¥"
contents = read_file_one_by_one windows_file java_charset expected_contents.length
contents = here.read_file_one_by_one windows_file encoding expected_contents.length
contents.should_equal expected_contents
Test.specify "should raise warnings when reading invalid characters" <|
encoding = Encoding.ascii
java_charset = encoding.to_java_charset
expected_contents = 'Hello World! $\uFFFD\uFFFD\uFFFD'
expected_problems = ["Encoding issues at bytes 14, 15, 16."]
contents = read_file_one_by_one windows_file java_charset expected_contents.length expected_problems=expected_problems
contents.should_equal expected_contents
expected_problems = [Encoding_Error "Encoding issues at bytes 14, 15, 16."]
contents_1 = here.read_file_one_by_one windows_file encoding expected_contents.length on_problems=Problem_Behavior.Report_Warning
contents_1.should_equal expected_contents
Warning.get_all contents_1 . map .value . should_equal expected_problems
contents_2 = windows_file.with_input_stream [File.Option.Read] stream->
stream.with_stream_decoder encoding Problem_Behavior.Report_Warning reporting_stream_decoder->
codepoint_1 = reporting_stream_decoder.read
codepoints_1 = here.read_characters reporting_stream_decoder 5
codepoints_2 = here.read_characters reporting_stream_decoder 3
codepoints_3 = here.read_characters reporting_stream_decoder 100
reporting_stream_decoder.read.should_equal -1
Text.from_codepoints <| [codepoint_1]+codepoints_1+codepoints_2+codepoints_3
contents_2.should_equal expected_contents
Warning.get_all contents_2 . map .value . should_equal expected_problems
Test.specify "should work correctly if no data is read from it" <|
result = windows_file.with_input_stream [File.Option.Read] stream->
stream.with_stream_decoder Encoding.ascii Problem_Behavior.Report_Error _->Nothing
result.should_equal Nothing
read_file_one_by_one file encoding expected_size on_problems=Problem_Behavior.Report_Error =
file.with_input_stream [File.Option.Read] stream->
stream.with_stream_decoder encoding on_problems reporting_stream_decoder->
codepoints = 0.up_to expected_size . map _->
reporting_stream_decoder.read
reporting_stream_decoder.read.should_equal -1
Text.from_codepoints codepoints
read_characters decoder n =
buffer = CharBuffer.allocate n
chars_read = decoder.read buffer
if chars_read == -1 then Nothing else
buffer.flip
v = Vector.new_builder
transfer_codepoints _ =
if buffer.hasRemaining.not then Nothing else
char = buffer.get
v.append char
@Tail_Call transfer_codepoints Nothing
transfer_codepoints Nothing
v.to_vector
main = Test.Suite.run_main here.spec

View File

@ -0,0 +1,91 @@
from Standard.Base import all
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
import Standard.Base.Error.Problem_Behavior
polyglot java import org.enso.base.Encoding_Utils
polyglot java import java.nio.CharBuffer
import Standard.Test
import Standard.Test.Problems
spec =
Test.group "ReportingStreamEncoder" <|
Test.specify "should allow writing a file codepoint by codepoint" <|
f = Enso_Project.data / "transient" / "char-by-char.txt"
f.delete_if_exists
f.exists.should_be_false
contents = 1.up_to 7 . map _->'Cześc\u0301 😎🚀🚧!' . join '\n'
f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
stream.with_stream_encoder Encoding.utf_8 Problem_Behavior.Report_Error reporting_stream_encoder->
contents.char_vector.each char->
reporting_stream_encoder.write char
f.read_text.should_equal contents
Test.specify "should work correctly when writing chunks of varying sizes" <|
f = Enso_Project.data / "transient" / "varying-utf16.txt"
f.delete_if_exists
f.exists.should_be_false
encoding = Encoding.utf_16_be
big = 1.up_to 7 . map _->'Cześc\u0301 😎🚀🚧!' . join '\n'
f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
stream.with_stream_encoder encoding Problem_Behavior.Report_Error reporting_stream_encoder->
reporting_stream_encoder.write "A"
reporting_stream_encoder.write "Abc"
reporting_stream_encoder.write "Defghi"
reporting_stream_encoder.write 'O\u0301'
reporting_stream_encoder.write "X"
reporting_stream_encoder.write big
reporting_stream_encoder.write "Y"
reporting_stream_encoder.write "Ź"
contents = 'AAbcDefghiO\u0301X' + big + "YŹ"
f.read_text encoding . should_equal contents
Test.specify "should allow writing a Windows file" <|
f = Enso_Project.data / "transient" / "windows.txt"
encoding = Encoding.windows_1252
contents = "Hello World! $¢¤¥"
f.delete_if_exists
f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
stream.with_stream_encoder encoding Problem_Behavior.Report_Error reporting_stream_encoder->
reporting_stream_encoder.write contents
f.read_text encoding . should_equal contents
Test.specify "should raise warnings when writing characters that cannot be encoded and replace them with the Unicode replacement character or a question mark" <|
f = Enso_Project.data / "transient" / "ascii.txt"
encoding = Encoding.ascii
contents = 'Sło\u0301wka!'
f.delete_if_exists
result = f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
stream.with_stream_encoder encoding Problem_Behavior.Report_Warning reporting_stream_encoder->
reporting_stream_encoder.write contents
result . should_equal Nothing
Warning.get_all result . map .value . should_equal [Encoding_Error "Encoding issues at characters 1, 3."]
f.read_text encoding . should_equal "S?o?wka!"
f.delete_if_exists
result_2 = f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
stream.with_stream_encoder encoding Problem_Behavior.Report_Warning reporting_stream_encoder->
reporting_stream_encoder.write "ABC"
reporting_stream_encoder.write "ą"
reporting_stream_encoder.write "foo"
reporting_stream_encoder.write " -🚧- "
reporting_stream_encoder.write "bar"
result_2 . should_equal Nothing
Warning.get_all result_2 . map .value . should_equal [Encoding_Error "Encoding issues at characters 3, 9."]
f.read_text encoding . should_equal "ABC?foo -?- bar"
Test.specify "should work correctly if no data is written to it" <|
f = Enso_Project.data / "transient" / "empty.txt"
encoding = Encoding.ascii
f.delete_if_exists
result = f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
stream.with_stream_encoder encoding Problem_Behavior.Report_Error _->Nothing
result . should_equal Nothing
f.read_text encoding . should_equal ""
main = Test.Suite.run_main here.spec