mirror of
https://github.com/enso-org/enso.git
synced 2024-12-27 21:12:48 +03:00
Custom Encoding support when writing Delimited files (#3564)
Implements https://www.pivotaltracker.com/story/show/182545847
This commit is contained in:
parent
d950499a90
commit
7c94fa6a77
@ -147,6 +147,7 @@
|
||||
- [Added `File_Format.Delimited` support to `Table.write` for new files.][3528]
|
||||
- [Adjusted `Database.connect` API to new design.][3542]
|
||||
- [Added `File_Format.Excel` support to `Table.write` for new files.][3551]
|
||||
- [Added support for custom encodings in `File_Format.Delimited` writing.][3564]
|
||||
|
||||
[debug-shortcuts]:
|
||||
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
|
||||
@ -232,6 +233,7 @@
|
||||
[3528]: https://github.com/enso-org/enso/pull/3528
|
||||
[3542]: https://github.com/enso-org/enso/pull/3542
|
||||
[3551]: https://github.com/enso-org/enso/pull/3551
|
||||
[3564]: https://github.com/enso-org/enso/pull/3564
|
||||
[3552]: https://github.com/enso-org/enso/pull/3552
|
||||
|
||||
#### Enso Compiler
|
||||
|
13
build.sbt
13
build.sbt
@ -5,11 +5,8 @@ import sbt.Keys.{libraryDependencies, scalacOptions}
|
||||
import sbt.addCompilerPlugin
|
||||
import sbt.complete.DefaultParsers._
|
||||
import sbt.complete.Parser
|
||||
import sbtcrossproject.CrossPlugin.autoImport.{crossProject, CrossType}
|
||||
import src.main.scala.licenses.{
|
||||
DistributionDescription,
|
||||
SBTDistributionComponent
|
||||
}
|
||||
import sbtcrossproject.CrossPlugin.autoImport.{CrossType, crossProject}
|
||||
import src.main.scala.licenses.{DistributionDescription, SBTDistributionComponent}
|
||||
|
||||
import java.io.File
|
||||
|
||||
@ -17,9 +14,9 @@ import java.io.File
|
||||
// === Global Configuration ===================================================
|
||||
// ============================================================================
|
||||
|
||||
val scalacVersion = "2.13.7"
|
||||
val graalVersion = "21.3.0"
|
||||
val javaVersion = "11"
|
||||
val scalacVersion = "2.13.7"
|
||||
val graalVersion = "21.3.0"
|
||||
val javaVersion = "11"
|
||||
val defaultDevEnsoVersion = "0.0.0-dev"
|
||||
val ensoVersion = sys.env.getOrElse(
|
||||
"ENSO_VERSION",
|
||||
|
@ -2,14 +2,16 @@ from Standard.Base import all
|
||||
|
||||
import Standard.Base.System.File.Option
|
||||
import Standard.Base.System.File.Existing_File_Behavior
|
||||
import Standard.Base.Error.Problem_Behavior
|
||||
import Standard.Base.Data.Text.Matching_Mode
|
||||
import Standard.Base.Data.Text.Text_Sub_Range
|
||||
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding
|
||||
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
|
||||
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Report_Warning
|
||||
from Standard.Base.Runtime.Resource import all
|
||||
|
||||
export Standard.Base.System.File.Option
|
||||
|
||||
polyglot java import org.enso.base.Encoding_Utils
|
||||
polyglot java import java.io.InputStream as Java_Input_Stream
|
||||
polyglot java import java.io.OutputStream as Java_Output_Stream
|
||||
polyglot java import java.io.IOException
|
||||
@ -781,6 +783,22 @@ type Output_Stream
|
||||
with_java_stream : (Java_Output_Stream -> Any) -> Any
|
||||
with_java_stream f = self.stream_resource . with f
|
||||
|
||||
## PRIVATE
|
||||
Runs an action with a `ReportingStreamEncoder` encoding data to the
|
||||
output stream with the specified encoding.
|
||||
with_stream_encoder : Encoding -> Problem_Behavior -> (ReportingStreamEncoder -> Any) -> Any
|
||||
with_stream_encoder encoding on_problems action = self.with_java_stream java_stream->
|
||||
## We ignore any warnings raised by the `bytes` method, because if the
|
||||
original Unicode replacement character failed to encode, the `bytes`
|
||||
method will have replaced it with the simple `?` sign which should be
|
||||
available in all encodings. And this is exactly the behavior we want:
|
||||
if available, we use the `<60>` character and otherwise we fallback to
|
||||
the `?` character.
|
||||
replacement_sequence = Encoding_Utils.INVALID_CHARACTER.bytes encoding on_problems=Problem_Behavior.Ignore
|
||||
java_charset = encoding.to_java_charset
|
||||
results = Encoding_Utils.with_stream_encoder java_stream java_charset replacement_sequence.to_array action
|
||||
problems = Vector.Vector results.problems . map Encoding_Error
|
||||
on_problems.attach_problems_after results.result problems
|
||||
|
||||
## An input stream, allowing for interactive reading of contents from an open
|
||||
file.
|
||||
@ -906,6 +924,15 @@ type Input_Stream
|
||||
with_java_stream : (Java_Input_Stream -> Any) -> Any
|
||||
with_java_stream f = self.stream_resource . with f
|
||||
|
||||
## PRIVATE
|
||||
Runs an action with a `ReportingStreamDecoder` decoding data from the
|
||||
input stream with the specified encoding.
|
||||
with_stream_decoder : Encoding -> Problem_Behavior -> (ReportingStreamDecoder -> Any) -> Any
|
||||
with_stream_decoder encoding on_problems action = self.stream_resource . with java_stream->
|
||||
java_charset = encoding.to_java_charset
|
||||
results = Encoding_Utils.with_stream_decoder java_stream java_charset action
|
||||
problems = Vector.Vector results.problems . map Encoding_Error
|
||||
on_problems.attach_problems_after results.result problems
|
||||
|
||||
## PRIVATE
|
||||
|
||||
|
@ -44,8 +44,7 @@ read_file format file on_problems =
|
||||
exceptions), we can catch the exception indicating the limit has been
|
||||
reached and restart parsing with an increased limit.
|
||||
file.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_java_stream java_stream->
|
||||
here.read_stream format java_stream on_problems related_file=file
|
||||
here.read_stream format stream on_problems related_file=file
|
||||
|
||||
read_text : Text -> Delimited -> Problem_Behavior -> Table
|
||||
read_text text format on_problems =
|
||||
@ -57,7 +56,7 @@ read_text text format on_problems =
|
||||
|
||||
Arguments:
|
||||
- format: The specification of the delimited file format.
|
||||
- java_stream: A Java `InputStream` used as the data source.
|
||||
- stream: An `Input_Stream` to be used as the data source.
|
||||
- on_problems: Specifies the behavior when a problem occurs during the
|
||||
operation. By default, a warning is issued, but the operation proceeds.
|
||||
If set to `Report_Error`, the operation fails with a dataflow error.
|
||||
@ -67,17 +66,14 @@ read_text text format on_problems =
|
||||
integer.
|
||||
- related_file: The file related to the provided `java_stream`, if available,
|
||||
or `Nothing`. It is used for more detailed error reporting.
|
||||
read_stream : Delimited -> InputStream -> Problem_Behavior -> Integer -> File | Nothing -> Any
|
||||
read_stream format java_stream on_problems max_columns=4096 related_file=Nothing =
|
||||
read_stream : Delimited -> Input_Stream -> Problem_Behavior -> Integer -> File | Nothing -> Any
|
||||
read_stream format stream on_problems max_columns=4096 related_file=Nothing =
|
||||
handle_io_exception ~action = Panic.catch IOException action caught_panic->
|
||||
Error.throw (File.wrap_io_exception related_file caught_panic.payload.cause)
|
||||
|
||||
java_charset = format.encoding.to_java_charset
|
||||
handle_io_exception <|
|
||||
Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder->
|
||||
result = here.read_from_reader format reporting_stream_decoder on_problems max_columns
|
||||
decoding_problems = Vector.Vector reporting_stream_decoder.getReportedProblems . map Encoding_Error
|
||||
on_problems.attach_problems_after result decoding_problems
|
||||
stream.with_stream_decoder format.encoding on_problems reporting_stream_decoder->
|
||||
here.read_from_reader format reporting_stream_decoder on_problems max_columns
|
||||
|
||||
## PRIVATE
|
||||
Reads data from the provided `Reader` according to the provided format.
|
||||
|
@ -36,8 +36,7 @@ write_file table format file on_existing_file on_problems =
|
||||
Errors.unimplemented "Appending to an existing File_Format.Delimited file is not implemented yet."
|
||||
_ ->
|
||||
on_existing_file.write file stream->
|
||||
stream.with_java_stream java_stream->
|
||||
here.write_to_stream table format java_stream on_problems related_file=file
|
||||
here.write_to_stream table format stream on_problems related_file=file
|
||||
|
||||
## PRIVATE
|
||||
Returns a Text value representing the table in the delimited format.
|
||||
@ -53,25 +52,21 @@ write_text table format =
|
||||
Arguments:
|
||||
- table: The table to serialize.
|
||||
- format: The specification of the delimited file format.
|
||||
- java_stream: A Java `OutputStream` used as the data destination.
|
||||
- stream: An `Output_Stream` used as the data destination.
|
||||
- on_problems: Specifies the behavior when a problem occurs during the
|
||||
operation. By default, a warning is issued, but the operation proceeds.
|
||||
If set to `Report_Error`, the operation fails with a dataflow error.
|
||||
If set to `Ignore`, the operation proceeds without errors or warnings.
|
||||
- related_file: The file related to the provided `java_stream`, if available,
|
||||
or `Nothing`. It is used for more detailed error reporting.
|
||||
write_to_stream : Table -> File_Format.Delimited -> OutputStream -> Problem_Behavior -> File | Nothing -> Any
|
||||
write_to_stream table format java_stream on_problems related_file=Nothing =
|
||||
write_to_stream : Table -> File_Format.Delimited -> Output_Stream -> Problem_Behavior -> File | Nothing -> Any
|
||||
write_to_stream table format stream on_problems related_file=Nothing =
|
||||
handle_io_exception ~action = Panic.catch IOException action caught_panic->
|
||||
Error.throw (File.wrap_io_exception related_file caught_panic.payload.cause)
|
||||
|
||||
# TODO handling encoding
|
||||
#java_charset = format.encoding.to_java_charset
|
||||
_ = on_problems
|
||||
handle_io_exception <|
|
||||
# TODO create a writer that will use the appropriate encoding and handle mismatches
|
||||
writer = PrintWriter.new java_stream
|
||||
here.write_to_writer table format writer
|
||||
stream.with_stream_encoder format.encoding on_problems reporting_stream_encoder->
|
||||
here.write_to_writer table format reporting_stream_encoder
|
||||
|
||||
## PRIVATE
|
||||
Writes data to the provided `Writer` according to the provided format.
|
||||
|
@ -1,21 +1,20 @@
|
||||
package org.enso.base;
|
||||
|
||||
import org.enso.base.encoding.ReportingStreamDecoder;
|
||||
import org.enso.base.encoding.ReportingStreamEncoder;
|
||||
import org.enso.base.text.ResultWithWarnings;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.Buffer;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.nio.charset.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.IntFunction;
|
||||
import org.enso.base.encoding.ReportingStreamDecoder;
|
||||
import org.enso.base.text.ResultWithWarnings;
|
||||
|
||||
public class Encoding_Utils {
|
||||
/** The replacement character used for characters that could not have been decoded. */
|
||||
@ -164,13 +163,55 @@ public class Encoding_Utils {
|
||||
|
||||
/**
|
||||
* A helper function which runs an action with a created stream decoder and closes it afterwards.
|
||||
*
|
||||
* <p>It returns the result returned from the executed action and any encoding problems that
|
||||
* occurred when processing it.
|
||||
*/
|
||||
public static <R> R with_stream_decoder(
|
||||
public static <R> WithProblems<R, String> with_stream_decoder(
|
||||
InputStream stream, Charset charset, Function<ReportingStreamDecoder, R> action)
|
||||
throws IOException {
|
||||
try (ReportingStreamDecoder decoder = create_stream_decoder(stream, charset)) {
|
||||
return action.apply(decoder);
|
||||
R result;
|
||||
ReportingStreamDecoder decoder = create_stream_decoder(stream, charset);
|
||||
try {
|
||||
result = action.apply(decoder);
|
||||
} finally {
|
||||
decoder.close();
|
||||
}
|
||||
return new WithProblems<>(result, decoder.getReportedProblems());
|
||||
}
|
||||
|
||||
/** Creates a new instance of {@code ReportingStreamEncoder} encoding a given charset. */
|
||||
private static ReportingStreamEncoder create_stream_encoder(
|
||||
OutputStream stream, Charset charset, byte[] replacementSequence) {
|
||||
CharsetEncoder encoder =
|
||||
charset
|
||||
.newEncoder()
|
||||
.onMalformedInput(CodingErrorAction.REPORT)
|
||||
.onUnmappableCharacter(CodingErrorAction.REPORT)
|
||||
.reset();
|
||||
return new ReportingStreamEncoder(stream, encoder, replacementSequence);
|
||||
}
|
||||
|
||||
/**
|
||||
* A helper function which runs an action with a created stream encoder and closes it afterwards.
|
||||
*
|
||||
* <p>It returns the result returned from the executed action and any encoding problems that
|
||||
* occurred when processing it.
|
||||
*/
|
||||
public static <R> WithProblems<R, String> with_stream_encoder(
|
||||
OutputStream stream,
|
||||
Charset charset,
|
||||
byte[] replacementSequence,
|
||||
Function<ReportingStreamEncoder, R> action)
|
||||
throws IOException {
|
||||
R result;
|
||||
ReportingStreamEncoder encoder = create_stream_encoder(stream, charset, replacementSequence);
|
||||
try {
|
||||
result = action.apply(encoder);
|
||||
} finally {
|
||||
encoder.close();
|
||||
}
|
||||
return new WithProblems<>(result, encoder.getReportedProblems());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -0,0 +1,5 @@
|
||||
package org.enso.base;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public record WithProblems<ResultType, ProblemType>(ResultType result, List<ProblemType> problems) {}
|
@ -0,0 +1,195 @@
|
||||
package org.enso.base.encoding;
|
||||
|
||||
import org.enso.base.Encoding_Utils;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.io.Writer;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* A {@code Writer} which encodes any characters provided to itself using the provided {@code
|
||||
* CharsetEncoder} and passes the encoded data to the provided {@code OutputStream}.
|
||||
*
|
||||
* <p>Functionally, it should be equivalent to {@code java.io.OutputStreamWriter}. The major
|
||||
* difference is that this class allows more granular reporting of encoding issues - instead of just
|
||||
* replacing malformed characters with a replacement or failing at the first error, it allows to
|
||||
* both perform the replacements but also remember the positions at which the problems occurred and
|
||||
* then return a bulk report of places where the issues have been encountered.
|
||||
*/
|
||||
public class ReportingStreamEncoder extends Writer {
|
||||
|
||||
/**
|
||||
* Creates a writer which encodes characters and writes them to the provided output stream.
|
||||
*
|
||||
* <p>The encoder reports any malformed or unmappable characters as problems and replaces them
|
||||
* with the provided replacement sequence.
|
||||
*
|
||||
* <p>The encoder must be closed at the end of the encoding process to indicate that no further
|
||||
* data will be processed so that it can properly handle the finalization of encoding.
|
||||
*/
|
||||
public ReportingStreamEncoder(
|
||||
OutputStream outputStream, CharsetEncoder encoder, byte[] replacementSequence) {
|
||||
this.encoder = encoder;
|
||||
bufferedOutputStream = new BufferedOutputStream(outputStream);
|
||||
this.replacementSequence = replacementSequence;
|
||||
}
|
||||
|
||||
private final BufferedOutputStream bufferedOutputStream;
|
||||
private final CharsetEncoder encoder;
|
||||
|
||||
/**
|
||||
* The buffer keeping any input that has already been written but not encoded yet.
|
||||
*
|
||||
* <p>Between the calls to write, it satisfies the invariant that it is in 'reading' mode - to be
|
||||
* able to write to it, it needs to be reallocated, compacted or flipped.
|
||||
*/
|
||||
private CharBuffer inputBuffer = CharBuffer.allocate(0);
|
||||
|
||||
private int inputCharactersConsumedBeforeCurrentBuffer = 0;
|
||||
|
||||
private final byte[] replacementSequence;
|
||||
|
||||
private boolean wasClosed = false;
|
||||
|
||||
/**
|
||||
* The buffer re-used for storing encoded output before writing it to the output stream.
|
||||
*
|
||||
* <p>It is cleared after each call to write, so that it can be freshly re-used in the following
|
||||
* call. It is preserved only to avoid re-allocating a big buffer upon each call.
|
||||
*/
|
||||
private ByteBuffer outputBuffer = ByteBuffer.allocate(0);
|
||||
|
||||
private void ensureInputBufferHasEnoughFreeSpace(int bytesToAppend) {
|
||||
int freeSpaceInInputBuffer = inputBuffer.capacity() - inputBuffer.remaining();
|
||||
|
||||
// After either compacting the buffer or reallocating it, any remaining input is shifted to
|
||||
// the beginning of the buffer. Thus the bytes that preceded the current position are lost
|
||||
// (because they already have been processed), so we increase the counter to keep the global
|
||||
// position in the input.
|
||||
inputCharactersConsumedBeforeCurrentBuffer += inputBuffer.position();
|
||||
|
||||
if (freeSpaceInInputBuffer < bytesToAppend) {
|
||||
var old = inputBuffer;
|
||||
inputBuffer = CharBuffer.allocate(old.remaining() + bytesToAppend);
|
||||
inputBuffer.put(old);
|
||||
} else {
|
||||
inputBuffer.compact();
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the amount of characters that have already been consumed by the encoder. */
|
||||
private int getCurrentInputPosition() {
|
||||
return inputCharactersConsumedBeforeCurrentBuffer + inputBuffer.position();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(char[] cbuf, int off, int len) throws IOException {
|
||||
if (len < 0) {
|
||||
throw new IndexOutOfBoundsException();
|
||||
}
|
||||
|
||||
ensureInputBufferHasEnoughFreeSpace(len);
|
||||
inputBuffer.put(cbuf, off, len);
|
||||
|
||||
// We flip the input buffer back to reading mode, to be able to pass it to the encoder.
|
||||
inputBuffer.flip();
|
||||
|
||||
if (outputBuffer.capacity() == 0) {
|
||||
outputBuffer =
|
||||
ByteBuffer.allocate((int) (inputBuffer.remaining() * encoder.averageBytesPerChar()));
|
||||
}
|
||||
runEncoderOnInputBuffer();
|
||||
|
||||
bufferedOutputStream.write(outputBuffer.array(), 0, outputBuffer.position());
|
||||
outputBuffer.clear();
|
||||
}
|
||||
|
||||
private void runEncoderOnInputBuffer() {
|
||||
while (inputBuffer.hasRemaining()) {
|
||||
CoderResult cr = encoder.encode(inputBuffer, outputBuffer, false);
|
||||
|
||||
if (cr.isMalformed() || cr.isUnmappable()) {
|
||||
reportEncodingProblem();
|
||||
|
||||
while (outputBuffer.remaining() < replacementSequence.length) {
|
||||
growOutputBuffer();
|
||||
}
|
||||
|
||||
outputBuffer.put(replacementSequence);
|
||||
inputBuffer.position(inputBuffer.position() + cr.length());
|
||||
} else if (cr.isUnderflow()) {
|
||||
break;
|
||||
} else if (cr.isOverflow()) {
|
||||
growOutputBuffer();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A list of positions containing encoding issues like malformed characters.
|
||||
*
|
||||
* <p>Used for reporting warnings.
|
||||
*/
|
||||
List<Integer> encodingIssuePositions = new ArrayList<>();
|
||||
|
||||
private void reportEncodingProblem() {
|
||||
encodingIssuePositions.add(getCurrentInputPosition());
|
||||
}
|
||||
|
||||
public List<String> getReportedProblems() {
|
||||
if (encodingIssuePositions.isEmpty()) {
|
||||
return List.of();
|
||||
} else {
|
||||
if (encodingIssuePositions.size() == 1) {
|
||||
return List.of("Encoding issues at character " + encodingIssuePositions.get(0) + ".");
|
||||
}
|
||||
|
||||
String issues =
|
||||
encodingIssuePositions.stream()
|
||||
.map(String::valueOf)
|
||||
.collect(Collectors.joining(", ", "Encoding issues at characters ", "."));
|
||||
return List.of(issues);
|
||||
}
|
||||
}
|
||||
|
||||
private void growOutputBuffer() {
|
||||
outputBuffer = Encoding_Utils.resize(outputBuffer, ByteBuffer::allocate, ByteBuffer::put);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void flush() throws IOException {
|
||||
// We don't flush the encoder here, because the flush operation for the encoder is supposed to
|
||||
// be run at the very end, and for a Writer the flush may be called whenever and further write
|
||||
// operations may follow it. So we do the best we can - flush the underlying stream and keep the
|
||||
// encoder intact, ready for possible writes.
|
||||
bufferedOutputStream.flush();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
if (wasClosed) {
|
||||
return;
|
||||
}
|
||||
|
||||
while (encoder.encode(inputBuffer, outputBuffer, true).isOverflow()) {
|
||||
growOutputBuffer();
|
||||
}
|
||||
|
||||
while (encoder.flush(outputBuffer).isOverflow()) {
|
||||
growOutputBuffer();
|
||||
}
|
||||
|
||||
bufferedOutputStream.write(outputBuffer.array(), 0, outputBuffer.position());
|
||||
bufferedOutputStream.flush();
|
||||
bufferedOutputStream.close();
|
||||
wasClosed = true;
|
||||
}
|
||||
}
|
@ -129,7 +129,7 @@ spec =
|
||||
text = File.read_text file
|
||||
text.should_equal expected_text+'\n'
|
||||
|
||||
Test.specify "should correctly handle alternative encodings" pending="TODO: will be implemented in the next PR" <|
|
||||
Test.specify "should correctly handle alternative encodings" <|
|
||||
table = Table.new [["ąęćś", [0]], ["ß", ["żółw 🐢"]]]
|
||||
file = (Enso_Project.data / "transient" / "utf16.csv")
|
||||
file.delete_if_exists
|
||||
@ -140,7 +140,7 @@ spec =
|
||||
text = File.read_text file encoding=Encoding.utf_16_be
|
||||
text.should_equal expected_text+'\n'
|
||||
|
||||
Test.specify "should correctly handle encoding errors" pending="TODO: will be implemented in the next PR" <|
|
||||
Test.specify "should correctly handle encoding errors" <|
|
||||
table = Table.new [["A", [0, 1]], ["B", ["słówka", "🐢"]]]
|
||||
file = (Enso_Project.data / "transient" / "ascii.csv")
|
||||
file.delete_if_exists
|
||||
@ -152,7 +152,7 @@ spec =
|
||||
text = File.read_text file encoding=Encoding.ascii
|
||||
text.should_equal expected_text+'\n'
|
||||
result . should_equal Nothing
|
||||
Warning.get_all result . map .value . should_equal [Encoding_Error "Encoding issues at 7, 8, 15."]
|
||||
Warning.get_all result . map .value . should_equal [Encoding_Error "Encoding issues at characters 7, 8, 15."]
|
||||
|
||||
Test.specify "should allow only text columns if no formatter is specified" <|
|
||||
format = File_Format.Delimited "," value_formatter=Nothing
|
||||
|
@ -1,8 +1,8 @@
|
||||
from Standard.Base import all
|
||||
|
||||
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
|
||||
import Standard.Base.Error.Problem_Behavior
|
||||
|
||||
polyglot java import org.enso.base.Encoding_Utils
|
||||
polyglot java import java.nio.CharBuffer
|
||||
|
||||
import Standard.Test
|
||||
@ -11,33 +11,18 @@ import Standard.Test.Problems
|
||||
spec =
|
||||
windows_file = Enso_Project.data / "windows.txt"
|
||||
|
||||
read_file_one_by_one file java_charset expected_size expected_problems=[] =
|
||||
file.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_java_stream java_stream->
|
||||
Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder->
|
||||
codepoints = 0.up_to expected_size . map _->
|
||||
reporting_stream_decoder.read
|
||||
reporting_stream_decoder.read.should_equal -1
|
||||
|
||||
problems = Vector.Vector reporting_stream_decoder.getReportedProblems
|
||||
problems.should_equal expected_problems
|
||||
|
||||
Text.from_codepoints codepoints
|
||||
|
||||
Test.group "ReportingStreamDecoder" <|
|
||||
Test.specify "should allow reading a file character by character" <|
|
||||
f = Enso_Project.data / "short.txt"
|
||||
f.delete_if_exists
|
||||
f.exists.should_be_false
|
||||
"Cup".write f
|
||||
java_charset = Encoding.utf_8.to_java_charset
|
||||
f.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_java_stream java_stream->
|
||||
Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder->
|
||||
reporting_stream_decoder.read.should_equal 67
|
||||
reporting_stream_decoder.read.should_equal 117
|
||||
reporting_stream_decoder.read.should_equal 112
|
||||
reporting_stream_decoder.read.should_equal -1
|
||||
stream.with_stream_decoder Encoding.utf_8 reporting_stream_decoder->
|
||||
reporting_stream_decoder.read.should_equal 67
|
||||
reporting_stream_decoder.read.should_equal 117
|
||||
reporting_stream_decoder.read.should_equal 112
|
||||
reporting_stream_decoder.read.should_equal -1
|
||||
f.delete
|
||||
f.exists.should_be_false
|
||||
|
||||
@ -46,82 +31,109 @@ spec =
|
||||
fragment = 'Hello 😎🚀🚧!'
|
||||
contents = 1.up_to 1000 . map _->fragment . join '\n'
|
||||
contents.write f
|
||||
java_charset = Encoding.utf_8.to_java_charset
|
||||
|
||||
all_codepoints = Vector.new_builder
|
||||
read_chars decoder n =
|
||||
buffer = CharBuffer.allocate n
|
||||
chars_read = decoder.read buffer
|
||||
if chars_read == -1 then Nothing else
|
||||
buffer.flip
|
||||
v = Vector.new_builder
|
||||
transfer_codepoints _ =
|
||||
if buffer.hasRemaining.not then Nothing else
|
||||
char = buffer.get
|
||||
v.append char
|
||||
all_codepoints.append char
|
||||
@Tail_Call transfer_codepoints Nothing
|
||||
transfer_codepoints Nothing
|
||||
v.to_vector
|
||||
case here.read_characters decoder n of
|
||||
Nothing -> Nothing
|
||||
chars ->
|
||||
chars.each all_codepoints.append
|
||||
chars
|
||||
|
||||
f.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_java_stream java_stream->
|
||||
Encoding_Utils.with_stream_decoder java_stream java_charset decoder->
|
||||
read_chars decoder 1 . should_equal "H".codepoints
|
||||
read_chars decoder 2 . should_equal "el".codepoints
|
||||
read_chars decoder 3 . should_equal "lo ".codepoints
|
||||
v1 = read_chars decoder 6
|
||||
Text.from_codepoints v1 . should_equal '😎🚀🚧'
|
||||
result = f.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_stream_decoder Encoding.utf_8 Problem_Behavior.Report_Error decoder->
|
||||
read_chars decoder 1 . should_equal "H".codepoints
|
||||
read_chars decoder 2 . should_equal "el".codepoints
|
||||
read_chars decoder 3 . should_equal "lo ".codepoints
|
||||
v1 = read_chars decoder 6
|
||||
Text.from_codepoints v1 . should_equal '😎🚀🚧'
|
||||
|
||||
v2 = read_chars decoder 200
|
||||
## Here we show that while the decoder is trying to read
|
||||
200 codepoints, some codepoints require more than one
|
||||
byte in UTF-8 to represent, so the actual result
|
||||
should be slightly smaller.
|
||||
(v2.length < 200) . should_be_true
|
||||
v2 = read_chars decoder 200
|
||||
## Here we show that while the decoder is trying to read
|
||||
200 codepoints, some codepoints require more than one
|
||||
byte in UTF-8 to represent, so the actual result
|
||||
should be slightly smaller.
|
||||
(v2.length < 200) . should_be_true
|
||||
|
||||
## Now we read increasingly larger amounts, to trigger
|
||||
and test all paths of the input buffer resizing
|
||||
mechanism.
|
||||
read_chars decoder 40
|
||||
read_chars decoder 500
|
||||
read_chars decoder 1000
|
||||
read_chars decoder 1
|
||||
read_chars decoder 2
|
||||
read_chars decoder 10
|
||||
## Now we read increasingly larger amounts, to trigger
|
||||
and test all paths of the input buffer resizing
|
||||
mechanism.
|
||||
read_chars decoder 40
|
||||
read_chars decoder 500
|
||||
read_chars decoder 1000
|
||||
read_chars decoder 1
|
||||
read_chars decoder 2
|
||||
read_chars decoder 10
|
||||
|
||||
## Finally read all the remaining contents of the file
|
||||
to verify they were decoded correctly as a whole.
|
||||
read_rest _ =
|
||||
case read_chars decoder 100 of
|
||||
Nothing -> Nothing
|
||||
_ -> @Tail_Call read_rest Nothing
|
||||
read_rest Nothing
|
||||
## Finally read all the remaining contents of the file
|
||||
to verify they were decoded correctly as a whole.
|
||||
read_rest _ =
|
||||
case read_chars decoder 100 of
|
||||
Nothing -> Nothing
|
||||
_ -> @Tail_Call read_rest Nothing
|
||||
read_rest Nothing
|
||||
Text.from_codepoints all_codepoints.to_vector . should_equal contents
|
||||
result . should_equal Nothing
|
||||
f.delete
|
||||
|
||||
Test.specify "should allow reading a UTF-8 file" <|
|
||||
f = Enso_Project.data / "transient" / "utf8.txt"
|
||||
encoding = Encoding.utf_8
|
||||
java_charset = encoding.to_java_charset
|
||||
((0.up_to 100).map _->'Hello World!' . join '\n').write f
|
||||
expected_contents = f.read_text
|
||||
contents = read_file_one_by_one f java_charset expected_contents.length
|
||||
contents = here.read_file_one_by_one f encoding expected_contents.length
|
||||
contents.should_equal expected_contents
|
||||
|
||||
Test.specify "should allow reading a Windows file" <|
|
||||
encoding = Encoding.windows_1252
|
||||
java_charset = encoding.to_java_charset
|
||||
expected_contents = "Hello World! $¢¤¥"
|
||||
contents = read_file_one_by_one windows_file java_charset expected_contents.length
|
||||
contents = here.read_file_one_by_one windows_file encoding expected_contents.length
|
||||
contents.should_equal expected_contents
|
||||
|
||||
Test.specify "should raise warnings when reading invalid characters" <|
|
||||
encoding = Encoding.ascii
|
||||
java_charset = encoding.to_java_charset
|
||||
expected_contents = 'Hello World! $\uFFFD\uFFFD\uFFFD'
|
||||
expected_problems = ["Encoding issues at bytes 14, 15, 16."]
|
||||
contents = read_file_one_by_one windows_file java_charset expected_contents.length expected_problems=expected_problems
|
||||
contents.should_equal expected_contents
|
||||
expected_problems = [Encoding_Error "Encoding issues at bytes 14, 15, 16."]
|
||||
contents_1 = here.read_file_one_by_one windows_file encoding expected_contents.length on_problems=Problem_Behavior.Report_Warning
|
||||
contents_1.should_equal expected_contents
|
||||
Warning.get_all contents_1 . map .value . should_equal expected_problems
|
||||
|
||||
contents_2 = windows_file.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_stream_decoder encoding Problem_Behavior.Report_Warning reporting_stream_decoder->
|
||||
codepoint_1 = reporting_stream_decoder.read
|
||||
codepoints_1 = here.read_characters reporting_stream_decoder 5
|
||||
codepoints_2 = here.read_characters reporting_stream_decoder 3
|
||||
codepoints_3 = here.read_characters reporting_stream_decoder 100
|
||||
reporting_stream_decoder.read.should_equal -1
|
||||
Text.from_codepoints <| [codepoint_1]+codepoints_1+codepoints_2+codepoints_3
|
||||
contents_2.should_equal expected_contents
|
||||
Warning.get_all contents_2 . map .value . should_equal expected_problems
|
||||
|
||||
Test.specify "should work correctly if no data is read from it" <|
|
||||
result = windows_file.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_stream_decoder Encoding.ascii Problem_Behavior.Report_Error _->Nothing
|
||||
result.should_equal Nothing
|
||||
|
||||
read_file_one_by_one file encoding expected_size on_problems=Problem_Behavior.Report_Error =
|
||||
file.with_input_stream [File.Option.Read] stream->
|
||||
stream.with_stream_decoder encoding on_problems reporting_stream_decoder->
|
||||
codepoints = 0.up_to expected_size . map _->
|
||||
reporting_stream_decoder.read
|
||||
reporting_stream_decoder.read.should_equal -1
|
||||
|
||||
Text.from_codepoints codepoints
|
||||
|
||||
## Test helper: reads up to `n` characters from `decoder` into a
   `CharBuffer` and returns them as a vector of codepoints, or `Nothing`
   if the decoder has already reached end of stream.

   Arguments:
   - decoder: the stream decoder to read from.
   - n: the maximum number of characters to read in one call.
read_characters decoder n =
    buffer = CharBuffer.allocate n
    chars_read = decoder.read buffer
    if chars_read == -1 then Nothing else
        # Switch the buffer from write mode to read mode so the characters
        # just decoded can be drained from it.
        buffer.flip
        v = Vector.new_builder
        # Tail-recursively moves every remaining character from the buffer
        # into the vector builder.
        transfer_codepoints _ =
            if buffer.hasRemaining.not then Nothing else
                char = buffer.get
                v.append char
                @Tail_Call transfer_codepoints Nothing
        transfer_codepoints Nothing
        v.to_vector
|
||||
|
||||
# Entry point allowing this spec to be run as a standalone test suite.
main = Test.Suite.run_main here.spec
|
||||
|
91
test/Tests/src/System/Reporting_Stream_Encoder_Spec.enso
Normal file
91
test/Tests/src/System/Reporting_Stream_Encoder_Spec.enso
Normal file
@ -0,0 +1,91 @@
|
||||
from Standard.Base import all
|
||||
|
||||
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
|
||||
import Standard.Base.Error.Problem_Behavior
|
||||
|
||||
polyglot java import org.enso.base.Encoding_Utils
|
||||
polyglot java import java.nio.CharBuffer
|
||||
|
||||
import Standard.Test
|
||||
import Standard.Test.Problems
|
||||
|
||||
## Test suite for `ReportingStreamEncoder`: verifies that text written
   through the encoder is stored correctly for various encodings and
   write-chunk sizes, and that unencodable characters are replaced and
   reported as `Encoding_Error` warnings.
spec =
    Test.group "ReportingStreamEncoder" <|
        Test.specify "should allow writing a file codepoint by codepoint" <|
            f = Enso_Project.data / "transient" / "char-by-char.txt"
            f.delete_if_exists
            f.exists.should_be_false
            contents = 1.up_to 7 . map _->'Cześc\u0301 😎🚀🚧!' . join '\n'
            f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
                stream.with_stream_encoder Encoding.utf_8 Problem_Behavior.Report_Error reporting_stream_encoder->
                    # Writing one character at a time exercises the encoder's
                    # handling of minimal partial inputs (including combining
                    # marks and emoji from the test string).
                    contents.char_vector.each char->
                        reporting_stream_encoder.write char
            f.read_text.should_equal contents

        Test.specify "should work correctly when writing chunks of varying sizes" <|
            f = Enso_Project.data / "transient" / "varying-utf16.txt"
            f.delete_if_exists
            f.exists.should_be_false
            encoding = Encoding.utf_16_be
            big = 1.up_to 7 . map _->'Cześc\u0301 😎🚀🚧!' . join '\n'
            f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
                stream.with_stream_encoder encoding Problem_Behavior.Report_Error reporting_stream_encoder->
                    # Mix of 1-char, multi-char and large writes to exercise
                    # internal buffering boundaries of the encoder.
                    reporting_stream_encoder.write "A"
                    reporting_stream_encoder.write "Abc"
                    reporting_stream_encoder.write "Defghi"
                    reporting_stream_encoder.write 'O\u0301'
                    reporting_stream_encoder.write "X"
                    reporting_stream_encoder.write big
                    reporting_stream_encoder.write "Y"
                    reporting_stream_encoder.write "Ź"

            contents = 'AAbcDefghiO\u0301X' + big + "YŹ"
            f.read_text encoding . should_equal contents

        Test.specify "should allow writing a Windows file" <|
            f = Enso_Project.data / "transient" / "windows.txt"
            encoding = Encoding.windows_1252
            # Characters chosen to be representable in Windows-1252.
            contents = "Hello World! $¢¤¥"

            f.delete_if_exists
            f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
                stream.with_stream_encoder encoding Problem_Behavior.Report_Error reporting_stream_encoder->
                    reporting_stream_encoder.write contents

            f.read_text encoding . should_equal contents

        Test.specify "should raise warnings when writing characters that cannot be encoded and replace them with the Unicode replacement character or a question mark" <|
            f = Enso_Project.data / "transient" / "ascii.txt"
            encoding = Encoding.ascii
            # 'ł' and the combining acute cannot be represented in ASCII.
            contents = 'Sło\u0301wka!'
            f.delete_if_exists
            result = f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
                stream.with_stream_encoder encoding Problem_Behavior.Report_Warning reporting_stream_encoder->
                    reporting_stream_encoder.write contents
            result . should_equal Nothing
            # The warning reports the positions of the unencodable characters.
            Warning.get_all result . map .value . should_equal [Encoding_Error "Encoding issues at characters 1, 3."]
            f.read_text encoding . should_equal "S?o?wka!"

            f.delete_if_exists
            result_2 = f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
                stream.with_stream_encoder encoding Problem_Behavior.Report_Warning reporting_stream_encoder->
                    # Unencodable characters split across separate writes should
                    # still be reported with positions relative to the whole
                    # output, not to the individual chunks.
                    reporting_stream_encoder.write "ABC"
                    reporting_stream_encoder.write "ą"
                    reporting_stream_encoder.write "foo"
                    reporting_stream_encoder.write " -🚧- "
                    reporting_stream_encoder.write "bar"

            result_2 . should_equal Nothing
            Warning.get_all result_2 . map .value . should_equal [Encoding_Error "Encoding issues at characters 3, 9."]
            f.read_text encoding . should_equal "ABC?foo -?- bar"

        Test.specify "should work correctly if no data is written to it" <|
            f = Enso_Project.data / "transient" / "empty.txt"
            encoding = Encoding.ascii
            f.delete_if_exists
            result = f.with_output_stream [File.Option.Write, File.Option.Create_New] stream->
                stream.with_stream_encoder encoding Problem_Behavior.Report_Error _->Nothing
            result . should_equal Nothing
            f.read_text encoding . should_equal ""
|
||||
|
||||
# Entry point allowing this spec to be run as a standalone test suite.
main = Test.Suite.run_main here.spec
|
Loading…
Reference in New Issue
Block a user