Implement Windows-1252 fallback logic for Encoding.Default (#10190)

- Closes #10148
- [x] Tests for `Restartable_Input_Stream`, `peek_bytes` and `skip_n_bytes`.
- [x] Report `Managed_Resource` stack overflow bug: #10211
- [x] Followup possible optimization: #10220
- [x] Test use-case from blog.
This commit is contained in:
Radosław Waśko 2024-06-10 12:49:26 +02:00 committed by GitHub
parent d938c96c55
commit 41d02e95ef
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
22 changed files with 652 additions and 375 deletions

View File

@ -16,6 +16,7 @@
operations.][9950]
- [Implemented `.cast` to and from `Decimal` columns for the in-memory
database.][10206]
- [Implemented fallback to Windows-1252 encoding for `Encoding.Default`.][10190]
[debug-shortcuts]:
@ -23,6 +24,7 @@
[10122]: https://github.com/enso-org/enso/pull/10122
[10130]: https://github.com/enso-org/enso/pull/10130
[10206]: https://github.com/enso-org/enso/pull/10206
[10190]: https://github.com/enso-org/enso/pull/10190
<br/>![Release Notes](/docs/assets/tags/release_notes.svg)

View File

@ -62,7 +62,10 @@ type Encoding
encoding. Otherwise, the input is decoded using UTF-8 unless it contains
invalid UTF-8 sequences, in which case Windows-1252 is used as a fallback.
When used for encoding, it will always encode using UTF-8.
When used for encoding in Append mode, it will use the same encoding
detection heuristics as when reading. When writing a new file, it will
always use UTF-8.
This encoding cannot be passed to some functions that require a Java
Charset.
default -> Encoding =

View File

@ -33,6 +33,7 @@ import project.Errors.Time_Error.Time_Error
import project.Meta
import project.Nothing.Nothing
import project.Panic.Panic
import project.System.Internal.Reporting_Stream_Decoder_Helper
from project.Data.Boolean import Boolean, False, True
from project.Data.Json import Invalid_JSON, JS_Object, Json
from project.Data.Numbers import Float, Integer, Number, Number_Parse_Error
@ -763,11 +764,7 @@ Text.bytes self (encoding : Encoding = Encoding.utf_8) (on_problems : Problem_Be
@encoding Encoding.default_widget
Text.from_bytes : Vector Integer -> Encoding -> Problem_Behavior -> Text
Text.from_bytes bytes (encoding : Encoding = Encoding.default) (on_problems : Problem_Behavior = Problem_Behavior.Report_Error) =
result = Encoding_Utils.from_bytes bytes encoding.to_java_charset_or_null : WithProblems
if result.problems.is_empty then result.result else
problems = result.problems.map decoding_problem->
Encoding_Error.Error decoding_problem.message
on_problems.attach_problems_after result.result problems
Reporting_Stream_Decoder_Helper.decode_bytes_to_text bytes encoding on_problems
## ICON convert
Returns a vector containing bytes representing the UTF-8 encoding of the

View File

@ -29,13 +29,13 @@ import project.System.File_Format.Infer
import project.System.File_Format.Plain_Text_Format
import project.System.File_Format_Metadata.File_Format_Metadata
import project.System.Input_Stream.Input_Stream
import project.System.Advanced.Restartable_Input_Stream.Restartable_Input_Stream
from project.Data.Boolean import Boolean, False, True
from project.Data.Text.Extensions import all
from project.Metadata.Choice import Option
from project.Metadata.Widget import Single_Choice
from project.System.File_Format import format_types
polyglot java import java.io.ByteArrayInputStream
polyglot java import java.io.InputStream
## PRIVATE
@ -62,24 +62,15 @@ type Response_Body
Raw_Stream (raw_stream:Input_Stream) (metadata:File_Format_Metadata) uri:URI
## PRIVATE
Materialized_Byte_Array (bytes:Vector) (metadata:File_Format_Metadata) uri:URI
## PRIVATE
Materialized_Temporary_File (temporary_file:Temporary_File) (metadata:File_Format_Metadata) uri:URI
Materialized_Stream (restartable_stream:Restartable_Input_Stream) (metadata:File_Format_Metadata) uri:URI
## PRIVATE
with_stream : (Input_Stream -> Any ! HTTP_Error) -> Any ! HTTP_Error
with_stream self action = case self of
Response_Body.Raw_Stream raw_stream _ _ ->
Managed_Resource.bracket raw_stream (_.close) action
Response_Body.Materialized_Byte_Array bytes _ _ ->
byte_stream = Input_Stream.new (ByteArrayInputStream.new bytes) (HTTP_Error.handle_java_exceptions self.uri)
Managed_Resource.bracket byte_stream (_.close) action
Response_Body.Materialized_Temporary_File temporary_file _ _ -> temporary_file.with_file file->
opts = [File_Access.Read.to_java]
stream = HTTP_Error.handle_java_exceptions self.uri (file.input_stream_builtin opts)
file_stream = Input_Stream.new stream (HTTP_Error.handle_java_exceptions self.uri) associated_file=temporary_file
Managed_Resource.bracket (file_stream) (_.close) action
Response_Body.Materialized_Stream restartable_stream _ _ ->
restartable_stream.with_fresh_stream action
## PRIVATE
ADVANCED
@ -87,25 +78,11 @@ type Response_Body
return a new Response_Body.
materialize : Input_Stream
materialize self = case self of
Response_Body.Raw_Stream _ _ _ ->
self.with_stream body_stream->
body_stream.with_java_stream body_java_stream->
first_block = body_java_stream.readNBytes maximum_body_in_memory
case first_block.length < maximum_body_in_memory of
True -> Response_Body.Materialized_Byte_Array (Vector.from_polyglot_array first_block) self.metadata self.uri
False -> Context.Output.with_enabled <|
## Write contents to a temporary file
temp_file = Temporary_File.new self.uri.host
r = temp_file.with_file file->
file.with_output_stream [File_Access.Write, File_Access.Create, File_Access.Truncate_Existing] output_stream->
output_stream.with_java_stream java_output_stream->
java_output_stream.write first_block
body_java_stream.transferTo java_output_stream
java_output_stream.flush
Nothing
r.if_not_error <|
Response_Body.Materialized_Temporary_File temp_file self.metadata self.uri
_ -> self
Response_Body.Raw_Stream _ metadata uri ->
restartable_stream = self.with_stream body_stream->
body_stream.as_restartable_stream
Response_Body.Materialized_Stream restartable_stream metadata uri
Response_Body.Materialized_Stream _ _ _ -> self
## ALIAS parse
GROUP Input

View File

@ -37,8 +37,12 @@ type Managed_Resource
function once it is no longer in use.
Arguments:
- resource: The resource to register.
- function: The action to be executed on resource to clean it up when
it is no longer in use.
Returns:
A `Managed_Resource` object that can be used to access the resource.
register : Any -> (Any -> Nothing) -> Managed_Resource
register resource function = @Builtin_Method "Managed_Resource.register"

View File

@ -0,0 +1,105 @@
import project.Any.Any
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Nothing.Nothing
import project.Runtime.Context
import project.Runtime.Managed_Resource.Managed_Resource
import project.System.File.Advanced.Temporary_File.Temporary_File
import project.System.File.File
import project.System.File.File_Access.File_Access
import project.System.Input_Stream.Input_Stream
from project.Data.Boolean import Boolean, False, True
## PRIVATE
An input stream that can be read multiple times.
It may be useful when multiple passes over the data are required.
If you need to check only the beginning of the stream, consider using a much
lighter `Input_Stream.as_peekable_stream`.
A generic stream can be converted to `Restartable_Input_Stream` by reading
all its contents and storing them either in memory or in a temporary file.
A stream backed by an existing file can be converted to
`Restartable_Input_Stream` at no cost.
! Stream Lifetime
Note that if we use an existing file as a shortcut to avoid copying the
data, we need to assume that the file will not be modified in the meantime.
Thus the `Restartable_Input_Stream` does not fully guarantee immutability
of the data. The lifetime of such `Restartable_Input_Stream` is tied to the
lifetime of its backing file.
If the stream should stay usable for a longer time, `extend_lifetime=True`
should be passed when creating it.
type Restartable_Input_Stream
## PRIVATE
`bytes` may be a Vector or a raw `byte[]` array (convertible to vector, but no annotation to avoid conversions).
private From_Bytes bytes
## PRIVATE
private From_Existing_File file:File
## PRIVATE
private From_Temporary_File temporary_file:Temporary_File
## PRIVATE
A human-readable description of the stream's backing storage.
to_text self -> Text =
suffix = case self of
Restartable_Input_Stream.From_Bytes _ -> "From_Bytes"
Restartable_Input_Stream.From_Existing_File file -> "From_Existing_File "+file.to_text
Restartable_Input_Stream.From_Temporary_File _ -> "From_Temporary_File"
"Restartable_Input_Stream."+suffix
## PRIVATE
Builds a `Restartable_Input_Stream` from a generic `Input_Stream`.
Prefer `Input_Stream.as_restartable_stream` as the public entry point;
see that method for the meaning of `extend_lifetime`.
make (input_stream : Input_Stream) (extend_lifetime : Boolean) -> Restartable_Input_Stream =
# Reuse the backing storage where possible; only generic streams (or
# file-backed streams whose lifetime must be extended) are copied via
# `cache_generic_input_stream`.
case input_stream.associated_source of
temp_file : Temporary_File -> Restartable_Input_Stream.From_Temporary_File temp_file
file : File ->
if extend_lifetime then cache_generic_input_stream input_stream else
Restartable_Input_Stream.From_Existing_File file
bytes : Vector -> Restartable_Input_Stream.From_Bytes bytes
_ -> cache_generic_input_stream input_stream
## PRIVATE
Runs the provided action with a fresh input stream pointing to the
beginning of the data represented by this stream.
This method may be called multiple times, allowing multiple 'rounds' of
processing.
with_fresh_stream self (action : Input_Stream -> Any) -> Any =
case self of
Restartable_Input_Stream.From_Bytes bytes ->
Managed_Resource.bracket (Input_Stream.from_bytes bytes) (.close) action
Restartable_Input_Stream.From_Existing_File file ->
file.with_input_stream [File_Access.Read] action
Restartable_Input_Stream.From_Temporary_File temp_file ->
temp_file.with_file file->
file.with_input_stream [File_Access.Read] action
## PRIVATE
Maximum size for a stream to be held in memory.
If the amount of data exceeds this limit, it will be stored in a temporary file.
max_in_memory_size =
# 64 KiB
64 * 1024
## PRIVATE
Caches a generic input stream: small streams are kept in memory, larger
ones are spilled to a temporary file.
private cache_generic_input_stream (input_stream : Input_Stream) -> Restartable_Input_Stream =
input_stream.with_java_stream java_input_stream->
# First try to read the whole stream into memory; a short read means the
# stream fit under the limit.
first_block = java_input_stream.readNBytes max_in_memory_size
case first_block.length < max_in_memory_size of
True ->
Restartable_Input_Stream.From_Bytes first_block
False ->
# The Output context is enabled locally, so that the temporary cache
# file can be written even when the caller runs with output disabled.
Context.Output.with_enabled <|
temp_file = Temporary_File.new "restartable-input-stream"
r = temp_file.with_file file->
file.with_output_stream [File_Access.Write, File_Access.Create, File_Access.Truncate_Existing] output_stream->
output_stream.with_java_stream java_output_stream->
java_output_stream.write first_block
java_input_stream.transferTo java_output_stream
java_output_stream.flush
Nothing
r.if_not_error <|
Restartable_Input_Stream.From_Temporary_File temp_file

View File

@ -246,11 +246,11 @@ type File
file.with_input_stream [File_Access.Create, File_Access.Read] action
with_input_stream : Vector File_Access -> (Input_Stream -> Any ! File_Error) -> Any ! File_Error
with_input_stream self (open_options : Vector) action =
new_input_stream : File -> Vector File_Access -> Output_Stream ! File_Error
new_input_stream : File -> Vector File_Access -> Input_Stream ! File_Error
new_input_stream file open_options =
opts = open_options . map (_.to_java)
stream = File_Error.handle_java_exceptions file (file.input_stream_builtin opts)
Input_Stream.new stream (File_Error.handle_java_exceptions self)
Input_Stream.new stream (File_Error.handle_java_exceptions self) associated_source=self
if self.is_directory then Error.throw (File_Error.IO_Error self "File '"+self.path+"' is a directory") else
open_as_data_link = (open_options.contains Data_Link_Access.No_Follow . not) && (Data_Link.is_data_link self)

View File

@ -96,7 +96,7 @@ type Temporary_File
If the stream is already backed by a temporary or regular file, that file is returned.
from_stream_light : Input_Stream -> Temporary_File | File
from_stream_light stream =
case stream.associated_file of
case stream.associated_source of
tmp : Temporary_File -> tmp
file : File -> file
_ -> Temporary_File.from_stream stream

View File

@ -1,19 +1,28 @@
import project.Any.Any
import project.Data.Array.Array
import project.Data.Numbers.Integer
import project.Data.Text.Encoding.Encoding
import project.Data.Vector.Vector
import project.Errors.Encoding_Error.Encoding_Error
import project.Error.Error
import project.Errors.File_Error.File_Error
import project.Errors.Illegal_State.Illegal_State
import project.Errors.Problem_Behavior.Problem_Behavior
import project.Nothing.Nothing
import project.Runtime.Managed_Resource.Managed_Resource
import project.System.Advanced.Restartable_Input_Stream.Restartable_Input_Stream
import project.System.File.Advanced.Temporary_File.Temporary_File
import project.System.File.File
import project.System.File.File_Access.File_Access
import project.System.File.Generic.Writable_File.Writable_File
import project.System.Internal.Reporting_Stream_Decoder_Helper
from project.Data.Boolean import Boolean, False, True
polyglot java import java.io.BufferedInputStream
polyglot java import java.io.ByteArrayInputStream
polyglot java import java.io.InputStream as Java_Input_Stream
polyglot java import org.enso.base.encoding.ReportingStreamDecoder
polyglot java import org.enso.base.encoding.Encoding_Utils
polyglot java import org.enso.base.encoding.ReportingStreamDecoder
polyglot java import org.enso.base.Stream_Utils
## PRIVATE
An input stream, allowing for interactive reading of contents.
@ -23,10 +32,17 @@ type Input_Stream
Given a Java InputStream, wraps as a Managed_Resource and returns a new
Input_Stream.
new : Java_Input_Stream -> Any -> (Nothing | File | Temporary_File) -> Input_Stream
new java_stream error_handler associated_file=Nothing =
new : Java_Input_Stream -> Any -> (Nothing | File | Temporary_File | Vector | Array) -> Input_Stream
new java_stream error_handler associated_source=Nothing =
resource = Managed_Resource.register java_stream close_stream
Input_Stream.Value resource error_handler associated_file
Input_Stream.Value resource error_handler associated_source
## PRIVATE
ADVANCED
Creates a new input stream from a vector of bytes.
from_bytes bytes -> Input_Stream =
raw_stream = ByteArrayInputStream.new bytes
Input_Stream.new raw_stream (File_Error.handle_java_exceptions Nothing) bytes
## PRIVATE
An input stream, allowing for interactive reading of contents.
@ -35,15 +51,16 @@ type Input_Stream
- stream_resource: The internal resource that represents the underlying
stream.
- error_handler: An error handler for IOExceptions thrown when reading.
- associated_file: The file associated with this stream, if any.
Value stream_resource error_handler (associated_file:Nothing|File|Temporary_File)
- associated_source: The source associated with this stream, if any.
It can be used to cheaply convert this stream into a `Restartable_Input_Stream`.
private Value stream_resource error_handler (associated_source : Nothing | File | Temporary_File | Vector | Array)
## PRIVATE
ADVANCED
Reads all the bytes in this stream into a vector of bytes.
read_all_bytes : Vector
read_all_bytes self = self.stream_resource . with java_stream->
self.error_handler <| Vector.from_polyglot_array java_stream.readAllBytes
read_all_bytes self = self.with_java_stream java_stream->
Vector.from_polyglot_array java_stream.readAllBytes
## PRIVATE
ADVANCED
@ -58,10 +75,14 @@ type Input_Stream
Arguments:
- n: The number of bytes to read from the stream.
read_n_bytes : Integer -> Vector
read_n_bytes self n = self.stream_resource . with java_stream->
self.error_handler <|
bytes = java_stream.readNBytes n
Vector.from_polyglot_array bytes
read_n_bytes self (n : Integer) = self.with_java_stream java_stream->
bytes = java_stream.readNBytes n
Vector.from_polyglot_array bytes
## PRIVATE
It may throw an error if not enough bytes are available.
skip_n_bytes self (n : Integer) = self.with_java_stream java_stream->
java_stream.skipNBytes n
## PRIVATE
ADVANCED
@ -70,9 +91,8 @@ type Input_Stream
The returned value is an integer in the range 0-255 representing the
next byte of input, or -1 if end of stream is reached.
read_byte : Integer
read_byte self = self.stream_resource . with java_stream->
self.error_handler <|
java_stream.read
read_byte self = self.with_java_stream java_stream->
java_stream.read
## PRIVATE
ADVANCED
@ -93,17 +113,75 @@ type Input_Stream
Arguments:
- f: Applies a function over the internal java stream.
with_java_stream : (Java_Input_Stream -> Any) -> Any
with_java_stream self f = self.stream_resource . with f
with_java_stream self f = self.stream_resource . with java_stream->
self.error_handler <| f java_stream
## PRIVATE
Runs an action with a `ReportingStreamDecoder` decoding data from the
input stream with the specified encoding.
with_stream_decoder : Encoding -> Problem_Behavior -> (ReportingStreamDecoder -> Any) -> Any
with_stream_decoder self encoding on_problems action = self.stream_resource . with java_stream->
results = Encoding_Utils.with_stream_decoder java_stream encoding.to_java_charset_or_null action
problems = Vector.from_polyglot_array results.problems . map decoding_problem->
Encoding_Error.Error decoding_problem.message
on_problems.attach_problems_after results.result problems
with_stream_decoder self (encoding : Encoding) (on_problems : Problem_Behavior) action =
Reporting_Stream_Decoder_Helper.run self encoding on_problems allow_usage_in_background_threads=True action
## PRIVATE
Converts this stream into a restartable stream.
The conversion tries to be efficient, avoiding reading the stream if it
is simply backed by a file or byte array.
This stream is invalidated after the conversion.
Arguments:
- extend_lifetime: This option is only applicable if the source stream
was backed by a file. If set to `True`, a new temporary file will be
created, to untie the lifetime of the returned stream from the backing
file. If set to `False`, the returned stream will be valid only as long
as the original backing file, but the operation will be more efficient
- so it is preferred if the caller can guarantee the lifetime of such
backing file.
as_restartable_stream self (extend_lifetime : Boolean = True) -> Restartable_Input_Stream =
Restartable_Input_Stream.make self extend_lifetime
## PRIVATE
is_peekable self -> Boolean =
self.with_java_stream java_stream->
java_stream.markSupported
## PRIVATE
Converts this stream into a stream that can be peeked.
This is useful when some application needs to look ahead in the stream,
but then needs to pass it further with the same state.
The peeked bytes are cached in memory, so this should only be used for
small amounts of data. If more data has to be processed more than once,
`as_restartable_stream` is preferred.
The current stream may be invalidated after the conversion, and it should
no longer be used - only the returned stream should be used.
as_peekable_stream self -> Input_Stream = if self.is_peekable then self else
raw_java_stream = self.stream_resource.take
buffered_stream = BufferedInputStream.new raw_java_stream
Input_Stream.new buffered_stream self.error_handler self.associated_source
## PRIVATE
Peeks up to the provided number of bytes from the stream.
Makes a best-effort to read as many bytes as provided, however fewer
bytes may be read, if end of stream is encountered.
The length of the returned vector is the same as the number of bytes
read.
No bytes are consumed from the stream - a next read or peek
operation will see the same contents as before this call.
This operation is only allowed if `is_peekable` returns `True`.
Arguments:
- n: The number of bytes to read from the stream.
peek_bytes self (n : Integer) -> Vector Integer =
if self.is_peekable.not then Error.throw (Illegal_State.Error "`peek_bytes` called on a stream where `is_peekable=False`. Please convert the stream using `as_peekable_stream`.") else
self.with_java_stream java_stream->
Vector.from_polyglot_array <| Stream_Utils.peek java_stream n
## PRIVATE
Reads the contents of this stream into a given file.

View File

@ -0,0 +1,130 @@
private
import project.Any.Any
import project.Data.Text.Encoding.Encoding
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Errors.Encoding_Error.Encoding_Error
import project.Errors.Problem_Behavior.Problem_Behavior
import project.Nothing.Nothing
import project.Runtime.Managed_Resource.Managed_Resource
import project.System.Input_Stream.Input_Stream
from project.Data.Boolean import Boolean, False, True
from project.Runtime import assert
polyglot java import org.enso.base.encoding.DecodingProblemAggregator
polyglot java import org.enso.base.encoding.Encoding_Utils
polyglot java import org.enso.base.encoding.ReportingStreamDecoder
## PRIVATE
Builds the `ReportingStreamDecoder`, consuming the `Input_Stream`.
It will do any necessary encoding detection, as determined by the `Encoding`
Any decoding problems gathered during the run are attached to the result
according to `on_problems`.
run (input_stream : Input_Stream) (encoding : Encoding) (on_problems : Problem_Behavior) (allow_usage_in_background_threads : Boolean) (continuation : ReportingStreamDecoder -> Any) =
# We always ensure the stream is peekable, as that also implies buffering which is supposedly more efficient e.g. when working with files.
buffered_input_stream = input_stream.as_peekable_stream
problem_aggregator = DecodingProblemAggregator.new
result = resolve_encoding encoding buffered_input_stream problem_aggregator effective_encoding-> amended_input_stream->
amended_input_stream.with_java_stream java_stream->
# We can only poll safepoints if the decoder is guaranteed to be used in the main thread only.
poll_safepoints = allow_usage_in_background_threads.not
decoder = ReportingStreamDecoder.new java_stream effective_encoding.to_java_charset problem_aggregator poll_safepoints
continuation decoder
problems = Vector.from_polyglot_array problem_aggregator.summarize . map decoding_problem->
Encoding_Error.Error decoding_problem.message
on_problems.attach_problems_after result problems
## PRIVATE
Decodes a vector of bytes into `Text` using the given encoding,
including the `Encoding.Default` detection logic.
decode_bytes_to_text bytes (encoding : Encoding) (on_problems : Problem_Behavior) -> Text =
Managed_Resource.bracket (Input_Stream.from_bytes bytes) (.close) input_stream->
run input_stream encoding on_problems allow_usage_in_background_threads=False decoder->
decoder.readAllIntoMemory
## PRIVATE
Resolves `Encoding.Default` and handles BOMs for explicit Unicode
encodings, then calls `continuation` with the effective encoding and a
stream positioned after any consumed BOM.
resolve_encoding (encoding : Encoding) (buffered_input_stream : Input_Stream) (problem_aggregator : DecodingProblemAggregator) (continuation : Encoding -> Input_Stream -> Any) -> Any =
case encoding of
Encoding.Default ->
detect_default_encoding buffered_input_stream problem_aggregator continuation
Encoding.Value "UTF-8" ->
verify_unicode_encoding encoding Unicode_BOM.UTF_8 buffered_input_stream problem_aggregator continuation
Encoding.Value "UTF-16LE" ->
verify_unicode_encoding encoding Unicode_BOM.UTF_16_LE buffered_input_stream problem_aggregator continuation
Encoding.Value "UTF-16BE" ->
verify_unicode_encoding encoding Unicode_BOM.UTF_16_BE buffered_input_stream problem_aggregator continuation
# Any other encoding just continues without any additional processing.
_ -> continuation encoding buffered_input_stream
## PRIVATE
Implements the `Encoding.Default` heuristics: a BOM decides the encoding,
otherwise UTF-8 is tried and Windows-1252 is used as a fallback.
detect_default_encoding (buffered_input_stream : Input_Stream) (problem_aggregator : DecodingProblemAggregator) (continuation : Encoding -> Input_Stream -> Any) -> Any =
case detect_bom buffered_input_stream of
# We rely on the detected BOM for the encoding, even if there are errors down the line.
detected_bom : Unicode_BOM ->
encoding_name = detected_bom.corresponding_encoding.to_display_text
context_message = "An " + encoding_name + " BOM was detected, so " + encoding_name + " encoding has been assumed, but some characters seem invalid: "
problem_aggregator.setInvalidCharacterErrorPrefix context_message
# Skip the BOM to avoid including it in the decoded data.
buffered_input_stream.skip_n_bytes detected_bom.as_bytes.length . if_not_error <|
continuation detected_bom.corresponding_encoding buffered_input_stream
# If no BOM was detected, we do a pass to try UTF-8 encoding and if it fails, we restart and fallback to Windows-1252.
Nothing ->
# We don't need to extend the lifetime of the stream, as it will not be kept longer than the lifetime of the `buffered_input_stream`.
restartable_stream = buffered_input_stream.as_restartable_stream extend_lifetime=False
is_valid_utf_8 = restartable_stream.with_fresh_stream checking_stream->
checking_stream.with_java_stream java_checking_stream->
Encoding_Utils.canDecodeWithoutErrors java_checking_stream Encoding.utf_8.to_java_charset
effective_encoding = if is_valid_utf_8 then Encoding.utf_8 else Encoding.windows_1252
restartable_stream.with_fresh_stream input_stream->
continuation effective_encoding input_stream
## PRIVATE
Checks an explicitly requested Unicode encoding against a BOM found in
the stream: a matching BOM is skipped, a mismatched one is reported as a
problem (decoding still proceeds with the requested encoding).
verify_unicode_encoding (encoding : Encoding) (expected_bom : Unicode_BOM) (buffered_input_stream : Input_Stream) (problem_aggregator : DecodingProblemAggregator) (continuation : Encoding -> Input_Stream -> Any) -> Any =
case detect_bom buffered_input_stream of
Nothing ->
# No BOM detected, so we just proceed.
continuation encoding buffered_input_stream
detected_bom : Unicode_BOM ->
case detected_bom == expected_bom of
True ->
# We found the expected BOM. We skip it to avoid including it in decoded data.
buffered_input_stream.skip_n_bytes expected_bom.as_bytes.length . if_not_error <|
continuation encoding buffered_input_stream
False ->
# Report BOM mismatch
message = detected_bom.corresponding_encoding.to_display_text + " BOM has been found when decoding as " + encoding.to_display_text + "."
problem_aggregator.reportOtherProblem message
continuation encoding buffered_input_stream
## PRIVATE
type Unicode_BOM
## PRIVATE
UTF_8
## PRIVATE
UTF_16_LE
## PRIVATE
UTF_16_BE
## PRIVATE
The BOM bytes, as signed values matching Java's `byte` (e.g. -17 == 0xEF).
as_bytes self -> Vector = case self of
Unicode_BOM.UTF_8 -> [-17, -69, -65]
Unicode_BOM.UTF_16_LE -> [-1, -2]
Unicode_BOM.UTF_16_BE -> [-2, -1]
## PRIVATE
The `Encoding` that this BOM indicates.
corresponding_encoding self -> Encoding = case self of
Unicode_BOM.UTF_8 -> Encoding.utf_8
Unicode_BOM.UTF_16_LE -> Encoding.utf_16_le
Unicode_BOM.UTF_16_BE -> Encoding.utf_16_be
## PRIVATE
all = [Unicode_BOM.UTF_8, Unicode_BOM.UTF_16_LE, Unicode_BOM.UTF_16_BE]
## PRIVATE
Peeks the start of the stream and returns the matching BOM, or `Nothing`.
The stream must be peekable; no bytes are consumed.
detect_bom (input_stream : Input_Stream) -> Unicode_BOM | Nothing =
assert input_stream.is_peekable
# 3 bytes is the length of the longest recognized BOM (UTF-8).
beginning = input_stream.peek_bytes 3
matching_bom = Unicode_BOM.all.find if_missing=Nothing bom->
expected_bytes = bom.as_bytes
expected_bytes == (beginning.take expected_bytes.length)
matching_bom

View File

@ -0,0 +1,28 @@
package org.enso.base;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
public class Stream_Utils {
public static byte[] peek(InputStream stream, int n) throws IOException {
assert n >= 0;
assert stream.markSupported();
byte[] buffer = new byte[n];
stream.mark(n + 1);
int offset = 0;
while (offset < n) {
int read = stream.read(buffer, offset, n - offset);
if (read == -1) {
break;
}
offset += read;
}
stream.reset();
if (offset < n) {
buffer = Arrays.copyOf(buffer, offset);
}
return buffer;
}
}

View File

@ -31,6 +31,10 @@ public class DecodingProblemAggregator {
}
}
public boolean hasEncounteredInvalidCharacters() {
return invalidUnitCount > 0;
}
private DecodingProblem summarizeInvalidCharacterProblems() {
if (invalidUnitCount == 0) {
return null;

View File

@ -1,213 +0,0 @@
package org.enso.base.encoding;
import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
/**
 * Strategy describing how the beginning of a stream should be inspected for a Unicode byte order
 * mark (BOM), given the charset requested by the user (or {@code null} for auto-detection).
 *
 * <p>Obtain an instance via {@link #fromCharset(Charset)} and then call {@code detectCharset} to
 * get the charset that should actually be used for decoding.
 */
public abstract sealed class EncodingRepresentation {
// The Unicode byte order marks recognized by this detection.
private static final byte[] UTF_8_BOM = new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
private static final byte[] UTF_16_BE_BOM = new byte[] {(byte) 0xFE, (byte) 0xFF};
private static final byte[] UTF_16_LE_BOM = new byte[] {(byte) 0xFF, (byte) 0xFE};
/**
 * Reads up to {@code n} bytes from the stream without consuming them, using mark/reset to rewind.
 *
 * <p>Note: if the stream holds fewer than {@code n} bytes, the returned array is still of length
 * {@code n} with its tail left zero-filled (it is not truncated). This is harmless for BOM
 * comparison here, since no BOM byte is zero.
 */
private static byte[] peekStream(BufferedInputStream stream, int n) throws IOException {
byte[] buffer = new byte[n];
stream.mark(n + 1);
int offset = 0;
while (offset < n) {
int read = stream.read(buffer, offset, n - offset);
if (read == -1) {
break;
}
offset += read;
}
stream.reset();
return buffer;
}
/** Consumes {@code n} bytes from the stream, tolerating an early end of stream. */
private static void skipStream(BufferedInputStream stream, int n) throws IOException {
try {
stream.skipNBytes(n);
} catch (EOFException eofException) {
// ignore early EOF
}
}
/** Checks whether {@code buffer} begins with all bytes of {@code prefix}. */
private static boolean startsWith(byte[] buffer, byte[] prefix) {
if (buffer.length < prefix.length) {
return false;
}
for (int i = 0; i < prefix.length; i++) {
if (buffer[i] != prefix[i]) {
return false;
}
}
return true;
}
/**
 * Selects the representation for the given charset; {@code null} requests auto-detection (the
 * {@link Default} representation).
 */
public static EncodingRepresentation fromCharset(Charset charset) {
if (charset == null) {
return new Default();
} else if (charset.equals(StandardCharsets.UTF_8)) {
return UTF8.INSTANCE;
} else if (charset.equals(StandardCharsets.UTF_16LE)) {
return UTF16.LittleEndian;
} else if (charset.equals(StandardCharsets.UTF_16BE)) {
return UTF16.BigEndian;
} else {
return new Other(charset);
}
}
/**
 * Detects the effective charset based on initial data present in the stream.
 *
 * @param stream the stream to detect the charset for. Initial BOM header may be deliberately
 * consumed by this method, so that it is ignored for further processing. No other data is
 * consumed (we only peek).
 * @param problemAggregator an aggregator to report any warnings about detected encoding, or
 * report context metadata to be used in future errors reported by other components.
 */
public abstract Charset detectCharset(
BufferedInputStream stream, DecodingProblemAggregator problemAggregator) throws IOException;
/** Auto-detection: picks UTF-8 or UTF-16 based on a BOM, defaulting to UTF-8 when none is found. */
private static final class Default extends EncodingRepresentation {
// Note: the Windows-1252 fallback is not implemented as part of this detection - it only allows
// to distinguish UTF BOMs.
@Override
public Charset detectCharset(
BufferedInputStream stream, DecodingProblemAggregator problemAggregator)
throws IOException {
byte[] beginning = peekStream(stream, 3);
if (startsWith(beginning, UTF_8_BOM)) {
skipStream(stream, UTF_8_BOM.length);
notifyContextAboutAssumedEncoding(problemAggregator, "UTF-8");
return StandardCharsets.UTF_8;
} else if (startsWith(beginning, UTF_16_BE_BOM)) {
skipStream(stream, UTF_16_BE_BOM.length);
notifyContextAboutAssumedEncoding(problemAggregator, "UTF-16 BE");
return StandardCharsets.UTF_16BE;
} else if (startsWith(beginning, UTF_16_LE_BOM)) {
skipStream(stream, UTF_16_LE_BOM.length);
notifyContextAboutAssumedEncoding(problemAggregator, "UTF-16 LE");
return StandardCharsets.UTF_16LE;
} else {
// If no BOM we fallback to UTF-8.
return StandardCharsets.UTF_8;
}
}
/** Records a message prefix so later invalid-character errors mention the assumed encoding. */
private static void notifyContextAboutAssumedEncoding(
DecodingProblemAggregator problemAggregator, String encodingName) {
String prefix =
"An "
+ encodingName
+ " BOM was detected, so "
+ encodingName
+ " encoding has been assumed, but some characters seem invalid: ";
problemAggregator.setInvalidCharacterErrorPrefix(prefix);
}
}
/** A non-Unicode charset used verbatim; no BOM handling applies. */
private static final class Other extends EncodingRepresentation {
private final Charset charset;
public Other(Charset charset) {
this.charset = charset;
}
@Override
public Charset detectCharset(
BufferedInputStream stream, DecodingProblemAggregator problemAggregator)
throws IOException {
// We ignore the stream as we just use the provided encoding as-is.
return charset;
}
}
/** Explicit UTF-8: consumes a UTF-8 BOM if present and warns when a UTF-16 BOM is found instead. */
private static final class UTF8 extends EncodingRepresentation {
static final UTF8 INSTANCE = new UTF8();
@Override
public Charset detectCharset(
BufferedInputStream stream, DecodingProblemAggregator problemAggregator)
throws IOException {
byte[] beginning = peekStream(stream, UTF_8_BOM.length);
if (startsWith(beginning, UTF_8_BOM)) {
skipStream(stream, UTF_8_BOM.length);
} else {
if (startsWith(beginning, UTF_16_BE_BOM) || startsWith(beginning, UTF_16_LE_BOM)) {
problemAggregator.reportOtherProblem("UTF-16 BOM has been found when decoding as UTF-8.");
}
}
return StandardCharsets.UTF_8;
}
}
/**
 * Explicit UTF-16 of a fixed endianness: consumes the matching BOM and warns when the
 * opposite-endian BOM is found.
 */
private static final class UTF16 extends EncodingRepresentation {
static final UTF16 BigEndian = new UTF16(StandardCharsets.UTF_16BE, Endianness.BigEndian);
static final UTF16 LittleEndian = new UTF16(StandardCharsets.UTF_16LE, Endianness.LittleEndian);
private final Charset charset;
private final byte[] expectedBOM;
private final byte[] flippedBOM;
private Endianness endianness;
private UTF16(Charset charset, Endianness endianness) {
this.charset = charset;
this.expectedBOM =
switch (endianness) {
case BigEndian -> UTF_16_BE_BOM;
case LittleEndian -> UTF_16_LE_BOM;
};
this.flippedBOM =
switch (endianness) {
case BigEndian -> UTF_16_LE_BOM;
case LittleEndian -> UTF_16_BE_BOM;
};
this.endianness = endianness;
}
@Override
public Charset detectCharset(
BufferedInputStream stream, DecodingProblemAggregator problemAggregator)
throws IOException {
assert expectedBOM.length == flippedBOM.length;
assert expectedBOM.length == 2;
byte[] beginning = peekStream(stream, 2);
if (startsWith(beginning, expectedBOM)) {
skipStream(stream, expectedBOM.length);
} else if (startsWith(beginning, flippedBOM)) {
problemAggregator.reportOtherProblem(
"Decoding as UTF-16 "
+ endianness
+ ", but a "
+ endianness.flip()
+ " BOM has been found.");
}
return charset;
}
/** Byte order of a UTF-16 variant; used when building user-facing messages. */
enum Endianness {
BigEndian,
LittleEndian;
@Override
public String toString() {
return switch (this) {
case BigEndian -> "Big Endian";
case LittleEndian -> "Little Endian";
};
}
public Endianness flip() {
return switch (this) {
case BigEndian -> LittleEndian;
case LittleEndian -> BigEndian;
};
}
}
}
}

View File

@ -1,7 +1,5 @@
package org.enso.base.encoding;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
@ -13,7 +11,6 @@ import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.Arrays;
import java.util.List;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.function.IntFunction;
@ -97,89 +94,25 @@ public class Encoding_Utils {
}
/**
* Converts an array of encoded bytes into a string.
* Checks if the following stream can be decoded without errors using the given charset.
*
* @param bytes the bytes to convert
* @param charset the character set to use for decoding, use {@code null} to try auto-detection
* @return the resulting string, and any potential problems
* <p>The stream is (partially or wholly) consumed as part of this process.
*/
public static WithProblems<String, DecodingProblem> from_bytes(byte[] bytes, Charset charset) {
if (bytes == null || bytes.length == 0) {
return new WithProblems<>("", List.of());
}
public static boolean canDecodeWithoutErrors(InputStream stream, Charset charset)
throws IOException {
DecodingProblemAggregator problemAggregator = new DecodingProblemAggregator();
ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes);
ReportingStreamDecoder decoder;
try {
decoder = create_stream_decoder(inputStream, charset, problemAggregator, true);
} catch (IOException e) {
throw new IllegalStateException(
"Unexpected IO exception in internal code: " + e.getMessage(), e);
}
CharBuffer out = CharBuffer.allocate((int) (bytes.length * decoder.averageCharsPerByte()));
try {
int n;
do {
if (!out.hasRemaining()) {
out = resize(out, CharBuffer::allocate, CharBuffer::put);
try (var decoder = new ReportingStreamDecoder(stream, charset, problemAggregator, true)) {
char[] tmpBuffer = new char[1024];
while (decoder.read(tmpBuffer) >= 0) {
if (problemAggregator.hasEncounteredInvalidCharacters()) {
// early exit - no need to process the stream any further
return false;
}
// read is already polling safepoints so we don't have to
n = decoder.read(out);
} while (n >= 0);
} catch (IOException e) {
throw new IllegalStateException("Unexpected exception: " + e.getMessage(), e);
}
}
out.flip();
return new WithProblems<>(out.toString(), problemAggregator.summarize());
}
/**
* Creates a new instance of {@code ReportingStreamDecoder} decoding a given charset.
*
* @param stream the input stream to decode
* @param charset the character set to use for decoding, use {@code null} to try auto-detection
* @param pollSafepoints whether to poll for safepoints during decoding. This should be true if
* the decoding will run on the main thread, and false otherwise.
*/
private static ReportingStreamDecoder create_stream_decoder(
InputStream stream,
Charset charset,
DecodingProblemAggregator problemAggregator,
boolean pollSafepoints)
throws IOException {
BufferedInputStream bufferedStream = new BufferedInputStream(stream);
EncodingRepresentation representation = EncodingRepresentation.fromCharset(charset);
// This may also advance the stream past the BOM
Charset detectedCharset = representation.detectCharset(bufferedStream, problemAggregator);
return new ReportingStreamDecoder(
bufferedStream, detectedCharset, problemAggregator, pollSafepoints);
}
/**
* A helper function which runs an action with a created stream decoder and closes it afterwards.
*
* <p>It returns the result returned from the executed action and any encoding problems that
* occurred when processing it.
*
* @param stream the input stream to decode
* @param charset the character set to use for decoding, use {@code null} to try auto-detection
* @param action the action to run with the created decoder
* @return the result of the action and any problems that occurred during decoding
*/
public static WithProblems<Value, DecodingProblem> with_stream_decoder(
InputStream stream, Charset charset, Function<ReportingStreamDecoder, Value> action)
throws IOException {
DecodingProblemAggregator problemAggregator = new DecodingProblemAggregator();
Value result;
ReportingStreamDecoder decoder =
create_stream_decoder(stream, charset, problemAggregator, false);
try (decoder) {
result = action.apply(decoder);
}
return new WithProblems<>(result, problemAggregator.summarize());
// Check one more time after EOF
return !problemAggregator.hasEncounteredInvalidCharacters();
}
/** Creates a new instance of {@code ReportingStreamEncoder} encoding a given charset. */

View File

@ -2,8 +2,8 @@ package org.enso.base.encoding;
import static org.enso.base.encoding.Encoding_Utils.INVALID_CHARACTER;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
@ -30,7 +30,7 @@ public class ReportingStreamDecoder extends Reader {
return decoder.averageCharsPerByte();
}
private final BufferedInputStream bufferedInputStream;
private final InputStream inputStream;
private final CharsetDecoder decoder;
private final Charset charset;
@ -39,11 +39,11 @@ public class ReportingStreamDecoder extends Reader {
}
public ReportingStreamDecoder(
BufferedInputStream stream,
InputStream stream,
Charset charset,
DecodingProblemAggregator problemAggregator,
boolean pollSafepoints) {
bufferedInputStream = stream;
inputStream = stream;
this.charset = charset;
this.decoder =
charset
@ -55,6 +55,23 @@ public class ReportingStreamDecoder extends Reader {
this.pollSafepoints = pollSafepoints;
}
/**
 * Decodes the entire input stream into a String, accumulating all decoded text in memory.
 *
 * <p>The initial buffer capacity is estimated from {@code InputStream.available()}; since
 * {@code available()} may legitimately return 0 (e.g. for non-file streams), the
 * {@code Math.max(..., 16)} guard ensures a usable starting capacity. The buffer is grown as
 * needed, so streams larger than the initial estimate are still fully decoded.
 *
 * @return the fully decoded text
 * @throws IOException if reading from the underlying stream fails
 */
public String readAllIntoMemory() throws IOException {
  int initialCapacity = Math.max((int) (inputStream.available() * averageCharsPerByte()), 16);
  CharBuffer out = CharBuffer.allocate(initialCapacity);
  int n;
  do {
    if (!out.hasRemaining()) {
      // Output buffer is full but input may remain - grow it, copying decoded chars over.
      out = Encoding_Utils.resize(out, CharBuffer::allocate, CharBuffer::put);
    }
    // read is already polling safepoints so we don't have to
    n = this.read(out);
  } while (n >= 0);
  out.flip();
  return out.toString();
}
/**
* Currently there is no easy way to check if a Context is available in the current thread and we
* can use safepoints or not. The issue tracking this feature can be found at: <a
@ -245,7 +262,7 @@ public class ReportingStreamDecoder extends Reader {
int bytesToRead = Math.max(expectedInputSize - bufferedInput, 1);
ensureWorkArraySize(bytesToRead);
int bytesActuallyRead = bufferedInputStream.read(workArray, 0, bytesToRead);
int bytesActuallyRead = inputStream.read(workArray, 0, bytesToRead);
if (bytesActuallyRead == -1) {
eof = true;
}
@ -383,6 +400,9 @@ public class ReportingStreamDecoder extends Reader {
@Override
public void close() throws IOException {
bufferedInputStream.close();
inputStream.close();
inputBuffer = null;
outputBuffer = null;
workArray = null;
}
}

View File

@ -0,0 +1,25 @@
package org.enso.base_test_helpers;
import java.io.IOException;
import java.io.InputStream;
/** A generic stream used for tests. */
public class RangeStream extends InputStream {
private final int end;
private int current;
public RangeStream(int start, int end) {
assert 0 <= start && start <= end;
this.end = end;
this.current = start;
}
@Override
public int read() throws IOException {
if (current >= end) {
return -1;
} else {
return current++ % 256;
}
}
}

View File

@ -69,7 +69,6 @@ add_specs suite_builder =
Problems.get_attached_warnings default_warning . should_contain_the_same_elements_as problems
suite_builder.group "Default Encoding" group_builder->
pending_fallback = "Fallback to Windows-1252 is not implemented yet."
group_builder.specify "should try reading as UTF-8 by default" <|
bytes = [65, -60, -123, -60, -103]
# A ą ę
@ -113,7 +112,7 @@ add_specs suite_builder =
bytes.length . should_equal 4*2
txt.length . should_equal 3
group_builder.specify "falls back to Windows-1252 if invalid Unicode is detected and no BOM" pending=pending_fallback <|
group_builder.specify "falls back to Windows-1252 if invalid Unicode is detected and no BOM" <|
# These are not valid bytes for UTF-8
bytes = [-30, -55, -1]
Text.from_bytes bytes Encoding.utf_8 . should_fail_with Encoding_Error
@ -148,7 +147,6 @@ add_specs suite_builder =
empty.should_equal ""
Problems.assume_no_problems empty
group_builder.specify "should work on 0 or 1 byte input (TODO merge with above)" pending=pending_fallback <|
txt = Text.from_bytes [-1] Encoding.default Problem_Behavior.Report_Warning
txt.should_equal 'ÿ'
# No problems, as falling back to Windows-1252.
@ -166,8 +164,6 @@ add_specs suite_builder =
([-1, -2] + [65, 0, 5, 1, 25, 1]).write_bytes f Existing_File_Behavior.Overwrite . should_succeed
f.read . should_equal "Aąę"
group_builder.specify "Default Encoding heuristics also work in File.read (TODO merge with above)" pending=pending_fallback <|
f = File.create_temporary_file "utf-heuristics" ".txt"
# Fallback to Windows-1252
([-30, -55, -1]).write_bytes f Existing_File_Behavior.Overwrite . should_succeed
f.read . should_equal "âÉÿ"

View File

@ -0,0 +1,63 @@
from Standard.Base import all
import Standard.Base.Errors.Illegal_State.Illegal_State
import Standard.Base.Runtime.Managed_Resource.Managed_Resource
import Standard.Base.System.Input_Stream.Input_Stream
from Standard.Test import all
polyglot java import org.enso.base_test_helpers.RangeStream
# Entry point: builds the Input Stream test suite and runs it,
# optionally restricted to specs matching `filter`.
main filter=Nothing =
    suite = Test.build suite_builder->
        add_specs suite_builder
    suite.run_with_filter filter
# Registers the "Input Stream" test group, covering `peek_bytes`, `skip_n_bytes`
# and promotion of generic (non-peekable) streams via `as_peekable_stream`.
add_specs suite_builder = suite_builder.group "Input Stream" group_builder->
    group_builder.specify "should be peekable if backed by memory" <|
        Managed_Resource.bracket (Input_Stream.from_bytes [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) (.close) stream->
            stream.is_peekable . should_be_true
            stream.peek_bytes 3 . should_equal [1, 2, 3]
            # Peeking is idempotent - repeating it yields the same bytes.
            stream.peek_bytes 3 . should_equal [1, 2, 3]
            # After the peek operation, read still starts from beginning
            stream.read_n_bytes 3 . should_equal [1, 2, 3]
            # Further peek after a read, starts from where the next read would start
            stream.peek_bytes 3 . should_equal [4, 5, 6]
            stream.read_n_bytes 3 . should_equal [4, 5, 6]
            stream.read_n_bytes 5 . should_equal [7, 8, 9, 10]
    group_builder.specify "should allow to skip bytes" <|
        Managed_Resource.bracket (Input_Stream.from_bytes [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) (.close) stream->
            stream.skip_n_bytes 3
            stream.read_n_bytes 1 . should_equal [4]
            # Peeking does not consume, so skipping afterwards starts from byte 5.
            stream.peek_bytes 3 . should_equal [5, 6, 7]
            stream.skip_n_bytes 3
            stream.read_n_bytes 4 . should_equal [8, 9, 10]
    group_builder.specify "should not be peekable if generic stream is provided" <|
        error_handler x = x
        generic_stream = Input_Stream.new (RangeStream.new 100 120) error_handler
        generic_stream.is_peekable . should_be_false
        # Peeking an unsupported stream is a logic error, reported as Illegal_State.
        generic_stream.peek_bytes 3 . should_fail_with Illegal_State
    group_builder.specify "should be possible to make peekable" <|
        error_handler x = x
        generic_stream = Input_Stream.new (RangeStream.new 100 120) error_handler
        generic_stream.read_n_bytes 4 . should_equal [100, 101, 102, 103]
        promoted_stream = generic_stream.as_peekable_stream
        # The new stream starts at the same position as the old one was left
        promoted_stream.peek_bytes 4 . should_equal [104, 105, 106, 107]
        promoted_stream.read_n_bytes 4 . should_equal [104, 105, 106, 107]
        promoted_stream.peek_bytes 2 . should_equal [108, 109]
        promoted_stream.peek_bytes 2 . should_equal [108, 109]
    group_builder.specify "should allow to peek beyond EOF, still correctly restarting afterwards" <|
        error_handler x = x
        generic_stream = Input_Stream.new (RangeStream.new 100 105) error_handler
        promoted_stream = generic_stream.as_peekable_stream
        # Only 5 bytes exist, so peeking 10 returns just those 5.
        promoted_stream.peek_bytes 10 . should_equal [100, 101, 102, 103, 104]
        promoted_stream.peek_bytes 10 . should_equal [100, 101, 102, 103, 104]
        # The read still succeeds - ensuring there isn't some early EOF
        promoted_stream.read_n_bytes 10 . should_equal [100, 101, 102, 103, 104]

View File

@ -0,0 +1,87 @@
from Standard.Base import all
import Standard.Base.Errors.Illegal_State.Illegal_State
import Standard.Base.Runtime.Managed_Resource.Managed_Resource
import Standard.Base.System.File.Advanced.Temporary_File.Temporary_File
import Standard.Base.System.Input_Stream.Input_Stream
from Standard.Test import all
polyglot java import org.enso.base_test_helpers.RangeStream
# Entry point: builds the Restartable Input Stream test suite and runs it,
# optionally restricted to specs matching `filter`.
main filter=Nothing =
    suite = Test.build suite_builder->
        add_specs suite_builder
    suite.run_with_filter filter
# Registers the "Restartable Input Stream" test group, covering the various backing
# strategies (in-memory bytes, temporary file, existing file) of `as_restartable_stream`.
add_specs suite_builder = suite_builder.group "Restartable Input Stream" group_builder->
    group_builder.specify "should allow to restart a generic stream by caching its data" <|
        error_handler x = x
        generic_stream = Input_Stream.new (RangeStream.new 100 120) error_handler
        generic_stream.read_n_bytes 3 . should_equal [100, 101, 102]
        # A generic stream is frozen in its current state - the first few bytes that were already read will be lost
        restartable_stream = generic_stream.as_restartable_stream
        restartable_stream.with_fresh_stream stream->
            stream.read_n_bytes 4 . should_equal [103, 104, 105, 106]
        # Restarting yields the same bytes again from the frozen position.
        restartable_stream.with_fresh_stream stream->
            stream.read_n_bytes 4 . should_equal [103, 104, 105, 106]
        # Small stream is backed by memory:
        restartable_stream.to_text . should_contain "From_Bytes"
    group_builder.specify "will fall back to caching in a temporary file for a very large stream" <|
        error_handler x = x
        # 1MB
        generic_stream = Input_Stream.new (RangeStream.new 1 1000000) error_handler
        restartable_stream = generic_stream.as_restartable_stream
        restartable_stream.with_fresh_stream stream->
            stream.read_n_bytes 4 . should_equal [1, 2, 3, 4]
        restartable_stream.to_text . should_contain "From_Temporary_File"
    group_builder.specify "will re-use the original byte array if it was known" <|
        byte_stream = Input_Stream.from_bytes [1, 2, 3, 4, 5]
        byte_stream.read_n_bytes 3 . should_equal [1, 2, 3]
        restartable_stream = byte_stream.as_restartable_stream
        restartable_stream.with_fresh_stream stream->
            # All bytes are preserved
            stream.read_n_bytes 4 . should_equal [1, 2, 3, 4]
        restartable_stream.to_text . should_contain "From_Bytes"
        restartable_stream.with_fresh_stream stream->
            stream.read_n_bytes 4 . should_equal [1, 2, 3, 4]
    group_builder.specify "will be backed by a file, if a file stream is converted and extend_lifetime=False" <|
        file = File.create_temporary_file "test" ".bin"
        [10, 20, 30, 40, 50].write_bytes file . should_succeed
        restartable_stream = file.with_input_stream [File_Access.Read] file_stream->
            file_stream.as_restartable_stream extend_lifetime=False
        restartable_stream.with_fresh_stream stream->
            stream.read_n_bytes 3 . should_equal [10, 20, 30]
        restartable_stream.to_text . should_contain "From_Existing_File"
        restartable_stream.with_fresh_stream stream->
            stream.read_n_bytes 3 . should_equal [10, 20, 30]
        # However, if backed by existing file - the stream is prone to file modifications that happen in the meantime
        [11, 12, 13, 14].write_bytes file on_existing_file=Existing_File_Behavior.Overwrite . should_succeed
        restartable_stream.with_fresh_stream stream->
            stream.read_n_bytes 3 . should_equal [11, 12, 13]
    group_builder.specify "will not be tied to the original backing file if extend_lifetime=True (default)" <|
        file = File.create_temporary_file "test" ".bin"
        [10, 20, 30, 40, 50].write_bytes file . should_succeed
        restartable_stream = file.with_input_stream [File_Access.Read] file_stream->
            file_stream.as_restartable_stream
        restartable_stream.with_fresh_stream stream->
            stream.read_n_bytes 3 . should_equal [10, 20, 30]
        # Modify backing file
        [11, 12, 13, 14].write_bytes file on_existing_file=Existing_File_Behavior.Overwrite . should_succeed
        # The stream still yields old values because it was untied:
        restartable_stream.with_fresh_stream stream->
            stream.read_n_bytes 3 . should_equal [10, 20, 30]

View File

@ -92,37 +92,47 @@ add_specs suite_builder =
tmp.to_text . should_not_contain "pref"
tmp.to_text . should_not_contain "suf"
group_builder.specify "should allow to materialize an input stream that is already associated with a temporary file without copying it" <|
group_builder.specify "should allow to materialize an input stream that is already associated with a temporary file without copying it (low-level)" <|
tmp = Temporary_File.new
tmp.with_file f->
"test payload 3" . write f
java_file = Java_File.new tmp.unsafe_get.absolute.path
stream = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_file=tmp
stream = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_source=tmp
tmp2 = Temporary_File.from_stream_light stream
# The returned tmp file should be the same one as original.
tmp2.should_be_a Temporary_File
tmp2.unsafe_get.absolute.path . should_equal tmp.unsafe_get.absolute.path
tmp2.should_equal tmp
# If the raw file is associated, the stream will return that File descriptor (not as temporary file, but regular one):
stream3 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_file=tmp.unsafe_get
stream3 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_source=tmp.unsafe_get
f3 = Temporary_File.from_stream_light stream3
f3.should_be_a File
f3.absolute.path . should_equal tmp.unsafe_get.absolute.path
# But if there's no association, a new temporary file gets created:
stream4 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_file=Nothing
stream4 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_source=Nothing
tmp4 = Temporary_File.from_stream_light stream4
tmp4.should_be_a Temporary_File
tmp4.unsafe_get.absolute.path . should_not_equal tmp.unsafe_get.absolute.path
# The base variant of from_stream also always copies:
stream5 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_file=tmp
stream5 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_source=tmp
tmp5 = Temporary_File.from_stream stream5
tmp5.should_be_a Temporary_File
tmp5.unsafe_get.absolute.path . should_not_equal tmp.unsafe_get.absolute.path
group_builder.specify "should allow to materialize an input stream opened to a temporary file without copying it" pending="Currently the file stream does not know it came from Temporary_File. May be revisited in the future to automatically correlate it." <|
tmp = Temporary_File.new
tmp.with_file f->
"test payload 4" . write f
tmp2 = tmp.with_file f-> f.with_input_stream [File_Access.Read] input_stream->
Temporary_File.from_stream_light input_stream
# The returned file should be the same as original
tmp2.should_equal tmp
make_stream text =
raw_stream = ByteArrayInputStream.new text.utf_8
Input_Stream.new raw_stream (File_Error.handle_java_exceptions Nothing)

View File

@ -173,7 +173,8 @@ add_specs suite_builder =
utf8_bytes = [97, 44, 98, 44, 99, 10, -60, -123, 44, -17, -65, -65, 44, -61, 40, -61, 40, 10]
utf8_bytes.write_bytes utf8_file
action_1 on_problems =
utf8_file.read (Delimited_Format.Delimited "," headers=True) on_problems
# We need to set the encoding explicitly, as otherwise we'd just fallback to Windows-1252 and have no errors
utf8_file.read (Delimited_Format.Delimited "," headers=True encoding=Encoding.utf_8) on_problems
tester_1 table =
table.columns.map .name . should_equal ['a', 'b', 'c']
table.at 'a' . to_vector . should_equal ['ą']
@ -522,6 +523,18 @@ add_specs suite_builder =
r.first_column.to_vector . should_equal ['\uFFFD']
Problems.expect_only_warning Encoding_Error r
group_builder.specify "should fall back to Windows-1252 encoding if invalid UTF-8 characters are encountered in Default encoding" <|
f = File.create_temporary_file "delimited-invalid-utf-8" ".csv"
# On the simple characters all three encodings (ASCII, UTF-8 and Win-1252) agree, so we can use ASCII bytes.
bytes = ('A,B\n1,y'.bytes Encoding.ascii) + [-1] + ('z\n2,-'.bytes Encoding.ascii)
bytes.write_bytes f . should_succeed
r = f.read
r.should_be_a Table
r.column_names.should_equal ["A", "B"]
r.at "A" . to_vector . should_equal [1, 2]
# We fallback to Win-1252 where byte -1 means ÿ
r.at "B" . to_vector . should_equal ["yÿz", "-"]
main filter=Nothing =
suite = Test.build suite_builder->
add_specs suite_builder

View File

@ -577,6 +577,22 @@ add_specs suite_builder =
t.at "A" . to_vector . should_equal [1, 2, 3, 4]
t.at "B" . to_vector . should_equal ["ąęćś", "💯", "żółw", "🐢"]
Test.with_clue "Windows-1252 fallback: " <|
initial_bytes = 'A,B\n1,a\n2,b\n3,¥\n4,d'.bytes Encoding.windows_1252
initial_bytes.write_bytes f on_existing_file=Existing_File_Behavior.Overwrite . should_succeed
r = (Table.new [["A", [5, 6]], ["B", ["æ", "💯"]]]).write f on_existing_file=Existing_File_Behavior.Append
# Should report problem writing the 💯 character, because selected Windows-1252 encoding does not support it
Problems.expect_only_warning Encoding_Error r
# Now we read in auto mode, the part appended to the table should have been correctly written in Windows-1252
t = f.read
t.should_be_a Table
t.column_names . should_equal ["A", "B"]
t.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6]
# The æ should have been correctly written in Win-1252, the 💯 is replaced by `?`.
t.at "B" . to_vector . should_equal ["a", "b", "¥", "d", "æ", "?"]
group_builder.specify "should fail if the target file is read-only" <|
f = enso_project.data / "transient" / "permission.csv"
f.delete_if_exists
@ -635,4 +651,3 @@ main filter=Nothing =
suite = Test.build suite_builder->
add_specs suite_builder
suite.run_with_filter filter