mirror of
https://github.com/enso-org/enso.git
synced 2024-11-22 11:52:59 +03:00
Implement Windows-1252 fallback logic for Encoding.Default
(#10190)
- Closes #10148 - [x] Tests for `Restartable_Input_Stream`, `peek_bytes` and `skip_n_bytes`. - [x] Report `Managed_Resource` stack overflow bug: #10211 - [x] Followup possible optimization: #10220 - [x] Test use-case from blog.
This commit is contained in:
parent
d938c96c55
commit
41d02e95ef
@ -16,6 +16,7 @@
|
||||
operations.][9950]
|
||||
- [Implemented `.cast` to and from `Decimal` columns for the in-memory
|
||||
database.][10206]
|
||||
- [Implemented fallback to Windows-1252 encoding for `Encoding.Default`.][10190]
|
||||
|
||||
[debug-shortcuts]:
|
||||
|
||||
@ -23,6 +24,7 @@
|
||||
[10122]: https://github.com/enso-org/enso/pull/10122
|
||||
[10130]: https://github.com/enso-org/enso/pull/10130
|
||||
[10206]: https://github.com/enso-org/enso/pull/10206
|
||||
[10190]: https://github.com/enso-org/enso/pull/10190
|
||||
|
||||
<br/>![Release Notes](/docs/assets/tags/release_notes.svg)
|
||||
|
||||
|
@ -62,7 +62,10 @@ type Encoding
|
||||
encoding. Otherwise, the input is decoded using UTF-8 unless it contains
|
||||
invalid UTF-8 sequences, in which case Windows-1252 is used as a fallback.
|
||||
|
||||
When used for encoding, it will always encode using UTF-8.
|
||||
When used for encoding, it will either use the same encoding detection
|
||||
heuristics as in read in case of Append mode. When writing a new file,
|
||||
it will always use UTF-8.
|
||||
|
||||
This encoding cannot be passed to some functions that require a Java
|
||||
Charset.
|
||||
default -> Encoding =
|
||||
|
@ -33,6 +33,7 @@ import project.Errors.Time_Error.Time_Error
|
||||
import project.Meta
|
||||
import project.Nothing.Nothing
|
||||
import project.Panic.Panic
|
||||
import project.System.Internal.Reporting_Stream_Decoder_Helper
|
||||
from project.Data.Boolean import Boolean, False, True
|
||||
from project.Data.Json import Invalid_JSON, JS_Object, Json
|
||||
from project.Data.Numbers import Float, Integer, Number, Number_Parse_Error
|
||||
@ -763,11 +764,7 @@ Text.bytes self (encoding : Encoding = Encoding.utf_8) (on_problems : Problem_Be
|
||||
@encoding Encoding.default_widget
|
||||
Text.from_bytes : Vector Integer -> Encoding -> Problem_Behavior -> Text
|
||||
Text.from_bytes bytes (encoding : Encoding = Encoding.default) (on_problems : Problem_Behavior = Problem_Behavior.Report_Error) =
|
||||
result = Encoding_Utils.from_bytes bytes encoding.to_java_charset_or_null : WithProblems
|
||||
if result.problems.is_empty then result.result else
|
||||
problems = result.problems.map decoding_problem->
|
||||
Encoding_Error.Error decoding_problem.message
|
||||
on_problems.attach_problems_after result.result problems
|
||||
Reporting_Stream_Decoder_Helper.decode_bytes_to_text bytes encoding on_problems
|
||||
|
||||
## ICON convert
|
||||
Returns a vector containing bytes representing the UTF-8 encoding of the
|
||||
|
@ -29,13 +29,13 @@ import project.System.File_Format.Infer
|
||||
import project.System.File_Format.Plain_Text_Format
|
||||
import project.System.File_Format_Metadata.File_Format_Metadata
|
||||
import project.System.Input_Stream.Input_Stream
|
||||
import project.System.Advanced.Restartable_Input_Stream.Restartable_Input_Stream
|
||||
from project.Data.Boolean import Boolean, False, True
|
||||
from project.Data.Text.Extensions import all
|
||||
from project.Metadata.Choice import Option
|
||||
from project.Metadata.Widget import Single_Choice
|
||||
from project.System.File_Format import format_types
|
||||
|
||||
polyglot java import java.io.ByteArrayInputStream
|
||||
polyglot java import java.io.InputStream
|
||||
|
||||
## PRIVATE
|
||||
@ -62,24 +62,15 @@ type Response_Body
|
||||
Raw_Stream (raw_stream:Input_Stream) (metadata:File_Format_Metadata) uri:URI
|
||||
|
||||
## PRIVATE
|
||||
Materialized_Byte_Array (bytes:Vector) (metadata:File_Format_Metadata) uri:URI
|
||||
|
||||
## PRIVATE
|
||||
Materialized_Temporary_File (temporary_file:Temporary_File) (metadata:File_Format_Metadata) uri:URI
|
||||
Materialized_Stream (restartable_stream:Restartable_Input_Stream) (metadata:File_Format_Metadata) uri:URI
|
||||
|
||||
## PRIVATE
|
||||
with_stream : (Input_Stream -> Any ! HTTP_Error) -> Any ! HTTP_Error
|
||||
with_stream self action = case self of
|
||||
Response_Body.Raw_Stream raw_stream _ _ ->
|
||||
Managed_Resource.bracket raw_stream (_.close) action
|
||||
Response_Body.Materialized_Byte_Array bytes _ _ ->
|
||||
byte_stream = Input_Stream.new (ByteArrayInputStream.new bytes) (HTTP_Error.handle_java_exceptions self.uri)
|
||||
Managed_Resource.bracket byte_stream (_.close) action
|
||||
Response_Body.Materialized_Temporary_File temporary_file _ _ -> temporary_file.with_file file->
|
||||
opts = [File_Access.Read.to_java]
|
||||
stream = HTTP_Error.handle_java_exceptions self.uri (file.input_stream_builtin opts)
|
||||
file_stream = Input_Stream.new stream (HTTP_Error.handle_java_exceptions self.uri) associated_file=temporary_file
|
||||
Managed_Resource.bracket (file_stream) (_.close) action
|
||||
Response_Body.Materialized_Stream restartable_stream _ _ ->
|
||||
restartable_stream.with_fresh_stream action
|
||||
|
||||
## PRIVATE
|
||||
ADVANCED
|
||||
@ -87,25 +78,11 @@ type Response_Body
|
||||
return a new Response_Body.
|
||||
materialize : Input_Stream
|
||||
materialize self = case self of
|
||||
Response_Body.Raw_Stream _ _ _ ->
|
||||
self.with_stream body_stream->
|
||||
body_stream.with_java_stream body_java_stream->
|
||||
first_block = body_java_stream.readNBytes maximum_body_in_memory
|
||||
case first_block.length < maximum_body_in_memory of
|
||||
True -> Response_Body.Materialized_Byte_Array (Vector.from_polyglot_array first_block) self.metadata self.uri
|
||||
False -> Context.Output.with_enabled <|
|
||||
## Write contents to a temporary file
|
||||
temp_file = Temporary_File.new self.uri.host
|
||||
r = temp_file.with_file file->
|
||||
file.with_output_stream [File_Access.Write, File_Access.Create, File_Access.Truncate_Existing] output_stream->
|
||||
output_stream.with_java_stream java_output_stream->
|
||||
java_output_stream.write first_block
|
||||
body_java_stream.transferTo java_output_stream
|
||||
java_output_stream.flush
|
||||
Nothing
|
||||
r.if_not_error <|
|
||||
Response_Body.Materialized_Temporary_File temp_file self.metadata self.uri
|
||||
_ -> self
|
||||
Response_Body.Raw_Stream _ metadata uri ->
|
||||
restartable_stream = self.with_stream body_stream->
|
||||
body_stream.as_restartable_stream
|
||||
Response_Body.Materialized_Stream restartable_stream metadata uri
|
||||
Response_Body.Materialized_Stream _ _ _ -> self
|
||||
|
||||
## ALIAS parse
|
||||
GROUP Input
|
||||
|
@ -37,8 +37,12 @@ type Managed_Resource
|
||||
function once it is no longer in use.
|
||||
|
||||
Arguments:
|
||||
- resource: The resource to register.
|
||||
- function: The action to be executed on resource to clean it up when
|
||||
it is no longer in use.
|
||||
|
||||
Returns:
|
||||
A `Managed_Resource` object that can be used to access the resource.
|
||||
register : Any -> (Any -> Nothing) -> Managed_Resource
|
||||
register resource function = @Builtin_Method "Managed_Resource.register"
|
||||
|
||||
|
@ -0,0 +1,105 @@
|
||||
import project.Any.Any
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Nothing.Nothing
|
||||
import project.Runtime.Context
|
||||
import project.Runtime.Managed_Resource.Managed_Resource
|
||||
import project.System.File.Advanced.Temporary_File.Temporary_File
|
||||
import project.System.File.File
|
||||
import project.System.File.File_Access.File_Access
|
||||
import project.System.Input_Stream.Input_Stream
|
||||
from project.Data.Boolean import Boolean, False, True
|
||||
|
||||
## PRIVATE
|
||||
An input stream that can be read multiple times.
|
||||
|
||||
It may be useful when multiple passes over the data are required.
|
||||
If you need to check only the beginning of the stream, consider using a much
|
||||
lighter `Input_Stream.as_peekable_stream`.
|
||||
|
||||
A generic stream can be converted to `Restartable_Input_Stream` by reading
|
||||
all its contents and storing them either in memory or in a temporary file.
|
||||
A stream backed by an existing file can be converted to
|
||||
`Restartable_Input_Stream` at no cost.
|
||||
|
||||
! Stream Lifetime
|
||||
|
||||
Note that if we use an existing file as a shortcut to avoid copying the
|
||||
data, we need to assume that the file will not be modified in the meantime.
|
||||
Thus the `Restartable_Input_Stream` does not fully guarantee immutability
|
||||
of the data. The lifetime of such `Restartable_Input_Stream` is tied to the
|
||||
lifetime of its backing file.
|
||||
|
||||
If the stream should stay usable for a longer time, `extend_lifetime=True`
|
||||
should be passed when creating it.
|
||||
type Restartable_Input_Stream
|
||||
## PRIVATE
|
||||
`bytes` may be a Vector or a raw `byte[]` array (convertible to vector, but no annotation to avoid conversions).
|
||||
private From_Bytes bytes
|
||||
|
||||
## PRIVATE
|
||||
private From_Existing_File file:File
|
||||
|
||||
## PRIVATE
|
||||
private From_Temporary_File temporary_file:Temporary_File
|
||||
|
||||
## PRIVATE
|
||||
to_text self -> Text =
|
||||
suffix = case self of
|
||||
Restartable_Input_Stream.From_Bytes _ -> "From_Bytes"
|
||||
Restartable_Input_Stream.From_Existing_File file -> "From_Existing_File "+file.to_text
|
||||
Restartable_Input_Stream.From_Temporary_File _ -> "From_Temporary_File"
|
||||
"Restartable_Input_Stream."+suffix
|
||||
|
||||
## PRIVATE
|
||||
make (input_stream : Input_Stream) (extend_lifetime : Boolean) -> Restartable_Input_Stream =
|
||||
case input_stream.associated_source of
|
||||
temp_file : Temporary_File -> Restartable_Input_Stream.From_Temporary_File temp_file
|
||||
file : File ->
|
||||
if extend_lifetime then cache_generic_input_stream input_stream else
|
||||
Restartable_Input_Stream.From_Existing_File file
|
||||
bytes : Vector -> Restartable_Input_Stream.From_Bytes bytes
|
||||
_ -> cache_generic_input_stream input_stream
|
||||
|
||||
## PRIVATE
|
||||
Runs the provided action with a fresh input stream pointing to the
|
||||
beginning of the data represented by this stream.
|
||||
|
||||
This method may be called multiple times, allowing multiple 'rounds' of
|
||||
processing.
|
||||
with_fresh_stream self (action : Input_Stream -> Any) -> Any =
|
||||
case self of
|
||||
Restartable_Input_Stream.From_Bytes bytes ->
|
||||
Managed_Resource.bracket (Input_Stream.from_bytes bytes) (.close) action
|
||||
Restartable_Input_Stream.From_Existing_File file ->
|
||||
file.with_input_stream [File_Access.Read] action
|
||||
Restartable_Input_Stream.From_Temporary_File temp_file ->
|
||||
temp_file.with_file file->
|
||||
file.with_input_stream [File_Access.Read] action
|
||||
|
||||
## PRIVATE
|
||||
Maximum size for a stream to be held in memory.
|
||||
If the amount of data exceeds this limit, it will be stored in a temporary file.
|
||||
max_in_memory_size =
|
||||
# 64 KiB
|
||||
64 * 1024
|
||||
|
||||
## PRIVATE
|
||||
private cache_generic_input_stream (input_stream : Input_Stream) -> Restartable_Input_Stream =
|
||||
input_stream.with_java_stream java_input_stream->
|
||||
first_block = java_input_stream.readNBytes max_in_memory_size
|
||||
case first_block.length < max_in_memory_size of
|
||||
True ->
|
||||
Restartable_Input_Stream.From_Bytes first_block
|
||||
False ->
|
||||
Context.Output.with_enabled <|
|
||||
temp_file = Temporary_File.new "restartable-input-stream"
|
||||
r = temp_file.with_file file->
|
||||
file.with_output_stream [File_Access.Write, File_Access.Create, File_Access.Truncate_Existing] output_stream->
|
||||
output_stream.with_java_stream java_output_stream->
|
||||
java_output_stream.write first_block
|
||||
java_input_stream.transferTo java_output_stream
|
||||
java_output_stream.flush
|
||||
Nothing
|
||||
r.if_not_error <|
|
||||
Restartable_Input_Stream.From_Temporary_File temp_file
|
@ -246,11 +246,11 @@ type File
|
||||
file.with_input_stream [File_Access.Create, File_Access.Read] action
|
||||
with_input_stream : Vector File_Access -> (Input_Stream -> Any ! File_Error) -> Any ! File_Error
|
||||
with_input_stream self (open_options : Vector) action =
|
||||
new_input_stream : File -> Vector File_Access -> Output_Stream ! File_Error
|
||||
new_input_stream : File -> Vector File_Access -> Input_Stream ! File_Error
|
||||
new_input_stream file open_options =
|
||||
opts = open_options . map (_.to_java)
|
||||
stream = File_Error.handle_java_exceptions file (file.input_stream_builtin opts)
|
||||
Input_Stream.new stream (File_Error.handle_java_exceptions self)
|
||||
Input_Stream.new stream (File_Error.handle_java_exceptions self) associated_source=self
|
||||
|
||||
if self.is_directory then Error.throw (File_Error.IO_Error self "File '"+self.path+"' is a directory") else
|
||||
open_as_data_link = (open_options.contains Data_Link_Access.No_Follow . not) && (Data_Link.is_data_link self)
|
||||
|
@ -96,7 +96,7 @@ type Temporary_File
|
||||
If the stream is already backed by a temporary or regular file, that file is returned.
|
||||
from_stream_light : Input_Stream -> Temporary_File | File
|
||||
from_stream_light stream =
|
||||
case stream.associated_file of
|
||||
case stream.associated_source of
|
||||
tmp : Temporary_File -> tmp
|
||||
file : File -> file
|
||||
_ -> Temporary_File.from_stream stream
|
||||
|
@ -1,19 +1,28 @@
|
||||
import project.Any.Any
|
||||
import project.Data.Array.Array
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Text.Encoding.Encoding
|
||||
import project.Data.Vector.Vector
|
||||
import project.Errors.Encoding_Error.Encoding_Error
|
||||
import project.Error.Error
|
||||
import project.Errors.File_Error.File_Error
|
||||
import project.Errors.Illegal_State.Illegal_State
|
||||
import project.Errors.Problem_Behavior.Problem_Behavior
|
||||
import project.Nothing.Nothing
|
||||
import project.Runtime.Managed_Resource.Managed_Resource
|
||||
import project.System.Advanced.Restartable_Input_Stream.Restartable_Input_Stream
|
||||
import project.System.File.Advanced.Temporary_File.Temporary_File
|
||||
import project.System.File.File
|
||||
import project.System.File.File_Access.File_Access
|
||||
import project.System.File.Generic.Writable_File.Writable_File
|
||||
import project.System.Internal.Reporting_Stream_Decoder_Helper
|
||||
from project.Data.Boolean import Boolean, False, True
|
||||
|
||||
polyglot java import java.io.BufferedInputStream
|
||||
polyglot java import java.io.ByteArrayInputStream
|
||||
polyglot java import java.io.InputStream as Java_Input_Stream
|
||||
polyglot java import org.enso.base.encoding.ReportingStreamDecoder
|
||||
polyglot java import org.enso.base.encoding.Encoding_Utils
|
||||
polyglot java import org.enso.base.encoding.ReportingStreamDecoder
|
||||
polyglot java import org.enso.base.Stream_Utils
|
||||
|
||||
## PRIVATE
|
||||
An input stream, allowing for interactive reading of contents.
|
||||
@ -23,10 +32,17 @@ type Input_Stream
|
||||
|
||||
Given a Java InputStream, wraps as a Managed_Resource and returns a new
|
||||
Input_Stream.
|
||||
new : Java_Input_Stream -> Any -> (Nothing | File | Temporary_File) -> Input_Stream
|
||||
new java_stream error_handler associated_file=Nothing =
|
||||
new : Java_Input_Stream -> Any -> (Nothing | File | Temporary_File | Vector | Array) -> Input_Stream
|
||||
new java_stream error_handler associated_source=Nothing =
|
||||
resource = Managed_Resource.register java_stream close_stream
|
||||
Input_Stream.Value resource error_handler associated_file
|
||||
Input_Stream.Value resource error_handler associated_source
|
||||
|
||||
## PRIVATE
|
||||
ADVANCED
|
||||
Creates a new input stream from a vector of bytes.
|
||||
from_bytes bytes -> Input_Stream =
|
||||
raw_stream = ByteArrayInputStream.new bytes
|
||||
Input_Stream.new raw_stream (File_Error.handle_java_exceptions Nothing) bytes
|
||||
|
||||
## PRIVATE
|
||||
An input stream, allowing for interactive reading of contents.
|
||||
@ -35,15 +51,16 @@ type Input_Stream
|
||||
- stream_resource: The internal resource that represents the underlying
|
||||
stream.
|
||||
- error_handler: An error handler for IOExceptions thrown when reading.
|
||||
- associated_file: The file associated with this stream, if any.
|
||||
Value stream_resource error_handler (associated_file:Nothing|File|Temporary_File)
|
||||
- associated_source: The source associated with this stream, if any.
|
||||
It can be used to cheaply convert this stream into a `Restartable_Input_Stream`.
|
||||
private Value stream_resource error_handler (associated_source : Nothing | File | Temporary_File | Vector | Array)
|
||||
|
||||
## PRIVATE
|
||||
ADVANCED
|
||||
Reads all the bytes in this stream into a vector of bytes.
|
||||
read_all_bytes : Vector
|
||||
read_all_bytes self = self.stream_resource . with java_stream->
|
||||
self.error_handler <| Vector.from_polyglot_array java_stream.readAllBytes
|
||||
read_all_bytes self = self.with_java_stream java_stream->
|
||||
Vector.from_polyglot_array java_stream.readAllBytes
|
||||
|
||||
## PRIVATE
|
||||
ADVANCED
|
||||
@ -58,10 +75,14 @@ type Input_Stream
|
||||
Arguments:
|
||||
- n: The number of bytes to read from the stream.
|
||||
read_n_bytes : Integer -> Vector
|
||||
read_n_bytes self n = self.stream_resource . with java_stream->
|
||||
self.error_handler <|
|
||||
bytes = java_stream.readNBytes n
|
||||
Vector.from_polyglot_array bytes
|
||||
read_n_bytes self (n : Integer) = self.with_java_stream java_stream->
|
||||
bytes = java_stream.readNBytes n
|
||||
Vector.from_polyglot_array bytes
|
||||
|
||||
## PRIVATE
|
||||
It may throw an error if not enough bytes are available.
|
||||
skip_n_bytes self (n : Integer) = self.with_java_stream java_stream->
|
||||
java_stream.skipNBytes n
|
||||
|
||||
## PRIVATE
|
||||
ADVANCED
|
||||
@ -70,9 +91,8 @@ type Input_Stream
|
||||
The returned value is an integer in the range 0-255 representing the
|
||||
next byte of input, or -1 if end of stream is reached.
|
||||
read_byte : Integer
|
||||
read_byte self = self.stream_resource . with java_stream->
|
||||
self.error_handler <|
|
||||
java_stream.read
|
||||
read_byte self = self.with_java_stream java_stream->
|
||||
java_stream.read
|
||||
|
||||
## PRIVATE
|
||||
ADVANCED
|
||||
@ -93,17 +113,75 @@ type Input_Stream
|
||||
Arguments:
|
||||
- f: Applies a function over the internal java stream.
|
||||
with_java_stream : (Java_Input_Stream -> Any) -> Any
|
||||
with_java_stream self f = self.stream_resource . with f
|
||||
with_java_stream self f = self.stream_resource . with java_stream->
|
||||
self.error_handler <| f java_stream
|
||||
|
||||
## PRIVATE
|
||||
Runs an action with a `ReportingStreamDecoder` decoding data from the
|
||||
input stream with the specified encoding.
|
||||
with_stream_decoder : Encoding -> Problem_Behavior -> (ReportingStreamDecoder -> Any) -> Any
|
||||
with_stream_decoder self encoding on_problems action = self.stream_resource . with java_stream->
|
||||
results = Encoding_Utils.with_stream_decoder java_stream encoding.to_java_charset_or_null action
|
||||
problems = Vector.from_polyglot_array results.problems . map decoding_problem->
|
||||
Encoding_Error.Error decoding_problem.message
|
||||
on_problems.attach_problems_after results.result problems
|
||||
with_stream_decoder self (encoding : Encoding) (on_problems : Problem_Behavior) action =
|
||||
Reporting_Stream_Decoder_Helper.run self encoding on_problems allow_usage_in_background_threads=True action
|
||||
|
||||
## PRIVATE
|
||||
Converts this stream into a restartable stream.
|
||||
|
||||
The conversion tries to be efficient, avoiding reading the stream if it
|
||||
is simply backed by a file or byte array.
|
||||
This stream is invalidated after the conversion.
|
||||
|
||||
Arguments:
|
||||
- extend_lifetime: This option is only applicable if the source stream
|
||||
was backed by a file. If set to `True`, a new temporary file will be
|
||||
created, to untie the lifetime of the returned stream from the backing
|
||||
file. If set to `False`, the returned stream will be valid only as long
|
||||
as the original backing file, but the operation will be more efficient
|
||||
- so it is preferred if the caller can guarantee the lifetime of such
|
||||
backing file.
|
||||
as_restartable_stream self (extend_lifetime : Boolean = True) -> Restartable_Input_Stream =
|
||||
Restartable_Input_Stream.make self extend_lifetime
|
||||
|
||||
## PRIVATE
|
||||
is_peekable self -> Boolean =
|
||||
self.with_java_stream java_stream->
|
||||
java_stream.markSupported
|
||||
|
||||
## PRIVATE
|
||||
Converts this stream into a stream that can be peeked.
|
||||
This is useful when some application needs to look ahead in the stream,
|
||||
but then needs to pass it further with the same state.
|
||||
|
||||
The peeked bytes are cached in memory, so this should only be used for
|
||||
small amounts of data. If more data has to be processed more than once,
|
||||
`as_restartable_stream` is preferred.
|
||||
|
||||
The current stream may be invalidated after the conversion, and it should
|
||||
no longer be used - only the returned stream should be used.
|
||||
as_peekable_stream self -> Input_Stream = if self.is_peekable then self else
|
||||
raw_java_stream = self.stream_resource.take
|
||||
buffered_stream = BufferedInputStream.new raw_java_stream
|
||||
Input_Stream.new buffered_stream self.error_handler self.associated_source
|
||||
|
||||
## PRIVATE
|
||||
Peeks up to the provided number of bytes from the stream.
|
||||
|
||||
Makes a best-effort to read as many bytes as provided, however fewer
|
||||
bytes may be read, if end of stream is encountered.
|
||||
|
||||
The length of the returned vector is the same as the number of bytes
|
||||
read.
|
||||
|
||||
No bytes are consumed from the stream - a next read or peek
|
||||
operation will see the same contents as before this call.
|
||||
|
||||
This operation is only allowed if `is_peekable` returns `True`.
|
||||
|
||||
Arguments:
|
||||
- n: The number of bytes to read from the stream.
|
||||
peek_bytes self (n : Integer) -> Vector Integer =
|
||||
if self.is_peekable.not then Error.throw (Illegal_State.Error "`peek_bytes` called on a stream where `is_peekable=False`. Please convert the stream using `as_peekable_stream`.") else
|
||||
self.with_java_stream java_stream->
|
||||
Vector.from_polyglot_array <| Stream_Utils.peek java_stream n
|
||||
|
||||
## PRIVATE
|
||||
Reads the contents of this stream into a given file.
|
||||
|
@ -0,0 +1,130 @@
|
||||
private
|
||||
|
||||
import project.Any.Any
|
||||
import project.Data.Text.Encoding.Encoding
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Errors.Encoding_Error.Encoding_Error
|
||||
import project.Errors.Problem_Behavior.Problem_Behavior
|
||||
import project.Nothing.Nothing
|
||||
import project.Runtime.Managed_Resource.Managed_Resource
|
||||
import project.System.Input_Stream.Input_Stream
|
||||
from project.Data.Boolean import Boolean, False, True
|
||||
from project.Runtime import assert
|
||||
|
||||
polyglot java import org.enso.base.encoding.DecodingProblemAggregator
|
||||
polyglot java import org.enso.base.encoding.Encoding_Utils
|
||||
polyglot java import org.enso.base.encoding.ReportingStreamDecoder
|
||||
|
||||
## PRIVATE
|
||||
Builds the `ReportingStreamDecoder`, consuming the `Input_Stream`.
|
||||
It will do any necessary encoding detection, as determined by the `Encoding`
|
||||
run (input_stream : Input_Stream) (encoding : Encoding) (on_problems : Problem_Behavior) (allow_usage_in_background_threads : Boolean) (continuation : ReportingStreamDecoder -> Any) =
|
||||
# We always ensure the stream is peekable, as that also implies buffering which is supposedly more efficient e.g. when working with files.
|
||||
buffered_input_stream = input_stream.as_peekable_stream
|
||||
problem_aggregator = DecodingProblemAggregator.new
|
||||
result = resolve_encoding encoding buffered_input_stream problem_aggregator effective_encoding-> amended_input_stream->
|
||||
amended_input_stream.with_java_stream java_stream->
|
||||
# We can only poll safepoints if the decoder is guaranteed to be used in the main thread only.
|
||||
poll_safepoints = allow_usage_in_background_threads.not
|
||||
decoder = ReportingStreamDecoder.new java_stream effective_encoding.to_java_charset problem_aggregator poll_safepoints
|
||||
continuation decoder
|
||||
problems = Vector.from_polyglot_array problem_aggregator.summarize . map decoding_problem->
|
||||
Encoding_Error.Error decoding_problem.message
|
||||
on_problems.attach_problems_after result problems
|
||||
|
||||
## PRIVATE
|
||||
decode_bytes_to_text bytes (encoding : Encoding) (on_problems : Problem_Behavior) -> Text =
|
||||
Managed_Resource.bracket (Input_Stream.from_bytes bytes) (.close) input_stream->
|
||||
run input_stream encoding on_problems allow_usage_in_background_threads=False decoder->
|
||||
decoder.readAllIntoMemory
|
||||
|
||||
## PRIVATE
|
||||
resolve_encoding (encoding : Encoding) (buffered_input_stream : Input_Stream) (problem_aggregator : DecodingProblemAggregator) (continuation : Encoding -> Input_Stream -> Any) -> Any =
|
||||
case encoding of
|
||||
Encoding.Default ->
|
||||
detect_default_encoding buffered_input_stream problem_aggregator continuation
|
||||
Encoding.Value "UTF-8" ->
|
||||
verify_unicode_encoding encoding Unicode_BOM.UTF_8 buffered_input_stream problem_aggregator continuation
|
||||
Encoding.Value "UTF-16LE" ->
|
||||
verify_unicode_encoding encoding Unicode_BOM.UTF_16_LE buffered_input_stream problem_aggregator continuation
|
||||
Encoding.Value "UTF-16BE" ->
|
||||
verify_unicode_encoding encoding Unicode_BOM.UTF_16_BE buffered_input_stream problem_aggregator continuation
|
||||
|
||||
# Any other encoding just continues without any additional processing.
|
||||
_ -> continuation encoding buffered_input_stream
|
||||
|
||||
## PRIVATE
|
||||
detect_default_encoding (buffered_input_stream : Input_Stream) (problem_aggregator : DecodingProblemAggregator) (continuation : Encoding -> Input_Stream -> Any) -> Any =
|
||||
case detect_bom buffered_input_stream of
|
||||
# We rely on the detected BOM for the encoding, even if there are errors down the line.
|
||||
detected_bom : Unicode_BOM ->
|
||||
encoding_name = detected_bom.corresponding_encoding.to_display_text
|
||||
context_message = "An " + encoding_name + " BOM was detected, so " + encoding_name + " encoding has been assumed, but some characters seem invalid: "
|
||||
problem_aggregator.setInvalidCharacterErrorPrefix context_message
|
||||
# Skip the BOM to avoid including it in the decoded data.
|
||||
buffered_input_stream.skip_n_bytes detected_bom.as_bytes.length . if_not_error <|
|
||||
continuation detected_bom.corresponding_encoding buffered_input_stream
|
||||
|
||||
# If no BOM was detected, we do a pass to try UTF-8 encoding and if it fails, we restart and fallback to Windows-1252.
|
||||
Nothing ->
|
||||
# We don't need to extend the lifetime of the stream, as it will not be kept longer than the lifetime of the `buffered_input_stream`.
|
||||
restartable_stream = buffered_input_stream.as_restartable_stream extend_lifetime=False
|
||||
is_valid_utf_8 = restartable_stream.with_fresh_stream checking_stream->
|
||||
checking_stream.with_java_stream java_checking_stream->
|
||||
Encoding_Utils.canDecodeWithoutErrors java_checking_stream Encoding.utf_8.to_java_charset
|
||||
effective_encoding = if is_valid_utf_8 then Encoding.utf_8 else Encoding.windows_1252
|
||||
restartable_stream.with_fresh_stream input_stream->
|
||||
continuation effective_encoding input_stream
|
||||
|
||||
## PRIVATE
|
||||
verify_unicode_encoding (encoding : Encoding) (expected_bom : Unicode_BOM) (buffered_input_stream : Input_Stream) (problem_aggregator : DecodingProblemAggregator) (continuation : Encoding -> Input_Stream -> Any) -> Any =
|
||||
case detect_bom buffered_input_stream of
|
||||
Nothing ->
|
||||
# No BOM detected, so we just proceed.
|
||||
continuation encoding buffered_input_stream
|
||||
detected_bom : Unicode_BOM ->
|
||||
case detected_bom == expected_bom of
|
||||
True ->
|
||||
# We found the expected BOM. We skip it to avoid including it in decoded data.
|
||||
buffered_input_stream.skip_n_bytes expected_bom.as_bytes.length . if_not_error <|
|
||||
continuation encoding buffered_input_stream
|
||||
False ->
|
||||
# Report BOM mismatch
|
||||
message = detected_bom.corresponding_encoding.to_display_text + " BOM has been found when decoding as " + encoding.to_display_text + "."
|
||||
problem_aggregator.reportOtherProblem message
|
||||
continuation encoding buffered_input_stream
|
||||
|
||||
## PRIVATE
|
||||
type Unicode_BOM
|
||||
## PRIVATE
|
||||
UTF_8
|
||||
|
||||
## PRIVATE
|
||||
UTF_16_LE
|
||||
|
||||
## PRIVATE
|
||||
UTF_16_BE
|
||||
|
||||
## PRIVATE
|
||||
as_bytes self -> Vector = case self of
|
||||
Unicode_BOM.UTF_8 -> [-17, -69, -65]
|
||||
Unicode_BOM.UTF_16_LE -> [-1, -2]
|
||||
Unicode_BOM.UTF_16_BE -> [-2, -1]
|
||||
|
||||
corresponding_encoding self -> Encoding = case self of
|
||||
Unicode_BOM.UTF_8 -> Encoding.utf_8
|
||||
Unicode_BOM.UTF_16_LE -> Encoding.utf_16_le
|
||||
Unicode_BOM.UTF_16_BE -> Encoding.utf_16_be
|
||||
|
||||
## PRIVATE
|
||||
all = [Unicode_BOM.UTF_8, Unicode_BOM.UTF_16_LE, Unicode_BOM.UTF_16_BE]
|
||||
|
||||
## PRIVATE
|
||||
detect_bom (input_stream : Input_Stream) -> Unicode_BOM | Nothing =
|
||||
assert input_stream.is_peekable
|
||||
beginning = input_stream.peek_bytes 3
|
||||
matching_bom = Unicode_BOM.all.find if_missing=Nothing bom->
|
||||
expected_bytes = bom.as_bytes
|
||||
expected_bytes == (beginning.take expected_bytes.length)
|
||||
matching_bom
|
28
std-bits/base/src/main/java/org/enso/base/Stream_Utils.java
Normal file
28
std-bits/base/src/main/java/org/enso/base/Stream_Utils.java
Normal file
@ -0,0 +1,28 @@
|
||||
package org.enso.base;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class Stream_Utils {
|
||||
public static byte[] peek(InputStream stream, int n) throws IOException {
|
||||
assert n >= 0;
|
||||
assert stream.markSupported();
|
||||
|
||||
byte[] buffer = new byte[n];
|
||||
stream.mark(n + 1);
|
||||
int offset = 0;
|
||||
while (offset < n) {
|
||||
int read = stream.read(buffer, offset, n - offset);
|
||||
if (read == -1) {
|
||||
break;
|
||||
}
|
||||
offset += read;
|
||||
}
|
||||
stream.reset();
|
||||
if (offset < n) {
|
||||
buffer = Arrays.copyOf(buffer, offset);
|
||||
}
|
||||
return buffer;
|
||||
}
|
||||
}
|
@ -31,6 +31,10 @@ public class DecodingProblemAggregator {
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasEncounteredInvalidCharacters() {
|
||||
return invalidUnitCount > 0;
|
||||
}
|
||||
|
||||
private DecodingProblem summarizeInvalidCharacterProblems() {
|
||||
if (invalidUnitCount == 0) {
|
||||
return null;
|
||||
|
@ -1,213 +0,0 @@
|
||||
package org.enso.base.encoding;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.EOFException;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
public abstract sealed class EncodingRepresentation {
|
||||
private static final byte[] UTF_8_BOM = new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
|
||||
private static final byte[] UTF_16_BE_BOM = new byte[] {(byte) 0xFE, (byte) 0xFF};
|
||||
private static final byte[] UTF_16_LE_BOM = new byte[] {(byte) 0xFF, (byte) 0xFE};
|
||||
|
||||
private static byte[] peekStream(BufferedInputStream stream, int n) throws IOException {
|
||||
byte[] buffer = new byte[n];
|
||||
stream.mark(n + 1);
|
||||
int offset = 0;
|
||||
while (offset < n) {
|
||||
int read = stream.read(buffer, offset, n - offset);
|
||||
if (read == -1) {
|
||||
break;
|
||||
}
|
||||
offset += read;
|
||||
}
|
||||
stream.reset();
|
||||
return buffer;
|
||||
}
|
||||
|
||||
private static void skipStream(BufferedInputStream stream, int n) throws IOException {
|
||||
try {
|
||||
stream.skipNBytes(n);
|
||||
} catch (EOFException eofException) {
|
||||
// ignore early EOF
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean startsWith(byte[] buffer, byte[] prefix) {
|
||||
if (buffer.length < prefix.length) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < prefix.length; i++) {
|
||||
if (buffer[i] != prefix[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public static EncodingRepresentation fromCharset(Charset charset) {
|
||||
if (charset == null) {
|
||||
return new Default();
|
||||
} else if (charset.equals(StandardCharsets.UTF_8)) {
|
||||
return UTF8.INSTANCE;
|
||||
} else if (charset.equals(StandardCharsets.UTF_16LE)) {
|
||||
return UTF16.LittleEndian;
|
||||
} else if (charset.equals(StandardCharsets.UTF_16BE)) {
|
||||
return UTF16.BigEndian;
|
||||
} else {
|
||||
return new Other(charset);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Detects the effective charset based on initial data present in the stream.
|
||||
*
|
||||
* @param stream the stream to detect the charset for. Initial BOM header may be deliberately
|
||||
* consumed by this method, so that it is ignored for further processing. No other data is
|
||||
* consumed (we only peek).
|
||||
* @param problemAggregator an aggregator to report any warnings about detected encoding, or
|
||||
* report context metadata to be used in future errors reported by other components.
|
||||
*/
|
||||
public abstract Charset detectCharset(
|
||||
BufferedInputStream stream, DecodingProblemAggregator problemAggregator) throws IOException;
|
||||
|
||||
private static final class Default extends EncodingRepresentation {
|
||||
// Note: the Windows-1252 fallback is not implemented as part of this detection - it only allows
|
||||
// to distinguish UTF BOMs.
|
||||
@Override
|
||||
public Charset detectCharset(
|
||||
BufferedInputStream stream, DecodingProblemAggregator problemAggregator)
|
||||
throws IOException {
|
||||
byte[] beginning = peekStream(stream, 3);
|
||||
if (startsWith(beginning, UTF_8_BOM)) {
|
||||
skipStream(stream, UTF_8_BOM.length);
|
||||
notifyContextAboutAssumedEncoding(problemAggregator, "UTF-8");
|
||||
return StandardCharsets.UTF_8;
|
||||
} else if (startsWith(beginning, UTF_16_BE_BOM)) {
|
||||
skipStream(stream, UTF_16_BE_BOM.length);
|
||||
notifyContextAboutAssumedEncoding(problemAggregator, "UTF-16 BE");
|
||||
return StandardCharsets.UTF_16BE;
|
||||
} else if (startsWith(beginning, UTF_16_LE_BOM)) {
|
||||
skipStream(stream, UTF_16_LE_BOM.length);
|
||||
notifyContextAboutAssumedEncoding(problemAggregator, "UTF-16 LE");
|
||||
return StandardCharsets.UTF_16LE;
|
||||
} else {
|
||||
// If no BOM we fallback to UTF-8.
|
||||
return StandardCharsets.UTF_8;
|
||||
}
|
||||
}
|
||||
|
||||
private static void notifyContextAboutAssumedEncoding(
|
||||
DecodingProblemAggregator problemAggregator, String encodingName) {
|
||||
String prefix =
|
||||
"An "
|
||||
+ encodingName
|
||||
+ " BOM was detected, so "
|
||||
+ encodingName
|
||||
+ " encoding has been assumed, but some characters seem invalid: ";
|
||||
problemAggregator.setInvalidCharacterErrorPrefix(prefix);
|
||||
}
|
||||
}
|
||||
|
||||
private static final class Other extends EncodingRepresentation {
|
||||
private final Charset charset;
|
||||
|
||||
public Other(Charset charset) {
|
||||
this.charset = charset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Charset detectCharset(
|
||||
BufferedInputStream stream, DecodingProblemAggregator problemAggregator)
|
||||
throws IOException {
|
||||
// We ignore the stream as we just use the provided encoding as-is.
|
||||
return charset;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class UTF8 extends EncodingRepresentation {
|
||||
static final UTF8 INSTANCE = new UTF8();
|
||||
|
||||
@Override
|
||||
public Charset detectCharset(
|
||||
BufferedInputStream stream, DecodingProblemAggregator problemAggregator)
|
||||
throws IOException {
|
||||
byte[] beginning = peekStream(stream, UTF_8_BOM.length);
|
||||
if (startsWith(beginning, UTF_8_BOM)) {
|
||||
skipStream(stream, UTF_8_BOM.length);
|
||||
} else {
|
||||
if (startsWith(beginning, UTF_16_BE_BOM) || startsWith(beginning, UTF_16_LE_BOM)) {
|
||||
problemAggregator.reportOtherProblem("UTF-16 BOM has been found when decoding as UTF-8.");
|
||||
}
|
||||
}
|
||||
return StandardCharsets.UTF_8;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class UTF16 extends EncodingRepresentation {
|
||||
static final UTF16 BigEndian = new UTF16(StandardCharsets.UTF_16BE, Endianness.BigEndian);
|
||||
static final UTF16 LittleEndian = new UTF16(StandardCharsets.UTF_16LE, Endianness.LittleEndian);
|
||||
|
||||
private final Charset charset;
|
||||
private final byte[] expectedBOM;
|
||||
private final byte[] flippedBOM;
|
||||
private Endianness endianness;
|
||||
|
||||
private UTF16(Charset charset, Endianness endianness) {
|
||||
this.charset = charset;
|
||||
this.expectedBOM =
|
||||
switch (endianness) {
|
||||
case BigEndian -> UTF_16_BE_BOM;
|
||||
case LittleEndian -> UTF_16_LE_BOM;
|
||||
};
|
||||
this.flippedBOM =
|
||||
switch (endianness) {
|
||||
case BigEndian -> UTF_16_LE_BOM;
|
||||
case LittleEndian -> UTF_16_BE_BOM;
|
||||
};
|
||||
this.endianness = endianness;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Charset detectCharset(
|
||||
BufferedInputStream stream, DecodingProblemAggregator problemAggregator)
|
||||
throws IOException {
|
||||
assert expectedBOM.length == flippedBOM.length;
|
||||
assert expectedBOM.length == 2;
|
||||
byte[] beginning = peekStream(stream, 2);
|
||||
if (startsWith(beginning, expectedBOM)) {
|
||||
skipStream(stream, expectedBOM.length);
|
||||
} else if (startsWith(beginning, flippedBOM)) {
|
||||
problemAggregator.reportOtherProblem(
|
||||
"Decoding as UTF-16 "
|
||||
+ endianness
|
||||
+ ", but a "
|
||||
+ endianness.flip()
|
||||
+ " BOM has been found.");
|
||||
}
|
||||
return charset;
|
||||
}
|
||||
|
||||
enum Endianness {
|
||||
BigEndian,
|
||||
LittleEndian;
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return switch (this) {
|
||||
case BigEndian -> "Big Endian";
|
||||
case LittleEndian -> "Little Endian";
|
||||
};
|
||||
}
|
||||
|
||||
public Endianness flip() {
|
||||
return switch (this) {
|
||||
case BigEndian -> LittleEndian;
|
||||
case LittleEndian -> BigEndian;
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,7 +1,5 @@
|
||||
package org.enso.base.encoding;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
@ -13,7 +11,6 @@ import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.IntFunction;
|
||||
@ -97,89 +94,25 @@ public class Encoding_Utils {
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts an array of encoded bytes into a string.
|
||||
* Checks if the following stream can be decoded without errors using the given charset.
|
||||
*
|
||||
* @param bytes the bytes to convert
|
||||
* @param charset the character set to use for decoding, use {@code null} to try auto-detection
|
||||
* @return the resulting string, and any potential problems
|
||||
* <p>The stream is (partially or wholly) consumed as part of this process.
|
||||
*/
|
||||
public static WithProblems<String, DecodingProblem> from_bytes(byte[] bytes, Charset charset) {
|
||||
if (bytes == null || bytes.length == 0) {
|
||||
return new WithProblems<>("", List.of());
|
||||
}
|
||||
|
||||
public static boolean canDecodeWithoutErrors(InputStream stream, Charset charset)
|
||||
throws IOException {
|
||||
DecodingProblemAggregator problemAggregator = new DecodingProblemAggregator();
|
||||
ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes);
|
||||
ReportingStreamDecoder decoder;
|
||||
try {
|
||||
decoder = create_stream_decoder(inputStream, charset, problemAggregator, true);
|
||||
} catch (IOException e) {
|
||||
throw new IllegalStateException(
|
||||
"Unexpected IO exception in internal code: " + e.getMessage(), e);
|
||||
}
|
||||
|
||||
CharBuffer out = CharBuffer.allocate((int) (bytes.length * decoder.averageCharsPerByte()));
|
||||
try {
|
||||
int n;
|
||||
do {
|
||||
if (!out.hasRemaining()) {
|
||||
out = resize(out, CharBuffer::allocate, CharBuffer::put);
|
||||
try (var decoder = new ReportingStreamDecoder(stream, charset, problemAggregator, true)) {
|
||||
char[] tmpBuffer = new char[1024];
|
||||
while (decoder.read(tmpBuffer) >= 0) {
|
||||
if (problemAggregator.hasEncounteredInvalidCharacters()) {
|
||||
// early exit - no need to process the stream any further
|
||||
return false;
|
||||
}
|
||||
// read is already polling safepoints so we don't have to
|
||||
n = decoder.read(out);
|
||||
} while (n >= 0);
|
||||
} catch (IOException e) {
|
||||
throw new IllegalStateException("Unexpected exception: " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
out.flip();
|
||||
return new WithProblems<>(out.toString(), problemAggregator.summarize());
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new instance of {@code ReportingStreamDecoder} decoding a given charset.
|
||||
*
|
||||
* @param stream the input stream to decode
|
||||
* @param charset the character set to use for decoding, use {@code null} to try auto-detection
|
||||
* @param pollSafepoints whether to poll for safepoints during decoding. This should be true if
|
||||
* the decoding will run on the main thread, and false otherwise.
|
||||
*/
|
||||
private static ReportingStreamDecoder create_stream_decoder(
|
||||
InputStream stream,
|
||||
Charset charset,
|
||||
DecodingProblemAggregator problemAggregator,
|
||||
boolean pollSafepoints)
|
||||
throws IOException {
|
||||
BufferedInputStream bufferedStream = new BufferedInputStream(stream);
|
||||
EncodingRepresentation representation = EncodingRepresentation.fromCharset(charset);
|
||||
// This may also advance the stream past the BOM
|
||||
Charset detectedCharset = representation.detectCharset(bufferedStream, problemAggregator);
|
||||
return new ReportingStreamDecoder(
|
||||
bufferedStream, detectedCharset, problemAggregator, pollSafepoints);
|
||||
}
|
||||
|
||||
/**
|
||||
* A helper function which runs an action with a created stream decoder and closes it afterwards.
|
||||
*
|
||||
* <p>It returns the result returned from the executed action and any encoding problems that
|
||||
* occurred when processing it.
|
||||
*
|
||||
* @param stream the input stream to decode
|
||||
* @param charset the character set to use for decoding, use {@code null} to try auto-detection
|
||||
* @param action the action to run with the created decoder
|
||||
* @return the result of the action and any problems that occurred during decoding
|
||||
*/
|
||||
public static WithProblems<Value, DecodingProblem> with_stream_decoder(
|
||||
InputStream stream, Charset charset, Function<ReportingStreamDecoder, Value> action)
|
||||
throws IOException {
|
||||
DecodingProblemAggregator problemAggregator = new DecodingProblemAggregator();
|
||||
Value result;
|
||||
ReportingStreamDecoder decoder =
|
||||
create_stream_decoder(stream, charset, problemAggregator, false);
|
||||
try (decoder) {
|
||||
result = action.apply(decoder);
|
||||
}
|
||||
return new WithProblems<>(result, problemAggregator.summarize());
|
||||
// Check one more time after EOF
|
||||
return !problemAggregator.hasEncounteredInvalidCharacters();
|
||||
}
|
||||
|
||||
/** Creates a new instance of {@code ReportingStreamEncoder} encoding a given charset. */
|
||||
|
@ -2,8 +2,8 @@ package org.enso.base.encoding;
|
||||
|
||||
import static org.enso.base.encoding.Encoding_Utils.INVALID_CHARACTER;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Reader;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
@ -30,7 +30,7 @@ public class ReportingStreamDecoder extends Reader {
|
||||
return decoder.averageCharsPerByte();
|
||||
}
|
||||
|
||||
private final BufferedInputStream bufferedInputStream;
|
||||
private final InputStream inputStream;
|
||||
private final CharsetDecoder decoder;
|
||||
private final Charset charset;
|
||||
|
||||
@ -39,11 +39,11 @@ public class ReportingStreamDecoder extends Reader {
|
||||
}
|
||||
|
||||
public ReportingStreamDecoder(
|
||||
BufferedInputStream stream,
|
||||
InputStream stream,
|
||||
Charset charset,
|
||||
DecodingProblemAggregator problemAggregator,
|
||||
boolean pollSafepoints) {
|
||||
bufferedInputStream = stream;
|
||||
inputStream = stream;
|
||||
this.charset = charset;
|
||||
this.decoder =
|
||||
charset
|
||||
@ -55,6 +55,23 @@ public class ReportingStreamDecoder extends Reader {
|
||||
this.pollSafepoints = pollSafepoints;
|
||||
}
|
||||
|
||||
/** Decodes the entire input stream into a String. */
|
||||
public String readAllIntoMemory() throws IOException {
|
||||
int initialCapacity = Math.max((int) (inputStream.available() * averageCharsPerByte()), 16);
|
||||
CharBuffer out = CharBuffer.allocate(initialCapacity);
|
||||
int n;
|
||||
do {
|
||||
if (!out.hasRemaining()) {
|
||||
out = Encoding_Utils.resize(out, CharBuffer::allocate, CharBuffer::put);
|
||||
}
|
||||
// read is already polling safepoints so we don't have to
|
||||
n = this.read(out);
|
||||
} while (n >= 0);
|
||||
|
||||
out.flip();
|
||||
return out.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Currently there is no easy way to check if a Context is available in the current thread and we
|
||||
* can use safepoints or not. The issue tracking this feature can be found at: <a
|
||||
@ -245,7 +262,7 @@ public class ReportingStreamDecoder extends Reader {
|
||||
int bytesToRead = Math.max(expectedInputSize - bufferedInput, 1);
|
||||
|
||||
ensureWorkArraySize(bytesToRead);
|
||||
int bytesActuallyRead = bufferedInputStream.read(workArray, 0, bytesToRead);
|
||||
int bytesActuallyRead = inputStream.read(workArray, 0, bytesToRead);
|
||||
if (bytesActuallyRead == -1) {
|
||||
eof = true;
|
||||
}
|
||||
@ -383,6 +400,9 @@ public class ReportingStreamDecoder extends Reader {
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
bufferedInputStream.close();
|
||||
inputStream.close();
|
||||
inputBuffer = null;
|
||||
outputBuffer = null;
|
||||
workArray = null;
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,25 @@
|
||||
package org.enso.base_test_helpers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
/** A generic stream used for tests. */
|
||||
public class RangeStream extends InputStream {
|
||||
private final int end;
|
||||
private int current;
|
||||
|
||||
public RangeStream(int start, int end) {
|
||||
assert 0 <= start && start <= end;
|
||||
this.end = end;
|
||||
this.current = start;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
if (current >= end) {
|
||||
return -1;
|
||||
} else {
|
||||
return current++ % 256;
|
||||
}
|
||||
}
|
||||
}
|
@ -69,7 +69,6 @@ add_specs suite_builder =
|
||||
Problems.get_attached_warnings default_warning . should_contain_the_same_elements_as problems
|
||||
|
||||
suite_builder.group "Default Encoding" group_builder->
|
||||
pending_fallback = "Fallback to Windows-1252 is not implemented yet."
|
||||
group_builder.specify "should try reading as UTF-8 by default" <|
|
||||
bytes = [65, -60, -123, -60, -103]
|
||||
# A ą ę
|
||||
@ -113,7 +112,7 @@ add_specs suite_builder =
|
||||
bytes.length . should_equal 4*2
|
||||
txt.length . should_equal 3
|
||||
|
||||
group_builder.specify "falls back to Windows-1252 if invalid Unicode is detected and no BOM" pending=pending_fallback <|
|
||||
group_builder.specify "falls back to Windows-1252 if invalid Unicode is detected and no BOM" <|
|
||||
# These are not valid bytes for UTF-8
|
||||
bytes = [-30, -55, -1]
|
||||
Text.from_bytes bytes Encoding.utf_8 . should_fail_with Encoding_Error
|
||||
@ -148,7 +147,6 @@ add_specs suite_builder =
|
||||
empty.should_equal ""
|
||||
Problems.assume_no_problems empty
|
||||
|
||||
group_builder.specify "should work on 0 or 1 byte input (TODO merge with above)" pending=pending_fallback <|
|
||||
txt = Text.from_bytes [-1] Encoding.default Problem_Behavior.Report_Warning
|
||||
txt.should_equal 'ÿ'
|
||||
# No problems, as falling back to Windows-1252.
|
||||
@ -166,8 +164,6 @@ add_specs suite_builder =
|
||||
([-1, -2] + [65, 0, 5, 1, 25, 1]).write_bytes f Existing_File_Behavior.Overwrite . should_succeed
|
||||
f.read . should_equal "Aąę"
|
||||
|
||||
group_builder.specify "Default Encoding heuristics also work in File.read (TODO merge with above)" pending=pending_fallback <|
|
||||
f = File.create_temporary_file "utf-heuristics" ".txt"
|
||||
# Fallback to Windows-1252
|
||||
([-30, -55, -1]).write_bytes f Existing_File_Behavior.Overwrite . should_succeed
|
||||
f.read . should_equal "âÉÿ"
|
||||
|
63
test/Base_Tests/src/System/Input_Stream_Spec.enso
Normal file
63
test/Base_Tests/src/System/Input_Stream_Spec.enso
Normal file
@ -0,0 +1,63 @@
|
||||
from Standard.Base import all
|
||||
import Standard.Base.Errors.Illegal_State.Illegal_State
|
||||
import Standard.Base.Runtime.Managed_Resource.Managed_Resource
|
||||
import Standard.Base.System.Input_Stream.Input_Stream
|
||||
|
||||
from Standard.Test import all
|
||||
|
||||
polyglot java import org.enso.base_test_helpers.RangeStream
|
||||
|
||||
main filter=Nothing =
|
||||
suite = Test.build suite_builder->
|
||||
add_specs suite_builder
|
||||
suite.run_with_filter filter
|
||||
|
||||
add_specs suite_builder = suite_builder.group "Input Stream" group_builder->
|
||||
group_builder.specify "should be peekable if backed by memory" <|
|
||||
Managed_Resource.bracket (Input_Stream.from_bytes [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) (.close) stream->
|
||||
stream.is_peekable . should_be_true
|
||||
|
||||
stream.peek_bytes 3 . should_equal [1, 2, 3]
|
||||
stream.peek_bytes 3 . should_equal [1, 2, 3]
|
||||
# After the peek operation, read still starts from beginning
|
||||
stream.read_n_bytes 3 . should_equal [1, 2, 3]
|
||||
|
||||
# Further peek after a read, starts from where the next read would start
|
||||
stream.peek_bytes 3 . should_equal [4, 5, 6]
|
||||
stream.read_n_bytes 3 . should_equal [4, 5, 6]
|
||||
stream.read_n_bytes 5 . should_equal [7, 8, 9, 10]
|
||||
|
||||
group_builder.specify "should allow to skip bytes" <|
|
||||
Managed_Resource.bracket (Input_Stream.from_bytes [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) (.close) stream->
|
||||
stream.skip_n_bytes 3
|
||||
stream.read_n_bytes 1 . should_equal [4]
|
||||
stream.peek_bytes 3 . should_equal [5, 6, 7]
|
||||
stream.skip_n_bytes 3
|
||||
stream.read_n_bytes 4 . should_equal [8, 9, 10]
|
||||
|
||||
group_builder.specify "should not be peekable if generic stream is provided" <|
|
||||
error_handler x = x
|
||||
generic_stream = Input_Stream.new (RangeStream.new 100 120) error_handler
|
||||
generic_stream.is_peekable . should_be_false
|
||||
generic_stream.peek_bytes 3 . should_fail_with Illegal_State
|
||||
|
||||
group_builder.specify "should be possible to make peekable" <|
|
||||
error_handler x = x
|
||||
generic_stream = Input_Stream.new (RangeStream.new 100 120) error_handler
|
||||
generic_stream.read_n_bytes 4 . should_equal [100, 101, 102, 103]
|
||||
|
||||
promoted_stream = generic_stream.as_peekable_stream
|
||||
# The new stream starts at the same position as the old one was left
|
||||
promoted_stream.peek_bytes 4 . should_equal [104, 105, 106, 107]
|
||||
promoted_stream.read_n_bytes 4 . should_equal [104, 105, 106, 107]
|
||||
promoted_stream.peek_bytes 2 . should_equal [108, 109]
|
||||
promoted_stream.peek_bytes 2 . should_equal [108, 109]
|
||||
|
||||
group_builder.specify "should allow to peek beyond EOF, still correctly restarting afterwards" <|
|
||||
error_handler x = x
|
||||
generic_stream = Input_Stream.new (RangeStream.new 100 105) error_handler
|
||||
promoted_stream = generic_stream.as_peekable_stream
|
||||
promoted_stream.peek_bytes 10 . should_equal [100, 101, 102, 103, 104]
|
||||
promoted_stream.peek_bytes 10 . should_equal [100, 101, 102, 103, 104]
|
||||
# The read still succeeds - ensuring there isn't some early EOF
|
||||
promoted_stream.read_n_bytes 10 . should_equal [100, 101, 102, 103, 104]
|
@ -0,0 +1,87 @@
|
||||
from Standard.Base import all
|
||||
import Standard.Base.Errors.Illegal_State.Illegal_State
|
||||
import Standard.Base.Runtime.Managed_Resource.Managed_Resource
|
||||
import Standard.Base.System.File.Advanced.Temporary_File.Temporary_File
|
||||
import Standard.Base.System.Input_Stream.Input_Stream
|
||||
|
||||
from Standard.Test import all
|
||||
|
||||
polyglot java import org.enso.base_test_helpers.RangeStream
|
||||
|
||||
main filter=Nothing =
|
||||
suite = Test.build suite_builder->
|
||||
add_specs suite_builder
|
||||
suite.run_with_filter filter
|
||||
|
||||
add_specs suite_builder = suite_builder.group "Restartable Input Stream" group_builder->
|
||||
group_builder.specify "should allow to restart a generic stream by caching its data" <|
|
||||
error_handler x = x
|
||||
generic_stream = Input_Stream.new (RangeStream.new 100 120) error_handler
|
||||
generic_stream.read_n_bytes 3 . should_equal [100, 101, 102]
|
||||
# A generic stream is frozen in its current state - the first few bytes that were already read will be lost
|
||||
restartable_stream = generic_stream.as_restartable_stream
|
||||
restartable_stream.with_fresh_stream stream->
|
||||
stream.read_n_bytes 4 . should_equal [103, 104, 105, 106]
|
||||
restartable_stream.with_fresh_stream stream->
|
||||
stream.read_n_bytes 4 . should_equal [103, 104, 105, 106]
|
||||
|
||||
# Small stream is backed by memory:
|
||||
restartable_stream.to_text . should_contain "From_Bytes"
|
||||
|
||||
group_builder.specify "will fall back to caching in a temporary file for a very large stream" <|
|
||||
error_handler x = x
|
||||
# 1MB
|
||||
generic_stream = Input_Stream.new (RangeStream.new 1 1000000) error_handler
|
||||
restartable_stream = generic_stream.as_restartable_stream
|
||||
restartable_stream.with_fresh_stream stream->
|
||||
stream.read_n_bytes 4 . should_equal [1, 2, 3, 4]
|
||||
restartable_stream.to_text . should_contain "From_Temporary_File"
|
||||
|
||||
group_builder.specify "will re-use the original byte array if it was known" <|
|
||||
byte_stream = Input_Stream.from_bytes [1, 2, 3, 4, 5]
|
||||
byte_stream.read_n_bytes 3 . should_equal [1, 2, 3]
|
||||
restartable_stream = byte_stream.as_restartable_stream
|
||||
restartable_stream.with_fresh_stream stream->
|
||||
# All bytes are preserved
|
||||
stream.read_n_bytes 4 . should_equal [1, 2, 3, 4]
|
||||
|
||||
restartable_stream.to_text . should_contain "From_Bytes"
|
||||
restartable_stream.with_fresh_stream stream->
|
||||
stream.read_n_bytes 4 . should_equal [1, 2, 3, 4]
|
||||
|
||||
group_builder.specify "will be backed by a file, if a file stream is converted and extend_lifetime=False" <|
|
||||
file = File.create_temporary_file "test" ".bin"
|
||||
[10, 20, 30, 40, 50].write_bytes file . should_succeed
|
||||
|
||||
restartable_stream = file.with_input_stream [File_Access.Read] file_stream->
|
||||
file_stream.as_restartable_stream extend_lifetime=False
|
||||
|
||||
restartable_stream.with_fresh_stream stream->
|
||||
stream.read_n_bytes 3 . should_equal [10, 20, 30]
|
||||
|
||||
restartable_stream.to_text . should_contain "From_Existing_File"
|
||||
|
||||
restartable_stream.with_fresh_stream stream->
|
||||
stream.read_n_bytes 3 . should_equal [10, 20, 30]
|
||||
|
||||
# However, if backed by existing file - the stream is prone to file modifications that happen in the meantime
|
||||
[11, 12, 13, 14].write_bytes file on_existing_file=Existing_File_Behavior.Overwrite . should_succeed
|
||||
restartable_stream.with_fresh_stream stream->
|
||||
stream.read_n_bytes 3 . should_equal [11, 12, 13]
|
||||
|
||||
group_builder.specify "will not be tied to the original backing file if extend_lifetime=True (default)" <|
|
||||
file = File.create_temporary_file "test" ".bin"
|
||||
[10, 20, 30, 40, 50].write_bytes file . should_succeed
|
||||
|
||||
restartable_stream = file.with_input_stream [File_Access.Read] file_stream->
|
||||
file_stream.as_restartable_stream
|
||||
|
||||
restartable_stream.with_fresh_stream stream->
|
||||
stream.read_n_bytes 3 . should_equal [10, 20, 30]
|
||||
|
||||
# Modify backing file
|
||||
[11, 12, 13, 14].write_bytes file on_existing_file=Existing_File_Behavior.Overwrite . should_succeed
|
||||
|
||||
# The stream still yields old values because it was untied:
|
||||
restartable_stream.with_fresh_stream stream->
|
||||
stream.read_n_bytes 3 . should_equal [10, 20, 30]
|
@ -92,37 +92,47 @@ add_specs suite_builder =
|
||||
tmp.to_text . should_not_contain "pref"
|
||||
tmp.to_text . should_not_contain "suf"
|
||||
|
||||
group_builder.specify "should allow to materialize an input stream that is already associated with a temporary file without copying it" <|
|
||||
group_builder.specify "should allow to materialize an input stream that is already associated with a temporary file without copying it (low-level)" <|
|
||||
tmp = Temporary_File.new
|
||||
tmp.with_file f->
|
||||
"test payload 3" . write f
|
||||
|
||||
java_file = Java_File.new tmp.unsafe_get.absolute.path
|
||||
stream = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_file=tmp
|
||||
stream = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_source=tmp
|
||||
|
||||
tmp2 = Temporary_File.from_stream_light stream
|
||||
# The returned tmp file should be the same one as original.
|
||||
tmp2.should_be_a Temporary_File
|
||||
tmp2.unsafe_get.absolute.path . should_equal tmp.unsafe_get.absolute.path
|
||||
tmp2.should_equal tmp
|
||||
|
||||
# If the raw file is associated, the stream will return that File descriptor (not as temporary file, but regular one):
|
||||
stream3 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_file=tmp.unsafe_get
|
||||
stream3 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_source=tmp.unsafe_get
|
||||
f3 = Temporary_File.from_stream_light stream3
|
||||
f3.should_be_a File
|
||||
f3.absolute.path . should_equal tmp.unsafe_get.absolute.path
|
||||
|
||||
# But if there's no association, a new temporary file gets created:
|
||||
stream4 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_file=Nothing
|
||||
stream4 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_source=Nothing
|
||||
tmp4 = Temporary_File.from_stream_light stream4
|
||||
tmp4.should_be_a Temporary_File
|
||||
tmp4.unsafe_get.absolute.path . should_not_equal tmp.unsafe_get.absolute.path
|
||||
|
||||
# The base variant of from_stream also always copies:
|
||||
stream5 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_file=tmp
|
||||
stream5 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_source=tmp
|
||||
tmp5 = Temporary_File.from_stream stream5
|
||||
tmp5.should_be_a Temporary_File
|
||||
tmp5.unsafe_get.absolute.path . should_not_equal tmp.unsafe_get.absolute.path
|
||||
|
||||
group_builder.specify "should allow to materialize an input stream opened to a temporary file without copying it" pending="Currently the file stream does not know it came from Temporary_File. May be revisited in the future to automatically correlate it." <|
|
||||
tmp = Temporary_File.new
|
||||
tmp.with_file f->
|
||||
"test payload 4" . write f
|
||||
|
||||
tmp2 = tmp.with_file f-> f.with_input_stream [File_Access.Read] input_stream->
|
||||
Temporary_File.from_stream_light input_stream
|
||||
# The returned file should be the same as original
|
||||
tmp2.should_equal tmp
|
||||
|
||||
make_stream text =
|
||||
raw_stream = ByteArrayInputStream.new text.utf_8
|
||||
Input_Stream.new raw_stream (File_Error.handle_java_exceptions Nothing)
|
||||
|
@ -173,7 +173,8 @@ add_specs suite_builder =
|
||||
utf8_bytes = [97, 44, 98, 44, 99, 10, -60, -123, 44, -17, -65, -65, 44, -61, 40, -61, 40, 10]
|
||||
utf8_bytes.write_bytes utf8_file
|
||||
action_1 on_problems =
|
||||
utf8_file.read (Delimited_Format.Delimited "," headers=True) on_problems
|
||||
# We need to set the encoding explicitly, as otherwise we'd just fallback to Windows-1252 and have no errors
|
||||
utf8_file.read (Delimited_Format.Delimited "," headers=True encoding=Encoding.utf_8) on_problems
|
||||
tester_1 table =
|
||||
table.columns.map .name . should_equal ['a', 'b', 'c']
|
||||
table.at 'a' . to_vector . should_equal ['ą']
|
||||
@ -522,6 +523,18 @@ add_specs suite_builder =
|
||||
r.first_column.to_vector . should_equal ['\uFFFD']
|
||||
Problems.expect_only_warning Encoding_Error r
|
||||
|
||||
group_builder.specify "should fall back to Windows-1252 encoding if invalid UTF-8 characters are encountered in Default encoding" <|
|
||||
f = File.create_temporary_file "delimited-invalid-utf-8" ".csv"
|
||||
# On the simple characters all three encodings (ASCII, UTF-8 and Win-1252) agree, so we can use ASCII bytes.
|
||||
bytes = ('A,B\n1,y'.bytes Encoding.ascii) + [-1] + ('z\n2,-'.bytes Encoding.ascii)
|
||||
bytes.write_bytes f . should_succeed
|
||||
r = f.read
|
||||
r.should_be_a Table
|
||||
r.column_names.should_equal ["A", "B"]
|
||||
r.at "A" . to_vector . should_equal [1, 2]
|
||||
# We fallback to Win-1252 where byte -1 means ÿ
|
||||
r.at "B" . to_vector . should_equal ["yÿz", "-"]
|
||||
|
||||
main filter=Nothing =
|
||||
suite = Test.build suite_builder->
|
||||
add_specs suite_builder
|
||||
|
@ -577,6 +577,22 @@ add_specs suite_builder =
|
||||
t.at "A" . to_vector . should_equal [1, 2, 3, 4]
|
||||
t.at "B" . to_vector . should_equal ["ąęćś", "💯", "żółw", "🐢"]
|
||||
|
||||
Test.with_clue "Windows-1252 fallback: " <|
|
||||
initial_bytes = 'A,B\n1,a\n2,b\n3,¥\n4,d'.bytes Encoding.windows_1252
|
||||
initial_bytes.write_bytes f on_existing_file=Existing_File_Behavior.Overwrite . should_succeed
|
||||
|
||||
r = (Table.new [["A", [5, 6]], ["B", ["æ", "💯"]]]).write f on_existing_file=Existing_File_Behavior.Append
|
||||
# Should report problem writing the 💯 character, because selected Windows-1252 encoding does not support it
|
||||
Problems.expect_only_warning Encoding_Error r
|
||||
|
||||
# Now we read in auto mode, the part appended to the table should have been correctly written in Windows-1252
|
||||
t = f.read
|
||||
t.should_be_a Table
|
||||
t.column_names . should_equal ["A", "B"]
|
||||
t.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6]
|
||||
# The æ should have been correctly written in Win-1252, the 💯 is replaced by `?`.
|
||||
t.at "B" . to_vector . should_equal ["a", "b", "¥", "d", "æ", "?"]
|
||||
|
||||
group_builder.specify "should fail if the target file is read-only" <|
|
||||
f = enso_project.data / "transient" / "permission.csv"
|
||||
f.delete_if_exists
|
||||
@ -635,4 +651,3 @@ main filter=Nothing =
|
||||
suite = Test.build suite_builder->
|
||||
add_specs suite_builder
|
||||
suite.run_with_filter filter
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user