mirror of
https://github.com/enso-org/enso.git
synced 2024-11-25 21:25:20 +03:00
Implement Windows-1252 fallback logic for Encoding.Default
(#10190)
- Closes #10148 - [x] Tests for `Restartable_Input_Stream`, `peek_bytes` and `skip_n_bytes`. - [x] Report `Managed_Resource` stack overflow bug: #10211 - [x] Followup possible optimization: #10220 - [x] Test use-case from blog.
This commit is contained in:
parent
d938c96c55
commit
41d02e95ef
@ -16,6 +16,7 @@
|
|||||||
operations.][9950]
|
operations.][9950]
|
||||||
- [Implemented `.cast` to and from `Decimal` columns for the in-memory
|
- [Implemented `.cast` to and from `Decimal` columns for the in-memory
|
||||||
database.][10206]
|
database.][10206]
|
||||||
|
- [Implemented fallback to Windows-1252 encoding for `Encoding.Default`.][10190]
|
||||||
|
|
||||||
[debug-shortcuts]:
|
[debug-shortcuts]:
|
||||||
|
|
||||||
@ -23,6 +24,7 @@
|
|||||||
[10122]: https://github.com/enso-org/enso/pull/10122
|
[10122]: https://github.com/enso-org/enso/pull/10122
|
||||||
[10130]: https://github.com/enso-org/enso/pull/10130
|
[10130]: https://github.com/enso-org/enso/pull/10130
|
||||||
[10206]: https://github.com/enso-org/enso/pull/10206
|
[10206]: https://github.com/enso-org/enso/pull/10206
|
||||||
|
[10190]: https://github.com/enso-org/enso/pull/10190
|
||||||
|
|
||||||
<br/>![Release Notes](/docs/assets/tags/release_notes.svg)
|
<br/>![Release Notes](/docs/assets/tags/release_notes.svg)
|
||||||
|
|
||||||
|
@ -62,7 +62,10 @@ type Encoding
|
|||||||
encoding. Otherwise, the input is decoded using UTF-8 unless it contains
|
encoding. Otherwise, the input is decoded using UTF-8 unless it contains
|
||||||
invalid UTF-8 sequences, in which case Windows-1252 is used as a fallback.
|
invalid UTF-8 sequences, in which case Windows-1252 is used as a fallback.
|
||||||
|
|
||||||
When used for encoding, it will always encode using UTF-8.
|
When used for encoding, it will either use the same encoding detection
|
||||||
|
heuristics as in read in case of Append mode. When writing a new file,
|
||||||
|
it will always use UTF-8.
|
||||||
|
|
||||||
This encoding cannot be passed to some functions that require a Java
|
This encoding cannot be passed to some functions that require a Java
|
||||||
Charset.
|
Charset.
|
||||||
default -> Encoding =
|
default -> Encoding =
|
||||||
|
@ -33,6 +33,7 @@ import project.Errors.Time_Error.Time_Error
|
|||||||
import project.Meta
|
import project.Meta
|
||||||
import project.Nothing.Nothing
|
import project.Nothing.Nothing
|
||||||
import project.Panic.Panic
|
import project.Panic.Panic
|
||||||
|
import project.System.Internal.Reporting_Stream_Decoder_Helper
|
||||||
from project.Data.Boolean import Boolean, False, True
|
from project.Data.Boolean import Boolean, False, True
|
||||||
from project.Data.Json import Invalid_JSON, JS_Object, Json
|
from project.Data.Json import Invalid_JSON, JS_Object, Json
|
||||||
from project.Data.Numbers import Float, Integer, Number, Number_Parse_Error
|
from project.Data.Numbers import Float, Integer, Number, Number_Parse_Error
|
||||||
@ -763,11 +764,7 @@ Text.bytes self (encoding : Encoding = Encoding.utf_8) (on_problems : Problem_Be
|
|||||||
@encoding Encoding.default_widget
|
@encoding Encoding.default_widget
|
||||||
Text.from_bytes : Vector Integer -> Encoding -> Problem_Behavior -> Text
|
Text.from_bytes : Vector Integer -> Encoding -> Problem_Behavior -> Text
|
||||||
Text.from_bytes bytes (encoding : Encoding = Encoding.default) (on_problems : Problem_Behavior = Problem_Behavior.Report_Error) =
|
Text.from_bytes bytes (encoding : Encoding = Encoding.default) (on_problems : Problem_Behavior = Problem_Behavior.Report_Error) =
|
||||||
result = Encoding_Utils.from_bytes bytes encoding.to_java_charset_or_null : WithProblems
|
Reporting_Stream_Decoder_Helper.decode_bytes_to_text bytes encoding on_problems
|
||||||
if result.problems.is_empty then result.result else
|
|
||||||
problems = result.problems.map decoding_problem->
|
|
||||||
Encoding_Error.Error decoding_problem.message
|
|
||||||
on_problems.attach_problems_after result.result problems
|
|
||||||
|
|
||||||
## ICON convert
|
## ICON convert
|
||||||
Returns a vector containing bytes representing the UTF-8 encoding of the
|
Returns a vector containing bytes representing the UTF-8 encoding of the
|
||||||
|
@ -29,13 +29,13 @@ import project.System.File_Format.Infer
|
|||||||
import project.System.File_Format.Plain_Text_Format
|
import project.System.File_Format.Plain_Text_Format
|
||||||
import project.System.File_Format_Metadata.File_Format_Metadata
|
import project.System.File_Format_Metadata.File_Format_Metadata
|
||||||
import project.System.Input_Stream.Input_Stream
|
import project.System.Input_Stream.Input_Stream
|
||||||
|
import project.System.Advanced.Restartable_Input_Stream.Restartable_Input_Stream
|
||||||
from project.Data.Boolean import Boolean, False, True
|
from project.Data.Boolean import Boolean, False, True
|
||||||
from project.Data.Text.Extensions import all
|
from project.Data.Text.Extensions import all
|
||||||
from project.Metadata.Choice import Option
|
from project.Metadata.Choice import Option
|
||||||
from project.Metadata.Widget import Single_Choice
|
from project.Metadata.Widget import Single_Choice
|
||||||
from project.System.File_Format import format_types
|
from project.System.File_Format import format_types
|
||||||
|
|
||||||
polyglot java import java.io.ByteArrayInputStream
|
|
||||||
polyglot java import java.io.InputStream
|
polyglot java import java.io.InputStream
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
@ -62,24 +62,15 @@ type Response_Body
|
|||||||
Raw_Stream (raw_stream:Input_Stream) (metadata:File_Format_Metadata) uri:URI
|
Raw_Stream (raw_stream:Input_Stream) (metadata:File_Format_Metadata) uri:URI
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
Materialized_Byte_Array (bytes:Vector) (metadata:File_Format_Metadata) uri:URI
|
Materialized_Stream (restartable_stream:Restartable_Input_Stream) (metadata:File_Format_Metadata) uri:URI
|
||||||
|
|
||||||
## PRIVATE
|
|
||||||
Materialized_Temporary_File (temporary_file:Temporary_File) (metadata:File_Format_Metadata) uri:URI
|
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
with_stream : (Input_Stream -> Any ! HTTP_Error) -> Any ! HTTP_Error
|
with_stream : (Input_Stream -> Any ! HTTP_Error) -> Any ! HTTP_Error
|
||||||
with_stream self action = case self of
|
with_stream self action = case self of
|
||||||
Response_Body.Raw_Stream raw_stream _ _ ->
|
Response_Body.Raw_Stream raw_stream _ _ ->
|
||||||
Managed_Resource.bracket raw_stream (_.close) action
|
Managed_Resource.bracket raw_stream (_.close) action
|
||||||
Response_Body.Materialized_Byte_Array bytes _ _ ->
|
Response_Body.Materialized_Stream restartable_stream _ _ ->
|
||||||
byte_stream = Input_Stream.new (ByteArrayInputStream.new bytes) (HTTP_Error.handle_java_exceptions self.uri)
|
restartable_stream.with_fresh_stream action
|
||||||
Managed_Resource.bracket byte_stream (_.close) action
|
|
||||||
Response_Body.Materialized_Temporary_File temporary_file _ _ -> temporary_file.with_file file->
|
|
||||||
opts = [File_Access.Read.to_java]
|
|
||||||
stream = HTTP_Error.handle_java_exceptions self.uri (file.input_stream_builtin opts)
|
|
||||||
file_stream = Input_Stream.new stream (HTTP_Error.handle_java_exceptions self.uri) associated_file=temporary_file
|
|
||||||
Managed_Resource.bracket (file_stream) (_.close) action
|
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
ADVANCED
|
ADVANCED
|
||||||
@ -87,25 +78,11 @@ type Response_Body
|
|||||||
return a new Response_Body.
|
return a new Response_Body.
|
||||||
materialize : Input_Stream
|
materialize : Input_Stream
|
||||||
materialize self = case self of
|
materialize self = case self of
|
||||||
Response_Body.Raw_Stream _ _ _ ->
|
Response_Body.Raw_Stream _ metadata uri ->
|
||||||
self.with_stream body_stream->
|
restartable_stream = self.with_stream body_stream->
|
||||||
body_stream.with_java_stream body_java_stream->
|
body_stream.as_restartable_stream
|
||||||
first_block = body_java_stream.readNBytes maximum_body_in_memory
|
Response_Body.Materialized_Stream restartable_stream metadata uri
|
||||||
case first_block.length < maximum_body_in_memory of
|
Response_Body.Materialized_Stream _ _ _ -> self
|
||||||
True -> Response_Body.Materialized_Byte_Array (Vector.from_polyglot_array first_block) self.metadata self.uri
|
|
||||||
False -> Context.Output.with_enabled <|
|
|
||||||
## Write contents to a temporary file
|
|
||||||
temp_file = Temporary_File.new self.uri.host
|
|
||||||
r = temp_file.with_file file->
|
|
||||||
file.with_output_stream [File_Access.Write, File_Access.Create, File_Access.Truncate_Existing] output_stream->
|
|
||||||
output_stream.with_java_stream java_output_stream->
|
|
||||||
java_output_stream.write first_block
|
|
||||||
body_java_stream.transferTo java_output_stream
|
|
||||||
java_output_stream.flush
|
|
||||||
Nothing
|
|
||||||
r.if_not_error <|
|
|
||||||
Response_Body.Materialized_Temporary_File temp_file self.metadata self.uri
|
|
||||||
_ -> self
|
|
||||||
|
|
||||||
## ALIAS parse
|
## ALIAS parse
|
||||||
GROUP Input
|
GROUP Input
|
||||||
|
@ -37,8 +37,12 @@ type Managed_Resource
|
|||||||
function once it is no longer in use.
|
function once it is no longer in use.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
|
- resource: The resource to register.
|
||||||
- function: The action to be executed on resource to clean it up when
|
- function: The action to be executed on resource to clean it up when
|
||||||
it is no longer in use.
|
it is no longer in use.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A `Managed_Resource` object that can be used to access the resource.
|
||||||
register : Any -> (Any -> Nothing) -> Managed_Resource
|
register : Any -> (Any -> Nothing) -> Managed_Resource
|
||||||
register resource function = @Builtin_Method "Managed_Resource.register"
|
register resource function = @Builtin_Method "Managed_Resource.register"
|
||||||
|
|
||||||
|
@ -0,0 +1,105 @@
|
|||||||
|
import project.Any.Any
|
||||||
|
import project.Data.Text.Text
|
||||||
|
import project.Data.Vector.Vector
|
||||||
|
import project.Nothing.Nothing
|
||||||
|
import project.Runtime.Context
|
||||||
|
import project.Runtime.Managed_Resource.Managed_Resource
|
||||||
|
import project.System.File.Advanced.Temporary_File.Temporary_File
|
||||||
|
import project.System.File.File
|
||||||
|
import project.System.File.File_Access.File_Access
|
||||||
|
import project.System.Input_Stream.Input_Stream
|
||||||
|
from project.Data.Boolean import Boolean, False, True
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
An input stream that can be read multiple times.
|
||||||
|
|
||||||
|
It may be useful when multiple passes over the data are required.
|
||||||
|
If you need to check only the beginning of the stream, consider using a much
|
||||||
|
lighter `Input_Stream.as_peekable_stream`.
|
||||||
|
|
||||||
|
A generic stream can be converted to `Restartable_Input_Stream` by reading
|
||||||
|
all its contents and storing them either in memory or in a temporary file.
|
||||||
|
A stream backed by an existing file can be converted to
|
||||||
|
`Restartable_Input_Stream` at no cost.
|
||||||
|
|
||||||
|
! Stream Lifetime
|
||||||
|
|
||||||
|
Note that if we use an existing file as a shortcut to avoid copying the
|
||||||
|
data, we need to assume that the file will not be modified in the meantime.
|
||||||
|
Thus the `Restartable_Input_Stream` does not fully guarantee immutability
|
||||||
|
of the data. The lifetime of such `Restartable_Input_Stream` is tied to the
|
||||||
|
lifetime of its backing file.
|
||||||
|
|
||||||
|
If the stream should stay usable for a longer time, `extend_lifetime=True`
|
||||||
|
should be passed when creating it.
|
||||||
|
type Restartable_Input_Stream
|
||||||
|
## PRIVATE
|
||||||
|
`bytes` may be a Vector or a raw `byte[]` array (convertible to vector, but no annotation to avoid conversions).
|
||||||
|
private From_Bytes bytes
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
private From_Existing_File file:File
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
private From_Temporary_File temporary_file:Temporary_File
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
to_text self -> Text =
|
||||||
|
suffix = case self of
|
||||||
|
Restartable_Input_Stream.From_Bytes _ -> "From_Bytes"
|
||||||
|
Restartable_Input_Stream.From_Existing_File file -> "From_Existing_File "+file.to_text
|
||||||
|
Restartable_Input_Stream.From_Temporary_File _ -> "From_Temporary_File"
|
||||||
|
"Restartable_Input_Stream."+suffix
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
make (input_stream : Input_Stream) (extend_lifetime : Boolean) -> Restartable_Input_Stream =
|
||||||
|
case input_stream.associated_source of
|
||||||
|
temp_file : Temporary_File -> Restartable_Input_Stream.From_Temporary_File temp_file
|
||||||
|
file : File ->
|
||||||
|
if extend_lifetime then cache_generic_input_stream input_stream else
|
||||||
|
Restartable_Input_Stream.From_Existing_File file
|
||||||
|
bytes : Vector -> Restartable_Input_Stream.From_Bytes bytes
|
||||||
|
_ -> cache_generic_input_stream input_stream
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
Runs the provided action with a fresh input stream pointing to the
|
||||||
|
beginning of the data represented by this stream.
|
||||||
|
|
||||||
|
This method may be called multiple times, allowing multiple 'rounds' of
|
||||||
|
processing.
|
||||||
|
with_fresh_stream self (action : Input_Stream -> Any) -> Any =
|
||||||
|
case self of
|
||||||
|
Restartable_Input_Stream.From_Bytes bytes ->
|
||||||
|
Managed_Resource.bracket (Input_Stream.from_bytes bytes) (.close) action
|
||||||
|
Restartable_Input_Stream.From_Existing_File file ->
|
||||||
|
file.with_input_stream [File_Access.Read] action
|
||||||
|
Restartable_Input_Stream.From_Temporary_File temp_file ->
|
||||||
|
temp_file.with_file file->
|
||||||
|
file.with_input_stream [File_Access.Read] action
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
Maximum size for a stream to be held in memory.
|
||||||
|
If the amount of data exceeds this limit, it will be stored in a temporary file.
|
||||||
|
max_in_memory_size =
|
||||||
|
# 64 KiB
|
||||||
|
64 * 1024
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
private cache_generic_input_stream (input_stream : Input_Stream) -> Restartable_Input_Stream =
|
||||||
|
input_stream.with_java_stream java_input_stream->
|
||||||
|
first_block = java_input_stream.readNBytes max_in_memory_size
|
||||||
|
case first_block.length < max_in_memory_size of
|
||||||
|
True ->
|
||||||
|
Restartable_Input_Stream.From_Bytes first_block
|
||||||
|
False ->
|
||||||
|
Context.Output.with_enabled <|
|
||||||
|
temp_file = Temporary_File.new "restartable-input-stream"
|
||||||
|
r = temp_file.with_file file->
|
||||||
|
file.with_output_stream [File_Access.Write, File_Access.Create, File_Access.Truncate_Existing] output_stream->
|
||||||
|
output_stream.with_java_stream java_output_stream->
|
||||||
|
java_output_stream.write first_block
|
||||||
|
java_input_stream.transferTo java_output_stream
|
||||||
|
java_output_stream.flush
|
||||||
|
Nothing
|
||||||
|
r.if_not_error <|
|
||||||
|
Restartable_Input_Stream.From_Temporary_File temp_file
|
@ -246,11 +246,11 @@ type File
|
|||||||
file.with_input_stream [File_Access.Create, File_Access.Read] action
|
file.with_input_stream [File_Access.Create, File_Access.Read] action
|
||||||
with_input_stream : Vector File_Access -> (Input_Stream -> Any ! File_Error) -> Any ! File_Error
|
with_input_stream : Vector File_Access -> (Input_Stream -> Any ! File_Error) -> Any ! File_Error
|
||||||
with_input_stream self (open_options : Vector) action =
|
with_input_stream self (open_options : Vector) action =
|
||||||
new_input_stream : File -> Vector File_Access -> Output_Stream ! File_Error
|
new_input_stream : File -> Vector File_Access -> Input_Stream ! File_Error
|
||||||
new_input_stream file open_options =
|
new_input_stream file open_options =
|
||||||
opts = open_options . map (_.to_java)
|
opts = open_options . map (_.to_java)
|
||||||
stream = File_Error.handle_java_exceptions file (file.input_stream_builtin opts)
|
stream = File_Error.handle_java_exceptions file (file.input_stream_builtin opts)
|
||||||
Input_Stream.new stream (File_Error.handle_java_exceptions self)
|
Input_Stream.new stream (File_Error.handle_java_exceptions self) associated_source=self
|
||||||
|
|
||||||
if self.is_directory then Error.throw (File_Error.IO_Error self "File '"+self.path+"' is a directory") else
|
if self.is_directory then Error.throw (File_Error.IO_Error self "File '"+self.path+"' is a directory") else
|
||||||
open_as_data_link = (open_options.contains Data_Link_Access.No_Follow . not) && (Data_Link.is_data_link self)
|
open_as_data_link = (open_options.contains Data_Link_Access.No_Follow . not) && (Data_Link.is_data_link self)
|
||||||
|
@ -96,7 +96,7 @@ type Temporary_File
|
|||||||
If the stream is already backed by a temporary or regular file, that file is returned.
|
If the stream is already backed by a temporary or regular file, that file is returned.
|
||||||
from_stream_light : Input_Stream -> Temporary_File | File
|
from_stream_light : Input_Stream -> Temporary_File | File
|
||||||
from_stream_light stream =
|
from_stream_light stream =
|
||||||
case stream.associated_file of
|
case stream.associated_source of
|
||||||
tmp : Temporary_File -> tmp
|
tmp : Temporary_File -> tmp
|
||||||
file : File -> file
|
file : File -> file
|
||||||
_ -> Temporary_File.from_stream stream
|
_ -> Temporary_File.from_stream stream
|
||||||
|
@ -1,19 +1,28 @@
|
|||||||
import project.Any.Any
|
import project.Any.Any
|
||||||
|
import project.Data.Array.Array
|
||||||
import project.Data.Numbers.Integer
|
import project.Data.Numbers.Integer
|
||||||
import project.Data.Text.Encoding.Encoding
|
import project.Data.Text.Encoding.Encoding
|
||||||
import project.Data.Vector.Vector
|
import project.Data.Vector.Vector
|
||||||
import project.Errors.Encoding_Error.Encoding_Error
|
import project.Error.Error
|
||||||
|
import project.Errors.File_Error.File_Error
|
||||||
|
import project.Errors.Illegal_State.Illegal_State
|
||||||
import project.Errors.Problem_Behavior.Problem_Behavior
|
import project.Errors.Problem_Behavior.Problem_Behavior
|
||||||
import project.Nothing.Nothing
|
import project.Nothing.Nothing
|
||||||
import project.Runtime.Managed_Resource.Managed_Resource
|
import project.Runtime.Managed_Resource.Managed_Resource
|
||||||
|
import project.System.Advanced.Restartable_Input_Stream.Restartable_Input_Stream
|
||||||
import project.System.File.Advanced.Temporary_File.Temporary_File
|
import project.System.File.Advanced.Temporary_File.Temporary_File
|
||||||
import project.System.File.File
|
import project.System.File.File
|
||||||
import project.System.File.File_Access.File_Access
|
import project.System.File.File_Access.File_Access
|
||||||
import project.System.File.Generic.Writable_File.Writable_File
|
import project.System.File.Generic.Writable_File.Writable_File
|
||||||
|
import project.System.Internal.Reporting_Stream_Decoder_Helper
|
||||||
|
from project.Data.Boolean import Boolean, False, True
|
||||||
|
|
||||||
|
polyglot java import java.io.BufferedInputStream
|
||||||
|
polyglot java import java.io.ByteArrayInputStream
|
||||||
polyglot java import java.io.InputStream as Java_Input_Stream
|
polyglot java import java.io.InputStream as Java_Input_Stream
|
||||||
polyglot java import org.enso.base.encoding.ReportingStreamDecoder
|
|
||||||
polyglot java import org.enso.base.encoding.Encoding_Utils
|
polyglot java import org.enso.base.encoding.Encoding_Utils
|
||||||
|
polyglot java import org.enso.base.encoding.ReportingStreamDecoder
|
||||||
|
polyglot java import org.enso.base.Stream_Utils
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
An input stream, allowing for interactive reading of contents.
|
An input stream, allowing for interactive reading of contents.
|
||||||
@ -23,10 +32,17 @@ type Input_Stream
|
|||||||
|
|
||||||
Given a Java InputStream, wraps as a Managed_Resource and returns a new
|
Given a Java InputStream, wraps as a Managed_Resource and returns a new
|
||||||
Input_Stream.
|
Input_Stream.
|
||||||
new : Java_Input_Stream -> Any -> (Nothing | File | Temporary_File) -> Input_Stream
|
new : Java_Input_Stream -> Any -> (Nothing | File | Temporary_File | Vector | Array) -> Input_Stream
|
||||||
new java_stream error_handler associated_file=Nothing =
|
new java_stream error_handler associated_source=Nothing =
|
||||||
resource = Managed_Resource.register java_stream close_stream
|
resource = Managed_Resource.register java_stream close_stream
|
||||||
Input_Stream.Value resource error_handler associated_file
|
Input_Stream.Value resource error_handler associated_source
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
ADVANCED
|
||||||
|
Creates a new input stream from a vector of bytes.
|
||||||
|
from_bytes bytes -> Input_Stream =
|
||||||
|
raw_stream = ByteArrayInputStream.new bytes
|
||||||
|
Input_Stream.new raw_stream (File_Error.handle_java_exceptions Nothing) bytes
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
An input stream, allowing for interactive reading of contents.
|
An input stream, allowing for interactive reading of contents.
|
||||||
@ -35,15 +51,16 @@ type Input_Stream
|
|||||||
- stream_resource: The internal resource that represents the underlying
|
- stream_resource: The internal resource that represents the underlying
|
||||||
stream.
|
stream.
|
||||||
- error_handler: An error handler for IOExceptions thrown when reading.
|
- error_handler: An error handler for IOExceptions thrown when reading.
|
||||||
- associated_file: The file associated with this stream, if any.
|
- associated_source: The source associated with this stream, if any.
|
||||||
Value stream_resource error_handler (associated_file:Nothing|File|Temporary_File)
|
It can be used to cheaply convert this stream into a `Restartable_Input_Stream`.
|
||||||
|
private Value stream_resource error_handler (associated_source : Nothing | File | Temporary_File | Vector | Array)
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
ADVANCED
|
ADVANCED
|
||||||
Reads all the bytes in this stream into a vector of bytes.
|
Reads all the bytes in this stream into a vector of bytes.
|
||||||
read_all_bytes : Vector
|
read_all_bytes : Vector
|
||||||
read_all_bytes self = self.stream_resource . with java_stream->
|
read_all_bytes self = self.with_java_stream java_stream->
|
||||||
self.error_handler <| Vector.from_polyglot_array java_stream.readAllBytes
|
Vector.from_polyglot_array java_stream.readAllBytes
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
ADVANCED
|
ADVANCED
|
||||||
@ -58,10 +75,14 @@ type Input_Stream
|
|||||||
Arguments:
|
Arguments:
|
||||||
- n: The number of bytes to read from the stream.
|
- n: The number of bytes to read from the stream.
|
||||||
read_n_bytes : Integer -> Vector
|
read_n_bytes : Integer -> Vector
|
||||||
read_n_bytes self n = self.stream_resource . with java_stream->
|
read_n_bytes self (n : Integer) = self.with_java_stream java_stream->
|
||||||
self.error_handler <|
|
bytes = java_stream.readNBytes n
|
||||||
bytes = java_stream.readNBytes n
|
Vector.from_polyglot_array bytes
|
||||||
Vector.from_polyglot_array bytes
|
|
||||||
|
## PRIVATE
|
||||||
|
It may throw an error if not enough bytes are available.
|
||||||
|
skip_n_bytes self (n : Integer) = self.with_java_stream java_stream->
|
||||||
|
java_stream.skipNBytes n
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
ADVANCED
|
ADVANCED
|
||||||
@ -70,9 +91,8 @@ type Input_Stream
|
|||||||
The returned value is an integer in the range 0-255 representing the
|
The returned value is an integer in the range 0-255 representing the
|
||||||
next byte of input, or -1 if end of stream is reached.
|
next byte of input, or -1 if end of stream is reached.
|
||||||
read_byte : Integer
|
read_byte : Integer
|
||||||
read_byte self = self.stream_resource . with java_stream->
|
read_byte self = self.with_java_stream java_stream->
|
||||||
self.error_handler <|
|
java_stream.read
|
||||||
java_stream.read
|
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
ADVANCED
|
ADVANCED
|
||||||
@ -93,17 +113,75 @@ type Input_Stream
|
|||||||
Arguments:
|
Arguments:
|
||||||
- f: Applies a function over the internal java stream.
|
- f: Applies a function over the internal java stream.
|
||||||
with_java_stream : (Java_Input_Stream -> Any) -> Any
|
with_java_stream : (Java_Input_Stream -> Any) -> Any
|
||||||
with_java_stream self f = self.stream_resource . with f
|
with_java_stream self f = self.stream_resource . with java_stream->
|
||||||
|
self.error_handler <| f java_stream
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
Runs an action with a `ReportingStreamDecoder` decoding data from the
|
Runs an action with a `ReportingStreamDecoder` decoding data from the
|
||||||
input stream with the specified encoding.
|
input stream with the specified encoding.
|
||||||
with_stream_decoder : Encoding -> Problem_Behavior -> (ReportingStreamDecoder -> Any) -> Any
|
with_stream_decoder : Encoding -> Problem_Behavior -> (ReportingStreamDecoder -> Any) -> Any
|
||||||
with_stream_decoder self encoding on_problems action = self.stream_resource . with java_stream->
|
with_stream_decoder self (encoding : Encoding) (on_problems : Problem_Behavior) action =
|
||||||
results = Encoding_Utils.with_stream_decoder java_stream encoding.to_java_charset_or_null action
|
Reporting_Stream_Decoder_Helper.run self encoding on_problems allow_usage_in_background_threads=True action
|
||||||
problems = Vector.from_polyglot_array results.problems . map decoding_problem->
|
|
||||||
Encoding_Error.Error decoding_problem.message
|
## PRIVATE
|
||||||
on_problems.attach_problems_after results.result problems
|
Converts this stream into a restartable stream.
|
||||||
|
|
||||||
|
The conversion tries to be efficient, avoiding reading the stream if it
|
||||||
|
is simply backed by a file or byte array.
|
||||||
|
This stream is invalidated after the conversion.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
- extend_lifetime: This option is only applicable if the source stream
|
||||||
|
was backed by a file. If set to `True`, a new temporary file will be
|
||||||
|
created, to untie the lifetime of the returned stream from the backing
|
||||||
|
file. If set to `False`, the returned stream will be valid only as long
|
||||||
|
as the original backing file, but the operation will be more efficient
|
||||||
|
- so it is preferred if the caller can guarantee the lifetime of such
|
||||||
|
backing file.
|
||||||
|
as_restartable_stream self (extend_lifetime : Boolean = True) -> Restartable_Input_Stream =
|
||||||
|
Restartable_Input_Stream.make self extend_lifetime
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
is_peekable self -> Boolean =
|
||||||
|
self.with_java_stream java_stream->
|
||||||
|
java_stream.markSupported
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
Converts this stream into a stream that can be peeked.
|
||||||
|
This is useful when some application needs to look ahead in the stream,
|
||||||
|
but then needs to pass it further with the same state.
|
||||||
|
|
||||||
|
The peeked bytes are cached in memory, so this should only be used for
|
||||||
|
small amounts of data. If more data has to be processed more than once,
|
||||||
|
`as_restartable_stream` is preferred.
|
||||||
|
|
||||||
|
The current stream may be invalidated after the conversion, and it should
|
||||||
|
no longer be used - only the returned stream should be used.
|
||||||
|
as_peekable_stream self -> Input_Stream = if self.is_peekable then self else
|
||||||
|
raw_java_stream = self.stream_resource.take
|
||||||
|
buffered_stream = BufferedInputStream.new raw_java_stream
|
||||||
|
Input_Stream.new buffered_stream self.error_handler self.associated_source
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
Peeks up to the provided number of bytes from the stream.
|
||||||
|
|
||||||
|
Makes a best-effort to read as many bytes as provided, however fewer
|
||||||
|
bytes may be read, if end of stream is encountered.
|
||||||
|
|
||||||
|
The length of the returned vector is the same as the number of bytes
|
||||||
|
read.
|
||||||
|
|
||||||
|
No bytes are consumed from the stream - a next read or peek
|
||||||
|
operation will see the same contents as before this call.
|
||||||
|
|
||||||
|
This operation is only allowed if `is_peekable` returns `True`.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
- n: The number of bytes to read from the stream.
|
||||||
|
peek_bytes self (n : Integer) -> Vector Integer =
|
||||||
|
if self.is_peekable.not then Error.throw (Illegal_State.Error "`peek_bytes` called on a stream where `is_peekable=False`. Please convert the stream using `as_peekable_stream`.") else
|
||||||
|
self.with_java_stream java_stream->
|
||||||
|
Vector.from_polyglot_array <| Stream_Utils.peek java_stream n
|
||||||
|
|
||||||
## PRIVATE
|
## PRIVATE
|
||||||
Reads the contents of this stream into a given file.
|
Reads the contents of this stream into a given file.
|
||||||
|
@ -0,0 +1,130 @@
|
|||||||
|
private
|
||||||
|
|
||||||
|
import project.Any.Any
|
||||||
|
import project.Data.Text.Encoding.Encoding
|
||||||
|
import project.Data.Text.Text
|
||||||
|
import project.Data.Vector.Vector
|
||||||
|
import project.Errors.Encoding_Error.Encoding_Error
|
||||||
|
import project.Errors.Problem_Behavior.Problem_Behavior
|
||||||
|
import project.Nothing.Nothing
|
||||||
|
import project.Runtime.Managed_Resource.Managed_Resource
|
||||||
|
import project.System.Input_Stream.Input_Stream
|
||||||
|
from project.Data.Boolean import Boolean, False, True
|
||||||
|
from project.Runtime import assert
|
||||||
|
|
||||||
|
polyglot java import org.enso.base.encoding.DecodingProblemAggregator
|
||||||
|
polyglot java import org.enso.base.encoding.Encoding_Utils
|
||||||
|
polyglot java import org.enso.base.encoding.ReportingStreamDecoder
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
Builds the `ReportingStreamDecoder`, consuming the `Input_Stream`.
|
||||||
|
It will do any necessary encoding detection, as determined by the `Encoding`
|
||||||
|
run (input_stream : Input_Stream) (encoding : Encoding) (on_problems : Problem_Behavior) (allow_usage_in_background_threads : Boolean) (continuation : ReportingStreamDecoder -> Any) =
|
||||||
|
# We always ensure the stream is peekable, as that also implies buffering which is supposedly more efficient e.g. when working with files.
|
||||||
|
buffered_input_stream = input_stream.as_peekable_stream
|
||||||
|
problem_aggregator = DecodingProblemAggregator.new
|
||||||
|
result = resolve_encoding encoding buffered_input_stream problem_aggregator effective_encoding-> amended_input_stream->
|
||||||
|
amended_input_stream.with_java_stream java_stream->
|
||||||
|
# We can only poll safepoints if the decoder is guaranteed to be used in the main thread only.
|
||||||
|
poll_safepoints = allow_usage_in_background_threads.not
|
||||||
|
decoder = ReportingStreamDecoder.new java_stream effective_encoding.to_java_charset problem_aggregator poll_safepoints
|
||||||
|
continuation decoder
|
||||||
|
problems = Vector.from_polyglot_array problem_aggregator.summarize . map decoding_problem->
|
||||||
|
Encoding_Error.Error decoding_problem.message
|
||||||
|
on_problems.attach_problems_after result problems
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
decode_bytes_to_text bytes (encoding : Encoding) (on_problems : Problem_Behavior) -> Text =
|
||||||
|
Managed_Resource.bracket (Input_Stream.from_bytes bytes) (.close) input_stream->
|
||||||
|
run input_stream encoding on_problems allow_usage_in_background_threads=False decoder->
|
||||||
|
decoder.readAllIntoMemory
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
resolve_encoding (encoding : Encoding) (buffered_input_stream : Input_Stream) (problem_aggregator : DecodingProblemAggregator) (continuation : Encoding -> Input_Stream -> Any) -> Any =
|
||||||
|
case encoding of
|
||||||
|
Encoding.Default ->
|
||||||
|
detect_default_encoding buffered_input_stream problem_aggregator continuation
|
||||||
|
Encoding.Value "UTF-8" ->
|
||||||
|
verify_unicode_encoding encoding Unicode_BOM.UTF_8 buffered_input_stream problem_aggregator continuation
|
||||||
|
Encoding.Value "UTF-16LE" ->
|
||||||
|
verify_unicode_encoding encoding Unicode_BOM.UTF_16_LE buffered_input_stream problem_aggregator continuation
|
||||||
|
Encoding.Value "UTF-16BE" ->
|
||||||
|
verify_unicode_encoding encoding Unicode_BOM.UTF_16_BE buffered_input_stream problem_aggregator continuation
|
||||||
|
|
||||||
|
# Any other encoding just continues without any additional processing.
|
||||||
|
_ -> continuation encoding buffered_input_stream
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
detect_default_encoding (buffered_input_stream : Input_Stream) (problem_aggregator : DecodingProblemAggregator) (continuation : Encoding -> Input_Stream -> Any) -> Any =
|
||||||
|
case detect_bom buffered_input_stream of
|
||||||
|
# We rely on the detected BOM for the encoding, even if there are errors down the line.
|
||||||
|
detected_bom : Unicode_BOM ->
|
||||||
|
encoding_name = detected_bom.corresponding_encoding.to_display_text
|
||||||
|
context_message = "An " + encoding_name + " BOM was detected, so " + encoding_name + " encoding has been assumed, but some characters seem invalid: "
|
||||||
|
problem_aggregator.setInvalidCharacterErrorPrefix context_message
|
||||||
|
# Skip the BOM to avoid including it in the decoded data.
|
||||||
|
buffered_input_stream.skip_n_bytes detected_bom.as_bytes.length . if_not_error <|
|
||||||
|
continuation detected_bom.corresponding_encoding buffered_input_stream
|
||||||
|
|
||||||
|
# If no BOM was detected, we do a pass to try UTF-8 encoding and if it fails, we restart and fallback to Windows-1252.
|
||||||
|
Nothing ->
|
||||||
|
# We don't need to extend the lifetime of the stream, as it will not be kept longer than the lifetime of the `buffered_input_stream`.
|
||||||
|
restartable_stream = buffered_input_stream.as_restartable_stream extend_lifetime=False
|
||||||
|
is_valid_utf_8 = restartable_stream.with_fresh_stream checking_stream->
|
||||||
|
checking_stream.with_java_stream java_checking_stream->
|
||||||
|
Encoding_Utils.canDecodeWithoutErrors java_checking_stream Encoding.utf_8.to_java_charset
|
||||||
|
effective_encoding = if is_valid_utf_8 then Encoding.utf_8 else Encoding.windows_1252
|
||||||
|
restartable_stream.with_fresh_stream input_stream->
|
||||||
|
continuation effective_encoding input_stream
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
verify_unicode_encoding (encoding : Encoding) (expected_bom : Unicode_BOM) (buffered_input_stream : Input_Stream) (problem_aggregator : DecodingProblemAggregator) (continuation : Encoding -> Input_Stream -> Any) -> Any =
|
||||||
|
case detect_bom buffered_input_stream of
|
||||||
|
Nothing ->
|
||||||
|
# No BOM detected, so we just proceed.
|
||||||
|
continuation encoding buffered_input_stream
|
||||||
|
detected_bom : Unicode_BOM ->
|
||||||
|
case detected_bom == expected_bom of
|
||||||
|
True ->
|
||||||
|
# We found the expected BOM. We skip it to avoid including it in decoded data.
|
||||||
|
buffered_input_stream.skip_n_bytes expected_bom.as_bytes.length . if_not_error <|
|
||||||
|
continuation encoding buffered_input_stream
|
||||||
|
False ->
|
||||||
|
# Report BOM mismatch
|
||||||
|
message = detected_bom.corresponding_encoding.to_display_text + " BOM has been found when decoding as " + encoding.to_display_text + "."
|
||||||
|
problem_aggregator.reportOtherProblem message
|
||||||
|
continuation encoding buffered_input_stream
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
type Unicode_BOM
|
||||||
|
## PRIVATE
|
||||||
|
UTF_8
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
UTF_16_LE
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
UTF_16_BE
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
as_bytes self -> Vector = case self of
|
||||||
|
Unicode_BOM.UTF_8 -> [-17, -69, -65]
|
||||||
|
Unicode_BOM.UTF_16_LE -> [-1, -2]
|
||||||
|
Unicode_BOM.UTF_16_BE -> [-2, -1]
|
||||||
|
|
||||||
|
corresponding_encoding self -> Encoding = case self of
|
||||||
|
Unicode_BOM.UTF_8 -> Encoding.utf_8
|
||||||
|
Unicode_BOM.UTF_16_LE -> Encoding.utf_16_le
|
||||||
|
Unicode_BOM.UTF_16_BE -> Encoding.utf_16_be
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
all = [Unicode_BOM.UTF_8, Unicode_BOM.UTF_16_LE, Unicode_BOM.UTF_16_BE]
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
detect_bom (input_stream : Input_Stream) -> Unicode_BOM | Nothing =
|
||||||
|
assert input_stream.is_peekable
|
||||||
|
beginning = input_stream.peek_bytes 3
|
||||||
|
matching_bom = Unicode_BOM.all.find if_missing=Nothing bom->
|
||||||
|
expected_bytes = bom.as_bytes
|
||||||
|
expected_bytes == (beginning.take expected_bytes.length)
|
||||||
|
matching_bom
|
28
std-bits/base/src/main/java/org/enso/base/Stream_Utils.java
Normal file
28
std-bits/base/src/main/java/org/enso/base/Stream_Utils.java
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
package org.enso.base;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
public class Stream_Utils {
|
||||||
|
public static byte[] peek(InputStream stream, int n) throws IOException {
|
||||||
|
assert n >= 0;
|
||||||
|
assert stream.markSupported();
|
||||||
|
|
||||||
|
byte[] buffer = new byte[n];
|
||||||
|
stream.mark(n + 1);
|
||||||
|
int offset = 0;
|
||||||
|
while (offset < n) {
|
||||||
|
int read = stream.read(buffer, offset, n - offset);
|
||||||
|
if (read == -1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
offset += read;
|
||||||
|
}
|
||||||
|
stream.reset();
|
||||||
|
if (offset < n) {
|
||||||
|
buffer = Arrays.copyOf(buffer, offset);
|
||||||
|
}
|
||||||
|
return buffer;
|
||||||
|
}
|
||||||
|
}
|
@ -31,6 +31,10 @@ public class DecodingProblemAggregator {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean hasEncounteredInvalidCharacters() {
|
||||||
|
return invalidUnitCount > 0;
|
||||||
|
}
|
||||||
|
|
||||||
private DecodingProblem summarizeInvalidCharacterProblems() {
|
private DecodingProblem summarizeInvalidCharacterProblems() {
|
||||||
if (invalidUnitCount == 0) {
|
if (invalidUnitCount == 0) {
|
||||||
return null;
|
return null;
|
||||||
|
@ -1,213 +0,0 @@
|
|||||||
package org.enso.base.encoding;
|
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
|
||||||
import java.io.EOFException;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.charset.Charset;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
|
|
||||||
public abstract sealed class EncodingRepresentation {
|
|
||||||
private static final byte[] UTF_8_BOM = new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
|
|
||||||
private static final byte[] UTF_16_BE_BOM = new byte[] {(byte) 0xFE, (byte) 0xFF};
|
|
||||||
private static final byte[] UTF_16_LE_BOM = new byte[] {(byte) 0xFF, (byte) 0xFE};
|
|
||||||
|
|
||||||
private static byte[] peekStream(BufferedInputStream stream, int n) throws IOException {
|
|
||||||
byte[] buffer = new byte[n];
|
|
||||||
stream.mark(n + 1);
|
|
||||||
int offset = 0;
|
|
||||||
while (offset < n) {
|
|
||||||
int read = stream.read(buffer, offset, n - offset);
|
|
||||||
if (read == -1) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
offset += read;
|
|
||||||
}
|
|
||||||
stream.reset();
|
|
||||||
return buffer;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void skipStream(BufferedInputStream stream, int n) throws IOException {
|
|
||||||
try {
|
|
||||||
stream.skipNBytes(n);
|
|
||||||
} catch (EOFException eofException) {
|
|
||||||
// ignore early EOF
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean startsWith(byte[] buffer, byte[] prefix) {
|
|
||||||
if (buffer.length < prefix.length) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < prefix.length; i++) {
|
|
||||||
if (buffer[i] != prefix[i]) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static EncodingRepresentation fromCharset(Charset charset) {
|
|
||||||
if (charset == null) {
|
|
||||||
return new Default();
|
|
||||||
} else if (charset.equals(StandardCharsets.UTF_8)) {
|
|
||||||
return UTF8.INSTANCE;
|
|
||||||
} else if (charset.equals(StandardCharsets.UTF_16LE)) {
|
|
||||||
return UTF16.LittleEndian;
|
|
||||||
} else if (charset.equals(StandardCharsets.UTF_16BE)) {
|
|
||||||
return UTF16.BigEndian;
|
|
||||||
} else {
|
|
||||||
return new Other(charset);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Detects the effective charset based on initial data present in the stream.
|
|
||||||
*
|
|
||||||
* @param stream the stream to detect the charset for. Initial BOM header may be deliberately
|
|
||||||
* consumed by this method, so that it is ignored for further processing. No other data is
|
|
||||||
* consumed (we only peek).
|
|
||||||
* @param problemAggregator an aggregator to report any warnings about detected encoding, or
|
|
||||||
* report context metadata to be used in future errors reported by other components.
|
|
||||||
*/
|
|
||||||
public abstract Charset detectCharset(
|
|
||||||
BufferedInputStream stream, DecodingProblemAggregator problemAggregator) throws IOException;
|
|
||||||
|
|
||||||
private static final class Default extends EncodingRepresentation {
|
|
||||||
// Note: the Windows-1252 fallback is not implemented as part of this detection - it only allows
|
|
||||||
// to distinguish UTF BOMs.
|
|
||||||
@Override
|
|
||||||
public Charset detectCharset(
|
|
||||||
BufferedInputStream stream, DecodingProblemAggregator problemAggregator)
|
|
||||||
throws IOException {
|
|
||||||
byte[] beginning = peekStream(stream, 3);
|
|
||||||
if (startsWith(beginning, UTF_8_BOM)) {
|
|
||||||
skipStream(stream, UTF_8_BOM.length);
|
|
||||||
notifyContextAboutAssumedEncoding(problemAggregator, "UTF-8");
|
|
||||||
return StandardCharsets.UTF_8;
|
|
||||||
} else if (startsWith(beginning, UTF_16_BE_BOM)) {
|
|
||||||
skipStream(stream, UTF_16_BE_BOM.length);
|
|
||||||
notifyContextAboutAssumedEncoding(problemAggregator, "UTF-16 BE");
|
|
||||||
return StandardCharsets.UTF_16BE;
|
|
||||||
} else if (startsWith(beginning, UTF_16_LE_BOM)) {
|
|
||||||
skipStream(stream, UTF_16_LE_BOM.length);
|
|
||||||
notifyContextAboutAssumedEncoding(problemAggregator, "UTF-16 LE");
|
|
||||||
return StandardCharsets.UTF_16LE;
|
|
||||||
} else {
|
|
||||||
// If no BOM we fallback to UTF-8.
|
|
||||||
return StandardCharsets.UTF_8;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void notifyContextAboutAssumedEncoding(
|
|
||||||
DecodingProblemAggregator problemAggregator, String encodingName) {
|
|
||||||
String prefix =
|
|
||||||
"An "
|
|
||||||
+ encodingName
|
|
||||||
+ " BOM was detected, so "
|
|
||||||
+ encodingName
|
|
||||||
+ " encoding has been assumed, but some characters seem invalid: ";
|
|
||||||
problemAggregator.setInvalidCharacterErrorPrefix(prefix);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static final class Other extends EncodingRepresentation {
|
|
||||||
private final Charset charset;
|
|
||||||
|
|
||||||
public Other(Charset charset) {
|
|
||||||
this.charset = charset;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Charset detectCharset(
|
|
||||||
BufferedInputStream stream, DecodingProblemAggregator problemAggregator)
|
|
||||||
throws IOException {
|
|
||||||
// We ignore the stream as we just use the provided encoding as-is.
|
|
||||||
return charset;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static final class UTF8 extends EncodingRepresentation {
|
|
||||||
static final UTF8 INSTANCE = new UTF8();
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Charset detectCharset(
|
|
||||||
BufferedInputStream stream, DecodingProblemAggregator problemAggregator)
|
|
||||||
throws IOException {
|
|
||||||
byte[] beginning = peekStream(stream, UTF_8_BOM.length);
|
|
||||||
if (startsWith(beginning, UTF_8_BOM)) {
|
|
||||||
skipStream(stream, UTF_8_BOM.length);
|
|
||||||
} else {
|
|
||||||
if (startsWith(beginning, UTF_16_BE_BOM) || startsWith(beginning, UTF_16_LE_BOM)) {
|
|
||||||
problemAggregator.reportOtherProblem("UTF-16 BOM has been found when decoding as UTF-8.");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return StandardCharsets.UTF_8;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static final class UTF16 extends EncodingRepresentation {
|
|
||||||
static final UTF16 BigEndian = new UTF16(StandardCharsets.UTF_16BE, Endianness.BigEndian);
|
|
||||||
static final UTF16 LittleEndian = new UTF16(StandardCharsets.UTF_16LE, Endianness.LittleEndian);
|
|
||||||
|
|
||||||
private final Charset charset;
|
|
||||||
private final byte[] expectedBOM;
|
|
||||||
private final byte[] flippedBOM;
|
|
||||||
private Endianness endianness;
|
|
||||||
|
|
||||||
private UTF16(Charset charset, Endianness endianness) {
|
|
||||||
this.charset = charset;
|
|
||||||
this.expectedBOM =
|
|
||||||
switch (endianness) {
|
|
||||||
case BigEndian -> UTF_16_BE_BOM;
|
|
||||||
case LittleEndian -> UTF_16_LE_BOM;
|
|
||||||
};
|
|
||||||
this.flippedBOM =
|
|
||||||
switch (endianness) {
|
|
||||||
case BigEndian -> UTF_16_LE_BOM;
|
|
||||||
case LittleEndian -> UTF_16_BE_BOM;
|
|
||||||
};
|
|
||||||
this.endianness = endianness;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Charset detectCharset(
|
|
||||||
BufferedInputStream stream, DecodingProblemAggregator problemAggregator)
|
|
||||||
throws IOException {
|
|
||||||
assert expectedBOM.length == flippedBOM.length;
|
|
||||||
assert expectedBOM.length == 2;
|
|
||||||
byte[] beginning = peekStream(stream, 2);
|
|
||||||
if (startsWith(beginning, expectedBOM)) {
|
|
||||||
skipStream(stream, expectedBOM.length);
|
|
||||||
} else if (startsWith(beginning, flippedBOM)) {
|
|
||||||
problemAggregator.reportOtherProblem(
|
|
||||||
"Decoding as UTF-16 "
|
|
||||||
+ endianness
|
|
||||||
+ ", but a "
|
|
||||||
+ endianness.flip()
|
|
||||||
+ " BOM has been found.");
|
|
||||||
}
|
|
||||||
return charset;
|
|
||||||
}
|
|
||||||
|
|
||||||
enum Endianness {
|
|
||||||
BigEndian,
|
|
||||||
LittleEndian;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
return switch (this) {
|
|
||||||
case BigEndian -> "Big Endian";
|
|
||||||
case LittleEndian -> "Little Endian";
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
public Endianness flip() {
|
|
||||||
return switch (this) {
|
|
||||||
case BigEndian -> LittleEndian;
|
|
||||||
case LittleEndian -> BigEndian;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,7 +1,5 @@
|
|||||||
package org.enso.base.encoding;
|
package org.enso.base.encoding;
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
|
||||||
import java.io.ByteArrayInputStream;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
@ -13,7 +11,6 @@ import java.nio.charset.CharsetEncoder;
|
|||||||
import java.nio.charset.CoderResult;
|
import java.nio.charset.CoderResult;
|
||||||
import java.nio.charset.CodingErrorAction;
|
import java.nio.charset.CodingErrorAction;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
|
||||||
import java.util.function.BiConsumer;
|
import java.util.function.BiConsumer;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.function.IntFunction;
|
import java.util.function.IntFunction;
|
||||||
@ -97,89 +94,25 @@ public class Encoding_Utils {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts an array of encoded bytes into a string.
|
* Checks if the following stream can be decoded without errors using the given charset.
|
||||||
*
|
*
|
||||||
* @param bytes the bytes to convert
|
* <p>The stream is (partially or wholly) consumed as part of this process.
|
||||||
* @param charset the character set to use for decoding, use {@code null} to try auto-detection
|
|
||||||
* @return the resulting string, and any potential problems
|
|
||||||
*/
|
*/
|
||||||
public static WithProblems<String, DecodingProblem> from_bytes(byte[] bytes, Charset charset) {
|
public static boolean canDecodeWithoutErrors(InputStream stream, Charset charset)
|
||||||
if (bytes == null || bytes.length == 0) {
|
throws IOException {
|
||||||
return new WithProblems<>("", List.of());
|
|
||||||
}
|
|
||||||
|
|
||||||
DecodingProblemAggregator problemAggregator = new DecodingProblemAggregator();
|
DecodingProblemAggregator problemAggregator = new DecodingProblemAggregator();
|
||||||
ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes);
|
try (var decoder = new ReportingStreamDecoder(stream, charset, problemAggregator, true)) {
|
||||||
ReportingStreamDecoder decoder;
|
char[] tmpBuffer = new char[1024];
|
||||||
try {
|
while (decoder.read(tmpBuffer) >= 0) {
|
||||||
decoder = create_stream_decoder(inputStream, charset, problemAggregator, true);
|
if (problemAggregator.hasEncounteredInvalidCharacters()) {
|
||||||
} catch (IOException e) {
|
// early exit - no need to process the stream any further
|
||||||
throw new IllegalStateException(
|
return false;
|
||||||
"Unexpected IO exception in internal code: " + e.getMessage(), e);
|
|
||||||
}
|
|
||||||
|
|
||||||
CharBuffer out = CharBuffer.allocate((int) (bytes.length * decoder.averageCharsPerByte()));
|
|
||||||
try {
|
|
||||||
int n;
|
|
||||||
do {
|
|
||||||
if (!out.hasRemaining()) {
|
|
||||||
out = resize(out, CharBuffer::allocate, CharBuffer::put);
|
|
||||||
}
|
}
|
||||||
// read is already polling safepoints so we don't have to
|
}
|
||||||
n = decoder.read(out);
|
|
||||||
} while (n >= 0);
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new IllegalStateException("Unexpected exception: " + e.getMessage(), e);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
out.flip();
|
// Check one more time after EOF
|
||||||
return new WithProblems<>(out.toString(), problemAggregator.summarize());
|
return !problemAggregator.hasEncounteredInvalidCharacters();
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates a new instance of {@code ReportingStreamDecoder} decoding a given charset.
|
|
||||||
*
|
|
||||||
* @param stream the input stream to decode
|
|
||||||
* @param charset the character set to use for decoding, use {@code null} to try auto-detection
|
|
||||||
* @param pollSafepoints whether to poll for safepoints during decoding. This should be true if
|
|
||||||
* the decoding will run on the main thread, and false otherwise.
|
|
||||||
*/
|
|
||||||
private static ReportingStreamDecoder create_stream_decoder(
|
|
||||||
InputStream stream,
|
|
||||||
Charset charset,
|
|
||||||
DecodingProblemAggregator problemAggregator,
|
|
||||||
boolean pollSafepoints)
|
|
||||||
throws IOException {
|
|
||||||
BufferedInputStream bufferedStream = new BufferedInputStream(stream);
|
|
||||||
EncodingRepresentation representation = EncodingRepresentation.fromCharset(charset);
|
|
||||||
// This may also advance the stream past the BOM
|
|
||||||
Charset detectedCharset = representation.detectCharset(bufferedStream, problemAggregator);
|
|
||||||
return new ReportingStreamDecoder(
|
|
||||||
bufferedStream, detectedCharset, problemAggregator, pollSafepoints);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A helper function which runs an action with a created stream decoder and closes it afterwards.
|
|
||||||
*
|
|
||||||
* <p>It returns the result returned from the executed action and any encoding problems that
|
|
||||||
* occurred when processing it.
|
|
||||||
*
|
|
||||||
* @param stream the input stream to decode
|
|
||||||
* @param charset the character set to use for decoding, use {@code null} to try auto-detection
|
|
||||||
* @param action the action to run with the created decoder
|
|
||||||
* @return the result of the action and any problems that occurred during decoding
|
|
||||||
*/
|
|
||||||
public static WithProblems<Value, DecodingProblem> with_stream_decoder(
|
|
||||||
InputStream stream, Charset charset, Function<ReportingStreamDecoder, Value> action)
|
|
||||||
throws IOException {
|
|
||||||
DecodingProblemAggregator problemAggregator = new DecodingProblemAggregator();
|
|
||||||
Value result;
|
|
||||||
ReportingStreamDecoder decoder =
|
|
||||||
create_stream_decoder(stream, charset, problemAggregator, false);
|
|
||||||
try (decoder) {
|
|
||||||
result = action.apply(decoder);
|
|
||||||
}
|
|
||||||
return new WithProblems<>(result, problemAggregator.summarize());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Creates a new instance of {@code ReportingStreamEncoder} encoding a given charset. */
|
/** Creates a new instance of {@code ReportingStreamEncoder} encoding a given charset. */
|
||||||
|
@ -2,8 +2,8 @@ package org.enso.base.encoding;
|
|||||||
|
|
||||||
import static org.enso.base.encoding.Encoding_Utils.INVALID_CHARACTER;
|
import static org.enso.base.encoding.Encoding_Utils.INVALID_CHARACTER;
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
import java.nio.CharBuffer;
|
import java.nio.CharBuffer;
|
||||||
@ -30,7 +30,7 @@ public class ReportingStreamDecoder extends Reader {
|
|||||||
return decoder.averageCharsPerByte();
|
return decoder.averageCharsPerByte();
|
||||||
}
|
}
|
||||||
|
|
||||||
private final BufferedInputStream bufferedInputStream;
|
private final InputStream inputStream;
|
||||||
private final CharsetDecoder decoder;
|
private final CharsetDecoder decoder;
|
||||||
private final Charset charset;
|
private final Charset charset;
|
||||||
|
|
||||||
@ -39,11 +39,11 @@ public class ReportingStreamDecoder extends Reader {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public ReportingStreamDecoder(
|
public ReportingStreamDecoder(
|
||||||
BufferedInputStream stream,
|
InputStream stream,
|
||||||
Charset charset,
|
Charset charset,
|
||||||
DecodingProblemAggregator problemAggregator,
|
DecodingProblemAggregator problemAggregator,
|
||||||
boolean pollSafepoints) {
|
boolean pollSafepoints) {
|
||||||
bufferedInputStream = stream;
|
inputStream = stream;
|
||||||
this.charset = charset;
|
this.charset = charset;
|
||||||
this.decoder =
|
this.decoder =
|
||||||
charset
|
charset
|
||||||
@ -55,6 +55,23 @@ public class ReportingStreamDecoder extends Reader {
|
|||||||
this.pollSafepoints = pollSafepoints;
|
this.pollSafepoints = pollSafepoints;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Decodes the entire input stream into a String. */
|
||||||
|
public String readAllIntoMemory() throws IOException {
|
||||||
|
int initialCapacity = Math.max((int) (inputStream.available() * averageCharsPerByte()), 16);
|
||||||
|
CharBuffer out = CharBuffer.allocate(initialCapacity);
|
||||||
|
int n;
|
||||||
|
do {
|
||||||
|
if (!out.hasRemaining()) {
|
||||||
|
out = Encoding_Utils.resize(out, CharBuffer::allocate, CharBuffer::put);
|
||||||
|
}
|
||||||
|
// read is already polling safepoints so we don't have to
|
||||||
|
n = this.read(out);
|
||||||
|
} while (n >= 0);
|
||||||
|
|
||||||
|
out.flip();
|
||||||
|
return out.toString();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Currently there is no easy way to check if a Context is available in the current thread and we
|
* Currently there is no easy way to check if a Context is available in the current thread and we
|
||||||
* can use safepoints or not. The issue tracking this feature can be found at: <a
|
* can use safepoints or not. The issue tracking this feature can be found at: <a
|
||||||
@ -245,7 +262,7 @@ public class ReportingStreamDecoder extends Reader {
|
|||||||
int bytesToRead = Math.max(expectedInputSize - bufferedInput, 1);
|
int bytesToRead = Math.max(expectedInputSize - bufferedInput, 1);
|
||||||
|
|
||||||
ensureWorkArraySize(bytesToRead);
|
ensureWorkArraySize(bytesToRead);
|
||||||
int bytesActuallyRead = bufferedInputStream.read(workArray, 0, bytesToRead);
|
int bytesActuallyRead = inputStream.read(workArray, 0, bytesToRead);
|
||||||
if (bytesActuallyRead == -1) {
|
if (bytesActuallyRead == -1) {
|
||||||
eof = true;
|
eof = true;
|
||||||
}
|
}
|
||||||
@ -383,6 +400,9 @@ public class ReportingStreamDecoder extends Reader {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws IOException {
|
public void close() throws IOException {
|
||||||
bufferedInputStream.close();
|
inputStream.close();
|
||||||
|
inputBuffer = null;
|
||||||
|
outputBuffer = null;
|
||||||
|
workArray = null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,25 @@
|
|||||||
|
package org.enso.base_test_helpers;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
/** A generic stream used for tests. */
|
||||||
|
public class RangeStream extends InputStream {
|
||||||
|
private final int end;
|
||||||
|
private int current;
|
||||||
|
|
||||||
|
public RangeStream(int start, int end) {
|
||||||
|
assert 0 <= start && start <= end;
|
||||||
|
this.end = end;
|
||||||
|
this.current = start;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int read() throws IOException {
|
||||||
|
if (current >= end) {
|
||||||
|
return -1;
|
||||||
|
} else {
|
||||||
|
return current++ % 256;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -69,7 +69,6 @@ add_specs suite_builder =
|
|||||||
Problems.get_attached_warnings default_warning . should_contain_the_same_elements_as problems
|
Problems.get_attached_warnings default_warning . should_contain_the_same_elements_as problems
|
||||||
|
|
||||||
suite_builder.group "Default Encoding" group_builder->
|
suite_builder.group "Default Encoding" group_builder->
|
||||||
pending_fallback = "Fallback to Windows-1252 is not implemented yet."
|
|
||||||
group_builder.specify "should try reading as UTF-8 by default" <|
|
group_builder.specify "should try reading as UTF-8 by default" <|
|
||||||
bytes = [65, -60, -123, -60, -103]
|
bytes = [65, -60, -123, -60, -103]
|
||||||
# A ą ę
|
# A ą ę
|
||||||
@ -113,7 +112,7 @@ add_specs suite_builder =
|
|||||||
bytes.length . should_equal 4*2
|
bytes.length . should_equal 4*2
|
||||||
txt.length . should_equal 3
|
txt.length . should_equal 3
|
||||||
|
|
||||||
group_builder.specify "falls back to Windows-1252 if invalid Unicode is detected and no BOM" pending=pending_fallback <|
|
group_builder.specify "falls back to Windows-1252 if invalid Unicode is detected and no BOM" <|
|
||||||
# These are not valid bytes for UTF-8
|
# These are not valid bytes for UTF-8
|
||||||
bytes = [-30, -55, -1]
|
bytes = [-30, -55, -1]
|
||||||
Text.from_bytes bytes Encoding.utf_8 . should_fail_with Encoding_Error
|
Text.from_bytes bytes Encoding.utf_8 . should_fail_with Encoding_Error
|
||||||
@ -148,7 +147,6 @@ add_specs suite_builder =
|
|||||||
empty.should_equal ""
|
empty.should_equal ""
|
||||||
Problems.assume_no_problems empty
|
Problems.assume_no_problems empty
|
||||||
|
|
||||||
group_builder.specify "should work on 0 or 1 byte input (TODO merge with above)" pending=pending_fallback <|
|
|
||||||
txt = Text.from_bytes [-1] Encoding.default Problem_Behavior.Report_Warning
|
txt = Text.from_bytes [-1] Encoding.default Problem_Behavior.Report_Warning
|
||||||
txt.should_equal 'ÿ'
|
txt.should_equal 'ÿ'
|
||||||
# No problems, as falling back to Windows-1252.
|
# No problems, as falling back to Windows-1252.
|
||||||
@ -166,8 +164,6 @@ add_specs suite_builder =
|
|||||||
([-1, -2] + [65, 0, 5, 1, 25, 1]).write_bytes f Existing_File_Behavior.Overwrite . should_succeed
|
([-1, -2] + [65, 0, 5, 1, 25, 1]).write_bytes f Existing_File_Behavior.Overwrite . should_succeed
|
||||||
f.read . should_equal "Aąę"
|
f.read . should_equal "Aąę"
|
||||||
|
|
||||||
group_builder.specify "Default Encoding heuristics also work in File.read (TODO merge with above)" pending=pending_fallback <|
|
|
||||||
f = File.create_temporary_file "utf-heuristics" ".txt"
|
|
||||||
# Fallback to Windows-1252
|
# Fallback to Windows-1252
|
||||||
([-30, -55, -1]).write_bytes f Existing_File_Behavior.Overwrite . should_succeed
|
([-30, -55, -1]).write_bytes f Existing_File_Behavior.Overwrite . should_succeed
|
||||||
f.read . should_equal "âÉÿ"
|
f.read . should_equal "âÉÿ"
|
||||||
|
63
test/Base_Tests/src/System/Input_Stream_Spec.enso
Normal file
63
test/Base_Tests/src/System/Input_Stream_Spec.enso
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
from Standard.Base import all
|
||||||
|
import Standard.Base.Errors.Illegal_State.Illegal_State
|
||||||
|
import Standard.Base.Runtime.Managed_Resource.Managed_Resource
|
||||||
|
import Standard.Base.System.Input_Stream.Input_Stream
|
||||||
|
|
||||||
|
from Standard.Test import all
|
||||||
|
|
||||||
|
polyglot java import org.enso.base_test_helpers.RangeStream
|
||||||
|
|
||||||
|
main filter=Nothing =
|
||||||
|
suite = Test.build suite_builder->
|
||||||
|
add_specs suite_builder
|
||||||
|
suite.run_with_filter filter
|
||||||
|
|
||||||
|
add_specs suite_builder = suite_builder.group "Input Stream" group_builder->
|
||||||
|
group_builder.specify "should be peekable if backed by memory" <|
|
||||||
|
Managed_Resource.bracket (Input_Stream.from_bytes [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) (.close) stream->
|
||||||
|
stream.is_peekable . should_be_true
|
||||||
|
|
||||||
|
stream.peek_bytes 3 . should_equal [1, 2, 3]
|
||||||
|
stream.peek_bytes 3 . should_equal [1, 2, 3]
|
||||||
|
# After the peek operation, read still starts from beginning
|
||||||
|
stream.read_n_bytes 3 . should_equal [1, 2, 3]
|
||||||
|
|
||||||
|
# Further peek after a read, starts from where the next read would start
|
||||||
|
stream.peek_bytes 3 . should_equal [4, 5, 6]
|
||||||
|
stream.read_n_bytes 3 . should_equal [4, 5, 6]
|
||||||
|
stream.read_n_bytes 5 . should_equal [7, 8, 9, 10]
|
||||||
|
|
||||||
|
group_builder.specify "should allow to skip bytes" <|
|
||||||
|
Managed_Resource.bracket (Input_Stream.from_bytes [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) (.close) stream->
|
||||||
|
stream.skip_n_bytes 3
|
||||||
|
stream.read_n_bytes 1 . should_equal [4]
|
||||||
|
stream.peek_bytes 3 . should_equal [5, 6, 7]
|
||||||
|
stream.skip_n_bytes 3
|
||||||
|
stream.read_n_bytes 4 . should_equal [8, 9, 10]
|
||||||
|
|
||||||
|
group_builder.specify "should not be peekable if generic stream is provided" <|
|
||||||
|
error_handler x = x
|
||||||
|
generic_stream = Input_Stream.new (RangeStream.new 100 120) error_handler
|
||||||
|
generic_stream.is_peekable . should_be_false
|
||||||
|
generic_stream.peek_bytes 3 . should_fail_with Illegal_State
|
||||||
|
|
||||||
|
group_builder.specify "should be possible to make peekable" <|
|
||||||
|
error_handler x = x
|
||||||
|
generic_stream = Input_Stream.new (RangeStream.new 100 120) error_handler
|
||||||
|
generic_stream.read_n_bytes 4 . should_equal [100, 101, 102, 103]
|
||||||
|
|
||||||
|
promoted_stream = generic_stream.as_peekable_stream
|
||||||
|
# The new stream starts at the same position as the old one was left
|
||||||
|
promoted_stream.peek_bytes 4 . should_equal [104, 105, 106, 107]
|
||||||
|
promoted_stream.read_n_bytes 4 . should_equal [104, 105, 106, 107]
|
||||||
|
promoted_stream.peek_bytes 2 . should_equal [108, 109]
|
||||||
|
promoted_stream.peek_bytes 2 . should_equal [108, 109]
|
||||||
|
|
||||||
|
group_builder.specify "should allow to peek beyond EOF, still correctly restarting afterwards" <|
|
||||||
|
error_handler x = x
|
||||||
|
generic_stream = Input_Stream.new (RangeStream.new 100 105) error_handler
|
||||||
|
promoted_stream = generic_stream.as_peekable_stream
|
||||||
|
promoted_stream.peek_bytes 10 . should_equal [100, 101, 102, 103, 104]
|
||||||
|
promoted_stream.peek_bytes 10 . should_equal [100, 101, 102, 103, 104]
|
||||||
|
# The read still succeeds - ensuring there isn't some early EOF
|
||||||
|
promoted_stream.read_n_bytes 10 . should_equal [100, 101, 102, 103, 104]
|
@ -0,0 +1,87 @@
|
|||||||
|
from Standard.Base import all
|
||||||
|
import Standard.Base.Errors.Illegal_State.Illegal_State
|
||||||
|
import Standard.Base.Runtime.Managed_Resource.Managed_Resource
|
||||||
|
import Standard.Base.System.File.Advanced.Temporary_File.Temporary_File
|
||||||
|
import Standard.Base.System.Input_Stream.Input_Stream
|
||||||
|
|
||||||
|
from Standard.Test import all
|
||||||
|
|
||||||
|
polyglot java import org.enso.base_test_helpers.RangeStream
|
||||||
|
|
||||||
|
main filter=Nothing =
|
||||||
|
suite = Test.build suite_builder->
|
||||||
|
add_specs suite_builder
|
||||||
|
suite.run_with_filter filter
|
||||||
|
|
||||||
|
add_specs suite_builder = suite_builder.group "Restartable Input Stream" group_builder->
|
||||||
|
group_builder.specify "should allow to restart a generic stream by caching its data" <|
|
||||||
|
error_handler x = x
|
||||||
|
generic_stream = Input_Stream.new (RangeStream.new 100 120) error_handler
|
||||||
|
generic_stream.read_n_bytes 3 . should_equal [100, 101, 102]
|
||||||
|
# A generic stream is frozen in its current state - the first few bytes that were already read will be lost
|
||||||
|
restartable_stream = generic_stream.as_restartable_stream
|
||||||
|
restartable_stream.with_fresh_stream stream->
|
||||||
|
stream.read_n_bytes 4 . should_equal [103, 104, 105, 106]
|
||||||
|
restartable_stream.with_fresh_stream stream->
|
||||||
|
stream.read_n_bytes 4 . should_equal [103, 104, 105, 106]
|
||||||
|
|
||||||
|
# Small stream is backed by memory:
|
||||||
|
restartable_stream.to_text . should_contain "From_Bytes"
|
||||||
|
|
||||||
|
group_builder.specify "will fall back to caching in a temporary file for a very large stream" <|
|
||||||
|
error_handler x = x
|
||||||
|
# 1MB
|
||||||
|
generic_stream = Input_Stream.new (RangeStream.new 1 1000000) error_handler
|
||||||
|
restartable_stream = generic_stream.as_restartable_stream
|
||||||
|
restartable_stream.with_fresh_stream stream->
|
||||||
|
stream.read_n_bytes 4 . should_equal [1, 2, 3, 4]
|
||||||
|
restartable_stream.to_text . should_contain "From_Temporary_File"
|
||||||
|
|
||||||
|
group_builder.specify "will re-use the original byte array if it was known" <|
|
||||||
|
byte_stream = Input_Stream.from_bytes [1, 2, 3, 4, 5]
|
||||||
|
byte_stream.read_n_bytes 3 . should_equal [1, 2, 3]
|
||||||
|
restartable_stream = byte_stream.as_restartable_stream
|
||||||
|
restartable_stream.with_fresh_stream stream->
|
||||||
|
# All bytes are preserved
|
||||||
|
stream.read_n_bytes 4 . should_equal [1, 2, 3, 4]
|
||||||
|
|
||||||
|
restartable_stream.to_text . should_contain "From_Bytes"
|
||||||
|
restartable_stream.with_fresh_stream stream->
|
||||||
|
stream.read_n_bytes 4 . should_equal [1, 2, 3, 4]
|
||||||
|
|
||||||
|
group_builder.specify "will be backed by a file, if a file stream is converted and extend_lifetime=False" <|
|
||||||
|
file = File.create_temporary_file "test" ".bin"
|
||||||
|
[10, 20, 30, 40, 50].write_bytes file . should_succeed
|
||||||
|
|
||||||
|
restartable_stream = file.with_input_stream [File_Access.Read] file_stream->
|
||||||
|
file_stream.as_restartable_stream extend_lifetime=False
|
||||||
|
|
||||||
|
restartable_stream.with_fresh_stream stream->
|
||||||
|
stream.read_n_bytes 3 . should_equal [10, 20, 30]
|
||||||
|
|
||||||
|
restartable_stream.to_text . should_contain "From_Existing_File"
|
||||||
|
|
||||||
|
restartable_stream.with_fresh_stream stream->
|
||||||
|
stream.read_n_bytes 3 . should_equal [10, 20, 30]
|
||||||
|
|
||||||
|
# However, if backed by existing file - the stream is prone to file modifications that happen in the meantime
|
||||||
|
[11, 12, 13, 14].write_bytes file on_existing_file=Existing_File_Behavior.Overwrite . should_succeed
|
||||||
|
restartable_stream.with_fresh_stream stream->
|
||||||
|
stream.read_n_bytes 3 . should_equal [11, 12, 13]
|
||||||
|
|
||||||
|
group_builder.specify "will not be tied to the original backing file if extend_lifetime=True (default)" <|
|
||||||
|
file = File.create_temporary_file "test" ".bin"
|
||||||
|
[10, 20, 30, 40, 50].write_bytes file . should_succeed
|
||||||
|
|
||||||
|
restartable_stream = file.with_input_stream [File_Access.Read] file_stream->
|
||||||
|
file_stream.as_restartable_stream
|
||||||
|
|
||||||
|
restartable_stream.with_fresh_stream stream->
|
||||||
|
stream.read_n_bytes 3 . should_equal [10, 20, 30]
|
||||||
|
|
||||||
|
# Modify backing file
|
||||||
|
[11, 12, 13, 14].write_bytes file on_existing_file=Existing_File_Behavior.Overwrite . should_succeed
|
||||||
|
|
||||||
|
# The stream still yields old values because it was untied:
|
||||||
|
restartable_stream.with_fresh_stream stream->
|
||||||
|
stream.read_n_bytes 3 . should_equal [10, 20, 30]
|
@ -92,37 +92,47 @@ add_specs suite_builder =
|
|||||||
tmp.to_text . should_not_contain "pref"
|
tmp.to_text . should_not_contain "pref"
|
||||||
tmp.to_text . should_not_contain "suf"
|
tmp.to_text . should_not_contain "suf"
|
||||||
|
|
||||||
group_builder.specify "should allow to materialize an input stream that is already associated with a temporary file without copying it" <|
|
group_builder.specify "should allow to materialize an input stream that is already associated with a temporary file without copying it (low-level)" <|
|
||||||
tmp = Temporary_File.new
|
tmp = Temporary_File.new
|
||||||
tmp.with_file f->
|
tmp.with_file f->
|
||||||
"test payload 3" . write f
|
"test payload 3" . write f
|
||||||
|
|
||||||
java_file = Java_File.new tmp.unsafe_get.absolute.path
|
java_file = Java_File.new tmp.unsafe_get.absolute.path
|
||||||
stream = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_file=tmp
|
stream = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_source=tmp
|
||||||
|
|
||||||
tmp2 = Temporary_File.from_stream_light stream
|
tmp2 = Temporary_File.from_stream_light stream
|
||||||
# The returned tmp file should be the same one as original.
|
# The returned tmp file should be the same one as original.
|
||||||
tmp2.should_be_a Temporary_File
|
tmp2.should_be_a Temporary_File
|
||||||
tmp2.unsafe_get.absolute.path . should_equal tmp.unsafe_get.absolute.path
|
tmp2.should_equal tmp
|
||||||
|
|
||||||
# If the raw file is associated, the stream will return that File descriptor (not as temporary file, but regular one):
|
# If the raw file is associated, the stream will return that File descriptor (not as temporary file, but regular one):
|
||||||
stream3 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_file=tmp.unsafe_get
|
stream3 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_source=tmp.unsafe_get
|
||||||
f3 = Temporary_File.from_stream_light stream3
|
f3 = Temporary_File.from_stream_light stream3
|
||||||
f3.should_be_a File
|
f3.should_be_a File
|
||||||
f3.absolute.path . should_equal tmp.unsafe_get.absolute.path
|
f3.absolute.path . should_equal tmp.unsafe_get.absolute.path
|
||||||
|
|
||||||
# But if there's no association, a new temporary file gets created:
|
# But if there's no association, a new temporary file gets created:
|
||||||
stream4 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_file=Nothing
|
stream4 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_source=Nothing
|
||||||
tmp4 = Temporary_File.from_stream_light stream4
|
tmp4 = Temporary_File.from_stream_light stream4
|
||||||
tmp4.should_be_a Temporary_File
|
tmp4.should_be_a Temporary_File
|
||||||
tmp4.unsafe_get.absolute.path . should_not_equal tmp.unsafe_get.absolute.path
|
tmp4.unsafe_get.absolute.path . should_not_equal tmp.unsafe_get.absolute.path
|
||||||
|
|
||||||
# The base variant of from_stream also always copies:
|
# The base variant of from_stream also always copies:
|
||||||
stream5 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_file=tmp
|
stream5 = Input_Stream.new (FileInputStream.new java_file) (File_Error.handle_java_exceptions tmp.unsafe_get) associated_source=tmp
|
||||||
tmp5 = Temporary_File.from_stream stream5
|
tmp5 = Temporary_File.from_stream stream5
|
||||||
tmp5.should_be_a Temporary_File
|
tmp5.should_be_a Temporary_File
|
||||||
tmp5.unsafe_get.absolute.path . should_not_equal tmp.unsafe_get.absolute.path
|
tmp5.unsafe_get.absolute.path . should_not_equal tmp.unsafe_get.absolute.path
|
||||||
|
|
||||||
|
group_builder.specify "should allow to materialize an input stream opened to a temporary file without copying it" pending="Currently the file stream does not know it came from Temporary_File. May be revisited in the future to automatically correlate it." <|
|
||||||
|
tmp = Temporary_File.new
|
||||||
|
tmp.with_file f->
|
||||||
|
"test payload 4" . write f
|
||||||
|
|
||||||
|
tmp2 = tmp.with_file f-> f.with_input_stream [File_Access.Read] input_stream->
|
||||||
|
Temporary_File.from_stream_light input_stream
|
||||||
|
# The returned file should be the same as original
|
||||||
|
tmp2.should_equal tmp
|
||||||
|
|
||||||
make_stream text =
|
make_stream text =
|
||||||
raw_stream = ByteArrayInputStream.new text.utf_8
|
raw_stream = ByteArrayInputStream.new text.utf_8
|
||||||
Input_Stream.new raw_stream (File_Error.handle_java_exceptions Nothing)
|
Input_Stream.new raw_stream (File_Error.handle_java_exceptions Nothing)
|
||||||
|
@ -173,7 +173,8 @@ add_specs suite_builder =
|
|||||||
utf8_bytes = [97, 44, 98, 44, 99, 10, -60, -123, 44, -17, -65, -65, 44, -61, 40, -61, 40, 10]
|
utf8_bytes = [97, 44, 98, 44, 99, 10, -60, -123, 44, -17, -65, -65, 44, -61, 40, -61, 40, 10]
|
||||||
utf8_bytes.write_bytes utf8_file
|
utf8_bytes.write_bytes utf8_file
|
||||||
action_1 on_problems =
|
action_1 on_problems =
|
||||||
utf8_file.read (Delimited_Format.Delimited "," headers=True) on_problems
|
# We need to set the encoding explicitly, as otherwise we'd just fallback to Windows-1252 and have no errors
|
||||||
|
utf8_file.read (Delimited_Format.Delimited "," headers=True encoding=Encoding.utf_8) on_problems
|
||||||
tester_1 table =
|
tester_1 table =
|
||||||
table.columns.map .name . should_equal ['a', 'b', 'c']
|
table.columns.map .name . should_equal ['a', 'b', 'c']
|
||||||
table.at 'a' . to_vector . should_equal ['ą']
|
table.at 'a' . to_vector . should_equal ['ą']
|
||||||
@ -522,6 +523,18 @@ add_specs suite_builder =
|
|||||||
r.first_column.to_vector . should_equal ['\uFFFD']
|
r.first_column.to_vector . should_equal ['\uFFFD']
|
||||||
Problems.expect_only_warning Encoding_Error r
|
Problems.expect_only_warning Encoding_Error r
|
||||||
|
|
||||||
|
group_builder.specify "should fall back to Windows-1252 encoding if invalid UTF-8 characters are encountered in Default encoding" <|
|
||||||
|
f = File.create_temporary_file "delimited-invalid-utf-8" ".csv"
|
||||||
|
# On the simple characters all three encodings (ASCII, UTF-8 and Win-1252) agree, so we can use ASCII bytes.
|
||||||
|
bytes = ('A,B\n1,y'.bytes Encoding.ascii) + [-1] + ('z\n2,-'.bytes Encoding.ascii)
|
||||||
|
bytes.write_bytes f . should_succeed
|
||||||
|
r = f.read
|
||||||
|
r.should_be_a Table
|
||||||
|
r.column_names.should_equal ["A", "B"]
|
||||||
|
r.at "A" . to_vector . should_equal [1, 2]
|
||||||
|
# We fallback to Win-1252 where byte -1 means ÿ
|
||||||
|
r.at "B" . to_vector . should_equal ["yÿz", "-"]
|
||||||
|
|
||||||
main filter=Nothing =
|
main filter=Nothing =
|
||||||
suite = Test.build suite_builder->
|
suite = Test.build suite_builder->
|
||||||
add_specs suite_builder
|
add_specs suite_builder
|
||||||
|
@ -577,6 +577,22 @@ add_specs suite_builder =
|
|||||||
t.at "A" . to_vector . should_equal [1, 2, 3, 4]
|
t.at "A" . to_vector . should_equal [1, 2, 3, 4]
|
||||||
t.at "B" . to_vector . should_equal ["ąęćś", "💯", "żółw", "🐢"]
|
t.at "B" . to_vector . should_equal ["ąęćś", "💯", "żółw", "🐢"]
|
||||||
|
|
||||||
|
Test.with_clue "Windows-1252 fallback: " <|
|
||||||
|
initial_bytes = 'A,B\n1,a\n2,b\n3,¥\n4,d'.bytes Encoding.windows_1252
|
||||||
|
initial_bytes.write_bytes f on_existing_file=Existing_File_Behavior.Overwrite . should_succeed
|
||||||
|
|
||||||
|
r = (Table.new [["A", [5, 6]], ["B", ["æ", "💯"]]]).write f on_existing_file=Existing_File_Behavior.Append
|
||||||
|
# Should report problem writing the 💯 character, because selected Windows-1252 encoding does not support it
|
||||||
|
Problems.expect_only_warning Encoding_Error r
|
||||||
|
|
||||||
|
# Now we read in auto mode, the part appended to the table should have been correctly written in Windows-1252
|
||||||
|
t = f.read
|
||||||
|
t.should_be_a Table
|
||||||
|
t.column_names . should_equal ["A", "B"]
|
||||||
|
t.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6]
|
||||||
|
# The æ should have been correctly written in Win-1252, the 💯 is replaced by `?`.
|
||||||
|
t.at "B" . to_vector . should_equal ["a", "b", "¥", "d", "æ", "?"]
|
||||||
|
|
||||||
group_builder.specify "should fail if the target file is read-only" <|
|
group_builder.specify "should fail if the target file is read-only" <|
|
||||||
f = enso_project.data / "transient" / "permission.csv"
|
f = enso_project.data / "transient" / "permission.csv"
|
||||||
f.delete_if_exists
|
f.delete_if_exists
|
||||||
@ -635,4 +651,3 @@ main filter=Nothing =
|
|||||||
suite = Test.build suite_builder->
|
suite = Test.build suite_builder->
|
||||||
add_specs suite_builder
|
add_specs suite_builder
|
||||||
suite.run_with_filter filter
|
suite.run_with_filter filter
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user