Implement Table from Text conversion (#3478)

Implements https://www.pivotaltracker.com/story/show/181824168
This commit is contained in:
Radosław Waśko 2022-05-26 14:04:25 +02:00 committed by GitHub
parent a81d3550f1
commit 8828d801ea
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 71 additions and 22 deletions

View File

@ -128,6 +128,8 @@
- [Integrated value parsing with the `Delimited` file reader.][3463]
- [Implemented the `Infer` setting for headers in the `Delimited` file format
and made it the default.][3472]
- [Implemented a `Table.from Text` conversion that parses strings representing
`Delimited` files without storing them on the filesystem.][3478]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -199,6 +201,7 @@
[3462]: https://github.com/enso-org/enso/pull/3462
[3463]: https://github.com/enso-org/enso/pull/3463
[3472]: https://github.com/enso-org/enso/pull/3472
[3478]: https://github.com/enso-org/enso/pull/3478
#### Enso Compiler

View File

@ -7,9 +7,11 @@ import Standard.Visualization
from Standard.Base.Data.Time.Date as Date_Module import Date
import Standard.Table.Io.Spreadsheet_Write_Mode
import Standard.Table.Io.Format
import Standard.Table.Io.File_Format
import Standard.Table.Internal.Table_Helpers
import Standard.Table.Internal.Aggregate_Column_Helper
import Standard.Table.Internal.Parse_Values_Helper
import Standard.Table.Internal.Delimited_Reader
from Standard.Table.Data.Order_Rule as Order_Rule_Module import Order_Rule
from Standard.Table.Data.Column_Selector as Column_Selector_Module import Column_Selector, By_Index
@ -28,6 +30,7 @@ polyglot java import org.enso.table.data.table.Column as Java_Column
polyglot java import org.enso.table.operations.OrderBuilder
polyglot java import org.enso.table.format.csv.Writer as Csv_Writer
polyglot java import org.enso.table.format.xlsx.Writer as Spreadsheet_Writer
polyglot java import java.io.StringReader
## Creates a new table from a vector of `[name, items]` pairs.
@ -1523,3 +1526,7 @@ print_table header rows indices_count format_term =
- y: The right operand to the comparator.
comparator_to_java : (Any -> Any -> Ordering) -> Any -> Any -> Integer
comparator_to_java cmp x y = cmp x y . to_sign
## Converts a `Text` value into a table by parsing it as the contents of a
   delimited file, without storing it on the filesystem.

   Arguments:
   - that: The text to parse.
   - format: The format describing how the text should be parsed. Defaults to
     a tab-delimited format.
   - on_problems: Specifies how problems encountered during parsing are
     handled. Defaults to `Report_Warning`.
Table.from (that : Text) (format:File_Format.Delimited|File_Format.Fixed_Width = File_Format.Delimited '\t') (on_problems:Problem_Behavior=Report_Warning) =
    # The text is already decoded, so it is exposed to the reader through an
    # in-memory Java `StringReader`.
    # NOTE(review): the signature also admits `File_Format.Fixed_Width`, but the
    # value is always handed to `Delimited_Reader` - confirm that a fixed-width
    # format is handled (or rejected) there.
    Delimited_Reader.read_from_reader format (StringReader.new that) on_problems

View File

@ -18,6 +18,7 @@ polyglot java import java.io.IOException
polyglot java import com.univocity.parsers.common.TextParsingException
polyglot java import org.enso.base.Encoding_Utils
polyglot java import java.io.InputStream
polyglot java import java.io.Reader
polyglot java import org.enso.table.parsing.IdentityParser
polyglot java import org.enso.table.parsing.TypeInferringParser
@ -45,9 +46,6 @@ read_file format file on_problems =
## PRIVATE
Reads an input stream according to the provided format.
The `encoding` parameter is ignored, instead the provided stream should
handle any necessary decoding.
Arguments:
- format: The specification of the delimited file format.
- java_stream: A Java `InputStream` used as the data source.
@ -60,8 +58,36 @@ read_file format file on_problems =
integer.
- related_file: The file related to the provided `java_stream`, if available,
or `Nothing`. It is used for more detailed error reporting.
read_stream : Delimited -> InputStream -> Problem_Behavior -> File | Nothing -> Any
read_stream : Delimited -> InputStream -> Problem_Behavior -> Integer -> File | Nothing -> Any
read_stream format java_stream on_problems max_columns=4096 related_file=Nothing =
# Runs the suspended `action`; a Java `IOException` panic raised by it is
# turned into a dataflow error built by `File.wrap_io_exception` for
# `related_file`.
handle_io_exception ~action = Panic.catch IOException action caught_panic->
Error.throw (File.wrap_io_exception related_file caught_panic.payload.cause)
# The charset used for decoding comes from the `encoding` of the format.
java_charset = format.encoding.to_java_charset
handle_io_exception <|
# The stream is wrapped in a decoder, which is then passed on to
# `read_from_reader` as the source of characters.
Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder->
result = here.read_from_reader format reporting_stream_decoder on_problems max_columns
# Problems reported by the decoder are wrapped in `Encoding_Error` and attached
# to the result through `on_problems`.
decoding_problems = Vector.Vector reporting_stream_decoder.getReportedProblems . map Encoding_Error
on_problems.attach_problems_after result decoding_problems
## PRIVATE
Reads data from the provided `Reader` according to the provided format.
The `encoding` setting of the format is ignored; instead, the provided reader
should handle any necessary decoding.
Arguments:
- format: The specification of the delimited file format.
- java_reader: A Java `Reader` used as the source of decoded characters.
- on_problems: Specifies the behavior when a problem occurs during the
operation. By default, a warning is issued, but the operation proceeds.
If set to `Report_Error`, the operation fails with a dataflow error.
If set to `Ignore`, the operation proceeds without errors or warnings.
- max_columns: Specifies the limit of columns to read. The limit is set to
avoid `OutOfMemory` errors on malformed files. It must be a positive
integer.
read_from_reader : Delimited -> Reader -> Problem_Behavior -> Integer -> Any
read_from_reader format java_reader on_problems max_columns=4096 =
java_headers = case format.headers of
True -> DelimitedReader.HeaderBehavior.USE_FIRST_ROW_AS_HEADERS
Infer -> DelimitedReader.HeaderBehavior.INFER
@ -91,25 +117,21 @@ read_stream format java_stream on_problems max_columns=4096 related_file=Nothing
translate_parsing_exception caught_panic =
cause = caught_panic.payload.cause.getCause
if Java.is_instance cause IOException then File.wrap_io_exception related_file cause else
if Java.is_instance cause IOException then Panic.throw cause else
Error.throw (Parser_Error caught_panic.payload)
handle_parsing_exception = Panic.catch TextParsingException handler=translate_parsing_exception
java_charset = format.encoding.to_java_charset
handle_illegal_arguments <| handle_parsing_failure <| handle_parsing_exception <|
Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder->
warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error
base_parser = if format.quote.is_nothing then IdentityParser.new else
QuoteStrippingParser.new format.quote
value_parser = if format.value_formatter.is_nothing then base_parser else
wrapped = format.value_formatter.wrap_base_parser base_parser
TypeInferringParser.new format.value_formatter.get_specific_type_parsers.to_array wrapped
cell_type_guesser = if format.headers != Infer then Nothing else
formatter = format.value_formatter.if_nothing Data_Formatter
TypeInferringParser.new formatter.get_specific_type_parsers.to_array IdentityParser.new
reader = DelimitedReader.new reporting_stream_decoder format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows warnings_as_errors
result = Table.Table reader.read
decoding_problems = Vector.Vector reporting_stream_decoder.getReportedProblems . map Encoding_Error
parsing_problems = Vector.Vector reader.getReportedProblems . map translate_parsing_problem
problems = decoding_problems + parsing_problems
on_problems.attach_problems_after result problems
warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error
base_parser = if format.quote.is_nothing then IdentityParser.new else
QuoteStrippingParser.new format.quote
value_parser = if format.value_formatter.is_nothing then base_parser else
wrapped = format.value_formatter.wrap_base_parser base_parser
TypeInferringParser.new format.value_formatter.get_specific_type_parsers.to_array wrapped
cell_type_guesser = if format.headers != Infer then Nothing else
formatter = format.value_formatter.if_nothing Data_Formatter
TypeInferringParser.new formatter.get_specific_type_parsers.to_array IdentityParser.new
reader = DelimitedReader.new java_reader format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows warnings_as_errors
result = Table.Table reader.read
parsing_problems = Vector.Vector reader.getReportedProblems . map translate_parsing_problem
on_problems.attach_problems_after result parsing_problems

View File

@ -311,4 +311,21 @@ spec =
t2.at "c" . to_vector . should_equal [3, 6]
t2.columns.map .name . should_equal ["a", "b", "c"]
# Checks that tables can be built directly from in-memory text.
# Case 1: comma-separated text parsed with an explicitly provided format. The
# first line is expected to become the column names and the remaining cells
# are expected to come back as integers.
Test.specify "should be able to parse raw text" <|
text1 = """
a,b,c
1,2,3
4,5,6
t1 = Table.Table.from text1 (format = File_Format.Delimited ",")
t1.columns.map .name . should_equal ["a", "b", "c"]
t1.at "a" . to_vector . should_equal [1, 4]
t1.at "b" . to_vector . should_equal [2, 5]
t1.at "c" . to_vector . should_equal [3, 6]
# Case 2: tab-separated text parsed without passing a format, so the default
# format of the conversion is exercised.
text2 = 'a\tb\n1\t2\n3\t4'
t2 = Table.Table.from text2
t2.columns.map .name . should_equal ["a", "b"]
t2.at "a" . to_vector . should_equal [1, 3]
t2.at "b" . to_vector . should_equal [2, 4]
# Entry point: runs the `spec` of this module through `Test.Suite.run_main`.
main = Test.Suite.run_main here.spec