From 2af970fe523a987537829c28502aef55362a940a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 8 Jun 2022 11:53:18 +0200 Subject: [PATCH] Basic changes to File_Format (#3516) Implements https://www.pivotaltracker.com/story/show/182308987 --- CHANGELOG.md | 3 + .../Base/0.0.0-dev/src/Data/Locale.enso | 6 + .../0.0.0-dev/src/Data/Dialect/Postgres.enso | 2 +- .../0.0.0-dev/src/Data/Dialect/Sqlite.enso | 2 +- .../Standard/Table/0.0.0-dev/src/Error.enso | 7 ++ .../src/Internal/Delimited_Reader.enso | 6 +- .../Table/0.0.0-dev/src/Io/Excel.enso | 4 +- .../Table/0.0.0-dev/src/Io/File_Format.enso | 57 +++++++-- .../Table/0.0.0-dev/src/Io/Quote_Style.enso | 9 ++ .../org/enso/table/format/xlsx/Reader.java | 16 +-- test/Table_Tests/src/Delimited_Read_Spec.enso | 111 +++++++++++------- test/Table_Tests/src/File_Read_Spec.enso | 22 ++-- test/Tests/src/Data/Locale_Spec.enso | 5 + 13 files changed, 169 insertions(+), 81 deletions(-) create mode 100644 distribution/lib/Standard/Table/0.0.0-dev/src/Io/Quote_Style.enso diff --git a/CHANGELOG.md b/CHANGELOG.md index be5b4cd2a0..7f0683c0be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -134,6 +134,8 @@ - [Added rank data, correlation and covariance statistics for `Vector`][3484] - [Implemented `Table.order_by` for the SQLite backend.][3502] - [Implemented `Table.order_by` for the PostgreSQL backend.][3514] +- [Renamed `File_Format.Text` to `Plain_Text`, updated `File_Format.Delimited` + API and added builders for customizing less common settings.][3516] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -210,6 +212,7 @@ [3484]: https://github.com/enso-org/enso/pull/3484 [3502]: https://github.com/enso-org/enso/pull/3502 [3514]: https://github.com/enso-org/enso/pull/3514 +[3516]: https://github.com/enso-org/enso/pull/3516 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Locale.enso 
b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Locale.enso index 3af68115ca..970021292a 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Locale.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Locale.enso @@ -414,6 +414,12 @@ type Locale if this.variant.is_nothing.not then b.append ["variant", this.variant] Json.from_pairs b.to_vector + ## Compares two locales for equality. + == : Any -> Boolean + == other = case other of + Locale other_java_locale -> this.java_locale.equals other_java_locale + _ -> False + ## PRIVATE Convert a java locale to an Enso locale. diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Dialect/Postgres.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Dialect/Postgres.enso index 420f1798c4..f261475347 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Dialect/Postgres.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Dialect/Postgres.enso @@ -235,7 +235,7 @@ make_order_descriptor internal_column sort_direction text_ordering = IR.Order_Descriptor internal_column.expression sort_direction nulls_order=nulls collation=Nothing True -> IR.Order_Descriptor internal_column.expression sort_direction nulls_order=nulls collation="ucs_basic" - Case_Insensitive locale -> case Locale.default.java_locale.equals locale.java_locale of + Case_Insensitive locale -> case locale == Locale.default of False -> Error.throw (Unsupported_Database_Operation_Error "Case insensitive ordering with custom locale is currently not supported. 
You may need to materialize the Table to perform this operation.") True -> diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Dialect/Sqlite.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Dialect/Sqlite.enso index 5316c962fe..879b51026c 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Dialect/Sqlite.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Dialect/Sqlite.enso @@ -58,7 +58,7 @@ type Sqlite_Dialect IR.Order_Descriptor internal_column.expression sort_direction collation=Nothing True -> IR.Order_Descriptor internal_column.expression sort_direction collation="BINARY" - Case_Insensitive locale -> case Locale.default.java_locale.equals locale.java_locale of + Case_Insensitive locale -> case locale == Locale.default of False -> Error.throw (Unsupported_Database_Operation_Error "Case insensitive ordering with custom locale is not supported by the SQLite backend. You may need to materialize the Table to perform this operation.") True -> diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso index 7af77ada1d..8235ea22fc 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso @@ -154,3 +154,10 @@ type Leading_Zeros column:Text (datatype:(Integer|Number|Date|Time|Time_Of_Day|B a parse is attempted anyway. If mixed types are requested, the column is not parsed due to ambiguity. type Duplicate_Type_Selector column:Text ambiguous:Boolean + +## Indicates that the given file type is not supported by the `Auto` format. +type Unsupported_File_Type filename + +Unsupported_File_Type.to_display_text : Text +Unsupported_File_Type.to_display_text = + "The "+this.filename+" has a type that is not supported by the Auto format." 
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso index 044c9392d5..6e8cd229af 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso @@ -94,10 +94,6 @@ read_from_reader format java_reader on_problems max_columns=4096 = True -> DelimitedReader.HeaderBehavior.USE_FIRST_ROW_AS_HEADERS Infer -> DelimitedReader.HeaderBehavior.INFER False -> DelimitedReader.HeaderBehavior.GENERATE_HEADERS - skip_rows = case format.skip_rows of - Nothing -> 0 - Integer -> format.skip_rows - _ -> Error.throw (Illegal_Argument_Error "`skip_rows` should be Integer or Nothing.") row_limit = case format.row_limit of Nothing -> -1 Integer -> format.row_limit @@ -127,7 +123,7 @@ read_from_reader format java_reader on_problems max_columns=4096 = cell_type_guesser = if format.headers != Infer then Nothing else formatter = format.value_formatter.if_nothing Data_Formatter TypeInferringParser.new formatter.get_specific_type_parsers.to_array IdentityParser.new - reader = DelimitedReader.new java_reader format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows warnings_as_errors + reader = DelimitedReader.new java_reader format.delimiter format.quote format.quote_escape java_headers format.skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows warnings_as_errors result_with_problems = reader.read parsing_problems = Vector.Vector (result_with_problems.problems) . 
map here.translate_reader_problem on_problems.attach_problems_after (Table.Table result_with_problems.value) parsing_problems diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Excel.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Excel.enso index 82dbd3af32..83b34c6ad8 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Excel.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Excel.enso @@ -20,11 +20,11 @@ type Excel_Section ## Gets the data from a specific sheet. Column names are the Excel column names. - type Sheet (sheet:(Integer|Text)) (skip_rows:(Integer|Nothing)=Nothing) (row_limit:(Integer|Nothing)=Nothing) + type Sheet (sheet:(Integer|Text)) (skip_rows:Integer=0) (row_limit:(Integer|Nothing)=Nothing) ## Gets a specific range (taking either a defined name or external style address) from the workbook. - type Range (address:(Text|Excel_Range)) (skip_rows:(Integer|Nothing)=Nothing) (row_limit:(Integer|Nothing)=Nothing) + type Range (address:(Text|Excel_Range)) (skip_rows:Integer=0) (row_limit:(Integer|Nothing)=Nothing) type Excel_Range ## Specifies a range within an Excel Workbook. 
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso index 5079c83e5e..935525b9ea 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso @@ -6,9 +6,11 @@ from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Prob from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding import Standard.Base.Runtime.Ref import Standard.Table.Internal.Delimited_Reader +from Standard.Table.Error as Table_Errors import Unsupported_File_Type from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter import Standard.Table.Io.Excel as Excel_Module +import Standard.Table.Io.Quote_Style ## This type needs to be here to allow for the usage of Standard.Table functions. Ideally, it would be an interface within Standard.Base and @@ -24,9 +26,9 @@ type Auto materialise file = extension = file.extension - output = Ref.new File_Format.Bytes - if ".txt".equals_ignore_case extension then output.put File_Format.Text - if ".log".equals_ignore_case extension then output.put File_Format.Text + output = Ref.new Nothing + if ".txt".equals_ignore_case extension then output.put File_Format.Plain_Text + if ".log".equals_ignore_case extension then output.put File_Format.Plain_Text if ".csv".equals_ignore_case extension then output.put (File_Format.Delimited ',') if ".tsv".equals_ignore_case extension then output.put (File_Format.Delimited '\t') if ".xlsx".equals_ignore_case extension then output.put File_Format.Excel @@ -34,7 +36,8 @@ type Auto if ".xls".equals_ignore_case extension then output.put File_Format.Excel if ".xlt".equals_ignore_case extension then output.put File_Format.Excel - output.get + output.get.if_nothing <| + Error.throw (Unsupported_File_Type file.name) ## Implements the `File.read` for this `File_Format` read : File -> Problem_Behavior 
-> Any @@ -52,8 +55,8 @@ type Bytes file.read_bytes ## Reads the file to a `Text` with specified encoding. -type Text - type Text (encoding:Encoding=Encoding.utf_8) +type Plain_Text + type Plain_Text (encoding:Encoding=Encoding.utf_8) ## Implements the `File.read` for this `File_Format` read : File -> Problem_Behavior -> Any @@ -72,6 +75,9 @@ type Delimited - delimiter: The delimiter character to split the file into columns. An `Illegal_Argument_Error` error is returned if this is an empty string. - encoding: The encoding to use when reading the file. + - skip_rows: The number of rows to skip from the top of the file. + - row_limit: The maximum number of rows to read from the file. This count + does not include the header row (if applicable). - quote: The quote character denotes the start and end of a quoted value. No quote character is used if set to `Nothing`. Quoted items are not split on the delimiter and can also contain newlines. Within a quoted @@ -83,27 +89,58 @@ type Delimited then escaping quotes is done by double quotes: `"ab""cd"` will yield the text `ab"cd"`. Another popular choice for `quote_escape` is the `\` character. Then `"ab\"cd"` will yield the same text. + - quote_style: The style of quoting to use when writing the file. - headers: If set to `True`, the first row is used as column names. If set to `False`, the column names are generated by adding increasing numeric suffixes to the base name `Column` (i.e. `Column_1`, `Column_2` etc.). If set to `Infer`, the process tries to infer if headers are present on the first row. If the column names are not unique, numeric suffixes will be appended to disambiguate them. - - skip_rows: The number of rows to skip from the top of the file. - - row_limit: The maximum number of rows to read from the file. This count - does not include the header row (if applicable). - value_formatter: Formatter to parse text values into numbers, dates, times, etc. If `Nothing` values are left as Text. 
- keep_invalid_rows: Specifies whether rows that contain less or more columns than expected should be kept (setting the missing columns to `Nothing` or dropping the excess columns) or dropped. - type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (headers:True|False|Infer=Infer) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True) + type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (skip_rows:Integer=0) (row_limit:Integer|Nothing=Nothing) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (quote_style:Quote_Style=Quote_Style.Necessary) (headers:True|False|Infer=Infer) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True) ## Implements the `File.read` for this `File_Format` read : File -> Problem_Behavior -> Any read file on_problems = Delimited_Reader.read_file this file on_problems + ## PRIVATE + Clone the instance with some properties overridden. + Note: This function is internal until such time as Atom cloning with modification is built into Enso. + clone : Text->Text->Quote_Style->(Boolean|Infer)->Data_Formatter->Boolean->Delimited + clone (quote=this.quote) (quote_escape=this.quote_escape) (quote_style=this.quote_style) (headers=this.headers) (value_formatter=this.value_formatter) (keep_invalid_rows=this.keep_invalid_rows) = + Delimited this.delimiter this.encoding this.skip_rows this.row_limit quote quote_escape quote_style headers value_formatter keep_invalid_rows + + ## Create a clone of this with specified `quote`, `quote_escape` and `quote_style`. + with_quotes : Text->Text->Quote_Style->Delimited + with_quotes quote quote_escape=quote quote_style=this.quote_style = + this.clone quote=quote quote_escape=quote_escape quote_style=quote_style + + ## Create a clone of this with first row treated as header. 
+ with_headers : Delimited + with_headers = this.clone headers=True + + ## Create a clone of this where the first row is treated as data, not a + header. + without_headers : Delimited + without_headers = this.clone headers=False + + ## Create a clone of this with value parsing. + + A custom `Data_Formatter` can be provided to customize parser options. + with_parsing : Data_Formatter -> Delimited + with_parsing (value_formatter=Data_Formatter) = + this.clone value_formatter=value_formatter + + ## Create a clone of this without value parsing. + without_parsing : Delimited + without_parsing = + this.clone value_formatter=Nothing + ## A setting to infer the default behaviour of some option. type Infer diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Quote_Style.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Quote_Style.enso new file mode 100644 index 0000000000..86fcb11150 --- /dev/null +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Quote_Style.enso @@ -0,0 +1,9 @@ +type Quote_Style + ## Do not quote any values even if this will result in an invalid file. + type Never + + ## Quote text values which are empty or contain the delimiter or new lines. + type Necessary + + ## Quote all text values. + type Always diff --git a/std-bits/table/src/main/java/org/enso/table/format/xlsx/Reader.java b/std-bits/table/src/main/java/org/enso/table/format/xlsx/Reader.java index 81d0cb2bff..0846b11262 100644 --- a/std-bits/table/src/main/java/org/enso/table/format/xlsx/Reader.java +++ b/std-bits/table/src/main/java/org/enso/table/format/xlsx/Reader.java @@ -352,7 +352,7 @@ public class Reader { public static Table readSheetByName( InputStream stream, String sheetName, - Integer skip_rows, + int skip_rows, Integer row_limit, boolean xls_format) throws IOException, IllegalArgumentException { @@ -367,7 +367,7 @@ public class Reader { workbook, sheetIndex, null, - skip_rows == null ? 0 : skip_rows, + skip_rows, row_limit == null ? 
Integer.MAX_VALUE : row_limit); } @@ -383,7 +383,7 @@ public class Reader { * @throws IOException when the input stream cannot be read. */ public static Table readSheetByIndex( - InputStream stream, int index, Integer skip_rows, Integer row_limit, boolean xls_format) + InputStream stream, int index, int skip_rows, Integer row_limit, boolean xls_format) throws IOException, IllegalArgumentException { Workbook workbook = getWorkbook(stream, xls_format); @@ -397,7 +397,7 @@ public class Reader { workbook, index - 1, null, - skip_rows == null ? 0 : skip_rows, + skip_rows, row_limit == null ? Integer.MAX_VALUE : row_limit); } @@ -415,7 +415,7 @@ public class Reader { public static Table readRangeByName( InputStream stream, String rangeNameOrAddress, - Integer skip_rows, + int skip_rows, Integer row_limit, boolean xls_format) throws IOException { @@ -438,7 +438,7 @@ public class Reader { * @throws IOException when the input stream cannot be read. */ public static Table readRange( - InputStream stream, Range range, Integer skip_rows, Integer row_limit, boolean xls_format) + InputStream stream, Range range, int skip_rows, Integer row_limit, boolean xls_format) throws IOException { return readRange(getWorkbook(stream, xls_format), range, skip_rows, row_limit); } @@ -448,7 +448,7 @@ public class Reader { } private static Table readRange( - Workbook workbook, Range range, Integer skip_rows, Integer row_limit) { + Workbook workbook, Range range, int skip_rows, Integer row_limit) { int sheetIndex = getSheetIndex(workbook, range.getSheetName()); if (sheetIndex == -1) { throw new IllegalArgumentException("Unknown sheet '" + range.getSheetName() + "'."); @@ -458,7 +458,7 @@ public class Reader { workbook, sheetIndex, range, - skip_rows == null ? 0 : skip_rows, + skip_rows, row_limit == null ? 
Integer.MAX_VALUE : row_limit); } } diff --git a/test/Table_Tests/src/Delimited_Read_Spec.enso b/test/Table_Tests/src/Delimited_Read_Spec.enso index 4f21b19455..e24a570ef1 100644 --- a/test/Table_Tests/src/Delimited_Read_Spec.enso +++ b/test/Table_Tests/src/Delimited_Read_Spec.enso @@ -7,8 +7,9 @@ import Standard.Table.Data.Column from Standard.Table.Error import all import Standard.Table.Io.File_Read -import Standard.Table.Io.File_Format +from Standard.Table.Io.File_Format import Delimited from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter +import Standard.Table.Io.Quote_Style import Standard.Test import Standard.Test.Problems @@ -22,7 +23,7 @@ spec = c_2 = ["b", ['2', Nothing, '8', '11']] c_3 = ["c", [Nothing, '6', '9', '12']] expected_table = Table.new [c_1, c_2, c_3] - simple_empty = File.read (Enso_Project.data / "simple_empty.csv") (File_Format.Delimited "," headers=True value_formatter=Nothing) + simple_empty = File.read (Enso_Project.data / "simple_empty.csv") (Delimited "," headers=True value_formatter=Nothing) simple_empty.should_equal expected_table Test.specify "should load a simple table without headers" <| @@ -30,11 +31,11 @@ spec = c_2 = ["Column_2", ['b', '2', Nothing, '8', '11']] c_3 = ["Column_3", ['c', Nothing, '6', '9', '12']] expected_table = Table.new [c_1, c_2, c_3] - simple_empty = File.read (Enso_Project.data / "simple_empty.csv") (File_Format.Delimited "," headers=False value_formatter=Nothing) + simple_empty = File.read (Enso_Project.data / "simple_empty.csv") (Delimited "," headers=False value_formatter=Nothing) simple_empty.should_equal expected_table Test.specify "should work in presence of missing headers" <| - action on_problems = File.read (Enso_Project.data / "missing_header.csv") (File_Format.Delimited "," headers=True value_formatter=Nothing) on_problems + action on_problems = File.read (Enso_Project.data / "missing_header.csv") (Delimited "," headers=True value_formatter=Nothing) on_problems 
tester table = table.columns.map .name . should_equal ["a", "Column_1", "c", "Column_2", "d"] table.at "a" . to_vector . should_equal ["1"] @@ -46,61 +47,61 @@ spec = Problems.test_problem_handling action problems tester Test.specify "should infer headers based on the first two rows" <| - t1 = File.read (Enso_Project.data / "data_small.csv") (File_Format.Delimited "," headers=File_Format.Infer) + t1 = File.read (Enso_Project.data / "data_small.csv") (Delimited "," headers=File_Format.Infer) t1.columns.map .name . should_equal ["Code", "Index", "Flag", "Value", "ValueWithNothing", "TextWithNothing", "Hexadecimal", "Leading0s", "QuotedNumbers", "Mixed Types"] - t2 = File.read (Enso_Project.data / "all_text.csv") (File_Format.Delimited "," headers=File_Format.Infer) + t2 = File.read (Enso_Project.data / "all_text.csv") (Delimited "," headers=File_Format.Infer) t2.columns.map .name . should_equal ["Column_1", "Column_2"] t2.at "Column_1" . to_vector . should_equal ["a", "c", "e", "g"] t2.at "Column_2" . to_vector . should_equal ["b", "d", "f", "h"] - t3 = File.read (Enso_Project.data / "two_rows1.csv") (File_Format.Delimited "," headers=File_Format.Infer) + t3 = File.read (Enso_Project.data / "two_rows1.csv") (Delimited "," headers=File_Format.Infer) t3.columns.map .name . should_equal ["a", "b", "c"] t3.at "a" . to_vector . should_equal ["x"] t3.at "b" . to_vector . should_equal [Nothing] t3.at "c" . to_vector . should_equal [Nothing] - t4 = File.read (Enso_Project.data / "two_rows2.csv") (File_Format.Delimited "," headers=File_Format.Infer) + t4 = File.read (Enso_Project.data / "two_rows2.csv") (Delimited "," headers=File_Format.Infer) t4.columns.map .name . should_equal ["Column_1", "Column_2", "Column_3"] t4.at "Column_1" . to_vector . should_equal ["a", "d"] t4.at "Column_2" . to_vector . should_equal ["b", "e"] t4.at "Column_3" . to_vector . 
should_equal ["c", "f"] - t5 = File.read (Enso_Project.data / "numbers_in_header.csv") (File_Format.Delimited "," headers=File_Format.Infer) + t5 = File.read (Enso_Project.data / "numbers_in_header.csv") (Delimited "," headers=File_Format.Infer) t5.columns.map .name . should_equal ["Column_1", "Column_2", "Column_3"] t5.at "Column_1" . to_vector . should_equal ["a", "1"] t5.at "Column_2" . to_vector . should_equal ["b", "2"] t5.at "Column_3" . to_vector . should_equal [0, 3] - t6 = File.read (Enso_Project.data / "quoted_numbers_in_header.csv") (File_Format.Delimited "," headers=File_Format.Infer) + t6 = File.read (Enso_Project.data / "quoted_numbers_in_header.csv") (Delimited "," headers=File_Format.Infer) t6.columns.map .name . should_equal ["1", "x"] t6.at "1" . to_vector . should_equal ["y"] t6.at "x" . to_vector . should_equal [2] Test.specify "should not use the first row as headers if it is the only row, unless specifically asked to" <| - t1 = File.read (Enso_Project.data / "one_row.csv") (File_Format.Delimited "," headers=File_Format.Infer) + t1 = File.read (Enso_Project.data / "one_row.csv") (Delimited "," headers=File_Format.Infer) t1.columns.map .name . should_equal ["Column_1", "Column_2", "Column_3"] t1.at "Column_1" . to_vector . should_equal ["x"] t1.at "Column_2" . to_vector . should_equal ["y"] t1.at "Column_3" . to_vector . should_equal ["z"] - t2 = File.read (Enso_Project.data / "one_row.csv") (File_Format.Delimited "," headers=True) + t2 = File.read (Enso_Project.data / "one_row.csv") (Delimited "," headers=True) t2.columns.map .name . should_equal ["x", "y", "z"] t2.row_count . should_equal 0 t2.at "x" . to_vector . should_equal [] Test.specify "should be able to load even an empty file" <| - table = File.read (Enso_Project.data / "empty.txt") (File_Format.Delimited "," headers=True value_formatter=Nothing) + table = File.read (Enso_Project.data / "empty.txt") (Delimited "," headers=True value_formatter=Nothing) table.columns.map .name . 
should_equal [] table.row_count . should_equal 0 Test.specify "should correctly handle file opening issues" <| nonexistent_file = Enso_Project.data / "a_filename_that_does_not_exist.foobar" - r1 = File.read nonexistent_file (File_Format.Delimited "," headers=True value_formatter=Nothing) + r1 = File.read nonexistent_file (Delimited "," headers=True value_formatter=Nothing) r1.should_fail_with File.File_Not_Found directory = Enso_Project.data - r2 = File.read directory (File_Format.Delimited "," headers=True value_formatter=Nothing) Problem_Behavior.Report_Error + r2 = File.read directory (Delimited "," headers=True value_formatter=Nothing) Problem_Behavior.Report_Error r2.should_fail_with File.Io_Error Test.specify "should work with all kinds of line endings" <| @@ -111,7 +112,7 @@ spec = (path name).write_text text Encoding.utf_8 test_file name = - table = File.read (path name) (File_Format.Delimited "," headers=True value_formatter=Nothing) Problem_Behavior.Report_Error + table = File.read (path name) (Delimited "," headers=True value_formatter=Nothing) Problem_Behavior.Report_Error table.columns.map .name . should_equal ['a', 'b', 'c'] table.at 'a' . to_vector . should_equal ['d', '1'] table.at 'b' . to_vector . should_equal ['e', '2'] @@ -126,17 +127,17 @@ spec = # Currently mixed line endings are not supported. (path 'mixed.csv').write_text 'a,b,c\nd,e,f\r1,2,3' - File.read (path 'mixed.csv') (File_Format.Delimited "," headers=True value_formatter=Nothing) Problem_Behavior.Report_Error . should_fail_with Invalid_Row + File.read (path 'mixed.csv') (Delimited "," headers=True value_formatter=Nothing) Problem_Behavior.Report_Error . 
should_fail_with Invalid_Row Test.specify "should work with Windows-1252 encoding" <| - table = File.read (Enso_Project.data / "windows.csv") (File_Format.Delimited "," headers=True encoding=Encoding.windows_1252) Problem_Behavior.Report_Error + table = File.read (Enso_Project.data / "windows.csv") (Delimited "," headers=True encoding=Encoding.windows_1252) Problem_Behavior.Report_Error table.columns.map .name . should_equal ['a', 'b', 'c'] table.at 'a' . to_vector . should_equal ['$ยข'] table.at 'b' . to_vector . should_equal ['ยค'] table.at 'c' . to_vector . should_equal ['ยฅ'] Test.specify "should work with UTF-16 encoding" <| - table = File.read (Enso_Project.data / "utf16.csv") (File_Format.Delimited "," headers=True encoding=Encoding.utf_16_be) Problem_Behavior.Report_Error + table = File.read (Enso_Project.data / "utf16.csv") (Delimited "," headers=True encoding=Encoding.utf_16_be) Problem_Behavior.Report_Error table.columns.map .name . should_equal ['ฤ…', '๐Ÿš€b', 'ฤ‡๐Ÿ˜Ž'] table.at 'ฤ…' . to_vector . should_equal ['ฤ…'] table.at '๐Ÿš€b' . to_vector . should_equal ['โœจ๐Ÿš€๐Ÿšง๐Ÿ˜๐Ÿ˜ƒ๐Ÿ˜๐Ÿ˜Ž๐Ÿ˜™๐Ÿ˜‰โ˜บ'] @@ -147,7 +148,7 @@ spec = utf8_bytes = [97, 44, 98, 44, 99, 10, -60, -123, 44, -17, -65, -65, 44, -61, 40, -61, 40, 10] utf8_file.write_bytes utf8_bytes action_1 on_problems = - utf8_file.read (File_Format.Delimited "," headers=True) on_problems + utf8_file.read (Delimited "," headers=True) on_problems tester_1 table = table.columns.map .name . should_equal ['a', 'b', 'c'] table.at 'a' . to_vector . should_equal ['ฤ…'] @@ -157,7 +158,7 @@ spec = Problems.test_problem_handling action_1 problems_1 tester_1 action_2 on_problems = - (Enso_Project.data / "utf16_invalid.csv").read (File_Format.Delimited "," headers=True encoding=Encoding.utf_16_be) on_problems + (Enso_Project.data / "utf16_invalid.csv").read (Delimited "," headers=True encoding=Encoding.utf_16_be) on_problems tester_2 table = table.columns.map .name . 
should_equal ['a', 'b', 'c'] # This column does not raise a problem - the '\uFFFD' is simply present in the input file. @@ -170,7 +171,7 @@ spec = Test.specify "should handle duplicated columns" <| - action on_problems = File.read (Enso_Project.data / "duplicated_columns.csv") (File_Format.Delimited "," headers=True value_formatter=Nothing) on_problems + action on_problems = File.read (Enso_Project.data / "duplicated_columns.csv") (Delimited "," headers=True value_formatter=Nothing) on_problems tester table = table.columns.map .name . should_equal ['a', 'b', 'c', 'a_1'] table.at 'a' . to_vector . should_equal ['1'] @@ -179,27 +180,27 @@ spec = Problems.test_problem_handling action problems tester Test.specify "should handle quotes" <| - t1 = File.read (Enso_Project.data / "double_quoted.csv") (File_Format.Delimited "," headers=True value_formatter=Nothing) + t1 = File.read (Enso_Project.data / "double_quoted.csv") (Delimited "," headers=True value_formatter=Nothing) t1.at 'a' . to_vector . should_equal ['a, x', '"a'] t1.at 'c' . to_vector . should_equal ['3', '"'] - t2 = File.read (Enso_Project.data / "escape_quoted.csv") (File_Format.Delimited "," headers=True quote_escape="\" value_formatter=Nothing) + t2 = File.read (Enso_Project.data / "escape_quoted.csv") (Delimited "," headers=True quote_escape="\" value_formatter=Nothing) t2.at 'a' . to_vector . should_equal ['a"b', 'a\\\"z'] - t3 = File.read (Enso_Project.data / "no_quoting.csv") (File_Format.Delimited "," quote=Nothing headers=True value_formatter=Nothing) + t3 = File.read (Enso_Project.data / "no_quoting.csv") (Delimited "," quote=Nothing headers=True value_formatter=Nothing) t3.at 'a' . to_vector . should_equal ['"y'] t3.at 'b' . to_vector . should_equal ['z"'] t3.at 'c' . to_vector . 
should_equal ['a'] Test.specify "should support rows spanning multiple lines if quoted" <| - t1 = File.read (Enso_Project.data / "multiline_quoted.csv") (File_Format.Delimited "," headers=True value_formatter=Nothing) + t1 = File.read (Enso_Project.data / "multiline_quoted.csv") (Delimited "," headers=True value_formatter=Nothing) t1.at 'a' . to_vector . should_equal ['1', '4'] t1.at 'b' . to_vector . should_equal ['start\n\ncontinue', '5'] t1.at 'c' . to_vector . should_equal ['3', '6'] Test.specify "should behave correctly in presence of a mismatched quote" <| action_1 on_problems = - File.read (Enso_Project.data / "mismatched_quote.csv") (File_Format.Delimited "," headers=True value_formatter=Nothing) on_problems + File.read (Enso_Project.data / "mismatched_quote.csv") (Delimited "," headers=True value_formatter=Nothing) on_problems tester_1 table = table.columns.map .name . should_equal ['a', 'b', 'c'] @@ -210,7 +211,7 @@ spec = Problems.test_problem_handling action_1 problems_1 tester_1 action_2 on_problems = - File.read (Enso_Project.data / "mismatched_quote2.csv") (File_Format.Delimited "," headers=True value_formatter=Nothing) on_problems + File.read (Enso_Project.data / "mismatched_quote2.csv") (Delimited "," headers=True value_formatter=Nothing) on_problems tester_2 table = table.columns.map .name . should_equal ['a', 'b', 'c'] @@ -222,7 +223,7 @@ spec = Test.specify "should handle too long and too short rows" <| action keep_invalid_rows on_problems = - File.read (Enso_Project.data / "varying_rows.csv") (File_Format.Delimited "," headers=True keep_invalid_rows=keep_invalid_rows value_formatter=Nothing) on_problems + File.read (Enso_Project.data / "varying_rows.csv") (Delimited "," headers=True keep_invalid_rows=keep_invalid_rows value_formatter=Nothing) on_problems tester_kept table = table.columns.map .name . 
should_equal ['a', 'b', 'c'] @@ -242,7 +243,7 @@ spec = Test.specify "should aggregate invalid rows over some limit" <| action on_problems = - File.read (Enso_Project.data / "many_invalid_rows.csv") (File_Format.Delimited "," headers=True keep_invalid_rows=False value_formatter=Nothing) on_problems + File.read (Enso_Project.data / "many_invalid_rows.csv") (Delimited "," headers=True keep_invalid_rows=False value_formatter=Nothing) on_problems tester table = table.columns.map .name . should_equal ['a', 'b', 'c'] @@ -253,45 +254,45 @@ spec = Problems.test_problem_handling action problems tester Test.specify "should allow to skip rows" <| - t1 = File.read (Enso_Project.data / "simple_empty.csv") (File_Format.Delimited "," headers=False skip_rows=3 value_formatter=Nothing) + t1 = File.read (Enso_Project.data / "simple_empty.csv") (Delimited "," headers=False skip_rows=3 value_formatter=Nothing) t1.at "Column_1" . to_vector . should_equal ['7', '10'] - t2 = File.read (Enso_Project.data / "simple_empty.csv") (File_Format.Delimited "," headers=True skip_rows=3 value_formatter=Nothing) + t2 = File.read (Enso_Project.data / "simple_empty.csv") (Delimited "," headers=True skip_rows=3 value_formatter=Nothing) t2.columns.map .name . should_equal ['7', '8', '9'] t2.at "7" . to_vector . should_equal ['10'] Test.specify "should allow to set a limit of rows to read" <| - t1 = File.read (Enso_Project.data / "simple_empty.csv") (File_Format.Delimited "," headers=False row_limit=2 value_formatter=Nothing) + t1 = File.read (Enso_Project.data / "simple_empty.csv") (Delimited "," headers=False row_limit=2 value_formatter=Nothing) t1.at "Column_1" . to_vector . should_equal ['a', '1'] - t2 = File.read (Enso_Project.data / "simple_empty.csv") (File_Format.Delimited "," headers=True row_limit=2 value_formatter=Nothing) + t2 = File.read (Enso_Project.data / "simple_empty.csv") (Delimited "," headers=True row_limit=2 value_formatter=Nothing) t2.at "a" . to_vector . 
should_equal ['1', '4'] - t3 = File.read (Enso_Project.data / "simple_empty.csv") (File_Format.Delimited "," headers=False skip_rows=3 row_limit=1 value_formatter=Nothing) + t3 = File.read (Enso_Project.data / "simple_empty.csv") (Delimited "," headers=False skip_rows=3 row_limit=1 value_formatter=Nothing) t3.at "Column_1" . to_vector . should_equal ['7'] - t4 = File.read (Enso_Project.data / "simple_empty.csv") (File_Format.Delimited "," headers=False row_limit=0 value_formatter=Nothing) + t4 = File.read (Enso_Project.data / "simple_empty.csv") (Delimited "," headers=False row_limit=0 value_formatter=Nothing) t4.columns.map .name . should_equal ['Column_1', 'Column_2', 'Column_3'] t4.row_count . should_equal 0 - t5 = File.read (Enso_Project.data / "simple_empty.csv") (File_Format.Delimited "," headers=True row_limit=0 value_formatter=Nothing) + t5 = File.read (Enso_Project.data / "simple_empty.csv") (Delimited "," headers=True row_limit=0 value_formatter=Nothing) t5.columns.map .name . should_equal ['a', 'b', 'c'] t5.at 'a' . to_vector . should_equal [] t5.row_count . should_equal 0 - t6 = File.read (Enso_Project.data / "simple_empty.csv") (File_Format.Delimited "," headers=False skip_rows=3 row_limit=1000 value_formatter=Nothing) + t6 = File.read (Enso_Project.data / "simple_empty.csv") (Delimited "," headers=False skip_rows=3 row_limit=1000 value_formatter=Nothing) t6.at "Column_1" . to_vector . should_equal ['7', '10'] Test.specify "should check arguments" <| path = (Enso_Project.data / "simple_empty.csv") pb = Problem_Behavior.Report_Error - path.read (File_Format.Delimited "," headers=False quote='abc') pb . should_fail_with Illegal_Argument_Error - path.read (File_Format.Delimited "," headers=False quote='🚧') pb . should_fail_with Illegal_Argument_Error - path.read (File_Format.Delimited "," headers=False quote_escape='//') pb . should_fail_with Illegal_Argument_Error - path.read (File_Format.Delimited 'a\u{301}' headers=False) pb . 
should_fail_with Illegal_Argument_Error + path.read (Delimited "," headers=False quote='abc') pb . should_fail_with Illegal_Argument_Error + path.read (Delimited "," headers=False quote='🚧') pb . should_fail_with Illegal_Argument_Error + path.read (Delimited "," headers=False quote_escape='//') pb . should_fail_with Illegal_Argument_Error + path.read (Delimited 'a\u{301}' headers=False) pb . should_fail_with Illegal_Argument_Error Test.specify "should correctly guess column types" <| - t = (Enso_Project.data / "data_small.csv") . read (File_Format.Delimited "," headers=True) + t = (Enso_Project.data / "data_small.csv") . read (Delimited "," headers=True) t.at "Code" . to_vector . should_equal ["gxl", "wca", "nfw", "der"] t.at "Index" . to_vector . should_equal [7, 0, 1, 7] t.at "Flag" . to_vector . should_equal [True, False, True, True] @@ -303,7 +304,7 @@ spec = t.at "QuotedNumbers" . to_vector . should_equal ["1", "2", Nothing, "34"] t.at "Mixed Types" . to_vector . should_equal ["33", Nothing, "45", "True"] - t2 = (Enso_Project.data / "data_small.csv") . read (File_Format.Delimited "," headers=True value_formatter=(Data_Formatter allow_leading_zeros=True)) + t2 = (Enso_Project.data / "data_small.csv") . read (Delimited "," headers=True value_formatter=(Data_Formatter allow_leading_zeros=True)) t2.at "Leading0s" . to_vector . should_equal [1, 2, 123, Nothing] Test.specify "should be able to detect types automatically" <| @@ -322,7 +323,7 @@ spec = a,b,c 1,2,3 4,5,6 - t1 = Table.Table.from text1 (format = File_Format.Delimited ",") + t1 = Table.Table.from text1 (format = Delimited ",") t1.columns.map .name . should_equal ["a", "b", "c"] t1.at "a" . to_vector . should_equal [1, 4] t1.at "b" . to_vector . should_equal [2, 5] @@ -334,4 +335,24 @@ spec = t2.at "a" . to_vector . should_equal [1, 3] t2.at "b" . to_vector . should_equal [2, 4] + Test.specify "should allow to build the Delimited configuration using builders" <| + Delimited "," . clone . 
should_equal (Delimited ",") + Delimited "," encoding=Encoding.ascii skip_rows=123 row_limit=100 headers=False value_formatter=Nothing . clone . should_equal (Delimited "," headers=False value_formatter=Nothing skip_rows=123 row_limit=100 encoding=Encoding.ascii) + Delimited "," . clone quote="'" quote_escape='\\' quote_style=Quote_Style.Always headers=False value_formatter=Nothing . should_equal (Delimited "," headers=False value_formatter=Nothing quote="'" quote_escape='\\' quote_style=Quote_Style.Always) + + Delimited '\t' . with_quotes "|" . should_equal (Delimited '\t' quote='|' quote_escape='|') + Delimited '\t' quote_style=Quote_Style.Always . with_quotes "-" '\\' . should_equal (Delimited '\t' quote='-' quote_escape='\\' quote_style=Quote_Style.Always) + Delimited '\t' quote_style=Quote_Style.Always . with_quotes "-" '\\' Quote_Style.Never . should_equal (Delimited '\t' quote='-' quote_escape='\\' quote_style=Quote_Style.Never) + + Delimited ',' . with_headers . should_equal (Delimited ',' headers=True) + Delimited ',' . without_headers . should_equal (Delimited ',' headers=False) + Delimited "," skip_rows=123 headers=False value_formatter=Nothing quote_style=Quote_Style.Never . with_headers . should_equal (Delimited "," skip_rows=123 value_formatter=Nothing quote_style=Quote_Style.Never headers=True) + Delimited "," skip_rows=123 headers=True value_formatter=Nothing quote_style=Quote_Style.Never . without_headers . should_equal (Delimited "," skip_rows=123 value_formatter=Nothing quote_style=Quote_Style.Never headers=False) + + Delimited ',' . with_parsing . should_equal (Delimited ',') + Delimited ',' . without_parsing . should_equal (Delimited ',' value_formatter=Nothing) + custom_formatter = Data_Formatter true_values=["A", "B", "C"] false_values=["D", "E", "F"] + Delimited ',' . with_parsing custom_formatter . should_equal (Delimited ',' value_formatter=custom_formatter) + Delimited ',' row_limit=456 . without_parsing . 
should_equal (Delimited ',' value_formatter=Nothing row_limit=456) + main = Test.Suite.run_main here.spec diff --git a/test/Table_Tests/src/File_Read_Spec.enso b/test/Table_Tests/src/File_Read_Spec.enso index a32310601d..670bbd620f 100644 --- a/test/Table_Tests/src/File_Read_Spec.enso +++ b/test/Table_Tests/src/File_Read_Spec.enso @@ -2,6 +2,7 @@ from Standard.Base import all from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error import Standard.Table.Io.File_Read import Standard.Table.Io.File_Format +from Standard.Table.Error import Unsupported_File_Type import Standard.Test import Standard.Test.Problems @@ -13,18 +14,21 @@ spec = Test.group "File_Format.Auto materialise" <| Test.specify "should be Bytes for unknown file" <| - File_Format.Auto . materialise sample_xxx . should_be_a File_Format.Bytes + File_Format.Auto . materialise sample_xxx . should_fail_with Unsupported_File_Type Test.specify "should be Text for text file" <| - File_Format.Auto . materialise sample_txt . should_be_a File_Format.Text + File_Format.Auto . materialise sample_txt . should_be_a File_Format.Plain_Text Test.specify "should be Text for log file" <| - File_Format.Auto . materialise windows_log . should_be_a File_Format.Text + File_Format.Auto . materialise windows_log . should_be_a File_Format.Plain_Text + + Test.specify "should detect CSV files" <| + File_Format.Auto . materialise (Enso_Project.data / "data.csv") . 
should_equal (File_Format.Delimited ",") Test.group "File_Format.Auto" <| - Test.specify "should be able to read an unknown file" <| + Test.specify "should raise an error when reading an unknown file" <| bytes = sample_xxx.read - bytes.should_equal [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33] + bytes.should_fail_with Unsupported_File_Type Test.specify "should be able to read a text file" <| content = sample_txt.read @@ -44,17 +48,17 @@ spec = bytes = File.read path File_Format.Bytes bytes.should_equal [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33] - Test.group "File_Format.Text" <| + Test.group "File_Format.Plain_Text" <| Test.specify "should be able to read a file as Text" <| - text = sample_xxx.read File_Format.Text + text = sample_xxx.read File_Format.Plain_Text text.should_equal "Hello World!" Test.specify "should be able to read a file as Text with Encoding" <| - text = windows_log.read (File_Format.Text Encoding.windows_1252) + text = windows_log.read (File_Format.Plain_Text Encoding.windows_1252) text.should_equal "Hello World! $¢¤¥" Test.specify "should raise a warning when invalid encoding in a Text file" <| - action = windows_log.read (File_Format.Text Encoding.ascii) on_problems=_ + action = windows_log.read (File_Format.Plain_Text Encoding.ascii) on_problems=_ tester result = result . should_equal 'Hello World! $\uFFFD\uFFFD\uFFFD' problems = [Encoding_Error "Encoding issues at 14, 15, 16."] Problems.test_problem_handling action problems tester diff --git a/test/Tests/src/Data/Locale_Spec.enso b/test/Tests/src/Data/Locale_Spec.enso index 964fcd1b20..f35da50e58 100644 --- a/test/Tests/src/Data/Locale_Spec.enso +++ b/test/Tests/src/Data/Locale_Spec.enso @@ -71,3 +71,8 @@ spec = Test.group "Locale" <| Test.specify "should convert to Json" <| en_gb.to_json.should_equal <| Json.from_pairs [["type", "Locale"], ["language", "en"], ["country", "GB"]] + Test.specify "should allow equality comparisons" <| + Locale.uk . 
should_equal Locale.uk + Locale.uk . should_not_equal Locale.us + +main = Test.Suite.run_main here.spec