diff --git a/CHANGELOG.md b/CHANGELOG.md index c5965c127fb..189102ec42e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -114,6 +114,7 @@ - [Improved the `Range` type. Added a `down_to` counterpart to `up_to` and `with_step` allowing to change the range step.][3408] - [Aligned `Text.split` API with other methods and added `Text.lines`.][3415] +- [Implemented a basic reader for the `Delimited` file format.][3424] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -175,6 +176,7 @@ [3390]: https://github.com/enso-org/enso/pull/3390 [3408]: https://github.com/enso-org/enso/pull/3408 [3415]: https://github.com/enso-org/enso/pull/3415 +[3424]: https://github.com/enso-org/enso/pull/3424 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/System/File.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/System/File.enso index d99646b171c..1b64f6d80ea 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/System/File.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/System/File.enso @@ -882,10 +882,16 @@ type Input_Stream Utility method for running an action with Java exceptions mapping. handle_java_exceptions file ~action = - Panic.catch IOException handler=(caught_panic-> (Error.throw (Io_Error file "An IO error has occurred: " + caught_panic.payload.cause.getMessage))) <| - Panic.catch AccessDeniedException handler=(_-> (Error.throw (Io_Error file "You do not have permission to access the file"))) <| - Panic.catch NoSuchFileException handler=(_-> (Error.throw (File_Not_Found file))) <| - action + Panic.catch IOException action caught_panic-> + here.wrap_io_exception file caught_panic.payload.cause + +## PRIVATE + + Converts a Java `IOException` into its Enso counterpart. 
+wrap_io_exception file io_exception = + if Java.is_instance io_exception NoSuchFileException then Error.throw (File_Not_Found file) else + if Java.is_instance io_exception AccessDeniedException then Error.throw (Io_Error file "You do not have permission to access the file") else + Error.throw (Io_Error file "An IO error has occurred: "+io_exception.getMessage) ## PRIVATE diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso index 7a49ede59fb..3bac2ddeaee 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso @@ -94,3 +94,20 @@ type Additional_Warnings (count:Integer) Additional_Warnings.to_display_text : Text Additional_Warnings.to_display_text = "There were "+this.count.to_text+" additional issues." + +## Indicates that when loading a delimited file, a row was encountered which had + too many or too few columns. + + Only the first 10 rows are reported, any additional ones are aggregated into + a single instance of `Additional_Invalid_Rows`. +type Invalid_Row (source_file_line_number : Integer) (index : Integer | Nothing) (row : [Text]) + +## Indicates how many additional `Invalid_Row` warnings have been suppressed. +type Additional_Invalid_Rows (count : Integer) + +## Indicates that a quote inside of a delimited file cell has been opened but + never closed. +type Mismatched_Quote + +## Indicates an unexpected parser error. 
+type Parser_Error cause diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso new file mode 100644 index 00000000000..d2440e5e120 --- /dev/null +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso @@ -0,0 +1,94 @@ +from Standard.Base import all +import Standard.Table + +import Standard.Base.Error.Extensions as Errors +from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior +from Standard.Table.Error as Table_Errors import Invalid_Row, Mismatched_Quote, Parser_Error, Additional_Invalid_Rows +from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding +from Standard.Table.Io.File_Format import Infer + +polyglot java import org.enso.table.read.DelimitedReader +polyglot java import org.enso.table.read.ParsingFailedException +polyglot java import org.enso.table.read.InvalidRow +polyglot java import org.enso.table.read.MismatchedQuote +polyglot java import org.enso.table.read.AdditionalInvalidRows +polyglot java import java.lang.IllegalArgumentException +polyglot java import java.io.IOException +polyglot java import com.univocity.parsers.common.TextParsingException +polyglot java import java.io.InputStream + +## Reads a delimited file according to the provided format. + + Arguments: + - format: The specification of the delimited file format. + - file: The file to read. + - on_problems: Specifies the behavior when a problem occurs during the + operation. By default, a warning is issued, but the operation proceeds. + If set to `Report_Error`, the operation fails with a dataflow error. + If set to `Ignore`, the operation proceeds without errors or warnings. 
+read_file : Delimited -> File -> Problem_Behavior -> Any +read_file format file on_problems = + if format.encoding != Encoding.utf_8 then Errors.unimplemented "Custom encodings when reading Delimited files are not implemented yet." else + ## We use the default `max_columns` setting. If we want to be able to + read files with unlimited column limits (risking OutOfMemory + exceptions), we can catch the exception indicating the limit has been + reached and restart parsing with an increased limit. + file.with_input_stream [File.Option.Read] stream-> + stream.with_java_stream java_stream-> + here.read_stream format java_stream on_problems related_file=file + +## PRIVATE + Reads an input stream according to the provided format. + + The `encoding` parameter is ignored, instead the provided stream should + handle any necessary decoding. + + Arguments: + - format: The specification of the delimited file format. + - java_stream: A Java `InputStream` used as the data source. + - on_problems: Specifies the behavior when a problem occurs during the + operation. By default, a warning is issued, but the operation proceeds. + If set to `Report_Error`, the operation fails with a dataflow error. + If set to `Ignore`, the operation proceeds without errors or warnings. + - max_columns: Specifies the limit of columns to read. The limit is set to + avoid `OutOfMemory` errors on malformed files. It must be a positive + integer. + - related_file: The file related to the provided `java_stream`, if available, + or `Nothing`. It is used for more detailed error reporting. +read_stream : Delimited -> InputStream -> Problem_Behavior -> Integer -> File | Nothing -> Any +read_stream format java_stream on_problems max_columns=4096 related_file=Nothing = + java_headers = case format.headers of + True -> DelimitedReader.HeaderBehavior.USE_FIRST_ROW_AS_HEADERS + Infer -> Errors.unimplemented "Inferring headers is not implemented yet."
+ False -> DelimitedReader.HeaderBehavior.GENERATE_HEADERS + skip_rows = case format.skip_rows of + Nothing -> 0 + Integer -> format.skip_rows + _ -> Error.throw (Illegal_Argument_Error "`skip_rows` should be Integer or Nothing.") + row_limit = case format.row_limit of + Nothing -> -1 + Integer -> format.row_limit + _ -> Error.throw (Illegal_Argument_Error "`row_limit` should be Integer or Nothing.") + if format.parse_values then Errors.unimplemented "Parsing values is not implemented yet." else + translate_illegal_argument caught_panic = + Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage) + translate_problem java_problem = + if Java.is_instance java_problem InvalidRow then Invalid_Row java_problem.source_row java_problem.table_index (Vector.Vector java_problem.row) else + if Java.is_instance java_problem MismatchedQuote then Mismatched_Quote else + if Java.is_instance java_problem AdditionalInvalidRows then Additional_Invalid_Rows java_problem.count else + java_problem + translate_parsing_failure caught_panic = + Error.throw (translate_problem caught_panic.payload.cause.problem) + translate_parsing_exception caught_panic = + cause = caught_panic.payload.cause.getCause + if Java.is_instance cause IOException then File.wrap_io_exception related_file cause else + Error.throw (Parser_Error caught_panic.payload) + + Panic.catch IllegalArgumentException handler=translate_illegal_argument <| + Panic.catch ParsingFailedException handler=translate_parsing_failure <| + Panic.catch TextParsingException handler=translate_parsing_exception <| + warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error + reader = DelimitedReader.new java_stream format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns format.keep_invalid_rows warnings_as_errors + result = Table.Table reader.read + problems = Vector.Vector reader.getReportedProblems . 
map translate_problem + on_problems.attach_problems_after result problems diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Csv.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Csv.enso index 6d46c453505..00487b146d8 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Csv.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Csv.enso @@ -64,19 +64,19 @@ from_csv : File.File | Text -> Boolean -> Text -> Table ! Parse_Error from_csv csv has_header=True prefix='C' = parser_inst = Parser.create has_header prefix - handle_error error = case error of - Polyglot_Error err -> Error.throw (Parse_Error err.getMessage) - _ -> Panic.throw error + # Report the failure as a dataflow error, as promised by the `! Parse_Error` signature. + handle_error caught_panic = + Error.throw (Parse_Error caught_panic.payload.cause.getMessage) case csv of Text -> input_stream = ByteArrayInputStream.new csv.utf_8.to_array - Panic.recover Any Table.Table (parser_inst.parse input_stream) . catch handle_error + Panic.catch Polyglot_Error (Table.Table (parser_inst.parse input_stream)) handle_error File.File _ -> - maybe_err = Panic.recover Any <| csv.with_input_stream [File.Option.Read] stream-> - stream.with_java_stream java_stream-> - Table.Table (parser_inst.parse java_stream) - maybe_err.catch handle_error + Panic.catch Polyglot_Error handler=handle_error <| + csv.with_input_stream [File.Option.Read] stream-> + stream.with_java_stream java_stream-> + Table.Table (parser_inst.parse java_stream) _ -> found_type_name = Meta.get_qualified_type_name csv file_name = Meta.get_qualified_type_name File.File diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso index de0b87f3260..cad8845b3aa 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso @@ -1,6 +1,10 @@ from Standard.Base import all +import Standard.Table + +import Standard.Base.Error.Extensions as Errors from 
Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding +import Standard.Table.Internal.Delimited_Reader ## This type needs to be here to allow for the usage of Standard.Table functions. Ideally, it would be an interface within Standard.Base and @@ -19,6 +23,8 @@ type Auto output = Ref.new File_Format.Bytes if ".txt".equals_ignore_case extension then Ref.put output File_Format.Text if ".log".equals_ignore_case extension then Ref.put output File_Format.Text + if ".csv".equals_ignore_case extension then Ref.put output (File_Format.Delimited ',') + if ".tsv".equals_ignore_case extension then Ref.put output (File_Format.Delimited '\t') Ref.get output @@ -45,3 +51,64 @@ type Text read : File -> Problem_Behavior -> Any read file on_problems = file.read_text this.encoding on_problems + +## Read delimited files such as CSVs into a Table. +type Delimited + ## Read delimited files such as CSVs into a Table. + + If a row does not match the first row's column count, the function raises + an `Invalid_Row`. If a quote is opened and never closed, a + `Mismatched_Quote` warning occurs. + + Arguments: + - delimiter: The delimiter character to split the file into columns. An + `Illegal_Argument_Error` error is returned if this is an empty string. + - encoding: The encoding to use when reading the file. + - quote: The quote character denotes the start and end of a quoted value. + No quote character is used if set to `Nothing`. Quoted items are not + split on the delimiter and can also contain newlines. Within a quoted + value, two consecutive quote characters are interpreted as an instance + of the quote character. Empty input strings must be quoted (e.g. "") as + otherwise an empty value is treated as `Nothing`. + - quote_escape: The character to escape the quote character in a quoted + value. 
For example, if both `quote` and `quote_escape` are set to `"`, + then escaping quotes is done by double quotes: `"ab""cd"` will yield + the text `ab"cd`. Another popular choice for `quote_escape` is the `\` + character. Then `"ab\"cd"` will yield the same text. + - headers: If set to `True`, the first row is used as column names. If + set to `False`, the column names are generated by adding increasing + numeric suffixes to the base name `Column` (i.e. `Column_1`, + `Column_2` etc.). If set to `Infer`, the process tries to infer if + headers are present on the first row (`Infer` is not implemented yet). + If the column names are not unique, numeric suffixes will be appended + to disambiguate them. + - parse_values: The output columns are parsed using the default `Parser` + if 'True'. If more control over parsing is needed, the + `Table.parse_values` method allows full specifications of the parser + options. + - skip_rows: The number of rows to skip from the top of the file. + - row_limit: The maximum number of rows to read from the file. This count + does not include the header row (if applicable). + - keep_invalid_rows: Specifies whether rows that contain less or more + columns than expected should be kept (setting the missing columns to + `Nothing` or dropping the excess columns) or dropped. + + TODO [RW] The default for `headers` is temporarily changed to `False`, + because `Infer` is not supported. It should be changed to be the default + value once the corresponding task is implemented: + https://www.pivotaltracker.com/story/show/181986831 + + TODO [RW] The default for `parse_values` is temporarily changed to + `False`, because this feature is not yet implemented. 
It should be + changed to `True` once the related task is implemented: + https://www.pivotaltracker.com/story/show/181824146 + type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (headers:True|False|Infer=False) (parse_values:Boolean=False) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) (keep_invalid_rows:Boolean=True) + + ## Implements the `File.read` for this `File_Format` + read : File -> Problem_Behavior -> Any + read file on_problems = + Delimited_Reader.read_file this file on_problems + +## A setting to infer the default behaviour of some option. +type Infer + diff --git a/distribution/lib/Standard/Test/0.0.0-dev/src/Main.enso b/distribution/lib/Standard/Test/0.0.0-dev/src/Main.enso index 8447ea56283..7ae6d101e28 100644 --- a/distribution/lib/Standard/Test/0.0.0-dev/src/Main.enso +++ b/distribution/lib/Standard/Test/0.0.0-dev/src/Main.enso @@ -165,7 +165,7 @@ fail message = Any.should_fail_with : Any -> Integer -> Assertion Any.should_fail_with matcher frames_to_skip=0 = loc = Meta.get_source_location 1+frames_to_skip - here.fail ("Expected an error " + matcher.to_text + " but none occurred (at " + loc + ").") + here.fail ("Expected an error " + matcher.to_text + " but no error occurred, instead got: " + this.to_text + " (at " + loc + ").") ## Expect a function to fail with the provided dataflow error. diff --git a/std-bits/table/src/main/java/org/enso/table/read/AdditionalInvalidRows.java b/std-bits/table/src/main/java/org/enso/table/read/AdditionalInvalidRows.java new file mode 100644 index 00000000000..d5d26a41ca8 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/read/AdditionalInvalidRows.java @@ -0,0 +1,10 @@ +package org.enso.table.read; + +/** A problem which indicates how many additional invalid rows were encountered. 
*/ +public class AdditionalInvalidRows implements ParsingProblem { + public final long count; + + public AdditionalInvalidRows(long count) { + this.count = count; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java new file mode 100644 index 00000000000..b5ce43e62df --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -0,0 +1,332 @@ +package org.enso.table.read; + +import com.univocity.parsers.csv.CsvFormat; +import com.univocity.parsers.csv.CsvParser; +import com.univocity.parsers.csv.CsvParserSettings; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import org.enso.table.data.column.builder.string.StorageBuilder; +import org.enso.table.data.column.builder.string.StringStorageBuilder; +import org.enso.table.data.column.storage.Storage; +import org.enso.table.data.index.DefaultIndex; +import org.enso.table.data.table.Column; +import org.enso.table.data.table.Table; +import org.enso.table.util.NameDeduplicator; + +/** A helper for reading delimited (CSV-like) files. */ +public class DelimitedReader { + + /** Specifies how to set the headers for the returned table. */ + public enum HeaderBehavior { + /** Tries to infer if the headers are present in the file. */ + INFER, + + /** Uses the first row in the file as headers. Duplicate names will be appended suffixes. */ + USE_FIRST_ROW_AS_HEADERS, + + /** + * Treats the first row as data and generates header names starting with {@code COLUMN_NAME}. 
+ */ + GENERATE_HEADERS + } + + private static final String COLUMN_NAME = "Column"; + + private final char delimiter; + private final char quoteCharacter; + private final char quoteEscapeCharacter; + private final HeaderBehavior headerBehavior; + private final long skipRows; + private final long rowLimit; + private final int maxColumns; + private final List warnings = new ArrayList<>(); + private final CsvParser parser; + private final boolean keepInvalidRows; + private final boolean warningsAsErrors; + + private static final char noQuoteCharacter = '\0'; + + /** + * Creates a new reader. + * + * @param inputStream the stream to read from + * @param delimiter the delimiter, should be a single character, but is a String for proper + * interoperability with Enso; if a string that does not fit in a single character is + * provided, an exception is raised + * @param quote the quote character to use, should be a single character or {@code null}, but is a + * String for proper interoperability with Enso; if a string that does not fit in a single + * character is provided, an exception is raised + * @param quoteEscape the quote escape character to use, should be a single character or {@code + * null}, but is a * String for proper interoperability with Enso; if a string that does not + * fit in a single * character is provided, an exception is raised + * @param headerBehavior specifies how to set the header for the resulting table + * @param skipRows specifies how many rows from the input to skip + * @param rowLimit specifies how many rows to read (does not include the header row) + * @param maxColumns specifies how many columns can be expected at most + * @param keepInvalidRows specifies whether to keep rows that had an unexpected number of columns + * @param warningsAsErrors specifies if the first warning should be immediately raised as an error + * (used as a fast-path for the error-reporting mode to avoid computing a value that is going + * to be discarded anyway) + */ 
+ public DelimitedReader( + InputStream inputStream, + String delimiter, + String quote, + String quoteEscape, + HeaderBehavior headerBehavior, + long skipRows, + long rowLimit, + int maxColumns, + boolean keepInvalidRows, + boolean warningsAsErrors) { + if (delimiter.isEmpty()) { + throw new IllegalArgumentException("Empty delimiters are not supported."); + } + if (delimiter.length() > 1) { + throw new IllegalArgumentException( + "Delimiters consisting of multiple characters or code units are not supported."); + } + + this.delimiter = delimiter.charAt(0); + + if (quote != null) { + if (quote.isEmpty()) { + throw new IllegalArgumentException( + "Empty quotes are not supported. Set the quote to `Nothing` to disable quotes."); + } + if (quote.length() > 1) { + throw new IllegalArgumentException( + "Quotes consisting of multiple characters or code units are not supported."); + } + + quoteCharacter = quote.charAt(0); + if (quoteCharacter == noQuoteCharacter) { + throw new IllegalArgumentException("Illegal quote character."); + } + } else { + quoteCharacter = noQuoteCharacter; + } + + if (quoteEscape != null) { + if (quoteEscape.isEmpty()) { + throw new IllegalArgumentException( + "Empty quote escapes are not supported. Set the escape to `Nothing` to disable escaping quotes."); + } + if (quoteEscape.length() > 1) { + throw new IllegalArgumentException( + "Quote escapes consisting of multiple characters or code units are not supported."); + } + + quoteEscapeCharacter = quoteEscape.charAt(0); + } else { + quoteEscapeCharacter = noQuoteCharacter; + } + + this.headerBehavior = headerBehavior; + this.skipRows = skipRows; + this.rowLimit = rowLimit; + this.maxColumns = maxColumns; + this.keepInvalidRows = keepInvalidRows; + this.warningsAsErrors = warningsAsErrors; + + parser = setupCsvParser(inputStream); + } + + /** Creates a {@code CsvParser} according to the settings specified at construction. 
*/ + private CsvParser setupCsvParser(InputStream inputStream) { + CsvParserSettings settings = new CsvParserSettings(); + settings.setHeaderExtractionEnabled(false); + CsvFormat format = new CsvFormat(); + format.setDelimiter(delimiter); + format.setQuote(quoteCharacter); + format.setQuoteEscape(quoteEscapeCharacter); + settings.setFormat(format); + settings.setMaxCharsPerColumn(-1); + settings.setMaxColumns(maxColumns); + settings.setSkipEmptyLines(false); + settings.setKeepQuotes(true); + CsvParser parser = new CsvParser(settings); + parser.beginParsing(inputStream); + return parser; + } + + /** Parses a cell, removing surrounding quotes (if applicable). */ + private String parseCell(String cell) { + if (cell == null) return null; + + if (cell.isEmpty()) return cell; + if (cell.charAt(0) == quoteCharacter) { + return stripQuotes(cell); + } + + return cell; + } + + /** Parses a header cell, removing surrounding quotes (if applicable). */ + private String parseHeader(String cell) { + if (cell == null) return COLUMN_NAME; + + if (cell.isEmpty()) return cell; + if (cell.charAt(0) == quoteCharacter) { + return stripQuotes(cell); + } + + return cell; + } + + /** + * If the first character of a string is a quote, will remove the surrounding quotes. + * + *
<p>
If the first character of a string is a quote but the last one is not, mismatched quote + * problem is reported. + */ + private String stripQuotes(String cell) { + assert cell.charAt(0) == quoteCharacter; + + if (cell.length() < 2 || cell.charAt(cell.length() - 1) != quoteCharacter) { + reportMismatchedQuote(); + return cell.substring(1); + } else { + // Strip quotes. + return cell.substring(1, cell.length() - 1); + } + } + + private void reportMismatchedQuote() { + reportProblem(new MismatchedQuote()); + } + + private long invalidRowsCount = 0; + private static final long invalidRowsLimit = 10; + + private void reportInvalidRow(long source_row, Long table_index, String[] row) { + if (invalidRowsCount < invalidRowsLimit) { + reportProblem(new InvalidRow(source_row, table_index, row)); + } + + invalidRowsCount++; + } + + /** Returns a list of currently reported problems encountered when parsing the input. */ + public List getReportedProblems() { + List result = new ArrayList<>(warnings); + if (invalidRowsCount > invalidRowsLimit) { + long additionalInvalidRows = invalidRowsCount - invalidRowsLimit; + result.add(new AdditionalInvalidRows(additionalInvalidRows)); + } + return result; + } + + private void reportProblem(ParsingProblem problem) { + if (warningsAsErrors) { + throw new ParsingFailedException(problem); + } else { + warnings.add(problem); + } + } + + private long target_table_index = 0; + + /** The line number of the start of the current row in the input file. */ + private long current_line = 0; + + /** + * Reads the next row and updates the current line accordingly. + * + *
<p>
Will return {@code null} if no more rows are available. + */ + private String[] nextRow() { + current_line = parser.getContext().currentLine() + 1; + return parser.parseNext(); + } + + /** Reads the input stream and returns a Table. */ + public Table read() { + List headerNames; + String[] currentRow = nextRow(); + + // Skip the first N rows. + for (long i = 0; currentRow != null && i < skipRows; ++i) { + currentRow = nextRow(); + } + + // If there are no rows to even infer the headers, we return an empty table. + if (currentRow == null) { + return new Table(new Column[0]); + } + + switch (headerBehavior) { + case INFER: + throw new IllegalStateException("Inferring headers is not yet implemented"); + case USE_FIRST_ROW_AS_HEADERS: + List preprocessedHeaders = + Arrays.stream(currentRow).map(this::parseHeader).collect(Collectors.toList()); + headerNames = NameDeduplicator.deduplicate(preprocessedHeaders, "_"); + // We have 'used up' the first row, so we load a next one. + currentRow = nextRow(); + break; + case GENERATE_HEADERS: + headerNames = new ArrayList<>(currentRow.length); + for (int i = 0; i < currentRow.length; ++i) { + headerNames.add(COLUMN_NAME + "_" + (i + 1)); + } + break; + default: + throw new IllegalStateException("Impossible branch."); + } + + StorageBuilder[] builders = initBuilders(headerNames.size()); + + while (currentRow != null && (rowLimit < 0 || target_table_index < rowLimit)) { + if (currentRow.length != builders.length) { + reportInvalidRow(current_line, keepInvalidRows ? target_table_index : null, currentRow); + + if (keepInvalidRows) { + for (int i = 0; i < builders.length && i < currentRow.length; i++) { + String item = parseCell(currentRow[i]); + builders[i] = builders[i].parseAndAppend(item); + } + + // If the current row had less columns than expected, nulls are inserted for the missing + // values. + // If it had more columns, the excess columns are discarded. 
+ for (int i = currentRow.length; i < builders.length; i++) { + builders[i] = builders[i].parseAndAppend(null); + } + + target_table_index++; + } + } else { + for (int i = 0; i < builders.length; i++) { + + String item = parseCell(currentRow[i]); + builders[i] = builders[i].parseAndAppend(item); + } + + target_table_index++; + } + + currentRow = nextRow(); + } + + parser.stopParsing(); + + Column[] columns = new Column[builders.length]; + for (int i = 0; i < builders.length; i++) { + Storage col = builders[i].seal(); + columns[i] = new Column(headerNames.get(i), new DefaultIndex(col.size()), col); + } + return new Table(columns); + } + + private StorageBuilder[] initBuilders(int count) { + StorageBuilder[] res = new StorageBuilder[count]; + for (int i = 0; i < count; i++) { + res[i] = new StringStorageBuilder(); + } + return res; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/read/InvalidRow.java b/std-bits/table/src/main/java/org/enso/table/read/InvalidRow.java new file mode 100644 index 00000000000..333b25a9956 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/read/InvalidRow.java @@ -0,0 +1,14 @@ +package org.enso.table.read; + +/** A problem indicating that a row contained more or less columns than expected. */ +public class InvalidRow implements ParsingProblem { + public final long source_row; + public final Long table_index; + public final String[] row; + + public InvalidRow(long source_row, Long table_index, String[] row) { + this.source_row = source_row; + this.table_index = table_index; + this.row = row; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/read/MismatchedQuote.java b/std-bits/table/src/main/java/org/enso/table/read/MismatchedQuote.java new file mode 100644 index 00000000000..0c303283c76 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/read/MismatchedQuote.java @@ -0,0 +1,4 @@ +package org.enso.table.read; + +/** A problem indicating that a quote has been opened and never closed. 
*/ +public class MismatchedQuote implements ParsingProblem {} diff --git a/std-bits/table/src/main/java/org/enso/table/read/ParsingFailedException.java b/std-bits/table/src/main/java/org/enso/table/read/ParsingFailedException.java new file mode 100644 index 00000000000..102d1fa7dac --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/read/ParsingFailedException.java @@ -0,0 +1,13 @@ +package org.enso.table.read; + +/** + * An exception thrown when a problem occurred during parsing and the parser is running in a mode + * that does not try recovering, so the parsing is stopped. + */ +public class ParsingFailedException extends RuntimeException { + public final ParsingProblem problem; + + public ParsingFailedException(ParsingProblem problem) { + this.problem = problem; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/read/ParsingProblem.java b/std-bits/table/src/main/java/org/enso/table/read/ParsingProblem.java new file mode 100644 index 00000000000..fd3ada1728e --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/read/ParsingProblem.java @@ -0,0 +1,7 @@ +package org.enso.table.read; + +/** + * A parent interface for parsing problems which may be reported as warnings or errors, depending on the + * setup. 
+ */ +public interface ParsingProblem {} diff --git a/test/Table_Tests/data/double_quoted.csv b/test/Table_Tests/data/double_quoted.csv new file mode 100644 index 00000000000..378b996a22c --- /dev/null +++ b/test/Table_Tests/data/double_quoted.csv @@ -0,0 +1,3 @@ +a,"b",c +"a, x",2,3 +"""a",4,"""" diff --git a/test/Table_Tests/data/duplicated_columns.csv b/test/Table_Tests/data/duplicated_columns.csv new file mode 100644 index 00000000000..92a121e6f65 --- /dev/null +++ b/test/Table_Tests/data/duplicated_columns.csv @@ -0,0 +1,2 @@ +a,b,c,a +1,2,3,4 diff --git a/test/Table_Tests/data/empty.txt b/test/Table_Tests/data/empty.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/test/Table_Tests/data/escape_quoted.csv b/test/Table_Tests/data/escape_quoted.csv new file mode 100644 index 00000000000..3879fe58935 --- /dev/null +++ b/test/Table_Tests/data/escape_quoted.csv @@ -0,0 +1,3 @@ +a,b,c +"a\"b",2,3 +"a\\\"z",4,5 diff --git a/test/Table_Tests/data/many_invalid_rows.csv b/test/Table_Tests/data/many_invalid_rows.csv new file mode 100644 index 00000000000..fa7e378fea0 --- /dev/null +++ b/test/Table_Tests/data/many_invalid_rows.csv @@ -0,0 +1,16 @@ +a,b,c +0,x,y +1 +2 +3 +4 +5,u,v +6 +7 +8 +9 +10 +11 +12 +13 +14 diff --git a/test/Table_Tests/data/mismatched_quote.csv b/test/Table_Tests/data/mismatched_quote.csv new file mode 100644 index 00000000000..6ba867631c5 --- /dev/null +++ b/test/Table_Tests/data/mismatched_quote.csv @@ -0,0 +1,4 @@ +a,b,c +1,2,3 + abc,"def","g h i +7,8,9 diff --git a/test/Table_Tests/data/mismatched_quote2.csv b/test/Table_Tests/data/mismatched_quote2.csv new file mode 100644 index 00000000000..3cfe39451c7 --- /dev/null +++ b/test/Table_Tests/data/mismatched_quote2.csv @@ -0,0 +1,4 @@ +a,b,c +1,2,3 + abc,"def,g h i +7,8,9 diff --git a/test/Table_Tests/data/missing_header.csv b/test/Table_Tests/data/missing_header.csv new file mode 100644 index 00000000000..5908fa6adc4 --- /dev/null +++ b/test/Table_Tests/data/missing_header.csv 
@@ -0,0 +1,2 @@ +a,,c,,d +1,2,3,4,5 diff --git a/test/Table_Tests/data/multiline_quoted.csv b/test/Table_Tests/data/multiline_quoted.csv new file mode 100644 index 00000000000..41924ca1e85 --- /dev/null +++ b/test/Table_Tests/data/multiline_quoted.csv @@ -0,0 +1,5 @@ +a,b,c +1,"start + +continue",3 +4,5,6 diff --git a/test/Table_Tests/data/no_quoting.csv b/test/Table_Tests/data/no_quoting.csv new file mode 100644 index 00000000000..00b7d232242 --- /dev/null +++ b/test/Table_Tests/data/no_quoting.csv @@ -0,0 +1,2 @@ +a,b,c +"y,z",a diff --git a/test/Table_Tests/data/simple_empty.csv b/test/Table_Tests/data/simple_empty.csv index f93618d5d4d..6f6c21b6983 100644 --- a/test/Table_Tests/data/simple_empty.csv +++ b/test/Table_Tests/data/simple_empty.csv @@ -1,5 +1,5 @@ -a,b,c +a,b,"c" 1,2, -4,,6 +"4",,6 7,8,9 10,11,12 diff --git a/test/Table_Tests/data/varying_rows.csv b/test/Table_Tests/data/varying_rows.csv new file mode 100644 index 00000000000..ca39455cee1 --- /dev/null +++ b/test/Table_Tests/data/varying_rows.csv @@ -0,0 +1,7 @@ +a,b,c +1,2,3,4 +1,2,3 +1,2 + +1 +1,2,3,4,5,6,7,8 diff --git a/test/Table_Tests/src/Csv_Spec.enso b/test/Table_Tests/src/Csv_Spec.enso index f19a90dc832..c0331c4d3fd 100644 --- a/test/Table_Tests/src/Csv_Spec.enso +++ b/test/Table_Tests/src/Csv_Spec.enso @@ -172,3 +172,5 @@ spec = out_1.delete_if_exists out_2.delete_if_exists out_3.delete_if_exists + +main = Test.Suite.run_main here.spec diff --git a/test/Table_Tests/src/Delimited_Read_Spec.enso b/test/Table_Tests/src/Delimited_Read_Spec.enso new file mode 100644 index 00000000000..5a89ac155a9 --- /dev/null +++ b/test/Table_Tests/src/Delimited_Read_Spec.enso @@ -0,0 +1,171 @@ +from Standard.Base import all + +import Standard.Table +import Standard.Table.Data.Column +from Standard.Table.Error import all +import Standard.Table.Io.File_Format +import Standard.Base.Error.Problem_Behavior +import Standard.Test +import Standard.Test.Problems +import project.Util + +spec = + Test.group "Delimited 
File Parsing" <| + Test.specify "should load a simple table with headers" <| + c_1 = ["a", ['1', '4', '7', '10']] + c_2 = ["b", ['2', Nothing, '8', '11']] + c_3 = ["c", [Nothing, '6', '9', '12']] + expected_table = Table.new [c_1, c_2, c_3] + simple_empty = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + simple_empty.should_equal expected_table + + Test.specify "should load a simple table without headers" <| + c_1 = ["Column_1", ['a', '1', '4', '7', '10']] + c_2 = ["Column_2", ['b', '2', Nothing, '8', '11']] + c_3 = ["Column_3", ['c', Nothing, '6', '9', '12']] + expected_table = Table.new [c_1, c_2, c_3] + simple_empty = (File_Format.Delimited "," headers=False).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + simple_empty.should_equal expected_table + + Test.specify "should work in presence of missing headers" <| + table = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "missing_header.csv") Problem_Behavior.Report_Error + table.columns.map .name . should_equal ["a", "Column", "c", "Column_1", "d"] + table.at "a" . to_vector . should_equal ["1"] + table.at "Column" . to_vector . should_equal ["2"] + table.at "c" . to_vector . should_equal ["3"] + table.at "Column_1" . to_vector . should_equal ["4"] + table.at "d" . to_vector . should_equal ["5"] + + Test.specify "load even an empty file" <| + table = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "empty.txt") Problem_Behavior.Report_Error + table.columns.map .name . should_equal [] + table.row_count . 
should_equal 0 + + Test.specify "should correctly handle file opening issues" <| + nonexistent_file = Enso_Project.data / "a_filename_that_does_not_exist.foobar" + r1 = (File_Format.Delimited "," headers=True).read nonexistent_file Problem_Behavior.Report_Error + r1.should_fail_with File.File_Not_Found + + directory = Enso_Project.data + r2 = (File_Format.Delimited "," headers=True).read directory Problem_Behavior.Report_Error + r2.should_fail_with File.Io_Error + + Test.specify "should handle duplicated columns" <| + table = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "duplicated_columns.csv") Problem_Behavior.Report_Error + table.columns.map .name . should_equal ['a', 'b', 'c', 'a_1'] + table.at 'a' . to_vector . should_equal ['1'] + table.at 'a_1' . to_vector . should_equal ['4'] + + Test.specify "should handle quotes" <| + t1 = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "double_quoted.csv") Problem_Behavior.Report_Error + t1.at 'a' . to_vector . should_equal ['a, x', '"a'] + t1.at 'c' . to_vector . should_equal ['3', '"'] + + t2 = (File_Format.Delimited "," headers=True quote_escape="\").read (Enso_Project.data / "escape_quoted.csv") Problem_Behavior.Report_Error + t2.at 'a' . to_vector . should_equal ['a"b', 'a\\\"z'] + + t3 = (File_Format.Delimited "," quote=Nothing headers=True).read (Enso_Project.data / "no_quoting.csv") Problem_Behavior.Report_Error + t3.at 'a' . to_vector . should_equal ['"y'] + t3.at 'b' . to_vector . should_equal ['z"'] + t3.at 'c' . to_vector . should_equal ['a'] + + Test.specify "should support rows spanning multiple lines if quoted" <| + t1 = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "multiline_quoted.csv") Problem_Behavior.Report_Error + t1.at 'a' . to_vector . should_equal ['1', '4'] + t1.at 'b' . to_vector . should_equal ['start\n\ncontinue', '5'] + t1.at 'c' . to_vector . 
should_equal ['3', '6'] + + Test.specify "should behave correctly in presence of a mismatched quote" <| + action_1 on_problems = + (File_Format.Delimited "," headers=True).read (Enso_Project.data / "mismatched_quote.csv") on_problems + + tester_1 table = + table.columns.map .name . should_equal ['a', 'b', 'c'] + table.at 'a' . to_vector . should_equal ['1', 'abc'] + table.at 'b' . to_vector . should_equal ['2', 'def'] + table.at 'c' . to_vector . should_equal ['3', 'g h i\n7,8,9\n'] + problems_1 = [Mismatched_Quote] + Problems.test_problem_handling action_1 problems_1 tester_1 + + action_2 on_problems = + (File_Format.Delimited "," headers=True).read (Enso_Project.data / "mismatched_quote2.csv") on_problems + + tester_2 table = + table.columns.map .name . should_equal ['a', 'b', 'c'] + table.at 'a' . to_vector . should_equal ['1', 'abc'] + table.at 'b' . to_vector . should_equal ['2', 'def,g h i\n7,8,9\n'] + table.at 'c' . to_vector . should_equal ['3', Nothing] + problems_2 = [Invalid_Row 3 1 ['abc', '"def,g h i\n7,8,9\n'], Mismatched_Quote] + Problems.test_problem_handling action_2 problems_2 tester_2 + + Test.specify "should handle too long and too short rows" <| + action keep_invalid_rows on_problems = + (File_Format.Delimited "," headers=True keep_invalid_rows=keep_invalid_rows).read (Enso_Project.data / "varying_rows.csv") on_problems + + tester_kept table = + table.columns.map .name . should_equal ['a', 'b', 'c'] + table.at 'a' . to_vector . should_equal ['1', '1', '1', Nothing, '1', '1'] + table.at 'b' . to_vector . should_equal ['2', '2', '2', Nothing, Nothing, '2'] + table.at 'c' . to_vector . 
should_equal ['3', '3', Nothing, Nothing, Nothing, '3'] + problems_kept = [Invalid_Row 2 0 ['1', '2', '3', '4'], Invalid_Row 4 2 ['1', '2'], Invalid_Row 5 3 [Nothing], Invalid_Row 6 4 ['1'], Invalid_Row 7 5 ['1', '2', '3', '4', '5', '6', '7', '8']] + Problems.test_problem_handling (action keep_invalid_rows=True) problems_kept tester_kept + + tester_dropped table = + table.columns.map .name . should_equal ['a', 'b', 'c'] + table.at 'a' . to_vector . should_equal ['1'] + table.at 'b' . to_vector . should_equal ['2'] + table.at 'c' . to_vector . should_equal ['3'] + problems_dropped = [Invalid_Row 2 Nothing ['1', '2', '3', '4'], Invalid_Row 4 Nothing ['1', '2'], Invalid_Row 5 Nothing [Nothing], Invalid_Row 6 Nothing ['1'], Invalid_Row 7 Nothing ['1', '2', '3', '4', '5', '6', '7', '8']] + Problems.test_problem_handling (action keep_invalid_rows=False) problems_dropped tester_dropped + + Test.specify "should aggregate invalid rows over some limit" <| + action on_problems = + (File_Format.Delimited "," headers=True keep_invalid_rows=False).read (Enso_Project.data / "many_invalid_rows.csv") on_problems + + tester table = + table.columns.map .name . should_equal ['a', 'b', 'c'] + table.at 'a' . to_vector . should_equal ['0', '5'] + table.at 'b' . to_vector . should_equal ['x', 'u'] + table.at 'c' . to_vector . should_equal ['y', 'v'] + problems = [Invalid_Row 3 Nothing ['1'], Invalid_Row 4 Nothing ['2'], Invalid_Row 5 Nothing ['3'], Invalid_Row 6 Nothing ['4'], Invalid_Row 8 Nothing ['6'], Invalid_Row 9 Nothing ['7'], Invalid_Row 10 Nothing ['8'], Invalid_Row 11 Nothing ['9'], Invalid_Row 12 Nothing ['10'], Invalid_Row 13 Nothing ['11'], Additional_Invalid_Rows 3] + Problems.test_problem_handling action problems tester + + Test.specify "should allow to skip rows" <| + t1 = (File_Format.Delimited "," headers=False skip_rows=3).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t1.at "Column_1" . to_vector . 
should_equal ['7', '10'] + + t2 = (File_Format.Delimited "," headers=True skip_rows=3).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t2.columns.map .name . should_equal ['7', '8', '9'] + t2.at "7" . to_vector . should_equal ['10'] + + Test.specify "should allow to set a limit of rows to read" <| + t1 = (File_Format.Delimited "," headers=False row_limit=2).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t1.at "Column_1" . to_vector . should_equal ['a', '1'] + + t2 = (File_Format.Delimited "," headers=True row_limit=2).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t2.at "a" . to_vector . should_equal ['1', '4'] + + t3 = (File_Format.Delimited "," headers=False skip_rows=3 row_limit=1).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t3.at "Column_1" . to_vector . should_equal ['7'] + + t4 = (File_Format.Delimited "," headers=False row_limit=0).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t4.columns.map .name . should_equal ['Column_1', 'Column_2', 'Column_3'] + t4.row_count . should_equal 0 + + t5 = (File_Format.Delimited "," headers=True row_limit=0).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t5.columns.map .name . should_equal ['a', 'b', 'c'] + t5.at 'a' . to_vector . should_equal [] + t5.row_count . should_equal 0 + + t6 = (File_Format.Delimited "," headers=False skip_rows=3 row_limit=1000).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t6.at "Column_1" . to_vector . should_equal ['7', '10'] + + Test.specify "should check arguments" <| + path = (Enso_Project.data / "simple_empty.csv") + pb = Problem_Behavior.Report_Error + (File_Format.Delimited "," headers=False quote='abc').read path pb . should_fail_with Illegal_Argument_Error + (File_Format.Delimited "," headers=False quote='🚧').read path pb . 
should_fail_with Illegal_Argument_Error + (File_Format.Delimited "," headers=False quote_escape='//').read path pb . should_fail_with Illegal_Argument_Error + (File_Format.Delimited 'a\u{301}' headers=False).read path pb . should_fail_with Illegal_Argument_Error + +main = Test.Suite.run_main here.spec diff --git a/test/Table_Tests/src/In_Memory_Tests.enso b/test/Table_Tests/src/In_Memory_Tests.enso index 0d4ab4eba35..8f50f5bc5aa 100644 --- a/test/Table_Tests/src/In_Memory_Tests.enso +++ b/test/Table_Tests/src/In_Memory_Tests.enso @@ -5,6 +5,7 @@ import Standard.Test import project.Model_Spec import project.Column_Spec import project.Csv_Spec +import project.Delimited_Read_Spec import project.Json_Spec import project.Table_Spec import project.Spreadsheet_Spec @@ -14,6 +15,7 @@ import project.Aggregate_Spec in_memory_spec = Column_Spec.spec Csv_Spec.spec + Delimited_Read_Spec.spec Json_Spec.spec Spreadsheet_Spec.spec Table_Spec.spec