Improve support for reading Delimited files (#3424)

Implements https://www.pivotaltracker.com/story/show/181823957
This commit is contained in:
Radosław Waśko 2022-04-29 19:12:19 +02:00 committed by GitHub
parent 96a0c92c8b
commit 8219dca400
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
28 changed files with 803 additions and 15 deletions

View File

@ -114,6 +114,7 @@
- [Improved the `Range` type. Added a `down_to` counterpart to `up_to` and
`with_step` allowing to change the range step.][3408]
- [Aligned `Text.split` API with other methods and added `Text.lines`.][3415]
- [Implemented a basic reader for the `Delimited` file format.][3424]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -175,6 +176,7 @@
[3390]: https://github.com/enso-org/enso/pull/3390
[3408]: https://github.com/enso-org/enso/pull/3408
[3415]: https://github.com/enso-org/enso/pull/3415
[3424]: https://github.com/enso-org/enso/pull/3424
#### Enso Compiler

View File

@ -882,10 +882,16 @@ type Input_Stream
Utility method for running an action with Java exceptions mapping.
handle_java_exceptions file ~action =
Panic.catch IOException handler=(caught_panic-> (Error.throw (Io_Error file "An IO error has occurred: " + caught_panic.payload.cause.getMessage))) <|
Panic.catch AccessDeniedException handler=(_-> (Error.throw (Io_Error file "You do not have permission to access the file"))) <|
Panic.catch NoSuchFileException handler=(_-> (Error.throw (File_Not_Found file))) <|
action
Panic.catch IOException action caught_panic->
here.wrap_io_exception file caught_panic.payload.cause
## PRIVATE
   Converts a Java `IOException` into its Enso counterpart.

   Arguments:
   - file: The file whose access caused the exception; it is included in the
     resulting error payloads to aid error reporting.
   - io_exception: The Java exception instance to translate.
wrap_io_exception file io_exception =
    if Java.is_instance io_exception NoSuchFileException then Error.throw (File_Not_Found file) else
        if Java.is_instance io_exception AccessDeniedException then Error.throw (Io_Error file "You do not have permission to access the file") else
            Error.throw (Io_Error file "An IO error has occurred: "+io_exception.getMessage)
## PRIVATE

View File

@ -94,3 +94,20 @@ type Additional_Warnings (count:Integer)
Additional_Warnings.to_display_text : Text
Additional_Warnings.to_display_text =
"There were "+this.count.to_text+" additional issues."
## Indicates that when loading a delimited file, a row was encountered which had
too many or too few columns.
Only the first 10 rows are reported, any additional ones are aggregated into
a single instance of `Additional_Invalid_Rows`.
type Invalid_Row (source_file_line_number : Integer) (index : Integer | Nothing) (row : [Text])
## Indicates how many additional `Invalid_Row` warnings have been suppressed.
type Additional_Invalid_Rows (count : Integer)
## Indicates that a quote inside of a delimited file cell has been opened but
never closed.
type Mismatched_Quote
## Indicates an unexpected parser error.
type Parser_Error cause

View File

@ -0,0 +1,94 @@
from Standard.Base import all
import Standard.Table
import Standard.Base.Error.Extensions as Errors
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior
from Standard.Table.Error as Table_Errors import Invalid_Row, Mismatched_Quote, Parser_Error, Additional_Invalid_Rows
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding
from Standard.Table.Io.File_Format import Infer
polyglot java import org.enso.table.read.DelimitedReader
polyglot java import org.enso.table.read.ParsingFailedException
polyglot java import org.enso.table.read.InvalidRow
polyglot java import org.enso.table.read.MismatchedQuote
polyglot java import org.enso.table.read.AdditionalInvalidRows
polyglot java import java.lang.IllegalArgumentException
polyglot java import java.io.IOException
polyglot java import com.univocity.parsers.common.TextParsingException
polyglot java import java.io.InputStream
## Reads a delimited file according to the provided format.

   Arguments:
   - format: The specification of the delimited file format.
   - file: The file to read.
   - on_problems: Specifies the behavior when a problem occurs during the
     operation. By default, a warning is issued, but the operation proceeds.
     If set to `Report_Error`, the operation fails with a dataflow error.
     If set to `Ignore`, the operation proceeds without errors or warnings.
read_file : Delimited -> File -> Problem_Behavior -> Any
read_file format file on_problems =
    if format.encoding != Encoding.utf_8 then Errors.unimplemented "Custom encodings when reading Delimited files are not implemented yet." else
        ## We use the default `max_columns` setting. If we want to be able to
           read files with unlimited column limits (risking OutOfMemory
           exceptions), we can catch the exception indicating the limit has been
           reached and restart parsing with an increased limit.
        file.with_input_stream [File.Option.Read] stream->
            stream.with_java_stream java_stream->
                here.read_stream format java_stream on_problems related_file=file
## PRIVATE
   Reads an input stream according to the provided format.

   The `encoding` parameter is ignored, instead the provided stream should
   handle any necessary decoding.

   Arguments:
   - format: The specification of the delimited file format.
   - java_stream: A Java `InputStream` used as the data source.
   - on_problems: Specifies the behavior when a problem occurs during the
     operation. By default, a warning is issued, but the operation proceeds.
     If set to `Report_Error`, the operation fails with a dataflow error.
     If set to `Ignore`, the operation proceeds without errors or warnings.
   - max_columns: Specifies the limit of columns to read. The limit is set to
     avoid `OutOfMemory` errors on malformed files. It must be a positive
     integer.
   - related_file: The file related to the provided `java_stream`, if available,
     or `Nothing`. It is used for more detailed error reporting.
read_stream : Delimited -> InputStream -> Problem_Behavior -> Integer -> File | Nothing -> Any
read_stream format java_stream on_problems max_columns=4096 related_file=Nothing =
    java_headers = case format.headers of
        True -> DelimitedReader.HeaderBehavior.USE_FIRST_ROW_AS_HEADERS
        Infer -> Errors.unimplemented "Inferring headers is not implemented yet."
        False -> DelimitedReader.HeaderBehavior.GENERATE_HEADERS
    skip_rows = case format.skip_rows of
        Nothing -> 0
        Integer -> format.skip_rows
        _ -> Error.throw (Illegal_Argument_Error "`skip_rows` should be Integer or Nothing.")
    ## A negative limit tells the Java reader to read all rows.
    row_limit = case format.row_limit of
        Nothing -> -1
        Integer -> format.row_limit
        _ -> Error.throw (Illegal_Argument_Error "`row_limit` should be Integer or Nothing.")
    if format.parse_values then Errors.unimplemented "Parsing values is not implemented yet." else
        translate_illegal_argument caught_panic =
            Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage)
        ## Translates a Java parsing problem into its Enso counterpart.
           Unknown problem types are passed through unchanged.
        translate_problem java_problem =
            if Java.is_instance java_problem InvalidRow then Invalid_Row java_problem.source_row java_problem.table_index (Vector.Vector java_problem.row) else
                if Java.is_instance java_problem MismatchedQuote then Mismatched_Quote else
                    if Java.is_instance java_problem AdditionalInvalidRows then Additional_Invalid_Rows java_problem.count else
                        java_problem
        translate_parsing_failure caught_panic =
            Error.throw (translate_problem caught_panic.payload.cause.problem)
        ## An underlying `IOException` is reported as an `Io_Error`; any other
           parser failure becomes a `Parser_Error`.
        translate_parsing_exception caught_panic =
            cause = caught_panic.payload.cause.getCause
            if Java.is_instance cause IOException then File.wrap_io_exception related_file cause else
                Error.throw (Parser_Error caught_panic.payload)
        Panic.catch IllegalArgumentException handler=translate_illegal_argument <|
            Panic.catch ParsingFailedException handler=translate_parsing_failure <|
                Panic.catch TextParsingException handler=translate_parsing_exception <|
                    warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error
                    reader = DelimitedReader.new java_stream format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns format.keep_invalid_rows warnings_as_errors
                    result = Table.Table reader.read
                    problems = Vector.Vector reader.getReportedProblems . map translate_problem
                    on_problems.attach_problems_after result problems

View File

@ -64,19 +64,18 @@ from_csv : File.File | Text -> Boolean -> Text -> Table ! Parse_Error
from_csv csv has_header=True prefix='C' =
parser_inst = Parser.create has_header prefix
handle_error error = case error of
Polyglot_Error err -> Error.throw (Parse_Error err.getMessage)
_ -> Panic.throw error
handle_error caught_panic =
Parse_Error caught_panic.payload.cause.getMessage
case csv of
Text ->
input_stream = ByteArrayInputStream.new csv.utf_8.to_array
Panic.recover Any Table.Table (parser_inst.parse input_stream) . catch handle_error
Panic.catch Polyglot_Error (Table.Table (parser_inst.parse input_stream)) handle_error
File.File _ ->
maybe_err = Panic.recover Any <| csv.with_input_stream [File.Option.Read] stream->
stream.with_java_stream java_stream->
Table.Table (parser_inst.parse java_stream)
maybe_err.catch handle_error
Panic.catch Polyglot_Error handler=handle_error <|
csv.with_input_stream [File.Option.Read] stream->
stream.with_java_stream java_stream->
Table.Table (parser_inst.parse java_stream)
_ ->
found_type_name = Meta.get_qualified_type_name csv
file_name = Meta.get_qualified_type_name File.File

View File

@ -1,6 +1,10 @@
from Standard.Base import all
import Standard.Table
import Standard.Base.Error.Extensions as Errors
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding
import Standard.Table.Internal.Delimited_Reader
## This type needs to be here to allow for the usage of Standard.Table
functions. Ideally, it would be an interface within Standard.Base and
@ -19,6 +23,8 @@ type Auto
output = Ref.new File_Format.Bytes
if ".txt".equals_ignore_case extension then Ref.put output File_Format.Text
if ".log".equals_ignore_case extension then Ref.put output File_Format.Text
if ".csv".equals_ignore_case extension then Ref.put output (File_Format.Delimited ',')
if ".tsv".equals_ignore_case extension then Ref.put output (File_Format.Delimited '\t')
Ref.get output
@ -45,3 +51,64 @@ type Text
read : File -> Problem_Behavior -> Any
read file on_problems =
file.read_text this.encoding on_problems
## Read delimited files such as CSVs into a Table.
type Delimited
    ## Read delimited files such as CSVs into a Table.

       If a row does not match the first row's column count, the function raises
       an `Invalid_Row`. If a quote is opened and never closed, a
       `Mismatched_Quote` warning occurs.

       Arguments:
       - delimiter: The delimiter character to split the file into columns. An
         `Illegal_Argument_Error` error is returned if this is an empty string.
       - encoding: The encoding to use when reading the file.
       - quote: The quote character denotes the start and end of a quoted value.
         No quote character is used if set to `Nothing`. Quoted items are not
         split on the delimiter and can also contain newlines. Within a quoted
         value, two consecutive quote characters are interpreted as an instance
         of the quote character. Empty input strings must be quoted (e.g. "") as
         otherwise an empty value is treated as `Nothing`.
       - quote_escape: The character to escape the quote character in a quoted
         value. For example, if both `quote` and `quote_escape` are set to `"`,
         then escaping quotes is done by double quotes: `"ab""cd"` will yield
         the text `ab"cd"`. Another popular choice for `quote_escape` is the `\`
         character. Then `"ab\"cd"` will yield the same text.
       - headers: If set to `True`, the first row is used as column names. If
         set to `False`, the column names are generated by adding increasing
         numeric suffixes to the base name `Column` (i.e. `Column_1`,
         `Column_2` etc.). If set to `Infer`, the process tries to infer if
         headers are present on the first row (`Infer` is not implemented yet).
         If the column names are not unique, numeric suffixes will be appended
         to disambiguate them.
       - parse_values: The output columns are parsed using the default `Parser`
         if 'True'. If more control over parsing is needed, the
         `Table.parse_values` method allows full specifications of the parser
         options.
       - skip_rows: The number of rows to skip from the top of the file.
       - row_limit: The maximum number of rows to read from the file. This count
         does not include the header row (if applicable).
       - keep_invalid_rows: Specifies whether rows that contain less or more
         columns than expected should be kept (setting the missing columns to
         `Nothing` or dropping the excess columns) or dropped.

       TODO [RW] The default for `headers` is temporarily changed to `False`,
       because `Infer` is not supported. It should be changed to be the default
       value once the corresponding task is implemented:
       https://www.pivotaltracker.com/story/show/181986831

       TODO [RW] The default for `parse_values` is temporarily changed to
       `False`, because this feature is not yet implemented. It should be
       changed to `True` once the related task is implemented:
       https://www.pivotaltracker.com/story/show/181824146
    type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (headers:True|False|Infer=False) (parse_values:Boolean=False) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) (keep_invalid_rows:Boolean=True)

    ## Implements the `File.read` for this `File_Format`
    read : File -> Problem_Behavior -> Any
    read file on_problems =
        Delimited_Reader.read_file this file on_problems
## A setting to infer the default behaviour of some option.

   For example, the `Delimited` format accepts `Infer` for its `headers`
   option, meaning that the presence of headers should be detected from the
   first row of the file (detection is not implemented yet).
type Infer

View File

@ -165,7 +165,7 @@ fail message =
Any.should_fail_with : Any -> Integer -> Assertion
Any.should_fail_with matcher frames_to_skip=0 =
loc = Meta.get_source_location 1+frames_to_skip
here.fail ("Expected an error " + matcher.to_text + " but none occurred (at " + loc + ").")
here.fail ("Expected an error " + matcher.to_text + " but no error occurred, instead got: " + this.to_text + " (at " + loc + ").")
## Expect a function to fail with the provided dataflow error.

View File

@ -0,0 +1,10 @@
package org.enso.table.read;
/** A problem which indicates how many additional invalid rows were encountered. */
public class AdditionalInvalidRows implements ParsingProblem {
  /** The number of invalid rows that were suppressed after the reporting limit was reached. */
  public final long count;

  public AdditionalInvalidRows(long count) {
    this.count = count;
  }
}

View File

@ -0,0 +1,332 @@
package org.enso.table.read;
import com.univocity.parsers.csv.CsvFormat;
import com.univocity.parsers.csv.CsvParser;
import com.univocity.parsers.csv.CsvParserSettings;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.enso.table.data.column.builder.string.StorageBuilder;
import org.enso.table.data.column.builder.string.StringStorageBuilder;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.index.DefaultIndex;
import org.enso.table.data.table.Column;
import org.enso.table.data.table.Table;
import org.enso.table.util.NameDeduplicator;
/** A helper for reading delimited (CSV-like) files. */
public class DelimitedReader {

  /** Specifies how to set the headers for the returned table. */
  public enum HeaderBehavior {
    /** Tries to infer if the headers are present in the file. */
    INFER,

    /** Uses the first row in the file as headers. Duplicate names will be appended suffixes. */
    USE_FIRST_ROW_AS_HEADERS,

    /**
     * Treats the first row as data and generates header names starting with {@code COLUMN_NAME}.
     */
    GENERATE_HEADERS
  }

  /** Base name used for generated headers and as a fallback name for missing header cells. */
  private static final String COLUMN_NAME = "Column";

  private final char delimiter;
  private final char quoteCharacter;
  private final char quoteEscapeCharacter;
  private final HeaderBehavior headerBehavior;
  private final long skipRows;
  // A negative value means that all rows should be read (see the loop in `read`).
  private final long rowLimit;
  private final int maxColumns;
  // Problems gathered so far; only populated when `warningsAsErrors` is disabled.
  private final List<ParsingProblem> warnings = new ArrayList<>();
  private final CsvParser parser;
  private final boolean keepInvalidRows;
  private final boolean warningsAsErrors;

  // Sentinel character used to signal that quoting (or quote escaping) is disabled.
  private static final char noQuoteCharacter = '\0';

  /**
   * Creates a new reader.
   *
   * @param inputStream the stream to read from
   * @param delimiter the delimiter, should be a single character, but is a String for proper
   *     interoperability with Enso; if a string that does not fit in a single character is
   *     provided, an exception is raised
   * @param quote the quote character to use, should be a single character or {@code null}, but is a
   *     String for proper interoperability with Enso; if a string that does not fit in a single
   *     character is provided, an exception is raised
   * @param quoteEscape the quote escape character to use, should be a single character or {@code
   *     null}, but is a String for proper interoperability with Enso; if a string that does not
   *     fit in a single character is provided, an exception is raised
   * @param headerBehavior specifies how to set the header for the resulting table
   * @param skipRows specifies how many rows from the input to skip
   * @param rowLimit specifies how many rows to read (does not include the header row)
   * @param maxColumns specifies how many columns can be expected at most
   * @param keepInvalidRows specifies whether to keep rows that had an unexpected number of columns
   * @param warningsAsErrors specifies if the first warning should be immediately raised as an error
   *     (used as a fast-path for the error-reporting mode to avoid computing a value that is going
   *     to be discarded anyway)
   */
  public DelimitedReader(
      InputStream inputStream,
      String delimiter,
      String quote,
      String quoteEscape,
      HeaderBehavior headerBehavior,
      long skipRows,
      long rowLimit,
      int maxColumns,
      boolean keepInvalidRows,
      boolean warningsAsErrors) {
    if (delimiter.isEmpty()) {
      throw new IllegalArgumentException("Empty delimiters are not supported.");
    }
    if (delimiter.length() > 1) {
      throw new IllegalArgumentException(
          "Delimiters consisting of multiple characters or code units are not supported.");
    }

    this.delimiter = delimiter.charAt(0);

    if (quote != null) {
      if (quote.isEmpty()) {
        throw new IllegalArgumentException(
            "Empty quotes are not supported. Set the quote to `Nothing` to disable quotes.");
      }
      if (quote.length() > 1) {
        throw new IllegalArgumentException(
            "Quotes consisting of multiple characters or code units are not supported.");
      }

      quoteCharacter = quote.charAt(0);
      // The sentinel value is reserved for "no quoting", so it cannot be used as a quote itself.
      if (quoteCharacter == noQuoteCharacter) {
        throw new IllegalArgumentException("Illegal quote character.");
      }
    } else {
      quoteCharacter = noQuoteCharacter;
    }

    if (quoteEscape != null) {
      if (quoteEscape.isEmpty()) {
        throw new IllegalArgumentException(
            "Empty quote escapes are not supported. Set the escape to `Nothing` to disable escaping quotes.");
      }
      if (quoteEscape.length() > 1) {
        throw new IllegalArgumentException(
            "Quote escapes consisting of multiple characters or code units are not supported.");
      }

      quoteEscapeCharacter = quoteEscape.charAt(0);
    } else {
      quoteEscapeCharacter = noQuoteCharacter;
    }

    this.headerBehavior = headerBehavior;
    this.skipRows = skipRows;
    this.rowLimit = rowLimit;
    this.maxColumns = maxColumns;
    this.keepInvalidRows = keepInvalidRows;
    this.warningsAsErrors = warningsAsErrors;

    parser = setupCsvParser(inputStream);
  }

  /** Creates a {@code CsvParser} according to the settings specified at construction. */
  private CsvParser setupCsvParser(InputStream inputStream) {
    CsvParserSettings settings = new CsvParserSettings();
    settings.setHeaderExtractionEnabled(false);
    CsvFormat format = new CsvFormat();
    format.setDelimiter(delimiter);
    format.setQuote(quoteCharacter);
    format.setQuoteEscape(quoteEscapeCharacter);
    settings.setFormat(format);
    // NOTE(review): -1 presumably disables the per-cell character limit — confirm against the
    // univocity documentation.
    settings.setMaxCharsPerColumn(-1);
    settings.setMaxColumns(maxColumns);
    settings.setSkipEmptyLines(false);
    // Quotes are kept in the raw cells so that `parseCell`/`stripQuotes` can handle them manually.
    settings.setKeepQuotes(true);
    CsvParser parser = new CsvParser(settings);
    parser.beginParsing(inputStream);
    return parser;
  }

  /** Parses a cell, removing surrounding quotes (if applicable). */
  private String parseCell(String cell) {
    if (cell == null) return null;
    if (cell.isEmpty()) return cell;
    if (cell.charAt(0) == quoteCharacter) {
      return stripQuotes(cell);
    }

    return cell;
  }

  /** Parses a header cell, removing surrounding quotes (if applicable). */
  private String parseHeader(String cell) {
    // A missing header cell gets the fallback base name (deduplication may add a suffix later).
    if (cell == null) return COLUMN_NAME;
    if (cell.isEmpty()) return cell;
    if (cell.charAt(0) == quoteCharacter) {
      return stripQuotes(cell);
    }

    return cell;
  }

  /**
   * If the first character of a string is a quote, will remove the surrounding quotes.
   *
   * <p>If the first character of a string is a quote but the last one is not, mismatched quote
   * problem is reported.
   */
  private String stripQuotes(String cell) {
    assert cell.charAt(0) == quoteCharacter;

    if (cell.length() < 2 || cell.charAt(cell.length() - 1) != quoteCharacter) {
      reportMismatchedQuote();
      // Only the opening quote is removed, the rest of the cell is kept as-is.
      return cell.substring(1);
    } else {
      // Strip quotes.
      return cell.substring(1, cell.length() - 1);
    }
  }

  /** Reports that a quote has been opened but never closed. */
  private void reportMismatchedQuote() {
    reportProblem(new MismatchedQuote());
  }

  // Counts every invalid row encountered, including ones beyond the reporting limit.
  private long invalidRowsCount = 0;

  // At most this many `InvalidRow` problems are reported individually; the rest are aggregated
  // into a single `AdditionalInvalidRows` problem (see `getReportedProblems`).
  private static final long invalidRowsLimit = 10;

  /**
   * Reports a row with an unexpected column count, unless the reporting limit has been reached.
   *
   * @param source_row the line in the input file at which the row started
   * @param table_index the index the row will have in the resulting table, or {@code null} if the
   *     row is being dropped
   * @param row the raw cells of the offending row
   */
  private void reportInvalidRow(long source_row, Long table_index, String[] row) {
    if (invalidRowsCount < invalidRowsLimit) {
      reportProblem(new InvalidRow(source_row, table_index, row));
    }

    invalidRowsCount++;
  }

  /** Returns a list of currently reported problems encountered when parsing the input. */
  public List<ParsingProblem> getReportedProblems() {
    List<ParsingProblem> result = new ArrayList<>(warnings);
    if (invalidRowsCount > invalidRowsLimit) {
      long additionalInvalidRows = invalidRowsCount - invalidRowsLimit;
      result.add(new AdditionalInvalidRows(additionalInvalidRows));
    }
    return result;
  }

  /**
   * Records a problem: either raised immediately as a {@link ParsingFailedException} (when
   * {@code warningsAsErrors} is set) or stored to be returned by {@link #getReportedProblems}.
   */
  private void reportProblem(ParsingProblem problem) {
    if (warningsAsErrors) {
      throw new ParsingFailedException(problem);
    } else {
      warnings.add(problem);
    }
  }

  // Index that the next appended row will have in the output table.
  private long target_table_index = 0;

  /** The line number of the start of the current row in the input file. */
  private long current_line = 0;

  /**
   * Reads the next row and updates the current line accordingly.
   *
   * <p>Will return {@code null} if no more rows are available.
   */
  private String[] nextRow() {
    // NOTE(review): this assumes the next row starts on the line right after the parser's current
    // line; rows containing multi-line quoted cells may make this approximate — confirm.
    current_line = parser.getContext().currentLine() + 1;
    return parser.parseNext();
  }

  /** Reads the input stream and returns a Table. */
  public Table read() {
    List<String> headerNames;
    String[] currentRow = nextRow();

    // Skip the first N rows.
    for (long i = 0; currentRow != null && i < skipRows; ++i) {
      currentRow = nextRow();
    }

    // If there are no rows to even infer the headers, we return an empty table.
    if (currentRow == null) {
      return new Table(new Column[0]);
    }

    switch (headerBehavior) {
      case INFER:
        throw new IllegalStateException("Inferring headers is not yet implemented");
      case USE_FIRST_ROW_AS_HEADERS:
        List<String> preprocessedHeaders =
            Arrays.stream(currentRow).map(this::parseHeader).collect(Collectors.toList());
        headerNames = NameDeduplicator.deduplicate(preprocessedHeaders, "_");
        // We have 'used up' the first row, so we load a next one.
        currentRow = nextRow();
        break;
      case GENERATE_HEADERS:
        // Generated names are `Column_1`, `Column_2`, ...
        headerNames = new ArrayList<>(currentRow.length);
        for (int i = 0; i < currentRow.length; ++i) {
          headerNames.add(COLUMN_NAME + "_" + (i + 1));
        }
        break;
      default:
        throw new IllegalStateException("Impossible branch.");
    }

    StorageBuilder[] builders = initBuilders(headerNames.size());

    // A negative `rowLimit` disables the limit.
    while (currentRow != null && (rowLimit < 0 || target_table_index < rowLimit)) {
      if (currentRow.length != builders.length) {
        reportInvalidRow(current_line, keepInvalidRows ? target_table_index : null, currentRow);

        if (keepInvalidRows) {
          for (int i = 0; i < builders.length && i < currentRow.length; i++) {
            String item = parseCell(currentRow[i]);
            builders[i] = builders[i].parseAndAppend(item);
          }

          // If the current row had less columns than expected, nulls are inserted for the missing
          // values.
          // If it had more columns, the excess columns are discarded.
          for (int i = currentRow.length; i < builders.length; i++) {
            builders[i] = builders[i].parseAndAppend(null);
          }

          target_table_index++;
        }
      } else {
        for (int i = 0; i < builders.length; i++) {
          String item = parseCell(currentRow[i]);
          builders[i] = builders[i].parseAndAppend(item);
        }

        target_table_index++;
      }

      currentRow = nextRow();
    }

    parser.stopParsing();

    Column[] columns = new Column[builders.length];
    for (int i = 0; i < builders.length; i++) {
      Storage col = builders[i].seal();
      columns[i] = new Column(headerNames.get(i), new DefaultIndex(col.size()), col);
    }
    return new Table(columns);
  }

  /** Creates one string storage builder per column. */
  private StorageBuilder[] initBuilders(int count) {
    StorageBuilder[] res = new StorageBuilder[count];
    for (int i = 0; i < count; i++) {
      res[i] = new StringStorageBuilder();
    }
    return res;
  }
}

View File

@ -0,0 +1,14 @@
package org.enso.table.read;
/** A problem indicating that a row contained more or less columns than expected. */
public class InvalidRow implements ParsingProblem {
  // Field names use snake_case because they are accessed directly from Enso code
  // (e.g. `java_problem.source_row`).

  /** The line in the source file at which the invalid row started (as reported by the parser). */
  public final long source_row;

  /** The index the row has in the resulting table, or {@code null} if the row was dropped. */
  public final Long table_index;

  /** The raw cells of the offending row. */
  public final String[] row;

  public InvalidRow(long source_row, Long table_index, String[] row) {
    this.source_row = source_row;
    this.table_index = table_index;
    this.row = row;
  }
}

View File

@ -0,0 +1,4 @@
package org.enso.table.read;
/**
 * A problem indicating that a quote has been opened and never closed.
 *
 * <p>This is a stateless marker problem; it carries no additional data.
 */
public class MismatchedQuote implements ParsingProblem {}

View File

@ -0,0 +1,13 @@
package org.enso.table.read;
/**
 * An exception thrown when a problem occurred during parsing and the parser is running in a mode
 * that does not try recovering, so the parsing is stopped.
 */
public class ParsingFailedException extends RuntimeException {
  /** The problem that caused the parsing to be stopped. */
  public final ParsingProblem problem;

  /**
   * Creates the exception.
   *
   * @param problem the problem that stopped the parsing; its class name is used to build the
   *     exception message
   */
  public ParsingFailedException(ParsingProblem problem) {
    // Provide a message so that uncaught stack traces are informative (previously
    // getMessage() returned null).
    super(
        "Parsing failed"
            + (problem == null ? "." : ": " + problem.getClass().getSimpleName()));
    this.problem = problem;
  }
}

View File

@ -0,0 +1,7 @@
package org.enso.table.read;
/**
 * A marker interface for parsing problems which may be reported as warnings or errors, depending
 * on the setup.
 */
public interface ParsingProblem {}

View File

@ -0,0 +1,3 @@
a,"b",c
"a, x",2,3
"""a",4,""""
1 a b c
2 a, x 2 3
3 "a 4 "

View File

@ -0,0 +1,2 @@
a,b,c,a
1,2,3,4
1 a b c a
2 1 2 3 4

View File

View File

@ -0,0 +1,3 @@
a,b,c
"a\"b",2,3
"a\\\"z",4,5
Can't render this file because it contains an unexpected character in line 2 and column 4.

View File

@ -0,0 +1,16 @@
a,b,c
0,x,y
1
2
3
4
5,u,v
6
7
8
9
10
11
12
13
14
1 a,b,c
2 0,x,y
3 1
4 2
5 3
6 4
7 5,u,v
8 6
9 7
10 8
11 9
12 10
13 11
14 12
15 13
16 14

View File

@ -0,0 +1,4 @@
a,b,c
1,2,3
abc,"def","g h i
7,8,9
Can't render this file because it contains an unexpected character in line 4 and column 7.

View File

@ -0,0 +1,4 @@
a,b,c
1,2,3
abc,"def,g h i
7,8,9
Can't render this file because it contains an unexpected character in line 4 and column 7.

View File

@ -0,0 +1,2 @@
a,,c,,d
1,2,3,4,5
1 a c d
2 1 2 3 4 5

View File

@ -0,0 +1,5 @@
a,b,c
1,"start
continue",3
4,5,6
1 a b c
2 1 start continue 3
3 4 5 6

View File

@ -0,0 +1,2 @@
a,b,c
"y,z",a
Can't render this file because it has a wrong number of fields in line 2.

View File

@ -1,5 +1,5 @@
a,b,c
a,b,"c"
1,2,
4,,6
"4",,6
7,8,9
10,11,12

1 a b c
2 1 2
3 4 6
4 7 8 9
5 10 11 12

View File

@ -0,0 +1,7 @@
a,b,c
1,2,3,4
1,2,3
1,2
1
1,2,3,4,5,6,7,8
1 a,b,c
2 1,2,3,4
3 1,2,3
4 1,2
5 1
6 1,2,3,4,5,6,7,8

View File

@ -172,3 +172,5 @@ spec =
out_1.delete_if_exists
out_2.delete_if_exists
out_3.delete_if_exists
main = Test.Suite.run_main here.spec

View File

@ -0,0 +1,171 @@
from Standard.Base import all
import Standard.Table
import Standard.Table.Data.Column
from Standard.Table.Error import all
import Standard.Table.Io.File_Format
import Standard.Base.Error.Problem_Behavior
import Standard.Test
import Standard.Test.Problems
import project.Util
spec =
Test.group "Delimited File Parsing" <|
Test.specify "should load a simple table with headers" <|
c_1 = ["a", ['1', '4', '7', '10']]
c_2 = ["b", ['2', Nothing, '8', '11']]
c_3 = ["c", [Nothing, '6', '9', '12']]
expected_table = Table.new [c_1, c_2, c_3]
simple_empty = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error
simple_empty.should_equal expected_table
Test.specify "should load a simple table without headers" <|
c_1 = ["Column_1", ['a', '1', '4', '7', '10']]
c_2 = ["Column_2", ['b', '2', Nothing, '8', '11']]
c_3 = ["Column_3", ['c', Nothing, '6', '9', '12']]
expected_table = Table.new [c_1, c_2, c_3]
simple_empty = (File_Format.Delimited "," headers=False).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error
simple_empty.should_equal expected_table
Test.specify "should work in presence of missing headers" <|
table = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "missing_header.csv") Problem_Behavior.Report_Error
table.columns.map .name . should_equal ["a", "Column", "c", "Column_1", "d"]
table.at "a" . to_vector . should_equal ["1"]
table.at "Column" . to_vector . should_equal ["2"]
table.at "c" . to_vector . should_equal ["3"]
table.at "Column_1" . to_vector . should_equal ["4"]
table.at "d" . to_vector . should_equal ["5"]
Test.specify "load even an empty file" <|
table = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "empty.txt") Problem_Behavior.Report_Error
table.columns.map .name . should_equal []
table.row_count . should_equal 0
Test.specify "should correctly handle file opening issues" <|
nonexistent_file = Enso_Project.data / "a_filename_that_does_not_exist.foobar"
r1 = (File_Format.Delimited "," headers=True).read nonexistent_file Problem_Behavior.Report_Error
r1.should_fail_with File.File_Not_Found
directory = Enso_Project.data
r2 = (File_Format.Delimited "," headers=True).read directory Problem_Behavior.Report_Error
r2.should_fail_with File.Io_Error
Test.specify "should handle duplicated columns" <|
table = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "duplicated_columns.csv") Problem_Behavior.Report_Error
table.columns.map .name . should_equal ['a', 'b', 'c', 'a_1']
table.at 'a' . to_vector . should_equal ['1']
table.at 'a_1' . to_vector . should_equal ['4']
Test.specify "should handle quotes" <|
t1 = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "double_quoted.csv") Problem_Behavior.Report_Error
t1.at 'a' . to_vector . should_equal ['a, x', '"a']
t1.at 'c' . to_vector . should_equal ['3', '"']
t2 = (File_Format.Delimited "," headers=True quote_escape="\").read (Enso_Project.data / "escape_quoted.csv") Problem_Behavior.Report_Error
t2.at 'a' . to_vector . should_equal ['a"b', 'a\\\"z']
t3 = (File_Format.Delimited "," quote=Nothing headers=True).read (Enso_Project.data / "no_quoting.csv") Problem_Behavior.Report_Error
t3.at 'a' . to_vector . should_equal ['"y']
t3.at 'b' . to_vector . should_equal ['z"']
t3.at 'c' . to_vector . should_equal ['a']
Test.specify "should support rows spanning multiple lines if quoted" <|
t1 = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "multiline_quoted.csv") Problem_Behavior.Report_Error
t1.at 'a' . to_vector . should_equal ['1', '4']
t1.at 'b' . to_vector . should_equal ['start\n\ncontinue', '5']
t1.at 'c' . to_vector . should_equal ['3', '6']
Test.specify "should behave correctly in presence of a mismatched quote" <|
action_1 on_problems =
(File_Format.Delimited "," headers=True).read (Enso_Project.data / "mismatched_quote.csv") on_problems
tester_1 table =
table.columns.map .name . should_equal ['a', 'b', 'c']
table.at 'a' . to_vector . should_equal ['1', 'abc']
table.at 'b' . to_vector . should_equal ['2', 'def']
table.at 'c' . to_vector . should_equal ['3', 'g h i\n7,8,9\n']
problems_1 = [Mismatched_Quote]
Problems.test_problem_handling action_1 problems_1 tester_1
action_2 on_problems =
(File_Format.Delimited "," headers=True).read (Enso_Project.data / "mismatched_quote2.csv") on_problems
tester_2 table =
table.columns.map .name . should_equal ['a', 'b', 'c']
table.at 'a' . to_vector . should_equal ['1', 'abc']
table.at 'b' . to_vector . should_equal ['2', 'def,g h i\n7,8,9\n']
table.at 'c' . to_vector . should_equal ['3', Nothing]
problems_2 = [Invalid_Row 3 1 ['abc', '"def,g h i\n7,8,9\n'], Mismatched_Quote]
Problems.test_problem_handling action_2 problems_2 tester_2
Test.specify "should handle too long and too short rows" <|
    # Rows whose cell count differs from the header are either padded /
    # truncated (keep_invalid_rows=True) or dropped entirely
    # (keep_invalid_rows=False); either way an `Invalid_Row` is reported.
    read_varying_rows keep_invalid_rows on_problems =
        (File_Format.Delimited "," headers=True keep_invalid_rows=keep_invalid_rows).read (Enso_Project.data / "varying_rows.csv") on_problems
    check_kept table =
        table.columns.map .name . should_equal ['a', 'b', 'c']
        table.at 'a' . to_vector . should_equal ['1', '1', '1', Nothing, '1', '1']
        table.at 'b' . to_vector . should_equal ['2', '2', '2', Nothing, Nothing, '2']
        table.at 'c' . to_vector . should_equal ['3', '3', Nothing, Nothing, Nothing, '3']
    expected_kept = [Invalid_Row 2 0 ['1', '2', '3', '4'], Invalid_Row 4 2 ['1', '2'], Invalid_Row 5 3 [Nothing], Invalid_Row 6 4 ['1'], Invalid_Row 7 5 ['1', '2', '3', '4', '5', '6', '7', '8']]
    Problems.test_problem_handling (read_varying_rows keep_invalid_rows=True) expected_kept check_kept
    # When invalid rows are dropped, the problems carry `Nothing` in place
    # of the row index, since the rows never make it into the table.
    check_dropped table =
        table.columns.map .name . should_equal ['a', 'b', 'c']
        table.at 'a' . to_vector . should_equal ['1']
        table.at 'b' . to_vector . should_equal ['2']
        table.at 'c' . to_vector . should_equal ['3']
    expected_dropped = [Invalid_Row 2 Nothing ['1', '2', '3', '4'], Invalid_Row 4 Nothing ['1', '2'], Invalid_Row 5 Nothing [Nothing], Invalid_Row 6 Nothing ['1'], Invalid_Row 7 Nothing ['1', '2', '3', '4', '5', '6', '7', '8']]
    Problems.test_problem_handling (read_varying_rows keep_invalid_rows=False) expected_dropped check_dropped
Test.specify "should aggregate invalid rows over some limit" <|
    # Only a fixed number of `Invalid_Row` problems are reported
    # individually; the remainder is summarized in a single
    # `Additional_Invalid_Rows` entry carrying the leftover count.
    read_file on_problems =
        (File_Format.Delimited "," headers=True keep_invalid_rows=False).read (Enso_Project.data / "many_invalid_rows.csv") on_problems
    check_result table =
        table.columns.map .name . should_equal ['a', 'b', 'c']
        table.at 'a' . to_vector . should_equal ['0', '5']
        table.at 'b' . to_vector . should_equal ['x', 'u']
        table.at 'c' . to_vector . should_equal ['y', 'v']
    expected_problems = [Invalid_Row 3 Nothing ['1'], Invalid_Row 4 Nothing ['2'], Invalid_Row 5 Nothing ['3'], Invalid_Row 6 Nothing ['4'], Invalid_Row 8 Nothing ['6'], Invalid_Row 9 Nothing ['7'], Invalid_Row 10 Nothing ['8'], Invalid_Row 11 Nothing ['9'], Invalid_Row 12 Nothing ['10'], Invalid_Row 13 Nothing ['11'], Additional_Invalid_Rows 3]
    Problems.test_problem_handling read_file expected_problems check_result
Test.specify "should allow to skip rows" <|
    # `skip_rows` discards lines from the start of the file. With
    # headers enabled, the first row after the skipped ones is used as
    # the header row.
    file = Enso_Project.data / "simple_empty.csv"
    table_1 = (File_Format.Delimited "," headers=False skip_rows=3).read file Problem_Behavior.Report_Error
    table_1.at "Column_1" . to_vector . should_equal ['7', '10']
    table_2 = (File_Format.Delimited "," headers=True skip_rows=3).read file Problem_Behavior.Report_Error
    table_2.columns.map .name . should_equal ['7', '8', '9']
    table_2.at "7" . to_vector . should_equal ['10']
Test.specify "should allow to set a limit of rows to read" <|
    # `row_limit` caps the number of data rows; a limit of 0 still
    # yields the correct column structure, and a limit larger than the
    # file reads everything that is available.
    file = Enso_Project.data / "simple_empty.csv"
    table_1 = (File_Format.Delimited "," headers=False row_limit=2).read file Problem_Behavior.Report_Error
    table_1.at "Column_1" . to_vector . should_equal ['a', '1']
    table_2 = (File_Format.Delimited "," headers=True row_limit=2).read file Problem_Behavior.Report_Error
    table_2.at "a" . to_vector . should_equal ['1', '4']
    # The limit is applied after skipping, so it counts remaining rows.
    table_3 = (File_Format.Delimited "," headers=False skip_rows=3 row_limit=1).read file Problem_Behavior.Report_Error
    table_3.at "Column_1" . to_vector . should_equal ['7']
    table_4 = (File_Format.Delimited "," headers=False row_limit=0).read file Problem_Behavior.Report_Error
    table_4.columns.map .name . should_equal ['Column_1', 'Column_2', 'Column_3']
    table_4.row_count . should_equal 0
    table_5 = (File_Format.Delimited "," headers=True row_limit=0).read file Problem_Behavior.Report_Error
    table_5.columns.map .name . should_equal ['a', 'b', 'c']
    table_5.at 'a' . to_vector . should_equal []
    table_5.row_count . should_equal 0
    table_6 = (File_Format.Delimited "," headers=False skip_rows=3 row_limit=1000).read file Problem_Behavior.Report_Error
    table_6.at "Column_1" . to_vector . should_equal ['7', '10']
Test.specify "should check arguments" <|
    # Invalid format settings — a quote or escape that is not a single
    # character, or a delimiter that is not a valid single grapheme —
    # should be rejected with `Illegal_Argument_Error`.
    file_path = (Enso_Project.data / "simple_empty.csv")
    expect_invalid format =
        format.read file_path Problem_Behavior.Report_Error . should_fail_with Illegal_Argument_Error
    expect_invalid (File_Format.Delimited "," headers=False quote='abc')
    expect_invalid (File_Format.Delimited "," headers=False quote='🚧')
    expect_invalid (File_Format.Delimited "," headers=False quote_escape='//')
    expect_invalid (File_Format.Delimited 'a\u{301}' headers=False)
# Entry point allowing this spec file to be run as a standalone program.
main = Test.Suite.run_main here.spec

View File

@ -5,6 +5,7 @@ import Standard.Test
import project.Model_Spec
import project.Column_Spec
import project.Csv_Spec
import project.Delimited_Read_Spec
import project.Json_Spec
import project.Table_Spec
import project.Spreadsheet_Spec
@ -14,6 +15,7 @@ import project.Aggregate_Spec
in_memory_spec =
Column_Spec.spec
Csv_Spec.spec
Delimited_Read_Spec.spec
Json_Spec.spec
Spreadsheet_Spec.spec
Table_Spec.spec