The user should be able to have the headers inferred when reading a Delimited file (#3472)

Implements https://www.pivotaltracker.com/story/show/181986831
Radosław Waśko 2022-05-25 15:29:17 +02:00 committed by GitHub
parent 42d82bd8b7
commit 7f572bf3e4
27 changed files with 401 additions and 184 deletions

View File

@@ -126,6 +126,8 @@
instance][3460]
- [Implemented automatic type detection for `Table.parse_values`.][3462]
- [Integrated value parsing with the `Delimited` file reader.][3463]
- [Implemented the `Infer` setting for headers in the `Delimited` file format
and made it the default.][3472]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -196,6 +198,7 @@
[3460]: https://github.com/enso-org/enso/pull/3460
[3462]: https://github.com/enso-org/enso/pull/3462
[3463]: https://github.com/enso-org/enso/pull/3463
[3472]: https://github.com/enso-org/enso/pull/3472
#### Enso Compiler

View File

@@ -89,10 +89,10 @@ Data_Formatter.make_datatype_parser datatype = case datatype of
Error.throw (Illegal_Argument_Error "Unsupported datatype: "+datatype.to_text)
## PRIVATE
Data_Formatter.get_parsers =
Data_Formatter.get_specific_type_parsers =
[this.make_integer_parser, this.make_decimal_parser, this.make_datetime_parser, this.make_date_parser, this.make_time_parser, this.make_boolean_parser]
## PRIVATE
Data_Formatter.make_auto_parser =
fallback_parser = this.make_identity_parser
TypeInferringParser.new this.get_parsers.to_array fallback_parser
TypeInferringParser.new this.get_specific_type_parsers.to_array fallback_parser
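On the Java side, the renamed get_specific_type_parsers accessor feeds make_auto_parser, which chains the specific parsers in front of an identity fallback. A minimal sketch of that assembly in Java, assuming only the constructor shapes visible in this commit (the arguments of the individual specific parsers are elided):

import org.enso.table.parsing.DatatypeParser;
import org.enso.table.parsing.IdentityParser;
import org.enso.table.parsing.IncrementalDatatypeParser;
import org.enso.table.parsing.TypeInferringParser;

class AutoParserSketch {
  // Mirrors Data_Formatter.make_auto_parser: try each specific parser in
  // order and keep the column as plain text if none of them fits.
  static TypeInferringParser makeAutoParser(IncrementalDatatypeParser[] specificTypeParsers) {
    DatatypeParser fallback = new IdentityParser();
    return new TypeInferringParser(specificTypeParsers, fallback);
  }
}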

View File

@@ -64,7 +64,7 @@ read_stream : Delimited -> InputStream -> Problem_Behavior -> File | Nothing ->
read_stream format java_stream on_problems max_columns=4096 related_file=Nothing =
java_headers = case format.headers of
True -> DelimitedReader.HeaderBehavior.USE_FIRST_ROW_AS_HEADERS
Infer -> Errors.unimplemented "Inferring headers is not implemented yet."
Infer -> DelimitedReader.HeaderBehavior.INFER
False -> DelimitedReader.HeaderBehavior.GENERATE_HEADERS
skip_rows = case format.skip_rows of
Nothing -> 0
@@ -103,8 +103,11 @@ read_stream format java_stream on_problems max_columns=4096 related_file=Nothing
QuoteStrippingParser.new format.quote
value_parser = if format.value_formatter.is_nothing then base_parser else
wrapped = format.value_formatter.wrap_base_parser base_parser
TypeInferringParser.new format.value_formatter.get_parsers.to_array wrapped
reader = DelimitedReader.new reporting_stream_decoder format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns value_parser format.keep_invalid_rows warnings_as_errors
TypeInferringParser.new format.value_formatter.get_specific_type_parsers.to_array wrapped
cell_type_guesser = if format.headers != Infer then Nothing else
formatter = format.value_formatter.if_nothing Data_Formatter
TypeInferringParser.new formatter.get_specific_type_parsers.to_array IdentityParser.new
reader = DelimitedReader.new reporting_stream_decoder format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows warnings_as_errors
result = Table.Table reader.read
decoding_problems = Vector.Vector reporting_stream_decoder.getReportedProblems . map Encoding_Error
parsing_problems = Vector.Vector reader.getReportedProblems . map translate_parsing_problem
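The wiring above maps the Enso headers setting onto the reader's HeaderBehavior enum and builds a cell-type guesser only when inference is requested. A hedged sketch of the equivalent logic in Java; HeaderSetting is a hypothetical stand-in for the Enso-side True|False|Infer value, and the DelimitedReader package is assumed:

import org.enso.table.parsing.IdentityParser;
import org.enso.table.parsing.IncrementalDatatypeParser;
import org.enso.table.parsing.TypeInferringParser;
import org.enso.table.read.DelimitedReader.HeaderBehavior; // assumed location

class HeaderWiringSketch {
  enum HeaderSetting { TRUE, FALSE, INFER } // hypothetical stand-in

  static HeaderBehavior toHeaderBehavior(HeaderSetting headers) {
    return switch (headers) {
      case TRUE -> HeaderBehavior.USE_FIRST_ROW_AS_HEADERS;
      case INFER -> HeaderBehavior.INFER;
      case FALSE -> HeaderBehavior.GENERATE_HEADERS;
    };
  }

  // Mirrors the cell_type_guesser binding: only the INFER mode needs a
  // guesser; the other modes pass null (Nothing) to the reader.
  static TypeInferringParser makeCellTypeGuesser(
      HeaderSetting headers, IncrementalDatatypeParser[] specificTypeParsers) {
    if (headers != HeaderSetting.INFER) return null;
    return new TypeInferringParser(specificTypeParsers, new IdentityParser());
  }
}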

View File

@@ -87,9 +87,8 @@ type Delimited
set to `False`, the column names are generated by adding increasing
numeric suffixes to the base name `Column` (i.e. `Column_1`,
`Column_2` etc.). If set to `Infer`, the process tries to infer if
headers are present on the first row (`Infer` is not implemented yet).
If the column names are not unique, numeric suffixes will be appended
to disambiguate them.
headers are present on the first row. If the column names are not
unique, numeric suffixes will be appended to disambiguate them.
- skip_rows: The number of rows to skip from the top of the file.
- row_limit: The maximum number of rows to read from the file. This count
does not include the header row (if applicable).
@@ -98,12 +97,7 @@ type Delimited
- keep_invalid_rows: Specifies whether rows that contain fewer or more
columns than expected should be kept (setting the missing columns to
`Nothing` or dropping the excess columns) or dropped.
TODO [RW] The default for `headers` is temporarily changed to `False`,
because `Infer` is not supported. It should be changed to be the default
value once the corresponding task is implemented:
https://www.pivotaltracker.com/story/show/181986831
type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (headers:True|False|Infer=False) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True)
type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (headers:True|False|Infer=Infer) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True)
## Implements the `File.read` for this `File_Format`
read : File -> Problem_Behavior -> Any

View File

@@ -38,9 +38,9 @@ public abstract class BaseTimeParser extends IncrementalDatatypeParser {
}
@Override
protected Builder makeBuilderWithCapacity(long capacity) {
protected Builder makeBuilderWithCapacity(int capacity) {
// Once datetime gets first-class support in our dataframes, a more specific builder type should
// be used.
return new ObjectBuilder((int) capacity);
return new ObjectBuilder(capacity);
}
}

View File

@@ -33,7 +33,7 @@ public class BooleanParser extends IncrementalDatatypeParser {
}
@Override
protected Builder makeBuilderWithCapacity(long capacity) {
return new BoolBuilder((int) capacity);
protected Builder makeBuilderWithCapacity(int capacity) {
return new BoolBuilder(capacity);
}
}

View File

@@ -2,13 +2,26 @@ package org.enso.table.parsing;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.read.WithProblems;
/** A base type for a parser capable of parsing a column of text values into some other type. */
public interface DatatypeParser {
public abstract class DatatypeParser {
/**
* Parses a single cell.
*
* @param text the text contents to parse; it will never be null in the default implementation -
* null values are just passed as-is without any parsing attempts by default
* @param problemAggregator an instance of the problem aggregator, used for reporting parsing
* problems
* @return the parsed value or null if the value could not be parsed or could be parsed but should
* be treated as missing value
*/
protected abstract Object parseSingleValue(String text, ProblemAggregator problemAggregator);
/**
* Parses a column of texts (represented as a {@code StringStorage}) and returns a new storage,
* containing the parsed elements.
*/
WithProblems<Storage> parseColumn(String columnName, StringStorage sourceStorage);
public abstract WithProblems<Storage> parseColumn(String columnName, StringStorage sourceStorage);
}
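To make the contract concrete, here is a hypothetical minimal parser built against this API: parseSingleValue never sees null, reports failures through the aggregator instead of throwing, and returns null for values that should be treated as missing. It extends IncrementalDatatypeParser (shown in a later file), which supplies the parseColumn loop; BoolBuilder's package is assumed to match Builder's:

import org.enso.table.data.column.builder.object.BoolBuilder; // assumed location
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.parsing.IncrementalDatatypeParser;
import org.enso.table.parsing.problems.ProblemAggregator;

public class YesNoParser extends IncrementalDatatypeParser {
  @Override
  protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
    if (text.equals("yes")) return true;
    if (text.equals("no")) return false;
    problemAggregator.reportInvalidFormat(text); // aggregated, not thrown
    return null; // treated as a missing value
  }

  @Override
  protected Builder makeBuilderWithCapacity(int capacity) {
    return new BoolBuilder(capacity);
  }
}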

View File

@@ -84,7 +84,7 @@ public class DecimalParser extends IncrementalDatatypeParser {
}
@Override
protected Builder makeBuilderWithCapacity(long capacity) {
return NumericBuilder.createDoubleBuilder((int) capacity);
protected Builder makeBuilderWithCapacity(int capacity) {
return NumericBuilder.createDoubleBuilder(capacity);
}
}

View File

@@ -11,13 +11,13 @@ import org.enso.table.read.WithProblems;
public class IdentityParser extends IncrementalDatatypeParser {
@Override
public Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
return text;
}
@Override
public StringBuilder makeBuilderWithCapacity(long capacity) {
return new StringBuilder((int) capacity);
public StringBuilder makeBuilderWithCapacity(int capacity) {
return new StringBuilder(capacity);
}
@Override

View File

@@ -3,7 +3,7 @@ package org.enso.table.parsing;
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.parsing.problems.ProblemAggregatorImpl;
import org.enso.table.read.WithProblems;
/**
@@ -12,20 +12,7 @@ import org.enso.table.read.WithProblems;
* <p>It specifies the strategy for parsing text cells into some target type, reporting issues and
* building the resulting table column.
*/
public abstract class IncrementalDatatypeParser implements DatatypeParser {
/**
* Parses a single cell.
*
* @param text the text contents to parse, it will never be null in the default implementation -
* null values are just passed as-is without any parsing attempts by default
* @param problemAggregator an instance of the problem aggregator, used for reporting parsing
* problems
* @return the parsed value or null if the value could not be parsed or could be parsed but should
* be treated as missing value
*/
protected abstract Object parseSingleValue(String text, ProblemAggregator problemAggregator);
public abstract class IncrementalDatatypeParser extends DatatypeParser {
/**
* Creates a new column builder expecting the specific datatype, with a specified capacity.
*
@@ -36,12 +23,15 @@ public abstract class IncrementalDatatypeParser implements DatatypeParser {
* builder returned here expects - it should never return a value that cannot be accepted by the
* builder.
*/
protected abstract Builder makeBuilderWithCapacity(long capacity);
protected abstract Builder makeBuilderWithCapacity(int capacity);
@Override
/**
* Parses a column of texts (represented as a {@code StringStorage}) and returns a new storage,
* containing the parsed elements.
*/
public WithProblems<Storage> parseColumn(String columnName, StringStorage sourceStorage) {
Builder builder = makeBuilderWithCapacity(sourceStorage.size());
var aggregator = new ProblemAggregator(columnName);
var aggregator = new ProblemAggregatorImpl(columnName);
for (int i = 0; i < sourceStorage.size(); ++i) {
String cell = sourceStorage.getItem(i);
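The hunk above cuts the parseColumn loop short. Its presumable remainder, sketched under the assumption that Builder exposes append and seal (matching how the storage builders are used elsewhere) and that WithProblems pairs the storage with the aggregated problems:

for (int i = 0; i < sourceStorage.size(); ++i) {
  String cell = sourceStorage.getItem(i);
  if (cell == null) {
    builder.append(null); // nulls skip parsing and stay missing
  } else {
    builder.append(parseSingleValue(cell, aggregator));
  }
}
return new WithProblems<>(builder.seal(), aggregator.getAggregatedProblems());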

View File

@@ -55,7 +55,7 @@ public class IntegerParser extends IncrementalDatatypeParser {
}
@Override
protected Builder makeBuilderWithCapacity(long capacity) {
return NumericBuilder.createLongBuilder((int) capacity);
protected Builder makeBuilderWithCapacity(int capacity) {
return NumericBuilder.createLongBuilder(capacity);
}
}

View File

@@ -4,6 +4,8 @@ import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.parsing.problems.ProblemAggregatorImpl;
import org.enso.table.parsing.problems.SimplifiedProblemAggregator;
import org.enso.table.read.WithProblems;
/**
@@ -13,7 +15,7 @@ import org.enso.table.read.WithProblems;
* <p>If all parsers from the set reported problems, the fallback parser is used and its result is
* returned regardless of any problems.
*/
public class TypeInferringParser implements DatatypeParser {
public class TypeInferringParser extends DatatypeParser {
private final IncrementalDatatypeParser[] baseParsers;
private final DatatypeParser fallbackParser;
@@ -24,12 +26,25 @@ public class TypeInferringParser implements DatatypeParser {
this.fallbackParser = fallbackParser;
}
@Override
public Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
for (IncrementalDatatypeParser parser : baseParsers) {
SimplifiedProblemAggregator internal = new SimplifiedProblemAggregator();
Object result = parser.parseSingleValue(text, internal);
if (!internal.hasProblems()) {
return result;
}
}
return fallbackParser.parseSingleValue(text, problemAggregator);
}
@Override
public WithProblems<Storage> parseColumn(String columnName, StringStorage sourceStorage) {
parsers:
for (IncrementalDatatypeParser parser : baseParsers) {
Builder builder = parser.makeBuilderWithCapacity(sourceStorage.size());
var aggregator = new ProblemAggregator(columnName);
var aggregator = new ProblemAggregatorImpl(columnName);
for (int i = 0; i < sourceStorage.size(); ++i) {
String cell = sourceStorage.getItem(i);
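The column-level loop above is also truncated; the parsers: label hints at its shape: abandon the current candidate with a labeled continue as soon as it reports a problem, and only resort to the fallback parser when every candidate has failed. A sketch under the same Builder.append/seal assumption:

parsers:
for (IncrementalDatatypeParser parser : baseParsers) {
  Builder builder = parser.makeBuilderWithCapacity(sourceStorage.size());
  var aggregator = new ProblemAggregatorImpl(columnName);
  for (int i = 0; i < sourceStorage.size(); ++i) {
    String cell = sourceStorage.getItem(i);
    if (cell != null) {
      builder.append(parser.parseSingleValue(cell, aggregator));
      if (aggregator.hasProblems()) {
        continue parsers; // this datatype does not fit, try the next one
      }
    } else {
      builder.append(null);
    }
  }
  return new WithProblems<>(builder.seal(), List.of());
}
// All specific parsers failed; the fallback result is used regardless of problems.
return fallbackParser.parseColumn(columnName, sourceStorage);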

View File

@@ -21,7 +21,7 @@ public class WhitespaceStrippingParser extends IncrementalDatatypeParser {
}
@Override
protected Builder makeBuilderWithCapacity(long capacity) {
protected Builder makeBuilderWithCapacity(int capacity) {
return innerParser.makeBuilderWithCapacity(capacity);
}
}

View File

@@ -0,0 +1,26 @@
package org.enso.table.parsing.problems;
import java.util.List;
/** A problem aggregator which ignores problems. */
public class NoOpProblemAggregator implements ProblemAggregator {
@Override
public void reportInvalidFormat(String cell) {}
@Override
public void reportLeadingZeroes(String cell) {}
@Override
public void reportMismatchedQuote() {}
@Override
public boolean hasProblems() {
throw new IllegalStateException("This implementation does not provide problem information.");
}
@Override
public List<ParsingProblem> getAggregatedProblems() {
throw new IllegalStateException("This implementation does not provide problem information.");
}
}

View File

@@ -1,42 +1,23 @@
package org.enso.table.parsing.problems;
import java.util.ArrayList;
import java.util.List;
/**
* An aggregator for parsing problems.
*
* <p>Each strategy exposes a method that returns a summary of the problems. The particular methods
* for reporting each problem are defined in particular subclasses.
*/
public class ProblemAggregator {
private final List<String> invalidFormatCells = new ArrayList<>();
private final List<String> leadingZerosCells = new ArrayList<>();
private int mismatchedQuotes = 0;
private final String relatedColumnName;
public ProblemAggregator(String relatedColumnName) {
this.relatedColumnName = relatedColumnName;
}
/** An aggregator for parsing problems. */
public interface ProblemAggregator {
/**
* Reports a cell with an invalid format.
*
* <p>The reports are aggregated and finally a single problem containing all invalid cell for the
* <p>The reports are aggregated and finally a single problem containing all invalid cells for the
* given column is reported.
*/
public void reportInvalidFormat(String cell) {
invalidFormatCells.add(cell);
}
void reportInvalidFormat(String cell);
public void reportLeadingZeroes(String cell) {
leadingZerosCells.add(cell);
}
/** Reports a cell containing unexpected leading zeros. */
void reportLeadingZeroes(String cell);
public void reportMismatchedQuote() {
mismatchedQuotes++;
}
/** Reports that a mismatched quote has been encountered. */
void reportMismatchedQuote();
/**
* Checks if there are any problems already reported.
@@ -44,28 +25,8 @@ public class ProblemAggregator {
* <p>This method returns true if and only if {@code getAggregatedProblems} would return a
* non-empty list.
*/
public boolean hasProblems() {
return !invalidFormatCells.isEmpty() || !leadingZerosCells.isEmpty() || mismatchedQuotes > 0;
}
boolean hasProblems();
/** Return an aggregated summary of problems that have been reported. */
public List<ParsingProblem> getAggregatedProblems() {
List<ParsingProblem> problems = new ArrayList<>();
if (!invalidFormatCells.isEmpty()) {
problems.add(new InvalidFormat(relatedColumnName, invalidFormatCells));
}
if (!leadingZerosCells.isEmpty()) {
problems.add(new LeadingZeros(relatedColumnName, leadingZerosCells));
}
for (int i = 0; i < mismatchedQuotes; ++i) {
problems.add(new MismatchedQuote());
}
assert problems.isEmpty() == !hasProblems();
return problems;
}
List<ParsingProblem> getAggregatedProblems();
}

View File

@@ -0,0 +1,56 @@
package org.enso.table.parsing.problems;
import java.util.ArrayList;
import java.util.List;
public class ProblemAggregatorImpl implements ProblemAggregator {
public final String relatedColumnName;
private final List<String> invalidFormatCells = new ArrayList<>();
private final List<String> leadingZerosCells = new ArrayList<>();
private int mismatchedQuotes = 0;
public ProblemAggregatorImpl(String relatedColumnName) {
this.relatedColumnName = relatedColumnName;
}
@Override
public void reportInvalidFormat(String cell) {
invalidFormatCells.add(cell);
}
@Override
public void reportLeadingZeroes(String cell) {
leadingZerosCells.add(cell);
}
@Override
public void reportMismatchedQuote() {
mismatchedQuotes++;
}
@Override
public boolean hasProblems() {
return !invalidFormatCells.isEmpty() || !leadingZerosCells.isEmpty() || mismatchedQuotes > 0;
}
@Override
public List<ParsingProblem> getAggregatedProblems() {
List<ParsingProblem> problems = new ArrayList<>();
if (!invalidFormatCells.isEmpty()) {
problems.add(new InvalidFormat(relatedColumnName, invalidFormatCells));
}
if (!leadingZerosCells.isEmpty()) {
problems.add(new LeadingZeros(relatedColumnName, leadingZerosCells));
}
for (int i = 0; i < mismatchedQuotes; ++i) {
problems.add(new MismatchedQuote());
}
assert problems.isEmpty() == !hasProblems();
return problems;
}
}
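A brief usage sketch of the aggregator above: problems are collected per column and only materialized into ParsingProblem values when getAggregatedProblems is called:

import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.parsing.problems.ProblemAggregatorImpl;

class AggregatorUsageSketch {
  static void demo() {
    ProblemAggregator aggregator = new ProblemAggregatorImpl("Index");
    aggregator.reportInvalidFormat("12x"); // a cell that failed to parse
    aggregator.reportLeadingZeroes("007"); // a numeric cell with disallowed leading zeros
    assert aggregator.hasProblems();
    // Yields one InvalidFormat and one LeadingZeros problem, both for column "Index".
    System.out.println(aggregator.getAggregatedProblems());
  }
}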

View File

@@ -0,0 +1,33 @@
package org.enso.table.parsing.problems;
import java.util.List;
public class SimplifiedProblemAggregator implements ProblemAggregator {
private boolean hasProblems = false;
@Override
public void reportInvalidFormat(String cell) {
hasProblems = true;
}
@Override
public void reportLeadingZeroes(String cell) {
hasProblems = true;
}
@Override
public void reportMismatchedQuote() {
hasProblems = true;
}
@Override
public boolean hasProblems() {
return hasProblems;
}
@Override
public List<ParsingProblem> getAggregatedProblems() {
throw new IllegalStateException("Problem aggregation is not available in this implementation.");
}
}

View File

@@ -15,31 +15,20 @@ import org.enso.table.data.index.DefaultIndex;
import org.enso.table.data.table.Column;
import org.enso.table.data.table.Table;
import org.enso.table.parsing.DatatypeParser;
import org.enso.table.parsing.TypeInferringParser;
import org.enso.table.parsing.problems.AdditionalInvalidRows;
import org.enso.table.parsing.problems.InvalidRow;
import org.enso.table.parsing.problems.MismatchedQuote;
import org.enso.table.parsing.problems.NoOpProblemAggregator;
import org.enso.table.parsing.problems.ParsingProblem;
import org.enso.table.util.NameDeduplicator;
/** A helper for reading delimited (CSV-like) files. */
public class DelimitedReader {
/** Specifies how to set the headers for the returned table. */
public enum HeaderBehavior {
/** Tries to infer if the headers are present in the file. */
INFER,
/** Uses the first row in the file as headers. Duplicate names will be appended suffixes. */
USE_FIRST_ROW_AS_HEADERS,
/**
* Treats the first row as data and generates header names starting with {@code COLUMN_NAME}.
*/
GENERATE_HEADERS
}
private static final String COLUMN_NAME = "Column";
private static final char noQuoteCharacter = '\0';
private static final long invalidRowsLimit = 10;
private final char delimiter;
private final char quoteCharacter;
private final char quoteEscapeCharacter;
@@ -50,10 +39,15 @@ public class DelimitedReader {
private final List<ParsingProblem> warnings = new ArrayList<>();
private final CsvParser parser;
private final DatatypeParser valueParser;
private final TypeInferringParser cellTypeGuesser;
private final boolean keepInvalidRows;
private final boolean warningsAsErrors;
private static final char noQuoteCharacter = '\0';
private final NoOpProblemAggregator noOpProblemAggregator = new NoOpProblemAggregator();
private long invalidRowsCount = 0;
private long targetTableIndex = 0;
/** The line number of the start of the current row in the input file. */
private long currentLine = 0;
private StringStorageBuilder[] builders = null;
/**
* Creates a new reader.
@@ -74,6 +68,8 @@
* @param maxColumns specifies how many columns can be expected at most
* @param valueParser an optional parser that is applied to each column to convert it to more
* specific datatype
* @param cellTypeGuesser a helper used to guess cell types for the purpose of inferring the
* headers; it must not be null if {@code headerBehavior} is set to {@code INFER}
* @param keepInvalidRows specifies whether to keep rows that had an unexpected number of columns
* @param warningsAsErrors specifies if the first warning should be immediately raised as an error
* (used as a fast-path for the error-reporting mode to avoid computing a value that is going
@@ -89,6 +85,7 @@
long rowLimit,
int maxColumns,
DatatypeParser valueParser,
TypeInferringParser cellTypeGuesser,
boolean keepInvalidRows,
boolean warningsAsErrors) {
if (delimiter.isEmpty()) {
@@ -142,6 +139,7 @@
this.warningsAsErrors = warningsAsErrors;
this.valueParser = valueParser;
this.cellTypeGuesser = cellTypeGuesser;
parser = setupCsvParser(input);
}
@@ -174,9 +172,6 @@
reportProblem(new MismatchedQuote());
}
private long invalidRowsCount = 0;
private static final long invalidRowsLimit = 10;
private void reportInvalidRow(long source_row, Long table_index, String[] row) {
if (invalidRowsCount < invalidRowsLimit) {
reportProblem(new InvalidRow(source_row, table_index, row));
@@ -203,29 +198,89 @@
}
}
private long target_table_index = 0;
/** The line number of the start of the current row in the input file. */
private long current_line = 0;
/**
* Reads the next row and updates the current line accordingly.
*
* <p>Will return {@code null} if no more rows are available.
*/
private String[] nextRow() {
current_line = parser.getContext().currentLine() + 1;
private String[] readNextRow() {
currentLine = parser.getContext().currentLine() + 1;
return parser.parseNext();
}
private void appendRow(String[] row) {
assert builders != null;
assert canFitMoreRows();
if (row.length != builders.length) {
reportInvalidRow(currentLine, keepInvalidRows ? targetTableIndex : null, row);
if (keepInvalidRows) {
for (int i = 0; i < builders.length && i < row.length; i++) {
builders[i] = builders[i].parseAndAppend(row[i]);
}
// If the current row had fewer columns than expected, nulls are inserted for the missing
// values.
// If it had more columns, the excess columns are discarded.
for (int i = row.length; i < builders.length; i++) {
builders[i] = builders[i].parseAndAppend(null);
}
targetTableIndex++;
}
} else {
for (int i = 0; i < builders.length; i++) {
builders[i] = builders[i].parseAndAppend(row[i]);
}
targetTableIndex++;
}
}
private boolean canFitMoreRows() {
return rowLimit < 0 || targetTableIndex < rowLimit;
}
private void appendRowIfLimitPermits(String[] row) {
if (canFitMoreRows()) {
appendRow(row);
}
}
private List<String> headersFromRow(String[] row) {
List<String> preprocessedHeaders =
Arrays.stream(row).map(this::parseHeader).collect(Collectors.toList());
return NameDeduplicator.deduplicate(preprocessedHeaders, "_");
}
private List<String> generateDefaultHeaders(int columnCount) {
ArrayList<String> headerNames = new ArrayList<>(columnCount);
for (int i = 0; i < columnCount; ++i) {
headerNames.add(COLUMN_NAME + "_" + (i + 1));
}
return headerNames;
}
/**
* Checks if the given cell is non-null plain text that is not convertible to any more specific
* type according to the {@code cellTypeGuesser}. This is used when inferring the headers.
*/
private boolean isPlainText(String cell) {
if (cell == null) return false;
Object parsed = cellTypeGuesser.parseSingleValue(cell, noOpProblemAggregator);
return parsed instanceof String;
}
/** Reads the input stream and returns a Table. */
public Table read() {
List<String> headerNames;
String[] currentRow = nextRow();
String[] currentRow = readNextRow();
// Skip the first N rows.
for (long i = 0; currentRow != null && i < skipRows; ++i) {
currentRow = nextRow();
currentRow = readNextRow();
}
// If there are no rows to even infer the headers, we return an empty table.
@@ -233,55 +288,50 @@
return new Table(new Column[0]);
}
int expectedColumnCount = currentRow.length;
initBuilders(expectedColumnCount);
assert currentRow != null;
switch (headerBehavior) {
case INFER:
throw new IllegalStateException("Inferring headers is not yet implemented");
case USE_FIRST_ROW_AS_HEADERS:
List<String> preprocessedHeaders =
Arrays.stream(currentRow).map(this::parseHeader).collect(Collectors.toList());
headerNames = NameDeduplicator.deduplicate(preprocessedHeaders, "_");
// We have 'used up' the first row, so we load a next one.
currentRow = nextRow();
break;
case GENERATE_HEADERS:
headerNames = new ArrayList<>(currentRow.length);
for (int i = 0; i < currentRow.length; ++i) {
headerNames.add(COLUMN_NAME + "_" + (i + 1));
case INFER -> {
String[] firstRow = currentRow;
String[] secondRow = readNextRow();
if (secondRow == null) {
// If there is only one row in the file, we generate the headers and stop further
// processing, as there is nothing more to process.
headerNames = generateDefaultHeaders(expectedColumnCount);
appendRowIfLimitPermits(firstRow);
currentRow = null;
} else {
assert cellTypeGuesser != null;
boolean firstAllText = Arrays.stream(firstRow).allMatch(this::isPlainText);
boolean secondAllText = Arrays.stream(secondRow).allMatch(this::isPlainText);
boolean useFirstRowAsHeader = firstAllText && !secondAllText;
if (useFirstRowAsHeader) {
headerNames = headersFromRow(firstRow);
appendRowIfLimitPermits(secondRow);
} else {
headerNames = generateDefaultHeaders(expectedColumnCount);
appendRowIfLimitPermits(firstRow);
appendRowIfLimitPermits(secondRow);
}
currentRow = readNextRow();
}
break;
default:
throw new IllegalStateException("Impossible branch.");
}
case USE_FIRST_ROW_AS_HEADERS -> {
headerNames = headersFromRow(currentRow);
// We have 'used up' the first row, so we load a next one.
currentRow = readNextRow();
}
case GENERATE_HEADERS -> {
headerNames = generateDefaultHeaders(expectedColumnCount);
}
default -> throw new IllegalStateException("Impossible branch.");
}
StringStorageBuilder[] builders = initBuilders(headerNames.size());
while (currentRow != null && (rowLimit < 0 || target_table_index < rowLimit)) {
if (currentRow.length != builders.length) {
reportInvalidRow(current_line, keepInvalidRows ? target_table_index : null, currentRow);
if (keepInvalidRows) {
for (int i = 0; i < builders.length && i < currentRow.length; i++) {
builders[i] = builders[i].parseAndAppend(currentRow[i]);
}
// If the current row had less columns than expected, nulls are inserted for the missing
// values.
// If it had more columns, the excess columns are discarded.
for (int i = currentRow.length; i < builders.length; i++) {
builders[i] = builders[i].parseAndAppend(null);
}
target_table_index++;
}
} else {
for (int i = 0; i < builders.length; i++) {
builders[i] = builders[i].parseAndAppend(currentRow[i]);
}
target_table_index++;
}
currentRow = nextRow();
while (currentRow != null && canFitMoreRows()) {
appendRow(currentRow);
currentRow = readNextRow();
}
parser.stopParsing();
@@ -302,11 +352,24 @@
return new Table(columns);
}
private StringStorageBuilder[] initBuilders(int count) {
StringStorageBuilder[] res = new StringStorageBuilder[count];
private void initBuilders(int count) {
builders = new StringStorageBuilder[count];
for (int i = 0; i < count; i++) {
res[i] = new StringStorageBuilder();
builders[i] = new StringStorageBuilder();
}
return res;
}
/** Specifies how to set the headers for the returned table. */
public enum HeaderBehavior {
/** Tries to infer if the headers are present in the file. */
INFER,
/** Uses the first row in the file as headers. Duplicate names will have suffixes appended. */
USE_FIRST_ROW_AS_HEADERS,
/**
* Treats the first row as data and generates header names starting with {@code COLUMN_NAME}.
*/
GENERATE_HEADERS
}
}
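The inference rule implemented above is compact: the first row becomes the header row only when all of its cells are plain text while the second row contains at least one cell of a more specific type; otherwise default Column_N names are generated. A standalone restatement, with a hypothetical isPlainText predicate standing in for the reader's method:

import java.util.Arrays;
import java.util.function.Predicate;

class HeaderInferenceSketch {
  static boolean useFirstRowAsHeader(
      String[] firstRow, String[] secondRow, Predicate<String> isPlainText) {
    boolean firstAllText = Arrays.stream(firstRow).allMatch(isPlainText);
    boolean secondAllText = Arrays.stream(secondRow).allMatch(isPlainText);
    return firstAllText && !secondAllText;
  }
}

For example, with a guesser that recognizes integers, a first row of "a,b" followed by "1,2" yields headers, while "a,b" followed by "c,d" does not - the all-text case falls back to generated Column_N names, as the all_text.csv test below verifies.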

View File

@@ -34,7 +34,7 @@ public class QuoteStrippingParser extends IncrementalDatatypeParser {
}
@Override
protected Builder makeBuilderWithCapacity(long capacity) {
return new StringBuilder((int) capacity);
protected Builder makeBuilderWithCapacity(int capacity) {
return new StringBuilder(capacity);
}
}

View File

@@ -0,0 +1,4 @@
a,b
c,d
e,f
g,h

View File

@@ -1,4 +1,4 @@
Code,Index,Flag,Value,ValueWithNothing,TextWithNothing,Hexadecimal,Leading0s,QuotedNumbers,Mixed
Code,Index,Flag,Value,ValueWithNothing,TextWithNothing,"Hexadecimal",Leading0s,QuotedNumbers,"Mixed Types"
gxl,7,True,38.76109,63.13, pq6igd2wyd ,4DD4675B,001,"1","33"
wca,0,False,-66.77495,31," 2pr4102wc4 ",,002,"2",
nfw,1, True , 88.65713 ,-68.71,"",01896EAB,123,,45


View File

@@ -0,0 +1,2 @@
a,"b",0
1,2,3

View File

@@ -0,0 +1 @@
x,y,z

View File

@@ -0,0 +1,2 @@
"1",x
y,2

View File

@@ -0,0 +1,2 @@
a,b,c
x,,

View File

@@ -0,0 +1,2 @@
a,b,c
d,e,f

View File

@@ -42,7 +42,51 @@ spec =
table.at "Column_1" . to_vector . should_equal ["4"]
table.at "d" . to_vector . should_equal ["5"]
Test.specify "load even an empty file" <|
Test.specify "should infer headers based on the first two rows" <|
t1 = File.read (Enso_Project.data / "data_small.csv") (File_Format.Delimited "," headers=File_Format.Infer)
t1.columns.map .name . should_equal ["Code", "Index", "Flag", "Value", "ValueWithNothing", "TextWithNothing", "Hexadecimal", "Leading0s", "QuotedNumbers", "Mixed Types"]
t2 = File.read (Enso_Project.data / "all_text.csv") (File_Format.Delimited "," headers=File_Format.Infer)
t2.columns.map .name . should_equal ["Column_1", "Column_2"]
t2.at "Column_1" . to_vector . should_equal ["a", "c", "e", "g"]
t2.at "Column_2" . to_vector . should_equal ["b", "d", "f", "h"]
t3 = File.read (Enso_Project.data / "two_rows1.csv") (File_Format.Delimited "," headers=File_Format.Infer)
t3.columns.map .name . should_equal ["a", "b", "c"]
t3.at "a" . to_vector . should_equal ["x"]
t3.at "b" . to_vector . should_equal [Nothing]
t3.at "c" . to_vector . should_equal [Nothing]
t4 = File.read (Enso_Project.data / "two_rows2.csv") (File_Format.Delimited "," headers=File_Format.Infer)
t4.columns.map .name . should_equal ["Column_1", "Column_2", "Column_3"]
t4.at "Column_1" . to_vector . should_equal ["a", "d"]
t4.at "Column_2" . to_vector . should_equal ["b", "e"]
t4.at "Column_3" . to_vector . should_equal ["c", "f"]
t5 = File.read (Enso_Project.data / "numbers_in_header.csv") (File_Format.Delimited "," headers=File_Format.Infer)
t5.columns.map .name . should_equal ["Column_1", "Column_2", "Column_3"]
t5.at "Column_1" . to_vector . should_equal ["a", "1"]
t5.at "Column_2" . to_vector . should_equal ["b", "2"]
t5.at "Column_3" . to_vector . should_equal [0, 3]
t6 = File.read (Enso_Project.data / "quoted_numbers_in_header.csv") (File_Format.Delimited "," headers=File_Format.Infer)
t6.columns.map .name . should_equal ["1", "x"]
t6.at "1" . to_vector . should_equal ["y"]
t6.at "x" . to_vector . should_equal [2]
Test.specify "should not use the first row as headers if it is the only row, unless specifically asked to" <|
t1 = File.read (Enso_Project.data / "one_row.csv") (File_Format.Delimited "," headers=File_Format.Infer)
t1.columns.map .name . should_equal ["Column_1", "Column_2", "Column_3"]
t1.at "Column_1" . to_vector . should_equal ["x"]
t1.at "Column_2" . to_vector . should_equal ["y"]
t1.at "Column_3" . to_vector . should_equal ["z"]
t2 = File.read (Enso_Project.data / "one_row.csv") (File_Format.Delimited "," headers=True)
t2.columns.map .name . should_equal ["x", "y", "z"]
t2.row_count . should_equal 0
t2.at "x" . to_vector . should_equal []
Test.specify "should be able to load even an empty file" <|
table = File.read (Enso_Project.data / "empty.txt") (File_Format.Delimited "," headers=True value_formatter=Nothing)
table.columns.map .name . should_equal []
table.row_count . should_equal 0
@@ -251,17 +295,20 @@ spec =
t.at "Hexadecimal" . to_vector . should_equal ["4DD4675B", Nothing, "01896EAB", "F32E1EFE"]
t.at "Leading0s" . to_vector . should_equal ["001", "002", "123", Nothing]
t.at "QuotedNumbers" . to_vector . should_equal ["1", "2", Nothing, "34"]
t.at "Mixed" . to_vector . should_equal ["33", Nothing, "45", "True"]
t.at "Mixed Types" . to_vector . should_equal ["33", Nothing, "45", "True"]
t2 = (Enso_Project.data / "data_small.csv") . read (File_Format.Delimited "," headers=True value_formatter=(Data_Formatter allow_leading_zeros=True))
t2.at "Leading0s" . to_vector . should_equal [1, 2, 123, Nothing]
Test.specify "should be able to detect known types automatically" <|
## TODO update this once headers are inferred (next PR)
Test.specify "should be able to detect types automatically" <|
t1 = (Enso_Project.data / "data_small.csv") . read
t1.at "Column_1" . to_vector . should_equal ["Code", "gxl", "wca", "nfw", "der"]
t1.at "Code" . to_vector . should_equal ["gxl", "wca", "nfw", "der"]
t1.at "Index" . to_vector . should_equal [7, 0, 1, 7]
t2 = (Enso_Project.data / "sample.tsv") . read
t2.at "Column_1" . to_vector . should_equal ["a", "1", "4"]
t2.at "a" . to_vector . should_equal [1, 4]
t2.at "b" . to_vector . should_equal [2, 5]
t2.at "c" . to_vector . should_equal [3, 6]
t2.columns.map .name . should_equal ["a", "b", "c"]
main = Test.Suite.run_main here.spec