The user should be able to have the headers Inferred when reading a Delimited file (#3472)

Implements https://www.pivotaltracker.com/story/show/181986831

Commit 7f572bf3e4 (parent 42d82bd8b7)
@@ -126,6 +126,8 @@
   instance][3460]
 - [Implemented automatic type detection for `Table.parse_values`.][3462]
 - [Integrated value parsing with the `Delimited` file reader.][3463]
+- [Implemented the `Infer` setting for headers in the `Delimited` file format
+  and made it the default.][3472]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -196,6 +198,7 @@
 [3460]: https://github.com/enso-org/enso/pull/3460
 [3462]: https://github.com/enso-org/enso/pull/3462
 [3463]: https://github.com/enso-org/enso/pull/3463
+[3472]: https://github.com/enso-org/enso/pull/3472
 
 #### Enso Compiler
 
@@ -89,10 +89,10 @@ Data_Formatter.make_datatype_parser datatype = case datatype of
         Error.throw (Illegal_Argument_Error "Unsupported datatype: "+datatype.to_text)
 
 ## PRIVATE
-Data_Formatter.get_parsers =
+Data_Formatter.get_specific_type_parsers =
     [this.make_integer_parser, this.make_decimal_parser, this.make_datetime_parser, this.make_date_parser, this.make_time_parser, this.make_boolean_parser]
 
 ## PRIVATE
 Data_Formatter.make_auto_parser =
     fallback_parser = this.make_identity_parser
-    TypeInferringParser.new this.get_parsers.to_array fallback_parser
+    TypeInferringParser.new this.get_specific_type_parsers.to_array fallback_parser
@@ -64,7 +64,7 @@ read_stream : Delimited -> InputStream -> Problem_Behavior -> File | Nothing ->
 read_stream format java_stream on_problems max_columns=4096 related_file=Nothing =
     java_headers = case format.headers of
         True -> DelimitedReader.HeaderBehavior.USE_FIRST_ROW_AS_HEADERS
-        Infer -> Errors.unimplemented "Inferring headers is not implemented yet."
+        Infer -> DelimitedReader.HeaderBehavior.INFER
        False -> DelimitedReader.HeaderBehavior.GENERATE_HEADERS
     skip_rows = case format.skip_rows of
         Nothing -> 0
@@ -103,8 +103,11 @@ read_stream format java_stream on_problems max_columns=4096 related_file=Nothing
         QuoteStrippingParser.new format.quote
     value_parser = if format.value_formatter.is_nothing then base_parser else
         wrapped = format.value_formatter.wrap_base_parser base_parser
-        TypeInferringParser.new format.value_formatter.get_parsers.to_array wrapped
-    reader = DelimitedReader.new reporting_stream_decoder format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns value_parser format.keep_invalid_rows warnings_as_errors
+        TypeInferringParser.new format.value_formatter.get_specific_type_parsers.to_array wrapped
+    cell_type_guesser = if format.headers != Infer then Nothing else
+        formatter = format.value_formatter.if_nothing Data_Formatter
+        TypeInferringParser.new formatter.get_specific_type_parsers.to_array IdentityParser.new
+    reader = DelimitedReader.new reporting_stream_decoder format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows warnings_as_errors
     result = Table.Table reader.read
     decoding_problems = Vector.Vector reporting_stream_decoder.getReportedProblems . map Encoding_Error
     parsing_problems = Vector.Vector reader.getReportedProblems . map translate_parsing_problem
@@ -87,9 +87,8 @@ type Delimited
       set to `False`, the column names are generated by adding increasing
       numeric suffixes to the base name `Column` (i.e. `Column_1`,
       `Column_2` etc.). If set to `Infer`, the process tries to infer if
-      headers are present on the first row (`Infer` is not implemented yet).
-      If the column names are not unique, numeric suffixes will be appended
-      to disambiguate them.
+      headers are present on the first row. If the column names are not
+      unique, numeric suffixes will be appended to disambiguate them.
     - skip_rows: The number of rows to skip from the top of the file.
     - row_limit: The maximum number of rows to read from the file. This count
       does not include the header row (if applicable).
@@ -98,12 +97,7 @@ type Delimited
     - keep_invalid_rows: Specifies whether rows that contain less or more
      columns than expected should be kept (setting the missing columns to
       `Nothing` or dropping the excess columns) or dropped.
-
-    TODO [RW] The default for `headers` is temporarily changed to `False`,
-    because `Infer` is not supported. It should be changed to be the default
-    value once the corresponding task is implemented:
-    https://www.pivotaltracker.com/story/show/181986831
-type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (headers:True|False|Infer=False) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True)
+type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (headers:True|False|Infer=Infer) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True)
 
     ## Implements the `File.read` for this `File_Format`
     read : File -> Problem_Behavior -> Any
@@ -38,9 +38,9 @@ public abstract class BaseTimeParser extends IncrementalDatatypeParser {
   }
 
   @Override
-  protected Builder makeBuilderWithCapacity(long capacity) {
+  protected Builder makeBuilderWithCapacity(int capacity) {
     // Once datetime gets first-class support in our dataframes, a more specific builder type should
     // be used.
-    return new ObjectBuilder((int) capacity);
+    return new ObjectBuilder(capacity);
   }
 }
@@ -33,7 +33,7 @@ public class BooleanParser extends IncrementalDatatypeParser {
   }
 
   @Override
-  protected Builder makeBuilderWithCapacity(long capacity) {
-    return new BoolBuilder((int) capacity);
+  protected Builder makeBuilderWithCapacity(int capacity) {
+    return new BoolBuilder(capacity);
   }
 }
@@ -2,13 +2,26 @@ package org.enso.table.parsing;
 
 import org.enso.table.data.column.storage.Storage;
 import org.enso.table.data.column.storage.StringStorage;
+import org.enso.table.parsing.problems.ProblemAggregator;
 import org.enso.table.read.WithProblems;
 
 /** A base type for a parser capable of parsing a column of text values into some other type. */
-public interface DatatypeParser {
+public abstract class DatatypeParser {
+  /**
+   * Parses a single cell.
+   *
+   * @param text the text contents to parse, it will never be null in the default implementation -
+   *     null values are just passed as-is without any parsing attempts by default
+   * @param problemAggregator an instance of the problem aggregator, used for reporting parsing
+   *     problems
+   * @return the parsed value or null if the value could not be parsed or could be parsed but should
+   *     be treated as missing value
+   */
+  protected abstract Object parseSingleValue(String text, ProblemAggregator problemAggregator);
+
   /**
    * Parses a column of texts (represented as a {@code StringStorage}) and returns a new storage,
    * containing the parsed elements.
   */
-  WithProblems<Storage> parseColumn(String columnName, StringStorage sourceStorage);
+  public abstract WithProblems<Storage> parseColumn(String columnName, StringStorage sourceStorage);
 }
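For orientation, a minimal sketch of how a concrete parser plugs into the reworked hierarchy after this change. The TrimmedTextParser name and the ObjectBuilder import path are illustrative assumptions, not part of this commit:

package org.enso.table.parsing;

// Illustrative sketch only: assumes the post-commit hierarchy in which
// DatatypeParser is an abstract class and IncrementalDatatypeParser supplies
// the parseColumn loop. The ObjectBuilder import path is assumed from the
// Builder import used elsewhere in this diff.
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.builder.object.ObjectBuilder;
import org.enso.table.parsing.problems.ProblemAggregator;

public class TrimmedTextParser extends IncrementalDatatypeParser {
  @Override
  protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
    // A parser signals failure by reporting a problem; trimming always succeeds,
    // so nothing is ever reported here.
    return text.trim();
  }

  @Override
  protected Builder makeBuilderWithCapacity(int capacity) {
    return new ObjectBuilder(capacity);
  }
}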
@@ -84,7 +84,7 @@ public class DecimalParser extends IncrementalDatatypeParser {
   }
 
   @Override
-  protected Builder makeBuilderWithCapacity(long capacity) {
-    return NumericBuilder.createDoubleBuilder((int) capacity);
+  protected Builder makeBuilderWithCapacity(int capacity) {
+    return NumericBuilder.createDoubleBuilder(capacity);
   }
 }
@@ -11,13 +11,13 @@ import org.enso.table.read.WithProblems;
 public class IdentityParser extends IncrementalDatatypeParser {
 
   @Override
-  public Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
+  protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
     return text;
   }
 
   @Override
-  public StringBuilder makeBuilderWithCapacity(long capacity) {
-    return new StringBuilder((int) capacity);
+  public StringBuilder makeBuilderWithCapacity(int capacity) {
+    return new StringBuilder(capacity);
   }
 
   @Override
@@ -3,7 +3,7 @@ package org.enso.table.parsing;
 import org.enso.table.data.column.builder.object.Builder;
 import org.enso.table.data.column.storage.Storage;
 import org.enso.table.data.column.storage.StringStorage;
-import org.enso.table.parsing.problems.ProblemAggregator;
+import org.enso.table.parsing.problems.ProblemAggregatorImpl;
 import org.enso.table.read.WithProblems;
 
 /**
@@ -12,20 +12,7 @@ import org.enso.table.read.WithProblems;
  * <p>It specifies the strategy for parsing text cells into some target type, reporting issues and
  * building the resulting table column.
 */
-public abstract class IncrementalDatatypeParser implements DatatypeParser {
-
-  /**
-   * Parses a single cell.
-   *
-   * @param text the text contents to parse, it will never be null in the default implementation -
-   *     null values are just passed as-is without any parsing attempts by default
-   * @param problemAggregator an instance of the problem aggregator, used for reporting parsing
-   *     problems
-   * @return the parsed value or null if the value could not be parsed or could be parsed but should
-   *     be treated as missing value
-   */
-  protected abstract Object parseSingleValue(String text, ProblemAggregator problemAggregator);
-
+public abstract class IncrementalDatatypeParser extends DatatypeParser {
   /**
    * Creates a new column builder expecting the specific datatype, with a specified capacity.
   *
@@ -36,12 +23,15 @@ public abstract class IncrementalDatatypeParser implements DatatypeParser {
    * builder returned here expects - it should never return a value that cannot be accepted by the
    * builder.
   */
-  protected abstract Builder makeBuilderWithCapacity(long capacity);
+  protected abstract Builder makeBuilderWithCapacity(int capacity);
 
-  @Override
+  /**
+   * Parses a column of texts (represented as a {@code StringStorage}) and returns a new storage,
+   * containing the parsed elements.
+   */
   public WithProblems<Storage> parseColumn(String columnName, StringStorage sourceStorage) {
     Builder builder = makeBuilderWithCapacity(sourceStorage.size());
-    var aggregator = new ProblemAggregator(columnName);
+    var aggregator = new ProblemAggregatorImpl(columnName);
 
     for (int i = 0; i < sourceStorage.size(); ++i) {
       String cell = sourceStorage.getItem(i);
@@ -55,7 +55,7 @@ public class IntegerParser extends IncrementalDatatypeParser {
   }
 
   @Override
-  protected Builder makeBuilderWithCapacity(long capacity) {
-    return NumericBuilder.createLongBuilder((int) capacity);
+  protected Builder makeBuilderWithCapacity(int capacity) {
+    return NumericBuilder.createLongBuilder(capacity);
   }
 }
@@ -4,6 +4,8 @@ import org.enso.table.data.column.builder.object.Builder;
 import org.enso.table.data.column.storage.Storage;
 import org.enso.table.data.column.storage.StringStorage;
 import org.enso.table.parsing.problems.ProblemAggregator;
+import org.enso.table.parsing.problems.ProblemAggregatorImpl;
+import org.enso.table.parsing.problems.SimplifiedProblemAggregator;
 import org.enso.table.read.WithProblems;
 
 /**
@@ -13,7 +15,7 @@ import org.enso.table.read.WithProblems;
 * <p>If all parsers from the set reported problems, the fallback parser is used and its result is
 * returned regardless of any problems.
 */
-public class TypeInferringParser implements DatatypeParser {
+public class TypeInferringParser extends DatatypeParser {
 
   private final IncrementalDatatypeParser[] baseParsers;
   private final DatatypeParser fallbackParser;
@@ -24,12 +26,25 @@ public class TypeInferringParser implements DatatypeParser {
     this.fallbackParser = fallbackParser;
   }
 
+  @Override
+  public Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
+    for (IncrementalDatatypeParser parser : baseParsers) {
+      SimplifiedProblemAggregator internal = new SimplifiedProblemAggregator();
+      Object result = parser.parseSingleValue(text, internal);
+      if (!internal.hasProblems()) {
+        return result;
+      }
+    }
+
+    return fallbackParser.parseSingleValue(text, problemAggregator);
+  }
+
   @Override
   public WithProblems<Storage> parseColumn(String columnName, StringStorage sourceStorage) {
     parsers:
     for (IncrementalDatatypeParser parser : baseParsers) {
       Builder builder = parser.makeBuilderWithCapacity(sourceStorage.size());
-      var aggregator = new ProblemAggregator(columnName);
+      var aggregator = new ProblemAggregatorImpl(columnName);
 
       for (int i = 0; i < sourceStorage.size(); ++i) {
         String cell = sourceStorage.getItem(i);
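A sketch of how the new parseSingleValue entry point composes with a parser set, mirroring what Data_Formatter.make_auto_parser does on the Enso side. The wrapper class is illustrative, and the TypeInferringParser constructor signature is assumed from the field declarations above (it is only partially visible in this diff):

import org.enso.table.parsing.IdentityParser;
import org.enso.table.parsing.IncrementalDatatypeParser;
import org.enso.table.parsing.TypeInferringParser;
import org.enso.table.parsing.problems.NoOpProblemAggregator;

class AutoParserSketch {
  static Object parseCell(IncrementalDatatypeParser[] specificParsers, String cell) {
    // The first specific parser that reports no problems wins; otherwise the
    // identity fallback returns the text unchanged.
    TypeInferringParser auto = new TypeInferringParser(specificParsers, new IdentityParser());
    return auto.parseSingleValue(cell, new NoOpProblemAggregator());
  }
}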
@@ -21,7 +21,7 @@ public class WhitespaceStrippingParser extends IncrementalDatatypeParser {
   }
 
   @Override
-  protected Builder makeBuilderWithCapacity(long capacity) {
+  protected Builder makeBuilderWithCapacity(int capacity) {
     return innerParser.makeBuilderWithCapacity(capacity);
   }
 }
NoOpProblemAggregator.java (new file)
@@ -0,0 +1,26 @@
+package org.enso.table.parsing.problems;
+
+import java.util.List;
+
+/** A problem aggregator which ignores problems. */
+public class NoOpProblemAggregator implements ProblemAggregator {
+
+  @Override
+  public void reportInvalidFormat(String cell) {}
+
+  @Override
+  public void reportLeadingZeroes(String cell) {}
+
+  @Override
+  public void reportMismatchedQuote() {}
+
+  @Override
+  public boolean hasProblems() {
+    throw new IllegalStateException("This implementation does not provide problem information.");
+  }
+
+  @Override
+  public List<ParsingProblem> getAggregatedProblems() {
+    throw new IllegalStateException("This implementation does not provide problem information.");
+  }
+}
@@ -1,42 +1,23 @@
 package org.enso.table.parsing.problems;
 
-import java.util.ArrayList;
 import java.util.List;
 
-/**
- * An aggregator for parsing problems.
- *
- * <p>Each strategy exposes a method that returns a summary of the problems. The particular methods
- * for reporting each problem are defined in particular subclasses.
- */
-public class ProblemAggregator {
-
-  private final List<String> invalidFormatCells = new ArrayList<>();
-  private final List<String> leadingZerosCells = new ArrayList<>();
-  private int mismatchedQuotes = 0;
-  private final String relatedColumnName;
-
-  public ProblemAggregator(String relatedColumnName) {
-    this.relatedColumnName = relatedColumnName;
-  }
+/** An aggregator for parsing problems. */
+public interface ProblemAggregator {
 
   /**
    * Reports a cell with an invalid format.
    *
-   * <p>The reports are aggregated and finally a single problem containing all invalid cell for the
+   * <p>The reports are aggregated and finally a single problem containing all invalid cells for the
    * given column is reported.
   */
-  public void reportInvalidFormat(String cell) {
-    invalidFormatCells.add(cell);
-  }
+  void reportInvalidFormat(String cell);
 
-  public void reportLeadingZeroes(String cell) {
-    leadingZerosCells.add(cell);
-  }
+  /** Reports a cell containing unexpected leading zeros. */
+  void reportLeadingZeroes(String cell);
 
-  public void reportMismatchedQuote() {
-    mismatchedQuotes++;
-  }
+  /** Reports that a mismatched quote has been encountered. */
+  void reportMismatchedQuote();
 
   /**
    * Checks if there are any problems already reported.
@@ -44,28 +25,8 @@ public class ProblemAggregator {
    * <p>This method returns true if and only if {@code getAggregatedProblems} would return a
    * non-empty list.
   */
-  public boolean hasProblems() {
-    return !invalidFormatCells.isEmpty() || !leadingZerosCells.isEmpty() || mismatchedQuotes > 0;
-  }
+  boolean hasProblems();
 
   /** Return an aggregated summary of problems that have been reported. */
-  public List<ParsingProblem> getAggregatedProblems() {
-    List<ParsingProblem> problems = new ArrayList<>();
-
-    if (!invalidFormatCells.isEmpty()) {
-      problems.add(new InvalidFormat(relatedColumnName, invalidFormatCells));
-    }
-
-    if (!leadingZerosCells.isEmpty()) {
-      problems.add(new LeadingZeros(relatedColumnName, leadingZerosCells));
-    }
-
-    for (int i = 0; i < mismatchedQuotes; ++i) {
-      problems.add(new MismatchedQuote());
-    }
-
-    assert problems.isEmpty() == !hasProblems();
-
-    return problems;
-  }
+  List<ParsingProblem> getAggregatedProblems();
 }
ProblemAggregatorImpl.java (new file)
@@ -0,0 +1,56 @@
+package org.enso.table.parsing.problems;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class ProblemAggregatorImpl implements ProblemAggregator {
+  public final String relatedColumnName;
+  private final List<String> invalidFormatCells = new ArrayList<>();
+  private final List<String> leadingZerosCells = new ArrayList<>();
+  private int mismatchedQuotes = 0;
+
+  public ProblemAggregatorImpl(String relatedColumnName) {
+    this.relatedColumnName = relatedColumnName;
+  }
+
+  @Override
+  public void reportInvalidFormat(String cell) {
+    invalidFormatCells.add(cell);
+  }
+
+  @Override
+  public void reportLeadingZeroes(String cell) {
+    leadingZerosCells.add(cell);
+  }
+
+  @Override
+  public void reportMismatchedQuote() {
+    mismatchedQuotes++;
+  }
+
+  @Override
+  public boolean hasProblems() {
+    return !invalidFormatCells.isEmpty() || !leadingZerosCells.isEmpty() || mismatchedQuotes > 0;
+  }
+
+  @Override
+  public List<ParsingProblem> getAggregatedProblems() {
+    List<ParsingProblem> problems = new ArrayList<>();
+
+    if (!invalidFormatCells.isEmpty()) {
+      problems.add(new InvalidFormat(relatedColumnName, invalidFormatCells));
+    }
+
+    if (!leadingZerosCells.isEmpty()) {
+      problems.add(new LeadingZeros(relatedColumnName, leadingZerosCells));
+    }
+
+    for (int i = 0; i < mismatchedQuotes; ++i) {
+      problems.add(new MismatchedQuote());
+    }
+
+    assert problems.isEmpty() == !hasProblems();
+
+    return problems;
+  }
+}
SimplifiedProblemAggregator.java (new file)
@@ -0,0 +1,33 @@
+package org.enso.table.parsing.problems;
+
+import java.util.List;
+
+public class SimplifiedProblemAggregator implements ProblemAggregator {
+
+  private boolean hasProblems = false;
+
+  @Override
+  public void reportInvalidFormat(String cell) {
+    hasProblems = true;
+  }
+
+  @Override
+  public void reportLeadingZeroes(String cell) {
+    hasProblems = true;
+  }
+
+  @Override
+  public void reportMismatchedQuote() {
+    hasProblems = true;
+  }
+
+  @Override
+  public boolean hasProblems() {
+    return hasProblems;
+  }
+
+  @Override
+  public List<ParsingProblem> getAggregatedProblems() {
+    throw new IllegalStateException("Problem aggregation is not available in this implementation.");
+  }
+}
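The three implementations split the old class's responsibilities. A small sketch of the intended roles, assuming the classes above are on the classpath; the column name is a placeholder:

import org.enso.table.parsing.problems.NoOpProblemAggregator;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.parsing.problems.ProblemAggregatorImpl;
import org.enso.table.parsing.problems.SimplifiedProblemAggregator;

class AggregatorRoles {
  static void demo() {
    // Full aggregation: collects per-column problems for later reporting.
    ProblemAggregator full = new ProblemAggregatorImpl("my_column");
    full.reportLeadingZeroes("007");
    assert full.getAggregatedProblems().size() == 1;

    // Probe: only answers hasProblems(), used when trying parsers speculatively.
    ProblemAggregator probe = new SimplifiedProblemAggregator();
    probe.reportInvalidFormat("x");
    assert probe.hasProblems();

    // Sink: discards reports entirely, used by the header type-guesser.
    ProblemAggregator sink = new NoOpProblemAggregator();
    sink.reportMismatchedQuote();
  }
}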
@@ -15,31 +15,20 @@ import org.enso.table.data.index.DefaultIndex;
 import org.enso.table.data.table.Column;
 import org.enso.table.data.table.Table;
 import org.enso.table.parsing.DatatypeParser;
+import org.enso.table.parsing.TypeInferringParser;
 import org.enso.table.parsing.problems.AdditionalInvalidRows;
 import org.enso.table.parsing.problems.InvalidRow;
 import org.enso.table.parsing.problems.MismatchedQuote;
+import org.enso.table.parsing.problems.NoOpProblemAggregator;
 import org.enso.table.parsing.problems.ParsingProblem;
 import org.enso.table.util.NameDeduplicator;
 
 /** A helper for reading delimited (CSV-like) files. */
 public class DelimitedReader {
 
-  /** Specifies how to set the headers for the returned table. */
-  public enum HeaderBehavior {
-    /** Tries to infer if the headers are present in the file. */
-    INFER,
-
-    /** Uses the first row in the file as headers. Duplicate names will be appended suffixes. */
-    USE_FIRST_ROW_AS_HEADERS,
-
-    /**
-     * Treats the first row as data and generates header names starting with {@code COLUMN_NAME}.
-     */
-    GENERATE_HEADERS
-  }
-
   private static final String COLUMN_NAME = "Column";
 
+  private static final char noQuoteCharacter = '\0';
+  private static final long invalidRowsLimit = 10;
   private final char delimiter;
   private final char quoteCharacter;
   private final char quoteEscapeCharacter;
@@ -50,10 +39,15 @@ public class DelimitedReader {
   private final List<ParsingProblem> warnings = new ArrayList<>();
   private final CsvParser parser;
   private final DatatypeParser valueParser;
+  private final TypeInferringParser cellTypeGuesser;
   private final boolean keepInvalidRows;
   private final boolean warningsAsErrors;
 
-  private static final char noQuoteCharacter = '\0';
+  private final NoOpProblemAggregator noOpProblemAggregator = new NoOpProblemAggregator();
+  private long invalidRowsCount = 0;
+  private long targetTableIndex = 0;
+  /** The line number of the start of the current row in the input file. */
+  private long currentLine = 0;
+  private StringStorageBuilder[] builders = null;
 
   /**
    * Creates a new reader.
@@ -74,6 +68,8 @@ public class DelimitedReader {
   * @param maxColumns specifies how many columns can be expected at most
   * @param valueParser an optional parser that is applied to each column to convert it to more
   *     specific datatype
+   * @param cellTypeGuesser a helper used to guess cell types, used for the purpose of inferring the
+   *     headers, it must not be null if {@code headerBehavior} is set to {@code INFER}.
   * @param keepInvalidRows specifies whether to keep rows that had an unexpected number of columns
   * @param warningsAsErrors specifies if the first warning should be immediately raised as an error
   *     (used as a fast-path for the error-reporting mode to avoid computing a value that is going
@@ -89,6 +85,7 @@ public class DelimitedReader {
       long rowLimit,
       int maxColumns,
       DatatypeParser valueParser,
+      TypeInferringParser cellTypeGuesser,
       boolean keepInvalidRows,
       boolean warningsAsErrors) {
     if (delimiter.isEmpty()) {
@@ -142,6 +139,7 @@ public class DelimitedReader {
     this.warningsAsErrors = warningsAsErrors;
 
     this.valueParser = valueParser;
+    this.cellTypeGuesser = cellTypeGuesser;
     parser = setupCsvParser(input);
   }
 
@@ -174,9 +172,6 @@ public class DelimitedReader {
     reportProblem(new MismatchedQuote());
   }
 
-  private long invalidRowsCount = 0;
-  private static final long invalidRowsLimit = 10;
-
   private void reportInvalidRow(long source_row, Long table_index, String[] row) {
     if (invalidRowsCount < invalidRowsLimit) {
       reportProblem(new InvalidRow(source_row, table_index, row));
@@ -203,29 +198,89 @@ public class DelimitedReader {
     }
   }
 
-  private long target_table_index = 0;
-
-  /** The line number of the start of the current row in the input file. */
-  private long current_line = 0;
-
   /**
    * Reads the next row and updates the current line accordingly.
    *
    * <p>Will return {@code null} if no more rows are available.
   */
-  private String[] nextRow() {
-    current_line = parser.getContext().currentLine() + 1;
+  private String[] readNextRow() {
+    currentLine = parser.getContext().currentLine() + 1;
     return parser.parseNext();
   }
 
+  private void appendRow(String[] row) {
+    assert builders != null;
+    assert canFitMoreRows();
+
+    if (row.length != builders.length) {
+      reportInvalidRow(currentLine, keepInvalidRows ? targetTableIndex : null, row);
+
+      if (keepInvalidRows) {
+        for (int i = 0; i < builders.length && i < row.length; i++) {
+          builders[i] = builders[i].parseAndAppend(row[i]);
+        }
+
+        // If the current row had less columns than expected, nulls are inserted for the missing
+        // values.
+        // If it had more columns, the excess columns are discarded.
+        for (int i = row.length; i < builders.length; i++) {
+          builders[i] = builders[i].parseAndAppend(null);
+        }
+
+        targetTableIndex++;
+      }
+    } else {
+      for (int i = 0; i < builders.length; i++) {
+        builders[i] = builders[i].parseAndAppend(row[i]);
+      }
+
+      targetTableIndex++;
+    }
+  }
+
+  private boolean canFitMoreRows() {
+    return rowLimit < 0 || targetTableIndex < rowLimit;
+  }
+
+  private void appendRowIfLimitPermits(String[] row) {
+    if (canFitMoreRows()) {
+      appendRow(row);
+    }
+  }
+
+  private List<String> headersFromRow(String[] row) {
+    List<String> preprocessedHeaders =
+        Arrays.stream(row).map(this::parseHeader).collect(Collectors.toList());
+    return NameDeduplicator.deduplicate(preprocessedHeaders, "_");
+  }
+
+  private List<String> generateDefaultHeaders(int columnCount) {
+    ArrayList<String> headerNames = new ArrayList<>(columnCount);
+    for (int i = 0; i < columnCount; ++i) {
+      headerNames.add(COLUMN_NAME + "_" + (i + 1));
+    }
+    return headerNames;
+  }
+
+  /**
+   * Checks if the given cell contains just plain text that is not null and is not convertible to
+   * any more specific type according to the {@code cellTypeGuesser}. This is used for checking the
+   * types when inferring the headers.
+   */
+  private boolean isPlainText(String cell) {
+    if (cell == null) return false;
+    Object parsed = cellTypeGuesser.parseSingleValue(cell, noOpProblemAggregator);
+    return parsed instanceof String;
+  }
+
   /** Reads the input stream and returns a Table. */
   public Table read() {
     List<String> headerNames;
-    String[] currentRow = nextRow();
+    String[] currentRow = readNextRow();
 
     // Skip the first N rows.
     for (long i = 0; currentRow != null && i < skipRows; ++i) {
-      currentRow = nextRow();
+      currentRow = readNextRow();
     }
 
     // If there are no rows to even infer the headers, we return an empty table.
@@ -233,55 +288,50 @@ public class DelimitedReader {
       return new Table(new Column[0]);
     }
 
+    int expectedColumnCount = currentRow.length;
+    initBuilders(expectedColumnCount);
+
-    assert currentRow != null;
     switch (headerBehavior) {
-      case INFER:
-        throw new IllegalStateException("Inferring headers is not yet implemented");
-      case USE_FIRST_ROW_AS_HEADERS:
-        List<String> preprocessedHeaders =
-            Arrays.stream(currentRow).map(this::parseHeader).collect(Collectors.toList());
-        headerNames = NameDeduplicator.deduplicate(preprocessedHeaders, "_");
-        // We have 'used up' the first row, so we load a next one.
-        currentRow = nextRow();
-        break;
-      case GENERATE_HEADERS:
-        headerNames = new ArrayList<>(currentRow.length);
-        for (int i = 0; i < currentRow.length; ++i) {
-          headerNames.add(COLUMN_NAME + "_" + (i + 1));
+      case INFER -> {
+        String[] firstRow = currentRow;
+        String[] secondRow = readNextRow();
+        if (secondRow == null) {
+          // If there is only one row in the file, we generate the headers and stop further processing (as nothing more to process).
+          headerNames = generateDefaultHeaders(expectedColumnCount);
+          appendRowIfLimitPermits(firstRow);
+          currentRow = null;
+        } else {
+          assert cellTypeGuesser != null;
+          boolean firstAllText = Arrays.stream(firstRow).allMatch(this::isPlainText);
+          boolean secondAllText = Arrays.stream(secondRow).allMatch(this::isPlainText);
+          boolean useFirstRowAsHeader = firstAllText && !secondAllText;
+          if (useFirstRowAsHeader) {
+            headerNames = headersFromRow(firstRow);
+            appendRowIfLimitPermits(secondRow);
+          } else {
+            headerNames = generateDefaultHeaders(expectedColumnCount);
+            appendRowIfLimitPermits(firstRow);
+            appendRowIfLimitPermits(secondRow);
+          }
+
+          currentRow = readNextRow();
+        }
-        break;
-      default:
-        throw new IllegalStateException("Impossible branch.");
-    }
+      }
+      case USE_FIRST_ROW_AS_HEADERS -> {
+        headerNames = headersFromRow(currentRow);
+        // We have 'used up' the first row, so we load a next one.
+        currentRow = readNextRow();
+      }
+      case GENERATE_HEADERS -> {
+        headerNames = generateDefaultHeaders(expectedColumnCount);
+      }
+      default -> throw new IllegalStateException("Impossible branch.");
     }
 
-    StringStorageBuilder[] builders = initBuilders(headerNames.size());
-
-    while (currentRow != null && (rowLimit < 0 || target_table_index < rowLimit)) {
-      if (currentRow.length != builders.length) {
-        reportInvalidRow(current_line, keepInvalidRows ? target_table_index : null, currentRow);
-
-        if (keepInvalidRows) {
-          for (int i = 0; i < builders.length && i < currentRow.length; i++) {
-            builders[i] = builders[i].parseAndAppend(currentRow[i]);
-          }
-
-          // If the current row had less columns than expected, nulls are inserted for the missing
-          // values.
-          // If it had more columns, the excess columns are discarded.
-          for (int i = currentRow.length; i < builders.length; i++) {
-            builders[i] = builders[i].parseAndAppend(null);
-          }
-
-          target_table_index++;
-        }
-      } else {
-        for (int i = 0; i < builders.length; i++) {
-          builders[i] = builders[i].parseAndAppend(currentRow[i]);
-        }
-
-        target_table_index++;
-      }
-
-      currentRow = nextRow();
+    while (currentRow != null && canFitMoreRows()) {
+      appendRow(currentRow);
+      currentRow = readNextRow();
     }
 
     parser.stopParsing();
@@ -302,11 +352,24 @@ public class DelimitedReader {
     return new Table(columns);
   }
 
-  private StringStorageBuilder[] initBuilders(int count) {
-    StringStorageBuilder[] res = new StringStorageBuilder[count];
+  private void initBuilders(int count) {
+    builders = new StringStorageBuilder[count];
     for (int i = 0; i < count; i++) {
-      res[i] = new StringStorageBuilder();
+      builders[i] = new StringStorageBuilder();
     }
-    return res;
   }
 
+  /** Specifies how to set the headers for the returned table. */
+  public enum HeaderBehavior {
+    /** Tries to infer if the headers are present in the file. */
+    INFER,
+
+    /** Uses the first row in the file as headers. Duplicate names will be appended suffixes. */
+    USE_FIRST_ROW_AS_HEADERS,
+
+    /**
+     * Treats the first row as data and generates header names starting with {@code COLUMN_NAME}.
+     */
+    GENERATE_HEADERS
+  }
 }
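A standalone sketch of the header-inference rule implemented in read() above, with a plain number check standing in for the cellTypeGuesser (the real reader consults the configured value parsers and honours quoting, so a quoted "1" still counts as text):

import java.util.Arrays;

class HeaderInferenceSketch {
  // Stand-in for isPlainText: the real reader asks the cellTypeGuesser whether
  // the cell parses to anything more specific than a String.
  static boolean isPlainText(String cell) {
    if (cell == null) return false;
    try {
      Double.parseDouble(cell);
      return false; // parses as a number, so not plain text
    } catch (NumberFormatException e) {
      return true;
    }
  }

  // Headers are taken from the first row only when it is all plain text and the
  // second row contains at least one value of a more specific type.
  static boolean useFirstRowAsHeaders(String[] first, String[] second) {
    boolean firstAllText = Arrays.stream(first).allMatch(HeaderInferenceSketch::isPlainText);
    boolean secondAllText = Arrays.stream(second).allMatch(HeaderInferenceSketch::isPlainText);
    return firstAllText && !secondAllText;
  }

  public static void main(String[] args) {
    // a,b,c over 1,2,3 -> true: the first row becomes the headers
    System.out.println(useFirstRowAsHeaders(new String[] {"a", "b", "c"}, new String[] {"1", "2", "3"}));
    // a,b over c,d -> false: both rows are data, Column_1... names are generated
    System.out.println(useFirstRowAsHeaders(new String[] {"a", "b"}, new String[] {"c", "d"}));
  }
}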
@@ -34,7 +34,7 @@ public class QuoteStrippingParser extends IncrementalDatatypeParser {
   }
 
   @Override
-  protected Builder makeBuilderWithCapacity(long capacity) {
-    return new StringBuilder((int) capacity);
+  protected Builder makeBuilderWithCapacity(int capacity) {
+    return new StringBuilder(capacity);
   }
 }
test/Table_Tests/data/all_text.csv (new file)
@@ -0,0 +1,4 @@
+a,b
+c,d
+e,f
+g,h
test/Table_Tests/data/data_small.csv
@@ -1,4 +1,4 @@
-Code,Index,Flag,Value,ValueWithNothing,TextWithNothing,Hexadecimal,Leading0s,QuotedNumbers,Mixed
+Code,Index,Flag,Value,ValueWithNothing,TextWithNothing,"Hexadecimal",Leading0s,QuotedNumbers,"Mixed Types"
 gxl,7,True,38.76109,63.13, pq6igd2wyd ,4DD4675B,001,"1","33"
 wca,0,False,-66.77495,31," 2pr4102wc4 ",,002,"2",
 nfw,1, True , 88.65713 ,-68.71,"",01896EAB,123,,45
test/Table_Tests/data/numbers_in_header.csv (new file)
@@ -0,0 +1,2 @@
+a,"b",0
+1,2,3
test/Table_Tests/data/one_row.csv (new file)
@@ -0,0 +1 @@
+x,y,z
test/Table_Tests/data/quoted_numbers_in_header.csv (new file)
@@ -0,0 +1,2 @@
+"1",x
+y,2
test/Table_Tests/data/two_rows1.csv (new file)
@@ -0,0 +1,2 @@
+a,b,c
+x,,
test/Table_Tests/data/two_rows2.csv (new file)
@@ -0,0 +1,2 @@
+a,b,c
+d,e,f
@@ -42,7 +42,51 @@ spec =
         table.at "Column_1" . to_vector . should_equal ["4"]
         table.at "d" . to_vector . should_equal ["5"]
 
-    Test.specify "load even an empty file" <|
+    Test.specify "should infer headers based on the first two rows" <|
+        t1 = File.read (Enso_Project.data / "data_small.csv") (File_Format.Delimited "," headers=File_Format.Infer)
+        t1.columns.map .name . should_equal ["Code", "Index", "Flag", "Value", "ValueWithNothing", "TextWithNothing", "Hexadecimal", "Leading0s", "QuotedNumbers", "Mixed Types"]
+
+        t2 = File.read (Enso_Project.data / "all_text.csv") (File_Format.Delimited "," headers=File_Format.Infer)
+        t2.columns.map .name . should_equal ["Column_1", "Column_2"]
+        t2.at "Column_1" . to_vector . should_equal ["a", "c", "e", "g"]
+        t2.at "Column_2" . to_vector . should_equal ["b", "d", "f", "h"]
+
+        t3 = File.read (Enso_Project.data / "two_rows1.csv") (File_Format.Delimited "," headers=File_Format.Infer)
+        t3.columns.map .name . should_equal ["a", "b", "c"]
+        t3.at "a" . to_vector . should_equal ["x"]
+        t3.at "b" . to_vector . should_equal [Nothing]
+        t3.at "c" . to_vector . should_equal [Nothing]
+
+        t4 = File.read (Enso_Project.data / "two_rows2.csv") (File_Format.Delimited "," headers=File_Format.Infer)
+        t4.columns.map .name . should_equal ["Column_1", "Column_2", "Column_3"]
+        t4.at "Column_1" . to_vector . should_equal ["a", "d"]
+        t4.at "Column_2" . to_vector . should_equal ["b", "e"]
+        t4.at "Column_3" . to_vector . should_equal ["c", "f"]
+
+        t5 = File.read (Enso_Project.data / "numbers_in_header.csv") (File_Format.Delimited "," headers=File_Format.Infer)
+        t5.columns.map .name . should_equal ["Column_1", "Column_2", "Column_3"]
+        t5.at "Column_1" . to_vector . should_equal ["a", "1"]
+        t5.at "Column_2" . to_vector . should_equal ["b", "2"]
+        t5.at "Column_3" . to_vector . should_equal [0, 3]
+
+        t6 = File.read (Enso_Project.data / "quoted_numbers_in_header.csv") (File_Format.Delimited "," headers=File_Format.Infer)
+        t6.columns.map .name . should_equal ["1", "x"]
+        t6.at "1" . to_vector . should_equal ["y"]
+        t6.at "x" . to_vector . should_equal [2]
+
+    Test.specify "should not use the first row as headers if it is the only row, unless specifically asked to" <|
+        t1 = File.read (Enso_Project.data / "one_row.csv") (File_Format.Delimited "," headers=File_Format.Infer)
+        t1.columns.map .name . should_equal ["Column_1", "Column_2", "Column_3"]
+        t1.at "Column_1" . to_vector . should_equal ["x"]
+        t1.at "Column_2" . to_vector . should_equal ["y"]
+        t1.at "Column_3" . to_vector . should_equal ["z"]
+
+        t2 = File.read (Enso_Project.data / "one_row.csv") (File_Format.Delimited "," headers=True)
+        t2.columns.map .name . should_equal ["x", "y", "z"]
+        t2.row_count . should_equal 0
+        t2.at "x" . to_vector . should_equal []
+
+    Test.specify "should be able to load even an empty file" <|
         table = File.read (Enso_Project.data / "empty.txt") (File_Format.Delimited "," headers=True value_formatter=Nothing)
         table.columns.map .name . should_equal []
         table.row_count . should_equal 0
@@ -251,17 +295,20 @@ spec =
         t.at "Hexadecimal" . to_vector . should_equal ["4DD4675B", Nothing, "01896EAB", "F32E1EFE"]
         t.at "Leading0s" . to_vector . should_equal ["001", "002", "123", Nothing]
         t.at "QuotedNumbers" . to_vector . should_equal ["1", "2", Nothing, "34"]
-        t.at "Mixed" . to_vector . should_equal ["33", Nothing, "45", "True"]
+        t.at "Mixed Types" . to_vector . should_equal ["33", Nothing, "45", "True"]
 
         t2 = (Enso_Project.data / "data_small.csv") . read (File_Format.Delimited "," headers=True value_formatter=(Data_Formatter allow_leading_zeros=True))
        t2.at "Leading0s" . to_vector . should_equal [1, 2, 123, Nothing]
 
-    Test.specify "should be able to detect known types automatically" <|
-        ## TODO update this once headers are inferred (next PR)
+    Test.specify "should be able to detect types automatically" <|
         t1 = (Enso_Project.data / "data_small.csv") . read
-        t1.at "Column_1" . to_vector . should_equal ["Code", "gxl", "wca", "nfw", "der"]
+        t1.at "Code" . to_vector . should_equal ["gxl", "wca", "nfw", "der"]
+        t1.at "Index" . to_vector . should_equal [7, 0, 1, 7]
 
         t2 = (Enso_Project.data / "sample.tsv") . read
-        t2.at "Column_1" . to_vector . should_equal ["a", "1", "4"]
+        t2.at "a" . to_vector . should_equal [1, 4]
+        t2.at "b" . to_vector . should_equal [2, 5]
+        t2.at "c" . to_vector . should_equal [3, 6]
+        t2.columns.map .name . should_equal ["a", "b", "c"]
 
 main = Test.Suite.run_main here.spec