diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f63ecc740f..c8da00f6552 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -126,6 +126,8 @@ instance][3460] - [Implemented automatic type detection for `Table.parse_values`.][3462] - [Integrated value parsing with the `Delimited` file reader.][3463] +- [Implemented the `Infer` setting for headers in the `Delimited` file format + and made it the default.][3472] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -196,6 +198,7 @@ [3460]: https://github.com/enso-org/enso/pull/3460 [3462]: https://github.com/enso-org/enso/pull/3462 [3463]: https://github.com/enso-org/enso/pull/3463 +[3472]: https://github.com/enso-org/enso/pull/3472 #### Enso Compiler diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso index e26ca2289ea..374c0063d2f 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso @@ -89,10 +89,10 @@ Data_Formatter.make_datatype_parser datatype = case datatype of Error.throw (Illegal_Argument_Error "Unsupported datatype: "+datatype.to_text) ## PRIVATE -Data_Formatter.get_parsers = +Data_Formatter.get_specific_type_parsers = [this.make_integer_parser, this.make_decimal_parser, this.make_datetime_parser, this.make_date_parser, this.make_time_parser, this.make_boolean_parser] ## PRIVATE Data_Formatter.make_auto_parser = fallback_parser = this.make_identity_parser - TypeInferringParser.new this.get_parsers.to_array fallback_parser + TypeInferringParser.new this.get_specific_type_parsers.to_array fallback_parser diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso index 517d0a84313..fc7ea66830c 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso @@ -64,7 +64,7 @@ read_stream : Delimited -> InputStream -> Problem_Behavior -> File | Nothing -> read_stream format java_stream on_problems max_columns=4096 related_file=Nothing = java_headers = case format.headers of True -> DelimitedReader.HeaderBehavior.USE_FIRST_ROW_AS_HEADERS - Infer -> Errors.unimplemented "Inferring headers is not implemented yet." 
+ Infer -> DelimitedReader.HeaderBehavior.INFER False -> DelimitedReader.HeaderBehavior.GENERATE_HEADERS skip_rows = case format.skip_rows of Nothing -> 0 @@ -103,8 +103,11 @@ read_stream format java_stream on_problems max_columns=4096 related_file=Nothing QuoteStrippingParser.new format.quote value_parser = if format.value_formatter.is_nothing then base_parser else wrapped = format.value_formatter.wrap_base_parser base_parser - TypeInferringParser.new format.value_formatter.get_parsers.to_array wrapped - reader = DelimitedReader.new reporting_stream_decoder format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns value_parser format.keep_invalid_rows warnings_as_errors + TypeInferringParser.new format.value_formatter.get_specific_type_parsers.to_array wrapped + cell_type_guesser = if format.headers != Infer then Nothing else + formatter = format.value_formatter.if_nothing Data_Formatter + TypeInferringParser.new formatter.get_specific_type_parsers.to_array IdentityParser.new + reader = DelimitedReader.new reporting_stream_decoder format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows warnings_as_errors result = Table.Table reader.read decoding_problems = Vector.Vector reporting_stream_decoder.getReportedProblems . map Encoding_Error parsing_problems = Vector.Vector reader.getReportedProblems . map translate_parsing_problem diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso index 5d23258acc9..5079c83e5ed 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso @@ -87,9 +87,8 @@ type Delimited set to `False`, the column names are generated by adding increasing numeric suffixes to the base name `Column` (i.e. `Column_1`, `Column_2` etc.). If set to `Infer`, the process tries to infer if - headers are present on the first row (`Infer` is not implemented yet). - If the column names are not unique, numeric suffixes will be appended - to disambiguate them. + headers are present on the first row. If the column names are not + unique, numeric suffixes will be appended to disambiguate them. - skip_rows: The number of rows to skip from the top of the file. - row_limit: The maximum number of rows to read from the file. This count does not include the header row (if applicable). @@ -98,12 +97,7 @@ type Delimited - keep_invalid_rows: Specifies whether rows that contain less or more columns than expected should be kept (setting the missing columns to `Nothing` or dropping the excess columns) or dropped. - - TODO [RW] The default for `headers` is temporarily changed to `False`, - because `Infer` is not supported. 
It should be changed to be the default - value once the corrresponding task is implemented: - https://www.pivotaltracker.com/story/show/181986831 - type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (headers:True|False|Infer=False) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True) + type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (headers:True|False|Infer=Infer) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True) ## Implements the `File.read` for this `File_Format` read : File -> Problem_Behavior -> Any diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/BaseTimeParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/BaseTimeParser.java index 0caeba50def..cb13bad328d 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/BaseTimeParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/BaseTimeParser.java @@ -38,9 +38,9 @@ public abstract class BaseTimeParser extends IncrementalDatatypeParser { } @Override - protected Builder makeBuilderWithCapacity(long capacity) { + protected Builder makeBuilderWithCapacity(int capacity) { // Once datetime gets first-class support in our dataframes, a more specific builder type should // be used. - return new ObjectBuilder((int) capacity); + return new ObjectBuilder(capacity); } } diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/BooleanParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/BooleanParser.java index 75bf818494b..de284c10e96 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/BooleanParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/BooleanParser.java @@ -33,7 +33,7 @@ public class BooleanParser extends IncrementalDatatypeParser { } @Override - protected Builder makeBuilderWithCapacity(long capacity) { - return new BoolBuilder((int) capacity); + protected Builder makeBuilderWithCapacity(int capacity) { + return new BoolBuilder(capacity); } } diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/DatatypeParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/DatatypeParser.java index b2a4f52033c..3cb87741ca2 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/DatatypeParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/DatatypeParser.java @@ -2,13 +2,26 @@ package org.enso.table.parsing; import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.StringStorage; +import org.enso.table.parsing.problems.ProblemAggregator; import org.enso.table.read.WithProblems; /** A base type for a parser capable of parsing a column of text values into some other type. */ -public interface DatatypeParser { +public abstract class DatatypeParser { + /** + * Parses a single cell. 
+ * + * @param text the text contents to parse, it will never be null in the default implementation - + * null values are just passed as-is without any parsing attempts by default + * @param problemAggregator an instance of the problem aggregator, used for reporting parsing + * problems + * @return the parsed value or null if the value could not be parsed or could be parsed but should + * be treated as missing value + */ + protected abstract Object parseSingleValue(String text, ProblemAggregator problemAggregator); + /** * Parses a column of texts (represented as a {@code StringStorage}) and returns a new storage, * containing the parsed elements. */ - WithProblems parseColumn(String columnName, StringStorage sourceStorage); + public abstract WithProblems parseColumn(String columnName, StringStorage sourceStorage); } diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/DecimalParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/DecimalParser.java index 38bd979ab8f..3c36a2c6917 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/DecimalParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/DecimalParser.java @@ -84,7 +84,7 @@ public class DecimalParser extends IncrementalDatatypeParser { } @Override - protected Builder makeBuilderWithCapacity(long capacity) { - return NumericBuilder.createDoubleBuilder((int) capacity); + protected Builder makeBuilderWithCapacity(int capacity) { + return NumericBuilder.createDoubleBuilder(capacity); } } diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/IdentityParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/IdentityParser.java index ff206bb26c8..5baeb28962a 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/IdentityParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/IdentityParser.java @@ -11,13 +11,13 @@ import org.enso.table.read.WithProblems; public class IdentityParser extends IncrementalDatatypeParser { @Override - public Object parseSingleValue(String text, ProblemAggregator problemAggregator) { + protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) { return text; } @Override - public StringBuilder makeBuilderWithCapacity(long capacity) { - return new StringBuilder((int) capacity); + public StringBuilder makeBuilderWithCapacity(int capacity) { + return new StringBuilder(capacity); } @Override diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/IncrementalDatatypeParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/IncrementalDatatypeParser.java index 107ec632695..56fb44ca0a6 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/IncrementalDatatypeParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/IncrementalDatatypeParser.java @@ -3,7 +3,7 @@ package org.enso.table.parsing; import org.enso.table.data.column.builder.object.Builder; import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.StringStorage; -import org.enso.table.parsing.problems.ProblemAggregator; +import org.enso.table.parsing.problems.ProblemAggregatorImpl; import org.enso.table.read.WithProblems; /** @@ -12,20 +12,7 @@ import org.enso.table.read.WithProblems; *
<p>
It specifies the strategy for parsing text cells into some target type, reporting issues and * building the resulting table column. */ -public abstract class IncrementalDatatypeParser implements DatatypeParser { - - /** - * Parses a single cell. - * - * @param text the text contents to parse, it will never be null in the default implementation - - * null values are just passed as-is without any parsing attempts by default - * @param problemAggregator an instance of the problem aggregator, used for reporting parsing - * problems - * @return the parsed value or null if the value could not be parsed or could be parsed but should - * be treated as missing value - */ - protected abstract Object parseSingleValue(String text, ProblemAggregator problemAggregator); - +public abstract class IncrementalDatatypeParser extends DatatypeParser { /** * Creates a new column builder expecting the specific datatype, with a specified capacity. * @@ -36,12 +23,15 @@ public abstract class IncrementalDatatypeParser implements DatatypeParser { * builder returned here expects - it should never return a value that cannot be accepted by the * builder. */ - protected abstract Builder makeBuilderWithCapacity(long capacity); + protected abstract Builder makeBuilderWithCapacity(int capacity); - @Override + /** + * Parses a column of texts (represented as a {@code StringStorage}) and returns a new storage, + * containing the parsed elements. + */ public WithProblems parseColumn(String columnName, StringStorage sourceStorage) { Builder builder = makeBuilderWithCapacity(sourceStorage.size()); - var aggregator = new ProblemAggregator(columnName); + var aggregator = new ProblemAggregatorImpl(columnName); for (int i = 0; i < sourceStorage.size(); ++i) { String cell = sourceStorage.getItem(i); diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/IntegerParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/IntegerParser.java index 18a78365baf..04df8485706 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/IntegerParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/IntegerParser.java @@ -55,7 +55,7 @@ public class IntegerParser extends IncrementalDatatypeParser { } @Override - protected Builder makeBuilderWithCapacity(long capacity) { - return NumericBuilder.createLongBuilder((int) capacity); + protected Builder makeBuilderWithCapacity(int capacity) { + return NumericBuilder.createLongBuilder(capacity); } } diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/TypeInferringParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/TypeInferringParser.java index 15090b0fa58..9d0c71912f9 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/TypeInferringParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/TypeInferringParser.java @@ -4,6 +4,8 @@ import org.enso.table.data.column.builder.object.Builder; import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.StringStorage; import org.enso.table.parsing.problems.ProblemAggregator; +import org.enso.table.parsing.problems.ProblemAggregatorImpl; +import org.enso.table.parsing.problems.SimplifiedProblemAggregator; import org.enso.table.read.WithProblems; /** @@ -13,7 +15,7 @@ import org.enso.table.read.WithProblems; *
<p>
If all parsers from the set reported problems, the fallback parser is used and its result is * returned regardless of any problems. */ -public class TypeInferringParser implements DatatypeParser { +public class TypeInferringParser extends DatatypeParser { private final IncrementalDatatypeParser[] baseParsers; private final DatatypeParser fallbackParser; @@ -24,12 +26,25 @@ public class TypeInferringParser implements DatatypeParser { this.fallbackParser = fallbackParser; } + @Override + public Object parseSingleValue(String text, ProblemAggregator problemAggregator) { + for (IncrementalDatatypeParser parser : baseParsers) { + SimplifiedProblemAggregator internal = new SimplifiedProblemAggregator(); + Object result = parser.parseSingleValue(text, internal); + if (!internal.hasProblems()) { + return result; + } + } + + return fallbackParser.parseSingleValue(text, problemAggregator); + } + @Override public WithProblems parseColumn(String columnName, StringStorage sourceStorage) { parsers: for (IncrementalDatatypeParser parser : baseParsers) { Builder builder = parser.makeBuilderWithCapacity(sourceStorage.size()); - var aggregator = new ProblemAggregator(columnName); + var aggregator = new ProblemAggregatorImpl(columnName); for (int i = 0; i < sourceStorage.size(); ++i) { String cell = sourceStorage.getItem(i); diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/WhitespaceStrippingParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/WhitespaceStrippingParser.java index 6108746e138..092eedfe9c1 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/WhitespaceStrippingParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/WhitespaceStrippingParser.java @@ -21,7 +21,7 @@ public class WhitespaceStrippingParser extends IncrementalDatatypeParser { } @Override - protected Builder makeBuilderWithCapacity(long capacity) { + protected Builder makeBuilderWithCapacity(int capacity) { return innerParser.makeBuilderWithCapacity(capacity); } } diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/problems/NoOpProblemAggregator.java b/std-bits/table/src/main/java/org/enso/table/parsing/problems/NoOpProblemAggregator.java new file mode 100644 index 00000000000..ca56dea3b62 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/parsing/problems/NoOpProblemAggregator.java @@ -0,0 +1,26 @@ +package org.enso.table.parsing.problems; + +import java.util.List; + +/** A problem aggregator which ignores problems. 
*/ +public class NoOpProblemAggregator implements ProblemAggregator { + + @Override + public void reportInvalidFormat(String cell) {} + + @Override + public void reportLeadingZeroes(String cell) {} + + @Override + public void reportMismatchedQuote() {} + + @Override + public boolean hasProblems() { + throw new IllegalStateException("This implementation does not provide problem information."); + } + + @Override + public List getAggregatedProblems() { + throw new IllegalStateException("This implementation does not provide problem information."); + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregator.java b/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregator.java index 2792653fc27..f39ac791a9f 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregator.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregator.java @@ -1,42 +1,23 @@ package org.enso.table.parsing.problems; -import java.util.ArrayList; import java.util.List; -/** - * An aggregator for parsing problems. - * - *
<p>
Each strategy exposes a method that returns a summary of the problems. The particular methods - * for reporting each problem are defined in particular subclasses. - */ -public class ProblemAggregator { - - private final List<String> invalidFormatCells = new ArrayList<>(); - private final List<String> leadingZerosCells = new ArrayList<>(); - private int mismatchedQuotes = 0; - private final String relatedColumnName; - - public ProblemAggregator(String relatedColumnName) { - this.relatedColumnName = relatedColumnName; - } +/** An aggregator for parsing problems. */ +public interface ProblemAggregator { /** * Reports a cell with an invalid format. * - *
<p>
The reports are aggregated and finally a single problem containing all invalid cell for the + *
<p>
The reports are aggregated and finally a single problem containing all invalid cells for the * given column is reported. */ - public void reportInvalidFormat(String cell) { - invalidFormatCells.add(cell); - } + void reportInvalidFormat(String cell); - public void reportLeadingZeroes(String cell) { - leadingZerosCells.add(cell); - } + /** Reports a cell containing unexpected leading zeros. */ + void reportLeadingZeroes(String cell); - public void reportMismatchedQuote() { - mismatchedQuotes++; - } + /** Reports that a mismatched quote has been encountered. */ + void reportMismatchedQuote(); /** * Checks if there are any problems already reported. @@ -44,28 +25,8 @@ public class ProblemAggregator { *
<p>
This method returns true if and only if {@code getAggregatedProblems} would return a * non-empty list. */ - public boolean hasProblems() { - return !invalidFormatCells.isEmpty() || !leadingZerosCells.isEmpty() || mismatchedQuotes > 0; - } + boolean hasProblems(); /** Return an aggregated summary of problems that have been reported. */ - public List getAggregatedProblems() { - List problems = new ArrayList<>(); - - if (!invalidFormatCells.isEmpty()) { - problems.add(new InvalidFormat(relatedColumnName, invalidFormatCells)); - } - - if (!leadingZerosCells.isEmpty()) { - problems.add(new LeadingZeros(relatedColumnName, leadingZerosCells)); - } - - for (int i = 0; i < mismatchedQuotes; ++i) { - problems.add(new MismatchedQuote()); - } - - assert problems.isEmpty() == !hasProblems(); - - return problems; - } + List getAggregatedProblems(); } diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregatorImpl.java b/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregatorImpl.java new file mode 100644 index 00000000000..00419a80c29 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/parsing/problems/ProblemAggregatorImpl.java @@ -0,0 +1,56 @@ +package org.enso.table.parsing.problems; + +import java.util.ArrayList; +import java.util.List; + +public class ProblemAggregatorImpl implements ProblemAggregator { + public final String relatedColumnName; + private final List invalidFormatCells = new ArrayList<>(); + private final List leadingZerosCells = new ArrayList<>(); + private int mismatchedQuotes = 0; + + public ProblemAggregatorImpl(String relatedColumnName) { + this.relatedColumnName = relatedColumnName; + } + + @Override + public void reportInvalidFormat(String cell) { + invalidFormatCells.add(cell); + } + + @Override + public void reportLeadingZeroes(String cell) { + leadingZerosCells.add(cell); + } + + @Override + public void reportMismatchedQuote() { + mismatchedQuotes++; + } + + @Override + public boolean hasProblems() { + return !invalidFormatCells.isEmpty() || !leadingZerosCells.isEmpty() || mismatchedQuotes > 0; + } + + @Override + public List getAggregatedProblems() { + List problems = new ArrayList<>(); + + if (!invalidFormatCells.isEmpty()) { + problems.add(new InvalidFormat(relatedColumnName, invalidFormatCells)); + } + + if (!leadingZerosCells.isEmpty()) { + problems.add(new LeadingZeros(relatedColumnName, leadingZerosCells)); + } + + for (int i = 0; i < mismatchedQuotes; ++i) { + problems.add(new MismatchedQuote()); + } + + assert problems.isEmpty() == !hasProblems(); + + return problems; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/problems/SimplifiedProblemAggregator.java b/std-bits/table/src/main/java/org/enso/table/parsing/problems/SimplifiedProblemAggregator.java new file mode 100644 index 00000000000..6977288bfb4 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/parsing/problems/SimplifiedProblemAggregator.java @@ -0,0 +1,33 @@ +package org.enso.table.parsing.problems; + +import java.util.List; + +public class SimplifiedProblemAggregator implements ProblemAggregator { + + private boolean hasProblems = false; + + @Override + public void reportInvalidFormat(String cell) { + hasProblems = true; + } + + @Override + public void reportLeadingZeroes(String cell) { + hasProblems = true; + } + + @Override + public void reportMismatchedQuote() { + hasProblems = true; + } + + @Override + public boolean hasProblems() { + return hasProblems; + } + + @Override + public List 
getAggregatedProblems() { + throw new IllegalStateException("Problem aggregation is not available in this implementation."); + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java index 69b34856b9f..904b31e09de 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -15,31 +15,20 @@ import org.enso.table.data.index.DefaultIndex; import org.enso.table.data.table.Column; import org.enso.table.data.table.Table; import org.enso.table.parsing.DatatypeParser; +import org.enso.table.parsing.TypeInferringParser; import org.enso.table.parsing.problems.AdditionalInvalidRows; import org.enso.table.parsing.problems.InvalidRow; import org.enso.table.parsing.problems.MismatchedQuote; +import org.enso.table.parsing.problems.NoOpProblemAggregator; import org.enso.table.parsing.problems.ParsingProblem; import org.enso.table.util.NameDeduplicator; /** A helper for reading delimited (CSV-like) files. */ public class DelimitedReader { - /** Specifies how to set the headers for the returned table. */ - public enum HeaderBehavior { - /** Tries to infer if the headers are present in the file. */ - INFER, - - /** Uses the first row in the file as headers. Duplicate names will be appended suffixes. */ - USE_FIRST_ROW_AS_HEADERS, - - /** - * Treats the first row as data and generates header names starting with {@code COLUMN_NAME}. - */ - GENERATE_HEADERS - } - private static final String COLUMN_NAME = "Column"; - + private static final char noQuoteCharacter = '\0'; + private static final long invalidRowsLimit = 10; private final char delimiter; private final char quoteCharacter; private final char quoteEscapeCharacter; @@ -50,10 +39,15 @@ public class DelimitedReader { private final List warnings = new ArrayList<>(); private final CsvParser parser; private final DatatypeParser valueParser; + private final TypeInferringParser cellTypeGuesser; private final boolean keepInvalidRows; private final boolean warningsAsErrors; - - private static final char noQuoteCharacter = '\0'; + private final NoOpProblemAggregator noOpProblemAggregator = new NoOpProblemAggregator(); + private long invalidRowsCount = 0; + private long targetTableIndex = 0; + /** The line number of the start of the current row in the input file. */ + private long currentLine = 0; + private StringStorageBuilder[] builders = null; /** * Creates a new reader. @@ -74,6 +68,8 @@ public class DelimitedReader { * @param maxColumns specifies how many columns can be expected at most * @param valueParser an optional parser that is applied to each column to convert it to more * specific datatype + * @param cellTypeGuesser a helper used to guess cell types, used for the purpose of inferring the + * headers, it must not be null if {@code headerBehavior} is set to {@code INFER}. 
* @param keepInvalidRows specifies whether to keep rows that had an unexpected number of columns * @param warningsAsErrors specifies if the first warning should be immediately raised as an error * (used as a fast-path for the error-reporting mode to avoid computing a value that is going @@ -89,6 +85,7 @@ public class DelimitedReader { long rowLimit, int maxColumns, DatatypeParser valueParser, + TypeInferringParser cellTypeGuesser, boolean keepInvalidRows, boolean warningsAsErrors) { if (delimiter.isEmpty()) { @@ -142,6 +139,7 @@ public class DelimitedReader { this.warningsAsErrors = warningsAsErrors; this.valueParser = valueParser; + this.cellTypeGuesser = cellTypeGuesser; parser = setupCsvParser(input); } @@ -174,9 +172,6 @@ public class DelimitedReader { reportProblem(new MismatchedQuote()); } - private long invalidRowsCount = 0; - private static final long invalidRowsLimit = 10; - private void reportInvalidRow(long source_row, Long table_index, String[] row) { if (invalidRowsCount < invalidRowsLimit) { reportProblem(new InvalidRow(source_row, table_index, row)); @@ -203,29 +198,89 @@ public class DelimitedReader { } } - private long target_table_index = 0; - - /** The line number of the start of the current row in the input file. */ - private long current_line = 0; - /** * Reads the next row and updates the current line accordingly. * *
<p>
Will return {@code null} if no more rows are available. */ - private String[] nextRow() { - current_line = parser.getContext().currentLine() + 1; + private String[] readNextRow() { + currentLine = parser.getContext().currentLine() + 1; return parser.parseNext(); } + private void appendRow(String[] row) { + assert builders != null; + assert canFitMoreRows(); + + if (row.length != builders.length) { + reportInvalidRow(currentLine, keepInvalidRows ? targetTableIndex : null, row); + + if (keepInvalidRows) { + for (int i = 0; i < builders.length && i < row.length; i++) { + builders[i] = builders[i].parseAndAppend(row[i]); + } + + // If the current row had less columns than expected, nulls are inserted for the missing + // values. + // If it had more columns, the excess columns are discarded. + for (int i = row.length; i < builders.length; i++) { + builders[i] = builders[i].parseAndAppend(null); + } + + targetTableIndex++; + } + } else { + for (int i = 0; i < builders.length; i++) { + builders[i] = builders[i].parseAndAppend(row[i]); + } + + targetTableIndex++; + } + } + + private boolean canFitMoreRows() { + return rowLimit < 0 || targetTableIndex < rowLimit; + } + + private void appendRowIfLimitPermits(String[] row) { + if (canFitMoreRows()) { + appendRow(row); + } + } + + private List headersFromRow(String[] row) { + List preprocessedHeaders = + Arrays.stream(row).map(this::parseHeader).collect(Collectors.toList()); + return NameDeduplicator.deduplicate(preprocessedHeaders, "_"); + } + + private List generateDefaultHeaders(int columnCount) { + ArrayList headerNames = new ArrayList<>(columnCount); + for (int i = 0; i < columnCount; ++i) { + headerNames.add(COLUMN_NAME + "_" + (i + 1)); + } + return headerNames; + } + + /** + * Checks if the given cell contains just plain text that is not null and is not convertible to + * any more specific type according to the {@code cellTypeGuesser}. This is used for checking the + * types when inferring the headers. + */ + private boolean isPlainText(String cell) { + if (cell == null) return false; + Object parsed = cellTypeGuesser.parseSingleValue(cell, noOpProblemAggregator); + return parsed instanceof String; + } + /** Reads the input stream and returns a Table. */ public Table read() { List headerNames; - String[] currentRow = nextRow(); + String[] currentRow = readNextRow(); // Skip the first N rows. for (long i = 0; currentRow != null && i < skipRows; ++i) { - currentRow = nextRow(); + currentRow = readNextRow(); } // If there are no rows to even infer the headers, we return an empty table. @@ -233,55 +288,50 @@ public class DelimitedReader { return new Table(new Column[0]); } + int expectedColumnCount = currentRow.length; + initBuilders(expectedColumnCount); + + assert currentRow != null; switch (headerBehavior) { - case INFER: - throw new IllegalStateException("Inferring headers is not yet implemented"); - case USE_FIRST_ROW_AS_HEADERS: - List preprocessedHeaders = - Arrays.stream(currentRow).map(this::parseHeader).collect(Collectors.toList()); - headerNames = NameDeduplicator.deduplicate(preprocessedHeaders, "_"); - // We have 'used up' the first row, so we load a next one. 
- currentRow = nextRow(); - break; - case GENERATE_HEADERS: - headerNames = new ArrayList<>(currentRow.length); - for (int i = 0; i < currentRow.length; ++i) { - headerNames.add(COLUMN_NAME + "_" + (i + 1)); + case INFER -> { + String[] firstRow = currentRow; + String[] secondRow = readNextRow(); + if (secondRow == null) { + // If there is only one row in the file, we generate the headers and stop further processing (as nothing more to process). + headerNames = generateDefaultHeaders(expectedColumnCount); + appendRowIfLimitPermits(firstRow); + currentRow = null; + } else { + assert cellTypeGuesser != null; + boolean firstAllText = Arrays.stream(firstRow).allMatch(this::isPlainText); + boolean secondAllText = Arrays.stream(secondRow).allMatch(this ::isPlainText); + boolean useFirstRowAsHeader = firstAllText && !secondAllText; + if (useFirstRowAsHeader) { + headerNames = headersFromRow(firstRow); + appendRowIfLimitPermits(secondRow); + } else { + headerNames = generateDefaultHeaders(expectedColumnCount); + appendRowIfLimitPermits(firstRow); + appendRowIfLimitPermits(secondRow); + } + + currentRow = readNextRow(); } - break; - default: - throw new IllegalStateException("Impossible branch."); + } + case USE_FIRST_ROW_AS_HEADERS -> { + headerNames = headersFromRow(currentRow); + // We have 'used up' the first row, so we load a next one. + currentRow = readNextRow(); + } + case GENERATE_HEADERS -> { + headerNames = generateDefaultHeaders(expectedColumnCount); + } + default -> throw new IllegalStateException("Impossible branch."); } - StringStorageBuilder[] builders = initBuilders(headerNames.size()); - - while (currentRow != null && (rowLimit < 0 || target_table_index < rowLimit)) { - if (currentRow.length != builders.length) { - reportInvalidRow(current_line, keepInvalidRows ? target_table_index : null, currentRow); - - if (keepInvalidRows) { - for (int i = 0; i < builders.length && i < currentRow.length; i++) { - builders[i] = builders[i].parseAndAppend(currentRow[i]); - } - - // If the current row had less columns than expected, nulls are inserted for the missing - // values. - // If it had more columns, the excess columns are discarded. - for (int i = currentRow.length; i < builders.length; i++) { - builders[i] = builders[i].parseAndAppend(null); - } - - target_table_index++; - } - } else { - for (int i = 0; i < builders.length; i++) { - builders[i] = builders[i].parseAndAppend(currentRow[i]); - } - - target_table_index++; - } - - currentRow = nextRow(); + while (currentRow != null && canFitMoreRows()) { + appendRow(currentRow); + currentRow = readNextRow(); } parser.stopParsing(); @@ -302,11 +352,24 @@ public class DelimitedReader { return new Table(columns); } - private StringStorageBuilder[] initBuilders(int count) { - StringStorageBuilder[] res = new StringStorageBuilder[count]; + private void initBuilders(int count) { + builders = new StringStorageBuilder[count]; for (int i = 0; i < count; i++) { - res[i] = new StringStorageBuilder(); + builders[i] = new StringStorageBuilder(); } - return res; + } + + /** Specifies how to set the headers for the returned table. */ + public enum HeaderBehavior { + /** Tries to infer if the headers are present in the file. */ + INFER, + + /** Uses the first row in the file as headers. Duplicate names will be appended suffixes. */ + USE_FIRST_ROW_AS_HEADERS, + + /** + * Treats the first row as data and generates header names starting with {@code COLUMN_NAME}. 
+ */ + GENERATE_HEADERS } } diff --git a/std-bits/table/src/main/java/org/enso/table/read/QuoteStrippingParser.java b/std-bits/table/src/main/java/org/enso/table/read/QuoteStrippingParser.java index bcca379a03c..1228627c80a 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/QuoteStrippingParser.java +++ b/std-bits/table/src/main/java/org/enso/table/read/QuoteStrippingParser.java @@ -34,7 +34,7 @@ public class QuoteStrippingParser extends IncrementalDatatypeParser { } @Override - protected Builder makeBuilderWithCapacity(long capacity) { - return new StringBuilder((int) capacity); + protected Builder makeBuilderWithCapacity(int capacity) { + return new StringBuilder(capacity); } } diff --git a/test/Table_Tests/data/all_text.csv b/test/Table_Tests/data/all_text.csv new file mode 100644 index 00000000000..ae5fb0a4da7 --- /dev/null +++ b/test/Table_Tests/data/all_text.csv @@ -0,0 +1,4 @@ +a,b +c,d +e,f +g,h diff --git a/test/Table_Tests/data/data_small.csv b/test/Table_Tests/data/data_small.csv index 0619bacfe91..1dedb7d94a5 100644 --- a/test/Table_Tests/data/data_small.csv +++ b/test/Table_Tests/data/data_small.csv @@ -1,4 +1,4 @@ -Code,Index,Flag,Value,ValueWithNothing,TextWithNothing,Hexadecimal,Leading0s,QuotedNumbers,Mixed +Code,Index,Flag,Value,ValueWithNothing,TextWithNothing,"Hexadecimal",Leading0s,QuotedNumbers,"Mixed Types" gxl,7,True,38.76109,63.13, pq6igd2wyd ,4DD4675B,001,"1","33" wca,0,False,-66.77495,31," 2pr4102wc4 ",,002,"2", nfw,1, True , 88.65713 ,-68.71,"",01896EAB,123,,45 diff --git a/test/Table_Tests/data/numbers_in_header.csv b/test/Table_Tests/data/numbers_in_header.csv new file mode 100644 index 00000000000..a4f6bf116b4 --- /dev/null +++ b/test/Table_Tests/data/numbers_in_header.csv @@ -0,0 +1,2 @@ +a,"b",0 +1,2,3 diff --git a/test/Table_Tests/data/one_row.csv b/test/Table_Tests/data/one_row.csv new file mode 100644 index 00000000000..ce1c6caa2e4 --- /dev/null +++ b/test/Table_Tests/data/one_row.csv @@ -0,0 +1 @@ +x,y,z diff --git a/test/Table_Tests/data/quoted_numbers_in_header.csv b/test/Table_Tests/data/quoted_numbers_in_header.csv new file mode 100644 index 00000000000..84103844ea8 --- /dev/null +++ b/test/Table_Tests/data/quoted_numbers_in_header.csv @@ -0,0 +1,2 @@ +"1",x +y,2 diff --git a/test/Table_Tests/data/two_rows1.csv b/test/Table_Tests/data/two_rows1.csv new file mode 100644 index 00000000000..4b675e9fe72 --- /dev/null +++ b/test/Table_Tests/data/two_rows1.csv @@ -0,0 +1,2 @@ +a,b,c +x,, diff --git a/test/Table_Tests/data/two_rows2.csv b/test/Table_Tests/data/two_rows2.csv new file mode 100644 index 00000000000..2e8f4cae1e1 --- /dev/null +++ b/test/Table_Tests/data/two_rows2.csv @@ -0,0 +1,2 @@ +a,b,c +d,e,f diff --git a/test/Table_Tests/src/Delimited_Read_Spec.enso b/test/Table_Tests/src/Delimited_Read_Spec.enso index b9f44cec100..bc186317b7d 100644 --- a/test/Table_Tests/src/Delimited_Read_Spec.enso +++ b/test/Table_Tests/src/Delimited_Read_Spec.enso @@ -42,7 +42,51 @@ spec = table.at "Column_1" . to_vector . should_equal ["4"] table.at "d" . to_vector . should_equal ["5"] - Test.specify "load even an empty file" <| + Test.specify "should infer headers based on the first two rows" <| + t1 = File.read (Enso_Project.data / "data_small.csv") (File_Format.Delimited "," headers=File_Format.Infer) + t1.columns.map .name . 
should_equal ["Code", "Index", "Flag", "Value", "ValueWithNothing", "TextWithNothing", "Hexadecimal", "Leading0s", "QuotedNumbers", "Mixed Types"] + + t2 = File.read (Enso_Project.data / "all_text.csv") (File_Format.Delimited "," headers=File_Format.Infer) + t2.columns.map .name . should_equal ["Column_1", "Column_2"] + t2.at "Column_1" . to_vector . should_equal ["a", "c", "e", "g"] + t2.at "Column_2" . to_vector . should_equal ["b", "d", "f", "h"] + + t3 = File.read (Enso_Project.data / "two_rows1.csv") (File_Format.Delimited "," headers=File_Format.Infer) + t3.columns.map .name . should_equal ["a", "b", "c"] + t3.at "a" . to_vector . should_equal ["x"] + t3.at "b" . to_vector . should_equal [Nothing] + t3.at "c" . to_vector . should_equal [Nothing] + + t4 = File.read (Enso_Project.data / "two_rows2.csv") (File_Format.Delimited "," headers=File_Format.Infer) + t4.columns.map .name . should_equal ["Column_1", "Column_2", "Column_3"] + t4.at "Column_1" . to_vector . should_equal ["a", "d"] + t4.at "Column_2" . to_vector . should_equal ["b", "e"] + t4.at "Column_3" . to_vector . should_equal ["c", "f"] + + t5 = File.read (Enso_Project.data / "numbers_in_header.csv") (File_Format.Delimited "," headers=File_Format.Infer) + t5.columns.map .name . should_equal ["Column_1", "Column_2", "Column_3"] + t5.at "Column_1" . to_vector . should_equal ["a", "1"] + t5.at "Column_2" . to_vector . should_equal ["b", "2"] + t5.at "Column_3" . to_vector . should_equal [0, 3] + + t6 = File.read (Enso_Project.data / "quoted_numbers_in_header.csv") (File_Format.Delimited "," headers=File_Format.Infer) + t6.columns.map .name . should_equal ["1", "x"] + t6.at "1" . to_vector . should_equal ["y"] + t6.at "x" . to_vector . should_equal [2] + + Test.specify "should not use the first row as headers if it is the only row, unless specifically asked to" <| + t1 = File.read (Enso_Project.data / "one_row.csv") (File_Format.Delimited "," headers=File_Format.Infer) + t1.columns.map .name . should_equal ["Column_1", "Column_2", "Column_3"] + t1.at "Column_1" . to_vector . should_equal ["x"] + t1.at "Column_2" . to_vector . should_equal ["y"] + t1.at "Column_3" . to_vector . should_equal ["z"] + + t2 = File.read (Enso_Project.data / "one_row.csv") (File_Format.Delimited "," headers=True) + t2.columns.map .name . should_equal ["x", "y", "z"] + t2.row_count . should_equal 0 + t2.at "x" . to_vector . should_equal [] + + Test.specify "should be able to load even an empty file" <| table = File.read (Enso_Project.data / "empty.txt") (File_Format.Delimited "," headers=True value_formatter=Nothing) table.columns.map .name . should_equal [] table.row_count . should_equal 0 @@ -251,17 +295,20 @@ spec = t.at "Hexadecimal" . to_vector . should_equal ["4DD4675B", Nothing, "01896EAB", "F32E1EFE"] t.at "Leading0s" . to_vector . should_equal ["001", "002", "123", Nothing] t.at "QuotedNumbers" . to_vector . should_equal ["1", "2", Nothing, "34"] - t.at "Mixed" . to_vector . should_equal ["33", Nothing, "45", "True"] + t.at "Mixed Types" . to_vector . should_equal ["33", Nothing, "45", "True"] t2 = (Enso_Project.data / "data_small.csv") . read (File_Format.Delimited "," headers=True value_formatter=(Data_Formatter allow_leading_zeros=True)) t2.at "Leading0s" . to_vector . should_equal [1, 2, 123, Nothing] - Test.specify "should be able to detect known types automatically" <| - ## TODO update this once headers are inferred (next PR) + Test.specify "should be able to detect types automatically" <| t1 = (Enso_Project.data / "data_small.csv") . 
read - t1.at "Column_1" . to_vector . should_equal ["Code", "gxl", "wca", "nfw", "der"] + t1.at "Code" . to_vector . should_equal ["gxl", "wca", "nfw", "der"] + t1.at "Index" . to_vector . should_equal [7, 0, 1, 7] t2 = (Enso_Project.data / "sample.tsv") . read - t2.at "Column_1" . to_vector . should_equal ["a", "1", "4"] + t2.at "a" . to_vector . should_equal [1, 4] + t2.at "b" . to_vector . should_equal [2, 5] + t2.at "c" . to_vector . should_equal [3, 6] + t2.columns.map .name . should_equal ["a", "b", "c"] main = Test.Suite.run_main here.spec
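A minimal usage sketch of the new default, following the API exercised in Delimited_Read_Spec.enso above; the `example_read` name is illustrative and the import lines are assumptions derived from the module paths visible in this diff:

    from Standard.Base import all
    import Standard.Table.Io.File_Format

    example_read =
        # `headers` now defaults to `Infer`: the first row is promoted to headers only
        # when it is all text while the second row contains at least one non-text value.
        inferred = File.read (Enso_Project.data / "data_small.csv") (File_Format.Delimited ",")
        # The explicit settings keep their previous meaning:
        named = File.read (Enso_Project.data / "data_small.csv") (File_Format.Delimited "," headers=True)
        generated = File.read (Enso_Project.data / "data_small.csv") (File_Format.Delimited "," headers=False)
        [inferred, named, generated]

For `data_small.csv` the inferred variant should yield the same column names as `headers=True` ("Code", "Index", ...), as asserted by `t1` in the spec above.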