Automatic inference of value types when parsing table columns (#3462)

Implements https://www.pivotaltracker.com/story/show/182199966
This commit is contained in:
Radosław Waśko 2022-05-20 17:08:36 +02:00 committed by GitHub
parent 0073f461d9
commit ff7700ebb1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 347 additions and 200 deletions

View File

@ -124,6 +124,7 @@
specified type.][3455]
- [Promote with, take, finalize to be methods of Managed_Resource
instance][3460]
- [Implemented automatic type detection for `Table.parse_values`.][3462]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -192,6 +193,7 @@
[3457]: https://github.com/enso-org/enso/pull/3457
[3455]: https://github.com/enso-org/enso/pull/3455
[3460]: https://github.com/enso-org/enso/pull/3460
[3462]: https://github.com/enso-org/enso/pull/3462
#### Enso Compiler

View File

@ -670,10 +670,10 @@ type Table
## Parsing values is not supported in database tables, the table has to be
materialized first with `to_dataframe`.
parse_values : Data_Formatter -> (Nothing | [Column_Type_Selection]) -> Problem_Behavior -> Table
parse_values parser=Data_Formatter column_types=Nothing on_problems=Report_Warning =
parse_values value_formatter=Data_Formatter column_types=Nothing on_problems=Report_Warning =
## Avoid unused arguments warning. We cannot rename arguments to `_`,
because we need to keep the API consistent with the in-memory table.
_ = [parser, column_types, on_problems]
_ = [value_formatter, column_types, on_problems]
msg = "Parsing values is not supported in database tables, the table has to be materialized first with `to_dataframe`."
Error.throw (Unsupported_Database_Operation_Error msg)

View File

@ -1,5 +1,19 @@
from Standard.Base import all
from Standard.Base.Data.Time.Date as Date_Module import Date
from Standard.Base.Data.Time as Time_Module import Time
from Standard.Base.Data.Time.Time_Of_Day as Time_Of_Day_Module import Time_Of_Day
polyglot java import org.enso.table.parsing.IntegerParser
polyglot java import org.enso.table.parsing.DecimalParser
polyglot java import org.enso.table.parsing.BooleanParser
polyglot java import org.enso.table.parsing.DateParser
polyglot java import org.enso.table.parsing.TimeParser
polyglot java import org.enso.table.parsing.DateTimeParser
polyglot java import org.enso.table.parsing.WhitespaceStrippingParser
polyglot java import org.enso.table.parsing.IdentityParser
polyglot java import org.enso.table.parsing.TypeInferringParser
## Specifies options for reading text data in a table to more specific types and
serializing them back.
@ -27,3 +41,55 @@ from Standard.Base import all
- true_values: Values representing True.
- false_values: Values representing False.
type Data_Formatter trim_values:Boolean=True allow_leading_zeros:Boolean=False decimal_point:Text='.' thousand_separator:Text='' datetime_formats:[Text]=["yyyy-MM-dd HH:mm:ss"] date_formats:[Text]=["yyyy-MM-dd"] time_formats:[Text]=["HH:mm:ss"] locale:Locale=Locale.default true_values:[Text]=["True","true","TRUE"] false_values:[Text]=["False","false","FALSE"]
## PRIVATE
Data_Formatter.get_thousand_separator = if this.thousand_separator.is_empty then Nothing else this.thousand_separator
## PRIVATE
Data_Formatter.wrap_base_parser base_parser =
if this.trim_values.not then base_parser else
WhitespaceStrippingParser.new base_parser
## PRIVATE
Data_Formatter.make_integer_parser = this.wrap_base_parser <|
IntegerParser.new this.get_thousand_separator this.allow_leading_zeros
## PRIVATE
Data_Formatter.make_decimal_parser = this.wrap_base_parser <|
DecimalParser.new this.decimal_point this.get_thousand_separator this.allow_leading_zeros
## PRIVATE
Data_Formatter.make_boolean_parser = this.wrap_base_parser <|
BooleanParser.new this.true_values.to_array this.false_values.to_array
## PRIVATE
Data_Formatter.make_date_parser = this.wrap_base_parser <|
DateParser.new this.date_formats.to_array this.locale.java_locale
## PRIVATE
Data_Formatter.make_identity_parser = this.wrap_base_parser IdentityParser.new
## PRIVATE
Data_Formatter.make_datetime_parser = this.wrap_base_parser <|
DateTimeParser.new this.datetime_formats.to_array this.locale.java_locale
## PRIVATE
Data_Formatter.make_time_parser = this.wrap_base_parser <|
TimeParser.new this.time_formats.to_array this.locale.java_locale
## PRIVATE
Data_Formatter.make_datatype_parser datatype = case datatype of
Integer -> this.make_integer_parser
Decimal -> this.make_decimal_parser
Boolean -> this.make_boolean_parser
_ ->
if datatype == Date then this.make_date_parser else
if datatype == Time then this.make_datetime_parser else
if datatype == Time_Of_Day then this.make_time_parser else
Error.throw (Illegal_Argument_Error "Unsupported datatype: "+datatype.to_text)
## PRIVATE
Data_Formatter.make_auto_parser =
parsers = [this.make_integer_parser, this.make_decimal_parser, this.make_datetime_parser, this.make_date_parser, this.make_time_parser, this.make_boolean_parser]
fallback_parser = this.make_identity_parser
TypeInferringParser.new parsers.to_array fallback_parser

View File

@ -5,8 +5,6 @@ import Standard.Table.Data.Column
import Standard.Table.Io.Csv
import Standard.Visualization
from Standard.Base.Data.Time.Date as Date_Module import Date
from Standard.Base.Data.Time as Time_Module import Time
from Standard.Base.Data.Time.Time_Of_Day as Time_Of_Day_Module import Time_Of_Day
import Standard.Table.Io.Spreadsheet_Write_Mode
import Standard.Table.Io.Format
import Standard.Table.Internal.Table_Helpers
@ -31,14 +29,6 @@ polyglot java import org.enso.table.operations.OrderBuilder
polyglot java import org.enso.table.format.csv.Writer as Csv_Writer
polyglot java import org.enso.table.format.xlsx.Writer as Spreadsheet_Writer
polyglot java import org.enso.table.parsing.IntegerParser
polyglot java import org.enso.table.parsing.DecimalParser
polyglot java import org.enso.table.parsing.BooleanParser
polyglot java import org.enso.table.parsing.DateParser
polyglot java import org.enso.table.parsing.TimeParser
polyglot java import org.enso.table.parsing.DateTimeParser
polyglot java import org.enso.table.parsing.WhitespaceStrippingParser
## Creates a new table from a vector of `[name, items]` pairs.
Arguments:
@ -553,7 +543,7 @@ type Table
a leading 0). However, settings in the `Data_Formatter` can
control this.
parse_values : Data_Formatter -> (Nothing | [Column_Type_Selection]) -> Problem_Behavior -> Table
parse_values parser=Data_Formatter column_types=Nothing on_problems=Report_Warning =
parse_values value_formatter=Data_Formatter column_types=Nothing on_problems=Report_Warning =
columns = this.columns
problem_builder = Vector.new_builder
@ -595,22 +585,9 @@ type Table
new_columns = columns.zip expected_types column-> expected_type-> case expected_type of
Nothing -> column
Auto -> Error.unimplemented "Automatic datatype inference is not implemented yet."
_ ->
parse_options = parser
thousand_separator = if parse_options.thousand_separator.is_empty then Nothing else parse_options.thousand_separator
base_parser = case expected_type of
Integer -> IntegerParser.new thousand_separator parse_options.allow_leading_zeros
Decimal -> DecimalParser.new parse_options.decimal_point thousand_separator parse_options.allow_leading_zeros
Boolean -> BooleanParser.new parse_options.true_values.to_array parse_options.false_values.to_array
_ ->
if expected_type == Date then DateParser.new parse_options.date_formats.to_array parse_options.locale.java_locale else
if expected_type == Time then DateTimeParser.new parse_options.datetime_formats.to_array parse_options.locale.java_locale else
if expected_type == Time_Of_Day then TimeParser.new parse_options.time_formats.to_array parse_options.locale.java_locale else
Error.throw (Illegal_Argument_Error "Unsupported target datatype: "+expected_type.to_text)
parser = case parse_options.trim_values of
False -> base_parser
True -> WhitespaceStrippingParser.new base_parser
parser = if expected_type == Auto then value_formatter.make_auto_parser else
value_formatter.make_datatype_parser expected_type
storage = column.java_column.getStorage
new_storage_and_problems = parser.parseColumn storage
new_storage = new_storage_and_problems.value

View File

@ -29,7 +29,7 @@ public class StringStorageBuilder extends StorageBuilder {
/** @inheritDoc */
@Override
public StorageBuilder parseAndAppend(String value) {
public StringStorageBuilder parseAndAppend(String value) {
ensureAppendable();
data[size++] = value;
return this;
@ -45,7 +45,7 @@ public class StringStorageBuilder extends StorageBuilder {
/** @inheritDoc */
@Override
public Storage seal() {
public StringStorage seal() {
return new StringStorage(data, size);
}
}

View File

@ -5,9 +5,9 @@ import java.time.format.DateTimeParseException;
import java.util.Locale;
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.builder.object.ObjectBuilder;
import org.enso.table.parsing.problems.InvalidFormatProblemAggregator;
import org.enso.table.parsing.problems.ProblemAggregator;
public abstract class BaseTimeParser extends DatatypeParser<InvalidFormatProblemAggregator> {
public abstract class BaseTimeParser extends IncrementalDatatypeParser {
protected interface ParseStrategy {
Object parse(String text, DateTimeFormatter formatter) throws DateTimeParseException;
}
@ -25,7 +25,7 @@ public abstract class BaseTimeParser extends DatatypeParser<InvalidFormatProblem
}
@Override
public Object parseSingleValue(String text, InvalidFormatProblemAggregator problemAggregator) {
protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
for (var formatter : formatters) {
try {
return parseStrategy.parse(text, formatter);
@ -38,14 +38,9 @@ public abstract class BaseTimeParser extends DatatypeParser<InvalidFormatProblem
}
@Override
public Builder makeBuilderWithCapacity(long capacity) {
protected Builder makeBuilderWithCapacity(long capacity) {
// Once datetime gets first-class support in our dataframes, a more specific builder type should
// be used.
return new ObjectBuilder((int) capacity);
}
@Override
public InvalidFormatProblemAggregator makeProblemAggregator() {
return new InvalidFormatProblemAggregator();
}
}

View File

@ -2,10 +2,10 @@ package org.enso.table.parsing;
import org.enso.table.data.column.builder.object.BoolBuilder;
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.parsing.problems.InvalidFormatProblemAggregator;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.graalvm.collections.EconomicSet;
public class BooleanParser extends DatatypeParser<InvalidFormatProblemAggregator> {
public class BooleanParser extends IncrementalDatatypeParser {
private final EconomicSet<String> trueValues;
private final EconomicSet<String> falseValues;
@ -22,7 +22,7 @@ public class BooleanParser extends DatatypeParser<InvalidFormatProblemAggregator
}
@Override
public Object parseSingleValue(String text, InvalidFormatProblemAggregator problemAggregator) {
protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
// TODO we may want to use equality checks taking Unicode Normalization into account, to be
// revised in: https://www.pivotaltracker.com/story/show/182166382
if (trueValues.contains(text)) return true;
@ -33,12 +33,7 @@ public class BooleanParser extends DatatypeParser<InvalidFormatProblemAggregator
}
@Override
public Builder makeBuilderWithCapacity(long capacity) {
protected Builder makeBuilderWithCapacity(long capacity) {
return new BoolBuilder((int) capacity);
}
@Override
public InvalidFormatProblemAggregator makeProblemAggregator() {
return new InvalidFormatProblemAggregator();
}
}

View File

@ -1,67 +1,14 @@
package org.enso.table.parsing;
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.read.WithProblems;
/**
* A base type for a datatype parsing strategy.
*
* <p>It specifies the strategy for parsing text cells into some target type, reporting issues and
* building the resulting table column.
*
* @param <PA> the specific problem aggregator type; the type is refined to be able to handle
* various strategies for aggregating problems, depending on the particular datatype
*/
public abstract class DatatypeParser<PA extends ProblemAggregator> {
/**
* Parses a single cell.
*
* @param text the text contents to parse, it will never be null in the default implementation -
* null values are just passed as-is without any parsing attempts by default
* @param problemAggregator an instance of the problem aggregator, used for reporting parsing
* problems
* @return the parsed value or null if the value could not be parsed or could be parsed but should
* be treated as missing value
*/
public abstract Object parseSingleValue(String text, PA problemAggregator);
/**
* Creates a new column builder expecting the specific datatype, with a specified capacity.
*
* <p>The {@code parseColumn} method will use {@code appendNoGrow} function, so the initial
* capacity should be set properly so that the builder can hold all expected elements.
*
* <p>The type returned from {@code parseSingleValue} should be consistent with the types that the
* builder returned here expects - it should never return a value that cannot be accepted by the
* builder.
*/
public abstract Builder makeBuilderWithCapacity(long capacity);
/** Creates a new instance of the specific problem aggregator type. */
public abstract PA makeProblemAggregator();
/** A base type for a parser capable of parsing a column of text values into some other type. */
public interface DatatypeParser {
/**
* Parses a column of texts (represented as a {@code StringStorage}) and returns a new storage,
* containing the parsed elements.
*/
public WithProblems<Storage> parseColumn(StringStorage sourceStorage) {
Builder builder = makeBuilderWithCapacity(sourceStorage.size());
PA aggregator = makeProblemAggregator();
for (int i = 0; i < sourceStorage.size(); ++i) {
String cell = sourceStorage.getItem(i);
if (cell != null) {
Object parsed = parseSingleValue(cell, aggregator);
builder.appendNoGrow(parsed);
} else {
builder.appendNoGrow(null);
}
}
return new WithProblems<>(builder.seal(), aggregator.getAggregatedProblems());
}
WithProblems<Storage> parseColumn(StringStorage sourceStorage);
}

View File

@ -4,9 +4,9 @@ import java.text.DecimalFormat;
import java.text.ParsePosition;
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.builder.object.NumericBuilder;
import org.enso.table.parsing.problems.NumericProblemAggregator;
import org.enso.table.parsing.problems.ProblemAggregator;
public class DecimalParser extends DatatypeParser<NumericProblemAggregator> {
public class DecimalParser extends IncrementalDatatypeParser {
private final String thousandsSeparator;
private final char decimalPoint;
private final DecimalFormat decimalFormat;
@ -38,7 +38,7 @@ public class DecimalParser extends DatatypeParser<NumericProblemAggregator> {
}
@Override
public Object parseSingleValue(String text, NumericProblemAggregator problemAggregator) {
protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
if (thousandsSeparator != null
&& (text.startsWith(thousandsSeparator) || text.endsWith(thousandsSeparator))) {
problemAggregator.reportInvalidFormat(text);
@ -84,12 +84,7 @@ public class DecimalParser extends DatatypeParser<NumericProblemAggregator> {
}
@Override
public Builder makeBuilderWithCapacity(long capacity) {
protected Builder makeBuilderWithCapacity(long capacity) {
return NumericBuilder.createDoubleBuilder((int) capacity);
}
@Override
public NumericProblemAggregator makeProblemAggregator() {
return new NumericProblemAggregator();
}
}

View File

@ -0,0 +1,27 @@
package org.enso.table.parsing;
import java.util.List;
import org.enso.table.data.column.builder.object.StringBuilder;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.read.WithProblems;
/** A parser that just returns its input. Useful as a fallback. */
public class IdentityParser extends IncrementalDatatypeParser {
@Override
public Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
return text;
}
@Override
public StringBuilder makeBuilderWithCapacity(long capacity) {
return new StringBuilder((int) capacity);
}
@Override
public WithProblems<Storage> parseColumn(StringStorage sourceStorage) {
return new WithProblems<>(sourceStorage, List.of());
}
}

View File

@ -0,0 +1,58 @@
package org.enso.table.parsing;
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.read.WithProblems;
/**
* A base type for a datatype parsing strategy which relies on a method parsing a single value.
*
* <p>It specifies the strategy for parsing text cells into some target type, reporting issues and
* building the resulting table column.
*/
public abstract class IncrementalDatatypeParser implements DatatypeParser {
/**
* Parses a single cell.
*
* @param text the text contents to parse, it will never be null in the default implementation -
* null values are just passed as-is without any parsing attempts by default
* @param problemAggregator an instance of the problem aggregator, used for reporting parsing
* problems
* @return the parsed value or null if the value could not be parsed or could be parsed but should
* be treated as missing value
*/
protected abstract Object parseSingleValue(String text, ProblemAggregator problemAggregator);
/**
* Creates a new column builder expecting the specific datatype, with a specified capacity.
*
* <p>The {@code parseColumn} method will use {@code appendNoGrow} function, so the initial
* capacity should be set properly so that the builder can hold all expected elements.
*
* <p>The type returned from {@code parseSingleValue} should be consistent with the types that the
* builder returned here expects - it should never return a value that cannot be accepted by the
* builder.
*/
protected abstract Builder makeBuilderWithCapacity(long capacity);
@Override
public WithProblems<Storage> parseColumn(StringStorage sourceStorage) {
Builder builder = makeBuilderWithCapacity(sourceStorage.size());
var aggregator = new ProblemAggregator();
for (int i = 0; i < sourceStorage.size(); ++i) {
String cell = sourceStorage.getItem(i);
if (cell != null) {
Object parsed = parseSingleValue(cell, aggregator);
builder.appendNoGrow(parsed);
} else {
builder.appendNoGrow(null);
}
}
return new WithProblems<>(builder.seal(), aggregator.getAggregatedProblems());
}
}

View File

@ -2,9 +2,9 @@ package org.enso.table.parsing;
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.builder.object.NumericBuilder;
import org.enso.table.parsing.problems.NumericProblemAggregator;
import org.enso.table.parsing.problems.ProblemAggregator;
public class IntegerParser extends DatatypeParser<NumericProblemAggregator> {
public class IntegerParser extends IncrementalDatatypeParser {
private final String thousandsSeparator;
private final boolean leadingZerosAllowed;
@ -18,7 +18,7 @@ public class IntegerParser extends DatatypeParser<NumericProblemAggregator> {
}
@Override
public Object parseSingleValue(String text, NumericProblemAggregator problemAggregator) {
protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
if (thousandsSeparator != null
&& (text.startsWith(thousandsSeparator) || text.endsWith(thousandsSeparator))) {
problemAggregator.reportInvalidFormat(text);
@ -55,12 +55,7 @@ public class IntegerParser extends DatatypeParser<NumericProblemAggregator> {
}
@Override
public Builder makeBuilderWithCapacity(long capacity) {
protected Builder makeBuilderWithCapacity(long capacity) {
return NumericBuilder.createLongBuilder((int) capacity);
}
@Override
public NumericProblemAggregator makeProblemAggregator() {
return new NumericProblemAggregator();
}
}

View File

@ -0,0 +1,52 @@
package org.enso.table.parsing;
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.parsing.problems.ProblemAggregator;
import org.enso.table.read.WithProblems;
/**
* The type inferring parser tries to parse the given column using a set of provided parsers. It
* returns the result of the first parser that succeeds without reporting any problems.
*
* <p>If all parsers from the set reported problems, the fallback parser is used and its result is
* returned regardless of any problems.
*/
public class TypeInferringParser implements DatatypeParser {
private final IncrementalDatatypeParser[] baseParsers;
private final DatatypeParser fallbackParser;
public TypeInferringParser(
IncrementalDatatypeParser[] baseParsers, DatatypeParser fallbackParser) {
this.baseParsers = baseParsers;
this.fallbackParser = fallbackParser;
}
@Override
public WithProblems<Storage> parseColumn(StringStorage sourceStorage) {
parsers:
for (IncrementalDatatypeParser parser : baseParsers) {
Builder builder = parser.makeBuilderWithCapacity(sourceStorage.size());
var aggregator = new ProblemAggregator();
for (int i = 0; i < sourceStorage.size(); ++i) {
String cell = sourceStorage.getItem(i);
if (cell != null) {
Object parsed = parser.parseSingleValue(cell, aggregator);
if (aggregator.hasProblems()) {
continue parsers;
}
builder.appendNoGrow(parsed);
} else {
builder.appendNoGrow(null);
}
}
return new WithProblems<>(builder.seal(), aggregator.getAggregatedProblems());
}
return fallbackParser.parseColumn(sourceStorage);
}
}

View File

@ -3,26 +3,25 @@ package org.enso.table.parsing;
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.parsing.problems.ProblemAggregator;
public class WhitespaceStrippingParser<PA extends ProblemAggregator> extends DatatypeParser<PA> {
private final DatatypeParser<PA> innerParser;
/**
* An incremental parser which wraps another parser of that type, delegating the parsing logic to
* it, but first transforming the input text by stripping any leading and trailing whitespace.
*/
public class WhitespaceStrippingParser extends IncrementalDatatypeParser {
private final IncrementalDatatypeParser innerParser;
public WhitespaceStrippingParser(DatatypeParser<PA> innerParser) {
public WhitespaceStrippingParser(IncrementalDatatypeParser innerParser) {
this.innerParser = innerParser;
}
@Override
public Object parseSingleValue(String text, PA problemAggregator) {
protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
String stripped = text.strip();
return innerParser.parseSingleValue(stripped, problemAggregator);
}
@Override
public Builder makeBuilderWithCapacity(long capacity) {
protected Builder makeBuilderWithCapacity(long capacity) {
return innerParser.makeBuilderWithCapacity(capacity);
}
@Override
public PA makeProblemAggregator() {
return innerParser.makeProblemAggregator();
}
}

View File

@ -1,28 +0,0 @@
package org.enso.table.parsing.problems;
import java.util.ArrayList;
import java.util.List;
/**
* A base problem aggregator that allows reporting the most generic {@code InvalidFormat} problem.
*/
public class InvalidFormatProblemAggregator implements ProblemAggregator {
private final List<String> invalidFormatCells = new ArrayList<>();
/**
* Reports a cell with an invalid format.
*
* <p>The reports are aggregated and finally a single problem containing all invalid cell for the
* given column is reported.
*/
public void reportInvalidFormat(String cell) {
invalidFormatCells.add(cell);
}
@Override
public List<ParsingProblem> getAggregatedProblems() {
if (invalidFormatCells.isEmpty()) return List.of();
else return List.of(new InvalidFormat(invalidFormatCells));
}
}

View File

@ -1,27 +0,0 @@
package org.enso.table.parsing.problems;
import java.util.ArrayList;
import java.util.List;
/**
* A problem aggregator capable of reporting {@code InvalidFormat} and {@code LeadingZeros}
* problems.
*/
public class NumericProblemAggregator extends InvalidFormatProblemAggregator {
private final List<String> leadingZerosCells = new ArrayList<>();
public void reportLeadingZeroes(String cell) {
leadingZerosCells.add(cell);
}
@Override
public List<ParsingProblem> getAggregatedProblems() {
List<ParsingProblem> problems = new ArrayList<>(super.getAggregatedProblems());
if (!leadingZerosCells.isEmpty()) {
problems.add(new LeadingZeros(leadingZerosCells));
}
return problems;
}
}

View File

@ -1,14 +1,57 @@
package org.enso.table.parsing.problems;
import java.util.ArrayList;
import java.util.List;
/**
* A base class for strategies for aggregating problems.
* An aggregator for parsing problems.
*
* <p>Each strategy exposes a method that returns a summary of the problems. The particular methods
* for reporting each problem are defined in particular subclasses.
*/
public interface ProblemAggregator {
public class ProblemAggregator {
private final List<String> invalidFormatCells = new ArrayList<>();
private final List<String> leadingZerosCells = new ArrayList<>();
/**
* Reports a cell with an invalid format.
*
* <p>The reports are aggregated and finally a single problem containing all invalid cell for the
* given column is reported.
*/
public void reportInvalidFormat(String cell) {
invalidFormatCells.add(cell);
}
public void reportLeadingZeroes(String cell) {
leadingZerosCells.add(cell);
}
/**
* Checks if there are any problems already reported.
*
* <p>This method returns true if and only if {@code getAggregatedProblems} would return a
* non-empty list.
*/
public boolean hasProblems() {
return !invalidFormatCells.isEmpty() || !leadingZerosCells.isEmpty();
}
/** Return an aggregated summary of problems that have been reported. */
List<ParsingProblem> getAggregatedProblems();
public List<ParsingProblem> getAggregatedProblems() {
List<ParsingProblem> problems = new ArrayList<>();
if (!invalidFormatCells.isEmpty()) {
problems.add(new InvalidFormat(invalidFormatCells));
}
if (!leadingZerosCells.isEmpty()) {
problems.add(new LeadingZeros(leadingZerosCells));
}
assert problems.isEmpty() == !hasProblems();
return problems;
}
}

View File

@ -11,7 +11,7 @@ import Standard.Test.Problems
import Standard.Visualization
from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter
from Standard.Table.Data.Column_Type_Selection as Column_Type_Selection_Module import Column_Type_Selection
from Standard.Table.Data.Column_Type_Selection as Column_Type_Selection_Module import Column_Type_Selection, Auto
from Standard.Table.Error as Table_Errors import Invalid_Format, Leading_Zeros, Missing_Input_Columns, Column_Indexes_Out_Of_Range, Duplicate_Type_Selector
@ -55,14 +55,14 @@ spec = Test.group "Table.parse_values" <|
opts = Data_Formatter allow_leading_zeros=True
t1_parsed_zeros = [0, 0, 0, 1, -1, 1, 0, 10, 12345, Nothing]
t6 = t1.parse_values parser=opts column_types=[Column_Type_Selection 0 Integer]
t6 = t1.parse_values value_formatter=opts column_types=[Column_Type_Selection 0 Integer]
t6.at "ints" . to_vector . should_equal t1_parsed_zeros
Warning.get_all t6 . should_equal []
t7 = t1.parse_values parser=opts column_types=[Column_Type_Selection 0 Decimal]
t7 = t1.parse_values value_formatter=opts column_types=[Column_Type_Selection 0 Decimal]
t7.at "ints" . to_vector . should_equal t1_parsed_zeros
Warning.get_all t7 . should_equal []
t8 = t2.parse_values parser=opts column_types=[Column_Type_Selection 0 Decimal]
t8 = t2.parse_values value_formatter=opts column_types=[Column_Type_Selection 0 Decimal]
t8.at "floats" . to_vector . should_equal [0.0, 0.0, 0.0, 1.0, -10.0, 1.0]
Warning.get_all t8 . should_equal []
@ -158,9 +158,52 @@ spec = Test.group "Table.parse_values" <|
problems = [(Duplicate_Type_Selector "floats" ambiguous=True), (Duplicate_Type_Selector "bools" ambiguous=False)]
Problems.test_problem_handling action problems tester
Test.specify "should guess the datatype for columns" pending="TODO" <|
# TODO (next PR): ints, decimals, int+dec, just text, all dates, mixed dates, ints+text, ints in quotes
Error.throw "TODO"
Test.specify "should guess the datatype for columns" <|
c1 = ["ints", ["1", " +2", "-123", Nothing]]
c2 = ["ints0", ["01", "02 ", Nothing, "-1"]]
c3 = ["floats", [" 1.0 ", "2.2", Nothing, "-1.0"]]
c4 = ["bools", ["true", " False", Nothing, "True"]]
c5 = ["floats+ints", ["1", "2.2 ", "-1.0", Nothing]]
c6 = ["text", ["foobar", "foo", "", Nothing]]
c7 = ["dates", ["2022-10-01", " 2000-01-01", "1999-01-02", Nothing]]
c8 = ["datetimes", ["2022-10-01 01:02:03 ", "2000-01-01 01:02:03", "1999-01-02 01:02:03", Nothing]]
c9 = ["times", ["01:02:03", " 00:00:00 ", "01:02:03", Nothing]]
c10 = ["mixeddates", ["2022-10-01", "2000-01-01 01:02:03", "01:02:03", Nothing]]
c11 = ["text+ints", ["1", "2", " foobar", Nothing]]
t = Table.new [c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11]
t2 = t.parse_values
Warning.get_all t2 . should_equal []
t2.at "ints" . to_vector . should_equal [1, 2, -123, Nothing]
t2.at "ints" . to_vector . first . should_be_an Integer
t2.at "ints0" . to_vector . should_equal ["01", "02", Nothing, "-1"]
t2.at "floats" . to_vector . should_equal [1.0, 2.2, Nothing, -1.0]
t2.at "bools" . to_vector . should_equal [True, False, Nothing, True]
t2.at "floats+ints" . to_vector . should_equal [1.0, 2.2, -1.0, Nothing]
t2.at "text" . to_vector . should_equal ["foobar", "foo", "", Nothing]
t2.at "dates" . to_vector . map date_as_vector . should_equal [[2022, 10, 1], [2000, 1, 1], [1999, 1, 2], Nothing]
t2.at "datetimes" . to_vector . map datetime_as_vector . should_equal [[2022, 10, 1, 1, 2, 3, 0], [2000, 1, 1, 1, 2, 3, 0], [1999, 1, 2, 1, 2, 3, 0], Nothing]
t2.at "times" . to_vector . map time_as_vector . should_equal [[1, 2, 3, 0], [0, 0, 0, 0], [1, 2, 3, 0], Nothing]
t2.at "mixeddates" . to_vector . should_equal ["2022-10-01", "2000-01-01 01:02:03", "01:02:03", Nothing]
t2.at "text+ints" . to_vector . should_equal ["1", "2", "foobar", Nothing]
t3 = Table.new [["bools", ["1", "0", "True"]], ["ints", ["1", "0", "0"]]] . parse_values (Data_Formatter true_values=["1", "True"] false_values=["0", "False"])
t3.at "bools" . to_vector . should_equal [True, False, True]
t3.at "ints" . to_vector . should_equal [1, 0, 0]
t4 = Table.new [c2] . parse_values (Data_Formatter allow_leading_zeros=True)
t4 . at "ints0" . to_vector . should_equal [1, 2, Nothing, -1]
t5 = t.parse_values column_types=[Column_Type_Selection "ints" Decimal, Column_Type_Selection "floats" Auto, Column_Type_Selection "text+ints" Auto]
t5.at "ints" . to_vector . should_equal [1.0, 2.0, -123.0, Nothing]
# `ints` are requested to be parsed as decimals.
t5.at "ints" . to_vector . first . should_be_a Decimal
# `floats` are auto-detected as decimals.
t5.at "floats" . to_vector . should_equal [1.0, 2.2, Nothing, -1.0]
# `text+ints` is attempted to be parsed (hence whitespace is stripped), but it only fits the text type.
t5.at "text+ints" . to_vector . should_equal ["1", "2", "foobar", Nothing]
# `bools` are not requested to be parsed, so they are kept as-is, with leading whitespace etc.
t5.at "bools" . to_vector . should_equal ["true", " False", Nothing, "True"]
Test.specify "should allow to specify a thousands separator and a custom decimal point" <|
opts = Data_Formatter decimal_point=',' thousand_separator='_'
@ -234,8 +277,16 @@ spec = Test.group "Table.parse_values" <|
expected_warnings.append (Invalid_Format "times" Time_Of_Day.Time_Of_Day ["11:00:00 ", " 00:00:00", "00 : 00 : 00"])
warnings.should_contain_the_same_elements_as expected_warnings.to_vector
Test.specify "should fallback to text if whitespace is present and trimming is turned off" pending="TODO" <|
## TODO next PR
Error.throw "TODO"
Test.specify "should fallback to text if whitespace is present and trimming is turned off" <|
c1 = ["1", " +2", "-123", Nothing]
c2 = [" 1.0 ", "2.2", Nothing, "-1.0"]
c3 = ["true", " False", Nothing, "True"]
t = Table.new [["ints", c1], ["floats", c2], ["bools", c3]]
t2 = t.parse_values (Data_Formatter trim_values=False)
Warning.get_all t2 . should_equal []
t2.at "ints" . to_vector . should_equal c1
t2.at "floats" . to_vector . should_equal c2
t2.at "bools" . to_vector . should_equal c3
main = Test.Suite.run_main here.spec