mirror of
https://github.com/enso-org/enso.git
synced 2024-12-23 17:03:32 +03:00
Automatic inference of value types when parsing table columns (#3462)
Implements https://www.pivotaltracker.com/story/show/182199966
This commit is contained in:
parent
0073f461d9
commit
ff7700ebb1
@ -124,6 +124,7 @@
|
||||
specified type.][3455]
|
||||
- [Promote with, take, finalize to be methods of Managed_Resource
|
||||
instance][3460]
|
||||
- [Implemented automatic type detection for `Table.parse_values`.][3462]
|
||||
|
||||
[debug-shortcuts]:
|
||||
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
|
||||
@ -192,6 +193,7 @@
|
||||
[3457]: https://github.com/enso-org/enso/pull/3457
|
||||
[3455]: https://github.com/enso-org/enso/pull/3455
|
||||
[3460]: https://github.com/enso-org/enso/pull/3460
|
||||
[3462]: https://github.com/enso-org/enso/pull/3462
|
||||
|
||||
#### Enso Compiler
|
||||
|
||||
|
@ -670,10 +670,10 @@ type Table
|
||||
## Parsing values is not supported in database tables, the table has to be
|
||||
materialized first with `to_dataframe`.
|
||||
parse_values : Data_Formatter -> (Nothing | [Column_Type_Selection]) -> Problem_Behavior -> Table
|
||||
parse_values parser=Data_Formatter column_types=Nothing on_problems=Report_Warning =
|
||||
parse_values value_formatter=Data_Formatter column_types=Nothing on_problems=Report_Warning =
|
||||
## Avoid unused arguments warning. We cannot rename arguments to `_`,
|
||||
because we need to keep the API consistent with the in-memory table.
|
||||
_ = [parser, column_types, on_problems]
|
||||
_ = [value_formatter, column_types, on_problems]
|
||||
msg = "Parsing values is not supported in database tables, the table has to be materialized first with `to_dataframe`."
|
||||
Error.throw (Unsupported_Database_Operation_Error msg)
|
||||
|
||||
|
@ -1,5 +1,19 @@
|
||||
from Standard.Base import all
|
||||
|
||||
from Standard.Base.Data.Time.Date as Date_Module import Date
|
||||
from Standard.Base.Data.Time as Time_Module import Time
|
||||
from Standard.Base.Data.Time.Time_Of_Day as Time_Of_Day_Module import Time_Of_Day
|
||||
|
||||
polyglot java import org.enso.table.parsing.IntegerParser
|
||||
polyglot java import org.enso.table.parsing.DecimalParser
|
||||
polyglot java import org.enso.table.parsing.BooleanParser
|
||||
polyglot java import org.enso.table.parsing.DateParser
|
||||
polyglot java import org.enso.table.parsing.TimeParser
|
||||
polyglot java import org.enso.table.parsing.DateTimeParser
|
||||
polyglot java import org.enso.table.parsing.WhitespaceStrippingParser
|
||||
polyglot java import org.enso.table.parsing.IdentityParser
|
||||
polyglot java import org.enso.table.parsing.TypeInferringParser
|
||||
|
||||
## Specifies options for reading text data in a table to more specific types and
|
||||
serializing them back.
|
||||
|
||||
@ -27,3 +41,55 @@ from Standard.Base import all
|
||||
- true_values: Values representing True.
|
||||
- false_values: Values representing False.
|
||||
type Data_Formatter trim_values:Boolean=True allow_leading_zeros:Boolean=False decimal_point:Text='.' thousand_separator:Text='' datetime_formats:[Text]=["yyyy-MM-dd HH:mm:ss"] date_formats:[Text]=["yyyy-MM-dd"] time_formats:[Text]=["HH:mm:ss"] locale:Locale=Locale.default true_values:[Text]=["True","true","TRUE"] false_values:[Text]=["False","false","FALSE"]
|
||||
|
||||
## PRIVATE
   Returns the configured thousand separator, or `Nothing` when the
   `thousand_separator` setting is an empty text.
Data_Formatter.get_thousand_separator =
    separator = this.thousand_separator
    if separator.is_empty then Nothing else separator
|
||||
|
||||
## PRIVATE
   Wraps the given parser in a `WhitespaceStrippingParser` when `trim_values`
   is enabled; otherwise returns the parser unchanged.
Data_Formatter.wrap_base_parser base_parser =
    case this.trim_values of
        False -> base_parser
        True -> WhitespaceStrippingParser.new base_parser
|
||||
|
||||
## PRIVATE
   Builds an integer parser from the thousand separator and leading-zeros
   settings, wrapped according to `trim_values`.
Data_Formatter.make_integer_parser =
    base_parser = IntegerParser.new this.get_thousand_separator this.allow_leading_zeros
    this.wrap_base_parser base_parser
|
||||
|
||||
## PRIVATE
   Builds a decimal parser from the decimal point, thousand separator and
   leading-zeros settings, wrapped according to `trim_values`.
Data_Formatter.make_decimal_parser =
    base_parser = DecimalParser.new this.decimal_point this.get_thousand_separator this.allow_leading_zeros
    this.wrap_base_parser base_parser
|
||||
|
||||
## PRIVATE
   Builds a boolean parser recognising the configured `true_values` and
   `false_values`, wrapped according to `trim_values`.
Data_Formatter.make_boolean_parser =
    base_parser = BooleanParser.new this.true_values.to_array this.false_values.to_array
    this.wrap_base_parser base_parser
|
||||
|
||||
## PRIVATE
   Builds a date parser from the configured `date_formats` and locale,
   wrapped according to `trim_values`.
Data_Formatter.make_date_parser =
    base_parser = DateParser.new this.date_formats.to_array this.locale.java_locale
    this.wrap_base_parser base_parser
|
||||
|
||||
## PRIVATE
   Builds a parser that keeps the text values as they are, wrapped according
   to `trim_values`.
Data_Formatter.make_identity_parser =
    base_parser = IdentityParser.new
    this.wrap_base_parser base_parser
|
||||
|
||||
## PRIVATE
   Builds a date-time parser from the configured `datetime_formats` and
   locale, wrapped according to `trim_values`.
Data_Formatter.make_datetime_parser =
    base_parser = DateTimeParser.new this.datetime_formats.to_array this.locale.java_locale
    this.wrap_base_parser base_parser
|
||||
|
||||
## PRIVATE
   Builds a time-of-day parser from the configured `time_formats` and locale,
   wrapped according to `trim_values`.
Data_Formatter.make_time_parser =
    base_parser = TimeParser.new this.time_formats.to_array this.locale.java_locale
    this.wrap_base_parser base_parser
|
||||
|
||||
## PRIVATE
   Selects the parser matching the requested target datatype.

   Arguments:
   - datatype: The target type; one of `Integer`, `Decimal`, `Boolean`,
     `Date`, `Time` or `Time_Of_Day`.

   Any other datatype results in an `Illegal_Argument_Error`.
Data_Formatter.make_datatype_parser datatype = case datatype of
    Integer -> this.make_integer_parser
    Decimal -> this.make_decimal_parser
    Boolean -> this.make_boolean_parser
    _ ->
        # The date and time types are compared with `==` rather than matched
        # as patterns in the `case` above.
        if datatype == Date then this.make_date_parser else
            if datatype == Time then this.make_datetime_parser else
                if datatype == Time_Of_Day then this.make_time_parser else
                    message = "Unsupported datatype: " + datatype.to_text
                    Error.throw (Illegal_Argument_Error message)
|
||||
|
||||
## PRIVATE
   Builds a type-inferring parser for columns whose type is `Auto`.

   The candidate parsers are passed in the order integer, decimal, date-time,
   date, time, boolean; the identity parser is passed as the fallback.
Data_Formatter.make_auto_parser =
    candidate_parsers = [this.make_integer_parser, this.make_decimal_parser, this.make_datetime_parser, this.make_date_parser, this.make_time_parser, this.make_boolean_parser]
    TypeInferringParser.new candidate_parsers.to_array this.make_identity_parser
|
||||
|
@ -5,8 +5,6 @@ import Standard.Table.Data.Column
|
||||
import Standard.Table.Io.Csv
|
||||
import Standard.Visualization
|
||||
from Standard.Base.Data.Time.Date as Date_Module import Date
|
||||
from Standard.Base.Data.Time as Time_Module import Time
|
||||
from Standard.Base.Data.Time.Time_Of_Day as Time_Of_Day_Module import Time_Of_Day
|
||||
import Standard.Table.Io.Spreadsheet_Write_Mode
|
||||
import Standard.Table.Io.Format
|
||||
import Standard.Table.Internal.Table_Helpers
|
||||
@ -31,14 +29,6 @@ polyglot java import org.enso.table.operations.OrderBuilder
|
||||
polyglot java import org.enso.table.format.csv.Writer as Csv_Writer
|
||||
polyglot java import org.enso.table.format.xlsx.Writer as Spreadsheet_Writer
|
||||
|
||||
polyglot java import org.enso.table.parsing.IntegerParser
|
||||
polyglot java import org.enso.table.parsing.DecimalParser
|
||||
polyglot java import org.enso.table.parsing.BooleanParser
|
||||
polyglot java import org.enso.table.parsing.DateParser
|
||||
polyglot java import org.enso.table.parsing.TimeParser
|
||||
polyglot java import org.enso.table.parsing.DateTimeParser
|
||||
polyglot java import org.enso.table.parsing.WhitespaceStrippingParser
|
||||
|
||||
## Creates a new table from a vector of `[name, items]` pairs.
|
||||
|
||||
Arguments:
|
||||
@ -553,7 +543,7 @@ type Table
|
||||
a leading 0). However, settings in the `Data_Formatter` can
|
||||
control this.
|
||||
parse_values : Data_Formatter -> (Nothing | [Column_Type_Selection]) -> Problem_Behavior -> Table
|
||||
parse_values parser=Data_Formatter column_types=Nothing on_problems=Report_Warning =
|
||||
parse_values value_formatter=Data_Formatter column_types=Nothing on_problems=Report_Warning =
|
||||
columns = this.columns
|
||||
problem_builder = Vector.new_builder
|
||||
|
||||
@ -595,22 +585,9 @@ type Table
|
||||
|
||||
new_columns = columns.zip expected_types column-> expected_type-> case expected_type of
|
||||
Nothing -> column
|
||||
Auto -> Error.unimplemented "Automatic datatype inference is not implemented yet."
|
||||
_ ->
|
||||
parse_options = parser
|
||||
thousand_separator = if parse_options.thousand_separator.is_empty then Nothing else parse_options.thousand_separator
|
||||
base_parser = case expected_type of
|
||||
Integer -> IntegerParser.new thousand_separator parse_options.allow_leading_zeros
|
||||
Decimal -> DecimalParser.new parse_options.decimal_point thousand_separator parse_options.allow_leading_zeros
|
||||
Boolean -> BooleanParser.new parse_options.true_values.to_array parse_options.false_values.to_array
|
||||
_ ->
|
||||
if expected_type == Date then DateParser.new parse_options.date_formats.to_array parse_options.locale.java_locale else
|
||||
if expected_type == Time then DateTimeParser.new parse_options.datetime_formats.to_array parse_options.locale.java_locale else
|
||||
if expected_type == Time_Of_Day then TimeParser.new parse_options.time_formats.to_array parse_options.locale.java_locale else
|
||||
Error.throw (Illegal_Argument_Error "Unsupported target datatype: "+expected_type.to_text)
|
||||
parser = case parse_options.trim_values of
|
||||
False -> base_parser
|
||||
True -> WhitespaceStrippingParser.new base_parser
|
||||
parser = if expected_type == Auto then value_formatter.make_auto_parser else
|
||||
value_formatter.make_datatype_parser expected_type
|
||||
storage = column.java_column.getStorage
|
||||
new_storage_and_problems = parser.parseColumn storage
|
||||
new_storage = new_storage_and_problems.value
|
||||
|
@ -29,7 +29,7 @@ public class StringStorageBuilder extends StorageBuilder {
|
||||
|
||||
/** @inheritDoc */
|
||||
@Override
|
||||
public StorageBuilder parseAndAppend(String value) {
|
||||
public StringStorageBuilder parseAndAppend(String value) {
|
||||
ensureAppendable();
|
||||
data[size++] = value;
|
||||
return this;
|
||||
@ -45,7 +45,7 @@ public class StringStorageBuilder extends StorageBuilder {
|
||||
|
||||
/** @inheritDoc */
|
||||
@Override
|
||||
public Storage seal() {
|
||||
public StringStorage seal() {
|
||||
return new StringStorage(data, size);
|
||||
}
|
||||
}
|
||||
|
@ -5,9 +5,9 @@ import java.time.format.DateTimeParseException;
|
||||
import java.util.Locale;
|
||||
import org.enso.table.data.column.builder.object.Builder;
|
||||
import org.enso.table.data.column.builder.object.ObjectBuilder;
|
||||
import org.enso.table.parsing.problems.InvalidFormatProblemAggregator;
|
||||
import org.enso.table.parsing.problems.ProblemAggregator;
|
||||
|
||||
public abstract class BaseTimeParser extends DatatypeParser<InvalidFormatProblemAggregator> {
|
||||
public abstract class BaseTimeParser extends IncrementalDatatypeParser {
|
||||
protected interface ParseStrategy {
|
||||
Object parse(String text, DateTimeFormatter formatter) throws DateTimeParseException;
|
||||
}
|
||||
@ -25,7 +25,7 @@ public abstract class BaseTimeParser extends DatatypeParser<InvalidFormatProblem
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object parseSingleValue(String text, InvalidFormatProblemAggregator problemAggregator) {
|
||||
protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
|
||||
for (var formatter : formatters) {
|
||||
try {
|
||||
return parseStrategy.parse(text, formatter);
|
||||
@ -38,14 +38,9 @@ public abstract class BaseTimeParser extends DatatypeParser<InvalidFormatProblem
|
||||
}
|
||||
|
||||
@Override
|
||||
public Builder makeBuilderWithCapacity(long capacity) {
|
||||
protected Builder makeBuilderWithCapacity(long capacity) {
|
||||
// Once datetime gets first-class support in our dataframes, a more specific builder type should
|
||||
// be used.
|
||||
return new ObjectBuilder((int) capacity);
|
||||
}
|
||||
|
||||
@Override
|
||||
public InvalidFormatProblemAggregator makeProblemAggregator() {
|
||||
return new InvalidFormatProblemAggregator();
|
||||
}
|
||||
}
|
||||
|
@ -2,10 +2,10 @@ package org.enso.table.parsing;
|
||||
|
||||
import org.enso.table.data.column.builder.object.BoolBuilder;
|
||||
import org.enso.table.data.column.builder.object.Builder;
|
||||
import org.enso.table.parsing.problems.InvalidFormatProblemAggregator;
|
||||
import org.enso.table.parsing.problems.ProblemAggregator;
|
||||
import org.graalvm.collections.EconomicSet;
|
||||
|
||||
public class BooleanParser extends DatatypeParser<InvalidFormatProblemAggregator> {
|
||||
public class BooleanParser extends IncrementalDatatypeParser {
|
||||
|
||||
private final EconomicSet<String> trueValues;
|
||||
private final EconomicSet<String> falseValues;
|
||||
@ -22,7 +22,7 @@ public class BooleanParser extends DatatypeParser<InvalidFormatProblemAggregator
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object parseSingleValue(String text, InvalidFormatProblemAggregator problemAggregator) {
|
||||
protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
|
||||
// TODO we may want to use equality checks taking Unicode Normalization into account, to be
|
||||
// revised in: https://www.pivotaltracker.com/story/show/182166382
|
||||
if (trueValues.contains(text)) return true;
|
||||
@ -33,12 +33,7 @@ public class BooleanParser extends DatatypeParser<InvalidFormatProblemAggregator
|
||||
}
|
||||
|
||||
@Override
|
||||
public Builder makeBuilderWithCapacity(long capacity) {
|
||||
protected Builder makeBuilderWithCapacity(long capacity) {
|
||||
return new BoolBuilder((int) capacity);
|
||||
}
|
||||
|
||||
@Override
|
||||
public InvalidFormatProblemAggregator makeProblemAggregator() {
|
||||
return new InvalidFormatProblemAggregator();
|
||||
}
|
||||
}
|
||||
|
@ -1,67 +1,14 @@
|
||||
package org.enso.table.parsing;
|
||||
|
||||
import org.enso.table.data.column.builder.object.Builder;
|
||||
import org.enso.table.data.column.storage.Storage;
|
||||
import org.enso.table.data.column.storage.StringStorage;
|
||||
import org.enso.table.parsing.problems.ProblemAggregator;
|
||||
import org.enso.table.read.WithProblems;
|
||||
|
||||
/**
|
||||
* A base type for a datatype parsing strategy.
|
||||
*
|
||||
* <p>It specifies the strategy for parsing text cells into some target type, reporting issues and
|
||||
* building the resulting table column.
|
||||
*
|
||||
* @param <PA> the specific problem aggregator type; the type is refined to be able to handle
|
||||
* various strategies for aggregating problems, depending on the particular datatype
|
||||
*/
|
||||
public abstract class DatatypeParser<PA extends ProblemAggregator> {
|
||||
|
||||
/**
|
||||
* Parses a single cell.
|
||||
*
|
||||
* @param text the text contents to parse, it will never be null in the default implementation -
|
||||
* null values are just passed as-is without any parsing attempts by default
|
||||
* @param problemAggregator an instance of the problem aggregator, used for reporting parsing
|
||||
* problems
|
||||
* @return the parsed value or null if the value could not be parsed or could be parsed but should
|
||||
* be treated as missing value
|
||||
*/
|
||||
public abstract Object parseSingleValue(String text, PA problemAggregator);
|
||||
|
||||
/**
|
||||
* Creates a new column builder expecting the specific datatype, with a specified capacity.
|
||||
*
|
||||
* <p>The {@code parseColumn} method will use {@code appendNoGrow} function, so the initial
|
||||
* capacity should be set properly so that the builder can hold all expected elements.
|
||||
*
|
||||
* <p>The type returned from {@code parseSingleValue} should be consistent with the types that the
|
||||
* builder returned here expects - it should never return a value that cannot be accepted by the
|
||||
* builder.
|
||||
*/
|
||||
public abstract Builder makeBuilderWithCapacity(long capacity);
|
||||
|
||||
/** Creates a new instance of the specific problem aggregator type. */
|
||||
public abstract PA makeProblemAggregator();
|
||||
|
||||
/** A base type for a parser capable of parsing a column of text values into some other type. */
|
||||
public interface DatatypeParser {
|
||||
/**
|
||||
* Parses a column of texts (represented as a {@code StringStorage}) and returns a new storage,
|
||||
* containing the parsed elements.
|
||||
*/
|
||||
public WithProblems<Storage> parseColumn(StringStorage sourceStorage) {
|
||||
Builder builder = makeBuilderWithCapacity(sourceStorage.size());
|
||||
PA aggregator = makeProblemAggregator();
|
||||
|
||||
for (int i = 0; i < sourceStorage.size(); ++i) {
|
||||
String cell = sourceStorage.getItem(i);
|
||||
if (cell != null) {
|
||||
Object parsed = parseSingleValue(cell, aggregator);
|
||||
builder.appendNoGrow(parsed);
|
||||
} else {
|
||||
builder.appendNoGrow(null);
|
||||
}
|
||||
}
|
||||
|
||||
return new WithProblems<>(builder.seal(), aggregator.getAggregatedProblems());
|
||||
}
|
||||
WithProblems<Storage> parseColumn(StringStorage sourceStorage);
|
||||
}
|
||||
|
@ -4,9 +4,9 @@ import java.text.DecimalFormat;
|
||||
import java.text.ParsePosition;
|
||||
import org.enso.table.data.column.builder.object.Builder;
|
||||
import org.enso.table.data.column.builder.object.NumericBuilder;
|
||||
import org.enso.table.parsing.problems.NumericProblemAggregator;
|
||||
import org.enso.table.parsing.problems.ProblemAggregator;
|
||||
|
||||
public class DecimalParser extends DatatypeParser<NumericProblemAggregator> {
|
||||
public class DecimalParser extends IncrementalDatatypeParser {
|
||||
private final String thousandsSeparator;
|
||||
private final char decimalPoint;
|
||||
private final DecimalFormat decimalFormat;
|
||||
@ -38,7 +38,7 @@ public class DecimalParser extends DatatypeParser<NumericProblemAggregator> {
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object parseSingleValue(String text, NumericProblemAggregator problemAggregator) {
|
||||
protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
|
||||
if (thousandsSeparator != null
|
||||
&& (text.startsWith(thousandsSeparator) || text.endsWith(thousandsSeparator))) {
|
||||
problemAggregator.reportInvalidFormat(text);
|
||||
@ -84,12 +84,7 @@ public class DecimalParser extends DatatypeParser<NumericProblemAggregator> {
|
||||
}
|
||||
|
||||
@Override
|
||||
public Builder makeBuilderWithCapacity(long capacity) {
|
||||
protected Builder makeBuilderWithCapacity(long capacity) {
|
||||
return NumericBuilder.createDoubleBuilder((int) capacity);
|
||||
}
|
||||
|
||||
@Override
|
||||
public NumericProblemAggregator makeProblemAggregator() {
|
||||
return new NumericProblemAggregator();
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,27 @@
|
||||
package org.enso.table.parsing;
|
||||
|
||||
import java.util.List;
|
||||
import org.enso.table.data.column.builder.object.StringBuilder;
|
||||
import org.enso.table.data.column.storage.Storage;
|
||||
import org.enso.table.data.column.storage.StringStorage;
|
||||
import org.enso.table.parsing.problems.ProblemAggregator;
|
||||
import org.enso.table.read.WithProblems;
|
||||
|
||||
/** A parser that just returns its input. Useful as a fallback. */
|
||||
public class IdentityParser extends IncrementalDatatypeParser {
|
||||
|
||||
@Override
|
||||
public Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
|
||||
return text;
|
||||
}
|
||||
|
||||
@Override
|
||||
public StringBuilder makeBuilderWithCapacity(long capacity) {
|
||||
return new StringBuilder((int) capacity);
|
||||
}
|
||||
|
||||
@Override
|
||||
public WithProblems<Storage> parseColumn(StringStorage sourceStorage) {
|
||||
return new WithProblems<>(sourceStorage, List.of());
|
||||
}
|
||||
}
|
@ -0,0 +1,58 @@
|
||||
package org.enso.table.parsing;
|
||||
|
||||
import org.enso.table.data.column.builder.object.Builder;
|
||||
import org.enso.table.data.column.storage.Storage;
|
||||
import org.enso.table.data.column.storage.StringStorage;
|
||||
import org.enso.table.parsing.problems.ProblemAggregator;
|
||||
import org.enso.table.read.WithProblems;
|
||||
|
||||
/**
|
||||
* A base type for a datatype parsing strategy which relies on a method parsing a single value.
|
||||
*
|
||||
* <p>It specifies the strategy for parsing text cells into some target type, reporting issues and
|
||||
* building the resulting table column.
|
||||
*/
|
||||
public abstract class IncrementalDatatypeParser implements DatatypeParser {
|
||||
|
||||
/**
|
||||
* Parses a single cell.
|
||||
*
|
||||
* @param text the text contents to parse, it will never be null in the default implementation -
|
||||
* null values are just passed as-is without any parsing attempts by default
|
||||
* @param problemAggregator an instance of the problem aggregator, used for reporting parsing
|
||||
* problems
|
||||
* @return the parsed value or null if the value could not be parsed or could be parsed but should
|
||||
* be treated as missing value
|
||||
*/
|
||||
protected abstract Object parseSingleValue(String text, ProblemAggregator problemAggregator);
|
||||
|
||||
/**
|
||||
* Creates a new column builder expecting the specific datatype, with a specified capacity.
|
||||
*
|
||||
* <p>The {@code parseColumn} method will use {@code appendNoGrow} function, so the initial
|
||||
* capacity should be set properly so that the builder can hold all expected elements.
|
||||
*
|
||||
* <p>The type returned from {@code parseSingleValue} should be consistent with the types that the
|
||||
* builder returned here expects - it should never return a value that cannot be accepted by the
|
||||
* builder.
|
||||
*/
|
||||
protected abstract Builder makeBuilderWithCapacity(long capacity);
|
||||
|
||||
@Override
|
||||
public WithProblems<Storage> parseColumn(StringStorage sourceStorage) {
|
||||
Builder builder = makeBuilderWithCapacity(sourceStorage.size());
|
||||
var aggregator = new ProblemAggregator();
|
||||
|
||||
for (int i = 0; i < sourceStorage.size(); ++i) {
|
||||
String cell = sourceStorage.getItem(i);
|
||||
if (cell != null) {
|
||||
Object parsed = parseSingleValue(cell, aggregator);
|
||||
builder.appendNoGrow(parsed);
|
||||
} else {
|
||||
builder.appendNoGrow(null);
|
||||
}
|
||||
}
|
||||
|
||||
return new WithProblems<>(builder.seal(), aggregator.getAggregatedProblems());
|
||||
}
|
||||
}
|
@ -2,9 +2,9 @@ package org.enso.table.parsing;
|
||||
|
||||
import org.enso.table.data.column.builder.object.Builder;
|
||||
import org.enso.table.data.column.builder.object.NumericBuilder;
|
||||
import org.enso.table.parsing.problems.NumericProblemAggregator;
|
||||
import org.enso.table.parsing.problems.ProblemAggregator;
|
||||
|
||||
public class IntegerParser extends DatatypeParser<NumericProblemAggregator> {
|
||||
public class IntegerParser extends IncrementalDatatypeParser {
|
||||
private final String thousandsSeparator;
|
||||
private final boolean leadingZerosAllowed;
|
||||
|
||||
@ -18,7 +18,7 @@ public class IntegerParser extends DatatypeParser<NumericProblemAggregator> {
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object parseSingleValue(String text, NumericProblemAggregator problemAggregator) {
|
||||
protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
|
||||
if (thousandsSeparator != null
|
||||
&& (text.startsWith(thousandsSeparator) || text.endsWith(thousandsSeparator))) {
|
||||
problemAggregator.reportInvalidFormat(text);
|
||||
@ -55,12 +55,7 @@ public class IntegerParser extends DatatypeParser<NumericProblemAggregator> {
|
||||
}
|
||||
|
||||
@Override
|
||||
public Builder makeBuilderWithCapacity(long capacity) {
|
||||
protected Builder makeBuilderWithCapacity(long capacity) {
|
||||
return NumericBuilder.createLongBuilder((int) capacity);
|
||||
}
|
||||
|
||||
@Override
|
||||
public NumericProblemAggregator makeProblemAggregator() {
|
||||
return new NumericProblemAggregator();
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,52 @@
|
||||
package org.enso.table.parsing;
|
||||
|
||||
import org.enso.table.data.column.builder.object.Builder;
|
||||
import org.enso.table.data.column.storage.Storage;
|
||||
import org.enso.table.data.column.storage.StringStorage;
|
||||
import org.enso.table.parsing.problems.ProblemAggregator;
|
||||
import org.enso.table.read.WithProblems;
|
||||
|
||||
/**
|
||||
* The type inferring parser tries to parse the given column using a set of provided parsers. It
|
||||
* returns the result of the first parser that succeeds without reporting any problems.
|
||||
*
|
||||
* <p>If all parsers from the set reported problems, the fallback parser is used and its result is
|
||||
* returned regardless of any problems.
|
||||
*/
|
||||
public class TypeInferringParser implements DatatypeParser {
|
||||
|
||||
private final IncrementalDatatypeParser[] baseParsers;
|
||||
private final DatatypeParser fallbackParser;
|
||||
|
||||
public TypeInferringParser(
|
||||
IncrementalDatatypeParser[] baseParsers, DatatypeParser fallbackParser) {
|
||||
this.baseParsers = baseParsers;
|
||||
this.fallbackParser = fallbackParser;
|
||||
}
|
||||
|
||||
@Override
|
||||
public WithProblems<Storage> parseColumn(StringStorage sourceStorage) {
|
||||
parsers:
|
||||
for (IncrementalDatatypeParser parser : baseParsers) {
|
||||
Builder builder = parser.makeBuilderWithCapacity(sourceStorage.size());
|
||||
var aggregator = new ProblemAggregator();
|
||||
|
||||
for (int i = 0; i < sourceStorage.size(); ++i) {
|
||||
String cell = sourceStorage.getItem(i);
|
||||
if (cell != null) {
|
||||
Object parsed = parser.parseSingleValue(cell, aggregator);
|
||||
if (aggregator.hasProblems()) {
|
||||
continue parsers;
|
||||
}
|
||||
builder.appendNoGrow(parsed);
|
||||
} else {
|
||||
builder.appendNoGrow(null);
|
||||
}
|
||||
}
|
||||
|
||||
return new WithProblems<>(builder.seal(), aggregator.getAggregatedProblems());
|
||||
}
|
||||
|
||||
return fallbackParser.parseColumn(sourceStorage);
|
||||
}
|
||||
}
|
@ -3,26 +3,25 @@ package org.enso.table.parsing;
|
||||
import org.enso.table.data.column.builder.object.Builder;
|
||||
import org.enso.table.parsing.problems.ProblemAggregator;
|
||||
|
||||
public class WhitespaceStrippingParser<PA extends ProblemAggregator> extends DatatypeParser<PA> {
|
||||
private final DatatypeParser<PA> innerParser;
|
||||
/**
|
||||
* An incremental parser which wraps another parser of that type, delegating the parsing logic to
|
||||
* it, but first transforming the input text by stripping any leading and trailing whitespace.
|
||||
*/
|
||||
public class WhitespaceStrippingParser extends IncrementalDatatypeParser {
|
||||
private final IncrementalDatatypeParser innerParser;
|
||||
|
||||
public WhitespaceStrippingParser(DatatypeParser<PA> innerParser) {
|
||||
public WhitespaceStrippingParser(IncrementalDatatypeParser innerParser) {
|
||||
this.innerParser = innerParser;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object parseSingleValue(String text, PA problemAggregator) {
|
||||
protected Object parseSingleValue(String text, ProblemAggregator problemAggregator) {
|
||||
String stripped = text.strip();
|
||||
return innerParser.parseSingleValue(stripped, problemAggregator);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Builder makeBuilderWithCapacity(long capacity) {
|
||||
protected Builder makeBuilderWithCapacity(long capacity) {
|
||||
return innerParser.makeBuilderWithCapacity(capacity);
|
||||
}
|
||||
|
||||
@Override
|
||||
public PA makeProblemAggregator() {
|
||||
return innerParser.makeProblemAggregator();
|
||||
}
|
||||
}
|
||||
|
@ -1,28 +0,0 @@
|
||||
package org.enso.table.parsing.problems;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A base problem aggregator that allows reporting the most generic {@code InvalidFormat} problem.
|
||||
*/
|
||||
public class InvalidFormatProblemAggregator implements ProblemAggregator {
|
||||
|
||||
private final List<String> invalidFormatCells = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* Reports a cell with an invalid format.
|
||||
*
|
||||
* <p>The reports are aggregated and finally a single problem containing all invalid cell for the
|
||||
* given column is reported.
|
||||
*/
|
||||
public void reportInvalidFormat(String cell) {
|
||||
invalidFormatCells.add(cell);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<ParsingProblem> getAggregatedProblems() {
|
||||
if (invalidFormatCells.isEmpty()) return List.of();
|
||||
else return List.of(new InvalidFormat(invalidFormatCells));
|
||||
}
|
||||
}
|
@ -1,27 +0,0 @@
|
||||
package org.enso.table.parsing.problems;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A problem aggregator capable of reporting {@code InvalidFormat} and {@code LeadingZeros}
|
||||
* problems.
|
||||
*/
|
||||
public class NumericProblemAggregator extends InvalidFormatProblemAggregator {
|
||||
private final List<String> leadingZerosCells = new ArrayList<>();
|
||||
|
||||
public void reportLeadingZeroes(String cell) {
|
||||
leadingZerosCells.add(cell);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<ParsingProblem> getAggregatedProblems() {
|
||||
List<ParsingProblem> problems = new ArrayList<>(super.getAggregatedProblems());
|
||||
|
||||
if (!leadingZerosCells.isEmpty()) {
|
||||
problems.add(new LeadingZeros(leadingZerosCells));
|
||||
}
|
||||
|
||||
return problems;
|
||||
}
|
||||
}
|
@ -1,14 +1,57 @@
|
||||
package org.enso.table.parsing.problems;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A base class for strategies for aggregating problems.
|
||||
* An aggregator for parsing problems.
|
||||
*
|
||||
* <p>Each strategy exposes a method that returns a summary of the problems. The particular methods
|
||||
* for reporting each problem are defined in particular subclasses.
|
||||
*/
|
||||
public interface ProblemAggregator {
|
||||
public class ProblemAggregator {
|
||||
|
||||
private final List<String> invalidFormatCells = new ArrayList<>();
|
||||
private final List<String> leadingZerosCells = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* Reports a cell with an invalid format.
|
||||
*
|
||||
* <p>The reports are aggregated and finally a single problem containing all invalid cells for the
|
||||
* given column is reported.
|
||||
*/
|
||||
public void reportInvalidFormat(String cell) {
|
||||
invalidFormatCells.add(cell);
|
||||
}
|
||||
|
||||
public void reportLeadingZeroes(String cell) {
|
||||
leadingZerosCells.add(cell);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if there are any problems already reported.
|
||||
*
|
||||
* <p>This method returns true if and only if {@code getAggregatedProblems} would return a
|
||||
* non-empty list.
|
||||
*/
|
||||
public boolean hasProblems() {
|
||||
return !invalidFormatCells.isEmpty() || !leadingZerosCells.isEmpty();
|
||||
}
|
||||
|
||||
/** Return an aggregated summary of problems that have been reported. */
|
||||
List<ParsingProblem> getAggregatedProblems();
|
||||
public List<ParsingProblem> getAggregatedProblems() {
|
||||
List<ParsingProblem> problems = new ArrayList<>();
|
||||
|
||||
if (!invalidFormatCells.isEmpty()) {
|
||||
problems.add(new InvalidFormat(invalidFormatCells));
|
||||
}
|
||||
|
||||
if (!leadingZerosCells.isEmpty()) {
|
||||
problems.add(new LeadingZeros(leadingZerosCells));
|
||||
}
|
||||
|
||||
assert problems.isEmpty() == !hasProblems();
|
||||
|
||||
return problems;
|
||||
}
|
||||
}
|
||||
|
@ -11,7 +11,7 @@ import Standard.Test.Problems
|
||||
import Standard.Visualization
|
||||
|
||||
from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter
|
||||
from Standard.Table.Data.Column_Type_Selection as Column_Type_Selection_Module import Column_Type_Selection
|
||||
from Standard.Table.Data.Column_Type_Selection as Column_Type_Selection_Module import Column_Type_Selection, Auto
|
||||
|
||||
from Standard.Table.Error as Table_Errors import Invalid_Format, Leading_Zeros, Missing_Input_Columns, Column_Indexes_Out_Of_Range, Duplicate_Type_Selector
|
||||
|
||||
@ -55,14 +55,14 @@ spec = Test.group "Table.parse_values" <|
|
||||
|
||||
opts = Data_Formatter allow_leading_zeros=True
|
||||
t1_parsed_zeros = [0, 0, 0, 1, -1, 1, 0, 10, 12345, Nothing]
|
||||
t6 = t1.parse_values parser=opts column_types=[Column_Type_Selection 0 Integer]
|
||||
t6 = t1.parse_values value_formatter=opts column_types=[Column_Type_Selection 0 Integer]
|
||||
t6.at "ints" . to_vector . should_equal t1_parsed_zeros
|
||||
Warning.get_all t6 . should_equal []
|
||||
|
||||
t7 = t1.parse_values parser=opts column_types=[Column_Type_Selection 0 Decimal]
|
||||
t7 = t1.parse_values value_formatter=opts column_types=[Column_Type_Selection 0 Decimal]
|
||||
t7.at "ints" . to_vector . should_equal t1_parsed_zeros
|
||||
Warning.get_all t7 . should_equal []
|
||||
t8 = t2.parse_values parser=opts column_types=[Column_Type_Selection 0 Decimal]
|
||||
t8 = t2.parse_values value_formatter=opts column_types=[Column_Type_Selection 0 Decimal]
|
||||
t8.at "floats" . to_vector . should_equal [0.0, 0.0, 0.0, 1.0, -10.0, 1.0]
|
||||
Warning.get_all t8 . should_equal []
|
||||
|
||||
@ -158,9 +158,52 @@ spec = Test.group "Table.parse_values" <|
|
||||
problems = [(Duplicate_Type_Selector "floats" ambiguous=True), (Duplicate_Type_Selector "bools" ambiguous=False)]
|
||||
Problems.test_problem_handling action problems tester
|
||||
|
||||
Test.specify "should guess the datatype for columns" pending="TODO" <|
|
||||
# TODO (next PR): ints, decimals, int+dec, just text, all dates, mixed dates, ints+text, ints in quotes
|
||||
Error.throw "TODO"
|
||||
Test.specify "should guess the datatype for columns" <|
|
||||
c1 = ["ints", ["1", " +2", "-123", Nothing]]
|
||||
c2 = ["ints0", ["01", "02 ", Nothing, "-1"]]
|
||||
c3 = ["floats", [" 1.0 ", "2.2", Nothing, "-1.0"]]
|
||||
c4 = ["bools", ["true", " False", Nothing, "True"]]
|
||||
c5 = ["floats+ints", ["1", "2.2 ", "-1.0", Nothing]]
|
||||
c6 = ["text", ["foobar", "foo", "", Nothing]]
|
||||
c7 = ["dates", ["2022-10-01", " 2000-01-01", "1999-01-02", Nothing]]
|
||||
c8 = ["datetimes", ["2022-10-01 01:02:03 ", "2000-01-01 01:02:03", "1999-01-02 01:02:03", Nothing]]
|
||||
c9 = ["times", ["01:02:03", " 00:00:00 ", "01:02:03", Nothing]]
|
||||
c10 = ["mixeddates", ["2022-10-01", "2000-01-01 01:02:03", "01:02:03", Nothing]]
|
||||
c11 = ["text+ints", ["1", "2", " foobar", Nothing]]
|
||||
t = Table.new [c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11]
|
||||
t2 = t.parse_values
|
||||
|
||||
Warning.get_all t2 . should_equal []
|
||||
t2.at "ints" . to_vector . should_equal [1, 2, -123, Nothing]
|
||||
t2.at "ints" . to_vector . first . should_be_an Integer
|
||||
t2.at "ints0" . to_vector . should_equal ["01", "02", Nothing, "-1"]
|
||||
t2.at "floats" . to_vector . should_equal [1.0, 2.2, Nothing, -1.0]
|
||||
t2.at "bools" . to_vector . should_equal [True, False, Nothing, True]
|
||||
t2.at "floats+ints" . to_vector . should_equal [1.0, 2.2, -1.0, Nothing]
|
||||
t2.at "text" . to_vector . should_equal ["foobar", "foo", "", Nothing]
|
||||
t2.at "dates" . to_vector . map date_as_vector . should_equal [[2022, 10, 1], [2000, 1, 1], [1999, 1, 2], Nothing]
|
||||
t2.at "datetimes" . to_vector . map datetime_as_vector . should_equal [[2022, 10, 1, 1, 2, 3, 0], [2000, 1, 1, 1, 2, 3, 0], [1999, 1, 2, 1, 2, 3, 0], Nothing]
|
||||
t2.at "times" . to_vector . map time_as_vector . should_equal [[1, 2, 3, 0], [0, 0, 0, 0], [1, 2, 3, 0], Nothing]
|
||||
t2.at "mixeddates" . to_vector . should_equal ["2022-10-01", "2000-01-01 01:02:03", "01:02:03", Nothing]
|
||||
t2.at "text+ints" . to_vector . should_equal ["1", "2", "foobar", Nothing]
|
||||
|
||||
t3 = Table.new [["bools", ["1", "0", "True"]], ["ints", ["1", "0", "0"]]] . parse_values (Data_Formatter true_values=["1", "True"] false_values=["0", "False"])
|
||||
t3.at "bools" . to_vector . should_equal [True, False, True]
|
||||
t3.at "ints" . to_vector . should_equal [1, 0, 0]
|
||||
|
||||
t4 = Table.new [c2] . parse_values (Data_Formatter allow_leading_zeros=True)
|
||||
t4 . at "ints0" . to_vector . should_equal [1, 2, Nothing, -1]
|
||||
|
||||
t5 = t.parse_values column_types=[Column_Type_Selection "ints" Decimal, Column_Type_Selection "floats" Auto, Column_Type_Selection "text+ints" Auto]
|
||||
t5.at "ints" . to_vector . should_equal [1.0, 2.0, -123.0, Nothing]
|
||||
# `ints` are requested to be parsed as decimals.
|
||||
t5.at "ints" . to_vector . first . should_be_a Decimal
|
||||
# `floats` are auto-detected as decimals.
|
||||
t5.at "floats" . to_vector . should_equal [1.0, 2.2, Nothing, -1.0]
|
||||
# `text+ints` is attempted to be parsed (hence whitespace is stripped), but it only fits the text type.
|
||||
t5.at "text+ints" . to_vector . should_equal ["1", "2", "foobar", Nothing]
|
||||
# `bools` are not requested to be parsed, so they are kept as-is, with leading whitespace etc.
|
||||
t5.at "bools" . to_vector . should_equal ["true", " False", Nothing, "True"]
|
||||
|
||||
Test.specify "should allow to specify a thousands separator and a custom decimal point" <|
|
||||
opts = Data_Formatter decimal_point=',' thousand_separator='_'
|
||||
@ -234,8 +277,16 @@ spec = Test.group "Table.parse_values" <|
|
||||
expected_warnings.append (Invalid_Format "times" Time_Of_Day.Time_Of_Day ["11:00:00 ", " 00:00:00", "00 : 00 : 00"])
|
||||
warnings.should_contain_the_same_elements_as expected_warnings.to_vector
|
||||
|
||||
Test.specify "should fallback to text if whitespace is present and trimming is turned off" pending="TODO" <|
|
||||
## TODO next PR
|
||||
Error.throw "TODO"
|
||||
Test.specify "should fallback to text if whitespace is present and trimming is turned off" <|
|
||||
c1 = ["1", " +2", "-123", Nothing]
|
||||
c2 = [" 1.0 ", "2.2", Nothing, "-1.0"]
|
||||
c3 = ["true", " False", Nothing, "True"]
|
||||
t = Table.new [["ints", c1], ["floats", c2], ["bools", c3]]
|
||||
t2 = t.parse_values (Data_Formatter trim_values=False)
|
||||
|
||||
Warning.get_all t2 . should_equal []
|
||||
t2.at "ints" . to_vector . should_equal c1
|
||||
t2.at "floats" . to_vector . should_equal c2
|
||||
t2.at "bools" . to_vector . should_equal c3
|
||||
|
||||
main = Test.Suite.run_main here.spec
|
||||
|
Loading…
Reference in New Issue
Block a user