Fix NumberParser to avoid thousandSeparator==decimalPoint and prefer US decimal format (#7946)

Closes #7930
This commit is contained in:
Radosław Waśko 2023-10-03 20:07:54 +02:00 committed by GitHub
parent 2d39e644b8
commit 08cd449a99
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 181 additions and 60 deletions

View File

@ -97,8 +97,9 @@ type Data_Formatter
- thousand_separator: A separator that can be used to separate groups of digits in numbers.
- allow_leading_zeros: Specifies if values starting with leading zeroes should be treated as numbers.
- allow_exponential_notation: Allow parsing of exponential notation format.
with_number_formatting : Text -> Text -> Boolean -> Boolean -> Data_Formatter
with_number_formatting : Text -> Text -> Boolean -> Boolean -> Data_Formatter ! Illegal_Argument
with_number_formatting self (decimal_point=self.decimal_point) (thousand_separator=self.thousand_separator) (allow_leading_zeros=self.allow_leading_zeros) (allow_exponential_notation=self.allow_exponential_notation) =
if decimal_point == thousand_separator then Error.throw (Illegal_Argument.Error "The decimal point and thousand separator must be different.") else
self.clone decimal_point=decimal_point thousand_separator=thousand_separator allow_leading_zeros=allow_leading_zeros allow_exponential_notation=allow_exponential_notation
## Specify values for Date/Time parsing.
@ -187,14 +188,16 @@ type Data_Formatter
## PRIVATE
make_integer_parser self auto_mode=False target_type=Value_Type.Integer =
separator = if self.thousand_separator.is_empty then Nothing else self.thousand_separator
thousand_separator = if self.thousand_separator.is_empty then Nothing else self.thousand_separator
decimal_point = if self.decimal_point == Auto then Nothing else self.decimal_point
storage_type = Storage.from_value_type_strict target_type
NumberParser.createIntegerParser storage_type auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values separator
NumberParser.createIntegerParser storage_type auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values decimal_point thousand_separator
## PRIVATE
make_decimal_parser self auto_mode=False =
if self.decimal_point == Auto then NumberParser.createAutoDecimalParser auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values self.allow_exponential_notation else
NumberParser.createFixedDecimalParser auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values self.allow_exponential_notation self.thousand_separator self.decimal_point
decimal_point = if self.decimal_point == Auto then Nothing else self.decimal_point
thousand_separator = if self.thousand_separator.is_empty then Nothing else self.thousand_separator
NumberParser.createDecimalParser auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values self.allow_exponential_notation decimal_point thousand_separator
## PRIVATE
make_boolean_parser self = self.wrap_base_parser <|
@ -219,7 +222,7 @@ type Data_Formatter
make_identity_parser self = self.wrap_base_parser IdentityParser.new
## PRIVATE
make_value_type_parser self value_type = case value_type of
make_value_type_parser self value_type = Illegal_Argument.handle_java_exception <| case value_type of
Value_Type.Integer _ ->
self.make_integer_parser target_type=value_type
# TODO once we implement #6109 we can support 32-bit floats
@ -234,7 +237,30 @@ type Data_Formatter
## PRIVATE
get_specific_type_parsers self =
[self.make_integer_parser True, self.make_decimal_parser True, self.make_date_time_parser, self.make_date_parser, self.make_time_of_day_parser, self.make_boolean_parser]
try_us_parsers_first = self.decimal_point == Auto && self.thousand_separator != "."
preferred_auto_parsers = case try_us_parsers_first of
## If we are in auto mode, we will first try parsing US integers,
then US floats and only then other integers and floats.
Under normal circumstances, we first try integers and later
floats - but this would cause `1.000` to be interpreted as `1000`
because _all_ integers take precedence and floats are considered
later. But we want `1.000` to be interpreted as a `1.0` float by
default, so we change the ordering a bit.
True ->
us_preferred = self.with_number_formatting decimal_point='.'
[us_preferred.make_integer_parser auto_mode=True, us_preferred.make_decimal_parser auto_mode=True]
## However, if the `decimal_point` is set to something else,
we don't do auto inference, so this extra logic is not needed.
False -> []
remaining_parsers = [self.make_integer_parser auto_mode=True, self.make_decimal_parser auto_mode=True, self.make_date_time_parser, self.make_date_parser, self.make_time_of_day_parser, self.make_boolean_parser]
parsers = preferred_auto_parsers + remaining_parsers
## Unfortunately, the [] literal allows to create a vector containing
dataflow errors. That is not handled well later by Polyglot. So we
ensure all errors surface here.
parsers_with_propagated_error = parsers.map x->x
parsers_with_propagated_error
## PRIVATE
make_auto_parser self =

View File

@ -10,8 +10,11 @@ import org.enso.table.problems.AggregatedProblems;
import org.enso.table.problems.WithAggregatedProblems;
import org.graalvm.polyglot.Context;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Pattern;
/** A parser for numbers.
@ -43,22 +46,78 @@ public class NumberParser extends IncrementalDatatypeParser {
private final static String CCY = "(?<ccy>[^0-9(),. '+-]+)";
private final static String EXP = "(?<exp>[eE][+-]?\\d+)?";
private final static String SPACE = "\\s*";
private final static String[] SEPARATORS = new String[] {",.", ".,", " ,", "',"};
private record Separators(String thousand, String decimal) {}
private final Separators[] SEPARATORS;
private final static Map<String, Pattern> PATTERNS = new HashMap<>();
private final IntegerType integerTargetType;
private static Pattern getPattern(boolean allowDecimal, boolean allowCurrency, boolean allowScientific, boolean trimValues, int index) {
int allowedSet = (allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS);
int separatorsIndex = index / allowedSet;
int patternIndex = index % allowedSet;
private static void validateSeparator(String name, String value) {
if (value == null) return;
if (separatorsIndex >= SEPARATORS.length) {
return null;
if (value.length() != 1) {
throw new IllegalArgumentException(name + " must be a single character, but it was '" + value + "'.");
}
var separators = SEPARATORS[separatorsIndex];
return getPattern(allowDecimal, allowCurrency, allowScientific, trimValues, patternIndex, separators);
// If we allowed separators to be a digit, super crazy stuff could happen - e.g. technically 10000 could be interpreted as 1000 by interpreting the first 0 as a thousand separator. Let's not do that.
if (Character.isDigit(value.charAt(0))) {
throw new IllegalArgumentException(name + " cannot be a digit, but it was '" + value + "'.");
}
}
/**
* Builds a list of possible separator pairs.
* <p>
* If one of the parameters is null, it is meant to be inferred (multiple separator pairs will be provided for
* it), if
* it is set to a concrete value, it will be fixed.
*/
private static Separators[] buildSeparators(boolean allowDecimal, String decimalPoint, String thousandSeparator) {
validateSeparator("Decimal point", decimalPoint);
validateSeparator("Thousand separator", thousandSeparator);
if (decimalPoint != null && decimalPoint.equals(thousandSeparator)) {
throw new IllegalArgumentException("Decimal point and thousand separator cannot be the same, but they were both '" + decimalPoint + "'.");
}
boolean fullAutomaticMode = allowDecimal && decimalPoint == null && thousandSeparator == null;
if (fullAutomaticMode) {
return new Separators[] {
new Separators(",", "."),
new Separators(".", ","),
new Separators(" ", ","),
new Separators("'", ","),
};
}
List<String> thousandSeparators;
if (thousandSeparator == null) {
List<String> autoThousandSeparators = List.of(",", ".", "'", " ");
thousandSeparators = autoThousandSeparators.stream().filter(sep -> !sep.equals(decimalPoint)).toList();
} else {
thousandSeparators = List.of(thousandSeparator);
}
List<String> decimalPoints;
if (decimalPoint == null) {
if (allowDecimal) {
List<String> autoDecimalPoints = List.of(",", ".");
assert thousandSeparator != null;
decimalPoints = autoDecimalPoints.stream().filter(sep -> !sep.equals(thousandSeparator)).toList();
} else {
// List.of(null) is not permitted...
decimalPoints = new ArrayList<>();
decimalPoints.add(null);
}
} else {
decimalPoints = List.of(decimalPoint);
}
return thousandSeparators.stream()
.flatMap(thousand -> decimalPoints.stream().map(decimal -> new Separators(thousand, decimal)))
.toArray(Separators[]::new);
}
/** The number of patterns that are allowed for non-currency numbers. */
@ -67,7 +126,7 @@ public class NumberParser extends IncrementalDatatypeParser {
/** The number of patterns that are allowed for currency numbers. */
private static final int ALLOWED_CCY_PATTERNS = 6;
private static Pattern getPattern(boolean allowDecimal, boolean allowCurrency, boolean allowScientific, boolean trimValues, int patternIndex, String separators) {
private static Pattern buildPattern(boolean allowDecimal, boolean allowCurrency, boolean allowScientific, boolean trimValues, int patternIndex, Separators separators) {
if (allowScientific && !allowDecimal) {
throw new IllegalArgumentException("Scientific notation requires decimal numbers.");
}
@ -76,9 +135,9 @@ public class NumberParser extends IncrementalDatatypeParser {
return null;
}
var INTEGER = "(?<integer>(\\d*)" + (separators.length() == 1 ? "" : "|(\\d{1,3}([" + separators.charAt(0) + "]\\d{3})*)") + ")";
String INTEGER = "(?<integer>(\\d*)" + (separators.thousand == null ? "" : "|(\\d{1,3}([" + separators.thousand + "]\\d{3})*)") + ")";
var decimalPoint = (separators.length() == 1 ? separators : separators.charAt(1));
String decimalPoint = allowDecimal ? Objects.requireNonNull(separators.decimal) : null;
var NUMBER = INTEGER + (allowDecimal ? "(?<decimal>[" + decimalPoint + "]\\d*)?" : "") + (allowScientific ? EXP : "");
var pattern = switch (patternIndex) {
@ -103,7 +162,6 @@ public class NumberParser extends IncrementalDatatypeParser {
private final boolean allowLeadingZeros;
private final boolean allowScientific;
private final boolean trimValues;
private final String separators;
/**
* Creates a new integer instance of this parser.
@ -112,11 +170,15 @@ public class NumberParser extends IncrementalDatatypeParser {
* @param allowCurrency whether to allow currency symbols
* @param allowLeadingZeros whether to allow leading zeros
* @param trimValues whether to trim the input values
* @param thousandSeparator the thousand separator to use
* @param decimalPoint the decimal point set for the current format, or null if not specified; this parser does
* not use decimal point (since it is for integers) but it ensure that if a decimal point is chosen, the inferred
* thousand separator will not clash with that specific decimal point
* @param thousandSeparator the thousand separator to use (if null then will be inferred)
*/
public static NumberParser createIntegerParser(IntegerType integerTargetType, boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, String thousandSeparator) {
var separator = thousandSeparator == null ? null : (thousandSeparator + '_');
return new NumberParser(false, integerTargetType, allowCurrency, allowLeadingZeros, trimValues, false, separator);
public static NumberParser createIntegerParser(IntegerType integerTargetType, boolean allowCurrency,
boolean allowLeadingZeros, boolean trimValues,
String decimalPoint, String thousandSeparator) {
return new NumberParser(false, integerTargetType, allowCurrency, allowLeadingZeros, trimValues, false, decimalPoint, thousandSeparator);
}
/**
@ -126,50 +188,39 @@ public class NumberParser extends IncrementalDatatypeParser {
* @param allowLeadingZeros whether to allow leading zeros
* @param trimValues whether to trim the input values
* @param allowScientific whether to allow scientific notation
* @param decimalPoint the decimal separator to use (if null, then will be inferred)
* @param thousandSeparator the thousand separator to use (if null, then will be inferred)
*/
public static NumberParser createAutoDecimalParser(boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific) {
return new NumberParser(true, null, allowCurrency, allowLeadingZeros, trimValues, allowScientific, null);
public static NumberParser createDecimalParser(boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String decimalPoint, String thousandSeparator) {
return new NumberParser(true, null, allowCurrency, allowLeadingZeros, trimValues, allowScientific, decimalPoint, thousandSeparator);
}
/**
* Creates a new decimal instance of this parser.
*
* @param allowCurrency whether to allow currency symbols
* @param allowLeadingZeros whether to allow leading zeros
* @param trimValues whether to trim the input values
* @param allowScientific whether to allow scientific notation
* @param thousandSeparator the thousand separator to use
* @param decimalSeparator the decimal separator to use
*/
public static NumberParser createFixedDecimalParser(boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String thousandSeparator, String decimalSeparator) {
if (decimalSeparator == null || decimalSeparator.length() != 1) {
throw new IllegalArgumentException("Decimal separator must be a single character.");
}
thousandSeparator = thousandSeparator == null ? "" : thousandSeparator;
return new NumberParser(true, null, allowCurrency, allowLeadingZeros, trimValues, allowScientific, thousandSeparator + decimalSeparator);
}
private NumberParser(boolean allowDecimal, IntegerType integerTargetType, boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String separators) {
private NumberParser(boolean allowDecimal, IntegerType integerTargetType, boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String decimalPoint, String thousandSeparator) {
this.allowDecimal = allowDecimal;
this.integerTargetType = integerTargetType;
this.allowCurrency = allowCurrency;
this.allowLeadingZeros = allowLeadingZeros;
this.trimValues = trimValues;
this.allowScientific = allowScientific;
this.separators = separators;
SEPARATORS = buildSeparators(allowDecimal, decimalPoint, thousandSeparator);
}
/**
* Creates a Pattern for the given index.
* The index will be decoded into a specific set of separators (unless fixed
* separators are used) and then paired with on of the valid patterns for
* separators are used) and then paired with one of the valid patterns for
* the given parser.
*/
private Pattern patternForIndex(int index) {
return separators == null
? getPattern(allowDecimal, allowCurrency, allowScientific, trimValues, index)
: getPattern(allowDecimal, allowCurrency, allowScientific, trimValues, index, separators);
int allowedSet = (allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS);
int separatorsIndex = index / allowedSet;
int patternIndex = index % allowedSet;
if (separatorsIndex >= SEPARATORS.length) {
return null;
}
return buildPattern(allowDecimal, allowCurrency, allowScientific, trimValues, patternIndex, SEPARATORS[separatorsIndex]);
}
@Override

View File

@ -47,6 +47,20 @@ spec =
formatter.parse "-Infinity" . should_equal (Number.negative_infinity)
formatter.parse "NaN" . is_nan . should_be_true
Test.specify "should prefer the US decimal point in auto mode" <|
formatter = Data_Formatter.Value
formatter.parse "1.5" . should_equal 1.5
formatter.parse "1.25" . should_equal 1.25
formatter.parse "1.125" . should_equal 1.125
formatter.parse "1,125" . should_equal 1125
formatter.parse "1,000,000" . should_equal 1000000
formatter.parse "1,000.5" . should_equal 1000.5
# But if it doesn't fit the US number, other formats should work as well:
formatter.parse "1.000,25" . should_equal 1000.25
formatter.parse "1'000" . should_equal 1000
formatter.parse "1.000.000" . should_equal 1000000
Test.specify "should allow customizing the decimal point and thousand separator" <|
formatter = Data_Formatter.Value thousand_separator="_" decimal_point=","
formatter.parse "123" . should_equal 123
@ -59,6 +73,36 @@ spec =
formatter.parse "-1,0" . should_equal -1.0
formatter.parse "1,0001" . should_equal 1.0001
Test.specify "should never infer thousand separator to be equal to decimal point" <|
f1 = Data_Formatter.Value decimal_point="."
f1.parse "1.0" . should_equal 1.0
f1.parse "1.000" . should_equal 1.0
# This only makes sense as thousand separator but because of the decimal point clashing, it cannot be parsed and stays as text:
f1.parse "1.000.000" . should_equal "1.000.000"
f1.parse "1,000,000" . should_equal 1000000
f2 = Data_Formatter.Value decimal_point=","
f2.parse "1,0" . should_equal 1.0
f2.parse "1,000" . should_equal 1.0
f2.parse "1,000,000" . should_equal "1,000,000"
f2.parse "1.0" . should_equal "1.0"
f2.parse "1.000" . should_equal 1000
f2.parse "1.000.000" . should_equal 1000000
f3 = Data_Formatter.Value thousand_separator="."
f3.parse "1.000" . should_equal 1000
f3.parse "1.0" . should_equal "1.0"
f4 = Data_Formatter.Value thousand_separator=","
f4.parse "1,000" . should_equal 1000
f4.parse "1,0" . should_equal "1,0"
r5 = Data_Formatter.Value . with_number_formatting decimal_point="." thousand_separator="."
r5.should_fail_with Illegal_Argument
r6 = Data_Formatter.Value decimal_point="." thousand_separator="."
r6.parse "1.000" . should_fail_with Illegal_Argument
Test.specify "should support exponential notation, but only if explicitly enabled" <|
plain_formatter = Data_Formatter.Value
exponential_formatter = Data_Formatter.Value allow_exponential_notation=True