mirror of
https://github.com/enso-org/enso.git
synced 2024-12-23 08:21:49 +03:00
Fix NumberParser
to avoid thousandSeparator==decimalPoint
and prefer US decimal format (#7946)
Closes #7930
This commit is contained in:
parent
2d39e644b8
commit
08cd449a99
@ -97,8 +97,9 @@ type Data_Formatter
|
||||
- thousand_separator: A separator that can be used to separate groups of digits in numbers.
|
||||
- allow_leading_zeros: Specifies if values starting with leading zeroes should be treated as numbers.
|
||||
- allow_exponential_notation: Allow parsing of exponential notation format.
|
||||
with_number_formatting : Text -> Text -> Boolean -> Boolean -> Data_Formatter
|
||||
with_number_formatting : Text -> Text -> Boolean -> Boolean -> Data_Formatter ! Illegal_Argument
|
||||
with_number_formatting self (decimal_point=self.decimal_point) (thousand_separator=self.thousand_separator) (allow_leading_zeros=self.allow_leading_zeros) (allow_exponential_notation=self.allow_exponential_notation) =
|
||||
if decimal_point == thousand_separator then Error.throw (Illegal_Argument.Error "The decimal point and thousand separator must be different.") else
|
||||
self.clone decimal_point=decimal_point thousand_separator=thousand_separator allow_leading_zeros=allow_leading_zeros allow_exponential_notation=allow_exponential_notation
|
||||
|
||||
## Specify values for Date/Time parsing.
|
||||
@ -187,14 +188,16 @@ type Data_Formatter
|
||||
|
||||
## PRIVATE
|
||||
make_integer_parser self auto_mode=False target_type=Value_Type.Integer =
|
||||
separator = if self.thousand_separator.is_empty then Nothing else self.thousand_separator
|
||||
thousand_separator = if self.thousand_separator.is_empty then Nothing else self.thousand_separator
|
||||
decimal_point = if self.decimal_point == Auto then Nothing else self.decimal_point
|
||||
storage_type = Storage.from_value_type_strict target_type
|
||||
NumberParser.createIntegerParser storage_type auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values separator
|
||||
NumberParser.createIntegerParser storage_type auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values decimal_point thousand_separator
|
||||
|
||||
## PRIVATE
|
||||
make_decimal_parser self auto_mode=False =
|
||||
if self.decimal_point == Auto then NumberParser.createAutoDecimalParser auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values self.allow_exponential_notation else
|
||||
NumberParser.createFixedDecimalParser auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values self.allow_exponential_notation self.thousand_separator self.decimal_point
|
||||
decimal_point = if self.decimal_point == Auto then Nothing else self.decimal_point
|
||||
thousand_separator = if self.thousand_separator.is_empty then Nothing else self.thousand_separator
|
||||
NumberParser.createDecimalParser auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values self.allow_exponential_notation decimal_point thousand_separator
|
||||
|
||||
## PRIVATE
|
||||
make_boolean_parser self = self.wrap_base_parser <|
|
||||
@ -219,7 +222,7 @@ type Data_Formatter
|
||||
make_identity_parser self = self.wrap_base_parser IdentityParser.new
|
||||
|
||||
## PRIVATE
|
||||
make_value_type_parser self value_type = case value_type of
|
||||
make_value_type_parser self value_type = Illegal_Argument.handle_java_exception <| case value_type of
|
||||
Value_Type.Integer _ ->
|
||||
self.make_integer_parser target_type=value_type
|
||||
# TODO once we implement #6109 we can support 32-bit floats
|
||||
@ -234,7 +237,30 @@ type Data_Formatter
|
||||
|
||||
## PRIVATE
|
||||
get_specific_type_parsers self =
|
||||
[self.make_integer_parser True, self.make_decimal_parser True, self.make_date_time_parser, self.make_date_parser, self.make_time_of_day_parser, self.make_boolean_parser]
|
||||
try_us_parsers_first = self.decimal_point == Auto && self.thousand_separator != "."
|
||||
preferred_auto_parsers = case try_us_parsers_first of
|
||||
## If we are in auto mode, we will first try parsing US integers,
|
||||
then US floats and only then other integers and floats.
|
||||
|
||||
Under normal circumstances, we first try integers and later
|
||||
floats - but this would cause `1.000` to be interpreted as `1000`
|
||||
because _all_ integers take precedence and floats are considered
|
||||
later. But we want `1.000` to be interpreted as a `1.0` float by
|
||||
default, so we change the ordering a bit.
|
||||
True ->
|
||||
us_preferred = self.with_number_formatting decimal_point='.'
|
||||
[us_preferred.make_integer_parser auto_mode=True, us_preferred.make_decimal_parser auto_mode=True]
|
||||
|
||||
## However, if the `decimal_point` is set to something else,
|
||||
we don't do auto inference, so this extra logic is not needed.
|
||||
False -> []
|
||||
remaining_parsers = [self.make_integer_parser auto_mode=True, self.make_decimal_parser auto_mode=True, self.make_date_time_parser, self.make_date_parser, self.make_time_of_day_parser, self.make_boolean_parser]
|
||||
parsers = preferred_auto_parsers + remaining_parsers
|
||||
## Unfortunately, the [] literal allows to create a vector containing
|
||||
dataflow errors. That is not handled well later by Polyglot. So we
|
||||
ensure all errors surface here.
|
||||
parsers_with_propagated_error = parsers.map x->x
|
||||
parsers_with_propagated_error
|
||||
|
||||
## PRIVATE
|
||||
make_auto_parser self =
|
||||
|
@ -10,8 +10,11 @@ import org.enso.table.problems.AggregatedProblems;
|
||||
import org.enso.table.problems.WithAggregatedProblems;
|
||||
import org.graalvm.polyglot.Context;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/** A parser for numbers.
|
||||
@ -43,22 +46,78 @@ public class NumberParser extends IncrementalDatatypeParser {
|
||||
private final static String CCY = "(?<ccy>[^0-9(),. '+-]+)";
|
||||
private final static String EXP = "(?<exp>[eE][+-]?\\d+)?";
|
||||
private final static String SPACE = "\\s*";
|
||||
private final static String[] SEPARATORS = new String[] {",.", ".,", " ,", "',"};
|
||||
|
||||
private record Separators(String thousand, String decimal) {}
|
||||
|
||||
private final Separators[] SEPARATORS;
|
||||
|
||||
private final static Map<String, Pattern> PATTERNS = new HashMap<>();
|
||||
private final IntegerType integerTargetType;
|
||||
|
||||
private static Pattern getPattern(boolean allowDecimal, boolean allowCurrency, boolean allowScientific, boolean trimValues, int index) {
|
||||
int allowedSet = (allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS);
|
||||
int separatorsIndex = index / allowedSet;
|
||||
int patternIndex = index % allowedSet;
|
||||
private static void validateSeparator(String name, String value) {
|
||||
if (value == null) return;
|
||||
|
||||
if (separatorsIndex >= SEPARATORS.length) {
|
||||
return null;
|
||||
if (value.length() != 1) {
|
||||
throw new IllegalArgumentException(name + " must be a single character, but it was '" + value + "'.");
|
||||
}
|
||||
|
||||
var separators = SEPARATORS[separatorsIndex];
|
||||
return getPattern(allowDecimal, allowCurrency, allowScientific, trimValues, patternIndex, separators);
|
||||
// If we allowed separators to be a digit, super crazy stuff could happen - e.g. technically 10000 could be interpreted as 1000 by interpreting the first 0 as a thousand separator. Let's not do that.
|
||||
if (Character.isDigit(value.charAt(0))) {
|
||||
throw new IllegalArgumentException(name + " cannot be a digit, but it was '" + value + "'.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a list of possible separator pairs.
|
||||
* <p>
|
||||
* If one of the parameters is null, it is meant to be inferred (multiple separator pairs will be provided for
|
||||
* it), if
|
||||
* it is set to a concrete value, it will be fixed.
|
||||
*/
|
||||
private static Separators[] buildSeparators(boolean allowDecimal, String decimalPoint, String thousandSeparator) {
|
||||
validateSeparator("Decimal point", decimalPoint);
|
||||
validateSeparator("Thousand separator", thousandSeparator);
|
||||
if (decimalPoint != null && decimalPoint.equals(thousandSeparator)) {
|
||||
throw new IllegalArgumentException("Decimal point and thousand separator cannot be the same, but they were both '" + decimalPoint + "'.");
|
||||
}
|
||||
|
||||
boolean fullAutomaticMode = allowDecimal && decimalPoint == null && thousandSeparator == null;
|
||||
if (fullAutomaticMode) {
|
||||
return new Separators[] {
|
||||
new Separators(",", "."),
|
||||
new Separators(".", ","),
|
||||
new Separators(" ", ","),
|
||||
new Separators("'", ","),
|
||||
};
|
||||
}
|
||||
|
||||
List<String> thousandSeparators;
|
||||
if (thousandSeparator == null) {
|
||||
List<String> autoThousandSeparators = List.of(",", ".", "'", " ");
|
||||
thousandSeparators = autoThousandSeparators.stream().filter(sep -> !sep.equals(decimalPoint)).toList();
|
||||
} else {
|
||||
thousandSeparators = List.of(thousandSeparator);
|
||||
}
|
||||
|
||||
List<String> decimalPoints;
|
||||
if (decimalPoint == null) {
|
||||
if (allowDecimal) {
|
||||
List<String> autoDecimalPoints = List.of(",", ".");
|
||||
assert thousandSeparator != null;
|
||||
decimalPoints = autoDecimalPoints.stream().filter(sep -> !sep.equals(thousandSeparator)).toList();
|
||||
} else {
|
||||
// List.of(null) is not permitted...
|
||||
decimalPoints = new ArrayList<>();
|
||||
decimalPoints.add(null);
|
||||
}
|
||||
} else {
|
||||
decimalPoints = List.of(decimalPoint);
|
||||
}
|
||||
|
||||
return thousandSeparators.stream()
|
||||
.flatMap(thousand -> decimalPoints.stream().map(decimal -> new Separators(thousand, decimal)))
|
||||
.toArray(Separators[]::new);
|
||||
|
||||
}
|
||||
|
||||
/** The number of patterns that are allowed for non-currency numbers. */
|
||||
@ -67,7 +126,7 @@ public class NumberParser extends IncrementalDatatypeParser {
|
||||
/** The number of patterns that are allowed for currency numbers. */
|
||||
private static final int ALLOWED_CCY_PATTERNS = 6;
|
||||
|
||||
private static Pattern getPattern(boolean allowDecimal, boolean allowCurrency, boolean allowScientific, boolean trimValues, int patternIndex, String separators) {
|
||||
private static Pattern buildPattern(boolean allowDecimal, boolean allowCurrency, boolean allowScientific, boolean trimValues, int patternIndex, Separators separators) {
|
||||
if (allowScientific && !allowDecimal) {
|
||||
throw new IllegalArgumentException("Scientific notation requires decimal numbers.");
|
||||
}
|
||||
@ -76,9 +135,9 @@ public class NumberParser extends IncrementalDatatypeParser {
|
||||
return null;
|
||||
}
|
||||
|
||||
var INTEGER = "(?<integer>(\\d*)" + (separators.length() == 1 ? "" : "|(\\d{1,3}([" + separators.charAt(0) + "]\\d{3})*)") + ")";
|
||||
String INTEGER = "(?<integer>(\\d*)" + (separators.thousand == null ? "" : "|(\\d{1,3}([" + separators.thousand + "]\\d{3})*)") + ")";
|
||||
|
||||
var decimalPoint = (separators.length() == 1 ? separators : separators.charAt(1));
|
||||
String decimalPoint = allowDecimal ? Objects.requireNonNull(separators.decimal) : null;
|
||||
var NUMBER = INTEGER + (allowDecimal ? "(?<decimal>[" + decimalPoint + "]\\d*)?" : "") + (allowScientific ? EXP : "");
|
||||
|
||||
var pattern = switch (patternIndex) {
|
||||
@ -103,7 +162,6 @@ public class NumberParser extends IncrementalDatatypeParser {
|
||||
private final boolean allowLeadingZeros;
|
||||
private final boolean allowScientific;
|
||||
private final boolean trimValues;
|
||||
private final String separators;
|
||||
|
||||
/**
|
||||
* Creates a new integer instance of this parser.
|
||||
@ -112,11 +170,15 @@ public class NumberParser extends IncrementalDatatypeParser {
|
||||
* @param allowCurrency whether to allow currency symbols
|
||||
* @param allowLeadingZeros whether to allow leading zeros
|
||||
* @param trimValues whether to trim the input values
|
||||
* @param thousandSeparator the thousand separator to use
|
||||
* @param decimalPoint the decimal point set for the current format, or null if not specified; this parser does
|
||||
* not use decimal point (since it is for integers) but it ensure that if a decimal point is chosen, the inferred
|
||||
* thousand separator will not clash with that specific decimal point
|
||||
* @param thousandSeparator the thousand separator to use (if null then will be inferred)
|
||||
*/
|
||||
public static NumberParser createIntegerParser(IntegerType integerTargetType, boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, String thousandSeparator) {
|
||||
var separator = thousandSeparator == null ? null : (thousandSeparator + '_');
|
||||
return new NumberParser(false, integerTargetType, allowCurrency, allowLeadingZeros, trimValues, false, separator);
|
||||
public static NumberParser createIntegerParser(IntegerType integerTargetType, boolean allowCurrency,
|
||||
boolean allowLeadingZeros, boolean trimValues,
|
||||
String decimalPoint, String thousandSeparator) {
|
||||
return new NumberParser(false, integerTargetType, allowCurrency, allowLeadingZeros, trimValues, false, decimalPoint, thousandSeparator);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -126,50 +188,39 @@ public class NumberParser extends IncrementalDatatypeParser {
|
||||
* @param allowLeadingZeros whether to allow leading zeros
|
||||
* @param trimValues whether to trim the input values
|
||||
* @param allowScientific whether to allow scientific notation
|
||||
* @param decimalPoint the decimal separator to use (if null, then will be inferred)
|
||||
* @param thousandSeparator the thousand separator to use (if null, then will be inferred)
|
||||
*/
|
||||
public static NumberParser createAutoDecimalParser(boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific) {
|
||||
return new NumberParser(true, null, allowCurrency, allowLeadingZeros, trimValues, allowScientific, null);
|
||||
public static NumberParser createDecimalParser(boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String decimalPoint, String thousandSeparator) {
|
||||
return new NumberParser(true, null, allowCurrency, allowLeadingZeros, trimValues, allowScientific, decimalPoint, thousandSeparator);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new decimal instance of this parser.
|
||||
*
|
||||
* @param allowCurrency whether to allow currency symbols
|
||||
* @param allowLeadingZeros whether to allow leading zeros
|
||||
* @param trimValues whether to trim the input values
|
||||
* @param allowScientific whether to allow scientific notation
|
||||
* @param thousandSeparator the thousand separator to use
|
||||
* @param decimalSeparator the decimal separator to use
|
||||
*/
|
||||
public static NumberParser createFixedDecimalParser(boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String thousandSeparator, String decimalSeparator) {
|
||||
if (decimalSeparator == null || decimalSeparator.length() != 1) {
|
||||
throw new IllegalArgumentException("Decimal separator must be a single character.");
|
||||
}
|
||||
|
||||
thousandSeparator = thousandSeparator == null ? "" : thousandSeparator;
|
||||
return new NumberParser(true, null, allowCurrency, allowLeadingZeros, trimValues, allowScientific, thousandSeparator + decimalSeparator);
|
||||
}
|
||||
|
||||
private NumberParser(boolean allowDecimal, IntegerType integerTargetType, boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String separators) {
|
||||
private NumberParser(boolean allowDecimal, IntegerType integerTargetType, boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String decimalPoint, String thousandSeparator) {
|
||||
this.allowDecimal = allowDecimal;
|
||||
this.integerTargetType = integerTargetType;
|
||||
this.allowCurrency = allowCurrency;
|
||||
this.allowLeadingZeros = allowLeadingZeros;
|
||||
this.trimValues = trimValues;
|
||||
this.allowScientific = allowScientific;
|
||||
this.separators = separators;
|
||||
SEPARATORS = buildSeparators(allowDecimal, decimalPoint, thousandSeparator);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a Pattern for the given index.
|
||||
* The index will be decoded into a specific set of separators (unless fixed
|
||||
* separators are used) and then paired with on of the valid patterns for
|
||||
* separators are used) and then paired with one of the valid patterns for
|
||||
* the given parser.
|
||||
*/
|
||||
private Pattern patternForIndex(int index) {
|
||||
return separators == null
|
||||
? getPattern(allowDecimal, allowCurrency, allowScientific, trimValues, index)
|
||||
: getPattern(allowDecimal, allowCurrency, allowScientific, trimValues, index, separators);
|
||||
int allowedSet = (allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS);
|
||||
int separatorsIndex = index / allowedSet;
|
||||
int patternIndex = index % allowedSet;
|
||||
|
||||
if (separatorsIndex >= SEPARATORS.length) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return buildPattern(allowDecimal, allowCurrency, allowScientific, trimValues, patternIndex, SEPARATORS[separatorsIndex]);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -47,6 +47,20 @@ spec =
|
||||
formatter.parse "-Infinity" . should_equal (Number.negative_infinity)
|
||||
formatter.parse "NaN" . is_nan . should_be_true
|
||||
|
||||
Test.specify "should prefer the US decimal point in auto mode" <|
|
||||
formatter = Data_Formatter.Value
|
||||
formatter.parse "1.5" . should_equal 1.5
|
||||
formatter.parse "1.25" . should_equal 1.25
|
||||
formatter.parse "1.125" . should_equal 1.125
|
||||
formatter.parse "1,125" . should_equal 1125
|
||||
formatter.parse "1,000,000" . should_equal 1000000
|
||||
formatter.parse "1,000.5" . should_equal 1000.5
|
||||
|
||||
# But if it doesn't fit the US number, other formats should work as well:
|
||||
formatter.parse "1.000,25" . should_equal 1000.25
|
||||
formatter.parse "1'000" . should_equal 1000
|
||||
formatter.parse "1.000.000" . should_equal 1000000
|
||||
|
||||
Test.specify "should allow customizing the decimal point and thousand separator" <|
|
||||
formatter = Data_Formatter.Value thousand_separator="_" decimal_point=","
|
||||
formatter.parse "123" . should_equal 123
|
||||
@ -59,6 +73,36 @@ spec =
|
||||
formatter.parse "-1,0" . should_equal -1.0
|
||||
formatter.parse "1,0001" . should_equal 1.0001
|
||||
|
||||
Test.specify "should never infer thousand separator to be equal to decimal point" <|
|
||||
f1 = Data_Formatter.Value decimal_point="."
|
||||
f1.parse "1.0" . should_equal 1.0
|
||||
f1.parse "1.000" . should_equal 1.0
|
||||
# This only makes sense as thousand separator but because of the decimal point clashing, it cannot be parsed and stays as text:
|
||||
f1.parse "1.000.000" . should_equal "1.000.000"
|
||||
f1.parse "1,000,000" . should_equal 1000000
|
||||
|
||||
f2 = Data_Formatter.Value decimal_point=","
|
||||
f2.parse "1,0" . should_equal 1.0
|
||||
f2.parse "1,000" . should_equal 1.0
|
||||
f2.parse "1,000,000" . should_equal "1,000,000"
|
||||
f2.parse "1.0" . should_equal "1.0"
|
||||
f2.parse "1.000" . should_equal 1000
|
||||
f2.parse "1.000.000" . should_equal 1000000
|
||||
|
||||
f3 = Data_Formatter.Value thousand_separator="."
|
||||
f3.parse "1.000" . should_equal 1000
|
||||
f3.parse "1.0" . should_equal "1.0"
|
||||
|
||||
f4 = Data_Formatter.Value thousand_separator=","
|
||||
f4.parse "1,000" . should_equal 1000
|
||||
f4.parse "1,0" . should_equal "1,0"
|
||||
|
||||
r5 = Data_Formatter.Value . with_number_formatting decimal_point="." thousand_separator="."
|
||||
r5.should_fail_with Illegal_Argument
|
||||
|
||||
r6 = Data_Formatter.Value decimal_point="." thousand_separator="."
|
||||
r6.parse "1.000" . should_fail_with Illegal_Argument
|
||||
|
||||
Test.specify "should support exponential notation, but only if explicitly enabled" <|
|
||||
plain_formatter = Data_Formatter.Value
|
||||
exponential_formatter = Data_Formatter.Value allow_exponential_notation=True
|
||||
|
Loading…
Reference in New Issue
Block a user