From 08cd449a995473923ce09e4a852ddcda361963a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 3 Oct 2023 20:07:54 +0200 Subject: [PATCH] Fix `NumberParser` to avoid `thousandSeparator==decimalPoint` and prefer US decimal format (#7946) Closes #7930 --- .../0.0.0-dev/src/Data/Data_Formatter.enso | 42 ++++- .../org/enso/table/parsing/NumberParser.java | 155 ++++++++++++------ .../src/Formatting/Data_Formatter_Spec.enso | 44 +++++ 3 files changed, 181 insertions(+), 60 deletions(-) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso index c365ac1ffdc..315c940b217 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso @@ -97,9 +97,10 @@ type Data_Formatter - thousand_separator: A separator that can be used to separate groups of digits in numbers. - allow_leading_zeros: Specifies if values starting with leading zeroes should be treated as numbers. - allow_exponential_notation: Allow parsing of exponential notation format. - with_number_formatting : Text -> Text -> Boolean -> Boolean -> Data_Formatter + with_number_formatting : Text -> Text -> Boolean -> Boolean -> Data_Formatter ! Illegal_Argument with_number_formatting self (decimal_point=self.decimal_point) (thousand_separator=self.thousand_separator) (allow_leading_zeros=self.allow_leading_zeros) (allow_exponential_notation=self.allow_exponential_notation) = - self.clone decimal_point=decimal_point thousand_separator=thousand_separator allow_leading_zeros=allow_leading_zeros allow_exponential_notation=allow_exponential_notation + if decimal_point == thousand_separator then Error.throw (Illegal_Argument.Error "The decimal point and thousand separator must be different.") else + self.clone decimal_point=decimal_point thousand_separator=thousand_separator allow_leading_zeros=allow_leading_zeros allow_exponential_notation=allow_exponential_notation ## Specify values for Date/Time parsing. @@ -187,14 +188,16 @@ type Data_Formatter ## PRIVATE make_integer_parser self auto_mode=False target_type=Value_Type.Integer = - separator = if self.thousand_separator.is_empty then Nothing else self.thousand_separator + thousand_separator = if self.thousand_separator.is_empty then Nothing else self.thousand_separator + decimal_point = if self.decimal_point == Auto then Nothing else self.decimal_point storage_type = Storage.from_value_type_strict target_type - NumberParser.createIntegerParser storage_type auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values separator + NumberParser.createIntegerParser storage_type auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values decimal_point thousand_separator ## PRIVATE make_decimal_parser self auto_mode=False = - if self.decimal_point == Auto then NumberParser.createAutoDecimalParser auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values self.allow_exponential_notation else - NumberParser.createFixedDecimalParser auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values self.allow_exponential_notation self.thousand_separator self.decimal_point + decimal_point = if self.decimal_point == Auto then Nothing else self.decimal_point + thousand_separator = if self.thousand_separator.is_empty then Nothing else self.thousand_separator + NumberParser.createDecimalParser auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values self.allow_exponential_notation decimal_point thousand_separator ## PRIVATE make_boolean_parser self = self.wrap_base_parser <| @@ -219,7 +222,7 @@ type Data_Formatter make_identity_parser self = self.wrap_base_parser IdentityParser.new ## PRIVATE - make_value_type_parser self value_type = case value_type of + make_value_type_parser self value_type = Illegal_Argument.handle_java_exception <| case value_type of Value_Type.Integer _ -> self.make_integer_parser target_type=value_type # TODO once we implement #6109 we can support 32-bit floats @@ -234,7 +237,30 @@ type Data_Formatter ## PRIVATE get_specific_type_parsers self = - [self.make_integer_parser True, self.make_decimal_parser True, self.make_date_time_parser, self.make_date_parser, self.make_time_of_day_parser, self.make_boolean_parser] + try_us_parsers_first = self.decimal_point == Auto && self.thousand_separator != "." + preferred_auto_parsers = case try_us_parsers_first of + ## If we are in auto mode, we will first try parsing US integers, + then US floats and only then other integers and floats. + + Under normal circumstances, we first try integers and later + floats - but this would cause `1.000` to be interpreted as `1000` + because _all_ integers take precedence and floats are considered + later. But we want `1.000` to be interpreted as a `1.0` float by + default, so we change the ordering a bit. + True -> + us_preferred = self.with_number_formatting decimal_point='.' + [us_preferred.make_integer_parser auto_mode=True, us_preferred.make_decimal_parser auto_mode=True] + + ## However, if the `decimal_point` is set to something else, + we don't do auto inference, so this extra logic is not needed. + False -> [] + remaining_parsers = [self.make_integer_parser auto_mode=True, self.make_decimal_parser auto_mode=True, self.make_date_time_parser, self.make_date_parser, self.make_time_of_day_parser, self.make_boolean_parser] + parsers = preferred_auto_parsers + remaining_parsers + ## Unfortunately, the [] literal allows to create a vector containing + dataflow errors. That is not handled well later by Polyglot. So we + ensure all errors surface here. + parsers_with_propagated_error = parsers.map x->x + parsers_with_propagated_error ## PRIVATE make_auto_parser self = diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/NumberParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/NumberParser.java index d30cf9c3669..667c056e7b3 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/NumberParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/NumberParser.java @@ -10,8 +10,11 @@ import org.enso.table.problems.AggregatedProblems; import org.enso.table.problems.WithAggregatedProblems; import org.graalvm.polyglot.Context; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.regex.Pattern; /** A parser for numbers. @@ -43,22 +46,78 @@ public class NumberParser extends IncrementalDatatypeParser { private final static String CCY = "(?[^0-9(),. '+-]+)"; private final static String EXP = "(?[eE][+-]?\\d+)?"; private final static String SPACE = "\\s*"; - private final static String[] SEPARATORS = new String[] {",.", ".,", " ,", "',"}; + + private record Separators(String thousand, String decimal) {} + + private final Separators[] SEPARATORS; private final static Map PATTERNS = new HashMap<>(); private final IntegerType integerTargetType; - private static Pattern getPattern(boolean allowDecimal, boolean allowCurrency, boolean allowScientific, boolean trimValues, int index) { - int allowedSet = (allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS); - int separatorsIndex = index / allowedSet; - int patternIndex = index % allowedSet; + private static void validateSeparator(String name, String value) { + if (value == null) return; - if (separatorsIndex >= SEPARATORS.length) { - return null; + if (value.length() != 1) { + throw new IllegalArgumentException(name + " must be a single character, but it was '" + value + "'."); } - var separators = SEPARATORS[separatorsIndex]; - return getPattern(allowDecimal, allowCurrency, allowScientific, trimValues, patternIndex, separators); + // If we allowed separators to be a digit, super crazy stuff could happen - e.g. technically 10000 could be interpreted as 1000 by interpreting the first 0 as a thousand separator. Let's not do that. + if (Character.isDigit(value.charAt(0))) { + throw new IllegalArgumentException(name + " cannot be a digit, but it was '" + value + "'."); + } + } + + /** + * Builds a list of possible separator pairs. + *

+ * If one of the parameters is null, it is meant to be inferred (multiple separator pairs will be provided for + * it), if + * it is set to a concrete value, it will be fixed. + */ + private static Separators[] buildSeparators(boolean allowDecimal, String decimalPoint, String thousandSeparator) { + validateSeparator("Decimal point", decimalPoint); + validateSeparator("Thousand separator", thousandSeparator); + if (decimalPoint != null && decimalPoint.equals(thousandSeparator)) { + throw new IllegalArgumentException("Decimal point and thousand separator cannot be the same, but they were both '" + decimalPoint + "'."); + } + + boolean fullAutomaticMode = allowDecimal && decimalPoint == null && thousandSeparator == null; + if (fullAutomaticMode) { + return new Separators[] { + new Separators(",", "."), + new Separators(".", ","), + new Separators(" ", ","), + new Separators("'", ","), + }; + } + + List thousandSeparators; + if (thousandSeparator == null) { + List autoThousandSeparators = List.of(",", ".", "'", " "); + thousandSeparators = autoThousandSeparators.stream().filter(sep -> !sep.equals(decimalPoint)).toList(); + } else { + thousandSeparators = List.of(thousandSeparator); + } + + List decimalPoints; + if (decimalPoint == null) { + if (allowDecimal) { + List autoDecimalPoints = List.of(",", "."); + assert thousandSeparator != null; + decimalPoints = autoDecimalPoints.stream().filter(sep -> !sep.equals(thousandSeparator)).toList(); + } else { + // List.of(null) is not permitted... + decimalPoints = new ArrayList<>(); + decimalPoints.add(null); + } + } else { + decimalPoints = List.of(decimalPoint); + } + + return thousandSeparators.stream() + .flatMap(thousand -> decimalPoints.stream().map(decimal -> new Separators(thousand, decimal))) + .toArray(Separators[]::new); + } /** The number of patterns that are allowed for non-currency numbers. */ @@ -67,7 +126,7 @@ public class NumberParser extends IncrementalDatatypeParser { /** The number of patterns that are allowed for currency numbers. */ private static final int ALLOWED_CCY_PATTERNS = 6; - private static Pattern getPattern(boolean allowDecimal, boolean allowCurrency, boolean allowScientific, boolean trimValues, int patternIndex, String separators) { + private static Pattern buildPattern(boolean allowDecimal, boolean allowCurrency, boolean allowScientific, boolean trimValues, int patternIndex, Separators separators) { if (allowScientific && !allowDecimal) { throw new IllegalArgumentException("Scientific notation requires decimal numbers."); } @@ -76,9 +135,9 @@ public class NumberParser extends IncrementalDatatypeParser { return null; } - var INTEGER = "(?(\\d*)" + (separators.length() == 1 ? "" : "|(\\d{1,3}([" + separators.charAt(0) + "]\\d{3})*)") + ")"; + String INTEGER = "(?(\\d*)" + (separators.thousand == null ? "" : "|(\\d{1,3}([" + separators.thousand + "]\\d{3})*)") + ")"; - var decimalPoint = (separators.length() == 1 ? separators : separators.charAt(1)); + String decimalPoint = allowDecimal ? Objects.requireNonNull(separators.decimal) : null; var NUMBER = INTEGER + (allowDecimal ? "(?[" + decimalPoint + "]\\d*)?" : "") + (allowScientific ? EXP : ""); var pattern = switch (patternIndex) { @@ -103,20 +162,23 @@ public class NumberParser extends IncrementalDatatypeParser { private final boolean allowLeadingZeros; private final boolean allowScientific; private final boolean trimValues; - private final String separators; - /** - * Creates a new integer instance of this parser. - * - * @param integerTargetType the target type describing how large integer values can be accepted - * @param allowCurrency whether to allow currency symbols - * @param allowLeadingZeros whether to allow leading zeros - * @param trimValues whether to trim the input values - * @param thousandSeparator the thousand separator to use - */ - public static NumberParser createIntegerParser(IntegerType integerTargetType, boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, String thousandSeparator) { - var separator = thousandSeparator == null ? null : (thousandSeparator + '_'); - return new NumberParser(false, integerTargetType, allowCurrency, allowLeadingZeros, trimValues, false, separator); + /** + * Creates a new integer instance of this parser. + * + * @param integerTargetType the target type describing how large integer values can be accepted + * @param allowCurrency whether to allow currency symbols + * @param allowLeadingZeros whether to allow leading zeros + * @param trimValues whether to trim the input values + * @param decimalPoint the decimal point set for the current format, or null if not specified; this parser does + * not use decimal point (since it is for integers) but it ensure that if a decimal point is chosen, the inferred + * thousand separator will not clash with that specific decimal point + * @param thousandSeparator the thousand separator to use (if null then will be inferred) + */ + public static NumberParser createIntegerParser(IntegerType integerTargetType, boolean allowCurrency, + boolean allowLeadingZeros, boolean trimValues, + String decimalPoint, String thousandSeparator) { + return new NumberParser(false, integerTargetType, allowCurrency, allowLeadingZeros, trimValues, false, decimalPoint, thousandSeparator); } /** @@ -126,50 +188,39 @@ public class NumberParser extends IncrementalDatatypeParser { * @param allowLeadingZeros whether to allow leading zeros * @param trimValues whether to trim the input values * @param allowScientific whether to allow scientific notation + * @param decimalPoint the decimal separator to use (if null, then will be inferred) + * @param thousandSeparator the thousand separator to use (if null, then will be inferred) */ - public static NumberParser createAutoDecimalParser(boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific) { - return new NumberParser(true, null, allowCurrency, allowLeadingZeros, trimValues, allowScientific, null); + public static NumberParser createDecimalParser(boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String decimalPoint, String thousandSeparator) { + return new NumberParser(true, null, allowCurrency, allowLeadingZeros, trimValues, allowScientific, decimalPoint, thousandSeparator); } - /** - * Creates a new decimal instance of this parser. - * - * @param allowCurrency whether to allow currency symbols - * @param allowLeadingZeros whether to allow leading zeros - * @param trimValues whether to trim the input values - * @param allowScientific whether to allow scientific notation - * @param thousandSeparator the thousand separator to use - * @param decimalSeparator the decimal separator to use - */ - public static NumberParser createFixedDecimalParser(boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String thousandSeparator, String decimalSeparator) { - if (decimalSeparator == null || decimalSeparator.length() != 1) { - throw new IllegalArgumentException("Decimal separator must be a single character."); - } - - thousandSeparator = thousandSeparator == null ? "" : thousandSeparator; - return new NumberParser(true, null, allowCurrency, allowLeadingZeros, trimValues, allowScientific, thousandSeparator + decimalSeparator); - } - - private NumberParser(boolean allowDecimal, IntegerType integerTargetType, boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String separators) { + private NumberParser(boolean allowDecimal, IntegerType integerTargetType, boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String decimalPoint, String thousandSeparator) { this.allowDecimal = allowDecimal; this.integerTargetType = integerTargetType; this.allowCurrency = allowCurrency; this.allowLeadingZeros = allowLeadingZeros; this.trimValues = trimValues; this.allowScientific = allowScientific; - this.separators = separators; + SEPARATORS = buildSeparators(allowDecimal, decimalPoint, thousandSeparator); } /** * Creates a Pattern for the given index. * The index will be decoded into a specific set of separators (unless fixed - * separators are used) and then paired with on of the valid patterns for + * separators are used) and then paired with one of the valid patterns for * the given parser. */ private Pattern patternForIndex(int index) { - return separators == null - ? getPattern(allowDecimal, allowCurrency, allowScientific, trimValues, index) - : getPattern(allowDecimal, allowCurrency, allowScientific, trimValues, index, separators); + int allowedSet = (allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS); + int separatorsIndex = index / allowedSet; + int patternIndex = index % allowedSet; + + if (separatorsIndex >= SEPARATORS.length) { + return null; + } + + return buildPattern(allowDecimal, allowCurrency, allowScientific, trimValues, patternIndex, SEPARATORS[separatorsIndex]); } @Override diff --git a/test/Table_Tests/src/Formatting/Data_Formatter_Spec.enso b/test/Table_Tests/src/Formatting/Data_Formatter_Spec.enso index 8f45c6cf501..bd781ff768b 100644 --- a/test/Table_Tests/src/Formatting/Data_Formatter_Spec.enso +++ b/test/Table_Tests/src/Formatting/Data_Formatter_Spec.enso @@ -47,6 +47,20 @@ spec = formatter.parse "-Infinity" . should_equal (Number.negative_infinity) formatter.parse "NaN" . is_nan . should_be_true + Test.specify "should prefer the US decimal point in auto mode" <| + formatter = Data_Formatter.Value + formatter.parse "1.5" . should_equal 1.5 + formatter.parse "1.25" . should_equal 1.25 + formatter.parse "1.125" . should_equal 1.125 + formatter.parse "1,125" . should_equal 1125 + formatter.parse "1,000,000" . should_equal 1000000 + formatter.parse "1,000.5" . should_equal 1000.5 + + # But if it doesn't fit the US number, other formats should work as well: + formatter.parse "1.000,25" . should_equal 1000.25 + formatter.parse "1'000" . should_equal 1000 + formatter.parse "1.000.000" . should_equal 1000000 + Test.specify "should allow customizing the decimal point and thousand separator" <| formatter = Data_Formatter.Value thousand_separator="_" decimal_point="," formatter.parse "123" . should_equal 123 @@ -59,6 +73,36 @@ spec = formatter.parse "-1,0" . should_equal -1.0 formatter.parse "1,0001" . should_equal 1.0001 + Test.specify "should never infer thousand separator to be equal to decimal point" <| + f1 = Data_Formatter.Value decimal_point="." + f1.parse "1.0" . should_equal 1.0 + f1.parse "1.000" . should_equal 1.0 + # This only makes sense as thousand separator but because of the decimal point clashing, it cannot be parsed and stays as text: + f1.parse "1.000.000" . should_equal "1.000.000" + f1.parse "1,000,000" . should_equal 1000000 + + f2 = Data_Formatter.Value decimal_point="," + f2.parse "1,0" . should_equal 1.0 + f2.parse "1,000" . should_equal 1.0 + f2.parse "1,000,000" . should_equal "1,000,000" + f2.parse "1.0" . should_equal "1.0" + f2.parse "1.000" . should_equal 1000 + f2.parse "1.000.000" . should_equal 1000000 + + f3 = Data_Formatter.Value thousand_separator="." + f3.parse "1.000" . should_equal 1000 + f3.parse "1.0" . should_equal "1.0" + + f4 = Data_Formatter.Value thousand_separator="," + f4.parse "1,000" . should_equal 1000 + f4.parse "1,0" . should_equal "1,0" + + r5 = Data_Formatter.Value . with_number_formatting decimal_point="." thousand_separator="." + r5.should_fail_with Illegal_Argument + + r6 = Data_Formatter.Value decimal_point="." thousand_separator="." + r6.parse "1.000" . should_fail_with Illegal_Argument + Test.specify "should support exponential notation, but only if explicitly enabled" <| plain_formatter = Data_Formatter.Value exponential_formatter = Data_Formatter.Value allow_exponential_notation=True