Fix NumberParser to avoid thousandSeparator==decimalPoint and prefer US decimal format (#7946)

Closes #7930
2024-12-23 08:21:49 +03:00 · 2023-10-03 20:07:54 +02:00 · 2023-10-03 20:07:54 +02:00 · 08cd449a99
commit 08cd449a99
parent 2d39e644b8
3 changed files with 181 additions and 60 deletions
--- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Data_Formatter.enso
@ -97,8 +97,9 @@ type Data_Formatter
       - thousand_separator: A separator that can be used to separate groups of digits in numbers.
       - allow_leading_zeros: Specifies if values starting with leading zeroes should be treated as numbers.
       - allow_exponential_notation: Allow parsing of exponential notation format.
-    with_number_formatting : Text -> Text -> Boolean -> Boolean -> Data_Formatter
+    with_number_formatting : Text -> Text -> Boolean -> Boolean -> Data_Formatter ! Illegal_Argument
    with_number_formatting self (decimal_point=self.decimal_point) (thousand_separator=self.thousand_separator) (allow_leading_zeros=self.allow_leading_zeros) (allow_exponential_notation=self.allow_exponential_notation) =
+        if decimal_point == thousand_separator then Error.throw (Illegal_Argument.Error "The decimal point and thousand separator must be different.") else
            self.clone decimal_point=decimal_point thousand_separator=thousand_separator allow_leading_zeros=allow_leading_zeros allow_exponential_notation=allow_exponential_notation

    ## Specify values for Date/Time parsing.
@ -187,14 +188,16 @@ type Data_Formatter

    ## PRIVATE
    make_integer_parser self auto_mode=False target_type=Value_Type.Integer =
-        separator = if self.thousand_separator.is_empty then Nothing else self.thousand_separator
+        thousand_separator = if self.thousand_separator.is_empty then Nothing else self.thousand_separator
+        decimal_point = if self.decimal_point == Auto then Nothing else self.decimal_point
        storage_type = Storage.from_value_type_strict target_type
-        NumberParser.createIntegerParser storage_type auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values separator
+        NumberParser.createIntegerParser storage_type auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values decimal_point thousand_separator

    ## PRIVATE
    make_decimal_parser self auto_mode=False =
-        if self.decimal_point == Auto then NumberParser.createAutoDecimalParser auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values self.allow_exponential_notation else
-            NumberParser.createFixedDecimalParser auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values self.allow_exponential_notation self.thousand_separator self.decimal_point
+        decimal_point = if self.decimal_point == Auto then Nothing else self.decimal_point
+        thousand_separator = if self.thousand_separator.is_empty then Nothing else self.thousand_separator
+        NumberParser.createDecimalParser auto_mode.not (auto_mode.not || self.allow_leading_zeros) self.trim_values self.allow_exponential_notation decimal_point thousand_separator

    ## PRIVATE
    make_boolean_parser self = self.wrap_base_parser <|
@ -219,7 +222,7 @@ type Data_Formatter
    make_identity_parser self = self.wrap_base_parser IdentityParser.new

    ## PRIVATE
-    make_value_type_parser self value_type = case value_type of
+    make_value_type_parser self value_type = Illegal_Argument.handle_java_exception <| case value_type of
        Value_Type.Integer _ ->
            self.make_integer_parser target_type=value_type
        # TODO once we implement #6109 we can support 32-bit floats
@ -234,7 +237,30 @@ type Data_Formatter

    ## PRIVATE
    get_specific_type_parsers self =
-        [self.make_integer_parser True, self.make_decimal_parser True, self.make_date_time_parser, self.make_date_parser, self.make_time_of_day_parser, self.make_boolean_parser]
+        try_us_parsers_first = self.decimal_point == Auto && self.thousand_separator != "."
+        preferred_auto_parsers = case try_us_parsers_first of
+            ## If we are in auto mode, we will first try parsing US integers,
+               then US floats and only then other integers and floats.
+
+               Under normal circumstances, we first try integers and later
+               floats - but this would cause `1.000` to be interpreted as `1000`
+               because _all_ integers take precedence and floats are considered
+               later. But we want `1.000` to be interpreted as a `1.0` float by
+               default, so we change the ordering a bit.
+            True ->
+                us_preferred = self.with_number_formatting decimal_point='.'
+                [us_preferred.make_integer_parser auto_mode=True, us_preferred.make_decimal_parser auto_mode=True]
+
+            ## However, if the `decimal_point` is set to something else,
+               we don't do auto inference, so this extra logic is not needed.
+            False -> []
+        remaining_parsers = [self.make_integer_parser auto_mode=True, self.make_decimal_parser auto_mode=True, self.make_date_time_parser, self.make_date_parser, self.make_time_of_day_parser, self.make_boolean_parser]
+        parsers = preferred_auto_parsers + remaining_parsers
+        ## Unfortunately, the [] literal allows to create a vector containing
+           dataflow errors. That is not handled well later by Polyglot. So we
+           ensure all errors surface here.
+        parsers_with_propagated_error = parsers.map x->x
+        parsers_with_propagated_error

    ## PRIVATE
    make_auto_parser self =
--- a/std-bits/table/src/main/java/org/enso/table/parsing/NumberParser.java
+++ b/std-bits/table/src/main/java/org/enso/table/parsing/NumberParser.java
@ -10,8 +10,11 @@ import org.enso.table.problems.AggregatedProblems;
 import org.enso.table.problems.WithAggregatedProblems;
 import org.graalvm.polyglot.Context;

+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 import java.util.regex.Pattern;

 /** A parser for numbers.
@ -43,22 +46,78 @@ public class NumberParser extends IncrementalDatatypeParser {
    private final static String CCY = "(?<ccy>[^0-9(),. '+-]+)";
    private final static String EXP = "(?<exp>[eE][+-]?\\d+)?";
    private final static String SPACE = "\\s*";
-    private final static String[] SEPARATORS = new String[] {",.", ".,", " ,", "',"};
+
+    private record Separators(String thousand, String decimal) {}
+
+    private final Separators[] SEPARATORS;

    private final static Map<String, Pattern> PATTERNS = new HashMap<>();
    private final IntegerType integerTargetType;

-    private static Pattern getPattern(boolean allowDecimal, boolean allowCurrency, boolean allowScientific, boolean trimValues, int index) {
-        int allowedSet = (allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS);
-        int separatorsIndex = index / allowedSet;
-        int patternIndex = index % allowedSet;
+    private static void validateSeparator(String name, String value) {
+        if (value == null) return;

-        if (separatorsIndex >= SEPARATORS.length) {
-            return null;
+        if (value.length() != 1) {
+            throw new IllegalArgumentException(name + " must be a single character, but it was '" + value + "'.");
        }

-        var separators = SEPARATORS[separatorsIndex];
-        return getPattern(allowDecimal, allowCurrency, allowScientific, trimValues, patternIndex, separators);
+        // If we allowed separators to be a digit, super crazy stuff could happen - e.g. technically 10000 could be interpreted as 1000 by interpreting the first 0 as a thousand separator. Let's not do that.
+        if (Character.isDigit(value.charAt(0))) {
+            throw new IllegalArgumentException(name + " cannot be a digit, but it was '" + value + "'.");
+        }
+    }
+
+    /**
+     * Builds a list of possible separator pairs.
+     * <p>
+     * If one of the parameters is null, it is meant to be inferred (multiple separator pairs will be provided for
+     * it), if
+     * it is set to a concrete value, it will be fixed.
+     */
+    private static Separators[] buildSeparators(boolean allowDecimal, String decimalPoint, String thousandSeparator) {
+        validateSeparator("Decimal point", decimalPoint);
+        validateSeparator("Thousand separator", thousandSeparator);
+        if (decimalPoint != null && decimalPoint.equals(thousandSeparator)) {
+            throw new IllegalArgumentException("Decimal point and thousand separator cannot be the same, but they were both '" + decimalPoint + "'.");
+        }
+
+        boolean fullAutomaticMode = allowDecimal && decimalPoint == null && thousandSeparator == null;
+        if (fullAutomaticMode) {
+            return new Separators[] {
+                new Separators(",", "."),
+                new Separators(".", ","),
+                new Separators(" ", ","),
+                new Separators("'", ","),
+            };
+        }
+
+        List<String> thousandSeparators;
+        if (thousandSeparator == null) {
+            List<String> autoThousandSeparators = List.of(",", ".", "'", " ");
+            thousandSeparators = autoThousandSeparators.stream().filter(sep -> !sep.equals(decimalPoint)).toList();
+        } else {
+            thousandSeparators = List.of(thousandSeparator);
+        }
+
+        List<String> decimalPoints;
+        if (decimalPoint == null) {
+            if (allowDecimal) {
+                List<String> autoDecimalPoints = List.of(",", ".");
+                assert thousandSeparator != null;
+                decimalPoints = autoDecimalPoints.stream().filter(sep -> !sep.equals(thousandSeparator)).toList();
+            } else {
+                // List.of(null) is not permitted...
+                decimalPoints = new ArrayList<>();
+                decimalPoints.add(null);
+            }
+        } else {
+            decimalPoints = List.of(decimalPoint);
+        }
+
+        return thousandSeparators.stream()
+                .flatMap(thousand -> decimalPoints.stream().map(decimal -> new Separators(thousand, decimal)))
+                .toArray(Separators[]::new);
+
    }

    /** The number of patterns that are allowed for non-currency numbers. */
@ -67,7 +126,7 @@ public class NumberParser extends IncrementalDatatypeParser {
    /** The number of patterns that are allowed for currency numbers. */
    private static final int ALLOWED_CCY_PATTERNS = 6;

-    private static Pattern getPattern(boolean allowDecimal, boolean allowCurrency, boolean allowScientific, boolean trimValues, int patternIndex, String separators) {
+    private static Pattern buildPattern(boolean allowDecimal, boolean allowCurrency, boolean allowScientific, boolean trimValues, int patternIndex, Separators separators) {
        if (allowScientific && !allowDecimal) {
            throw new IllegalArgumentException("Scientific notation requires decimal numbers.");
        }
@ -76,9 +135,9 @@ public class NumberParser extends IncrementalDatatypeParser {
            return null;
        }

-        var INTEGER = "(?<integer>(\\d*)" + (separators.length() == 1 ? "" : "|(\\d{1,3}([" + separators.charAt(0) + "]\\d{3})*)") + ")";
+        String INTEGER = "(?<integer>(\\d*)" + (separators.thousand == null ? "" : "|(\\d{1,3}([" + separators.thousand + "]\\d{3})*)") + ")";

-        var decimalPoint = (separators.length() == 1 ? separators : separators.charAt(1));
+        String decimalPoint = allowDecimal ? Objects.requireNonNull(separators.decimal) : null;
        var NUMBER = INTEGER + (allowDecimal ? "(?<decimal>[" + decimalPoint + "]\\d*)?" : "") + (allowScientific ? EXP : "");

        var pattern = switch (patternIndex) {
@ -103,7 +162,6 @@ public class NumberParser extends IncrementalDatatypeParser {
    private final boolean allowLeadingZeros;
    private final boolean allowScientific;
    private final boolean trimValues;
-    private final String separators;

  /**
   * Creates a new integer instance of this parser.
@ -112,11 +170,15 @@ public class NumberParser extends IncrementalDatatypeParser {
   * @param allowCurrency whether to allow currency symbols
   * @param allowLeadingZeros whether to allow leading zeros
   * @param trimValues whether to trim the input values
-     * @param thousandSeparator the thousand separator to use
+   * @param decimalPoint the decimal point set for the current format, or null if not specified; this parser does
+   *     not use decimal point (since it is for integers) but it ensure that if a decimal point is chosen, the inferred
+   *     thousand separator will not clash with that specific decimal point
+   * @param thousandSeparator the thousand separator to use (if null then will be inferred)
   */
-    public static NumberParser createIntegerParser(IntegerType integerTargetType, boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, String thousandSeparator) {
-        var separator = thousandSeparator == null ? null : (thousandSeparator + '_');
-        return new NumberParser(false, integerTargetType, allowCurrency, allowLeadingZeros, trimValues, false, separator);
+  public static NumberParser createIntegerParser(IntegerType integerTargetType, boolean allowCurrency,
+                                                 boolean allowLeadingZeros, boolean trimValues,
+                                                 String decimalPoint, String thousandSeparator) {
+        return new NumberParser(false, integerTargetType, allowCurrency, allowLeadingZeros, trimValues, false, decimalPoint, thousandSeparator);
    }

    /**
@ -126,50 +188,39 @@ public class NumberParser extends IncrementalDatatypeParser {
     * @param allowLeadingZeros whether to allow leading zeros
     * @param trimValues whether to trim the input values
     * @param allowScientific whether to allow scientific notation
+     * @param decimalPoint the decimal separator to use (if null, then will be inferred)
+     * @param thousandSeparator the thousand separator to use (if null, then will be inferred)
     */
-    public static NumberParser createAutoDecimalParser(boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific) {
-        return new NumberParser(true, null, allowCurrency, allowLeadingZeros, trimValues, allowScientific, null);
+    public static NumberParser createDecimalParser(boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String decimalPoint, String thousandSeparator) {
+        return new NumberParser(true, null, allowCurrency, allowLeadingZeros, trimValues, allowScientific, decimalPoint, thousandSeparator);
    }

-    /**
-     * Creates a new decimal instance of this parser.
-     *
-     * @param allowCurrency whether to allow currency symbols
-     * @param allowLeadingZeros whether to allow leading zeros
-     * @param trimValues whether to trim the input values
-     * @param allowScientific whether to allow scientific notation
-     * @param thousandSeparator the thousand separator to use
-     * @param decimalSeparator the decimal separator to use
-     */
-    public static NumberParser createFixedDecimalParser(boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String thousandSeparator, String decimalSeparator) {
-        if (decimalSeparator == null || decimalSeparator.length() != 1) {
-            throw new IllegalArgumentException("Decimal separator must be a single character.");
-        }
-
-        thousandSeparator = thousandSeparator == null ? "" : thousandSeparator;
-        return new NumberParser(true, null, allowCurrency, allowLeadingZeros, trimValues, allowScientific, thousandSeparator + decimalSeparator);
-    }
-
-    private NumberParser(boolean allowDecimal, IntegerType integerTargetType, boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String separators) {
+    private NumberParser(boolean allowDecimal, IntegerType integerTargetType, boolean allowCurrency, boolean allowLeadingZeros, boolean trimValues, boolean allowScientific, String decimalPoint, String thousandSeparator) {
        this.allowDecimal = allowDecimal;
        this.integerTargetType = integerTargetType;
        this.allowCurrency = allowCurrency;
        this.allowLeadingZeros = allowLeadingZeros;
        this.trimValues = trimValues;
        this.allowScientific = allowScientific;
-        this.separators = separators;
+        SEPARATORS = buildSeparators(allowDecimal, decimalPoint, thousandSeparator);
    }

    /**
     * Creates a Pattern for the given index.
     * The index will be decoded into a specific set of separators (unless fixed
-     * separators are used) and then paired with on of the valid patterns for
+     * separators are used) and then paired with one of the valid patterns for
     * the given parser.
     */
    private Pattern patternForIndex(int index) {
-        return separators == null
-            ? getPattern(allowDecimal, allowCurrency, allowScientific, trimValues, index)
-            : getPattern(allowDecimal, allowCurrency, allowScientific, trimValues, index, separators);
+        int allowedSet = (allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS);
+        int separatorsIndex = index / allowedSet;
+        int patternIndex = index % allowedSet;
+
+        if (separatorsIndex >= SEPARATORS.length) {
+            return null;
+        }
+
+        return buildPattern(allowDecimal, allowCurrency, allowScientific, trimValues, patternIndex, SEPARATORS[separatorsIndex]);
    }

    @Override
--- a/test/Table_Tests/src/Formatting/Data_Formatter_Spec.enso
+++ b/test/Table_Tests/src/Formatting/Data_Formatter_Spec.enso
@ -47,6 +47,20 @@ spec =
            formatter.parse "-Infinity" . should_equal (Number.negative_infinity)
            formatter.parse "NaN" . is_nan . should_be_true

+        Test.specify "should prefer the US decimal point in auto mode" <|
+            formatter = Data_Formatter.Value
+            formatter.parse "1.5" . should_equal 1.5
+            formatter.parse "1.25" . should_equal 1.25
+            formatter.parse "1.125" . should_equal 1.125
+            formatter.parse "1,125" . should_equal 1125
+            formatter.parse "1,000,000" . should_equal 1000000
+            formatter.parse "1,000.5" . should_equal 1000.5
+
+            # But if it doesn't fit the US number, other formats should work as well:
+            formatter.parse "1.000,25" . should_equal 1000.25
+            formatter.parse "1'000" . should_equal 1000
+            formatter.parse "1.000.000" . should_equal 1000000
+
        Test.specify "should allow customizing the decimal point and thousand separator" <|
            formatter = Data_Formatter.Value thousand_separator="_" decimal_point=","
            formatter.parse "123" . should_equal 123
@ -59,6 +73,36 @@ spec =
            formatter.parse "-1,0" . should_equal -1.0
            formatter.parse "1,0001" . should_equal 1.0001

+        Test.specify "should never infer thousand separator to be equal to decimal point" <|
+            f1 = Data_Formatter.Value decimal_point="."
+            f1.parse "1.0" . should_equal 1.0
+            f1.parse "1.000" . should_equal 1.0
+            # This only makes sense as thousand separator but because of the decimal point clashing, it cannot be parsed and stays as text:
+            f1.parse "1.000.000" . should_equal "1.000.000"
+            f1.parse "1,000,000" . should_equal 1000000
+
+            f2 = Data_Formatter.Value decimal_point=","
+            f2.parse "1,0" . should_equal 1.0
+            f2.parse "1,000" . should_equal 1.0
+            f2.parse "1,000,000" . should_equal "1,000,000"
+            f2.parse "1.0" . should_equal "1.0"
+            f2.parse "1.000" . should_equal 1000
+            f2.parse "1.000.000" . should_equal 1000000
+
+            f3 = Data_Formatter.Value thousand_separator="."
+            f3.parse "1.000" . should_equal 1000
+            f3.parse "1.0" . should_equal "1.0"
+
+            f4 = Data_Formatter.Value thousand_separator=","
+            f4.parse "1,000" . should_equal 1000
+            f4.parse "1,0" . should_equal "1,0"
+
+            r5 = Data_Formatter.Value . with_number_formatting decimal_point="." thousand_separator="."
+            r5.should_fail_with Illegal_Argument
+
+            r6 = Data_Formatter.Value decimal_point="." thousand_separator="."
+            r6.parse "1.000" . should_fail_with Illegal_Argument
+
        Test.specify "should support exponential notation, but only if explicitly enabled" <|
            plain_formatter = Data_Formatter.Value
            exponential_formatter = Data_Formatter.Value allow_exponential_notation=True