From 6b544650b36baec742fe0b317208321e1d67c916 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Wed, 13 Nov 2024 19:08:23 +0000 Subject: [PATCH] New NumberParser for Table parsing (#11499) Replaces the Regex based number parser with a new parser which works out the same by working out each part as it sees and example of it. Close #7398 - performance of reading the large CSV now about 2s (down from 15-20s). --- .../Table/0.0.0-dev/src/Data_Formatter.enso | 24 +- .../parser/FormatDetectingNumberParser.java | 324 +++++++++++ .../org/enso/base/parser/NegativeSign.java | 40 ++ .../base/parser/NumberWithSeparators.java | 503 ++++++++++++++++++ .../java/org/enso/base/parser/Separators.java | 239 +++++++++ .../org/enso/table/parsing/NumberParser.java | 453 +++------------- .../src/Formatting/Parse_Values_Spec.enso | 4 +- 7 files changed, 1197 insertions(+), 390 deletions(-) create mode 100644 std-bits/base/src/main/java/org/enso/base/parser/FormatDetectingNumberParser.java create mode 100644 std-bits/base/src/main/java/org/enso/base/parser/NegativeSign.java create mode 100644 std-bits/base/src/main/java/org/enso/base/parser/NumberWithSeparators.java create mode 100644 std-bits/base/src/main/java/org/enso/base/parser/Separators.java diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data_Formatter.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data_Formatter.enso index 925eff9cfd..bcc54682fd 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data_Formatter.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data_Formatter.enso @@ -249,26 +249,12 @@ type Data_Formatter get_specific_type_parsers self = ## Have to do date parsing first to allow for pure numeric formats. date_parsers = [self.make_date_time_parser, self.make_date_parser, self.make_time_of_day_parser] - - try_us_parsers_first = self.decimal_point == Auto && self.thousand_separator != "." 
- preferred_auto_parsers = case try_us_parsers_first of - ## If we are in auto mode, we will first try parsing US integers, - then US floats and only then other integers and floats. - - Under normal circumstances, we first try integers and later - floats - but this would cause `1.000` to be interpreted as `1000` - because _all_ integers take precedence and floats are considered - later. But we want `1.000` to be interpreted as a `1.0` float by - default, so we change the ordering a bit. - True -> - us_preferred = self.with_number_formatting decimal_point='.' - [us_preferred.make_integer_parser auto_mode=True, us_preferred.make_decimal_parser auto_mode=True] - - ## However, if the `decimal_point` is set to something else, - we don't do auto inference, so this extra logic is not needed. - False -> [] + us_parsers = if self.decimal_point != Auto || self.thousand_separator != "" then [] else + ## In default mode, add the English pattern to ensure they dominate. + english = self.with_number_formatting decimal_point='.' thousand_separator=',' + [english.make_integer_parser auto_mode=True, english.make_decimal_parser auto_mode=True] remaining_parsers = [self.make_integer_parser auto_mode=True, self.make_decimal_parser auto_mode=True, self.make_boolean_parser] - parsers = date_parsers + preferred_auto_parsers + remaining_parsers + parsers = date_parsers + us_parsers + remaining_parsers ## Unfortunately, the [] literal allows to create a vector containing dataflow errors. That is not handled well later by Polyglot. So we ensure all errors surface here. diff --git a/std-bits/base/src/main/java/org/enso/base/parser/FormatDetectingNumberParser.java b/std-bits/base/src/main/java/org/enso/base/parser/FormatDetectingNumberParser.java new file mode 100644 index 0000000000..e02f24c4fd --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/parser/FormatDetectingNumberParser.java @@ -0,0 +1,324 @@ +package org.enso.base.parser; + +/** + * Parse a String into a Number. 
It supports the following patterns: + * + * + */ +public class FormatDetectingNumberParser { + public interface NumberParseResult {} + + public interface NumberParseResultSuccess extends NumberParseResult { + NumberParseResultSuccess negate(); + + NumberParseResultSuccess withSymbol(String symbol); + } + + public record NumberParseLong(long number, String symbol, boolean negated) + implements NumberParseResultSuccess { + @Override + public NumberParseResultSuccess negate() { + return new NumberParseLong(-number, symbol, true); + } + + @Override + public NumberParseResultSuccess withSymbol(String symbol) { + return new NumberParseLong(number, symbol, negated); + } + } + + public record NumberParseDouble(double number, String symbol) + implements NumberParseResultSuccess { + @Override + public NumberParseResultSuccess negate() { + return new NumberParseDouble(-number, symbol); + } + + @Override + public NumberParseResultSuccess withSymbol(String symbol) { + return new NumberParseDouble(number, symbol); + } + } + + public record NumberParseFailure(String message) implements NumberParseResult {} + + private final boolean allowSymbol; + private final boolean allowLeadingZeroes; + private final boolean allowLeadingTrailingWhitespace; + private final boolean allowExponentialNotation; + private NegativeSign negativeSign; + private NumberWithSeparators numberWithSeparators; + + public FormatDetectingNumberParser( + boolean allowSymbol, + boolean allowLeadingZeroes, + boolean allowLeadingTrailingWhitespace, + boolean allowExponentialNotation, + NegativeSign negativeSign, + NumberWithSeparators numberWithSeparators) { + this.allowSymbol = allowSymbol; + this.allowLeadingZeroes = allowLeadingZeroes; + this.allowLeadingTrailingWhitespace = allowLeadingTrailingWhitespace; + this.allowExponentialNotation = allowExponentialNotation; + this.negativeSign = negativeSign; + this.numberWithSeparators = numberWithSeparators; + } + + public NegativeSign negativeSign() { + return 
negativeSign; + } + + public void setNegativeSign(NegativeSign newNegativeSign) { + if (negativeSign != NegativeSign.UNKNOWN) { + throw new IllegalStateException("Negative Sign Already Set."); + } + negativeSign = newNegativeSign; + } + + public NumberWithSeparators numberWithSeparators() { + return numberWithSeparators; + } + + /** + * Parse a string into a number. + * + * @param value the string to parse. + * @param integer whether to parse a Long or a Double. + * @return the parsed number, or a failure if the parse was unsuccessful. + */ + public NumberParseResult parse(CharSequence value, boolean integer) { + // Ensure that if we are allowing exponential notation, we are not parsing an integer. + assert !(allowExponentialNotation && integer); + + // State + boolean lastWasWhitespace = false; + boolean encounteredContent = false; + boolean encounteredSign = false; + boolean needsNegating = false; + NumberParseResultSuccess number = null; + String symbol = ""; + + // Scan the value + int idx = 0; + int length = value.length(); + while (idx < length) { + char c = value.charAt(idx); + + if (Character.isWhitespace(c)) { + if (!allowLeadingTrailingWhitespace && !encounteredContent) { + return new NumberParseFailure("Unexpected leading Whitespace."); + } + + if (idx > 0 && (value.charAt(idx - 1) == '-' || value.charAt(idx - 1) == '+')) { + return new NumberParseFailure("Unexpected whitespace after sign."); + } + + idx++; + lastWasWhitespace = true; + } else { + encounteredContent = true; + lastWasWhitespace = false; + + if (NumberWithSeparators.isDigit(c) || Separators.isSeparator(c)) { + if (number != null) { + return new NumberParseFailure("Multiple Number Sections."); + } + + var numberPart = + numberWithSeparators.parse(value, idx, integer, allowExponentialNotation); + + // If the format changed, catch new format and unwrap result. 
+ if (numberPart instanceof NumberWithSeparators.NumberParseResultWithFormat newFormat) { + numberWithSeparators = newFormat.format(); + numberPart = newFormat.result(); + } + + // Result should either be a new index or a failure. + // If it is a new index, update the index and unwrap the result. + if (numberPart instanceof NumberWithSeparators.NumberParseResultWithIndex newIndex) { + // Check for leading zeroes (0 or 0 is acceptable). + if (!allowLeadingZeroes + && c == '0' + && newIndex.endIdx() > idx + 1 + && value.charAt(idx + 1) != numberWithSeparators.getDecimal()) { + return new NumberParseFailure("Leading Zero."); + } + + idx = newIndex.endIdx(); + numberPart = newIndex.result(); + } + + if (numberPart instanceof NumberParseResultSuccess numberSuccess) { + number = numberSuccess; + } else { + return numberPart; + } + } else if (NegativeSign.isOpenSign(c)) { + if (encounteredSign || number != null) { + return new NumberParseFailure("Unexpected sign character."); + } + + var signOk = negativeSign.checkValid(c); + if (signOk.isEmpty()) { + return new NumberParseFailure("Inconsistent negative format."); + } + + negativeSign = signOk.get(); + encounteredSign = true; + needsNegating = c != '+'; + idx++; + } else if (c == ')') { + if (!needsNegating || negativeSign != NegativeSign.BRACKET_OPEN || number == null) { + return new NumberParseFailure("Unexpected bracket close."); + } + + // Should only be whitespace left. + idx++; + while (idx < length) { + if (!Character.isWhitespace(value.charAt(idx))) { + return new NumberParseFailure("Unexpected characters after bracket close."); + } + idx++; + lastWasWhitespace = true; + } + + // Negate here so can tell finished. 
+ number = number.negate(); + needsNegating = false; + } else if (!integer + && number == null + && isSameSequence(value, idx, "infinity", "INFINITY")) { + // Identify Infinity + number = new NumberParseDouble(Double.POSITIVE_INFINITY, ""); + idx += 8; + } else if (!integer + && number == null + && !encounteredSign + && !needsNegating + && isSameSequence(value, idx, "nan", "NAN")) { + // Identify NaN + number = new NumberParseDouble(Double.NaN, ""); + idx += 3; + } else { + if (!symbol.isEmpty()) { + return new NumberParseFailure("Multiple Symbol Sections."); + } + + if (!allowSymbol) { + return new NumberParseFailure("Symbols not allowed."); + } + + // ToDo: Locking symbol position within text parts. + int endIdx = idx; + while (endIdx < length + && !NumberWithSeparators.isDigit(c) + && !Separators.isSeparator(c) + && !NegativeSign.isSign(c) + && !Character.isWhitespace(c)) { + endIdx++; + if (endIdx < length) { + c = value.charAt(endIdx); + } + } + + symbol = value.subSequence(idx, endIdx).toString(); + idx = endIdx; + } + } + } + + // Check for trailing whitespace. + if (!allowLeadingTrailingWhitespace && lastWasWhitespace) { + return new NumberParseFailure("Trailing Whitespace."); + } + + // Special check for unclosed bracket. + if (negativeSign == NegativeSign.BRACKET_OPEN && needsNegating) { + return new NumberParseFailure("Unclosed bracket."); + } + + // Fail if no number found. + if (number == null) { + return new NumberParseFailure("No Number Found."); + } + + // Return Result + number = needsNegating ? number.negate() : number; + + // Handle Special Case of Negated 0 If Not An Integer + if (!integer + && number instanceof NumberParseLong longNumber + && longNumber.number() == 0 + && longNumber.negated()) { + // Catch -0 double. + number = new NumberParseDouble(-0.0, longNumber.symbol()); + } + + return symbol.isEmpty() ? 
number : number.withSymbol(symbol); + } + + public Long parseLong(CharSequence value) { + var result = parse(value, true); + if (result instanceof NumberParseLong numberSuccess) { + return numberSuccess.number(); + } + return null; + } + + public Double parseDouble(CharSequence value) { + var result = parse(value, false); + if (result instanceof NumberParseDouble numberSuccess) { + return numberSuccess.number(); + } else if (result instanceof NumberParseLong longNumber) { + return (double) longNumber.number(); + } + return null; + } + + public NumberParseResult[] parseMany(CharSequence[] values, boolean integer) { + var results = new NumberParseResult[values.length]; + + int i = 0; + while (i < values.length) { + var previous = numberWithSeparators; + results[i] = parse(values[i], integer); + + if (numberWithSeparators != previous + && ((previous == NumberWithSeparators.DOT_UNKNOWN + && numberWithSeparators != NumberWithSeparators.DOT_COMMA) + || (previous == NumberWithSeparators.COMMA_UNKNOWN + && numberWithSeparators != NumberWithSeparators.DOT_COMMA))) { + // Start scan over, as format was incorrect. 
+ i = 0; + } else { + i++; + } + } + + return results; + } + + private static boolean isSameSequence( + CharSequence sequence, int index, CharSequence toMatchLower, CharSequence toMatchUpper) { + assert toMatchLower.length() == toMatchUpper.length(); + if (index + toMatchLower.length() > sequence.length()) { + return false; + } + + for (int i = 0; i < toMatchLower.length(); i++) { + char c = sequence.charAt(index + i); + if (c != toMatchLower.charAt(i) && c != toMatchUpper.charAt(i)) { + return false; + } + } + + return true; + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/parser/NegativeSign.java b/std-bits/base/src/main/java/org/enso/base/parser/NegativeSign.java new file mode 100644 index 0000000000..d7cdbc3278 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/parser/NegativeSign.java @@ -0,0 +1,40 @@ +package org.enso.base.parser; + +import java.util.Optional; + +public enum NegativeSign { + /** No sign encountered, so could be either. */ + UNKNOWN, + /** Minus or Plus sign - e.g. +123 or -123. */ + MINUS, + /** Brackets - e.g. (123) */ + BRACKET_OPEN; + + /** + * Checks if the given character is a valid negative sign. + * + * @param c the character to check + * @return the new state of the negative sign or Optional.empty if the character is invalid. + */ + public Optional checkValid(char c) { + var result = + switch (this) { + case UNKNOWN -> c == '-' || c == '+' ? MINUS : c == '(' ? BRACKET_OPEN : null; + case MINUS -> c == '(' ? null : this; + case BRACKET_OPEN -> c != '(' ? 
null : this; + }; + return Optional.ofNullable(result); + } + + static boolean isOpenSign(char c) { + return c == '-' || c == '+' || c == '('; + } + + static boolean isCloseSign(char c) { + return c == ')'; + } + + static boolean isSign(char c) { + return isOpenSign(c) || isCloseSign(c); + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/parser/NumberWithSeparators.java b/std-bits/base/src/main/java/org/enso/base/parser/NumberWithSeparators.java new file mode 100644 index 0000000000..f3cd2bc1fa --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/parser/NumberWithSeparators.java @@ -0,0 +1,503 @@ +package org.enso.base.parser; + +import java.util.Optional; +import org.enso.base.parser.FormatDetectingNumberParser.NumberParseDouble; +import org.enso.base.parser.FormatDetectingNumberParser.NumberParseFailure; +import org.enso.base.parser.FormatDetectingNumberParser.NumberParseLong; +import org.enso.base.parser.FormatDetectingNumberParser.NumberParseResult; + +/** + * Number parsing with separators. Specifies the universe of number formats that can be parsed. Two + * special cases, where we default to English format over European: + * + *
    + *
<ul> + *
  <li>Encounter a single . or , with 3 trailing digits. + *
  <li>Could be either DOT_COMMA or COMMA_DOT. + *
  <li>If a single . then uses DOT_UNKNOWN. + *
  <li>If a single , then uses COMMA_UNKNOWN. + * </ul>
+ */ +public enum NumberWithSeparators { + UNKNOWN(Constants.UNKNOWN, Constants.UNKNOWN), + + // Special case where we have encountered a . with 3 trailing digits. Such as + // ##0.123 ###.123 + DOT_UNKNOWN(Constants.UNKNOWN, '.'), + // Special case where we have encountered a single . within 3 digits from + // start and without 3 digits from end. Such as ##3.1# or ##3.1415... + UNKNOWN_DOT(Constants.UNKNOWN, '.'), + // Special case where we have encountered a , with 3 trailing digits. Such as + // ##0,123 ###,123 + COMMA_UNKNOWN(',', Constants.UNKNOWN), + // Special case where we have encountered a single . within 3 digits from + // start and without 3 digits from end. Such as ##3,1# or ##3,1415... + UNKNOWN_COMMA(Constants.UNKNOWN, ','), + + NO_UNKNOWN(Constants.NONE, Constants.UNKNOWN), + NO_DOT(Constants.NONE, '.'), + NO_COMMA(Constants.NONE, ','), + + // European format (e.g. 1.234,56) + DOT_COMMA('.', ','), + + // English format (e.g. 1,234.56) + COMMA_DOT(',', '.'), + + SPACE_UNKNOWN(' ', Constants.UNKNOWN), + SPACE_DOT(' ', '.'), + SPACE_COMMA(' ', ','), + + SWISS_UNKNOWN('\'', Constants.UNKNOWN), + SWISS_DOT('\'', '.'), + SWISS_COMMA('\'', ','), + + UNDERSCORE_UNKNOWN('_', Constants.UNKNOWN), + UNDERSCORE_DOT('_', '.'), + UNDERSCORE_COMMA('_', ','); + + public static NumberWithSeparators fromSeparators(String thousand, String decimal) { + if (thousand != null && thousand.length() > 1) { + throw new IllegalArgumentException("Invalid thousand separator (more than one character)."); + } + + if (decimal != null && decimal.length() > 1) { + throw new IllegalArgumentException("Invalid decimal separator (more than one character)."); + } + + char thousands = + thousand == null + ? Constants.UNKNOWN + : (thousand.isEmpty() ? Constants.NONE : thousand.charAt(0)); + char decimals = + decimal == null + ? Constants.UNKNOWN + : (decimal.isEmpty() ? 
Constants.NONE : decimal.charAt(0)); + + Optional matched = + switch (thousands) { + case Constants.NONE -> matchForNone(decimals); + case Constants.UNKNOWN -> matchForUnknown(decimals); + case ',' -> switch (decimals) { + case Constants.UNKNOWN, Constants.NONE, '.' -> Optional.of(COMMA_DOT); + default -> Optional.empty(); + }; + case '.' -> switch (decimals) { + case Constants.UNKNOWN, Constants.NONE, ',' -> Optional.of(DOT_COMMA); + default -> Optional.empty(); + }; + case ' ' -> matchForSpace(decimals); + case '\'' -> matchForSwiss(decimals); + case '_' -> matchForUnderscore(decimals); + default -> Optional.empty(); + }; + + if (matched.isEmpty()) { + throw new IllegalArgumentException("Invalid separators."); + } + return matched.get(); + } + + private static Optional matchForNone(char decimal) { + return switch (decimal) { + case Constants.UNKNOWN -> Optional.of(NO_UNKNOWN); + case '.' -> Optional.of(NO_DOT); + case ',' -> Optional.of(NO_COMMA); + default -> Optional.empty(); + }; + } + + private static Optional matchForUnknown(char decimal) { + return switch (decimal) { + case Constants.UNKNOWN -> Optional.of(UNKNOWN); + case '.' -> Optional.of(UNKNOWN_DOT); + case ',' -> Optional.of(UNKNOWN_COMMA); + default -> Optional.empty(); + }; + } + + private static Optional matchForSpace(char decimal) { + return switch (decimal) { + case Constants.UNKNOWN -> Optional.of(SPACE_UNKNOWN); + case '.' -> Optional.of(SPACE_DOT); + case ',' -> Optional.of(SPACE_COMMA); + default -> Optional.empty(); + }; + } + + private static Optional matchForSwiss(char decimal) { + return switch (decimal) { + case Constants.UNKNOWN -> Optional.of(SWISS_UNKNOWN); + case '.' -> Optional.of(SWISS_DOT); + case ',' -> Optional.of(SWISS_COMMA); + default -> Optional.empty(); + }; + } + + private static Optional matchForUnderscore(char decimal) { + return switch (decimal) { + case Constants.UNKNOWN -> Optional.of(UNDERSCORE_UNKNOWN); + case '.' 
-> Optional.of(UNDERSCORE_DOT); + case ',' -> Optional.of(UNDERSCORE_COMMA); + default -> Optional.empty(); + }; + } + + static class Constants { + static final char NONE = '\0'; + static final char UNKNOWN = '\uFFFD'; + } + + static boolean isDigit(char c) { + return (c >= '0' && c <= '9'); + } + + private final char thousands; + private final char decimal; + + NumberWithSeparators(char thousands, char decimal) { + this.thousands = thousands; + this.decimal = decimal; + } + + public char getThousands() { + return thousands; + } + + public char getDecimal() { + return decimal; + } + + /** + * While currently the format is treated as English, could be incorrect and actually is European. + */ + public boolean mightBeEuropean() { + return this == COMMA_UNKNOWN || this == DOT_UNKNOWN; + } + + NumberParseResult parse( + CharSequence value, int idx, boolean integer, boolean allowExponentialNotation) { + var separators = Separators.parse(value, idx, integer, allowExponentialNotation); + // TODO: Add more detail on separator failure. + if (separators == null) { + return new NumberParseFailure("Invalid separators."); + } + + if (thousands != Constants.UNKNOWN && (integer || decimal != Constants.UNKNOWN)) { + // If we have a fixed format then we can parse the number. + return integer + ? parseFixedInteger(value, idx, separators.endIdx(), separators.first()) + : parseFixedDecimal( + value, + idx, + separators.endIdx(), + separators.first(), + separators.second(), + separators.exponential()); + } + + return integer + ? parseUnknownInteger( + value, idx, separators.endIdx(), separators.first(), separators.count()) + : parseUnknownDecimal( + value, + idx, + separators.endIdx(), + separators.first(), + separators.second(), + separators.count(), + separators.lastSeparatorIdx(), + separators.exponential()); + } + + /** Internal record for returning when a new format is matched. 
*/ + record NumberParseResultWithFormat(NumberWithSeparators format, NumberParseResult result) + implements NumberParseResult {} + + /** Internal record for returning the end index of the matched number. */ + record NumberParseResultWithIndex(int endIdx, NumberParseResult result) + implements NumberParseResult { + + boolean exceedsThousand() { + return switch (result) { + case NumberParseLong lngValue -> lngValue.number() >= 1000; + case NumberParseDouble dblValue -> dblValue.number() >= 1000; + default -> false; + }; + } + } + + /** Given a known integer format, parse the sequence. */ + private NumberParseResult parseFixedInteger( + CharSequence value, int idx, int endIdx, char firstSeparator) { + assert thousands != Constants.UNKNOWN; + + // Strip out the separators. + int origEndIdx = endIdx; + if (thousands != Constants.NONE) { + value = Separators.strip(value, idx, endIdx, thousands, decimal); + if (value == null) { + return new NumberParseFailure("Invalid number."); + } + idx = 0; + endIdx = value.length(); + } + + try { + long number = Long.parseLong(value, idx, endIdx, 10); + return new NumberParseResultWithIndex(origEndIdx, new NumberParseLong(number, "", false)); + } catch (NumberFormatException e) { + return new NumberParseFailure("Invalid number."); + } + } + + /** Parse an unknown format with no separators. */ + private NumberParseResult parseUnknownIntegerNone(CharSequence value, int idx, int endIdx) { + assert thousands == Constants.UNKNOWN; + + // We haven't encountered any separators. So parse the number as a long. + try { + long number = Long.parseLong(value, idx, endIdx, 10); + var result = new NumberParseResultWithIndex(endIdx, new NumberParseLong(number, "", false)); + + // If greater than or equal 1000, then we know no thousand separators. + if (number >= 1000) { + var format = + switch (decimal) { + case '.' 
-> NO_DOT; + case ',' -> NO_COMMA; + default -> NO_UNKNOWN; + }; + + if (this != format) { + return new NumberParseResultWithFormat(format, result); + } + } + + return result; + } catch (NumberFormatException e) { + return new NumberParseFailure("Invalid number."); + } + } + + /** Parse an unknown Integer format. */ + private NumberParseResult parseUnknownInteger( + CharSequence value, int idx, int endIdx, char separator, int separatorCount) { + assert thousands == Constants.UNKNOWN; + + if (separator == decimal) { + // Encountered a decimal point, so can't be an integer. + return new NumberParseFailure("Encountered Decimal Point - Can't Be Integer."); + } + + if (separator == Constants.NONE) { + // Didn't encounter any separators so use simpler logic. + return parseUnknownIntegerNone(value, idx, endIdx); + } + + // Find the correct format + var format = + switch (separator) { + case '.' -> DOT_COMMA; + case ',' -> separatorCount == 1 ? COMMA_UNKNOWN : COMMA_DOT; + case ' ' -> (decimal == Constants.UNKNOWN + ? SPACE_UNKNOWN + : (decimal == '.' ? SPACE_DOT : SPACE_COMMA)); + case '\'' -> (decimal == Constants.UNKNOWN + ? SWISS_UNKNOWN + : (decimal == '.' ? SWISS_DOT : SWISS_COMMA)); + default -> null; + }; + if (format == null) { + return new NumberParseFailure("No matching number format."); + } + + var result = format.parseFixedInteger(value, idx, endIdx, separator); + return (result instanceof NumberParseFailure) + ? result + : new NumberParseResultWithFormat(format, result); + } + + /** Given a known double format, parse the sequence. */ + private NumberParseResult parseFixedDecimal( + CharSequence value, + int idx, + int endIdx, + char firstSeparator, + char secondSeparator, + boolean exponential) { + // Deal with the special cases first. + if (this == DOT_UNKNOWN || this == UNKNOWN_DOT) { + // Haven't encountered a thousand separator, but know the decimal separator. + // If DOT_UNKNOWN then could be European or English, but treat as English. 
+ assert firstSeparator == '.' && secondSeparator == Constants.NONE; + return NO_DOT.parseFixedDecimal( + value, idx, endIdx, firstSeparator, secondSeparator, exponential); + } else if (this == COMMA_UNKNOWN) { + // Have only encountered a Comma(s), so treat as English format (COMMA_DOT). + assert firstSeparator == ',' && secondSeparator == Constants.NONE; + return COMMA_DOT.parseFixedDecimal( + value, idx, endIdx, firstSeparator, secondSeparator, exponential); + } else if (this == UNKNOWN_COMMA) { + // Have encountered a comma and know is a decimal separator. + assert firstSeparator == ',' && secondSeparator == Constants.NONE; + return NO_COMMA.parseFixedDecimal( + value, idx, endIdx, firstSeparator, secondSeparator, exponential); + } + + assert thousands != Constants.UNKNOWN && decimal != Constants.UNKNOWN; + + // If no decimal separator, then must be an integer. + if (!exponential && firstSeparator != decimal && secondSeparator != decimal) { + return parseFixedInteger(value, idx, endIdx, firstSeparator); + } + + // Validate Separators. + if (firstSeparator != Constants.NONE) { + if ((secondSeparator == Constants.NONE + && firstSeparator != thousands + && firstSeparator != decimal) + || (secondSeparator != Constants.NONE + && (firstSeparator != thousands || secondSeparator != decimal))) { + return new NumberParseFailure("Invalid separator."); + } + } + + // Strip out the separators. 
+ int origEndIdx = endIdx; + if (thousands != Constants.NONE || decimal != '.') { + value = Separators.strip(value, idx, endIdx, thousands, decimal); + if (value == null) { + return new NumberParseFailure("Invalid number."); + } + idx = 0; + endIdx = value.length(); + } + + try { + double number = Double.parseDouble(value.subSequence(idx, endIdx).toString()); + return new NumberParseResultWithIndex(origEndIdx, new NumberParseDouble(number, "")); + } catch (NumberFormatException e) { + return new NumberParseFailure("Invalid number."); + } + } + + /** Given a unknown format, parse the sequence. */ + private NumberParseResult parseUnknownDecimal( + CharSequence value, + int idx, + int endIdx, + char firstSeparator, + char secondSeparator, + int separatorCount, + int lastSeparatorIdx, + boolean exponential) { + assert thousands == Constants.UNKNOWN || decimal == Constants.UNKNOWN; + + // Special case when single separator equal to decimal point. + if (separatorCount == 1 && firstSeparator == decimal) { + var fixed = decimal == '.' ? NO_DOT : NO_COMMA; + var result = + fixed.parseFixedDecimal(value, idx, endIdx, Constants.NONE, decimal, exponential); + if (result instanceof NumberParseResultWithIndex resultWithIndex + && resultWithIndex.exceedsThousand()) { + return new NumberParseResultWithFormat(fixed, result); + } else { + return result; + } + } + + // Cases of no separators or repeated single separator - must be integer. + if (!exponential + && (firstSeparator == Constants.NONE + || (secondSeparator == Constants.NONE + && (separatorCount > 1 || firstSeparator == ' ' || firstSeparator == '\'')))) { + if (mightBeEuropean() && firstSeparator == '.') { + // We know we are wrong. + var result = DOT_COMMA.parseFixedInteger(value, idx, endIdx, '.'); + return (result instanceof NumberParseFailure) + ? result + : new NumberParseResultWithFormat(DOT_COMMA, result); + } + + var result = + thousands == Constants.UNKNOWN + ? 
parseUnknownInteger(value, idx, endIdx, firstSeparator, separatorCount) + : parseFixedInteger( + value, idx, endIdx, separatorCount == 0 ? thousands : firstSeparator); + + // Special case if COMMA_UNKNOWN and count > 1 then is COMMA_DOT. + boolean resolveCommaUnknown = this == COMMA_UNKNOWN && separatorCount > 1; + return (result instanceof NumberParseFailure) + ? result + : (resolveCommaUnknown ? new NumberParseResultWithFormat(COMMA_DOT, result) : result); + } + + // Case when in exponential notation and no separators. + if (exponential && firstSeparator == Constants.NONE) { + return NO_DOT.parseFixedDecimal( + value, idx, endIdx, Constants.NONE, Constants.NONE, exponential); + } + + // Need to resolve the format. + NumberWithSeparators format = null; + if (secondSeparator != Constants.NONE) { + format = + switch (firstSeparator) { + case '.' -> secondSeparator == ',' ? DOT_COMMA : null; + case ',' -> secondSeparator == '.' ? COMMA_DOT : null; + case ' ' -> secondSeparator == '.' + ? SPACE_DOT + : secondSeparator == ',' ? SPACE_COMMA : null; + case '\'' -> secondSeparator == '.' + ? SWISS_DOT + : secondSeparator == ',' ? SWISS_COMMA : null; + default -> null; + }; + } else if (firstSeparator == '.') { + // if separatorCount > 1, must be a thousand separator, hence DOT_COMMA (covered above). + // if index of separator > 3, must be a decimal point without a thousand separator, hence + // NO_DOT. + // if 3 digits following then could either, hence DOT_UNKNOWN. + // Otherwise, must be decimal point, hence UNKNOWN_DOT. + format = + lastSeparatorIdx - idx > 3 + ? NO_DOT + : (lastSeparatorIdx != endIdx - 4 + ? UNKNOWN_DOT + : (decimal == ',' ? DOT_COMMA : DOT_UNKNOWN)); + } else if (firstSeparator == ',') { + // if separatorCount > 1, must be a thousand separator, hence COMMA_DOT (covered above). + // if index of separator > 3, must be a decimal point without a thousand separator, hence + // NO_COMMA. + // if 3 digits following then could either, hence COMMA_UNKNOWN. 
+ // Otherwise, must be decimal point, hence UNKNOWN_COMMA. + format = + lastSeparatorIdx - idx > 3 + ? NO_COMMA + : (lastSeparatorIdx != endIdx - 4 + ? UNKNOWN_COMMA + : (decimal == '.' ? COMMA_DOT : COMMA_UNKNOWN)); + } + if (format == null) { + return new NumberParseFailure("No matching number format."); + } + + // Validate that the new format matches. + if (this.mightBeEuropean()) { + if (this == DOT_UNKNOWN && format.decimal != '.' && format.thousands != '.') { + return new NumberParseFailure("Invalid format matched."); + } else if (this == COMMA_UNKNOWN && format.decimal != ',' && format.thousands != ',') { + return new NumberParseFailure("Invalid format matched."); + } + } else if ((thousands != Constants.UNKNOWN && format.thousands != thousands) + || (decimal != Constants.UNKNOWN && format.decimal != decimal)) { + return new NumberParseFailure("Invalid format matched."); + } + + var result = + format.parseFixedDecimal(value, idx, endIdx, firstSeparator, secondSeparator, exponential); + return (result instanceof NumberParseFailure) + ? result + : new NumberParseResultWithFormat(format, result); + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/parser/Separators.java b/std-bits/base/src/main/java/org/enso/base/parser/Separators.java new file mode 100644 index 0000000000..51045ead3b --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/parser/Separators.java @@ -0,0 +1,239 @@ +package org.enso.base.parser; + +import static org.enso.base.parser.NumberWithSeparators.isDigit; + +import java.nio.CharBuffer; + +/** + * Record to hold information about the separators found in a number. + * + * @param first - the first encountered separator or Constants.NONE if none found. + * @param second - the second distinct separator or Constants.NONE if none found. + * @param count - the number of separators found. + * @param endIdx - the index of the last character in the number. + * @param lastSeparatorIdx - the index of the last separator found. 
+ * @param exponential - whether the number is in exponential notation. + */ +public record Separators( + char first, char second, int count, int endIdx, int lastSeparatorIdx, boolean exponential) { + /** + * Strip out the specified separators and replace with just full stop for decimal. If any + * character other than a digit, thousands or decimal separator is encountered then return null. + * If multiple decimal separators are encountered then return null. + */ + static CharSequence strip( + CharSequence value, int startIdx, int endIdx, char thousands, char decimal) { + int lastThousand = -1; + boolean foundDecimal = false; + char[] results = new char[endIdx - startIdx]; + int resultIdx = 0; + for (int i = startIdx; i < endIdx; i++) { + char c = value.charAt(i); + if (c == decimal) { + if (foundDecimal) { + return null; + } + if (lastThousand != -1 && i != lastThousand + 4) { + return null; + } + results[resultIdx++] = '.'; + foundDecimal = true; + } else if (isDigit(c)) { + results[resultIdx++] = c; + } else if (c == thousands) { + // Cannot have thousands post decimal separator. + if (foundDecimal) { + return null; + } + + // Must be 4 away from last thousand separator. + if (lastThousand != -1) { + if (i != lastThousand + 4) { + return null; + } + } + + lastThousand = i; + } else { + return null; + } + } + + if (!foundDecimal && lastThousand != -1 && endIdx != lastThousand + 4) { + return null; + } + + return CharBuffer.wrap(results, 0, resultIdx); + } + + /** Check if the character is a separator. */ + static boolean isSeparator(char c) { + return c == '.' || c == ',' || c == ' ' || c == '\'' || c == '_'; + } + + /** Check if the character is a decimal separator. */ + private static boolean isDecimalSeparator(char c) { + return c == '.' || c == ','; + } + + /** Check if the character is part of the current number. 
*/ + private static boolean validChar(ExponentState exponentState, char c, char first, char second) { + if (isDigit(c)) { + return true; + } + + // If scientific notation is allowed then check for 'e' or 'E'. + // Can then be followed by a +/- sign. + if (exponentState == ExponentState.START && (c == 'e' || c == 'E')) { + return true; + } + + // Sign can only be encountered after an E/e in scientific notation. + if (exponentState == ExponentState.E_SIGN && (c == '+' || c == '-')) { + return true; + } + + // Separators not valid in scientific notation if not in start. + if (exponentState != ExponentState.START && exponentState != ExponentState.NOT_ALLOWED) { + return false; + } + + // We haven't encountered a separator yet, so valid if it is a separator. + if (first == NumberWithSeparators.Constants.NONE) { + return isSeparator(c); + } + + // We have encountered the first separator, so valid if it is the same as + // the first or a decimal separator. + if (second == NumberWithSeparators.Constants.NONE) { + return c == first || isDecimalSeparator(c); + } + + // We have encountered the second separator, so invalid to encounter another + // separator. + return false; + } + + /** + * Find the number and separators section. Validate the spacing of separators. Return the + * separators found or null if invalid. + * + * @param value the value to parse. + * @param idx the index to start parsing from. + * @param integer if the number is an integer. + * @param allowExponentialNotation is exponential notation allowed. + */ + static Separators parse( + CharSequence value, int idx, boolean integer, boolean allowExponentialNotation) { + int endIdx = idx; + char firstSeparator = NumberWithSeparators.Constants.NONE; + char secondSeparator = NumberWithSeparators.Constants.NONE; + + boolean firstWasSeparator = false; + int lastSeparator = -1; + int separatorCount = 0; + + // Set initial state for exponential notation. 
+ ExponentState exponentState = + !integer && allowExponentialNotation ? ExponentState.START : ExponentState.NOT_ALLOWED; + + // Scan the text, find and validate spacing of separators. + // Space and ' are both valid thousands separators, but can't be second separator. + for (endIdx = idx; endIdx < value.length(); endIdx++) { + char c = value.charAt(endIdx); + if (!validChar(exponentState, c, firstSeparator, secondSeparator)) { + break; + } + + // Cope with digits or scientific notation. + if (isDigit(c) || c == 'e' || c == 'E' || c == '+' || c == '-') { + // Update Exponent State. + if (c == 'e' || c == 'E') { + exponentState = ExponentState.E_SIGN; + } else if (c == '+' || c == '-') { + exponentState = ExponentState.SIGN; + } else if (exponentState == ExponentState.SIGN || exponentState == ExponentState.E_SIGN) { + exponentState = ExponentState.EXPONENT; + } + + continue; + } + + // If first digit is a separator then only valid if a decimal separator. + if (endIdx == idx) { + if (integer || !isDecimalSeparator(c)) { + return null; + } + firstWasSeparator = true; + } + + if (firstSeparator == NumberWithSeparators.Constants.NONE) { + // Found the first separator. + firstSeparator = c; + } else { + // TODO: This check is probably now redundant as strip does it as well. + // Encountered another separator - must be 4 away from last separator. + if (endIdx != lastSeparator + 4) { + // Special case if last was a space as could be separating symbol. + if (c == ' ') { + break; + } + return null; + } + + // Must have been a decimal separator. + if (firstWasSeparator) { + return null; + } + + // Encountered a second separator, only valid if !integer. + if (firstSeparator != c) { + if (!integer) { + secondSeparator = c; + } else { + return null; + } + } + } + + lastSeparator = endIdx; + separatorCount++; + } + + // Special case when firstSeparator is a space and no secondSeparator and ending with a space. 
+ if (firstSeparator == ' ' && value.charAt(endIdx - 1) == ' ') { + separatorCount--; + endIdx--; + lastSeparator -= 4; + if (separatorCount == 0) { + firstSeparator = NumberWithSeparators.Constants.NONE; + } + } + + // If in integer mode then must be a thousand separator, validate final spacing. + if (integer && separatorCount > 0 && lastSeparator != endIdx - 4) { + return null; + } + + return new Separators( + firstSeparator, + secondSeparator, + separatorCount, + endIdx, + lastSeparator, + exponentState == ExponentState.EXPONENT); + } + + private enum ExponentState { + /** Scientific notation not allowed. */ + NOT_ALLOWED, + /** Have not encountered an E/e yet. */ + START, + /** Have encountered an E/e. */ + E_SIGN, + /** Have encountered an E/e and a sign. */ + SIGN, + /** Have encountered an E/e, a sign and a digit. */ + EXPONENT + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/NumberParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/NumberParser.java index 3496bddae1..8ec015d93d 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/NumberParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/NumberParser.java @@ -1,11 +1,8 @@ package org.enso.table.parsing; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.regex.Pattern; +import org.enso.base.parser.FormatDetectingNumberParser; +import org.enso.base.parser.NegativeSign; +import org.enso.base.parser.NumberWithSeparators; import org.enso.table.data.column.builder.Builder; import org.enso.table.data.column.builder.NumericBuilder; import org.enso.table.data.column.storage.Storage; @@ -15,178 +12,11 @@ import org.enso.table.parsing.problems.ParseProblemAggregator; import org.enso.table.problems.ProblemAggregator; import org.graalvm.polyglot.Context; -/** - * A parser for numbers. - * - *

This parser will attempt to work out what the decimal point and thousand separators used in - * the input. It will try various ways of formatting a number and can be set to allow for scientific - * notation, currency symbols. - * - *

If parsing a column it will select the format that parses the longest set without an issue - * from the top and then apply this format to all the rows. - * - *

The separators will be tried in British, German, French and Swiss order. - Thousand separator - * must be followed by groups of 3 numbers. - Scientific notation is only allowed on decimals and - * must be on a value between -10 and 10. The notation is an `E` followed by an integer. - * - *

The following formats are supported: - Sign (+/-) followed by Number (e.g. +1,234.56) - Using - * brackets to indicate a negative number (e.g. (1,234.56)) - Currency symbols (if enabled) can be - * placed before or after the sign and number. - If using brackets, the currency symbol must be - * placed after the opening bracket. - */ public class NumberParser extends IncrementalDatatypeParser { - private static final String SIGN = "(?[-+])?"; - private static final String BRACKETS = "(?\\((?=.*\\)\\s*$))?\\s*"; - private static final String BRACKET_CLOSE = "\\)?"; - private static final String CCY = "(?[^0-9(),. '+-]+)"; - private static final String EXP = "(?[eE][+-]?\\d+)?"; - private static final String SPACE = "\\s*"; - - private record Separators(String thousand, String decimal) {} - - private final Separators[] SEPARATORS; - - private static final Map PATTERNS = new HashMap<>(); - private final IntegerType integerTargetType; - - private static void validateSeparator(String name, String value) { - if (value == null) return; - - if (value.length() != 1) { - throw new IllegalArgumentException( - name + " must be a single character, but it was '" + value + "'."); - } - - // If we allowed separators to be a digit, super crazy stuff could happen - e.g. technically - // 10000 could be interpreted as 1000 by interpreting the first 0 as a thousand separator. Let's - // not do that. - if (Character.isDigit(value.charAt(0))) { - throw new IllegalArgumentException(name + " cannot be a digit, but it was '" + value + "'."); - } - } - - /** - * Builds a list of possible separator pairs. - * - *

If one of the parameters is null, it is meant to be inferred (multiple separator pairs will - * be provided for it), if it is set to a concrete value, it will be fixed. - */ - private static Separators[] buildSeparators( - boolean allowDecimal, String decimalPoint, String thousandSeparator) { - validateSeparator("Decimal point", decimalPoint); - validateSeparator("Thousand separator", thousandSeparator); - if (decimalPoint != null && decimalPoint.equals(thousandSeparator)) { - throw new IllegalArgumentException( - "Decimal point and thousand separator cannot be the same, but they were both '" - + decimalPoint - + "'."); - } - - boolean fullAutomaticMode = allowDecimal && decimalPoint == null && thousandSeparator == null; - if (fullAutomaticMode) { - return new Separators[] { - new Separators(",", "."), - new Separators(".", ","), - new Separators(" ", ","), - new Separators("'", ","), - }; - } - - List thousandSeparators; - if (thousandSeparator == null) { - List autoThousandSeparators = List.of(",", ".", "'", " "); - thousandSeparators = - autoThousandSeparators.stream().filter(sep -> !sep.equals(decimalPoint)).toList(); - } else { - thousandSeparators = List.of(thousandSeparator); - } - - List decimalPoints; - if (decimalPoint == null) { - if (allowDecimal) { - List autoDecimalPoints = List.of(",", "."); - assert thousandSeparator != null; - decimalPoints = - autoDecimalPoints.stream().filter(sep -> !sep.equals(thousandSeparator)).toList(); - } else { - // List.of(null) is not permitted... - decimalPoints = new ArrayList<>(); - decimalPoints.add(null); - } - } else { - decimalPoints = List.of(decimalPoint); - } - - return thousandSeparators.stream() - .flatMap( - thousand -> decimalPoints.stream().map(decimal -> new Separators(thousand, decimal))) - .toArray(Separators[]::new); - } - - /** The number of patterns that are allowed for non-currency numbers. 
*/ - private static final int ALLOWED_NON_CCY_PATTERNS = 2; - - /** The number of patterns that are allowed for currency numbers. */ - private static final int ALLOWED_CCY_PATTERNS = 6; - - private static Pattern buildPattern( - boolean allowDecimal, - boolean allowCurrency, - boolean allowScientific, - boolean trimValues, - int patternIndex, - Separators separators) { - if (allowScientific && !allowDecimal) { - throw new IllegalArgumentException("Scientific notation requires decimal numbers."); - } - - if (patternIndex >= (allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS)) { - return null; - } - - String INTEGER = - "(?(\\d*)" - + (separators.thousand == null - ? "" - : "|(\\d{1,3}([" + separators.thousand + "]\\d{3})*)") - + ")"; - - String decimalPoint = allowDecimal ? Objects.requireNonNull(separators.decimal) : null; - var NUMBER = - INTEGER - + (allowDecimal ? "(?[" + decimalPoint + "]\\d*)?" : "") - + (allowScientific ? EXP : ""); - - var pattern = - switch (patternIndex) { - case 0 -> SIGN + NUMBER; - case 1 -> BRACKETS + NUMBER + BRACKET_CLOSE; - case 2 -> SIGN + CCY + SPACE + NUMBER; - case 3 -> CCY + SPACE + SIGN + NUMBER; - case 4 -> SIGN + NUMBER + CCY; - case 5 -> BRACKETS + CCY + SPACE + NUMBER + BRACKET_CLOSE; - default -> throw new IllegalArgumentException("Invalid pattern index: " + patternIndex); - }; - - if (trimValues) { - pattern = SPACE + pattern + SPACE; - } - - return PATTERNS.computeIfAbsent("^" + pattern + "$", Pattern::compile); - } - - private final boolean allowDecimal; - private final boolean allowCurrency; - private final boolean allowLeadingZeros; - private final boolean allowScientific; - private final boolean trimValues; - /** * Creates a new integer instance of this parser. 
* * @param integerTargetType the target type describing how large integer values can be accepted - * @param allowCurrency whether to allow currency symbols - * @param allowLeadingZeros whether to allow leading zeros * @param trimValues whether to trim the input values * @param decimalPoint the decimal point set for the current format, or null if not specified; * this parser does not use decimal point (since it is for integers) but it ensure that if a @@ -196,16 +26,16 @@ public class NumberParser extends IncrementalDatatypeParser { */ public static NumberParser createIntegerParser( IntegerType integerTargetType, - boolean allowCurrency, - boolean allowLeadingZeros, + boolean allowSymbol, + boolean allowLeadingZeroes, boolean trimValues, String decimalPoint, String thousandSeparator) { + assert integerTargetType != null; return new NumberParser( - false, integerTargetType, - allowCurrency, - allowLeadingZeros, + allowSymbol, + allowLeadingZeroes, trimValues, false, decimalPoint, @@ -215,240 +45,125 @@ public class NumberParser extends IncrementalDatatypeParser { /** * Creates a new decimal instance of this parser. 
* - * @param allowCurrency whether to allow currency symbols - * @param allowLeadingZeros whether to allow leading zeros + * @param allowSymbol whether to allow symbols in the input + * @param allowLeadingZeroes whether to allow leading zeroes in the input * @param trimValues whether to trim the input values - * @param allowScientific whether to allow scientific notation - * @param decimalPoint the decimal separator to use (if null, then will be inferred) - * @param thousandSeparator the thousand separator to use (if null, then will be inferred) + * @param allowExponentialNotation whether to allow exponential notation in the input + * @param decimalPoint the decimal point set for the current format (if null then will be + * inferred) + * @param thousandSeparator the thousand separator to use (if null then will be inferred) */ public static NumberParser createDecimalParser( - boolean allowCurrency, - boolean allowLeadingZeros, + boolean allowSymbol, + boolean allowLeadingZeroes, boolean trimValues, - boolean allowScientific, + boolean allowExponentialNotation, String decimalPoint, String thousandSeparator) { return new NumberParser( - true, null, - allowCurrency, - allowLeadingZeros, + allowSymbol, + allowLeadingZeroes, trimValues, - allowScientific, + allowExponentialNotation, decimalPoint, thousandSeparator); } + private final IntegerType integerTargetType; + + private final FormatDetectingNumberParser parser; + private NumberParser( - boolean allowDecimal, IntegerType integerTargetType, - boolean allowCurrency, - boolean allowLeadingZeros, - boolean trimValues, - boolean allowScientific, + boolean allowSymbol, + boolean allowLeadingZeroes, + boolean allowLeadingTrailingWhitespace, + boolean allowExponentialNotation, String decimalPoint, String thousandSeparator) { - this.allowDecimal = allowDecimal; this.integerTargetType = integerTargetType; - this.allowCurrency = allowCurrency; - this.allowLeadingZeros = allowLeadingZeros; - this.trimValues = trimValues; - 
this.allowScientific = allowScientific; - SEPARATORS = buildSeparators(allowDecimal, decimalPoint, thousandSeparator); + + var numberWithSeparators = NumberWithSeparators.fromSeparators(thousandSeparator, decimalPoint); + this.parser = + new FormatDetectingNumberParser( + allowSymbol, + allowLeadingZeroes, + allowLeadingTrailingWhitespace, + allowExponentialNotation, + NegativeSign.UNKNOWN, + numberWithSeparators); } - /** - * Creates a Pattern for the given index. The index will be decoded into a specific set of - * separators (unless fixed separators are used) and then paired with one of the valid patterns - * for the given parser. - */ - private Pattern patternForIndex(int index) { - int allowedSet = (allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS); - int separatorsIndex = index / allowedSet; - int patternIndex = index % allowedSet; - - if (separatorsIndex >= SEPARATORS.length) { - return null; - } - - return buildPattern( - allowDecimal, - allowCurrency, - allowScientific, - trimValues, - patternIndex, - SEPARATORS[separatorsIndex]); + private boolean isInteger() { + return integerTargetType != null; } @Override - public Object parseSingleValue(String text, ParseProblemAggregator problemAggregator) { - int index = 0; - var pattern = patternForIndex(index); - while (pattern != null) { - var value = innerParseSingleValue(text, pattern); - if (value != null) { - return value; - } - - index++; - pattern = patternForIndex(index); - } - - problemAggregator.reportInvalidFormat(text); - return null; + protected Builder makeBuilderWithCapacity(int capacity, ProblemAggregator problemAggregator) { + return isInteger() + ? 
NumericBuilder.createLongBuilder(capacity, integerTargetType, problemAggregator) + : NumericBuilder.createDoubleBuilder(capacity, problemAggregator); } @Override public Storage parseColumn( Storage sourceStorage, CommonParseProblemAggregator problemAggregator) { - int index = 0; - var pattern = patternForIndex(index); + Builder builder = + makeBuilderWithCapacity(sourceStorage.size(), problemAggregator.createSimpleChild()); - int bestIndex = 0; - int bestCount = -1; - while (pattern != null) { - ProblemAggregator inner = problemAggregator.createSimpleChild(); - Builder builder = makeBuilderWithCapacity(sourceStorage.size(), inner); - int failedAt = parseColumnWithPattern(pattern, sourceStorage, builder, null); - if (failedAt == -1) { - return builder.seal(); - } - - // If there was a failure, we abandon this branch - thus we discard any problems that might - // have been reported by the inner aggregator. - inner.detachFromParent(); - - if (failedAt > bestCount) { - bestCount = failedAt; - bestIndex = index; - } - - index++; - pattern = patternForIndex(index); - } - - CommonParseProblemAggregator aggregator = problemAggregator.createContextAwareChild(); - Builder fallback = makeBuilderWithCapacity(sourceStorage.size(), aggregator); - parseColumnWithPattern(patternForIndex(bestIndex), sourceStorage, fallback, aggregator); - return fallback.seal(); - } - - private int parseColumnWithPattern( - Pattern pattern, - Storage sourceStorage, - Builder builder, - ParseProblemAggregator aggregator) { - Context context = Context.getCurrent(); + var context = Context.getCurrent(); for (int i = 0; i < sourceStorage.size(); i++) { var text = sourceStorage.getItemBoxed(i); - if (text == null) { + + // Check if in unknown state + var mightBeEuropean = !isInteger() && parser.numberWithSeparators().mightBeEuropean(); + + // Try and parse the value + var result = text == null ? null : parseSingleValue(text, problemAggregator); + + // Do we need to rescan? 
+ if (mightBeEuropean && parser.numberWithSeparators() != NumberWithSeparators.DOT_COMMA) { + builder = + makeBuilderWithCapacity(sourceStorage.size(), problemAggregator.createSimpleChild()); + for (int j = 0; j < i; j++) { + var subText = sourceStorage.getItemBoxed(j); + var subResult = subText == null ? null : parseSingleValue(subText, problemAggregator); + if (subResult == null) { + builder.appendNulls(1); + } else { + builder.append(subResult); + } + } + } + + // Append the result + if (result == null) { builder.appendNulls(1); } else { - var value = innerParseSingleValue(text, pattern); - if (value != null) { - builder.appendNoGrow(value); - } else { - if (aggregator == null) { - return i; - } - - aggregator.reportInvalidFormat(text); - builder.appendNulls(1); - } + builder.append(result); } context.safepoint(); } - return -1; + + return builder.seal(); } @Override - protected Builder makeBuilderWithCapacity(int capacity, ProblemAggregator problemAggregator) { - return allowDecimal - ? NumericBuilder.createDoubleBuilder(capacity, problemAggregator) - : NumericBuilder.createLongBuilder(capacity, integerTargetType, problemAggregator); - } + public Object parseSingleValue(String text, ParseProblemAggregator problemAggregator) { + var result = parser.parse(text, isInteger()); - private Object innerParseSingleValue(String text, Pattern pattern) { - if (allowDecimal) { - var trimmed = trimValues ? text.trim() : text; - if (trimmed.equals("NaN")) { - return Double.NaN; - } - if (trimmed.equals("Infinity")) { - return Double.POSITIVE_INFINITY; - } - if (trimmed.equals("-Infinity")) { - return Double.NEGATIVE_INFINITY; - } - } - - var parsed = pattern.matcher(text); - if (!parsed.matches()) { + // TODO: Capture the message into the problem aggregator. 
+ if (result instanceof FormatDetectingNumberParser.NumberParseFailure) { + problemAggregator.reportInvalidFormat(text); return null; } - try { - var sign = parsed.group("sign"); - long sign_value = sign != null && !sign.equals("+") ? -1 : 1; - - var integer = parsed.group("integer").replaceAll("\\D", ""); - - if (!allowLeadingZeros && integer.length() > 1 && integer.charAt(0) == '0') { - return null; - } - - if (allowDecimal) { - String decimal = parsed.group("decimal"); - String decimalPrepared = decimal == null ? "" : ("." + decimal.substring(1)); - - if (integer.equals("") && decimalPrepared.equals("")) { - return null; - } - - integer = integer.equals("") ? "0" : integer; - - String exp = allowScientific ? parsed.group("exp") : null; - if (exp != null) { - if (integer.length() > 1) { - return null; - } - decimalPrepared = decimalPrepared + exp; - } - - // If there is no decimal part, we parse as integer, as this will allow us more specialized - // handling. - // For example, we can get the exact value instead of a rounded one for big values. We can - // then round - // later, but first handle any warnings. - if (decimalPrepared.equals("")) { - long integer_part = Long.parseLong(integer); - - // Special handling for values like `-0` - if we treat them as integers, they will lose - // the `-` sign. 
- if (integer_part == 0 && sign_value < 0) { - return -0.0; - } - - return sign_value * integer_part; - } - - return sign_value * Double.parseDouble(integer + decimalPrepared); - } - - if (integer.equals("")) { - return null; - } - - long integer_value = sign_value * Long.parseLong(integer); - if (integerTargetType.fits(integer_value)) { - return integer_value; - } else { - return null; - } - } catch (NumberFormatException e) { - throw new IllegalStateException("Java parse failed to parse number: " + text, e); - } + return switch (result) { + case FormatDetectingNumberParser.NumberParseDouble doubleResult -> doubleResult.number(); + case FormatDetectingNumberParser.NumberParseLong longResult -> longResult.number(); + default -> throw new IllegalStateException("Unexpected result type: " + result.getClass()); + }; } } diff --git a/test/Table_Tests/src/Formatting/Parse_Values_Spec.enso b/test/Table_Tests/src/Formatting/Parse_Values_Spec.enso index b10d5fe479..a7f0669b17 100644 --- a/test/Table_Tests/src/Formatting/Parse_Values_Spec.enso +++ b/test/Table_Tests/src/Formatting/Parse_Values_Spec.enso @@ -420,9 +420,9 @@ add_specs suite_builder = pUS3.to_vector . should_equal [1, -123, Nothing, 1234567, Nothing] Problems.expect_warning (Invalid_Format.Error "ints" Value_Type.Integer 2 ["-1,234", "12,34,56"]) pUS3 - cUS4 = Column.from_vector "ints" ["$1234", "$1,234", "$1,234,567","-$1,234", "($1,234,567)"] + cUS4 = Column.from_vector "ints" ["$234", "$1,234", "$1,234,567","-$1,234", "($1,234,567)"] pUS4 = cUS4.parse type=Value_Type.Integer - pUS4.to_vector . should_equal [1234, 1234, 1234567, -1234, Nothing] + pUS4.to_vector . should_equal [234, 1234, 1234567, -1234, Nothing] Problems.expect_warning (Invalid_Format.Error "ints" Value_Type.Integer 1 ["($1,234,567)"]) pUS4 ## Reject bracket notation for negative numbers if already seen a minus sign