New NumberParser for Table parsing (#11499)

Replaces the regex-based number parser with a new parser that arrives at the same result by working out each part of the format as it sees an example of it.

Close #7398 - performance of reading the large CSV now about 2s (down from 15-20s).
This commit is contained in:
James Dunkerley 2024-11-13 19:08:23 +00:00 committed by GitHub
parent fb50a8f24f
commit 6b544650b3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 1197 additions and 390 deletions

View File

@ -249,26 +249,12 @@ type Data_Formatter
get_specific_type_parsers self =
## Have to do date parsing first to allow for pure numeric formats.
date_parsers = [self.make_date_time_parser, self.make_date_parser, self.make_time_of_day_parser]
try_us_parsers_first = self.decimal_point == Auto && self.thousand_separator != "."
preferred_auto_parsers = case try_us_parsers_first of
## If we are in auto mode, we will first try parsing US integers,
then US floats and only then other integers and floats.
Under normal circumstances, we first try integers and later
floats - but this would cause `1.000` to be interpreted as `1000`
because _all_ integers take precedence and floats are considered
later. But we want `1.000` to be interpreted as a `1.0` float by
default, so we change the ordering a bit.
True ->
us_preferred = self.with_number_formatting decimal_point='.'
[us_preferred.make_integer_parser auto_mode=True, us_preferred.make_decimal_parser auto_mode=True]
## However, if the `decimal_point` is set to something else,
we don't do auto inference, so this extra logic is not needed.
False -> []
us_parsers = if self.decimal_point != Auto || self.thousand_separator != "" then [] else
## In default mode, add the English pattern to ensure they dominate.
english = self.with_number_formatting decimal_point='.' thousand_separator=','
[english.make_integer_parser auto_mode=True, english.make_decimal_parser auto_mode=True]
remaining_parsers = [self.make_integer_parser auto_mode=True, self.make_decimal_parser auto_mode=True, self.make_boolean_parser]
parsers = date_parsers + preferred_auto_parsers + remaining_parsers
parsers = date_parsers + us_parsers + remaining_parsers
## Unfortunately, the [] literal allows to create a vector containing
dataflow errors. That is not handled well later by Polyglot. So we
ensure all errors surface here.

View File

@ -0,0 +1,324 @@
package org.enso.base.parser;
/**
* Parse a String into a Number. It supports the following patterns:
*
* <ul>
* <li>SIGN + NUMBER
* <li>SYMBOL + SIGN + NUMBER
* <li>SIGN + NUMBER + SYMBOL
* <li>SIGN + BRACKETS + NUMBER + BRACKET_CLOSE
* <li>BRACKETS + SYMBOL + NUMBER + BRACKET_CLOSE
* <li>BRACKETS + NUMBER + SYMBOL + BRACKET_CLOSE
* </ul>
*/
public class FormatDetectingNumberParser {
/** Marker type for every outcome of a parse: a successful number or a {@link NumberParseFailure}. */
public interface NumberParseResult {}
/** A successfully parsed number that can be negated or tagged with a symbol. */
public interface NumberParseResultSuccess extends NumberParseResult {
/** Returns a result holding the negated number. */
NumberParseResultSuccess negate();
/** Returns a result holding the same number with the given symbol attached. */
NumberParseResultSuccess withSymbol(String symbol);
}
/**
* A successfully parsed integer value.
*
* @param number the parsed value.
* @param symbol the symbol attached to the value.
* @param negated set to true by {@link #negate()}; {@code parse} reads it to detect a negated zero.
*/
public record NumberParseLong(long number, String symbol, boolean negated)
implements NumberParseResultSuccess {
@Override
public NumberParseResultSuccess negate() {
// Always flags the result as negated, so that 0 negated can still be told apart from 0.
return new NumberParseLong(-number, symbol, true);
}
@Override
public NumberParseResultSuccess withSymbol(String symbol) {
return new NumberParseLong(number, symbol, negated);
}
}
/**
* A successfully parsed floating point value.
*
* @param number the parsed value.
* @param symbol the symbol attached to the value.
*/
public record NumberParseDouble(double number, String symbol)
implements NumberParseResultSuccess {
@Override
public NumberParseResultSuccess negate() {
return new NumberParseDouble(-number, symbol);
}
@Override
public NumberParseResultSuccess withSymbol(String symbol) {
return new NumberParseDouble(number, symbol);
}
}
/**
* An unsuccessful parse.
*
* @param message a description of why the value was rejected.
*/
public record NumberParseFailure(String message) implements NumberParseResult {}
// Fixed configuration flags, checked by parse.
private final boolean allowSymbol;
private final boolean allowLeadingZeroes;
private final boolean allowLeadingTrailingWhitespace;
private final boolean allowExponentialNotation;
// Detected formats; both are mutable because parse replaces them as it narrows the format down.
private NegativeSign negativeSign;
private NumberWithSeparators numberWithSeparators;
/**
* Creates a parser with the given options and initial formats.
*
* @param allowSymbol whether a symbol section is accepted next to the number.
* @param allowLeadingZeroes whether numbers with leading zeroes are accepted.
* @param allowLeadingTrailingWhitespace whether leading and trailing whitespace is accepted.
* @param allowExponentialNotation whether exponential notation is passed on to the number parser.
* @param negativeSign the initial negative sign format.
* @param numberWithSeparators the initial separator format.
*/
public FormatDetectingNumberParser(
boolean allowSymbol,
boolean allowLeadingZeroes,
boolean allowLeadingTrailingWhitespace,
boolean allowExponentialNotation,
NegativeSign negativeSign,
NumberWithSeparators numberWithSeparators) {
this.allowSymbol = allowSymbol;
this.allowLeadingZeroes = allowLeadingZeroes;
this.allowLeadingTrailingWhitespace = allowLeadingTrailingWhitespace;
this.allowExponentialNotation = allowExponentialNotation;
this.negativeSign = negativeSign;
this.numberWithSeparators = numberWithSeparators;
}
/** Returns the negative sign format currently held by this parser. */
public NegativeSign negativeSign() {
return negativeSign;
}
/**
 * Sets the negative sign format, which is only permitted while it is still {@code UNKNOWN}.
 *
 * @param newNegativeSign the format to use from now on.
 * @throws IllegalStateException if a format other than {@code UNKNOWN} is already held.
 */
public void setNegativeSign(NegativeSign newNegativeSign) {
  if (negativeSign == NegativeSign.UNKNOWN) {
    negativeSign = newNegativeSign;
    return;
  }
  throw new IllegalStateException("Negative Sign Already Set.");
}
/** Returns the separator format currently held by this parser. */
public NumberWithSeparators numberWithSeparators() {
return numberWithSeparators;
}
/**
* Parse a string into a number.
*
* <p>Scans the value left to right, dispatching on each character: whitespace, the number
* section, an opening sign, a closing bracket, the literals Infinity / NaN, or a symbol section.
* As a side effect the parser's {@code numberWithSeparators} and {@code negativeSign} fields are
* replaced when the scanned value narrows the format down.
*
* @param value the string to parse.
* @param integer whether to parse a Long or a Double.
* @return the parsed number, or a failure if the parse was unsuccessful.
*/
public NumberParseResult parse(CharSequence value, boolean integer) {
// Ensure that if we are allowing exponential notation, we are not parsing an integer.
assert !(allowExponentialNotation && integer);
// State
boolean lastWasWhitespace = false;
boolean encounteredContent = false;
boolean encounteredSign = false;
boolean needsNegating = false;
NumberParseResultSuccess number = null;
String symbol = "";
// Scan the value
int idx = 0;
int length = value.length();
while (idx < length) {
char c = value.charAt(idx);
if (Character.isWhitespace(c)) {
if (!allowLeadingTrailingWhitespace && !encounteredContent) {
return new NumberParseFailure("Unexpected leading Whitespace.");
}
// A sign must be directly attached to what follows it.
if (idx > 0 && (value.charAt(idx - 1) == '-' || value.charAt(idx - 1) == '+')) {
return new NumberParseFailure("Unexpected whitespace after sign.");
}
idx++;
lastWasWhitespace = true;
} else {
encounteredContent = true;
lastWasWhitespace = false;
if (NumberWithSeparators.isDigit(c) || Separators.isSeparator(c)) {
// Start of the number section; only one is allowed per value.
if (number != null) {
return new NumberParseFailure("Multiple Number Sections.");
}
var numberPart =
numberWithSeparators.parse(value, idx, integer, allowExponentialNotation);
// If the format changed, catch new format and unwrap result.
if (numberPart instanceof NumberWithSeparators.NumberParseResultWithFormat newFormat) {
numberWithSeparators = newFormat.format();
numberPart = newFormat.result();
}
// Result should either be a new index or a failure.
// If it is a new index, update the index and unwrap the result.
if (numberPart instanceof NumberWithSeparators.NumberParseResultWithIndex newIndex) {
// Check for leading zeroes (0 or 0<decimal> is acceptable).
if (!allowLeadingZeroes
&& c == '0'
&& newIndex.endIdx() > idx + 1
&& value.charAt(idx + 1) != numberWithSeparators.getDecimal()) {
return new NumberParseFailure("Leading Zero.");
}
idx = newIndex.endIdx();
numberPart = newIndex.result();
}
if (numberPart instanceof NumberParseResultSuccess numberSuccess) {
number = numberSuccess;
} else {
// Anything that is not a success is handed back to the caller unchanged.
return numberPart;
}
} else if (NegativeSign.isOpenSign(c)) {
// A sign is only valid once and only before the number.
if (encounteredSign || number != null) {
return new NumberParseFailure("Unexpected sign character.");
}
var signOk = negativeSign.checkValid(c);
if (signOk.isEmpty()) {
return new NumberParseFailure("Inconsistent negative format.");
}
negativeSign = signOk.get();
encounteredSign = true;
// '+' is tracked as a sign but does not negate.
needsNegating = c != '+';
idx++;
} else if (c == ')') {
if (!needsNegating || negativeSign != NegativeSign.BRACKET_OPEN || number == null) {
return new NumberParseFailure("Unexpected bracket close.");
}
// Should only be whitespace left.
idx++;
while (idx < length) {
if (!Character.isWhitespace(value.charAt(idx))) {
return new NumberParseFailure("Unexpected characters after bracket close.");
}
idx++;
lastWasWhitespace = true;
}
// Negate here so can tell finished.
number = number.negate();
needsNegating = false;
} else if (!integer
&& number == null
&& isSameSequence(value, idx, "infinity", "INFINITY")) {
// Identify Infinity
number = new NumberParseDouble(Double.POSITIVE_INFINITY, "");
idx += 8;
} else if (!integer
&& number == null
&& !encounteredSign
&& !needsNegating
&& isSameSequence(value, idx, "nan", "NAN")) {
// Identify NaN
number = new NumberParseDouble(Double.NaN, "");
idx += 3;
} else {
// Any other character starts a symbol section; only one is allowed per value.
if (!symbol.isEmpty()) {
return new NumberParseFailure("Multiple Symbol Sections.");
}
if (!allowSymbol) {
return new NumberParseFailure("Symbols not allowed.");
}
// ToDo: Locking symbol position within text parts.
// The symbol runs until the next digit, separator, sign or whitespace character.
int endIdx = idx;
while (endIdx < length
&& !NumberWithSeparators.isDigit(c)
&& !Separators.isSeparator(c)
&& !NegativeSign.isSign(c)
&& !Character.isWhitespace(c)) {
endIdx++;
if (endIdx < length) {
c = value.charAt(endIdx);
}
}
symbol = value.subSequence(idx, endIdx).toString();
idx = endIdx;
}
}
}
// Check for trailing whitespace.
if (!allowLeadingTrailingWhitespace && lastWasWhitespace) {
return new NumberParseFailure("Trailing Whitespace.");
}
// Special check for unclosed bracket.
if (negativeSign == NegativeSign.BRACKET_OPEN && needsNegating) {
return new NumberParseFailure("Unclosed bracket.");
}
// Fail if no number found.
if (number == null) {
return new NumberParseFailure("No Number Found.");
}
// Return Result
number = needsNegating ? number.negate() : number;
// Handle Special Case of Negated 0 If Not An Integer
if (!integer
&& number instanceof NumberParseLong longNumber
&& longNumber.number() == 0
&& longNumber.negated()) {
// Catch -0 double.
number = new NumberParseDouble(-0.0, longNumber.symbol());
}
return symbol.isEmpty() ? number : number.withSymbol(symbol);
}
/**
 * Parses the value as an integer.
 *
 * @param value the string to parse.
 * @return the parsed value, or {@code null} if the parse did not produce an integer.
 */
public Long parseLong(CharSequence value) {
  return parse(value, true) instanceof NumberParseLong parsed
      ? Long.valueOf(parsed.number())
      : null;
}
/**
 * Parses the value as a floating point number; an integer result is widened to a double.
 *
 * @param value the string to parse.
 * @return the parsed value, or {@code null} if the parse did not produce a number.
 */
public Double parseDouble(CharSequence value) {
  return switch (parse(value, false)) {
    case NumberParseDouble asDouble -> Double.valueOf(asDouble.number());
    case NumberParseLong asLong -> Double.valueOf((double) asLong.number());
    case null, default -> null;
  };
}
/**
 * Parses every value in order, sharing the separator format detected so far between them.
 *
 * <p>While the format is {@code DOT_UNKNOWN} or {@code COMMA_UNKNOWN} the values are parsed with
 * the English reading. If a later value shows that reading was wrong, every result produced so
 * far may be wrong as well, so the scan restarts from the first value with the resolved format.
 *
 * @param values the strings to parse.
 * @param integer whether to parse Longs or Doubles.
 * @return one result per input value, in the same order.
 */
public NumberParseResult[] parseMany(CharSequence[] values, boolean integer) {
  var results = new NumberParseResult[values.length];
  int i = 0;
  while (i < values.length) {
    var previous = numberWithSeparators;
    results[i] = parse(values[i], integer);
    var current = numberWithSeparators;

    // DOT_UNKNOWN reads '.' as the decimal point; only DOT_COMMA makes '.' a thousands separator.
    boolean dotWasThousands =
        previous == NumberWithSeparators.DOT_UNKNOWN && current == NumberWithSeparators.DOT_COMMA;
    // COMMA_UNKNOWN reads ',' as a thousands separator; COMMA_DOT is the only change of format
    // that confirms this, every other one makes ',' the decimal point.
    boolean commaWasDecimal =
        previous == NumberWithSeparators.COMMA_UNKNOWN
            && current != previous
            && current != NumberWithSeparators.COMMA_DOT;

    if (dotWasThousands || commaWasDecimal) {
      // Start scan over, as format was incorrect.
      i = 0;
    } else {
      i++;
    }
  }
  return results;
}
/**
 * Checks whether the sequence, starting at the index, spells out the given word, where each
 * character may match either the lower case or the upper case spelling.
 */
private static boolean isSameSequence(
    CharSequence sequence, int index, CharSequence toMatchLower, CharSequence toMatchUpper) {
  assert toMatchLower.length() == toMatchUpper.length();
  final int expected = toMatchLower.length();
  if (sequence.length() < index + expected) {
    return false;
  }
  int offset = 0;
  while (offset < expected) {
    final char actual = sequence.charAt(index + offset);
    boolean matches =
        actual == toMatchLower.charAt(offset) || actual == toMatchUpper.charAt(offset);
    if (!matches) {
      return false;
    }
    offset++;
  }
  return true;
}
}

View File

@ -0,0 +1,40 @@
package org.enso.base.parser;
import java.util.Optional;
/** The notation used to mark a number as negative, narrowed down as sign characters are seen. */
public enum NegativeSign {
  /** No sign encountered, so could be either. */
  UNKNOWN,
  /** Minus or Plus sign - e.g. +123 or -123. */
  MINUS,
  /** Brackets - e.g. (123) */
  BRACKET_OPEN;

  /**
   * Checks if the given character is a valid negative sign.
   *
   * @param c the character to check
   * @return the new state of the negative sign or Optional.empty if the character is invalid.
   */
  public Optional<NegativeSign> checkValid(char c) {
    final boolean isBracket = c == '(';
    if (this == UNKNOWN) {
      // First sign seen: it decides the notation.
      if (c == '-' || c == '+') {
        return Optional.of(MINUS);
      }
      return isBracket ? Optional.of(BRACKET_OPEN) : Optional.empty();
    }
    if (this == MINUS) {
      // Only an opening bracket contradicts the minus notation.
      return isBracket ? Optional.empty() : Optional.of(this);
    }
    // BRACKET_OPEN: nothing but an opening bracket is consistent.
    return isBracket ? Optional.of(this) : Optional.empty();
  }

  /** Checks if the character can open a signed number. */
  static boolean isOpenSign(char c) {
    return switch (c) {
      case '-', '+', '(' -> true;
      default -> false;
    };
  }

  /** Checks if the character closes a bracketed number. */
  static boolean isCloseSign(char c) {
    return ')' == c;
  }

  /** Checks if the character is any sign character, opening or closing. */
  static boolean isSign(char c) {
    return isCloseSign(c) || isOpenSign(c);
  }
}

View File

@ -0,0 +1,503 @@
package org.enso.base.parser;
import java.util.Optional;
import org.enso.base.parser.FormatDetectingNumberParser.NumberParseDouble;
import org.enso.base.parser.FormatDetectingNumberParser.NumberParseFailure;
import org.enso.base.parser.FormatDetectingNumberParser.NumberParseLong;
import org.enso.base.parser.FormatDetectingNumberParser.NumberParseResult;
/**
* Number parsing with separators. Specifies the universe of number formats that can be parsed. Two
* special cases, where we default to English format over European:
*
* <ul>
* <li>Encounter a single . or , with 3 trailing numbers.
* <li>Could be either DOT_COMMA or COMMA_DOT.
* <li>If a single . then uses DOT_UNKNOWN.
* <li>If a single , then uses COMMA_UNKNOWN.
* </ul>
*/
public enum NumberWithSeparators {
UNKNOWN(Constants.UNKNOWN, Constants.UNKNOWN),
// Special case where we have encountered a . with 3 trailing digits. Such as
// ##0.123 ###.123
DOT_UNKNOWN(Constants.UNKNOWN, '.'),
// Special case where we have encountered a single . within 3 digits from
// start and without 3 digits from end. Such as ##3.1# or ##3.1415...
UNKNOWN_DOT(Constants.UNKNOWN, '.'),
// Special case where we have encountered a , with 3 trailing digits. Such as
// ##0,123 ###,123
COMMA_UNKNOWN(',', Constants.UNKNOWN),
// Special case where we have encountered a single , within 3 digits from
// start and without 3 digits from end. Such as ##3,1# or ##3,1415...
UNKNOWN_COMMA(Constants.UNKNOWN, ','),
NO_UNKNOWN(Constants.NONE, Constants.UNKNOWN),
NO_DOT(Constants.NONE, '.'),
NO_COMMA(Constants.NONE, ','),
// European format (e.g. 1.234,56)
DOT_COMMA('.', ','),
// English format (e.g. 1,234.56)
COMMA_DOT(',', '.'),
SPACE_UNKNOWN(' ', Constants.UNKNOWN),
SPACE_DOT(' ', '.'),
SPACE_COMMA(' ', ','),
SWISS_UNKNOWN('\'', Constants.UNKNOWN),
SWISS_DOT('\'', '.'),
SWISS_COMMA('\'', ','),
UNDERSCORE_UNKNOWN('_', Constants.UNKNOWN),
UNDERSCORE_DOT('_', '.'),
UNDERSCORE_COMMA('_', ',');
/**
 * Looks up the format for the given separators.
 *
 * <p>For each argument {@code null} means the separator is unknown and an empty string means
 * there is none.
 *
 * @param thousand the thousands separator, at most one character.
 * @param decimal the decimal separator, at most one character.
 * @return the matching format.
 * @throws IllegalArgumentException if a separator is longer than one character or no format
 *     matches the combination.
 */
public static NumberWithSeparators fromSeparators(String thousand, String decimal) {
  if (thousand != null && thousand.length() > 1) {
    throw new IllegalArgumentException("Invalid thousand separator (more than one character).");
  }
  if (decimal != null && decimal.length() > 1) {
    throw new IllegalArgumentException("Invalid decimal separator (more than one character).");
  }

  final char thousandsChar;
  if (thousand == null) {
    thousandsChar = Constants.UNKNOWN;
  } else if (thousand.isEmpty()) {
    thousandsChar = Constants.NONE;
  } else {
    thousandsChar = thousand.charAt(0);
  }

  final char decimalChar;
  if (decimal == null) {
    decimalChar = Constants.UNKNOWN;
  } else if (decimal.isEmpty()) {
    decimalChar = Constants.NONE;
  } else {
    decimalChar = decimal.charAt(0);
  }

  // For '.' and ',' as thousands separator the decimal may be left open or be the other one.
  final boolean decimalOpen = decimalChar == Constants.UNKNOWN || decimalChar == Constants.NONE;

  final Optional<NumberWithSeparators> candidate;
  if (thousandsChar == Constants.NONE) {
    candidate = matchForNone(decimalChar);
  } else if (thousandsChar == Constants.UNKNOWN) {
    candidate = matchForUnknown(decimalChar);
  } else if (thousandsChar == ',') {
    candidate = decimalOpen || decimalChar == '.' ? Optional.of(COMMA_DOT) : Optional.empty();
  } else if (thousandsChar == '.') {
    candidate = decimalOpen || decimalChar == ',' ? Optional.of(DOT_COMMA) : Optional.empty();
  } else if (thousandsChar == ' ') {
    candidate = matchForSpace(decimalChar);
  } else if (thousandsChar == '\'') {
    candidate = matchForSwiss(decimalChar);
  } else if (thousandsChar == '_') {
    candidate = matchForUnderscore(decimalChar);
  } else {
    candidate = Optional.empty();
  }

  return candidate.orElseThrow(() -> new IllegalArgumentException("Invalid separators."));
}
/** Finds the format without a thousands separator for the given decimal separator. */
private static Optional<NumberWithSeparators> matchForNone(char decimal) {
  if (decimal == Constants.UNKNOWN) {
    return Optional.of(NO_UNKNOWN);
  }
  if (decimal == '.') {
    return Optional.of(NO_DOT);
  }
  if (decimal == ',') {
    return Optional.of(NO_COMMA);
  }
  return Optional.empty();
}
/** Finds the format with an unknown thousands separator for the given decimal separator. */
private static Optional<NumberWithSeparators> matchForUnknown(char decimal) {
  if (decimal == Constants.UNKNOWN) {
    return Optional.of(UNKNOWN);
  }
  if (decimal == '.') {
    return Optional.of(UNKNOWN_DOT);
  }
  if (decimal == ',') {
    return Optional.of(UNKNOWN_COMMA);
  }
  return Optional.empty();
}
/** Finds the format with a space as thousands separator for the given decimal separator. */
private static Optional<NumberWithSeparators> matchForSpace(char decimal) {
  if (decimal == Constants.UNKNOWN) {
    return Optional.of(SPACE_UNKNOWN);
  }
  if (decimal == '.') {
    return Optional.of(SPACE_DOT);
  }
  if (decimal == ',') {
    return Optional.of(SPACE_COMMA);
  }
  return Optional.empty();
}
/** Finds the format with an apostrophe as thousands separator for the given decimal separator. */
private static Optional<NumberWithSeparators> matchForSwiss(char decimal) {
  if (decimal == Constants.UNKNOWN) {
    return Optional.of(SWISS_UNKNOWN);
  }
  if (decimal == '.') {
    return Optional.of(SWISS_DOT);
  }
  if (decimal == ',') {
    return Optional.of(SWISS_COMMA);
  }
  return Optional.empty();
}
/** Finds the format with an underscore as thousands separator for the given decimal separator. */
private static Optional<NumberWithSeparators> matchForUnderscore(char decimal) {
  if (decimal == Constants.UNKNOWN) {
    return Optional.of(UNDERSCORE_UNKNOWN);
  }
  if (decimal == '.') {
    return Optional.of(UNDERSCORE_DOT);
  }
  if (decimal == ',') {
    return Optional.of(UNDERSCORE_COMMA);
  }
  return Optional.empty();
}
/** Sentinel characters used in place of a real separator character. */
static class Constants {
// No separator: the format has none, or none was found in the value.
static final char NONE = '\0';
// The separator has not been determined yet.
static final char UNKNOWN = '\uFFFD';
}
/** Checks if the character is one of the ASCII digits 0 to 9. */
static boolean isDigit(char c) {
  return '0' <= c && c <= '9';
}
// Separator characters of this format, or one of the Constants sentinels.
private final char thousands;
private final char decimal;
/** Creates a format from its thousands and decimal separator characters. */
NumberWithSeparators(char thousands, char decimal) {
this.thousands = thousands;
this.decimal = decimal;
}
/** Returns the thousands separator of this format, which may be a Constants sentinel. */
public char getThousands() {
return thousands;
}
/** Returns the decimal separator of this format, which may be a Constants sentinel. */
public char getDecimal() {
return decimal;
}
/**
 * Whether this is one of the two ambiguous formats ({@code COMMA_UNKNOWN} or {@code
 * DOT_UNKNOWN}): they are currently treated as English, which could be incorrect as the format
 * might actually be European.
 */
public boolean mightBeEuropean() {
  return switch (this) {
    case COMMA_UNKNOWN, DOT_UNKNOWN -> true;
    default -> false;
  };
}
/**
 * Parses the number section of the value starting at the index.
 *
 * <p>The separators are scanned first. The section is then handed to the fixed format parsers
 * when this format leaves nothing open for the requested kind of number, and to the format
 * resolving parsers otherwise.
 *
 * @param value the value to parse.
 * @param idx the index the number section starts at.
 * @param integer whether to parse an integer or a decimal.
 * @param allowExponentialNotation is exponential notation allowed.
 * @return the result produced by the selected parser, or a failure if the separators are invalid.
 */
NumberParseResult parse(
    CharSequence value, int idx, boolean integer, boolean allowExponentialNotation) {
  var found = Separators.parse(value, idx, integer, allowExponentialNotation);
  // TODO: Add more detail on separator failure.
  if (found == null) {
    return new NumberParseFailure("Invalid separators.");
  }

  final int end = found.endIdx();

  // An integer only needs the thousands separator, a decimal needs both to be known.
  final boolean formatIsFixed =
      thousands != Constants.UNKNOWN && (integer || decimal != Constants.UNKNOWN);

  if (formatIsFixed) {
    if (integer) {
      return parseFixedInteger(value, idx, end, found.first());
    }
    return parseFixedDecimal(
        value, idx, end, found.first(), found.second(), found.exponential());
  }

  if (integer) {
    return parseUnknownInteger(value, idx, end, found.first(), found.count());
  }
  return parseUnknownDecimal(
      value,
      idx,
      end,
      found.first(),
      found.second(),
      found.count(),
      found.lastSeparatorIdx(),
      found.exponential());
}
/**
* Internal record for returning when a new format is matched.
*
* @param format the format that was matched.
* @param result the result of parsing with that format.
*/
record NumberParseResultWithFormat(NumberWithSeparators format, NumberParseResult result)
implements NumberParseResult {}
/**
 * Internal record for returning the end index of the matched number.
 *
 * @param endIdx the index at which the matched number ends.
 * @param result the parsed number.
 */
record NumberParseResultWithIndex(int endIdx, NumberParseResult result)
    implements NumberParseResult {
  /** Whether the wrapped result is a number of at least 1000. */
  boolean exceedsThousand() {
    switch (result) {
      case NumberParseDouble asDouble:
        return asDouble.number() >= 1000;
      case NumberParseLong asLong:
        return asLong.number() >= 1000;
      default:
        return false;
    }
  }
}
/**
 * Given a known integer format, parse the sequence.
 *
 * @return the parsed value together with the original end index, or a failure.
 */
private NumberParseResult parseFixedInteger(
    CharSequence value, int idx, int endIdx, char firstSeparator) {
  assert thousands != Constants.UNKNOWN;

  CharSequence digits = value;
  int from = idx;
  int to = endIdx;

  // Strip out the separators; the stripped copy is parsed from its own start to its own end.
  if (thousands != Constants.NONE) {
    digits = Separators.strip(value, idx, endIdx, thousands, decimal);
    if (digits == null) {
      return new NumberParseFailure("Invalid number.");
    }
    from = 0;
    to = digits.length();
  }

  final long parsed;
  try {
    parsed = Long.parseLong(digits, from, to, 10);
  } catch (NumberFormatException e) {
    return new NumberParseFailure("Invalid number.");
  }
  // Report the end index within the original value, not within the stripped copy.
  return new NumberParseResultWithIndex(endIdx, new NumberParseLong(parsed, "", false));
}
/**
 * Parse an unknown format with no separators, so the digits are parsed as a long.
 *
 * @return the parsed value, wrapped with a new format if one was resolved, or a failure.
 */
private NumberParseResult parseUnknownIntegerNone(CharSequence value, int idx, int endIdx) {
  assert thousands == Constants.UNKNOWN;

  final long parsed;
  try {
    parsed = Long.parseLong(value, idx, endIdx, 10);
  } catch (NumberFormatException e) {
    return new NumberParseFailure("Invalid number.");
  }

  var plain = new NumberParseResultWithIndex(endIdx, new NumberParseLong(parsed, "", false));
  if (parsed < 1000) {
    // Too small to tell anything about the thousands separator.
    return plain;
  }

  // If greater than or equal 1000, then we know no thousand separators.
  final NumberWithSeparators resolved;
  if (decimal == '.') {
    resolved = NO_DOT;
  } else if (decimal == ',') {
    resolved = NO_COMMA;
  } else {
    resolved = NO_UNKNOWN;
  }
  return resolved == this ? plain : new NumberParseResultWithFormat(resolved, plain);
}
/**
 * Parse an unknown Integer format, resolving the format from the separator that was found.
 *
 * @return the parsed value wrapped with the resolved format, or a failure.
 */
private NumberParseResult parseUnknownInteger(
    CharSequence value, int idx, int endIdx, char separator, int separatorCount) {
  assert thousands == Constants.UNKNOWN;

  if (separator == decimal) {
    // Encountered a decimal point, so can't be an integer.
    return new NumberParseFailure("Encountered Decimal Point - Can't Be Integer.");
  }
  if (separator == Constants.NONE) {
    // Didn't encounter any separators so use simpler logic.
    return parseUnknownIntegerNone(value, idx, endIdx);
  }

  // Find the correct format
  final NumberWithSeparators format;
  if (separator == '.') {
    format = DOT_COMMA;
  } else if (separator == ',') {
    format = separatorCount == 1 ? COMMA_UNKNOWN : COMMA_DOT;
  } else if (separator == ' ') {
    if (decimal == Constants.UNKNOWN) {
      format = SPACE_UNKNOWN;
    } else {
      format = decimal == '.' ? SPACE_DOT : SPACE_COMMA;
    }
  } else if (separator == '\'') {
    if (decimal == Constants.UNKNOWN) {
      format = SWISS_UNKNOWN;
    } else {
      format = decimal == '.' ? SWISS_DOT : SWISS_COMMA;
    }
  } else {
    return new NumberParseFailure("No matching number format.");
  }

  var parsed = format.parseFixedInteger(value, idx, endIdx, separator);
  if (parsed instanceof NumberParseFailure) {
    return parsed;
  }
  return new NumberParseResultWithFormat(format, parsed);
}
/**
* Given a known double format, parse the sequence.
*
* <p>The partially known formats DOT_UNKNOWN, UNKNOWN_DOT, COMMA_UNKNOWN and UNKNOWN_COMMA
* delegate to a fully known format. A number without a decimal separator (and not in exponential
* notation) is parsed by parseFixedInteger instead.
*
* @return the parsed value together with the original end index, or a failure.
*/
private NumberParseResult parseFixedDecimal(
CharSequence value,
int idx,
int endIdx,
char firstSeparator,
char secondSeparator,
boolean exponential) {
// Deal with the special cases first.
if (this == DOT_UNKNOWN || this == UNKNOWN_DOT) {
// Haven't encountered a thousand separator, but know the decimal separator.
// If DOT_UNKNOWN then could be European or English, but treat as English.
assert firstSeparator == '.' && secondSeparator == Constants.NONE;
return NO_DOT.parseFixedDecimal(
value, idx, endIdx, firstSeparator, secondSeparator, exponential);
} else if (this == COMMA_UNKNOWN) {
// Have only encountered a Comma(s), so treat as English format (COMMA_DOT).
assert firstSeparator == ',' && secondSeparator == Constants.NONE;
return COMMA_DOT.parseFixedDecimal(
value, idx, endIdx, firstSeparator, secondSeparator, exponential);
} else if (this == UNKNOWN_COMMA) {
// Have encountered a comma and know is a decimal separator.
assert firstSeparator == ',' && secondSeparator == Constants.NONE;
return NO_COMMA.parseFixedDecimal(
value, idx, endIdx, firstSeparator, secondSeparator, exponential);
}
assert thousands != Constants.UNKNOWN && decimal != Constants.UNKNOWN;
// If no decimal separator, then must be an integer.
if (!exponential && firstSeparator != decimal && secondSeparator != decimal) {
return parseFixedInteger(value, idx, endIdx, firstSeparator);
}
// Validate Separators.
// A single separator must be the thousands or the decimal separator of this format;
// two separators must be the thousands followed by the decimal separator.
if (firstSeparator != Constants.NONE) {
if ((secondSeparator == Constants.NONE
&& firstSeparator != thousands
&& firstSeparator != decimal)
|| (secondSeparator != Constants.NONE
&& (firstSeparator != thousands || secondSeparator != decimal))) {
return new NumberParseFailure("Invalid separator.");
}
}
// Strip out the separators.
// Skipped only when there is no thousands separator and the decimal separator is already '.'.
int origEndIdx = endIdx;
if (thousands != Constants.NONE || decimal != '.') {
value = Separators.strip(value, idx, endIdx, thousands, decimal);
if (value == null) {
return new NumberParseFailure("Invalid number.");
}
idx = 0;
endIdx = value.length();
}
try {
double number = Double.parseDouble(value.subSequence(idx, endIdx).toString());
// Report the end index within the original value, not within the stripped copy.
return new NumberParseResultWithIndex(origEndIdx, new NumberParseDouble(number, ""));
} catch (NumberFormatException e) {
return new NumberParseFailure("Invalid number.");
}
}
/**
* Given an unknown format, parse the sequence, resolving the format from the separators found.
*
* <p>Whenever a format different from this one is settled on, the parsed value is wrapped in a
* NumberParseResultWithFormat so that the caller can pick up the new format.
*
* @param value the value to parse.
* @param idx the index the number section starts at.
* @param endIdx the end index of the number section.
* @param firstSeparator the first separator found, or Constants.NONE.
* @param secondSeparator the second distinct separator found, or Constants.NONE.
* @param separatorCount the number of separators found.
* @param lastSeparatorIdx the index of the last separator found.
* @param exponential whether the number is in exponential notation.
* @return the parsed value, possibly wrapped with a new format, or a failure.
*/
private NumberParseResult parseUnknownDecimal(
CharSequence value,
int idx,
int endIdx,
char firstSeparator,
char secondSeparator,
int separatorCount,
int lastSeparatorIdx,
boolean exponential) {
assert thousands == Constants.UNKNOWN || decimal == Constants.UNKNOWN;
// Special case when single separator equal to decimal point.
if (separatorCount == 1 && firstSeparator == decimal) {
var fixed = decimal == '.' ? NO_DOT : NO_COMMA;
var result =
fixed.parseFixedDecimal(value, idx, endIdx, Constants.NONE, decimal, exponential);
// Only a value of at least 1000 switches the format to the one without thousands separator.
if (result instanceof NumberParseResultWithIndex resultWithIndex
&& resultWithIndex.exceedsThousand()) {
return new NumberParseResultWithFormat(fixed, result);
} else {
return result;
}
}
// Cases of no separators or repeated single separator - must be integer.
if (!exponential
&& (firstSeparator == Constants.NONE
|| (secondSeparator == Constants.NONE
&& (separatorCount > 1 || firstSeparator == ' ' || firstSeparator == '\'')))) {
if (mightBeEuropean() && firstSeparator == '.') {
// We know we are wrong.
var result = DOT_COMMA.parseFixedInteger(value, idx, endIdx, '.');
return (result instanceof NumberParseFailure)
? result
: new NumberParseResultWithFormat(DOT_COMMA, result);
}
var result =
thousands == Constants.UNKNOWN
? parseUnknownInteger(value, idx, endIdx, firstSeparator, separatorCount)
: parseFixedInteger(
value, idx, endIdx, separatorCount == 0 ? thousands : firstSeparator);
// Special case if COMMA_UNKNOWN and count > 1 then is COMMA_DOT.
boolean resolveCommaUnknown = this == COMMA_UNKNOWN && separatorCount > 1;
return (result instanceof NumberParseFailure)
? result
: (resolveCommaUnknown ? new NumberParseResultWithFormat(COMMA_DOT, result) : result);
}
// Case when in exponential notation and no separators.
if (exponential && firstSeparator == Constants.NONE) {
return NO_DOT.parseFixedDecimal(
value, idx, endIdx, Constants.NONE, Constants.NONE, exponential);
}
// Need to resolve the format.
NumberWithSeparators format = null;
if (secondSeparator != Constants.NONE) {
// Two distinct separators: the first is the thousands and the second the decimal separator.
format =
switch (firstSeparator) {
case '.' -> secondSeparator == ',' ? DOT_COMMA : null;
case ',' -> secondSeparator == '.' ? COMMA_DOT : null;
case ' ' -> secondSeparator == '.'
? SPACE_DOT
: secondSeparator == ',' ? SPACE_COMMA : null;
case '\'' -> secondSeparator == '.'
? SWISS_DOT
: secondSeparator == ',' ? SWISS_COMMA : null;
default -> null;
};
} else if (firstSeparator == '.') {
// if separatorCount > 1, must be a thousand separator, hence DOT_COMMA (covered above).
// if index of separator > 3, must be a decimal point without a thousand separator, hence
// NO_DOT.
// if 3 digits following then could either, hence DOT_UNKNOWN.
// Otherwise, must be decimal point, hence UNKNOWN_DOT.
format =
lastSeparatorIdx - idx > 3
? NO_DOT
: (lastSeparatorIdx != endIdx - 4
? UNKNOWN_DOT
: (decimal == ',' ? DOT_COMMA : DOT_UNKNOWN));
} else if (firstSeparator == ',') {
// if separatorCount > 1, must be a thousand separator, hence COMMA_DOT (covered above).
// if index of separator > 3, must be a decimal point without a thousand separator, hence
// NO_COMMA.
// if 3 digits following then could either, hence COMMA_UNKNOWN.
// Otherwise, must be decimal point, hence UNKNOWN_COMMA.
format =
lastSeparatorIdx - idx > 3
? NO_COMMA
: (lastSeparatorIdx != endIdx - 4
? UNKNOWN_COMMA
: (decimal == '.' ? COMMA_DOT : COMMA_UNKNOWN));
}
if (format == null) {
return new NumberParseFailure("No matching number format.");
}
// Validate that the new format matches.
// From an ambiguous format, the new format must still use that format's character as either
// separator; otherwise every separator already known must be kept by the new format.
if (this.mightBeEuropean()) {
if (this == DOT_UNKNOWN && format.decimal != '.' && format.thousands != '.') {
return new NumberParseFailure("Invalid format matched.");
} else if (this == COMMA_UNKNOWN && format.decimal != ',' && format.thousands != ',') {
return new NumberParseFailure("Invalid format matched.");
}
} else if ((thousands != Constants.UNKNOWN && format.thousands != thousands)
|| (decimal != Constants.UNKNOWN && format.decimal != decimal)) {
return new NumberParseFailure("Invalid format matched.");
}
var result =
format.parseFixedDecimal(value, idx, endIdx, firstSeparator, secondSeparator, exponential);
return (result instanceof NumberParseFailure)
? result
: new NumberParseResultWithFormat(format, result);
}
}

View File

@ -0,0 +1,239 @@
package org.enso.base.parser;
import static org.enso.base.parser.NumberWithSeparators.isDigit;
import java.nio.CharBuffer;
/**
* Record to hold information about the separators found in a number.
*
* @param first - the first encountered separator or Constants.NONE if none found.
* @param second - the second distinct separator or Constants.NONE if none found.
* @param count - the number of separators found.
* @param endIdx - the index of the last character in the number.
* @param lastSeparatorIdx - the index of the last separator found.
* @param exponential - whether the number is in exponential notation.
*/
public record Separators(
char first, char second, int count, int endIdx, int lastSeparatorIdx, boolean exponential) {
/**
 * Copies the digits of the section, dropping thousands separators and writing the decimal
 * separator as a full stop.
 *
 * <p>Returns null if the section contains any character other than a digit, thousands or decimal
 * separator, if there is more than one decimal separator, if a thousands separator follows the
 * decimal separator, or if a thousands separator is not followed by exactly three digits before
 * the next separator or the end of the section.
 */
static CharSequence strip(
    CharSequence value, int startIdx, int endIdx, char thousands, char decimal) {
  final char[] buffer = new char[endIdx - startIdx];
  int written = 0;
  int previousThousand = -1;
  boolean seenDecimal = false;

  int pos = startIdx;
  while (pos < endIdx) {
    final char current = value.charAt(pos);
    final boolean isDecimal = current == decimal;

    if (!isDecimal && isDigit(current)) {
      buffer[written++] = current;
      pos++;
      continue;
    }
    if (!isDecimal && current != thousands) {
      // Neither a digit nor one of the two separators.
      return null;
    }

    // Either separator: none may follow a decimal separator.
    if (seenDecimal) {
      return null;
    }
    // Must be 4 away from last thousand separator.
    if (previousThousand != -1 && pos != previousThousand + 4) {
      return null;
    }
    if (isDecimal) {
      buffer[written++] = '.';
      seenDecimal = true;
    } else {
      previousThousand = pos;
    }
    pos++;
  }

  // Without a decimal separator, the last thousands separator must be 4 away from the end.
  if (!seenDecimal && previousThousand != -1 && endIdx != previousThousand + 4) {
    return null;
  }
  return CharBuffer.wrap(buffer, 0, written);
}
/** Check if the character is a separator: full stop, comma, space, apostrophe or underscore. */
static boolean isSeparator(char c) {
  return switch (c) {
    case '.', ',', ' ', '\'', '_' -> true;
    default -> false;
  };
}
/** Check if the character is a decimal separator: full stop or comma. */
private static boolean isDecimalSeparator(char c) {
  return ".,".indexOf(c) >= 0;
}
/**
 * Check if the character is part of the current number.
 *
 * @param exponentState the current state of exponential notation scanning.
 * @param c the character to check.
 * @param first the first separator found so far, or Constants.NONE.
 * @param second the second separator found so far, or Constants.NONE.
 */
private static boolean validChar(ExponentState exponentState, char c, char first, char second) {
  // If scientific notation is allowed then 'e' or 'E' is valid at the start.
  final boolean isExponentMarker = c == 'e' || c == 'E';
  // Sign can only be encountered after an E/e in scientific notation.
  final boolean isExponentSign = c == '+' || c == '-';

  if (isDigit(c)
      || (exponentState == ExponentState.START && isExponentMarker)
      || (exponentState == ExponentState.E_SIGN && isExponentSign)) {
    return true;
  }

  // Separators not valid in scientific notation if not in start.
  final boolean separatorsPermitted =
      exponentState == ExponentState.START || exponentState == ExponentState.NOT_ALLOWED;
  if (!separatorsPermitted) {
    return false;
  }

  // We haven't encountered a separator yet, so valid if it is a separator.
  if (first == NumberWithSeparators.Constants.NONE) {
    return isSeparator(c);
  }

  // With only the first separator seen, valid if it is the same as the first or a decimal
  // separator. Once a second separator has been seen, no further separator is valid.
  return second == NumberWithSeparators.Constants.NONE && (c == first || isDecimalSeparator(c));
}
/**
* Find the number and separators section. Validate the spacing of separators. Return the
* separators found or null if invalid.
*
* <p>Scanning starts at {@code idx} and stops at the end of {@code value} or at the first
* character rejected by {@code validChar}. While scanning, the first and (optionally) second
* separator characters are recorded, together with how many separators were seen and where the
* last one was.
*
* <p>In integer mode a leading separator, a second distinct separator, or a final group that is
* not exactly three digits long all make the input invalid, and exponential notation is never
* considered.
*
* @param value the value to parse.
* @param idx the index to start parsing from.
* @param integer if the number is an integer.
* @param allowExponentialNotation is exponential notation allowed.
* @return the separators found (with the end index of the scanned section and whether a complete
*     exponent was read), or {@code null} if the separator layout is invalid.
*/
static Separators parse(
CharSequence value, int idx, boolean integer, boolean allowExponentialNotation) {
// Declared outside the loop because the end position is needed afterwards; the loop header
// re-initialises it to idx.
int endIdx = idx;
char firstSeparator = NumberWithSeparators.Constants.NONE;
char secondSeparator = NumberWithSeparators.Constants.NONE;
// True when the very first scanned character was a (decimal) separator, e.g. ".5".
boolean firstWasSeparator = false;
// Index of the most recently accepted separator, -1 while none has been seen.
int lastSeparator = -1;
int separatorCount = 0;
// Set initial state for exponential notation.
// Exponents are only tracked for non-integers when the caller allows them.
ExponentState exponentState =
!integer && allowExponentialNotation ? ExponentState.START : ExponentState.NOT_ALLOWED;
// Scan the text, find and validate spacing of separators.
// Space and ' are both valid thousands separators, but can't be second separator.
for (endIdx = idx; endIdx < value.length(); endIdx++) {
char c = value.charAt(endIdx);
if (!validChar(exponentState, c, firstSeparator, secondSeparator)) {
break;
}
// Cope with digits or scientific notation.
// e/E/+/- can only reach this point when validChar accepted them for the current state.
if (isDigit(c) || c == 'e' || c == 'E' || c == '+' || c == '-') {
// Update Exponent State.
if (c == 'e' || c == 'E') {
exponentState = ExponentState.E_SIGN;
} else if (c == '+' || c == '-') {
exponentState = ExponentState.SIGN;
} else if (exponentState == ExponentState.SIGN || exponentState == ExponentState.E_SIGN) {
// First digit after the E/e (and optional sign): the exponent is now complete enough.
exponentState = ExponentState.EXPONENT;
}
continue;
}
// From here on c is a separator character.
// If first digit is a separator then only valid if a decimal separator.
if (endIdx == idx) {
if (integer || !isDecimalSeparator(c)) {
return null;
}
firstWasSeparator = true;
}
if (firstSeparator == NumberWithSeparators.Constants.NONE) {
// Found the first separator.
firstSeparator = c;
} else {
// TODO: This check is probably now redundant as strip does it as well.
// Encountered another separator - must be 4 away from last separator.
// (i.e. exactly three characters sit between the two separators).
if (endIdx != lastSeparator + 4) {
// Special case if last was a space as could be separating symbol.
// Stop before the space so it is left out of the number section.
if (c == ' ') {
break;
}
return null;
}
// Must have been a decimal separator.
// A number that started with a separator cannot contain any further separator.
if (firstWasSeparator) {
return null;
}
// Encountered a second separator, only valid if !integer.
if (firstSeparator != c) {
if (!integer) {
secondSeparator = c;
} else {
return null;
}
}
}
lastSeparator = endIdx;
separatorCount++;
}
// Special case when firstSeparator is a space and no secondSeparator and ending with a space.
// The trailing space is not part of the number, so undo its bookkeeping.
// NOTE(review): when this was the only separator, lastSeparator ends up at an index that never
// held a separator (not -1) - confirm consumers ignore it when separatorCount is 0.
if (firstSeparator == ' ' && value.charAt(endIdx - 1) == ' ') {
separatorCount--;
endIdx--;
lastSeparator -= 4;
if (separatorCount == 0) {
firstSeparator = NumberWithSeparators.Constants.NONE;
}
}
// If in integer mode then must be a thousand separator, validate final spacing.
if (integer && separatorCount > 0 && lastSeparator != endIdx - 4) {
return null;
}
// The final flag is true only when at least one exponent digit was read.
// NOTE(review): if the scan ends right after an E/e or its sign, endIdx still includes those
// characters while the flag is false - confirm the caller handles that case.
return new Separators(
firstSeparator,
secondSeparator,
separatorCount,
endIdx,
lastSeparator,
exponentState == ExponentState.EXPONENT);
}
/** Tracks how far the scanner has progressed through an optional exponent (scientific) suffix. */
private enum ExponentState {
/** Scientific notation not allowed; an E/e is never accepted in this state. */
NOT_ALLOWED,
/** Scientific notation is allowed but no E/e has been encountered yet. */
START,
/** Have just encountered an E/e; a sign or the first exponent digit may follow. */
E_SIGN,
/** Have encountered an E/e followed by a sign; only digits may follow. */
SIGN,
/** Have encountered an E/e, an optional sign and at least one digit. */
EXPONENT
}
}

View File

@ -1,11 +1,8 @@
package org.enso.table.parsing;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Pattern;
import org.enso.base.parser.FormatDetectingNumberParser;
import org.enso.base.parser.NegativeSign;
import org.enso.base.parser.NumberWithSeparators;
import org.enso.table.data.column.builder.Builder;
import org.enso.table.data.column.builder.NumericBuilder;
import org.enso.table.data.column.storage.Storage;
@ -15,178 +12,11 @@ import org.enso.table.parsing.problems.ParseProblemAggregator;
import org.enso.table.problems.ProblemAggregator;
import org.graalvm.polyglot.Context;
/**
* A parser for numbers.
*
* <p>This parser will attempt to work out which decimal point and thousand separators are used in
* the input. It will try various ways of formatting a number and can be set to allow for scientific
* notation and currency symbols.
*
* <p>If parsing a column it will select the format that parses the longest set without an issue
* from the top and then apply this format to all the rows.
*
* <p>The separators will be tried in British, German, French and Swiss order. - Thousand separator
* must be followed by groups of 3 numbers. - Scientific notation is only allowed on decimals and
* must be on a value between -10 and 10. The notation is an `E` followed by an integer.
*
* <p>The following formats are supported: - Sign (+/-) followed by Number (e.g. +1,234.56) - Using
* brackets to indicate a negative number (e.g. (1,234.56)) - Currency symbols (if enabled) can be
* placed before or after the sign and number. - If using brackets, the currency symbol must be
* placed after the opening bracket.
*/
public class NumberParser extends IncrementalDatatypeParser {
private static final String SIGN = "(?<sign>[-+])?";
private static final String BRACKETS = "(?<sign>\\((?=.*\\)\\s*$))?\\s*";
private static final String BRACKET_CLOSE = "\\)?";
private static final String CCY = "(?<ccy>[^0-9(),. '+-]+)";
private static final String EXP = "(?<exp>[eE][+-]?\\d+)?";
private static final String SPACE = "\\s*";
private record Separators(String thousand, String decimal) {}
private final Separators[] SEPARATORS;
private static final Map<String, Pattern> PATTERNS = new HashMap<>();
private final IntegerType integerTargetType;
/**
 * Checks that a configured separator is usable.
 *
 * @param name human-readable name of the setting, used as the prefix of the error message.
 * @param value the separator to check; {@code null} is accepted and skips all checks.
 * @throws IllegalArgumentException if {@code value} is not exactly one character long or is a
 *     digit.
 */
private static void validateSeparator(String name, String value) {
  if (value != null) {
    boolean singleCharacter = value.length() == 1;
    if (!singleCharacter) {
      String message = name + " must be a single character, but it was '" + value + "'.";
      throw new IllegalArgumentException(message);
    }

    // A digit separator would make inputs ambiguous - e.g. 10000 could be read as 1000 by
    // treating the first 0 as a thousand separator - so it is rejected outright.
    char separator = value.charAt(0);
    if (Character.isDigit(separator)) {
      String message = name + " cannot be a digit, but it was '" + value + "'.";
      throw new IllegalArgumentException(message);
    }
  }
}
/**
 * Produces every thousand/decimal separator pair that should be attempted.
 *
 * <p>A {@code null} argument means "infer this separator": several candidates are generated for
 * it. A non-null argument pins that separator to the given value. The resulting pairs are ordered
 * by thousand separator first, then by decimal point.
 *
 * @param allowDecimal whether decimal numbers are being parsed; when false and no decimal point
 *     is given, the decimal part of each pair is {@code null}.
 * @param decimalPoint the fixed decimal point, or {@code null} to infer it.
 * @param thousandSeparator the fixed thousand separator, or {@code null} to infer it.
 * @return the separator pairs to try, in order.
 * @throws IllegalArgumentException if either separator is invalid or both are the same.
 */
private static Separators[] buildSeparators(
    boolean allowDecimal, String decimalPoint, String thousandSeparator) {
  validateSeparator("Decimal point", decimalPoint);
  validateSeparator("Thousand separator", thousandSeparator);
  if (decimalPoint != null && decimalPoint.equals(thousandSeparator)) {
    throw new IllegalArgumentException(
        "Decimal point and thousand separator cannot be the same, but they were both '"
            + decimalPoint
            + "'.");
  }

  // Nothing is pinned and decimals are allowed: use the fixed list of well-known combinations.
  if (allowDecimal && decimalPoint == null && thousandSeparator == null) {
    return new Separators[] {
      new Separators(",", "."),
      new Separators(".", ","),
      new Separators(" ", ","),
      new Separators("'", ","),
    };
  }

  List<String> thousandCandidates = new ArrayList<>();
  if (thousandSeparator != null) {
    thousandCandidates.add(thousandSeparator);
  } else {
    for (String candidate : List.of(",", ".", "'", " ")) {
      if (!candidate.equals(decimalPoint)) {
        thousandCandidates.add(candidate);
      }
    }
  }

  List<String> decimalCandidates = new ArrayList<>();
  if (decimalPoint != null) {
    decimalCandidates.add(decimalPoint);
  } else if (allowDecimal) {
    // The fully automatic case returned above, so the thousand separator must be pinned here.
    assert thousandSeparator != null;
    for (String candidate : List.of(",", ".")) {
      if (!candidate.equals(thousandSeparator)) {
        decimalCandidates.add(candidate);
      }
    }
  } else {
    // Integers have no decimal point; a single null entry keeps the pairing loop below uniform.
    decimalCandidates.add(null);
  }

  List<Separators> pairs = new ArrayList<>();
  for (String thousand : thousandCandidates) {
    for (String decimal : decimalCandidates) {
      pairs.add(new Separators(thousand, decimal));
    }
  }
  return pairs.toArray(new Separators[0]);
}
/** The number of patterns that are allowed for non-currency numbers. */
private static final int ALLOWED_NON_CCY_PATTERNS = 2;
/** The number of patterns that are allowed for currency numbers. */
private static final int ALLOWED_CCY_PATTERNS = 6;
/**
 * Builds the regular expression for one number layout and one pair of separators, reusing a
 * previously compiled pattern from {@code PATTERNS} when the same expression was built before.
 *
 * <p>Layouts 0 and 1 are plain numbers (leading sign, or brackets); layouts 2 to 5 additionally
 * place a currency symbol before or after the number and are only available when {@code
 * allowCurrency} is set.
 *
 * <p>The separators are inserted with {@link Pattern#quote(String)} so that a separator which is
 * a regex metacharacter (for example {@code ^}, {@code \} or {@code [}) is matched literally
 * instead of corrupting the expression.
 *
 * <p>NOTE(review): {@code PATTERNS} is a static {@code HashMap}; confirm this is never called
 * from several threads at once.
 *
 * @param allowDecimal whether a decimal part is permitted.
 * @param allowCurrency whether the currency layouts are permitted.
 * @param allowScientific whether an exponent suffix is permitted; requires {@code allowDecimal}.
 * @param trimValues whether surrounding whitespace is tolerated.
 * @param patternIndex index of the layout to build.
 * @param separators the thousand separator (may be {@code null}) and decimal point to use.
 * @return the compiled pattern, or {@code null} if {@code patternIndex} is past the last layout
 *     permitted by {@code allowCurrency}.
 * @throws IllegalArgumentException if scientific notation is requested without decimals, or if
 *     {@code patternIndex} is negative.
 * @throws NullPointerException if decimals are allowed but {@code separators} has no decimal
 *     point.
 */
private static Pattern buildPattern(
    boolean allowDecimal,
    boolean allowCurrency,
    boolean allowScientific,
    boolean trimValues,
    int patternIndex,
    Separators separators) {
  if (allowScientific && !allowDecimal) {
    throw new IllegalArgumentException("Scientific notation requires decimal numbers.");
  }

  int allowedPatterns = allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS;
  if (patternIndex >= allowedPatterns) {
    return null;
  }

  // Either a bare run of digits, or groups of three digits split by the thousand separator.
  String integerPart =
      "(?<integer>(\\d*)"
          + (separators.thousand == null
              ? ""
              : "|(\\d{1,3}(" + Pattern.quote(separators.thousand) + "\\d{3})*)")
          + ")";
  String decimalPart =
      allowDecimal
          ? "(?<decimal>" + Pattern.quote(Objects.requireNonNull(separators.decimal)) + "\\d*)?"
          : "";
  String number = integerPart + decimalPart + (allowScientific ? EXP : "");

  String pattern =
      switch (patternIndex) {
        case 0 -> SIGN + number;
        case 1 -> BRACKETS + number + BRACKET_CLOSE;
        case 2 -> SIGN + CCY + SPACE + number;
        case 3 -> CCY + SPACE + SIGN + number;
        case 4 -> SIGN + number + CCY;
        case 5 -> BRACKETS + CCY + SPACE + number + BRACKET_CLOSE;
        default -> throw new IllegalArgumentException("Invalid pattern index: " + patternIndex);
      };

  if (trimValues) {
    pattern = SPACE + pattern + SPACE;
  }

  return PATTERNS.computeIfAbsent("^" + pattern + "$", Pattern::compile);
}
private final boolean allowDecimal;
private final boolean allowCurrency;
private final boolean allowLeadingZeros;
private final boolean allowScientific;
private final boolean trimValues;
/**
* Creates a new integer instance of this parser.
*
* @param integerTargetType the target type describing how large integer values can be accepted
* @param allowCurrency whether to allow currency symbols
* @param allowLeadingZeros whether to allow leading zeros
* @param trimValues whether to trim the input values
* @param decimalPoint the decimal point set for the current format, or null if not specified;
* this parser does not use decimal point (since it is for integers) but it ensure that if a
@ -196,16 +26,16 @@ public class NumberParser extends IncrementalDatatypeParser {
*/
public static NumberParser createIntegerParser(
IntegerType integerTargetType,
boolean allowCurrency,
boolean allowLeadingZeros,
boolean allowSymbol,
boolean allowLeadingZeroes,
boolean trimValues,
String decimalPoint,
String thousandSeparator) {
assert integerTargetType != null;
return new NumberParser(
false,
integerTargetType,
allowCurrency,
allowLeadingZeros,
allowSymbol,
allowLeadingZeroes,
trimValues,
false,
decimalPoint,
@ -215,240 +45,125 @@ public class NumberParser extends IncrementalDatatypeParser {
/**
* Creates a new decimal instance of this parser.
*
* @param allowCurrency whether to allow currency symbols
* @param allowLeadingZeros whether to allow leading zeros
* @param allowSymbol whether to allow symbols in the input
* @param allowLeadingZeroes whether to allow leading zeroes in the input
* @param trimValues whether to trim the input values
* @param allowScientific whether to allow scientific notation
* @param decimalPoint the decimal separator to use (if null, then will be inferred)
* @param thousandSeparator the thousand separator to use (if null, then will be inferred)
* @param allowExponentialNotation whether to allow exponential notation in the input
* @param decimalPoint the decimal point set for the current format (if null then will be
* inferred)
* @param thousandSeparator the thousand separator to use (if null then will be inferred)
*/
public static NumberParser createDecimalParser(
boolean allowCurrency,
boolean allowLeadingZeros,
boolean allowSymbol,
boolean allowLeadingZeroes,
boolean trimValues,
boolean allowScientific,
boolean allowExponentialNotation,
String decimalPoint,
String thousandSeparator) {
return new NumberParser(
true,
null,
allowCurrency,
allowLeadingZeros,
allowSymbol,
allowLeadingZeroes,
trimValues,
allowScientific,
allowExponentialNotation,
decimalPoint,
thousandSeparator);
}
private final IntegerType integerTargetType;
private final FormatDetectingNumberParser parser;
private NumberParser(
boolean allowDecimal,
IntegerType integerTargetType,
boolean allowCurrency,
boolean allowLeadingZeros,
boolean trimValues,
boolean allowScientific,
boolean allowSymbol,
boolean allowLeadingZeroes,
boolean allowLeadingTrailingWhitespace,
boolean allowExponentialNotation,
String decimalPoint,
String thousandSeparator) {
this.allowDecimal = allowDecimal;
this.integerTargetType = integerTargetType;
this.allowCurrency = allowCurrency;
this.allowLeadingZeros = allowLeadingZeros;
this.trimValues = trimValues;
this.allowScientific = allowScientific;
SEPARATORS = buildSeparators(allowDecimal, decimalPoint, thousandSeparator);
var numberWithSeparators = NumberWithSeparators.fromSeparators(thousandSeparator, decimalPoint);
this.parser =
new FormatDetectingNumberParser(
allowSymbol,
allowLeadingZeroes,
allowLeadingTrailingWhitespace,
allowExponentialNotation,
NegativeSign.UNKNOWN,
numberWithSeparators);
}
/**
* Creates a Pattern for the given index. The index will be decoded into a specific set of
* separators (unless fixed separators are used) and then paired with one of the valid patterns
* for the given parser.
*/
private Pattern patternForIndex(int index) {
int allowedSet = (allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS);
int separatorsIndex = index / allowedSet;
int patternIndex = index % allowedSet;
if (separatorsIndex >= SEPARATORS.length) {
return null;
}
return buildPattern(
allowDecimal,
allowCurrency,
allowScientific,
trimValues,
patternIndex,
SEPARATORS[separatorsIndex]);
private boolean isInteger() {
return integerTargetType != null;
}
@Override
public Object parseSingleValue(String text, ParseProblemAggregator problemAggregator) {
int index = 0;
var pattern = patternForIndex(index);
while (pattern != null) {
var value = innerParseSingleValue(text, pattern);
if (value != null) {
return value;
}
index++;
pattern = patternForIndex(index);
}
problemAggregator.reportInvalidFormat(text);
return null;
protected Builder makeBuilderWithCapacity(int capacity, ProblemAggregator problemAggregator) {
return isInteger()
? NumericBuilder.createLongBuilder(capacity, integerTargetType, problemAggregator)
: NumericBuilder.createDoubleBuilder(capacity, problemAggregator);
}
@Override
public Storage<?> parseColumn(
Storage<String> sourceStorage, CommonParseProblemAggregator problemAggregator) {
int index = 0;
var pattern = patternForIndex(index);
Builder builder =
makeBuilderWithCapacity(sourceStorage.size(), problemAggregator.createSimpleChild());
int bestIndex = 0;
int bestCount = -1;
while (pattern != null) {
ProblemAggregator inner = problemAggregator.createSimpleChild();
Builder builder = makeBuilderWithCapacity(sourceStorage.size(), inner);
int failedAt = parseColumnWithPattern(pattern, sourceStorage, builder, null);
if (failedAt == -1) {
return builder.seal();
}
// If there was a failure, we abandon this branch - thus we discard any problems that might
// have been reported by the inner aggregator.
inner.detachFromParent();
if (failedAt > bestCount) {
bestCount = failedAt;
bestIndex = index;
}
index++;
pattern = patternForIndex(index);
}
CommonParseProblemAggregator aggregator = problemAggregator.createContextAwareChild();
Builder fallback = makeBuilderWithCapacity(sourceStorage.size(), aggregator);
parseColumnWithPattern(patternForIndex(bestIndex), sourceStorage, fallback, aggregator);
return fallback.seal();
}
private int parseColumnWithPattern(
Pattern pattern,
Storage<String> sourceStorage,
Builder builder,
ParseProblemAggregator aggregator) {
Context context = Context.getCurrent();
var context = Context.getCurrent();
for (int i = 0; i < sourceStorage.size(); i++) {
var text = sourceStorage.getItemBoxed(i);
if (text == null) {
// Check if in unknown state
var mightBeEuropean = !isInteger() && parser.numberWithSeparators().mightBeEuropean();
// Try and parse the value
var result = text == null ? null : parseSingleValue(text, problemAggregator);
// Do we need to rescan?
if (mightBeEuropean && parser.numberWithSeparators() != NumberWithSeparators.DOT_COMMA) {
builder =
makeBuilderWithCapacity(sourceStorage.size(), problemAggregator.createSimpleChild());
for (int j = 0; j < i; j++) {
var subText = sourceStorage.getItemBoxed(j);
var subResult = subText == null ? null : parseSingleValue(subText, problemAggregator);
if (subResult == null) {
builder.appendNulls(1);
} else {
builder.append(subResult);
}
}
}
// Append the result
if (result == null) {
builder.appendNulls(1);
} else {
var value = innerParseSingleValue(text, pattern);
if (value != null) {
builder.appendNoGrow(value);
} else {
if (aggregator == null) {
return i;
}
aggregator.reportInvalidFormat(text);
builder.appendNulls(1);
}
builder.append(result);
}
context.safepoint();
}
return -1;
return builder.seal();
}
@Override
protected Builder makeBuilderWithCapacity(int capacity, ProblemAggregator problemAggregator) {
return allowDecimal
? NumericBuilder.createDoubleBuilder(capacity, problemAggregator)
: NumericBuilder.createLongBuilder(capacity, integerTargetType, problemAggregator);
}
public Object parseSingleValue(String text, ParseProblemAggregator problemAggregator) {
var result = parser.parse(text, isInteger());
private Object innerParseSingleValue(String text, Pattern pattern) {
if (allowDecimal) {
var trimmed = trimValues ? text.trim() : text;
if (trimmed.equals("NaN")) {
return Double.NaN;
}
if (trimmed.equals("Infinity")) {
return Double.POSITIVE_INFINITY;
}
if (trimmed.equals("-Infinity")) {
return Double.NEGATIVE_INFINITY;
}
}
var parsed = pattern.matcher(text);
if (!parsed.matches()) {
// TODO: Capture the message into the problem aggregator.
if (result instanceof FormatDetectingNumberParser.NumberParseFailure) {
problemAggregator.reportInvalidFormat(text);
return null;
}
try {
var sign = parsed.group("sign");
long sign_value = sign != null && !sign.equals("+") ? -1 : 1;
var integer = parsed.group("integer").replaceAll("\\D", "");
if (!allowLeadingZeros && integer.length() > 1 && integer.charAt(0) == '0') {
return null;
}
if (allowDecimal) {
String decimal = parsed.group("decimal");
String decimalPrepared = decimal == null ? "" : ("." + decimal.substring(1));
if (integer.equals("") && decimalPrepared.equals("")) {
return null;
}
integer = integer.equals("") ? "0" : integer;
String exp = allowScientific ? parsed.group("exp") : null;
if (exp != null) {
if (integer.length() > 1) {
return null;
}
decimalPrepared = decimalPrepared + exp;
}
// If there is no decimal part, we parse as integer, as this will allow us more specialized
// handling.
// For example, we can get the exact value instead of a rounded one for big values. We can
// then round
// later, but first handle any warnings.
if (decimalPrepared.equals("")) {
long integer_part = Long.parseLong(integer);
// Special handling for values like `-0` - if we treat them as integers, they will lose
// the `-` sign.
if (integer_part == 0 && sign_value < 0) {
return -0.0;
}
return sign_value * integer_part;
}
return sign_value * Double.parseDouble(integer + decimalPrepared);
}
if (integer.equals("")) {
return null;
}
long integer_value = sign_value * Long.parseLong(integer);
if (integerTargetType.fits(integer_value)) {
return integer_value;
} else {
return null;
}
} catch (NumberFormatException e) {
throw new IllegalStateException("Java parse failed to parse number: " + text, e);
}
return switch (result) {
case FormatDetectingNumberParser.NumberParseDouble doubleResult -> doubleResult.number();
case FormatDetectingNumberParser.NumberParseLong longResult -> longResult.number();
default -> throw new IllegalStateException("Unexpected result type: " + result.getClass());
};
}
}

View File

@ -420,9 +420,9 @@ add_specs suite_builder =
pUS3.to_vector . should_equal [1, -123, Nothing, 1234567, Nothing]
Problems.expect_warning (Invalid_Format.Error "ints" Value_Type.Integer 2 ["-1,234", "12,34,56"]) pUS3
cUS4 = Column.from_vector "ints" ["$1234", "$1,234", "$1,234,567","-$1,234", "($1,234,567)"]
cUS4 = Column.from_vector "ints" ["$234", "$1,234", "$1,234,567","-$1,234", "($1,234,567)"]
pUS4 = cUS4.parse type=Value_Type.Integer
pUS4.to_vector . should_equal [1234, 1234, 1234567, -1234, Nothing]
pUS4.to_vector . should_equal [234, 1234, 1234567, -1234, Nothing]
Problems.expect_warning (Invalid_Format.Error "ints" Value_Type.Integer 1 ["($1,234,567)"]) pUS4
## Reject bracket notation for negative numbers if already seen a minus sign