mirror of
https://github.com/enso-org/enso.git
synced 2024-11-25 21:25:20 +03:00
New NumberParser for Table parsing (#11499)
Replaces the Regex-based number parser with a new parser which works out the number format by working out each part as it sees an example of it. Closes #7398 - reading the large CSV now takes about 2s (down from 15-20s).
This commit is contained in:
parent
fb50a8f24f
commit
6b544650b3
@ -249,26 +249,12 @@ type Data_Formatter
|
||||
get_specific_type_parsers self =
|
||||
## Have to do date parsing first to allow for pure numeric formats.
|
||||
date_parsers = [self.make_date_time_parser, self.make_date_parser, self.make_time_of_day_parser]
|
||||
|
||||
try_us_parsers_first = self.decimal_point == Auto && self.thousand_separator != "."
|
||||
preferred_auto_parsers = case try_us_parsers_first of
|
||||
## If we are in auto mode, we will first try parsing US integers,
|
||||
then US floats and only then other integers and floats.
|
||||
|
||||
Under normal circumstances, we first try integers and later
|
||||
floats - but this would cause `1.000` to be interpreted as `1000`
|
||||
because _all_ integers take precedence and floats are considered
|
||||
later. But we want `1.000` to be interpreted as a `1.0` float by
|
||||
default, so we change the ordering a bit.
|
||||
True ->
|
||||
us_preferred = self.with_number_formatting decimal_point='.'
|
||||
[us_preferred.make_integer_parser auto_mode=True, us_preferred.make_decimal_parser auto_mode=True]
|
||||
|
||||
## However, if the `decimal_point` is set to something else,
|
||||
we don't do auto inference, so this extra logic is not needed.
|
||||
False -> []
|
||||
us_parsers = if self.decimal_point != Auto || self.thousand_separator != "" then [] else
|
||||
## In default mode, add the English pattern to ensure they dominate.
|
||||
english = self.with_number_formatting decimal_point='.' thousand_separator=','
|
||||
[english.make_integer_parser auto_mode=True, english.make_decimal_parser auto_mode=True]
|
||||
remaining_parsers = [self.make_integer_parser auto_mode=True, self.make_decimal_parser auto_mode=True, self.make_boolean_parser]
|
||||
parsers = date_parsers + preferred_auto_parsers + remaining_parsers
|
||||
parsers = date_parsers + us_parsers + remaining_parsers
|
||||
## Unfortunately, the [] literal allows to create a vector containing
|
||||
dataflow errors. That is not handled well later by Polyglot. So we
|
||||
ensure all errors surface here.
|
||||
|
@ -0,0 +1,324 @@
|
||||
package org.enso.base.parser;
|
||||
|
||||
/**
|
||||
* Parse a String into a Number. It supports the following patterns:
|
||||
*
|
||||
* <ul>
|
||||
* <li>SIGN + NUMBER
|
||||
* <li>SYMBOL + SIGN + NUMBER
|
||||
* <li>SIGN + NUMBER + SYMBOL
|
||||
* <li>SIGN + BRACKETS + NUMBER + BRACKET_CLOSE
|
||||
* <li>BRACKETS + SYMBOL + NUMBER + BRACKET_CLOSE
|
||||
* <li>BRACKETS + NUMBER + SYMBOL + BRACKET_CLOSE
|
||||
* </ul>
|
||||
*/
|
||||
public class FormatDetectingNumberParser {
|
||||
public interface NumberParseResult {}
|
||||
|
||||
public interface NumberParseResultSuccess extends NumberParseResult {
|
||||
NumberParseResultSuccess negate();
|
||||
|
||||
NumberParseResultSuccess withSymbol(String symbol);
|
||||
}
|
||||
|
||||
public record NumberParseLong(long number, String symbol, boolean negated)
|
||||
implements NumberParseResultSuccess {
|
||||
@Override
|
||||
public NumberParseResultSuccess negate() {
|
||||
return new NumberParseLong(-number, symbol, true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public NumberParseResultSuccess withSymbol(String symbol) {
|
||||
return new NumberParseLong(number, symbol, negated);
|
||||
}
|
||||
}
|
||||
|
||||
public record NumberParseDouble(double number, String symbol)
|
||||
implements NumberParseResultSuccess {
|
||||
@Override
|
||||
public NumberParseResultSuccess negate() {
|
||||
return new NumberParseDouble(-number, symbol);
|
||||
}
|
||||
|
||||
@Override
|
||||
public NumberParseResultSuccess withSymbol(String symbol) {
|
||||
return new NumberParseDouble(number, symbol);
|
||||
}
|
||||
}
|
||||
|
||||
public record NumberParseFailure(String message) implements NumberParseResult {}
|
||||
|
||||
private final boolean allowSymbol;
|
||||
private final boolean allowLeadingZeroes;
|
||||
private final boolean allowLeadingTrailingWhitespace;
|
||||
private final boolean allowExponentialNotation;
|
||||
private NegativeSign negativeSign;
|
||||
private NumberWithSeparators numberWithSeparators;
|
||||
|
||||
public FormatDetectingNumberParser(
|
||||
boolean allowSymbol,
|
||||
boolean allowLeadingZeroes,
|
||||
boolean allowLeadingTrailingWhitespace,
|
||||
boolean allowExponentialNotation,
|
||||
NegativeSign negativeSign,
|
||||
NumberWithSeparators numberWithSeparators) {
|
||||
this.allowSymbol = allowSymbol;
|
||||
this.allowLeadingZeroes = allowLeadingZeroes;
|
||||
this.allowLeadingTrailingWhitespace = allowLeadingTrailingWhitespace;
|
||||
this.allowExponentialNotation = allowExponentialNotation;
|
||||
this.negativeSign = negativeSign;
|
||||
this.numberWithSeparators = numberWithSeparators;
|
||||
}
|
||||
|
||||
public NegativeSign negativeSign() {
|
||||
return negativeSign;
|
||||
}
|
||||
|
||||
public void setNegativeSign(NegativeSign newNegativeSign) {
|
||||
if (negativeSign != NegativeSign.UNKNOWN) {
|
||||
throw new IllegalStateException("Negative Sign Already Set.");
|
||||
}
|
||||
negativeSign = newNegativeSign;
|
||||
}
|
||||
|
||||
public NumberWithSeparators numberWithSeparators() {
|
||||
return numberWithSeparators;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a string into a number.
|
||||
*
|
||||
* @param value the string to parse.
|
||||
* @param integer whether to parse a Long or a Double.
|
||||
* @return the parsed number, or a failure if the parse was unsuccessful.
|
||||
*/
|
||||
public NumberParseResult parse(CharSequence value, boolean integer) {
|
||||
// Ensure that if we are allowing exponential notation, we are not parsing an integer.
|
||||
assert !(allowExponentialNotation && integer);
|
||||
|
||||
// State
|
||||
boolean lastWasWhitespace = false;
|
||||
boolean encounteredContent = false;
|
||||
boolean encounteredSign = false;
|
||||
boolean needsNegating = false;
|
||||
NumberParseResultSuccess number = null;
|
||||
String symbol = "";
|
||||
|
||||
// Scan the value
|
||||
int idx = 0;
|
||||
int length = value.length();
|
||||
while (idx < length) {
|
||||
char c = value.charAt(idx);
|
||||
|
||||
if (Character.isWhitespace(c)) {
|
||||
if (!allowLeadingTrailingWhitespace && !encounteredContent) {
|
||||
return new NumberParseFailure("Unexpected leading Whitespace.");
|
||||
}
|
||||
|
||||
if (idx > 0 && (value.charAt(idx - 1) == '-' || value.charAt(idx - 1) == '+')) {
|
||||
return new NumberParseFailure("Unexpected whitespace after sign.");
|
||||
}
|
||||
|
||||
idx++;
|
||||
lastWasWhitespace = true;
|
||||
} else {
|
||||
encounteredContent = true;
|
||||
lastWasWhitespace = false;
|
||||
|
||||
if (NumberWithSeparators.isDigit(c) || Separators.isSeparator(c)) {
|
||||
if (number != null) {
|
||||
return new NumberParseFailure("Multiple Number Sections.");
|
||||
}
|
||||
|
||||
var numberPart =
|
||||
numberWithSeparators.parse(value, idx, integer, allowExponentialNotation);
|
||||
|
||||
// If the format changed, catch new format and unwrap result.
|
||||
if (numberPart instanceof NumberWithSeparators.NumberParseResultWithFormat newFormat) {
|
||||
numberWithSeparators = newFormat.format();
|
||||
numberPart = newFormat.result();
|
||||
}
|
||||
|
||||
// Result should either be a new index or a failure.
|
||||
// If it is a new index, update the index and unwrap the result.
|
||||
if (numberPart instanceof NumberWithSeparators.NumberParseResultWithIndex newIndex) {
|
||||
// Check for leading zeroes (0 or 0<decimal> is acceptable).
|
||||
if (!allowLeadingZeroes
|
||||
&& c == '0'
|
||||
&& newIndex.endIdx() > idx + 1
|
||||
&& value.charAt(idx + 1) != numberWithSeparators.getDecimal()) {
|
||||
return new NumberParseFailure("Leading Zero.");
|
||||
}
|
||||
|
||||
idx = newIndex.endIdx();
|
||||
numberPart = newIndex.result();
|
||||
}
|
||||
|
||||
if (numberPart instanceof NumberParseResultSuccess numberSuccess) {
|
||||
number = numberSuccess;
|
||||
} else {
|
||||
return numberPart;
|
||||
}
|
||||
} else if (NegativeSign.isOpenSign(c)) {
|
||||
if (encounteredSign || number != null) {
|
||||
return new NumberParseFailure("Unexpected sign character.");
|
||||
}
|
||||
|
||||
var signOk = negativeSign.checkValid(c);
|
||||
if (signOk.isEmpty()) {
|
||||
return new NumberParseFailure("Inconsistent negative format.");
|
||||
}
|
||||
|
||||
negativeSign = signOk.get();
|
||||
encounteredSign = true;
|
||||
needsNegating = c != '+';
|
||||
idx++;
|
||||
} else if (c == ')') {
|
||||
if (!needsNegating || negativeSign != NegativeSign.BRACKET_OPEN || number == null) {
|
||||
return new NumberParseFailure("Unexpected bracket close.");
|
||||
}
|
||||
|
||||
// Should only be whitespace left.
|
||||
idx++;
|
||||
while (idx < length) {
|
||||
if (!Character.isWhitespace(value.charAt(idx))) {
|
||||
return new NumberParseFailure("Unexpected characters after bracket close.");
|
||||
}
|
||||
idx++;
|
||||
lastWasWhitespace = true;
|
||||
}
|
||||
|
||||
// Negate here so can tell finished.
|
||||
number = number.negate();
|
||||
needsNegating = false;
|
||||
} else if (!integer
|
||||
&& number == null
|
||||
&& isSameSequence(value, idx, "infinity", "INFINITY")) {
|
||||
// Identify Infinity
|
||||
number = new NumberParseDouble(Double.POSITIVE_INFINITY, "");
|
||||
idx += 8;
|
||||
} else if (!integer
|
||||
&& number == null
|
||||
&& !encounteredSign
|
||||
&& !needsNegating
|
||||
&& isSameSequence(value, idx, "nan", "NAN")) {
|
||||
// Identify NaN
|
||||
number = new NumberParseDouble(Double.NaN, "");
|
||||
idx += 3;
|
||||
} else {
|
||||
if (!symbol.isEmpty()) {
|
||||
return new NumberParseFailure("Multiple Symbol Sections.");
|
||||
}
|
||||
|
||||
if (!allowSymbol) {
|
||||
return new NumberParseFailure("Symbols not allowed.");
|
||||
}
|
||||
|
||||
// ToDo: Locking symbol position within text parts.
|
||||
int endIdx = idx;
|
||||
while (endIdx < length
|
||||
&& !NumberWithSeparators.isDigit(c)
|
||||
&& !Separators.isSeparator(c)
|
||||
&& !NegativeSign.isSign(c)
|
||||
&& !Character.isWhitespace(c)) {
|
||||
endIdx++;
|
||||
if (endIdx < length) {
|
||||
c = value.charAt(endIdx);
|
||||
}
|
||||
}
|
||||
|
||||
symbol = value.subSequence(idx, endIdx).toString();
|
||||
idx = endIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check for trailing whitespace.
|
||||
if (!allowLeadingTrailingWhitespace && lastWasWhitespace) {
|
||||
return new NumberParseFailure("Trailing Whitespace.");
|
||||
}
|
||||
|
||||
// Special check for unclosed bracket.
|
||||
if (negativeSign == NegativeSign.BRACKET_OPEN && needsNegating) {
|
||||
return new NumberParseFailure("Unclosed bracket.");
|
||||
}
|
||||
|
||||
// Fail if no number found.
|
||||
if (number == null) {
|
||||
return new NumberParseFailure("No Number Found.");
|
||||
}
|
||||
|
||||
// Return Result
|
||||
number = needsNegating ? number.negate() : number;
|
||||
|
||||
// Handle Special Case of Negated 0 If Not An Integer
|
||||
if (!integer
|
||||
&& number instanceof NumberParseLong longNumber
|
||||
&& longNumber.number() == 0
|
||||
&& longNumber.negated()) {
|
||||
// Catch -0 double.
|
||||
number = new NumberParseDouble(-0.0, longNumber.symbol());
|
||||
}
|
||||
|
||||
return symbol.isEmpty() ? number : number.withSymbol(symbol);
|
||||
}
|
||||
|
||||
public Long parseLong(CharSequence value) {
|
||||
var result = parse(value, true);
|
||||
if (result instanceof NumberParseLong numberSuccess) {
|
||||
return numberSuccess.number();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public Double parseDouble(CharSequence value) {
|
||||
var result = parse(value, false);
|
||||
if (result instanceof NumberParseDouble numberSuccess) {
|
||||
return numberSuccess.number();
|
||||
} else if (result instanceof NumberParseLong longNumber) {
|
||||
return (double) longNumber.number();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public NumberParseResult[] parseMany(CharSequence[] values, boolean integer) {
|
||||
var results = new NumberParseResult[values.length];
|
||||
|
||||
int i = 0;
|
||||
while (i < values.length) {
|
||||
var previous = numberWithSeparators;
|
||||
results[i] = parse(values[i], integer);
|
||||
|
||||
if (numberWithSeparators != previous
|
||||
&& ((previous == NumberWithSeparators.DOT_UNKNOWN
|
||||
&& numberWithSeparators != NumberWithSeparators.DOT_COMMA)
|
||||
|| (previous == NumberWithSeparators.COMMA_UNKNOWN
|
||||
&& numberWithSeparators != NumberWithSeparators.DOT_COMMA))) {
|
||||
// Start scan over, as format was incorrect.
|
||||
i = 0;
|
||||
} else {
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
private static boolean isSameSequence(
|
||||
CharSequence sequence, int index, CharSequence toMatchLower, CharSequence toMatchUpper) {
|
||||
assert toMatchLower.length() == toMatchUpper.length();
|
||||
if (index + toMatchLower.length() > sequence.length()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < toMatchLower.length(); i++) {
|
||||
char c = sequence.charAt(index + i);
|
||||
if (c != toMatchLower.charAt(i) && c != toMatchUpper.charAt(i)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
@ -0,0 +1,40 @@
|
||||
package org.enso.base.parser;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
/**
 * The style used to mark negative numbers. Starts as {@link #UNKNOWN} and is fixed by the first
 * sign character that is encountered.
 */
public enum NegativeSign {
  /** No sign encountered, so could be either. */
  UNKNOWN,
  /** Minus or Plus sign - e.g. +123 or -123. */
  MINUS,
  /** Brackets - e.g. (123) */
  BRACKET_OPEN;

  /**
   * Checks if the given character is a valid opening sign for the current state.
   *
   * <p>Only {@code '-'}, {@code '+'} and {@code '('} can ever be valid: {@code '-'} and {@code
   * '+'} are valid in the {@link #UNKNOWN} and {@link #MINUS} states, {@code '('} in the {@link
   * #UNKNOWN} and {@link #BRACKET_OPEN} states. Every other character is rejected in every state.
   *
   * @param c the character to check
   * @return the new state of the negative sign or Optional.empty if the character is invalid.
   */
  public Optional<NegativeSign> checkValid(char c) {
    boolean plusOrMinus = c == '-' || c == '+';
    boolean bracket = c == '(';

    NegativeSign next =
        switch (this) {
          case UNKNOWN -> plusOrMinus ? MINUS : (bracket ? BRACKET_OPEN : null);
            // Previously any character other than '(' was accepted here.
          case MINUS -> plusOrMinus ? this : null;
          case BRACKET_OPEN -> bracket ? this : null;
        };
    return Optional.ofNullable(next);
  }

  /** Whether the character can open a signed number: {@code '-'}, {@code '+'} or {@code '('}. */
  static boolean isOpenSign(char c) {
    return c == '-' || c == '+' || c == '(';
  }

  /** Whether the character closes a bracketed number: {@code ')'}. */
  static boolean isCloseSign(char c) {
    return c == ')';
  }

  /** Whether the character is any opening or closing sign character. */
  static boolean isSign(char c) {
    return isOpenSign(c) || isCloseSign(c);
  }
}
|
@ -0,0 +1,503 @@
|
||||
package org.enso.base.parser;
|
||||
|
||||
import java.util.Optional;
|
||||
import org.enso.base.parser.FormatDetectingNumberParser.NumberParseDouble;
|
||||
import org.enso.base.parser.FormatDetectingNumberParser.NumberParseFailure;
|
||||
import org.enso.base.parser.FormatDetectingNumberParser.NumberParseLong;
|
||||
import org.enso.base.parser.FormatDetectingNumberParser.NumberParseResult;
|
||||
|
||||
/**
|
||||
* Number parsing with separators. Specifies the universe of number formats that can be parsed. Two
|
||||
* special cases, where we default to English format over European:
|
||||
*
|
||||
* <ul>
|
||||
* <li>Encounter a single . or , with 3 trailing numbers.
|
||||
* <li>Could be either DOT_COMMA or COMMA_DOT.
|
||||
* <li>If a single . then uses DOT_UNKNOWN.
|
||||
* <li>If a single , then uses COMMA_UNKNOWN.
|
||||
* </ul>
|
||||
*/
|
||||
public enum NumberWithSeparators {
|
||||
UNKNOWN(Constants.UNKNOWN, Constants.UNKNOWN),
|
||||
|
||||
// Special case where we have encountered a . with 3 trailing digits. Such as
|
||||
// ##0.123 ###.123
|
||||
DOT_UNKNOWN(Constants.UNKNOWN, '.'),
|
||||
// Special case where we have encountered a single . within 3 digits from
|
||||
// start and without 3 digits from end. Such as ##3.1# or ##3.1415...
|
||||
UNKNOWN_DOT(Constants.UNKNOWN, '.'),
|
||||
// Special case where we have encountered a , with 3 trailing digits. Such as
|
||||
// ##0,123 ###,123
|
||||
COMMA_UNKNOWN(',', Constants.UNKNOWN),
|
||||
// Special case where we have encountered a single , within 3 digits from
|
||||
// start and without 3 digits from end. Such as ##3,1# or ##3,1415...
|
||||
UNKNOWN_COMMA(Constants.UNKNOWN, ','),
|
||||
|
||||
NO_UNKNOWN(Constants.NONE, Constants.UNKNOWN),
|
||||
NO_DOT(Constants.NONE, '.'),
|
||||
NO_COMMA(Constants.NONE, ','),
|
||||
|
||||
// European format (e.g. 1.234,56)
|
||||
DOT_COMMA('.', ','),
|
||||
|
||||
// English format (e.g. 1,234.56)
|
||||
COMMA_DOT(',', '.'),
|
||||
|
||||
SPACE_UNKNOWN(' ', Constants.UNKNOWN),
|
||||
SPACE_DOT(' ', '.'),
|
||||
SPACE_COMMA(' ', ','),
|
||||
|
||||
SWISS_UNKNOWN('\'', Constants.UNKNOWN),
|
||||
SWISS_DOT('\'', '.'),
|
||||
SWISS_COMMA('\'', ','),
|
||||
|
||||
UNDERSCORE_UNKNOWN('_', Constants.UNKNOWN),
|
||||
UNDERSCORE_DOT('_', '.'),
|
||||
UNDERSCORE_COMMA('_', ',');
|
||||
|
||||
public static NumberWithSeparators fromSeparators(String thousand, String decimal) {
|
||||
if (thousand != null && thousand.length() > 1) {
|
||||
throw new IllegalArgumentException("Invalid thousand separator (more than one character).");
|
||||
}
|
||||
|
||||
if (decimal != null && decimal.length() > 1) {
|
||||
throw new IllegalArgumentException("Invalid decimal separator (more than one character).");
|
||||
}
|
||||
|
||||
char thousands =
|
||||
thousand == null
|
||||
? Constants.UNKNOWN
|
||||
: (thousand.isEmpty() ? Constants.NONE : thousand.charAt(0));
|
||||
char decimals =
|
||||
decimal == null
|
||||
? Constants.UNKNOWN
|
||||
: (decimal.isEmpty() ? Constants.NONE : decimal.charAt(0));
|
||||
|
||||
Optional<NumberWithSeparators> matched =
|
||||
switch (thousands) {
|
||||
case Constants.NONE -> matchForNone(decimals);
|
||||
case Constants.UNKNOWN -> matchForUnknown(decimals);
|
||||
case ',' -> switch (decimals) {
|
||||
case Constants.UNKNOWN, Constants.NONE, '.' -> Optional.of(COMMA_DOT);
|
||||
default -> Optional.empty();
|
||||
};
|
||||
case '.' -> switch (decimals) {
|
||||
case Constants.UNKNOWN, Constants.NONE, ',' -> Optional.of(DOT_COMMA);
|
||||
default -> Optional.empty();
|
||||
};
|
||||
case ' ' -> matchForSpace(decimals);
|
||||
case '\'' -> matchForSwiss(decimals);
|
||||
case '_' -> matchForUnderscore(decimals);
|
||||
default -> Optional.empty();
|
||||
};
|
||||
|
||||
if (matched.isEmpty()) {
|
||||
throw new IllegalArgumentException("Invalid separators.");
|
||||
}
|
||||
return matched.get();
|
||||
}
|
||||
|
||||
private static Optional<NumberWithSeparators> matchForNone(char decimal) {
|
||||
return switch (decimal) {
|
||||
case Constants.UNKNOWN -> Optional.of(NO_UNKNOWN);
|
||||
case '.' -> Optional.of(NO_DOT);
|
||||
case ',' -> Optional.of(NO_COMMA);
|
||||
default -> Optional.empty();
|
||||
};
|
||||
}
|
||||
|
||||
private static Optional<NumberWithSeparators> matchForUnknown(char decimal) {
|
||||
return switch (decimal) {
|
||||
case Constants.UNKNOWN -> Optional.of(UNKNOWN);
|
||||
case '.' -> Optional.of(UNKNOWN_DOT);
|
||||
case ',' -> Optional.of(UNKNOWN_COMMA);
|
||||
default -> Optional.empty();
|
||||
};
|
||||
}
|
||||
|
||||
private static Optional<NumberWithSeparators> matchForSpace(char decimal) {
|
||||
return switch (decimal) {
|
||||
case Constants.UNKNOWN -> Optional.of(SPACE_UNKNOWN);
|
||||
case '.' -> Optional.of(SPACE_DOT);
|
||||
case ',' -> Optional.of(SPACE_COMMA);
|
||||
default -> Optional.empty();
|
||||
};
|
||||
}
|
||||
|
||||
private static Optional<NumberWithSeparators> matchForSwiss(char decimal) {
|
||||
return switch (decimal) {
|
||||
case Constants.UNKNOWN -> Optional.of(SWISS_UNKNOWN);
|
||||
case '.' -> Optional.of(SWISS_DOT);
|
||||
case ',' -> Optional.of(SWISS_COMMA);
|
||||
default -> Optional.empty();
|
||||
};
|
||||
}
|
||||
|
||||
private static Optional<NumberWithSeparators> matchForUnderscore(char decimal) {
|
||||
return switch (decimal) {
|
||||
case Constants.UNKNOWN -> Optional.of(UNDERSCORE_UNKNOWN);
|
||||
case '.' -> Optional.of(UNDERSCORE_DOT);
|
||||
case ',' -> Optional.of(UNDERSCORE_COMMA);
|
||||
default -> Optional.empty();
|
||||
};
|
||||
}
|
||||
|
||||
static class Constants {
|
||||
static final char NONE = '\0';
|
||||
static final char UNKNOWN = '\uFFFD';
|
||||
}
|
||||
|
||||
static boolean isDigit(char c) {
|
||||
return (c >= '0' && c <= '9');
|
||||
}
|
||||
|
||||
private final char thousands;
|
||||
private final char decimal;
|
||||
|
||||
/**
 * @param thousands the thousands separator, or one of the {@link Constants} sentinels.
 * @param decimal the decimal separator, or one of the {@link Constants} sentinels.
 */
NumberWithSeparators(char thousands, char decimal) {
  this.decimal = decimal;
  this.thousands = thousands;
}
|
||||
|
||||
public char getThousands() {
|
||||
return thousands;
|
||||
}
|
||||
|
||||
public char getDecimal() {
|
||||
return decimal;
|
||||
}
|
||||
|
||||
/**
|
||||
* While currently the format is treated as English, could be incorrect and actually is European.
|
||||
*/
|
||||
public boolean mightBeEuropean() {
|
||||
return this == COMMA_UNKNOWN || this == DOT_UNKNOWN;
|
||||
}
|
||||
|
||||
NumberParseResult parse(
|
||||
CharSequence value, int idx, boolean integer, boolean allowExponentialNotation) {
|
||||
var separators = Separators.parse(value, idx, integer, allowExponentialNotation);
|
||||
// TODO: Add more detail on separator failure.
|
||||
if (separators == null) {
|
||||
return new NumberParseFailure("Invalid separators.");
|
||||
}
|
||||
|
||||
if (thousands != Constants.UNKNOWN && (integer || decimal != Constants.UNKNOWN)) {
|
||||
// If we have a fixed format then we can parse the number.
|
||||
return integer
|
||||
? parseFixedInteger(value, idx, separators.endIdx(), separators.first())
|
||||
: parseFixedDecimal(
|
||||
value,
|
||||
idx,
|
||||
separators.endIdx(),
|
||||
separators.first(),
|
||||
separators.second(),
|
||||
separators.exponential());
|
||||
}
|
||||
|
||||
return integer
|
||||
? parseUnknownInteger(
|
||||
value, idx, separators.endIdx(), separators.first(), separators.count())
|
||||
: parseUnknownDecimal(
|
||||
value,
|
||||
idx,
|
||||
separators.endIdx(),
|
||||
separators.first(),
|
||||
separators.second(),
|
||||
separators.count(),
|
||||
separators.lastSeparatorIdx(),
|
||||
separators.exponential());
|
||||
}
|
||||
|
||||
/**
 * Internal record for returning when a new format is matched: carries the newly matched format
 * together with the parse result obtained with it.
 */
|
||||
record NumberParseResultWithFormat(NumberWithSeparators format, NumberParseResult result)
|
||||
implements NumberParseResult {}
|
||||
|
||||
/** Internal record for returning the end index of the matched number. */
|
||||
record NumberParseResultWithIndex(int endIdx, NumberParseResult result)
|
||||
implements NumberParseResult {
|
||||
|
||||
boolean exceedsThousand() {
|
||||
return switch (result) {
|
||||
case NumberParseLong lngValue -> lngValue.number() >= 1000;
|
||||
case NumberParseDouble dblValue -> dblValue.number() >= 1000;
|
||||
default -> false;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/** Given a known integer format, parse the sequence. */
|
||||
private NumberParseResult parseFixedInteger(
|
||||
CharSequence value, int idx, int endIdx, char firstSeparator) {
|
||||
assert thousands != Constants.UNKNOWN;
|
||||
|
||||
// Strip out the separators.
|
||||
int origEndIdx = endIdx;
|
||||
if (thousands != Constants.NONE) {
|
||||
value = Separators.strip(value, idx, endIdx, thousands, decimal);
|
||||
if (value == null) {
|
||||
return new NumberParseFailure("Invalid number.");
|
||||
}
|
||||
idx = 0;
|
||||
endIdx = value.length();
|
||||
}
|
||||
|
||||
try {
|
||||
long number = Long.parseLong(value, idx, endIdx, 10);
|
||||
return new NumberParseResultWithIndex(origEndIdx, new NumberParseLong(number, "", false));
|
||||
} catch (NumberFormatException e) {
|
||||
return new NumberParseFailure("Invalid number.");
|
||||
}
|
||||
}
|
||||
|
||||
/** Parse an unknown format with no separators. */
|
||||
private NumberParseResult parseUnknownIntegerNone(CharSequence value, int idx, int endIdx) {
|
||||
assert thousands == Constants.UNKNOWN;
|
||||
|
||||
// We haven't encountered any separators. So parse the number as a long.
|
||||
try {
|
||||
long number = Long.parseLong(value, idx, endIdx, 10);
|
||||
var result = new NumberParseResultWithIndex(endIdx, new NumberParseLong(number, "", false));
|
||||
|
||||
// If greater than or equal 1000, then we know no thousand separators.
|
||||
if (number >= 1000) {
|
||||
var format =
|
||||
switch (decimal) {
|
||||
case '.' -> NO_DOT;
|
||||
case ',' -> NO_COMMA;
|
||||
default -> NO_UNKNOWN;
|
||||
};
|
||||
|
||||
if (this != format) {
|
||||
return new NumberParseResultWithFormat(format, result);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
} catch (NumberFormatException e) {
|
||||
return new NumberParseFailure("Invalid number.");
|
||||
}
|
||||
}
|
||||
|
||||
/** Parse an unknown Integer format. */
|
||||
private NumberParseResult parseUnknownInteger(
|
||||
CharSequence value, int idx, int endIdx, char separator, int separatorCount) {
|
||||
assert thousands == Constants.UNKNOWN;
|
||||
|
||||
if (separator == decimal) {
|
||||
// Encountered a decimal point, so can't be an integer.
|
||||
return new NumberParseFailure("Encountered Decimal Point - Can't Be Integer.");
|
||||
}
|
||||
|
||||
if (separator == Constants.NONE) {
|
||||
// Didn't encounter any separators so use simpler logic.
|
||||
return parseUnknownIntegerNone(value, idx, endIdx);
|
||||
}
|
||||
|
||||
// Find the correct format
|
||||
var format =
|
||||
switch (separator) {
|
||||
case '.' -> DOT_COMMA;
|
||||
case ',' -> separatorCount == 1 ? COMMA_UNKNOWN : COMMA_DOT;
|
||||
case ' ' -> (decimal == Constants.UNKNOWN
|
||||
? SPACE_UNKNOWN
|
||||
: (decimal == '.' ? SPACE_DOT : SPACE_COMMA));
|
||||
case '\'' -> (decimal == Constants.UNKNOWN
|
||||
? SWISS_UNKNOWN
|
||||
: (decimal == '.' ? SWISS_DOT : SWISS_COMMA));
|
||||
default -> null;
|
||||
};
|
||||
if (format == null) {
|
||||
return new NumberParseFailure("No matching number format.");
|
||||
}
|
||||
|
||||
var result = format.parseFixedInteger(value, idx, endIdx, separator);
|
||||
return (result instanceof NumberParseFailure)
|
||||
? result
|
||||
: new NumberParseResultWithFormat(format, result);
|
||||
}
|
||||
|
||||
/** Given a known double format, parse the sequence. */
|
||||
private NumberParseResult parseFixedDecimal(
|
||||
CharSequence value,
|
||||
int idx,
|
||||
int endIdx,
|
||||
char firstSeparator,
|
||||
char secondSeparator,
|
||||
boolean exponential) {
|
||||
// Deal with the special cases first.
|
||||
if (this == DOT_UNKNOWN || this == UNKNOWN_DOT) {
|
||||
// Haven't encountered a thousand separator, but know the decimal separator.
|
||||
// If DOT_UNKNOWN then could be European or English, but treat as English.
|
||||
assert firstSeparator == '.' && secondSeparator == Constants.NONE;
|
||||
return NO_DOT.parseFixedDecimal(
|
||||
value, idx, endIdx, firstSeparator, secondSeparator, exponential);
|
||||
} else if (this == COMMA_UNKNOWN) {
|
||||
// Have only encountered a Comma(s), so treat as English format (COMMA_DOT).
|
||||
assert firstSeparator == ',' && secondSeparator == Constants.NONE;
|
||||
return COMMA_DOT.parseFixedDecimal(
|
||||
value, idx, endIdx, firstSeparator, secondSeparator, exponential);
|
||||
} else if (this == UNKNOWN_COMMA) {
|
||||
// Have encountered a comma and know is a decimal separator.
|
||||
assert firstSeparator == ',' && secondSeparator == Constants.NONE;
|
||||
return NO_COMMA.parseFixedDecimal(
|
||||
value, idx, endIdx, firstSeparator, secondSeparator, exponential);
|
||||
}
|
||||
|
||||
assert thousands != Constants.UNKNOWN && decimal != Constants.UNKNOWN;
|
||||
|
||||
// If no decimal separator, then must be an integer.
|
||||
if (!exponential && firstSeparator != decimal && secondSeparator != decimal) {
|
||||
return parseFixedInteger(value, idx, endIdx, firstSeparator);
|
||||
}
|
||||
|
||||
// Validate Separators.
|
||||
if (firstSeparator != Constants.NONE) {
|
||||
if ((secondSeparator == Constants.NONE
|
||||
&& firstSeparator != thousands
|
||||
&& firstSeparator != decimal)
|
||||
|| (secondSeparator != Constants.NONE
|
||||
&& (firstSeparator != thousands || secondSeparator != decimal))) {
|
||||
return new NumberParseFailure("Invalid separator.");
|
||||
}
|
||||
}
|
||||
|
||||
// Strip out the separators.
|
||||
int origEndIdx = endIdx;
|
||||
if (thousands != Constants.NONE || decimal != '.') {
|
||||
value = Separators.strip(value, idx, endIdx, thousands, decimal);
|
||||
if (value == null) {
|
||||
return new NumberParseFailure("Invalid number.");
|
||||
}
|
||||
idx = 0;
|
||||
endIdx = value.length();
|
||||
}
|
||||
|
||||
try {
|
||||
double number = Double.parseDouble(value.subSequence(idx, endIdx).toString());
|
||||
return new NumberParseResultWithIndex(origEndIdx, new NumberParseDouble(number, ""));
|
||||
} catch (NumberFormatException e) {
|
||||
return new NumberParseFailure("Invalid number.");
|
||||
}
|
||||
}
|
||||
|
||||
/** Given an unknown format, parse the sequence. */
|
||||
private NumberParseResult parseUnknownDecimal(
|
||||
CharSequence value,
|
||||
int idx,
|
||||
int endIdx,
|
||||
char firstSeparator,
|
||||
char secondSeparator,
|
||||
int separatorCount,
|
||||
int lastSeparatorIdx,
|
||||
boolean exponential) {
|
||||
assert thousands == Constants.UNKNOWN || decimal == Constants.UNKNOWN;
|
||||
|
||||
// Special case when single separator equal to decimal point.
|
||||
if (separatorCount == 1 && firstSeparator == decimal) {
|
||||
var fixed = decimal == '.' ? NO_DOT : NO_COMMA;
|
||||
var result =
|
||||
fixed.parseFixedDecimal(value, idx, endIdx, Constants.NONE, decimal, exponential);
|
||||
if (result instanceof NumberParseResultWithIndex resultWithIndex
|
||||
&& resultWithIndex.exceedsThousand()) {
|
||||
return new NumberParseResultWithFormat(fixed, result);
|
||||
} else {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// Cases of no separators or repeated single separator - must be integer.
|
||||
if (!exponential
|
||||
&& (firstSeparator == Constants.NONE
|
||||
|| (secondSeparator == Constants.NONE
|
||||
&& (separatorCount > 1 || firstSeparator == ' ' || firstSeparator == '\'')))) {
|
||||
if (mightBeEuropean() && firstSeparator == '.') {
|
||||
// We know we are wrong.
|
||||
var result = DOT_COMMA.parseFixedInteger(value, idx, endIdx, '.');
|
||||
return (result instanceof NumberParseFailure)
|
||||
? result
|
||||
: new NumberParseResultWithFormat(DOT_COMMA, result);
|
||||
}
|
||||
|
||||
var result =
|
||||
thousands == Constants.UNKNOWN
|
||||
? parseUnknownInteger(value, idx, endIdx, firstSeparator, separatorCount)
|
||||
: parseFixedInteger(
|
||||
value, idx, endIdx, separatorCount == 0 ? thousands : firstSeparator);
|
||||
|
||||
// Special case if COMMA_UNKNOWN and count > 1 then is COMMA_DOT.
|
||||
boolean resolveCommaUnknown = this == COMMA_UNKNOWN && separatorCount > 1;
|
||||
return (result instanceof NumberParseFailure)
|
||||
? result
|
||||
: (resolveCommaUnknown ? new NumberParseResultWithFormat(COMMA_DOT, result) : result);
|
||||
}
|
||||
|
||||
// Case when in exponential notation and no separators.
|
||||
if (exponential && firstSeparator == Constants.NONE) {
|
||||
return NO_DOT.parseFixedDecimal(
|
||||
value, idx, endIdx, Constants.NONE, Constants.NONE, exponential);
|
||||
}
|
||||
|
||||
// Need to resolve the format.
|
||||
NumberWithSeparators format = null;
|
||||
if (secondSeparator != Constants.NONE) {
|
||||
format =
|
||||
switch (firstSeparator) {
|
||||
case '.' -> secondSeparator == ',' ? DOT_COMMA : null;
|
||||
case ',' -> secondSeparator == '.' ? COMMA_DOT : null;
|
||||
case ' ' -> secondSeparator == '.'
|
||||
? SPACE_DOT
|
||||
: secondSeparator == ',' ? SPACE_COMMA : null;
|
||||
case '\'' -> secondSeparator == '.'
|
||||
? SWISS_DOT
|
||||
: secondSeparator == ',' ? SWISS_COMMA : null;
|
||||
default -> null;
|
||||
};
|
||||
} else if (firstSeparator == '.') {
|
||||
// if separatorCount > 1, must be a thousand separator, hence DOT_COMMA (covered above).
|
||||
// if index of separator > 3, must be a decimal point without a thousand separator, hence
|
||||
// NO_DOT.
|
||||
// if 3 digits following then could be either, hence DOT_UNKNOWN.
|
||||
// Otherwise, must be decimal point, hence UNKNOWN_DOT.
|
||||
format =
|
||||
lastSeparatorIdx - idx > 3
|
||||
? NO_DOT
|
||||
: (lastSeparatorIdx != endIdx - 4
|
||||
? UNKNOWN_DOT
|
||||
: (decimal == ',' ? DOT_COMMA : DOT_UNKNOWN));
|
||||
} else if (firstSeparator == ',') {
|
||||
// if separatorCount > 1, must be a thousand separator, hence COMMA_DOT (covered above).
|
||||
// if index of separator > 3, must be a decimal point without a thousand separator, hence
|
||||
// NO_COMMA.
|
||||
// if 3 digits following then could be either, hence COMMA_UNKNOWN.
|
||||
// Otherwise, must be decimal point, hence UNKNOWN_COMMA.
|
||||
format =
|
||||
lastSeparatorIdx - idx > 3
|
||||
? NO_COMMA
|
||||
: (lastSeparatorIdx != endIdx - 4
|
||||
? UNKNOWN_COMMA
|
||||
: (decimal == '.' ? COMMA_DOT : COMMA_UNKNOWN));
|
||||
}
|
||||
if (format == null) {
|
||||
return new NumberParseFailure("No matching number format.");
|
||||
}
|
||||
|
||||
// Validate that the new format matches.
|
||||
if (this.mightBeEuropean()) {
|
||||
if (this == DOT_UNKNOWN && format.decimal != '.' && format.thousands != '.') {
|
||||
return new NumberParseFailure("Invalid format matched.");
|
||||
} else if (this == COMMA_UNKNOWN && format.decimal != ',' && format.thousands != ',') {
|
||||
return new NumberParseFailure("Invalid format matched.");
|
||||
}
|
||||
} else if ((thousands != Constants.UNKNOWN && format.thousands != thousands)
|
||||
|| (decimal != Constants.UNKNOWN && format.decimal != decimal)) {
|
||||
return new NumberParseFailure("Invalid format matched.");
|
||||
}
|
||||
|
||||
var result =
|
||||
format.parseFixedDecimal(value, idx, endIdx, firstSeparator, secondSeparator, exponential);
|
||||
return (result instanceof NumberParseFailure)
|
||||
? result
|
||||
: new NumberParseResultWithFormat(format, result);
|
||||
}
|
||||
}
|
239
std-bits/base/src/main/java/org/enso/base/parser/Separators.java
Normal file
239
std-bits/base/src/main/java/org/enso/base/parser/Separators.java
Normal file
@ -0,0 +1,239 @@
|
||||
package org.enso.base.parser;
|
||||
|
||||
import static org.enso.base.parser.NumberWithSeparators.isDigit;
|
||||
|
||||
import java.nio.CharBuffer;
|
||||
|
||||
/**
|
||||
* Record to hold information about the separators found in a number.
|
||||
*
|
||||
* @param first - the first encountered separator or Constants.NONE if none found.
|
||||
* @param second - the second distinct separator or Constants.NONE if none found.
|
||||
* @param count - the number of separators found.
|
||||
* @param endIdx - the index just past the last character of the number (exclusive end).
|
||||
* @param lastSeparatorIdx - the index of the last separator found.
|
||||
* @param exponential - whether the number is in exponential notation.
|
||||
*/
|
||||
public record Separators(
|
||||
char first, char second, int count, int endIdx, int lastSeparatorIdx, boolean exponential) {
|
||||
/**
|
||||
* Strip out the specified separators and replace with just full stop for decimal. If any
|
||||
* character other than a digit, thousands or decimal separator is encountered then return null.
|
||||
* If multiple decimal separators are encountered then return null.
|
||||
*/
|
||||
static CharSequence strip(
|
||||
CharSequence value, int startIdx, int endIdx, char thousands, char decimal) {
|
||||
int lastThousand = -1;
|
||||
boolean foundDecimal = false;
|
||||
char[] results = new char[endIdx - startIdx];
|
||||
int resultIdx = 0;
|
||||
for (int i = startIdx; i < endIdx; i++) {
|
||||
char c = value.charAt(i);
|
||||
if (c == decimal) {
|
||||
if (foundDecimal) {
|
||||
return null;
|
||||
}
|
||||
if (lastThousand != -1 && i != lastThousand + 4) {
|
||||
return null;
|
||||
}
|
||||
results[resultIdx++] = '.';
|
||||
foundDecimal = true;
|
||||
} else if (isDigit(c)) {
|
||||
results[resultIdx++] = c;
|
||||
} else if (c == thousands) {
|
||||
// Cannot have thousands post decimal separator.
|
||||
if (foundDecimal) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Must be 4 away from last thousand separator.
|
||||
if (lastThousand != -1) {
|
||||
if (i != lastThousand + 4) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
lastThousand = i;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
if (!foundDecimal && lastThousand != -1 && endIdx != lastThousand + 4) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return CharBuffer.wrap(results, 0, resultIdx);
|
||||
}
|
||||
|
||||
/** Check if the character is a separator. */
|
||||
static boolean isSeparator(char c) {
|
||||
return c == '.' || c == ',' || c == ' ' || c == '\'' || c == '_';
|
||||
}
|
||||
|
||||
/** Check if the character is a decimal separator. */
|
||||
private static boolean isDecimalSeparator(char c) {
|
||||
return c == '.' || c == ',';
|
||||
}
|
||||
|
||||
/** Check if the character is part of the current number. */
|
||||
private static boolean validChar(ExponentState exponentState, char c, char first, char second) {
|
||||
if (isDigit(c)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// If scientific notation is allowed then check for 'e' or 'E'.
|
||||
// Can then be followed by a +/- sign.
|
||||
if (exponentState == ExponentState.START && (c == 'e' || c == 'E')) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Sign can only be encountered after an E/e in scientific notation.
|
||||
if (exponentState == ExponentState.E_SIGN && (c == '+' || c == '-')) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Separators not valid in scientific notation if not in start.
|
||||
if (exponentState != ExponentState.START && exponentState != ExponentState.NOT_ALLOWED) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// We haven't encountered a separator yet, so valid if it is a separator.
|
||||
if (first == NumberWithSeparators.Constants.NONE) {
|
||||
return isSeparator(c);
|
||||
}
|
||||
|
||||
// We have encountered the first separator, so valid if it is the same as
|
||||
// the first or a decimal separator.
|
||||
if (second == NumberWithSeparators.Constants.NONE) {
|
||||
return c == first || isDecimalSeparator(c);
|
||||
}
|
||||
|
||||
// We have encountered the second separator, so invalid to encounter another
|
||||
// separator.
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the number and separators section. Validate the spacing of separators. Return the
|
||||
* separators found or null if invalid.
|
||||
*
|
||||
* @param value the value to parse.
|
||||
* @param idx the index to start parsing from.
|
||||
* @param integer if the number is an integer.
|
||||
* @param allowExponentialNotation is exponential notation allowed.
|
||||
*/
|
||||
static Separators parse(
|
||||
CharSequence value, int idx, boolean integer, boolean allowExponentialNotation) {
|
||||
int endIdx = idx;
|
||||
char firstSeparator = NumberWithSeparators.Constants.NONE;
|
||||
char secondSeparator = NumberWithSeparators.Constants.NONE;
|
||||
|
||||
boolean firstWasSeparator = false;
|
||||
int lastSeparator = -1;
|
||||
int separatorCount = 0;
|
||||
|
||||
// Set initial state for exponential notation.
|
||||
ExponentState exponentState =
|
||||
!integer && allowExponentialNotation ? ExponentState.START : ExponentState.NOT_ALLOWED;
|
||||
|
||||
// Scan the text, find and validate spacing of separators.
|
||||
// Space and ' are both valid thousands separators, but can't be second separator.
|
||||
for (endIdx = idx; endIdx < value.length(); endIdx++) {
|
||||
char c = value.charAt(endIdx);
|
||||
if (!validChar(exponentState, c, firstSeparator, secondSeparator)) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Cope with digits or scientific notation.
|
||||
if (isDigit(c) || c == 'e' || c == 'E' || c == '+' || c == '-') {
|
||||
// Update Exponent State.
|
||||
if (c == 'e' || c == 'E') {
|
||||
exponentState = ExponentState.E_SIGN;
|
||||
} else if (c == '+' || c == '-') {
|
||||
exponentState = ExponentState.SIGN;
|
||||
} else if (exponentState == ExponentState.SIGN || exponentState == ExponentState.E_SIGN) {
|
||||
exponentState = ExponentState.EXPONENT;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// If first digit is a separator then only valid if a decimal separator.
|
||||
if (endIdx == idx) {
|
||||
if (integer || !isDecimalSeparator(c)) {
|
||||
return null;
|
||||
}
|
||||
firstWasSeparator = true;
|
||||
}
|
||||
|
||||
if (firstSeparator == NumberWithSeparators.Constants.NONE) {
|
||||
// Found the first separator.
|
||||
firstSeparator = c;
|
||||
} else {
|
||||
// TODO: This check is probably now redundant as strip does it as well.
|
||||
// Encountered another separator - must be 4 away from last separator.
|
||||
if (endIdx != lastSeparator + 4) {
|
||||
// Special case if last was a space as could be separating symbol.
|
||||
if (c == ' ') {
|
||||
break;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// Must have been a decimal separator.
|
||||
if (firstWasSeparator) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Encountered a second separator, only valid if !integer.
|
||||
if (firstSeparator != c) {
|
||||
if (!integer) {
|
||||
secondSeparator = c;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lastSeparator = endIdx;
|
||||
separatorCount++;
|
||||
}
|
||||
|
||||
// Special case when firstSeparator is a space and no secondSeparator and ending with a space.
|
||||
if (firstSeparator == ' ' && value.charAt(endIdx - 1) == ' ') {
|
||||
separatorCount--;
|
||||
endIdx--;
|
||||
lastSeparator -= 4;
|
||||
if (separatorCount == 0) {
|
||||
firstSeparator = NumberWithSeparators.Constants.NONE;
|
||||
}
|
||||
}
|
||||
|
||||
// If in integer mode then must be a thousand separator, validate final spacing.
|
||||
if (integer && separatorCount > 0 && lastSeparator != endIdx - 4) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return new Separators(
|
||||
firstSeparator,
|
||||
secondSeparator,
|
||||
separatorCount,
|
||||
endIdx,
|
||||
lastSeparator,
|
||||
exponentState == ExponentState.EXPONENT);
|
||||
}
|
||||
|
||||
private enum ExponentState {
|
||||
/** Scientific notation not allowed. */
|
||||
NOT_ALLOWED,
|
||||
/** Have not encountered an E/e yet. */
|
||||
START,
|
||||
/** Have encountered an E/e. */
|
||||
E_SIGN,
|
||||
/** Have encountered an E/e and a sign. */
|
||||
SIGN,
|
||||
/** Have encountered an E/e, a sign and a digit. */
|
||||
EXPONENT
|
||||
}
|
||||
}
|
@ -1,11 +1,8 @@
|
||||
package org.enso.table.parsing;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.regex.Pattern;
|
||||
import org.enso.base.parser.FormatDetectingNumberParser;
|
||||
import org.enso.base.parser.NegativeSign;
|
||||
import org.enso.base.parser.NumberWithSeparators;
|
||||
import org.enso.table.data.column.builder.Builder;
|
||||
import org.enso.table.data.column.builder.NumericBuilder;
|
||||
import org.enso.table.data.column.storage.Storage;
|
||||
@ -15,178 +12,11 @@ import org.enso.table.parsing.problems.ParseProblemAggregator;
|
||||
import org.enso.table.problems.ProblemAggregator;
|
||||
import org.graalvm.polyglot.Context;
|
||||
|
||||
/**
|
||||
* A parser for numbers.
|
||||
*
|
||||
* <p>This parser will attempt to work out what the decimal point and thousand separators used in
|
||||
* the input. It will try various ways of formatting a number and can be set to allow for scientific
|
||||
* notation, currency symbols.
|
||||
*
|
||||
* <p>If parsing a column it will select the format that parses the longest set without an issue
|
||||
* from the top and then apply this format to all the rows.
|
||||
*
|
||||
* <p>The separators will be tried in British, German, French and Swiss order. - Thousand separator
|
||||
* must be followed by groups of 3 numbers. - Scientific notation is only allowed on decimals and
|
||||
* must be on a value between -10 and 10. The notation is an `E` followed by an integer.
|
||||
*
|
||||
* <p>The following formats are supported: - Sign (+/-) followed by Number (e.g. +1,234.56) - Using
|
||||
* brackets to indicate a negative number (e.g. (1,234.56)) - Currency symbols (if enabled) can be
|
||||
* placed before or after the sign and number. - If using brackets, the currency symbol must be
|
||||
* placed after the opening bracket.
|
||||
*/
|
||||
public class NumberParser extends IncrementalDatatypeParser {
|
||||
private static final String SIGN = "(?<sign>[-+])?";
|
||||
private static final String BRACKETS = "(?<sign>\\((?=.*\\)\\s*$))?\\s*";
|
||||
private static final String BRACKET_CLOSE = "\\)?";
|
||||
private static final String CCY = "(?<ccy>[^0-9(),. '+-]+)";
|
||||
private static final String EXP = "(?<exp>[eE][+-]?\\d+)?";
|
||||
private static final String SPACE = "\\s*";
|
||||
|
||||
private record Separators(String thousand, String decimal) {}
|
||||
|
||||
private final Separators[] SEPARATORS;
|
||||
|
||||
private static final Map<String, Pattern> PATTERNS = new HashMap<>();
|
||||
private final IntegerType integerTargetType;
|
||||
|
||||
private static void validateSeparator(String name, String value) {
|
||||
if (value == null) return;
|
||||
|
||||
if (value.length() != 1) {
|
||||
throw new IllegalArgumentException(
|
||||
name + " must be a single character, but it was '" + value + "'.");
|
||||
}
|
||||
|
||||
// If we allowed separators to be a digit, super crazy stuff could happen - e.g. technically
|
||||
// 10000 could be interpreted as 1000 by interpreting the first 0 as a thousand separator. Let's
|
||||
// not do that.
|
||||
if (Character.isDigit(value.charAt(0))) {
|
||||
throw new IllegalArgumentException(name + " cannot be a digit, but it was '" + value + "'.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a list of possible separator pairs.
|
||||
*
|
||||
* <p>If one of the parameters is null, it is meant to be inferred (multiple separator pairs will
|
||||
* be provided for it), if it is set to a concrete value, it will be fixed.
|
||||
*/
|
||||
private static Separators[] buildSeparators(
|
||||
boolean allowDecimal, String decimalPoint, String thousandSeparator) {
|
||||
validateSeparator("Decimal point", decimalPoint);
|
||||
validateSeparator("Thousand separator", thousandSeparator);
|
||||
if (decimalPoint != null && decimalPoint.equals(thousandSeparator)) {
|
||||
throw new IllegalArgumentException(
|
||||
"Decimal point and thousand separator cannot be the same, but they were both '"
|
||||
+ decimalPoint
|
||||
+ "'.");
|
||||
}
|
||||
|
||||
boolean fullAutomaticMode = allowDecimal && decimalPoint == null && thousandSeparator == null;
|
||||
if (fullAutomaticMode) {
|
||||
return new Separators[] {
|
||||
new Separators(",", "."),
|
||||
new Separators(".", ","),
|
||||
new Separators(" ", ","),
|
||||
new Separators("'", ","),
|
||||
};
|
||||
}
|
||||
|
||||
List<String> thousandSeparators;
|
||||
if (thousandSeparator == null) {
|
||||
List<String> autoThousandSeparators = List.of(",", ".", "'", " ");
|
||||
thousandSeparators =
|
||||
autoThousandSeparators.stream().filter(sep -> !sep.equals(decimalPoint)).toList();
|
||||
} else {
|
||||
thousandSeparators = List.of(thousandSeparator);
|
||||
}
|
||||
|
||||
List<String> decimalPoints;
|
||||
if (decimalPoint == null) {
|
||||
if (allowDecimal) {
|
||||
List<String> autoDecimalPoints = List.of(",", ".");
|
||||
assert thousandSeparator != null;
|
||||
decimalPoints =
|
||||
autoDecimalPoints.stream().filter(sep -> !sep.equals(thousandSeparator)).toList();
|
||||
} else {
|
||||
// List.of(null) is not permitted...
|
||||
decimalPoints = new ArrayList<>();
|
||||
decimalPoints.add(null);
|
||||
}
|
||||
} else {
|
||||
decimalPoints = List.of(decimalPoint);
|
||||
}
|
||||
|
||||
return thousandSeparators.stream()
|
||||
.flatMap(
|
||||
thousand -> decimalPoints.stream().map(decimal -> new Separators(thousand, decimal)))
|
||||
.toArray(Separators[]::new);
|
||||
}
|
||||
|
||||
/** The number of patterns that are allowed for non-currency numbers. */
|
||||
private static final int ALLOWED_NON_CCY_PATTERNS = 2;
|
||||
|
||||
/** The number of patterns that are allowed for currency numbers. */
|
||||
private static final int ALLOWED_CCY_PATTERNS = 6;
|
||||
|
||||
private static Pattern buildPattern(
|
||||
boolean allowDecimal,
|
||||
boolean allowCurrency,
|
||||
boolean allowScientific,
|
||||
boolean trimValues,
|
||||
int patternIndex,
|
||||
Separators separators) {
|
||||
if (allowScientific && !allowDecimal) {
|
||||
throw new IllegalArgumentException("Scientific notation requires decimal numbers.");
|
||||
}
|
||||
|
||||
if (patternIndex >= (allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
String INTEGER =
|
||||
"(?<integer>(\\d*)"
|
||||
+ (separators.thousand == null
|
||||
? ""
|
||||
: "|(\\d{1,3}([" + separators.thousand + "]\\d{3})*)")
|
||||
+ ")";
|
||||
|
||||
String decimalPoint = allowDecimal ? Objects.requireNonNull(separators.decimal) : null;
|
||||
var NUMBER =
|
||||
INTEGER
|
||||
+ (allowDecimal ? "(?<decimal>[" + decimalPoint + "]\\d*)?" : "")
|
||||
+ (allowScientific ? EXP : "");
|
||||
|
||||
var pattern =
|
||||
switch (patternIndex) {
|
||||
case 0 -> SIGN + NUMBER;
|
||||
case 1 -> BRACKETS + NUMBER + BRACKET_CLOSE;
|
||||
case 2 -> SIGN + CCY + SPACE + NUMBER;
|
||||
case 3 -> CCY + SPACE + SIGN + NUMBER;
|
||||
case 4 -> SIGN + NUMBER + CCY;
|
||||
case 5 -> BRACKETS + CCY + SPACE + NUMBER + BRACKET_CLOSE;
|
||||
default -> throw new IllegalArgumentException("Invalid pattern index: " + patternIndex);
|
||||
};
|
||||
|
||||
if (trimValues) {
|
||||
pattern = SPACE + pattern + SPACE;
|
||||
}
|
||||
|
||||
return PATTERNS.computeIfAbsent("^" + pattern + "$", Pattern::compile);
|
||||
}
|
||||
|
||||
private final boolean allowDecimal;
|
||||
private final boolean allowCurrency;
|
||||
private final boolean allowLeadingZeros;
|
||||
private final boolean allowScientific;
|
||||
private final boolean trimValues;
|
||||
|
||||
/**
|
||||
* Creates a new integer instance of this parser.
|
||||
*
|
||||
* @param integerTargetType the target type describing how large integer values can be accepted
|
||||
* @param allowCurrency whether to allow currency symbols
|
||||
* @param allowLeadingZeros whether to allow leading zeros
|
||||
* @param trimValues whether to trim the input values
|
||||
* @param decimalPoint the decimal point set for the current format, or null if not specified;
|
||||
* this parser does not use decimal point (since it is for integers) but it ensures that if a
|
||||
@ -196,16 +26,16 @@ public class NumberParser extends IncrementalDatatypeParser {
|
||||
*/
|
||||
public static NumberParser createIntegerParser(
|
||||
IntegerType integerTargetType,
|
||||
boolean allowCurrency,
|
||||
boolean allowLeadingZeros,
|
||||
boolean allowSymbol,
|
||||
boolean allowLeadingZeroes,
|
||||
boolean trimValues,
|
||||
String decimalPoint,
|
||||
String thousandSeparator) {
|
||||
assert integerTargetType != null;
|
||||
return new NumberParser(
|
||||
false,
|
||||
integerTargetType,
|
||||
allowCurrency,
|
||||
allowLeadingZeros,
|
||||
allowSymbol,
|
||||
allowLeadingZeroes,
|
||||
trimValues,
|
||||
false,
|
||||
decimalPoint,
|
||||
@ -215,240 +45,125 @@ public class NumberParser extends IncrementalDatatypeParser {
|
||||
/**
|
||||
* Creates a new decimal instance of this parser.
|
||||
*
|
||||
* @param allowCurrency whether to allow currency symbols
|
||||
* @param allowLeadingZeros whether to allow leading zeros
|
||||
* @param allowSymbol whether to allow symbols in the input
|
||||
* @param allowLeadingZeroes whether to allow leading zeroes in the input
|
||||
* @param trimValues whether to trim the input values
|
||||
* @param allowScientific whether to allow scientific notation
|
||||
* @param decimalPoint the decimal separator to use (if null, then will be inferred)
|
||||
* @param thousandSeparator the thousand separator to use (if null, then will be inferred)
|
||||
* @param allowExponentialNotation whether to allow exponential notation in the input
|
||||
* @param decimalPoint the decimal point set for the current format (if null then will be
|
||||
* inferred)
|
||||
* @param thousandSeparator the thousand separator to use (if null then will be inferred)
|
||||
*/
|
||||
public static NumberParser createDecimalParser(
|
||||
boolean allowCurrency,
|
||||
boolean allowLeadingZeros,
|
||||
boolean allowSymbol,
|
||||
boolean allowLeadingZeroes,
|
||||
boolean trimValues,
|
||||
boolean allowScientific,
|
||||
boolean allowExponentialNotation,
|
||||
String decimalPoint,
|
||||
String thousandSeparator) {
|
||||
return new NumberParser(
|
||||
true,
|
||||
null,
|
||||
allowCurrency,
|
||||
allowLeadingZeros,
|
||||
allowSymbol,
|
||||
allowLeadingZeroes,
|
||||
trimValues,
|
||||
allowScientific,
|
||||
allowExponentialNotation,
|
||||
decimalPoint,
|
||||
thousandSeparator);
|
||||
}
|
||||
|
||||
private final IntegerType integerTargetType;
|
||||
|
||||
private final FormatDetectingNumberParser parser;
|
||||
|
||||
private NumberParser(
|
||||
boolean allowDecimal,
|
||||
IntegerType integerTargetType,
|
||||
boolean allowCurrency,
|
||||
boolean allowLeadingZeros,
|
||||
boolean trimValues,
|
||||
boolean allowScientific,
|
||||
boolean allowSymbol,
|
||||
boolean allowLeadingZeroes,
|
||||
boolean allowLeadingTrailingWhitespace,
|
||||
boolean allowExponentialNotation,
|
||||
String decimalPoint,
|
||||
String thousandSeparator) {
|
||||
this.allowDecimal = allowDecimal;
|
||||
this.integerTargetType = integerTargetType;
|
||||
this.allowCurrency = allowCurrency;
|
||||
this.allowLeadingZeros = allowLeadingZeros;
|
||||
this.trimValues = trimValues;
|
||||
this.allowScientific = allowScientific;
|
||||
SEPARATORS = buildSeparators(allowDecimal, decimalPoint, thousandSeparator);
|
||||
|
||||
var numberWithSeparators = NumberWithSeparators.fromSeparators(thousandSeparator, decimalPoint);
|
||||
this.parser =
|
||||
new FormatDetectingNumberParser(
|
||||
allowSymbol,
|
||||
allowLeadingZeroes,
|
||||
allowLeadingTrailingWhitespace,
|
||||
allowExponentialNotation,
|
||||
NegativeSign.UNKNOWN,
|
||||
numberWithSeparators);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a Pattern for the given index. The index will be decoded into a specific set of
|
||||
* separators (unless fixed separators are used) and then paired with one of the valid patterns
|
||||
* for the given parser.
|
||||
*/
|
||||
private Pattern patternForIndex(int index) {
|
||||
int allowedSet = (allowCurrency ? ALLOWED_CCY_PATTERNS : ALLOWED_NON_CCY_PATTERNS);
|
||||
int separatorsIndex = index / allowedSet;
|
||||
int patternIndex = index % allowedSet;
|
||||
|
||||
if (separatorsIndex >= SEPARATORS.length) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return buildPattern(
|
||||
allowDecimal,
|
||||
allowCurrency,
|
||||
allowScientific,
|
||||
trimValues,
|
||||
patternIndex,
|
||||
SEPARATORS[separatorsIndex]);
|
||||
private boolean isInteger() {
|
||||
return integerTargetType != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object parseSingleValue(String text, ParseProblemAggregator problemAggregator) {
|
||||
int index = 0;
|
||||
var pattern = patternForIndex(index);
|
||||
while (pattern != null) {
|
||||
var value = innerParseSingleValue(text, pattern);
|
||||
if (value != null) {
|
||||
return value;
|
||||
}
|
||||
|
||||
index++;
|
||||
pattern = patternForIndex(index);
|
||||
}
|
||||
|
||||
problemAggregator.reportInvalidFormat(text);
|
||||
return null;
|
||||
protected Builder makeBuilderWithCapacity(int capacity, ProblemAggregator problemAggregator) {
|
||||
return isInteger()
|
||||
? NumericBuilder.createLongBuilder(capacity, integerTargetType, problemAggregator)
|
||||
: NumericBuilder.createDoubleBuilder(capacity, problemAggregator);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Storage<?> parseColumn(
|
||||
Storage<String> sourceStorage, CommonParseProblemAggregator problemAggregator) {
|
||||
int index = 0;
|
||||
var pattern = patternForIndex(index);
|
||||
Builder builder =
|
||||
makeBuilderWithCapacity(sourceStorage.size(), problemAggregator.createSimpleChild());
|
||||
|
||||
int bestIndex = 0;
|
||||
int bestCount = -1;
|
||||
while (pattern != null) {
|
||||
ProblemAggregator inner = problemAggregator.createSimpleChild();
|
||||
Builder builder = makeBuilderWithCapacity(sourceStorage.size(), inner);
|
||||
int failedAt = parseColumnWithPattern(pattern, sourceStorage, builder, null);
|
||||
if (failedAt == -1) {
|
||||
return builder.seal();
|
||||
}
|
||||
|
||||
// If there was a failure, we abandon this branch - thus we discard any problems that might
|
||||
// have been reported by the inner aggregator.
|
||||
inner.detachFromParent();
|
||||
|
||||
if (failedAt > bestCount) {
|
||||
bestCount = failedAt;
|
||||
bestIndex = index;
|
||||
}
|
||||
|
||||
index++;
|
||||
pattern = patternForIndex(index);
|
||||
}
|
||||
|
||||
CommonParseProblemAggregator aggregator = problemAggregator.createContextAwareChild();
|
||||
Builder fallback = makeBuilderWithCapacity(sourceStorage.size(), aggregator);
|
||||
parseColumnWithPattern(patternForIndex(bestIndex), sourceStorage, fallback, aggregator);
|
||||
return fallback.seal();
|
||||
}
|
||||
|
||||
private int parseColumnWithPattern(
|
||||
Pattern pattern,
|
||||
Storage<String> sourceStorage,
|
||||
Builder builder,
|
||||
ParseProblemAggregator aggregator) {
|
||||
Context context = Context.getCurrent();
|
||||
var context = Context.getCurrent();
|
||||
for (int i = 0; i < sourceStorage.size(); i++) {
|
||||
var text = sourceStorage.getItemBoxed(i);
|
||||
if (text == null) {
|
||||
|
||||
// Check if in unknown state
|
||||
var mightBeEuropean = !isInteger() && parser.numberWithSeparators().mightBeEuropean();
|
||||
|
||||
// Try and parse the value
|
||||
var result = text == null ? null : parseSingleValue(text, problemAggregator);
|
||||
|
||||
// Do we need to rescan?
|
||||
if (mightBeEuropean && parser.numberWithSeparators() != NumberWithSeparators.DOT_COMMA) {
|
||||
builder =
|
||||
makeBuilderWithCapacity(sourceStorage.size(), problemAggregator.createSimpleChild());
|
||||
for (int j = 0; j < i; j++) {
|
||||
var subText = sourceStorage.getItemBoxed(j);
|
||||
var subResult = subText == null ? null : parseSingleValue(subText, problemAggregator);
|
||||
if (subResult == null) {
|
||||
builder.appendNulls(1);
|
||||
} else {
|
||||
var value = innerParseSingleValue(text, pattern);
|
||||
if (value != null) {
|
||||
builder.appendNoGrow(value);
|
||||
} else {
|
||||
if (aggregator == null) {
|
||||
return i;
|
||||
builder.append(subResult);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
aggregator.reportInvalidFormat(text);
|
||||
// Append the result
|
||||
if (result == null) {
|
||||
builder.appendNulls(1);
|
||||
}
|
||||
} else {
|
||||
builder.append(result);
|
||||
}
|
||||
|
||||
context.safepoint();
|
||||
}
|
||||
return -1;
|
||||
|
||||
return builder.seal();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Builder makeBuilderWithCapacity(int capacity, ProblemAggregator problemAggregator) {
|
||||
return allowDecimal
|
||||
? NumericBuilder.createDoubleBuilder(capacity, problemAggregator)
|
||||
: NumericBuilder.createLongBuilder(capacity, integerTargetType, problemAggregator);
|
||||
}
|
||||
public Object parseSingleValue(String text, ParseProblemAggregator problemAggregator) {
|
||||
var result = parser.parse(text, isInteger());
|
||||
|
||||
private Object innerParseSingleValue(String text, Pattern pattern) {
|
||||
if (allowDecimal) {
|
||||
var trimmed = trimValues ? text.trim() : text;
|
||||
if (trimmed.equals("NaN")) {
|
||||
return Double.NaN;
|
||||
}
|
||||
if (trimmed.equals("Infinity")) {
|
||||
return Double.POSITIVE_INFINITY;
|
||||
}
|
||||
if (trimmed.equals("-Infinity")) {
|
||||
return Double.NEGATIVE_INFINITY;
|
||||
}
|
||||
}
|
||||
|
||||
var parsed = pattern.matcher(text);
|
||||
if (!parsed.matches()) {
|
||||
// TODO: Capture the message into the problem aggregator.
|
||||
if (result instanceof FormatDetectingNumberParser.NumberParseFailure) {
|
||||
problemAggregator.reportInvalidFormat(text);
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
var sign = parsed.group("sign");
|
||||
long sign_value = sign != null && !sign.equals("+") ? -1 : 1;
|
||||
|
||||
var integer = parsed.group("integer").replaceAll("\\D", "");
|
||||
|
||||
if (!allowLeadingZeros && integer.length() > 1 && integer.charAt(0) == '0') {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (allowDecimal) {
|
||||
String decimal = parsed.group("decimal");
|
||||
String decimalPrepared = decimal == null ? "" : ("." + decimal.substring(1));
|
||||
|
||||
if (integer.equals("") && decimalPrepared.equals("")) {
|
||||
return null;
|
||||
}
|
||||
|
||||
integer = integer.equals("") ? "0" : integer;
|
||||
|
||||
String exp = allowScientific ? parsed.group("exp") : null;
|
||||
if (exp != null) {
|
||||
if (integer.length() > 1) {
|
||||
return null;
|
||||
}
|
||||
decimalPrepared = decimalPrepared + exp;
|
||||
}
|
||||
|
||||
// If there is no decimal part, we parse as integer, as this will allow us more specialized
|
||||
// handling.
|
||||
// For example, we can get the exact value instead of a rounded one for big values. We can
|
||||
// then round
|
||||
// later, but first handle any warnings.
|
||||
if (decimalPrepared.equals("")) {
|
||||
long integer_part = Long.parseLong(integer);
|
||||
|
||||
// Special handling for values like `-0` - if we treat them as integers, they will lose
|
||||
// the `-` sign.
|
||||
if (integer_part == 0 && sign_value < 0) {
|
||||
return -0.0;
|
||||
}
|
||||
|
||||
return sign_value * integer_part;
|
||||
}
|
||||
|
||||
return sign_value * Double.parseDouble(integer + decimalPrepared);
|
||||
}
|
||||
|
||||
if (integer.equals("")) {
|
||||
return null;
|
||||
}
|
||||
|
||||
long integer_value = sign_value * Long.parseLong(integer);
|
||||
if (integerTargetType.fits(integer_value)) {
|
||||
return integer_value;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
throw new IllegalStateException("Java parse failed to parse number: " + text, e);
|
||||
}
|
||||
return switch (result) {
|
||||
case FormatDetectingNumberParser.NumberParseDouble doubleResult -> doubleResult.number();
|
||||
case FormatDetectingNumberParser.NumberParseLong longResult -> longResult.number();
|
||||
default -> throw new IllegalStateException("Unexpected result type: " + result.getClass());
|
||||
};
|
||||
}
|
||||
}
|
||||
|
@ -420,9 +420,9 @@ add_specs suite_builder =
|
||||
pUS3.to_vector . should_equal [1, -123, Nothing, 1234567, Nothing]
|
||||
Problems.expect_warning (Invalid_Format.Error "ints" Value_Type.Integer 2 ["-1,234", "12,34,56"]) pUS3
|
||||
|
||||
cUS4 = Column.from_vector "ints" ["$1234", "$1,234", "$1,234,567","-$1,234", "($1,234,567)"]
|
||||
cUS4 = Column.from_vector "ints" ["$234", "$1,234", "$1,234,567","-$1,234", "($1,234,567)"]
|
||||
pUS4 = cUS4.parse type=Value_Type.Integer
|
||||
pUS4.to_vector . should_equal [1234, 1234, 1234567, -1234, Nothing]
|
||||
pUS4.to_vector . should_equal [234, 1234, 1234567, -1234, Nothing]
|
||||
Problems.expect_warning (Invalid_Format.Error "ints" Value_Type.Integer 1 ["($1,234,567)"]) pUS4
|
||||
|
||||
## Reject bracket notation for negative numbers if already seen a minus sign
|
||||
|
Loading…
Reference in New Issue
Block a user