csv: decimal-mark rule to help with number parsing

Journal keeps a new piece of parsing state, a decimal mark character,
which can optionally be set to force the number format expected by all
amount parsers.
This commit is contained in:
Simon Michael 2020-11-06 18:45:52 -10:00
parent 4242a8592a
commit 524e23bc37
6 changed files with 136 additions and 32 deletions

View File

@ -175,6 +175,7 @@ instance Semigroup Journal where
j1 <> j2 = Journal {
jparsedefaultyear = jparsedefaultyear j2
,jparsedefaultcommodity = jparsedefaultcommodity j2
,jparsedecimalmark = jparsedecimalmark j2
,jparseparentaccounts = jparseparentaccounts j2
,jparsealiases = jparsealiases j2
-- ,jparsetransactioncount = jparsetransactioncount j1 + jparsetransactioncount j2
@ -201,6 +202,7 @@ nulljournal :: Journal
nulljournal = Journal {
jparsedefaultyear = Nothing
,jparsedefaultcommodity = Nothing
,jparsedecimalmark = Nothing
,jparseparentaccounts = []
,jparsealiases = []
-- ,jparsetransactioncount = 0

View File

@ -159,6 +159,12 @@ data AccountAlias = BasicAlias AccountName AccountName
data Side = L | R deriving (Eq,Show,Read,Ord,Generic)
-- | One of the decimal marks we support: either period or comma.
type DecimalMark = Char
isDecimalMark :: Char -> Bool
isDecimalMark c = c == '.' || c == ','
-- | The basic numeric type used in amounts.
type Quantity = Decimal
-- The following is for hledger-web, and requires blaze-markup.
@ -440,6 +446,7 @@ data Journal = Journal {
-- parsing-related data
jparsedefaultyear :: Maybe Year -- ^ the current default year, specified by the most recent Y directive (or current date)
,jparsedefaultcommodity :: Maybe (CommoditySymbol,AmountStyle) -- ^ the current default commodity and its format, specified by the most recent D directive
,jparsedecimalmark :: Maybe DecimalMark -- ^ the character to always parse as decimal point, if set by CsvReader's decimal-mark (or a future journal directive)
,jparseparentaccounts :: [AccountName] -- ^ the current stack of parent account names, specified by apply account directives
,jparsealiases :: [AccountAlias] -- ^ the current account name aliases in effect, specified by alias directives (& options ?)
-- ,jparsetransactioncount :: Integer -- ^ the current count of transactions parsed so far (only journal format txns, currently)

View File

@ -349,6 +349,15 @@ setYear y = modify' (\j -> j{jparsedefaultyear=Just y})
getYear :: JournalParser m (Maybe Year)
getYear = fmap jparsedefaultyear get
-- | Get the decimal mark that has been specified for parsing, if any
-- (eg by the CSV decimal-mark rule, or possibly a future journal directive).
-- Return it as an AmountStyle that amount parsers can use.
getDecimalMarkStyle :: JournalParser m (Maybe AmountStyle)
getDecimalMarkStyle = do
Journal{jparsedecimalmark} <- get
let mdecmarkStyle = maybe Nothing (\c -> Just $ amountstyle{asdecimalpoint=Just c}) jparsedecimalmark
return mdecmarkStyle
setDefaultCommodityAndStyle :: (CommoditySymbol,AmountStyle) -> JournalParser m ()
setDefaultCommodityAndStyle cs = modify' (\j -> j{jparsedefaultcommodity=Just cs})
@ -640,9 +649,26 @@ spaceandamountormissingp =
-- or right, followed by, in any order: an optional transaction price,
-- an optional ledger-style lot price, and/or an optional ledger-style
-- lot date. A lot price and lot date will be ignored.
--
-- To parse the amount's quantity (number) we need to know which character
-- represents a decimal mark. We find it in one of three ways:
--
-- 1. If a decimal mark has been set explicitly in the journal parse state,
-- we use that
--
-- 2. Or if the journal has a commodity declaration for the amount's commodity,
-- we get the decimal mark from that
--
-- 3. Otherwise we will parse any valid decimal mark appearing in the
-- number, as long as the number appears well formed.
--
-- Note 3 is the default zero-config case; it means we automatically handle
-- files with any supported decimal mark, but it also allows different decimal marks
-- in different amounts, which is a bit too loose. There's an open issue.
amountp :: JournalParser m Amount
amountp = label "amount" $ do
let spaces = lift $ skipNonNewlineSpaces
let
spaces = lift $ skipNonNewlineSpaces
amount <- amountwithoutpricep <* spaces
(mprice, _elotprice, _elotdate) <- runPermutation $
(,,) <$> toPermutationWithDefault Nothing (Just <$> priceamountp <* spaces)
@ -650,9 +676,8 @@ amountp = label "amount" $ do
<*> toPermutationWithDefault Nothing (Just <$> lotdatep <* spaces)
pure $ amount { aprice = mprice }
-- XXX Just like amountp but don't allow lot prices. Needed for balanceassertionp.
amountpnolotprices :: JournalParser m Amount
amountpnolotprices = label "amount" $ do
amountpnolotpricesp :: JournalParser m Amount
amountpnolotpricesp = label "amount" $ do
let spaces = lift $ skipNonNewlineSpaces
amount <- amountwithoutpricep
spaces
@ -669,7 +694,9 @@ amountwithoutpricep = do
leftsymbolamountp :: Bool -> (Decimal -> Decimal) -> JournalParser m Amount
leftsymbolamountp mult sign = label "amount" $ do
c <- lift commoditysymbolp
suggestedStyle <- getAmountStyle c
mdecmarkStyle <- getDecimalMarkStyle
mcommodityStyle <- getAmountStyle c
let suggestedStyle = mdecmarkStyle <|> mcommodityStyle
commodityspaced <- lift skipNonNewlineSpaces'
sign2 <- lift $ signp
offBeforeNum <- getOffset
@ -692,14 +719,18 @@ amountwithoutpricep = do
case mSpaceAndCommodity of
-- right symbol amount
Just (commodityspaced, c) -> do
suggestedStyle <- getAmountStyle c
(q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion suggestedStyle ambiguousRawNum mExponent
mdecmarkStyle <- getDecimalMarkStyle
mcommodityStyle <- getAmountStyle c
let msuggestedStyle = mdecmarkStyle <|> mcommodityStyle
(q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion msuggestedStyle ambiguousRawNum mExponent
let s = amountstyle{ascommodityside=R, ascommodityspaced=commodityspaced, asprecision=prec, asdecimalpoint=mdec, asdigitgroups=mgrps}
return $ nullamt{acommodity=c, aquantity=sign q, aismultiplier=mult, astyle=s, aprice=Nothing}
-- no symbol amount
Nothing -> do
suggestedStyle <- getDefaultAmountStyle
(q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion suggestedStyle ambiguousRawNum mExponent
mdecmarkStyle <- getDecimalMarkStyle
mcommodityStyle <- getDefaultAmountStyle
let msuggestedStyle = mdecmarkStyle <|> mcommodityStyle
(q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion msuggestedStyle ambiguousRawNum mExponent
-- if a default commodity has been set, apply it and its style to this amount
-- (unless it's a multiplier in an automated posting)
defcs <- getDefaultCommodityAndStyle
@ -716,8 +747,8 @@ amountwithoutpricep = do
-> Either AmbiguousNumber RawNumber
-> Maybe Integer
-> TextParser m (Quantity, AmountPrecision, Maybe Char, Maybe DigitGroupStyle)
interpretNumber posRegion suggestedStyle ambiguousNum mExp =
let rawNum = either (disambiguateNumber suggestedStyle) id ambiguousNum
interpretNumber posRegion msuggestedStyle ambiguousNum mExp =
let rawNum = either (disambiguateNumber msuggestedStyle) id ambiguousNum
in case fromRawNumber rawNum mExp of
Left errMsg -> customFailure $
uncurry parseErrorAtRegion posRegion errMsg
@ -776,7 +807,7 @@ balanceassertionp = do
lift skipNonNewlineSpaces
-- this amount can have a price; balance assertions ignore it,
-- but balance assignments will use it
a <- amountpnolotprices <?> "amount (for a balance assertion or assignment)"
a <- amountpnolotpricesp <?> "amount (for a balance assertion or assignment)"
return BalanceAssertion
{ baamount = a
, batotal = istotal
@ -884,13 +915,12 @@ fromRawNumber raw mExp = do
(a:b:cs) | a < b -> b:cs
gs -> gs
disambiguateNumber :: Maybe AmountStyle -> AmbiguousNumber -> RawNumber
disambiguateNumber suggestedStyle (AmbiguousNumber grp1 sep grp2) =
disambiguateNumber msuggestedStyle (AmbiguousNumber grp1 sep grp2) =
-- If present, use the suggested style to disambiguate;
-- otherwise, assume that the separator is a decimal point where possible.
if isDecimalPointChar sep &&
maybe True (sep `isValidDecimalBy`) suggestedStyle
if isDecimalMark sep &&
maybe True (sep `isValidDecimalBy`) msuggestedStyle
then NoSeparators grp1 (Just (sep, grp2))
else WithSeparators sep [grp1, grp2] Nothing
where
@ -925,7 +955,7 @@ rawnumberp = label "number" $ do
rawNumber <- fmap Right leadingDecimalPt <|> leadingDigits
-- Guard against mistyped numbers
mExtraDecimalSep <- optional $ lookAhead $ satisfy isDecimalPointChar
mExtraDecimalSep <- optional $ lookAhead $ satisfy isDecimalMark
when (isJust mExtraDecimalSep) $
Fail.fail "invalid number (invalid use of separator)"
@ -941,7 +971,7 @@ rawnumberp = label "number" $ do
leadingDecimalPt :: TextParser m RawNumber
leadingDecimalPt = do
decPt <- satisfy isDecimalPointChar
decPt <- satisfy isDecimalMark
decGrp <- digitgroupp
pure $ NoSeparators mempty (Just (decPt, decGrp))
@ -962,7 +992,7 @@ rawnumberp = label "number" $ do
withDecimalPt :: Char -> [DigitGrp] -> TextParser m RawNumber
withDecimalPt digitSep digitGroups = do
decPt <- satisfy $ \c -> isDecimalPointChar c && c /= digitSep
decPt <- satisfy $ \c -> isDecimalMark c && c /= digitSep
decDigitGrp <- option mempty digitgroupp
pure $ WithSeparators digitSep digitGroups (Just (decPt, decDigitGrp))
@ -974,21 +1004,17 @@ rawnumberp = label "number" $ do
-> [DigitGrp]
-> Either AmbiguousNumber RawNumber
withoutDecimalPt grp1 sep grp2 grps
| null grps && isDecimalPointChar sep =
| null grps && isDecimalMark sep =
Left $ AmbiguousNumber grp1 sep grp2
| otherwise = Right $ WithSeparators sep (grp1:grp2:grps) Nothing
trailingDecimalPt :: DigitGrp -> TextParser m RawNumber
trailingDecimalPt grp1 = do
decPt <- satisfy isDecimalPointChar
decPt <- satisfy isDecimalMark
pure $ NoSeparators grp1 (Just (decPt, mempty))
isDecimalPointChar :: Char -> Bool
isDecimalPointChar c = c == '.' || c == ','
isDigitSeparatorChar :: Char -> Bool
isDigitSeparatorChar c = isDecimalPointChar c || c == ' '
isDigitSeparatorChar c = isDecimalMark c || c == ' '
-- | Some kinds of number literal we might parse.
data RawNumber

View File

@ -78,7 +78,7 @@ import Text.Printf (printf)
import Hledger.Data
import Hledger.Utils
import Hledger.Read.Common (Reader(..),InputOpts(..),amountp, statusp, genericSourcePos, journalFinalise)
import Hledger.Read.Common ( Reader(..),InputOpts(..), amountp, statusp, genericSourcePos, journalFinalise )
--- ** doctest setup
-- $setup
@ -364,7 +364,7 @@ Grammar for the CSV conversion rules, more or less:
RULES: RULE*
RULE: ( FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | NEWEST-FIRST | DATE-FORMAT | COMMENT | BLANK ) NEWLINE
RULE: ( FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | NEWEST-FIRST | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE
FIELD-LIST: fields SPACE FIELD-NAME ( SPACE? , SPACE? FIELD-NAME )*
@ -462,6 +462,7 @@ directivep = (do
directives :: [String]
directives =
["date-format"
,"decimal-mark"
,"separator"
-- ,"default-account"
-- ,"default-currency"
@ -1048,9 +1049,10 @@ getBalance rules record currency n = do
parseAmount :: CsvRules -> CsvRecord -> String -> String -> MixedAmount
parseAmount rules record currency s =
either mkerror (Mixed . (:[])) $ -- PARTIAL:
runParser (evalStateT (amountp <* eof) nulljournal) "" $
runParser (evalStateT (amountp <* eof) journalparsestate) "" $
T.pack $ (currency++) $ simplifySign s
where
journalparsestate = nulljournal{jparsedecimalmark=parseDecimalMark rules}
mkerror e = error' $ unlines
["error: could not parse \""++s++"\" as an amount"
,showRecord record
@ -1062,7 +1064,8 @@ parseAmount rules record currency s =
++"or add or change your skip rule"
]
-- XXX unify these
-- XXX unify these ^v
-- | Almost but not quite the same as parseAmount.
-- Given a non-empty amount string (from CSV) to parse, along with a
-- possibly non-empty currency symbol to prepend,
@ -1071,10 +1074,11 @@ parseAmount rules record currency s =
parseBalanceAmount :: CsvRules -> CsvRecord -> String -> Int -> String -> Amount
parseBalanceAmount rules record currency n s =
either (mkerror n s) id $
runParser (evalStateT (amountp <* eof) nulljournal) "" $
runParser (evalStateT (amountp <* eof) journalparsestate) "" $
T.pack $ (currency++) $ simplifySign s
-- the csv record's line number would be good
where
journalparsestate = nulljournal{jparsedecimalmark=parseDecimalMark rules}
mkerror n s e = error' $ unlines
["error: could not parse \""++s++"\" as balance"++show n++" amount"
,showRecord record
@ -1083,6 +1087,15 @@ parseBalanceAmount rules record currency n s =
,"the parse error is: "++customErrorBundlePretty e
]
-- Read a valid decimal mark from the decimal-mark rule, if any.
-- If the rule is present with an invalid argument, raise an error.
parseDecimalMark :: CsvRules -> Maybe DecimalMark
parseDecimalMark rules =
case rules `csvRule` "decimal-mark" of
Nothing -> Nothing
Just [c] | isDecimalMark c -> Just c
Just s -> error' $ "decimal-mark's argument should be \".\" or \",\" (not \""++s++"\")"
-- | Make a balance assertion for the given amount, with the given parse
-- position (to be shown in assertion failures), with the assertion type
-- possibly set by a balance-type rule.

View File

@ -45,7 +45,8 @@ these are described more fully below, after the examples:
| [**`if` block**](#if-block) | apply some rules to CSV records matched by patterns |
| [**`if` table**](#if-table) | apply some rules to CSV records matched by patterns, alternate syntax |
| [**`end`**](#end) | skip the remaining CSV records |
| [**`date-format`**](#date-format) | describe the format of CSV dates |
| [**`date-format`**](#date-format) | how to parse dates in CSV records |
| [**`decimal-mark`**](#decimal-mark) | the decimal mark used in CSV amounts, if ambiguous |
| [**`newest-first`**](#newest-first) | disambiguate record order when there's only one date |
| [**`include`**](#include) | inline another CSV rules file |
| [**`balance-type`**](#balance-type) | choose which type of balance assignments to use |
@ -716,6 +717,21 @@ For the supported strptime syntax, see:\
<https://hackage.haskell.org/package/time/docs/Data-Time-Format.html#v:formatTime>
## `decimal-mark`
```rules
decimal-mark .
```
or:
```rules
decimal-mark ,
```
hledger automatically accepts either period or comma as a decimal mark when parsing numbers
(cf [Amounts](journal.html#amounts)).
However if any numbers in the CSV contain digit group marks, such as thousand-separating commas,
you should declare the decimal mark explicitly with this rule, to avoid misparsed numbers.
## `newest-first`
hledger always sorts the generated transactions by date.

View File

@ -906,6 +906,46 @@ $ ./csvtest.sh
>=0
# 45. decimal-mark helps parse ambiguous decimals correctly
<
2020-01-01,"1,000"
2020-01-02,"1.000"
RULES
fields date,amount
decimal-mark .
$ ./csvtest.sh
2020-01-01
expenses:unknown 1,000.000
income:unknown -1,000.000
2020-01-02
expenses:unknown 1.000
income:unknown -1.000
>=
# 46.
<
2020-01-01,"1,000"
2020-01-02,"1.000"
RULES
fields date,amount
decimal-mark ,
$ ./csvtest.sh
2020-01-01
expenses:unknown 1,000
income:unknown -1,000
2020-01-02
expenses:unknown 1.000,000
income:unknown -1.000,000
>=
## .
#<