mirror of
https://github.com/simonmichael/hledger.git
synced 2024-09-19 10:17:35 +03:00
csv: decimal-mark rule to help with number parsing
Journal keeps a new piece of parsing state, a decimal mark character, which can optionally be set to force the number format expected by all amount parsers.
This commit is contained in:
parent
4242a8592a
commit
524e23bc37
@ -175,6 +175,7 @@ instance Semigroup Journal where
|
||||
j1 <> j2 = Journal {
|
||||
jparsedefaultyear = jparsedefaultyear j2
|
||||
,jparsedefaultcommodity = jparsedefaultcommodity j2
|
||||
,jparsedecimalmark = jparsedecimalmark j2
|
||||
,jparseparentaccounts = jparseparentaccounts j2
|
||||
,jparsealiases = jparsealiases j2
|
||||
-- ,jparsetransactioncount = jparsetransactioncount j1 + jparsetransactioncount j2
|
||||
@ -201,6 +202,7 @@ nulljournal :: Journal
|
||||
nulljournal = Journal {
|
||||
jparsedefaultyear = Nothing
|
||||
,jparsedefaultcommodity = Nothing
|
||||
,jparsedecimalmark = Nothing
|
||||
,jparseparentaccounts = []
|
||||
,jparsealiases = []
|
||||
-- ,jparsetransactioncount = 0
|
||||
|
@ -159,6 +159,12 @@ data AccountAlias = BasicAlias AccountName AccountName
|
||||
|
||||
data Side = L | R deriving (Eq,Show,Read,Ord,Generic)
|
||||
|
||||
-- | One of the decimal marks we support: either period or comma.
|
||||
type DecimalMark = Char
|
||||
|
||||
isDecimalMark :: Char -> Bool
|
||||
isDecimalMark c = c == '.' || c == ','
|
||||
|
||||
-- | The basic numeric type used in amounts.
|
||||
type Quantity = Decimal
|
||||
-- The following is for hledger-web, and requires blaze-markup.
|
||||
@ -440,6 +446,7 @@ data Journal = Journal {
|
||||
-- parsing-related data
|
||||
jparsedefaultyear :: Maybe Year -- ^ the current default year, specified by the most recent Y directive (or current date)
|
||||
,jparsedefaultcommodity :: Maybe (CommoditySymbol,AmountStyle) -- ^ the current default commodity and its format, specified by the most recent D directive
|
||||
,jparsedecimalmark :: Maybe DecimalMark -- ^ the character to always parse as decimal point, if set by CsvReader's decimal-mark (or a future journal directive)
|
||||
,jparseparentaccounts :: [AccountName] -- ^ the current stack of parent account names, specified by apply account directives
|
||||
,jparsealiases :: [AccountAlias] -- ^ the current account name aliases in effect, specified by alias directives (& options ?)
|
||||
-- ,jparsetransactioncount :: Integer -- ^ the current count of transactions parsed so far (only journal format txns, currently)
|
||||
|
@ -349,6 +349,15 @@ setYear y = modify' (\j -> j{jparsedefaultyear=Just y})
|
||||
getYear :: JournalParser m (Maybe Year)
|
||||
getYear = fmap jparsedefaultyear get
|
||||
|
||||
-- | Get the decimal mark that has been specified for parsing, if any
|
||||
-- (eg by the CSV decimal-mark rule, or possibly a future journal directive).
|
||||
-- Return it as an AmountStyle that amount parsers can use.
|
||||
getDecimalMarkStyle :: JournalParser m (Maybe AmountStyle)
|
||||
getDecimalMarkStyle = do
|
||||
Journal{jparsedecimalmark} <- get
|
||||
let mdecmarkStyle = maybe Nothing (\c -> Just $ amountstyle{asdecimalpoint=Just c}) jparsedecimalmark
|
||||
return mdecmarkStyle
|
||||
|
||||
setDefaultCommodityAndStyle :: (CommoditySymbol,AmountStyle) -> JournalParser m ()
|
||||
setDefaultCommodityAndStyle cs = modify' (\j -> j{jparsedefaultcommodity=Just cs})
|
||||
|
||||
@ -640,9 +649,26 @@ spaceandamountormissingp =
|
||||
-- or right, followed by, in any order: an optional transaction price,
|
||||
-- an optional ledger-style lot price, and/or an optional ledger-style
|
||||
-- lot date. A lot price and lot date will be ignored.
|
||||
--
|
||||
-- To parse the amount's quantity (number) we need to know which character
|
||||
-- represents a decimal mark. We find it in one of three ways:
|
||||
--
|
||||
-- 1. If a decimal mark has been set explicitly in the journal parse state,
|
||||
-- we use that
|
||||
--
|
||||
-- 2. Or if the journal has a commodity declaration for the amount's commodity,
|
||||
-- we get the decimal mark from that
|
||||
--
|
||||
-- 3. Otherwise we will parse any valid decimal mark appearing in the
|
||||
-- number, as long as the number appears well formed.
|
||||
--
|
||||
-- Note 3 is the default zero-config case; it means we automatically handle
|
||||
-- files with any supported decimal mark, but it also allows different decimal marks
|
||||
-- in different amounts, which is a bit too loose. There's an open issue.
|
||||
amountp :: JournalParser m Amount
|
||||
amountp = label "amount" $ do
|
||||
let spaces = lift $ skipNonNewlineSpaces
|
||||
let
|
||||
spaces = lift $ skipNonNewlineSpaces
|
||||
amount <- amountwithoutpricep <* spaces
|
||||
(mprice, _elotprice, _elotdate) <- runPermutation $
|
||||
(,,) <$> toPermutationWithDefault Nothing (Just <$> priceamountp <* spaces)
|
||||
@ -650,9 +676,8 @@ amountp = label "amount" $ do
|
||||
<*> toPermutationWithDefault Nothing (Just <$> lotdatep <* spaces)
|
||||
pure $ amount { aprice = mprice }
|
||||
|
||||
-- XXX Just like amountp but don't allow lot prices. Needed for balanceassertionp.
|
||||
amountpnolotprices :: JournalParser m Amount
|
||||
amountpnolotprices = label "amount" $ do
|
||||
amountpnolotpricesp :: JournalParser m Amount
|
||||
amountpnolotpricesp = label "amount" $ do
|
||||
let spaces = lift $ skipNonNewlineSpaces
|
||||
amount <- amountwithoutpricep
|
||||
spaces
|
||||
@ -669,7 +694,9 @@ amountwithoutpricep = do
|
||||
leftsymbolamountp :: Bool -> (Decimal -> Decimal) -> JournalParser m Amount
|
||||
leftsymbolamountp mult sign = label "amount" $ do
|
||||
c <- lift commoditysymbolp
|
||||
suggestedStyle <- getAmountStyle c
|
||||
mdecmarkStyle <- getDecimalMarkStyle
|
||||
mcommodityStyle <- getAmountStyle c
|
||||
let suggestedStyle = mdecmarkStyle <|> mcommodityStyle
|
||||
commodityspaced <- lift skipNonNewlineSpaces'
|
||||
sign2 <- lift $ signp
|
||||
offBeforeNum <- getOffset
|
||||
@ -692,14 +719,18 @@ amountwithoutpricep = do
|
||||
case mSpaceAndCommodity of
|
||||
-- right symbol amount
|
||||
Just (commodityspaced, c) -> do
|
||||
suggestedStyle <- getAmountStyle c
|
||||
(q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion suggestedStyle ambiguousRawNum mExponent
|
||||
mdecmarkStyle <- getDecimalMarkStyle
|
||||
mcommodityStyle <- getAmountStyle c
|
||||
let msuggestedStyle = mdecmarkStyle <|> mcommodityStyle
|
||||
(q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion msuggestedStyle ambiguousRawNum mExponent
|
||||
let s = amountstyle{ascommodityside=R, ascommodityspaced=commodityspaced, asprecision=prec, asdecimalpoint=mdec, asdigitgroups=mgrps}
|
||||
return $ nullamt{acommodity=c, aquantity=sign q, aismultiplier=mult, astyle=s, aprice=Nothing}
|
||||
-- no symbol amount
|
||||
Nothing -> do
|
||||
suggestedStyle <- getDefaultAmountStyle
|
||||
(q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion suggestedStyle ambiguousRawNum mExponent
|
||||
mdecmarkStyle <- getDecimalMarkStyle
|
||||
mcommodityStyle <- getDefaultAmountStyle
|
||||
let msuggestedStyle = mdecmarkStyle <|> mcommodityStyle
|
||||
(q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion msuggestedStyle ambiguousRawNum mExponent
|
||||
-- if a default commodity has been set, apply it and its style to this amount
|
||||
-- (unless it's a multiplier in an automated posting)
|
||||
defcs <- getDefaultCommodityAndStyle
|
||||
@ -716,8 +747,8 @@ amountwithoutpricep = do
|
||||
-> Either AmbiguousNumber RawNumber
|
||||
-> Maybe Integer
|
||||
-> TextParser m (Quantity, AmountPrecision, Maybe Char, Maybe DigitGroupStyle)
|
||||
interpretNumber posRegion suggestedStyle ambiguousNum mExp =
|
||||
let rawNum = either (disambiguateNumber suggestedStyle) id ambiguousNum
|
||||
interpretNumber posRegion msuggestedStyle ambiguousNum mExp =
|
||||
let rawNum = either (disambiguateNumber msuggestedStyle) id ambiguousNum
|
||||
in case fromRawNumber rawNum mExp of
|
||||
Left errMsg -> customFailure $
|
||||
uncurry parseErrorAtRegion posRegion errMsg
|
||||
@ -776,7 +807,7 @@ balanceassertionp = do
|
||||
lift skipNonNewlineSpaces
|
||||
-- this amount can have a price; balance assertions ignore it,
|
||||
-- but balance assignments will use it
|
||||
a <- amountpnolotprices <?> "amount (for a balance assertion or assignment)"
|
||||
a <- amountpnolotpricesp <?> "amount (for a balance assertion or assignment)"
|
||||
return BalanceAssertion
|
||||
{ baamount = a
|
||||
, batotal = istotal
|
||||
@ -884,13 +915,12 @@ fromRawNumber raw mExp = do
|
||||
(a:b:cs) | a < b -> b:cs
|
||||
gs -> gs
|
||||
|
||||
|
||||
disambiguateNumber :: Maybe AmountStyle -> AmbiguousNumber -> RawNumber
|
||||
disambiguateNumber suggestedStyle (AmbiguousNumber grp1 sep grp2) =
|
||||
disambiguateNumber msuggestedStyle (AmbiguousNumber grp1 sep grp2) =
|
||||
-- If present, use the suggested style to disambiguate;
|
||||
-- otherwise, assume that the separator is a decimal point where possible.
|
||||
if isDecimalPointChar sep &&
|
||||
maybe True (sep `isValidDecimalBy`) suggestedStyle
|
||||
if isDecimalMark sep &&
|
||||
maybe True (sep `isValidDecimalBy`) msuggestedStyle
|
||||
then NoSeparators grp1 (Just (sep, grp2))
|
||||
else WithSeparators sep [grp1, grp2] Nothing
|
||||
where
|
||||
@ -925,7 +955,7 @@ rawnumberp = label "number" $ do
|
||||
rawNumber <- fmap Right leadingDecimalPt <|> leadingDigits
|
||||
|
||||
-- Guard against mistyped numbers
|
||||
mExtraDecimalSep <- optional $ lookAhead $ satisfy isDecimalPointChar
|
||||
mExtraDecimalSep <- optional $ lookAhead $ satisfy isDecimalMark
|
||||
when (isJust mExtraDecimalSep) $
|
||||
Fail.fail "invalid number (invalid use of separator)"
|
||||
|
||||
@ -941,7 +971,7 @@ rawnumberp = label "number" $ do
|
||||
|
||||
leadingDecimalPt :: TextParser m RawNumber
|
||||
leadingDecimalPt = do
|
||||
decPt <- satisfy isDecimalPointChar
|
||||
decPt <- satisfy isDecimalMark
|
||||
decGrp <- digitgroupp
|
||||
pure $ NoSeparators mempty (Just (decPt, decGrp))
|
||||
|
||||
@ -962,7 +992,7 @@ rawnumberp = label "number" $ do
|
||||
|
||||
withDecimalPt :: Char -> [DigitGrp] -> TextParser m RawNumber
|
||||
withDecimalPt digitSep digitGroups = do
|
||||
decPt <- satisfy $ \c -> isDecimalPointChar c && c /= digitSep
|
||||
decPt <- satisfy $ \c -> isDecimalMark c && c /= digitSep
|
||||
decDigitGrp <- option mempty digitgroupp
|
||||
|
||||
pure $ WithSeparators digitSep digitGroups (Just (decPt, decDigitGrp))
|
||||
@ -974,21 +1004,17 @@ rawnumberp = label "number" $ do
|
||||
-> [DigitGrp]
|
||||
-> Either AmbiguousNumber RawNumber
|
||||
withoutDecimalPt grp1 sep grp2 grps
|
||||
| null grps && isDecimalPointChar sep =
|
||||
| null grps && isDecimalMark sep =
|
||||
Left $ AmbiguousNumber grp1 sep grp2
|
||||
| otherwise = Right $ WithSeparators sep (grp1:grp2:grps) Nothing
|
||||
|
||||
trailingDecimalPt :: DigitGrp -> TextParser m RawNumber
|
||||
trailingDecimalPt grp1 = do
|
||||
decPt <- satisfy isDecimalPointChar
|
||||
decPt <- satisfy isDecimalMark
|
||||
pure $ NoSeparators grp1 (Just (decPt, mempty))
|
||||
|
||||
|
||||
isDecimalPointChar :: Char -> Bool
|
||||
isDecimalPointChar c = c == '.' || c == ','
|
||||
|
||||
isDigitSeparatorChar :: Char -> Bool
|
||||
isDigitSeparatorChar c = isDecimalPointChar c || c == ' '
|
||||
isDigitSeparatorChar c = isDecimalMark c || c == ' '
|
||||
|
||||
-- | Some kinds of number literal we might parse.
|
||||
data RawNumber
|
||||
|
@ -364,7 +364,7 @@ Grammar for the CSV conversion rules, more or less:
|
||||
|
||||
RULES: RULE*
|
||||
|
||||
RULE: ( FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | NEWEST-FIRST | DATE-FORMAT | COMMENT | BLANK ) NEWLINE
|
||||
RULE: ( FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | NEWEST-FIRST | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE
|
||||
|
||||
FIELD-LIST: fields SPACE FIELD-NAME ( SPACE? , SPACE? FIELD-NAME )*
|
||||
|
||||
@ -462,6 +462,7 @@ directivep = (do
|
||||
directives :: [String]
|
||||
directives =
|
||||
["date-format"
|
||||
,"decimal-mark"
|
||||
,"separator"
|
||||
-- ,"default-account"
|
||||
-- ,"default-currency"
|
||||
@ -1048,9 +1049,10 @@ getBalance rules record currency n = do
|
||||
parseAmount :: CsvRules -> CsvRecord -> String -> String -> MixedAmount
|
||||
parseAmount rules record currency s =
|
||||
either mkerror (Mixed . (:[])) $ -- PARTIAL:
|
||||
runParser (evalStateT (amountp <* eof) nulljournal) "" $
|
||||
runParser (evalStateT (amountp <* eof) journalparsestate) "" $
|
||||
T.pack $ (currency++) $ simplifySign s
|
||||
where
|
||||
journalparsestate = nulljournal{jparsedecimalmark=parseDecimalMark rules}
|
||||
mkerror e = error' $ unlines
|
||||
["error: could not parse \""++s++"\" as an amount"
|
||||
,showRecord record
|
||||
@ -1062,7 +1064,8 @@ parseAmount rules record currency s =
|
||||
++"or add or change your skip rule"
|
||||
]
|
||||
|
||||
-- XXX unify these
|
||||
-- XXX unify these ^v
|
||||
|
||||
-- | Almost but not quite the same as parseAmount.
|
||||
-- Given a non-empty amount string (from CSV) to parse, along with a
|
||||
-- possibly non-empty currency symbol to prepend,
|
||||
@ -1071,10 +1074,11 @@ parseAmount rules record currency s =
|
||||
parseBalanceAmount :: CsvRules -> CsvRecord -> String -> Int -> String -> Amount
|
||||
parseBalanceAmount rules record currency n s =
|
||||
either (mkerror n s) id $
|
||||
runParser (evalStateT (amountp <* eof) nulljournal) "" $
|
||||
runParser (evalStateT (amountp <* eof) journalparsestate) "" $
|
||||
T.pack $ (currency++) $ simplifySign s
|
||||
-- the csv record's line number would be good
|
||||
where
|
||||
journalparsestate = nulljournal{jparsedecimalmark=parseDecimalMark rules}
|
||||
mkerror n s e = error' $ unlines
|
||||
["error: could not parse \""++s++"\" as balance"++show n++" amount"
|
||||
,showRecord record
|
||||
@ -1083,6 +1087,15 @@ parseBalanceAmount rules record currency n s =
|
||||
,"the parse error is: "++customErrorBundlePretty e
|
||||
]
|
||||
|
||||
-- Read a valid decimal mark from the decimal-mark rule, if any.
|
||||
-- If the rule is present with an invalid argument, raise an error.
|
||||
parseDecimalMark :: CsvRules -> Maybe DecimalMark
|
||||
parseDecimalMark rules =
|
||||
case rules `csvRule` "decimal-mark" of
|
||||
Nothing -> Nothing
|
||||
Just [c] | isDecimalMark c -> Just c
|
||||
Just s -> error' $ "decimal-mark's argument should be \".\" or \",\" (not \""++s++"\")"
|
||||
|
||||
-- | Make a balance assertion for the given amount, with the given parse
|
||||
-- position (to be shown in assertion failures), with the assertion type
|
||||
-- possibly set by a balance-type rule.
|
||||
|
@ -45,7 +45,8 @@ these are described more fully below, after the examples:
|
||||
| [**`if` block**](#if-block) | apply some rules to CSV records matched by patterns |
|
||||
| [**`if` table**](#if-table) | apply some rules to CSV records matched by patterns, alternate syntax |
|
||||
| [**`end`**](#end) | skip the remaining CSV records |
|
||||
| [**`date-format`**](#date-format) | describe the format of CSV dates |
|
||||
| [**`date-format`**](#date-format) | how to parse dates in CSV records |
|
||||
| [**`decimal-mark`**](#decimal-mark) | the decimal mark used in CSV amounts, if ambiguous |
|
||||
| [**`newest-first`**](#newest-first) | disambiguate record order when there's only one date |
|
||||
| [**`include`**](#include) | inline another CSV rules file |
|
||||
| [**`balance-type`**](#balance-type) | choose which type of balance assignments to use |
|
||||
@ -716,6 +717,21 @@ For the supported strptime syntax, see:\
|
||||
<https://hackage.haskell.org/package/time/docs/Data-Time-Format.html#v:formatTime>
|
||||
|
||||
|
||||
## `decimal-mark`
|
||||
|
||||
```rules
|
||||
decimal-mark .
|
||||
```
|
||||
or:
|
||||
```rules
|
||||
decimal-mark ,
|
||||
```
|
||||
|
||||
hledger automatically accepts either period or comma as a decimal mark when parsing numbers
|
||||
(cf [Amounts](journal.html#amounts)).
|
||||
However if any numbers in the CSV contain digit group marks, such as thousand-separating commas,
|
||||
you should declare the decimal mark explicitly with this rule, to avoid misparsed numbers.
|
||||
|
||||
## `newest-first`
|
||||
|
||||
hledger always sorts the generated transactions by date.
|
||||
|
@ -906,6 +906,46 @@ $ ./csvtest.sh
|
||||
|
||||
>=0
|
||||
|
||||
# 45. decimal-mark helps parse ambiguous decimals correctly
|
||||
<
|
||||
2020-01-01,"1,000"
|
||||
2020-01-02,"1.000"
|
||||
|
||||
RULES
|
||||
fields date,amount
|
||||
decimal-mark .
|
||||
|
||||
$ ./csvtest.sh
|
||||
2020-01-01
|
||||
expenses:unknown 1,000.000
|
||||
income:unknown -1,000.000
|
||||
|
||||
2020-01-02
|
||||
expenses:unknown 1.000
|
||||
income:unknown -1.000
|
||||
|
||||
>=
|
||||
|
||||
# 46.
|
||||
<
|
||||
2020-01-01,"1,000"
|
||||
2020-01-02,"1.000"
|
||||
|
||||
RULES
|
||||
fields date,amount
|
||||
decimal-mark ,
|
||||
|
||||
$ ./csvtest.sh
|
||||
2020-01-01
|
||||
expenses:unknown 1,000
|
||||
income:unknown -1,000
|
||||
|
||||
2020-01-02
|
||||
expenses:unknown 1.000,000
|
||||
income:unknown -1.000,000
|
||||
|
||||
>=
|
||||
|
||||
|
||||
## .
|
||||
#<
|
||||
|
Loading…
Reference in New Issue
Block a user