From 524e23bc37a4673500fc163bcb1ab62e24ebc9e5 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 6 Nov 2020 18:45:52 -1000 Subject: [PATCH] csv: decimal-mark rule to help with number parsing Journal keeps a new piece of parsing state, a decimal mark character, which can optionally be set to force the number format expected by all amount parsers. --- hledger-lib/Hledger/Data/Journal.hs | 2 + hledger-lib/Hledger/Data/Types.hs | 7 +++ hledger-lib/Hledger/Read/Common.hs | 78 ++++++++++++++++++--------- hledger-lib/Hledger/Read/CsvReader.hs | 23 ++++++-- hledger-lib/hledger_csv.m4.md | 18 ++++++- tests/csv.test | 40 ++++++++++++++ 6 files changed, 136 insertions(+), 32 deletions(-) diff --git a/hledger-lib/Hledger/Data/Journal.hs b/hledger-lib/Hledger/Data/Journal.hs index 34af2be39..9d19f4333 100644 --- a/hledger-lib/Hledger/Data/Journal.hs +++ b/hledger-lib/Hledger/Data/Journal.hs @@ -175,6 +175,7 @@ instance Semigroup Journal where j1 <> j2 = Journal { jparsedefaultyear = jparsedefaultyear j2 ,jparsedefaultcommodity = jparsedefaultcommodity j2 + ,jparsedecimalmark = jparsedecimalmark j2 ,jparseparentaccounts = jparseparentaccounts j2 ,jparsealiases = jparsealiases j2 -- ,jparsetransactioncount = jparsetransactioncount j1 + jparsetransactioncount j2 @@ -201,6 +202,7 @@ nulljournal :: Journal nulljournal = Journal { jparsedefaultyear = Nothing ,jparsedefaultcommodity = Nothing + ,jparsedecimalmark = Nothing ,jparseparentaccounts = [] ,jparsealiases = [] -- ,jparsetransactioncount = 0 diff --git a/hledger-lib/Hledger/Data/Types.hs b/hledger-lib/Hledger/Data/Types.hs index 4d1f63b15..791c71b1b 100644 --- a/hledger-lib/Hledger/Data/Types.hs +++ b/hledger-lib/Hledger/Data/Types.hs @@ -159,6 +159,12 @@ data AccountAlias = BasicAlias AccountName AccountName data Side = L | R deriving (Eq,Show,Read,Ord,Generic) +-- | One of the decimal marks we support: either period or comma. +type DecimalMark = Char + +isDecimalMark :: Char -> Bool +isDecimalMark c = c == '.' || c == ',' + -- | The basic numeric type used in amounts. type Quantity = Decimal -- The following is for hledger-web, and requires blaze-markup. @@ -440,6 +446,7 @@ data Journal = Journal { -- parsing-related data jparsedefaultyear :: Maybe Year -- ^ the current default year, specified by the most recent Y directive (or current date) ,jparsedefaultcommodity :: Maybe (CommoditySymbol,AmountStyle) -- ^ the current default commodity and its format, specified by the most recent D directive + ,jparsedecimalmark :: Maybe DecimalMark -- ^ the character to always parse as decimal point, if set by CsvReader's decimal-mark (or a future journal directive) ,jparseparentaccounts :: [AccountName] -- ^ the current stack of parent account names, specified by apply account directives ,jparsealiases :: [AccountAlias] -- ^ the current account name aliases in effect, specified by alias directives (& options ?) -- ,jparsetransactioncount :: Integer -- ^ the current count of transactions parsed so far (only journal format txns, currently) diff --git a/hledger-lib/Hledger/Read/Common.hs b/hledger-lib/Hledger/Read/Common.hs index 1366c941a..b55d74822 100644 --- a/hledger-lib/Hledger/Read/Common.hs +++ b/hledger-lib/Hledger/Read/Common.hs @@ -349,6 +349,15 @@ setYear y = modify' (\j -> j{jparsedefaultyear=Just y}) getYear :: JournalParser m (Maybe Year) getYear = fmap jparsedefaultyear get +-- | Get the decimal mark that has been specified for parsing, if any +-- (eg by the CSV decimal-mark rule, or possibly a future journal directive). +-- Return it as an AmountStyle that amount parsers can use. +getDecimalMarkStyle :: JournalParser m (Maybe AmountStyle) +getDecimalMarkStyle = do + Journal{jparsedecimalmark} <- get + let mdecmarkStyle = maybe Nothing (\c -> Just $ amountstyle{asdecimalpoint=Just c}) jparsedecimalmark + return mdecmarkStyle + setDefaultCommodityAndStyle :: (CommoditySymbol,AmountStyle) -> JournalParser m () setDefaultCommodityAndStyle cs = modify' (\j -> j{jparsedefaultcommodity=Just cs}) @@ -640,9 +649,26 @@ spaceandamountormissingp = -- or right, followed by, in any order: an optional transaction price, -- an optional ledger-style lot price, and/or an optional ledger-style -- lot date. A lot price and lot date will be ignored. +-- +-- To parse the amount's quantity (number) we need to know which character +-- represents a decimal mark. We find it in one of three ways: +-- +-- 1. If a decimal mark has been set explicitly in the journal parse state, +-- we use that +-- +-- 2. Or if the journal has a commodity declaration for the amount's commodity, +-- we get the decimal mark from that +-- +-- 3. Otherwise we will parse any valid decimal mark appearing in the +-- number, as long as the number appears well formed. +-- +-- Note 3 is the default zero-config case; it means we automatically handle +-- files with any supported decimal mark, but it also allows different decimal marks +-- in different amounts, which is a bit too loose. There's an open issue. amountp :: JournalParser m Amount amountp = label "amount" $ do - let spaces = lift $ skipNonNewlineSpaces + let + spaces = lift $ skipNonNewlineSpaces amount <- amountwithoutpricep <* spaces (mprice, _elotprice, _elotdate) <- runPermutation $ (,,) <$> toPermutationWithDefault Nothing (Just <$> priceamountp <* spaces) @@ -650,9 +676,8 @@ amountp = label "amount" $ do <*> toPermutationWithDefault Nothing (Just <$> lotdatep <* spaces) pure $ amount { aprice = mprice } --- XXX Just like amountp but don't allow lot prices. Needed for balanceassertionp. -amountpnolotprices :: JournalParser m Amount -amountpnolotprices = label "amount" $ do +amountpnolotpricesp :: JournalParser m Amount +amountpnolotpricesp = label "amount" $ do let spaces = lift $ skipNonNewlineSpaces amount <- amountwithoutpricep spaces @@ -669,7 +694,9 @@ amountwithoutpricep = do leftsymbolamountp :: Bool -> (Decimal -> Decimal) -> JournalParser m Amount leftsymbolamountp mult sign = label "amount" $ do c <- lift commoditysymbolp - suggestedStyle <- getAmountStyle c + mdecmarkStyle <- getDecimalMarkStyle + mcommodityStyle <- getAmountStyle c + let suggestedStyle = mdecmarkStyle <|> mcommodityStyle commodityspaced <- lift skipNonNewlineSpaces' sign2 <- lift $ signp offBeforeNum <- getOffset @@ -692,14 +719,18 @@ amountwithoutpricep = do case mSpaceAndCommodity of -- right symbol amount Just (commodityspaced, c) -> do - suggestedStyle <- getAmountStyle c - (q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion suggestedStyle ambiguousRawNum mExponent + mdecmarkStyle <- getDecimalMarkStyle + mcommodityStyle <- getAmountStyle c + let msuggestedStyle = mdecmarkStyle <|> mcommodityStyle + (q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion msuggestedStyle ambiguousRawNum mExponent let s = amountstyle{ascommodityside=R, ascommodityspaced=commodityspaced, asprecision=prec, asdecimalpoint=mdec, asdigitgroups=mgrps} return $ nullamt{acommodity=c, aquantity=sign q, aismultiplier=mult, astyle=s, aprice=Nothing} -- no symbol amount Nothing -> do - suggestedStyle <- getDefaultAmountStyle - (q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion suggestedStyle ambiguousRawNum mExponent + mdecmarkStyle <- getDecimalMarkStyle + mcommodityStyle <- getDefaultAmountStyle + let msuggestedStyle = mdecmarkStyle <|> mcommodityStyle + (q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion msuggestedStyle ambiguousRawNum mExponent -- if a default commodity has been set, apply it and its style to this amount -- (unless it's a multiplier in an automated posting) defcs <- getDefaultCommodityAndStyle @@ -716,8 +747,8 @@ amountwithoutpricep = do -> Either AmbiguousNumber RawNumber -> Maybe Integer -> TextParser m (Quantity, AmountPrecision, Maybe Char, Maybe DigitGroupStyle) - interpretNumber posRegion suggestedStyle ambiguousNum mExp = - let rawNum = either (disambiguateNumber suggestedStyle) id ambiguousNum + interpretNumber posRegion msuggestedStyle ambiguousNum mExp = + let rawNum = either (disambiguateNumber msuggestedStyle) id ambiguousNum in case fromRawNumber rawNum mExp of Left errMsg -> customFailure $ uncurry parseErrorAtRegion posRegion errMsg @@ -776,7 +807,7 @@ balanceassertionp = do lift skipNonNewlineSpaces -- this amount can have a price; balance assertions ignore it, -- but balance assignments will use it - a <- amountpnolotprices "amount (for a balance assertion or assignment)" + a <- amountpnolotpricesp "amount (for a balance assertion or assignment)" return BalanceAssertion { baamount = a , batotal = istotal @@ -884,13 +915,12 @@ fromRawNumber raw mExp = do (a:b:cs) | a < b -> b:cs gs -> gs - disambiguateNumber :: Maybe AmountStyle -> AmbiguousNumber -> RawNumber -disambiguateNumber suggestedStyle (AmbiguousNumber grp1 sep grp2) = +disambiguateNumber msuggestedStyle (AmbiguousNumber grp1 sep grp2) = -- If present, use the suggested style to disambiguate; -- otherwise, assume that the separator is a decimal point where possible. - if isDecimalPointChar sep && - maybe True (sep `isValidDecimalBy`) suggestedStyle + if isDecimalMark sep && + maybe True (sep `isValidDecimalBy`) msuggestedStyle then NoSeparators grp1 (Just (sep, grp2)) else WithSeparators sep [grp1, grp2] Nothing where @@ -925,7 +955,7 @@ rawnumberp = label "number" $ do rawNumber <- fmap Right leadingDecimalPt <|> leadingDigits -- Guard against mistyped numbers - mExtraDecimalSep <- optional $ lookAhead $ satisfy isDecimalPointChar + mExtraDecimalSep <- optional $ lookAhead $ satisfy isDecimalMark when (isJust mExtraDecimalSep) $ Fail.fail "invalid number (invalid use of separator)" @@ -941,7 +971,7 @@ rawnumberp = label "number" $ do leadingDecimalPt :: TextParser m RawNumber leadingDecimalPt = do - decPt <- satisfy isDecimalPointChar + decPt <- satisfy isDecimalMark decGrp <- digitgroupp pure $ NoSeparators mempty (Just (decPt, decGrp)) @@ -962,7 +992,7 @@ rawnumberp = label "number" $ do withDecimalPt :: Char -> [DigitGrp] -> TextParser m RawNumber withDecimalPt digitSep digitGroups = do - decPt <- satisfy $ \c -> isDecimalPointChar c && c /= digitSep + decPt <- satisfy $ \c -> isDecimalMark c && c /= digitSep decDigitGrp <- option mempty digitgroupp pure $ WithSeparators digitSep digitGroups (Just (decPt, decDigitGrp)) @@ -974,21 +1004,17 @@ rawnumberp = label "number" $ do -> [DigitGrp] -> Either AmbiguousNumber RawNumber withoutDecimalPt grp1 sep grp2 grps - | null grps && isDecimalPointChar sep = + | null grps && isDecimalMark sep = Left $ AmbiguousNumber grp1 sep grp2 | otherwise = Right $ WithSeparators sep (grp1:grp2:grps) Nothing trailingDecimalPt :: DigitGrp -> TextParser m RawNumber trailingDecimalPt grp1 = do - decPt <- satisfy isDecimalPointChar + decPt <- satisfy isDecimalMark pure $ NoSeparators grp1 (Just (decPt, mempty)) - -isDecimalPointChar :: Char -> Bool -isDecimalPointChar c = c == '.' || c == ',' - isDigitSeparatorChar :: Char -> Bool -isDigitSeparatorChar c = isDecimalPointChar c || c == ' ' +isDigitSeparatorChar c = isDecimalMark c || c == ' ' -- | Some kinds of number literal we might parse. data RawNumber diff --git a/hledger-lib/Hledger/Read/CsvReader.hs b/hledger-lib/Hledger/Read/CsvReader.hs index 7a7f234ef..3d78aff29 100644 --- a/hledger-lib/Hledger/Read/CsvReader.hs +++ b/hledger-lib/Hledger/Read/CsvReader.hs @@ -78,7 +78,7 @@ import Text.Printf (printf) import Hledger.Data import Hledger.Utils -import Hledger.Read.Common (Reader(..),InputOpts(..),amountp, statusp, genericSourcePos, journalFinalise) +import Hledger.Read.Common ( Reader(..),InputOpts(..), amountp, statusp, genericSourcePos, journalFinalise ) --- ** doctest setup -- $setup @@ -364,7 +364,7 @@ Grammar for the CSV conversion rules, more or less: RULES: RULE* -RULE: ( FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | NEWEST-FIRST | DATE-FORMAT | COMMENT | BLANK ) NEWLINE +RULE: ( FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | NEWEST-FIRST | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE FIELD-LIST: fields SPACE FIELD-NAME ( SPACE? , SPACE? FIELD-NAME )* @@ -462,6 +462,7 @@ directivep = (do directives :: [String] directives = ["date-format" + ,"decimal-mark" ,"separator" -- ,"default-account" -- ,"default-currency" @@ -1048,9 +1049,10 @@ getBalance rules record currency n = do parseAmount :: CsvRules -> CsvRecord -> String -> String -> MixedAmount parseAmount rules record currency s = either mkerror (Mixed . (:[])) $ -- PARTIAL: - runParser (evalStateT (amountp <* eof) nulljournal) "" $ + runParser (evalStateT (amountp <* eof) journalparsestate) "" $ T.pack $ (currency++) $ simplifySign s where + journalparsestate = nulljournal{jparsedecimalmark=parseDecimalMark rules} mkerror e = error' $ unlines ["error: could not parse \""++s++"\" as an amount" ,showRecord record @@ -1062,7 +1064,8 @@ parseAmount rules record currency s = ++"or add or change your skip rule" ] --- XXX unify these +-- XXX unify these ^v + -- | Almost but not quite the same as parseAmount. -- Given a non-empty amount string (from CSV) to parse, along with a -- possibly non-empty currency symbol to prepend, @@ -1071,10 +1074,11 @@ parseAmount rules record currency s = parseBalanceAmount :: CsvRules -> CsvRecord -> String -> Int -> String -> Amount parseBalanceAmount rules record currency n s = either (mkerror n s) id $ - runParser (evalStateT (amountp <* eof) nulljournal) "" $ + runParser (evalStateT (amountp <* eof) journalparsestate) "" $ T.pack $ (currency++) $ simplifySign s -- the csv record's line number would be good where + journalparsestate = nulljournal{jparsedecimalmark=parseDecimalMark rules} mkerror n s e = error' $ unlines ["error: could not parse \""++s++"\" as balance"++show n++" amount" ,showRecord record @@ -1083,6 +1087,15 @@ parseBalanceAmount rules record currency n s = ,"the parse error is: "++customErrorBundlePretty e ] +-- Read a valid decimal mark from the decimal-mark rule, if any. +-- If the rule is present with an invalid argument, raise an error. +parseDecimalMark :: CsvRules -> Maybe DecimalMark +parseDecimalMark rules = + case rules `csvRule` "decimal-mark" of + Nothing -> Nothing + Just [c] | isDecimalMark c -> Just c + Just s -> error' $ "decimal-mark's argument should be \".\" or \",\" (not \""++s++"\")" + -- | Make a balance assertion for the given amount, with the given parse -- position (to be shown in assertion failures), with the assertion type -- possibly set by a balance-type rule. diff --git a/hledger-lib/hledger_csv.m4.md b/hledger-lib/hledger_csv.m4.md index a8431b9fd..c36680f1c 100644 --- a/hledger-lib/hledger_csv.m4.md +++ b/hledger-lib/hledger_csv.m4.md @@ -45,7 +45,8 @@ these are described more fully below, after the examples: | [**`if` block**](#if-block) | apply some rules to CSV records matched by patterns | | [**`if` table**](#if-table) | apply some rules to CSV records matched by patterns, alternate syntax | | [**`end`**](#end) | skip the remaining CSV records | -| [**`date-format`**](#date-format) | describe the format of CSV dates | +| [**`date-format`**](#date-format) | how to parse dates in CSV records | +| [**`decimal-mark`**](#decimal-mark) | the decimal mark used in CSV amounts, if ambiguous | | [**`newest-first`**](#newest-first) | disambiguate record order when there's only one date | | [**`include`**](#include) | inline another CSV rules file | | [**`balance-type`**](#balance-type) | choose which type of balance assignments to use | @@ -716,6 +717,21 @@ For the supported strptime syntax, see:\ +## `decimal-mark` + +```rules +decimal-mark . +``` +or: +```rules +decimal-mark , +``` + +hledger automatically accepts either period or comma as a decimal mark when parsing numbers +(cf [Amounts](journal.html#amounts)). +However if any numbers in the CSV contain digit group marks, such as thousand-separating commas, +you should declare the decimal mark explicitly with this rule, to avoid misparsed numbers. + ## `newest-first` hledger always sorts the generated transactions by date. diff --git a/tests/csv.test b/tests/csv.test index 272b24a3b..c28bd6723 100644 --- a/tests/csv.test +++ b/tests/csv.test @@ -906,6 +906,46 @@ $ ./csvtest.sh >=0 +# 45. decimal-mark helps parse ambiguous decimals correctly +< +2020-01-01,"1,000" +2020-01-02,"1.000" + +RULES +fields date,amount +decimal-mark . + +$ ./csvtest.sh +2020-01-01 + expenses:unknown 1,000.000 + income:unknown -1,000.000 + +2020-01-02 + expenses:unknown 1.000 + income:unknown -1.000 + +>= + +# 46. +< +2020-01-01,"1,000" +2020-01-02,"1.000" + +RULES +fields date,amount +decimal-mark , + +$ ./csvtest.sh +2020-01-01 + expenses:unknown 1,000 + income:unknown -1,000 + +2020-01-02 + expenses:unknown 1.000,000 + income:unknown -1.000,000 + +>= + ## . #<