mirror of
https://github.com/simonmichael/hledger.git
synced 2024-09-20 02:37:12 +03:00
csv: allow single field matching; more docs and tests
This commit is contained in:
parent
becd891dd1
commit
d537f1fe07
@ -3,6 +3,10 @@
|
||||
A reader for CSV data, using an extra rules file to help interpret the data.
|
||||
|
||||
-}
|
||||
-- Lots of haddocks in this file are for non-exported types.
|
||||
-- Here's a command that will render them:
|
||||
-- stack haddock hledger-lib --fast --no-haddock-deps --haddock-arguments='--ignore-all-exports' --open
|
||||
|
||||
{-# LANGUAGE CPP #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
{-# LANGUAGE ScopedTypeVariables #-}
|
||||
@ -12,6 +16,8 @@ A reader for CSV data, using an extra rules file to help interpret the data.
|
||||
{-# LANGUAGE TypeSynonymInstances #-}
|
||||
{-# LANGUAGE FlexibleInstances #-}
|
||||
{-# LANGUAGE PackageImports #-}
|
||||
{-# LANGUAGE MultiWayIf #-}
|
||||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE RecordWildCards #-}
|
||||
|
||||
module Hledger.Read.CsvReader (
|
||||
@ -115,15 +121,19 @@ parseSeparator = specials . map toLower
|
||||
|
||||
-- | Read a Journal from the given CSV data (and filename, used for error
|
||||
-- messages), or return an error. Proceed as follows:
|
||||
-- @
|
||||
--
|
||||
-- 1. parse CSV conversion rules from the specified rules file, or from
|
||||
-- the default rules file for the specified CSV file, if it exists,
|
||||
-- or throw a parse error; if it doesn't exist, use built-in default rules
|
||||
--
|
||||
-- 2. parse the CSV data, or throw a parse error
|
||||
--
|
||||
-- 3. convert the CSV records to transactions using the rules
|
||||
--
|
||||
-- 4. if the rules file didn't exist, create it with the default rules and filename
|
||||
--
|
||||
-- 5. return the transactions as a Journal
|
||||
-- @
|
||||
--
|
||||
readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> IO (Either String Journal)
|
||||
readJournalFromCsv Nothing "-" _ = return $ Left "please use --rules-file when reading CSV from stdin"
|
||||
readJournalFromCsv mrulesfile csvfile csvdata =
|
||||
@ -379,30 +389,44 @@ convert a particular CSV data file into meaningful journal transactions.
|
||||
-}
|
||||
data CsvRules = CsvRules {
|
||||
rdirectives :: [(DirectiveName,String)],
|
||||
-- ^ top-level rules, as (keyword, value) pairs
|
||||
rcsvfieldindexes :: [(CsvFieldName, CsvFieldIndex)],
|
||||
-- ^ csv field names and their column number, if declared by a fields list
|
||||
rassignments :: [(JournalFieldName, FieldTemplate)],
|
||||
-- ^ top-level assignments to hledger fields, as (field name, value template) pairs
|
||||
rconditionalblocks :: [ConditionalBlock]
|
||||
-- ^ conditional blocks, which containing additional assignments/rules to apply to matched csv records
|
||||
} deriving (Show, Eq)
|
||||
|
||||
type CsvRulesParser a = StateT CsvRules SimpleTextParser a
|
||||
|
||||
-- | The keyword of a CSV rule - "fields", "skip", "if", etc.
|
||||
type DirectiveName = String
|
||||
-- | CSV field name.
|
||||
type CsvFieldName = String
|
||||
-- | 1-based CSV column number.
|
||||
type CsvFieldIndex = Int
|
||||
type JournalFieldName = String
|
||||
-- | Percent symbol followed by a CSV field name or column number. Eg: %date, %1.
|
||||
type CsvFieldReference = String
|
||||
-- | One of the standard hledger field names that can be assigned to.
|
||||
type JournalFieldName = String -- XXX rename to HledgerFieldName
|
||||
-- | A text value to be assigned to a hledger field, possibly
|
||||
-- containing csv field references to be interpolated.
|
||||
type FieldTemplate = String
|
||||
-- | A strptime date parsing pattern, as supported by Data.Time.Format.
|
||||
type DateFormat = String
|
||||
-- | A regular expression.
|
||||
type RegexpPattern = String
|
||||
|
||||
-- | A single test for matching a CSV record, in one way or another.
|
||||
data Matcher =
|
||||
RecordMatcher RegexpPattern -- ^ match if this regexp matches the overall CSV record
|
||||
-- | FieldMatcher CsvFieldName RegexpPattern -- ^ match if this regexp matches the named CSV field
|
||||
| FieldMatcher CsvFieldReference RegexpPattern -- ^ match if this regexp matches the referenced CSV field's value
|
||||
deriving (Show, Eq)
|
||||
|
||||
-- | A conditional block: a set of CSV record matchers, and a sequence
|
||||
-- of zero or more rules which will be enabled only when one or of the
|
||||
-- matchers succeeds.
|
||||
-- of rules which will be enabled only if one or more of the matchers
|
||||
-- succeeds.
|
||||
--
|
||||
-- Three types of rule are allowed inside conditional blocks: field
|
||||
-- assignments, skip, end. (A skip or end rule is stored as if it was
|
||||
@ -634,6 +658,7 @@ fieldvalp = do
|
||||
lift $ dbgparse 2 "trying fieldvalp"
|
||||
anySingle `manyTill` lift eolof
|
||||
|
||||
-- A conditional block: one or more matchers, one per line, followed by one or more indented rules.
|
||||
conditionalblockp :: CsvRulesParser ConditionalBlock
|
||||
conditionalblockp = do
|
||||
lift $ dbgparse 3 "trying conditionalblockp"
|
||||
@ -645,10 +670,14 @@ conditionalblockp = do
|
||||
return $ CB{cbMatchers=ms, cbAssignments=as}
|
||||
<?> "conditional block"
|
||||
|
||||
-- A single matcher, on one line
|
||||
-- XXX Currently only parses a RecordMatcher
|
||||
-- A single matcher, on one line.
|
||||
matcherp :: CsvRulesParser Matcher
|
||||
matcherp = do
|
||||
matcherp = try fieldmatcherp <|> recordmatcherp
|
||||
|
||||
-- A single whole-record matcher.
|
||||
-- A pattern on the whole line, not containing any of the match operators (~).
|
||||
recordmatcherp :: CsvRulesParser Matcher
|
||||
recordmatcherp = do
|
||||
lift $ dbgparse 2 "trying matcherp"
|
||||
-- pos <- currentPos
|
||||
_ <- optional (matchoperatorp >> lift (skipMany spacenonewline) >> optional newline)
|
||||
@ -658,8 +687,35 @@ matcherp = do
|
||||
return $ RecordMatcher r
|
||||
<?> "record matcher"
|
||||
|
||||
-- An operator indicating the type of match
|
||||
-- XXX currently only ~ (regex), unused
|
||||
-- | A single matcher for a specific field. A csv field reference (like %date or %1),
|
||||
-- a match operator (~), and a pattern on the rest of the line, optionally
|
||||
-- space-separated. Eg:
|
||||
-- %description ~ chez jacques
|
||||
fieldmatcherp :: CsvRulesParser Matcher
|
||||
fieldmatcherp = do
|
||||
lift $ dbgparse 2 "trying fieldmatcher"
|
||||
-- An optional fieldname (default: "all")
|
||||
-- f <- fromMaybe "all" `fmap` (optional $ do
|
||||
-- f' <- fieldnamep
|
||||
-- lift (skipMany spacenonewline)
|
||||
-- return f')
|
||||
f <- csvfieldreferencep <* lift (skipMany spacenonewline)
|
||||
-- optional operator.. just ~ (case insensitive infix regex) for now
|
||||
_op <- fromMaybe "~" <$> optional matchoperatorp
|
||||
lift (skipMany spacenonewline)
|
||||
r <- regexp
|
||||
return $ FieldMatcher f r
|
||||
<?> "field matcher"
|
||||
|
||||
csvfieldreferencep :: CsvRulesParser CsvFieldReference
|
||||
csvfieldreferencep = do
|
||||
lift $ dbgparse 3 "trying csvfieldreferencep"
|
||||
char '%'
|
||||
f <- fieldnamep
|
||||
return $ '%' : quoteIfNeeded f
|
||||
|
||||
-- A match operator, indicating the type of match to perform.
|
||||
-- Currently just ~ meaning case insensitive infix regex match.
|
||||
matchoperatorp :: CsvRulesParser String
|
||||
matchoperatorp = fmap T.unpack $ choiceInState $ map string
|
||||
["~"
|
||||
@ -677,19 +733,6 @@ regexp = do
|
||||
cs <- anySingle `manyTill` lift eolof
|
||||
return $ strip $ c:cs
|
||||
|
||||
-- fieldmatcher = do
|
||||
-- dbgparse 2 "trying fieldmatcher"
|
||||
-- f <- fromMaybe "all" `fmap` (optional $ do
|
||||
-- f' <- fieldname
|
||||
-- lift (skipMany spacenonewline)
|
||||
-- return f')
|
||||
-- char '~'
|
||||
-- lift (skipMany spacenonewline)
|
||||
-- ps <- patterns
|
||||
-- let r = "(" ++ intercalate "|" ps ++ ")"
|
||||
-- return (f,r)
|
||||
-- <?> "field matcher"
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
-- Converting CSV records to journal transactions
|
||||
|
||||
@ -962,40 +1005,64 @@ showRecord :: CsvRecord -> String
|
||||
showRecord r = "the CSV record is: "++intercalate "," (map show r)
|
||||
|
||||
-- | Given the conversion rules, a CSV record and a journal entry field name, find
|
||||
-- the template value ultimately assigned to this field, either at top
|
||||
-- level or in a matching conditional block. Conditional blocks'
|
||||
-- patterns are matched against an approximation of the original CSV
|
||||
-- record: all the field values with commas intercalated.
|
||||
-- the template value ultimately assigned to this field, if any,
|
||||
-- by a field assignment at top level or in a conditional block matching this record.
|
||||
--
|
||||
-- Note conditional blocks' patterns are matched against an approximation of the
|
||||
-- CSV record: all the field values, without enclosing quotes, comma-separated.
|
||||
--
|
||||
getEffectiveAssignment :: CsvRules -> CsvRecord -> JournalFieldName -> Maybe FieldTemplate
|
||||
getEffectiveAssignment rules record f = lastMay $ assignmentsFor f
|
||||
getEffectiveAssignment rules record f = lastMay $ map snd $ assignments
|
||||
where
|
||||
assignmentsFor f = map snd $ filter ((==f).fst) $ toplevelassignments ++ conditionalassignments
|
||||
-- all active assignments to field f, in order
|
||||
assignments = dbg2 "assignments" $ filter ((==f).fst) $ toplevelassignments ++ conditionalassignments
|
||||
where
|
||||
-- all top level field assignments
|
||||
toplevelassignments = rassignments rules
|
||||
conditionalassignments = concatMap cbAssignments $ filter blockMatches $ blocksAssigning f
|
||||
-- all field assignments in conditional blocks assigning to field f and active for the current csv record
|
||||
conditionalassignments = concatMap cbAssignments $ filter isBlockActive $ blocksAssigning f
|
||||
where
|
||||
-- all conditional blocks which can potentially assign field f
|
||||
blocksAssigning f = filter (any ((==f).fst) . cbAssignments) $ rconditionalblocks rules
|
||||
blockMatches :: ConditionalBlock -> Bool
|
||||
blockMatches CB{..} = any matcherMatches cbMatchers
|
||||
-- does this conditional block match the current csv record ?
|
||||
isBlockActive :: ConditionalBlock -> Bool
|
||||
isBlockActive CB{..} = any matcherMatches cbMatchers
|
||||
where
|
||||
-- does this individual matcher match the current csv record ?
|
||||
matcherMatches :: Matcher -> Bool
|
||||
matcherMatches (RecordMatcher pat) = regexMatchesCI pat csvline
|
||||
matcherMatches (RecordMatcher pat) = regexMatchesCI pat wholecsvline
|
||||
where
|
||||
csvline = intercalate "," record
|
||||
-- matcherMatches (FieldMatcher field pat) = undefined
|
||||
-- a synthetic whole CSV record to match against; note, it has
|
||||
-- no quotes enclosing fields, and is always comma-separated,
|
||||
-- so may differ from the actual record, and may not be valid CSV.
|
||||
wholecsvline = dbg3 "wholecsvline" $ intercalate "," record
|
||||
matcherMatches (FieldMatcher csvfieldref pat) = regexMatchesCI pat csvfieldvalue
|
||||
where
|
||||
-- the value of the referenced CSV field to match against.
|
||||
csvfieldvalue = dbg3 "csvfieldvalue" $ replaceCsvFieldReference rules record csvfieldref
|
||||
|
||||
-- | Render a field assigment's template, possibly interpolating referenced
|
||||
-- CSV field values. Outer whitespace is removed from interpolated values.
|
||||
renderTemplate :: CsvRules -> CsvRecord -> FieldTemplate -> String
|
||||
renderTemplate rules record t = regexReplaceBy "%[A-z0-9_-]+" replace t
|
||||
where
|
||||
replace ('%':pat) = maybe pat (\i -> strip $ atDef "" record (i-1)) mindex
|
||||
where
|
||||
mindex | all isDigit pat = readMay pat
|
||||
| otherwise = lookup (map toLower pat) $ rcsvfieldindexes rules
|
||||
replace pat = pat
|
||||
renderTemplate rules record t = regexReplaceBy "%[A-z0-9_-]+" (replaceCsvFieldReference rules record) t
|
||||
|
||||
-- Parse the date string using the specified date-format, or if unspecified try these default formats:
|
||||
-- | Replace something that looks like a reference to a csv field ("%date" or "%1)
|
||||
-- with that field's value. If it doesn't look like a field reference, or if we
|
||||
-- can't find such a field, leave it unchanged.
|
||||
replaceCsvFieldReference :: CsvRules -> CsvRecord -> CsvFieldReference -> String
|
||||
replaceCsvFieldReference rules record s@('%':fieldname) = fromMaybe s $ csvFieldValue rules record fieldname
|
||||
replaceCsvFieldReference _ _ s = s
|
||||
|
||||
-- | Get the (whitespace-stripped) value of a CSV field, identified by its name or
|
||||
-- column number, ("date" or "1"), from the given CSV record, if such a field exists.
|
||||
csvFieldValue :: CsvRules -> CsvRecord -> CsvFieldName -> Maybe String
|
||||
csvFieldValue rules record fieldname = do
|
||||
fieldindex <- if | all isDigit fieldname -> readMay fieldname
|
||||
| otherwise -> lookup (map toLower fieldname) $ rcsvfieldindexes rules
|
||||
fieldvalue <- strip <$> atMay record (fieldindex-1)
|
||||
return fieldvalue
|
||||
|
||||
-- | Parse the date string using the specified date-format, or if unspecified try these default formats:
|
||||
-- YYYY/MM/DD, YYYY-MM-DD, YYYY.MM.DD, MM/DD/YYYY (month and day can be 1 or 2 digits, year must be 4).
|
||||
parseDateWithFormatOrDefaultFormats :: Maybe DateFormat -> String -> Maybe Day
|
||||
parseDateWithFormatOrDefaultFormats mformat s = firstJust $ map parsewith formats
|
||||
@ -1025,26 +1092,60 @@ parseDateWithFormatOrDefaultFormats mformat s = firstJust $ map parsewith format
|
||||
|
||||
tests_CsvReader = tests "CsvReader" [
|
||||
tests "parseCsvRules" [
|
||||
test"empty file" $
|
||||
test "empty file" $
|
||||
parseCsvRules "unknown" "" @?= Right defrules
|
||||
]
|
||||
]
|
||||
,tests "rulesp" [
|
||||
test"trailing comments" $
|
||||
test "trailing comments" $
|
||||
parseWithState' defrules rulesp "skip\n# \n#\n" @?= Right defrules{rdirectives = [("skip","")]}
|
||||
|
||||
,test"trailing blank lines" $
|
||||
,test "trailing blank lines" $
|
||||
parseWithState' defrules rulesp "skip\n\n \n" @?= (Right defrules{rdirectives = [("skip","")]})
|
||||
|
||||
,test"no final newline" $
|
||||
,test "no final newline" $
|
||||
parseWithState' defrules rulesp "skip" @?= (Right defrules{rdirectives=[("skip","")]})
|
||||
|
||||
,test"assignment with empty value" $
|
||||
,test "assignment with empty value" $
|
||||
parseWithState' defrules rulesp "account1 \nif foo\n account2 foo\n" @?=
|
||||
(Right defrules{rassignments = [("account1","")], rconditionalblocks = [CB{cbMatchers=[RecordMatcher "foo"],cbAssignments=[("account2","foo")]}]})
|
||||
]
|
||||
]
|
||||
,tests "conditionalblockp" [
|
||||
test"space after conditional" $ -- #1120
|
||||
test "space after conditional" $ -- #1120
|
||||
parseWithState' defrules conditionalblockp "if a\n account2 b\n \n" @?=
|
||||
(Right $ CB{cbMatchers=[RecordMatcher "a"],cbAssignments=[("account2","b")]})
|
||||
|
||||
,tests "csvfieldreferencep" [
|
||||
test "number" $ parseWithState' defrules csvfieldreferencep "%1" @?= (Right "%1")
|
||||
,test "name" $ parseWithState' defrules csvfieldreferencep "%date" @?= (Right "%date")
|
||||
,test "quoted name" $ parseWithState' defrules csvfieldreferencep "%\"csv date\"" @?= (Right "%\"csv date\"")
|
||||
]
|
||||
|
||||
,tests "matcherp" [
|
||||
|
||||
test "recordmatcherp" $
|
||||
parseWithState' defrules matcherp "A A\n" @?= (Right $ RecordMatcher "A A")
|
||||
|
||||
,test "fieldmatcherp.starts-with-%" $
|
||||
parseWithState' defrules matcherp "description A A\n" @?= (Right $ RecordMatcher "description A A")
|
||||
|
||||
,test "fieldmatcherp" $
|
||||
parseWithState' defrules matcherp "%description A A\n" @?= (Right $ FieldMatcher "%description" "A A")
|
||||
|
||||
,test "fieldmatcherp with operator" $
|
||||
parseWithState' defrules matcherp "%description ~ A A\n" @?= (Right $ FieldMatcher "%description" "A A")
|
||||
|
||||
]
|
||||
|
||||
,tests "getEffectiveAssignment" [
|
||||
let rules = defrules{rcsvfieldindexes=[("csvdate",1)],rassignments=[("date","%csvdate")]}
|
||||
|
||||
in test "toplevel" $ getEffectiveAssignment rules ["a","b"] "date" @?= (Just "%csvdate")
|
||||
|
||||
,let rules = defrules{rcsvfieldindexes=[("csvdate",1)], rconditionalblocks=[CB [FieldMatcher "%csvdate" "a"] [("date","%csvdate")]]}
|
||||
in test "conditional" $ getEffectiveAssignment rules ["a","b"] "date" @?= (Just "%csvdate")
|
||||
|
||||
]
|
||||
|
||||
]
|
||||
]
|
||||
|
||||
]
|
||||
|
@ -495,13 +495,13 @@ See also: [File Extension](#file-extension).
|
||||
## `if`
|
||||
|
||||
```rules
|
||||
if PATTERN
|
||||
if MATCHER
|
||||
RULE
|
||||
|
||||
if
|
||||
PATTERN
|
||||
PATTERN
|
||||
PATTERN
|
||||
MATCHER
|
||||
MATCHER
|
||||
MATCHER
|
||||
RULE
|
||||
RULE
|
||||
```
|
||||
@ -510,22 +510,29 @@ Conditional blocks ("if blocks") are a block of rules that are applied
|
||||
only to CSV records which match certain patterns. They are often used
|
||||
for customising account names based on transaction descriptions.
|
||||
|
||||
A single pattern can be written on the same line as the "if";
|
||||
or multiple patterns can be written on the following lines, non-indented.
|
||||
Multiple patterns are OR'd (any one of them can match).
|
||||
Patterns are case-insensitive regular expressions
|
||||
which try to match anywhere within the whole CSV record
|
||||
(POSIX extended regular expressions with some additions, see https://hledger.org/hledger.html#regular-expressions).
|
||||
Note the CSV record they see is close to, but not identical to, the one in the CSV file;
|
||||
enclosing double quotes will be removed, and the separator character is always comma.
|
||||
|
||||
It's not yet easy to match within a specific field.
|
||||
If the data does not contain commas, you can hack it with a regular expression like:
|
||||
Each MATCHER can be a record matcher, which looks like this:
|
||||
```rules
|
||||
# match "foo" in the fourth field
|
||||
if ^([^,]*,){3}foo
|
||||
REGEX
|
||||
```
|
||||
|
||||
REGEX is a case-insensitive regular expression which tries to match anywhere within the CSV record.
|
||||
It is a POSIX extended regular expressions with some additions (see
|
||||
[Regular expressions](https://hledger.org/hledger.html#regular-expressions) in the hledger manual).
|
||||
Note: the "CSV record" it is matched against is not the original record, but a synthetic one,
|
||||
with enclosing double quotes or whitespace removed, and always comma-separated.
|
||||
(Eg, an SSV record `2020-01-01; "Acme, Inc."; 1,000` appears to REGEX as `2020-01-01,Acme, Inc.,1,000`).
|
||||
|
||||
Or, MATCHER can be a field matcher, like this:
|
||||
```rules
|
||||
%CSVFIELD REGEX
|
||||
```
|
||||
which matches just the content of a particular CSV field.
|
||||
CSVFIELD is a percent sign followed by the field's name or column number, like `%date` or `%1`.
|
||||
|
||||
A single matcher can be written on the same line as the "if";
|
||||
or multiple matchers can be written on the following lines, non-indented.
|
||||
Multiple matchers are OR'd (any one of them can match).
|
||||
|
||||
After the patterns there should be one or more rules to apply, all
|
||||
indented by at least one space. Three kinds of rule are allowed in
|
||||
conditional blocks:
|
||||
|
@ -560,8 +560,26 @@ $ ./hledger-csv
|
||||
|
||||
>=0
|
||||
|
||||
# 27. match a specific field
|
||||
<
|
||||
2020-01-01, 1
|
||||
2020-01-01, 2
|
||||
RULES
|
||||
fields date, desc
|
||||
if %desc 1
|
||||
description one
|
||||
|
||||
## 26. A single unbalanced posting with number other than 1 also should not generate a balancing posting.
|
||||
$ ./hledger-csv desc:one
|
||||
2020-01-01 one
|
||||
|
||||
>=0
|
||||
|
||||
## .
|
||||
#<
|
||||
#$ ./hledger-csv
|
||||
#>=0
|
||||
|
||||
## . A single unbalanced posting with number other than 1 also should not generate a balancing posting.
|
||||
#<
|
||||
#2019-01-01,1
|
||||
#
|
||||
@ -575,7 +593,7 @@ $ ./hledger-csv
|
||||
#
|
||||
#>=0
|
||||
#
|
||||
## 27. A single posting that's zero also should not generate a balancing posting.
|
||||
## . A single posting that's zero also should not generate a balancing posting.
|
||||
#<
|
||||
#2019-01-01,0
|
||||
#
|
||||
@ -589,7 +607,7 @@ $ ./hledger-csv
|
||||
#
|
||||
#>=0
|
||||
|
||||
## 28. With a bracketed account name, the auto-generated second posting should also be bracketed.
|
||||
## . With a bracketed account name, the auto-generated second posting should also be bracketed.
|
||||
#<
|
||||
#2019-01-01,1
|
||||
#
|
||||
@ -604,8 +622,6 @@ $ ./hledger-csv
|
||||
#
|
||||
#>=0
|
||||
|
||||
|
||||
|
||||
# . TODO: without --separator gives obscure error
|
||||
# |
|
||||
# 1 | 10/2009/09;Flubber Co🎅;50;
|
||||
|
Loading…
Reference in New Issue
Block a user