csv: allow single field matching; more docs and tests

2024-09-20 02:37:12 +03:00 · 2020-02-25 17:54:16 -08:00 · 2020-02-25 17:54:16 -08:00 · d537f1fe07
commit d537f1fe07
parent becd891dd1
3 changed files with 199 additions and 75 deletions
--- a/hledger-lib/Hledger/Read/CsvReader.hs
+++ b/hledger-lib/Hledger/Read/CsvReader.hs
@ -3,6 +3,10 @@
 A reader for CSV data, using an extra rules file to help interpret the data.

 -}
+-- Lots of haddocks in this file are for non-exported types.
+-- Here's a command that will render them:
+-- stack haddock hledger-lib --fast --no-haddock-deps --haddock-arguments='--ignore-all-exports' --open
+
 {-# LANGUAGE CPP #-}
 {-# LANGUAGE FlexibleContexts #-}
 {-# LANGUAGE ScopedTypeVariables #-}
@ -12,6 +16,8 @@ A reader for CSV data, using an extra rules file to help interpret the data.
 {-# LANGUAGE TypeSynonymInstances #-}
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE PackageImports #-}
+{-# LANGUAGE MultiWayIf #-}
+{-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE RecordWildCards #-}

 module Hledger.Read.CsvReader (
@ -115,15 +121,19 @@ parseSeparator = specials . map toLower

 -- | Read a Journal from the given CSV data (and filename, used for error
 -- messages), or return an error. Proceed as follows:
-- @
+--
 -- 1. parse CSV conversion rules from the specified rules file, or from
 --    the default rules file for the specified CSV file, if it exists,
 --    or throw a parse error; if it doesn't exist, use built-in default rules
+--
 -- 2. parse the CSV data, or throw a parse error
+--
 -- 3. convert the CSV records to transactions using the rules
+--
 -- 4. if the rules file didn't exist, create it with the default rules and filename
+--
 -- 5. return the transactions as a Journal
-- @
+-- 
 readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> IO (Either String Journal)
 readJournalFromCsv Nothing "-" _ = return $ Left "please use --rules-file when reading CSV from stdin"
 readJournalFromCsv mrulesfile csvfile csvdata =
@ -379,30 +389,44 @@ convert a particular CSV data file into meaningful journal transactions.
 -}
 data CsvRules = CsvRules {
  rdirectives        :: [(DirectiveName,String)],
+    -- ^ top-level rules, as (keyword, value) pairs
  rcsvfieldindexes   :: [(CsvFieldName, CsvFieldIndex)],
+    -- ^ csv field names and their column number, if declared by a fields list
  rassignments       :: [(JournalFieldName, FieldTemplate)],
+    -- ^ top-level assignments to hledger fields, as (field name, value template) pairs
  rconditionalblocks :: [ConditionalBlock]
+    -- ^ conditional blocks, which containing additional assignments/rules to apply to matched csv records
 } deriving (Show, Eq)

 type CsvRulesParser a = StateT CsvRules SimpleTextParser a

+-- | The keyword of a CSV rule - "fields", "skip", "if", etc.
 type DirectiveName    = String
+-- | CSV field name.
 type CsvFieldName     = String
+-- | 1-based CSV column number.
 type CsvFieldIndex    = Int
-type JournalFieldName = String
+-- | Percent symbol followed by a CSV field name or column number. Eg: %date, %1.
+type CsvFieldReference = String
+-- | One of the standard hledger field names that can be assigned to.
+type JournalFieldName = String  -- XXX rename to HledgerFieldName
+-- | A text value to be assigned to a hledger field, possibly
+-- containing csv field references to be interpolated.
 type FieldTemplate    = String
+-- | A strptime date parsing pattern, as supported by Data.Time.Format.
 type DateFormat       = String
+-- | A regular expression.
 type RegexpPattern    = String

 -- | A single test for matching a CSV record, in one way or another.
 data Matcher =
    RecordMatcher RegexpPattern              -- ^ match if this regexp matches the overall CSV record
-  -- | FieldMatcher CsvFieldName RegexpPattern  -- ^ match if this regexp matches the named CSV field
+  | FieldMatcher CsvFieldReference RegexpPattern  -- ^ match if this regexp matches the referenced CSV field's value
  deriving (Show, Eq)

 -- | A conditional block: a set of CSV record matchers, and a sequence
-- of zero or more rules which will be enabled only when one or of the
-- matchers succeeds.
+-- of rules which will be enabled only if one or more of the matchers
+-- succeeds.
 --
 -- Three types of rule are allowed inside conditional blocks: field
 -- assignments, skip, end. (A skip or end rule is stored as if it was
@ -634,6 +658,7 @@ fieldvalp = do
  lift $ dbgparse 2 "trying fieldvalp"
  anySingle `manyTill` lift eolof

+-- A conditional block: one or more matchers, one per line, followed by one or more indented rules.
 conditionalblockp :: CsvRulesParser ConditionalBlock
 conditionalblockp = do
  lift $ dbgparse 3 "trying conditionalblockp"
@ -645,10 +670,14 @@ conditionalblockp = do
  return $ CB{cbMatchers=ms, cbAssignments=as}
  <?> "conditional block"

-- A single matcher, on one line
-- XXX Currently only parses a RecordMatcher
+-- A single matcher, on one line.
 matcherp :: CsvRulesParser Matcher
-matcherp = do
+matcherp = try fieldmatcherp <|> recordmatcherp
+
+-- A single whole-record matcher.
+-- A pattern on the whole line, not containing any of the match operators (~).
+recordmatcherp :: CsvRulesParser Matcher
+recordmatcherp = do
  lift $ dbgparse 2 "trying matcherp"
  -- pos <- currentPos
  _  <- optional (matchoperatorp >> lift (skipMany spacenonewline) >> optional newline)
@ -658,8 +687,35 @@ matcherp = do
  return $ RecordMatcher r
  <?> "record matcher"

-- An operator indicating the type of match
-- XXX currently only ~ (regex), unused
+-- | A single matcher for a specific field. A csv field reference (like %date or %1),
+-- a match operator (~), and a pattern on the rest of the line, optionally
+-- space-separated. Eg:
+-- %description ~ chez jacques
+fieldmatcherp :: CsvRulesParser Matcher
+fieldmatcherp = do
+  lift $ dbgparse 2 "trying fieldmatcher"
+  -- An optional fieldname (default: "all")
+  -- f <- fromMaybe "all" `fmap` (optional $ do
+  --        f' <- fieldnamep
+  --        lift (skipMany spacenonewline)
+  --        return f')
+  f <- csvfieldreferencep <* lift (skipMany spacenonewline)
+  -- optional operator.. just ~ (case insensitive infix regex) for now
+  _op <- fromMaybe "~" <$> optional matchoperatorp
+  lift (skipMany spacenonewline)
+  r <- regexp
+  return $ FieldMatcher f r
+  <?> "field matcher"
+
+csvfieldreferencep :: CsvRulesParser CsvFieldReference
+csvfieldreferencep = do
+  lift $ dbgparse 3 "trying csvfieldreferencep"
+  char '%'
+  f <- fieldnamep
+  return $ '%' : quoteIfNeeded f
+
+-- A match operator, indicating the type of match to perform.
+-- Currently just ~ meaning case insensitive infix regex match.
 matchoperatorp :: CsvRulesParser String
 matchoperatorp = fmap T.unpack $ choiceInState $ map string
  ["~"
@ -677,19 +733,6 @@ regexp = do
  cs <- anySingle `manyTill` lift eolof
  return $ strip $ c:cs

-- fieldmatcher = do
--   dbgparse 2 "trying fieldmatcher"
--   f <- fromMaybe "all" `fmap` (optional $ do
--          f' <- fieldname
--          lift (skipMany spacenonewline)
--          return f')
--   char '~'
--   lift (skipMany spacenonewline)
--   ps <- patterns
--   let r = "(" ++ intercalate "|" ps ++ ")"
--   return (f,r)
--   <?> "field matcher"
-
 --------------------------------------------------------------------------------
 -- Converting CSV records to journal transactions

@ -962,40 +1005,64 @@ showRecord :: CsvRecord -> String
 showRecord r = "the CSV record is:       "++intercalate "," (map show r)

 -- | Given the conversion rules, a CSV record and a journal entry field name, find
-- the template value ultimately assigned to this field, either at top
-- level or in a matching conditional block.  Conditional blocks'
-- patterns are matched against an approximation of the original CSV
-- record: all the field values with commas intercalated.
+-- the template value ultimately assigned to this field, if any,
+-- by a field assignment at top level or in a conditional block matching this record.
+--
+-- Note conditional blocks' patterns are matched against an approximation of the
+-- CSV record: all the field values, without enclosing quotes, comma-separated.
+--
 getEffectiveAssignment :: CsvRules -> CsvRecord -> JournalFieldName -> Maybe FieldTemplate
-getEffectiveAssignment rules record f = lastMay $ assignmentsFor f
+getEffectiveAssignment rules record f = lastMay $ map snd $ assignments
  where
-    assignmentsFor f = map snd $ filter ((==f).fst) $ toplevelassignments ++ conditionalassignments
+    -- all active assignments to field f, in order
+    assignments = dbg2 "assignments" $ filter ((==f).fst) $ toplevelassignments ++ conditionalassignments
      where
+        -- all top level field assignments
        toplevelassignments    = rassignments rules
-        conditionalassignments = concatMap cbAssignments $ filter blockMatches $ blocksAssigning f
+        -- all field assignments in conditional blocks assigning to field f and active for the current csv record
+        conditionalassignments = concatMap cbAssignments $ filter isBlockActive $ blocksAssigning f
          where
+            -- all conditional blocks which can potentially assign field f
            blocksAssigning f = filter (any ((==f).fst) . cbAssignments) $ rconditionalblocks rules
-            blockMatches :: ConditionalBlock -> Bool
-            blockMatches CB{..} = any matcherMatches cbMatchers
+            -- does this conditional block match the current csv record ?
+            isBlockActive :: ConditionalBlock -> Bool
+            isBlockActive CB{..} = any matcherMatches cbMatchers
              where
+                -- does this individual matcher match the current csv record ?
                matcherMatches :: Matcher -> Bool
-                matcherMatches (RecordMatcher pat) = regexMatchesCI pat csvline
+                matcherMatches (RecordMatcher pat) = regexMatchesCI pat wholecsvline
                  where
-                    csvline = intercalate "," record
-                -- matcherMatches (FieldMatcher field pat) = undefined
+                    -- a synthetic whole CSV record to match against; note, it has
+                    -- no quotes enclosing fields, and is always comma-separated,
+                    -- so may differ from the actual record, and may not be valid CSV.
+                    wholecsvline = dbg3 "wholecsvline" $ intercalate "," record
+                matcherMatches (FieldMatcher csvfieldref pat) = regexMatchesCI pat csvfieldvalue
+                  where
+                    -- the value of the referenced CSV field to match against.
+                    csvfieldvalue = dbg3 "csvfieldvalue" $ replaceCsvFieldReference rules record csvfieldref

 -- | Render a field assigment's template, possibly interpolating referenced
 -- CSV field values. Outer whitespace is removed from interpolated values.
 renderTemplate ::  CsvRules -> CsvRecord -> FieldTemplate -> String
-renderTemplate rules record t = regexReplaceBy "%[A-z0-9_-]+" replace t
-  where
-    replace ('%':pat) = maybe pat (\i -> strip $ atDef "" record (i-1)) mindex
-      where
-        mindex | all isDigit pat = readMay pat
-               | otherwise       = lookup (map toLower pat) $ rcsvfieldindexes rules
-    replace pat       = pat
+renderTemplate rules record t = regexReplaceBy "%[A-z0-9_-]+" (replaceCsvFieldReference rules record) t

-- Parse the date string using the specified date-format, or if unspecified try these default formats:
+-- | Replace something that looks like a reference to a csv field ("%date" or "%1)
+-- with that field's value. If it doesn't look like a field reference, or if we
+-- can't find such a field, leave it unchanged.
+replaceCsvFieldReference :: CsvRules -> CsvRecord -> CsvFieldReference -> String
+replaceCsvFieldReference rules record s@('%':fieldname) = fromMaybe s $ csvFieldValue rules record fieldname
+replaceCsvFieldReference _ _ s = s
+
+-- | Get the (whitespace-stripped) value of a CSV field, identified by its name or
+-- column number, ("date" or "1"), from the given CSV record, if such a field exists.
+csvFieldValue :: CsvRules -> CsvRecord -> CsvFieldName -> Maybe String
+csvFieldValue rules record fieldname = do
+  fieldindex <- if | all isDigit fieldname -> readMay fieldname
+                   | otherwise             -> lookup (map toLower fieldname) $ rcsvfieldindexes rules
+  fieldvalue <- strip <$> atMay record (fieldindex-1)
+  return fieldvalue
+
+-- | Parse the date string using the specified date-format, or if unspecified try these default formats:
 -- YYYY/MM/DD, YYYY-MM-DD, YYYY.MM.DD, MM/DD/YYYY (month and day can be 1 or 2 digits, year must be 4).
 parseDateWithFormatOrDefaultFormats :: Maybe DateFormat -> String -> Maybe Day
 parseDateWithFormatOrDefaultFormats mformat s = firstJust $ map parsewith formats
@ -1025,26 +1092,60 @@ parseDateWithFormatOrDefaultFormats mformat s = firstJust $ map parsewith format

 tests_CsvReader = tests "CsvReader" [
   tests "parseCsvRules" [
-     test"empty file" $
+     test "empty file" $
      parseCsvRules "unknown" "" @?= Right defrules
-    ]
+   ]
  ,tests "rulesp" [
-     test"trailing comments" $
+     test "trailing comments" $
      parseWithState' defrules rulesp "skip\n# \n#\n" @?= Right defrules{rdirectives = [("skip","")]}

-    ,test"trailing blank lines" $
+    ,test "trailing blank lines" $
      parseWithState' defrules rulesp "skip\n\n  \n" @?= (Right defrules{rdirectives = [("skip","")]})

-    ,test"no final newline" $
+    ,test "no final newline" $
      parseWithState' defrules rulesp "skip" @?= (Right defrules{rdirectives=[("skip","")]})

-    ,test"assignment with empty value" $
+    ,test "assignment with empty value" $
      parseWithState' defrules rulesp "account1 \nif foo\n  account2 foo\n" @?=
        (Right defrules{rassignments = [("account1","")], rconditionalblocks = [CB{cbMatchers=[RecordMatcher "foo"],cbAssignments=[("account2","foo")]}]})
-  ]
+   ]
  ,tests "conditionalblockp" [
-    test"space after conditional" $ -- #1120
+    test "space after conditional" $ -- #1120
      parseWithState' defrules conditionalblockp "if a\n account2 b\n \n" @?=
        (Right $ CB{cbMatchers=[RecordMatcher "a"],cbAssignments=[("account2","b")]})
+
+  ,tests "csvfieldreferencep" [
+    test "number" $ parseWithState' defrules csvfieldreferencep "%1" @?= (Right "%1")
+   ,test "name" $ parseWithState' defrules csvfieldreferencep "%date" @?= (Right "%date")
+   ,test "quoted name" $ parseWithState' defrules csvfieldreferencep "%\"csv date\"" @?= (Right "%\"csv date\"")
+   ]
+
+  ,tests "matcherp" [
+
+    test "recordmatcherp" $
+      parseWithState' defrules matcherp "A A\n" @?= (Right $ RecordMatcher "A A")
+
+   ,test "fieldmatcherp.starts-with-%" $
+      parseWithState' defrules matcherp "description A A\n" @?= (Right $ RecordMatcher "description A A")
+
+   ,test "fieldmatcherp" $
+      parseWithState' defrules matcherp "%description A A\n" @?= (Right $ FieldMatcher "%description" "A A")
+
+   ,test "fieldmatcherp with operator" $
+      parseWithState' defrules matcherp "%description ~ A A\n" @?= (Right $ FieldMatcher "%description" "A A")
+
+   ]
+
+  ,tests "getEffectiveAssignment" [
+    let rules = defrules{rcsvfieldindexes=[("csvdate",1)],rassignments=[("date","%csvdate")]}
+    
+    in test "toplevel" $ getEffectiveAssignment rules ["a","b"] "date" @?= (Just "%csvdate")
+
+   ,let rules = defrules{rcsvfieldindexes=[("csvdate",1)], rconditionalblocks=[CB [FieldMatcher "%csvdate" "a"] [("date","%csvdate")]]}
+    in test "conditional" $ getEffectiveAssignment rules ["a","b"] "date" @?= (Just "%csvdate")
+       
+   ]
+
  ]
-  ]
+
+ ]
--- a/hledger-lib/hledger_csv.m4.md
+++ b/hledger-lib/hledger_csv.m4.md
@ -495,13 +495,13 @@ See also: [File Extension](#file-extension).
 ## `if`

 ```rules
-if PATTERN
+if MATCHER
 RULE

 if
-PATTERN
-PATTERN
-PATTERN
+MATCHER
+MATCHER
+MATCHER
 RULE
 RULE
 ```
@ -510,22 +510,29 @@ Conditional blocks ("if blocks") are a block of rules that are applied
 only to CSV records which match certain patterns. They are often used
 for customising account names based on transaction descriptions.

-A single pattern can be written on the same line as the "if";
-or multiple patterns can be written on the following lines, non-indented.
-Multiple patterns are OR'd (any one of them can match).
-Patterns are case-insensitive regular expressions
-which try to match anywhere within the whole CSV record
-(POSIX extended regular expressions with some additions, see https://hledger.org/hledger.html#regular-expressions).
-Note the CSV record they see is close to, but not identical to, the one in the CSV file;
-enclosing double quotes will be removed, and the separator character is always comma.
-
-It's not yet easy to match within a specific field.
-If the data does not contain commas, you can hack it with a regular expression like:
+Each MATCHER can be a record matcher, which looks like this:
 ```rules
-# match "foo" in the fourth field
-if ^([^,]*,){3}foo
+REGEX
 ```

+REGEX is a case-insensitive regular expression which tries to match anywhere within the CSV record.
+It is a POSIX extended regular expressions with some additions (see 
+[Regular expressions](https://hledger.org/hledger.html#regular-expressions) in the hledger manual).
+Note: the "CSV record" it is matched against is not the original record, but a synthetic one,
+with enclosing double quotes or whitespace removed, and always comma-separated.
+(Eg, an SSV record `2020-01-01; "Acme, Inc."; 1,000` appears to REGEX as `2020-01-01,Acme, Inc.,1,000`).
+
+Or, MATCHER can be a field matcher, like this:
+```rules
+%CSVFIELD REGEX
+```
+which matches just the content of a particular CSV field.
+CSVFIELD is a percent sign followed by the field's name or column number, like `%date` or `%1`.
+
+A single matcher can be written on the same line as the "if";
+or multiple matchers can be written on the following lines, non-indented.
+Multiple matchers are OR'd (any one of them can match).
+
 After the patterns there should be one or more rules to apply, all
 indented by at least one space. Three kinds of rule are allowed in
 conditional blocks:
--- a/tests/csv.test
+++ b/tests/csv.test
@ -560,8 +560,26 @@ $  ./hledger-csv

 >=0

+# 27. match a specific field
+<
+2020-01-01, 1
+2020-01-01, 2
+RULES
+fields date, desc
+if %desc 1
+ description one

-## 26. A single unbalanced posting with number other than 1 also should not generate a balancing posting.
+$  ./hledger-csv desc:one
+2020-01-01 one
+
+>=0
+
+## . 
+#<
+#$  ./hledger-csv
+#>=0
+
+## . A single unbalanced posting with number other than 1 also should not generate a balancing posting.
 #<
 #2019-01-01,1
 #
@ -575,7 +593,7 @@ $  ./hledger-csv
 #
 #>=0
 #
-## 27. A single posting that's zero also should not generate a balancing posting.
+## . A single posting that's zero also should not generate a balancing posting.
 #<
 #2019-01-01,0
 #
@ -589,7 +607,7 @@ $  ./hledger-csv
 #
 #>=0

-## 28. With a bracketed account name, the auto-generated second posting should also be bracketed.
+## . With a bracketed account name, the auto-generated second posting should also be bracketed.
 #<
 #2019-01-01,1
 #
@ -604,8 +622,6 @@ $  ./hledger-csv
 #
 #>=0

-
-
 # . TODO: without --separator gives obscure error
 #   |
 # 1 | 10/2009/09;Flubber Co🎅;50;