convert: new rules file format, more docs

2024-12-28 21:02:04 +03:00 · 2009-12-03 20:54:20 +00:00 · 2009-12-03 20:54:20 +00:00 · 2607082e9e
commit 2607082e9e
parent 43e327d05b
3 changed files with 285 additions and 95 deletions
--- a/Commands/Convert.hs
+++ b/Commands/Convert.hs
@ -1,98 +1,249 @@
 {-|
-
 Convert account data in CSV format (eg downloaded from a bank) to ledger
 format, and print it on stdout. See the manual for more details.
-
 -}

 module Commands.Convert where
-import Data.List.Split (splitOn)
-import Options -- (Opt,Debug)
-import Ledger.Types (Ledger,AccountName)
-import Ledger.Utils (strip)
-import System.IO (stderr, hPutStr)
+import Options (Opt(Debug))
+import Ledger.Types (Ledger,AccountName,LedgerTransaction(..),Posting(..),PostingType(..))
+import Ledger.Utils (strip, spacenonewline, restofline)
+import Ledger.Parse (someamount, emptyCtx, ledgeraccountname)
+import Ledger.Amount (nullmixedamt)
+import System.IO (stderr)
 import Text.CSV (parseCSVFromFile, printCSV)
-import Text.Printf (printf)
+import Text.Printf (hPrintf)
 import Text.RegexPR (matchRegexPR)
 import Data.Maybe
-import Ledger.Dates (firstJust, showDate)
+import Ledger.Dates (firstJust, showDate, parsedate)
 import Locale (defaultTimeLocale)
 import Data.Time.Format (parseTime)
-import Control.Monad (when)
-import Safe (readMay, readDef)
+import Control.Monad (when, guard)
+import Safe (readDef, readMay)
+import System.FilePath.Posix (takeBaseName)
+import Text.ParserCombinators.Parsec


 convert :: [Opt] -> [String] -> Ledger -> IO ()
 convert opts args _ = do
-  when (length args /= 3) (error "please specify a csv file, base account, and import rules file.")
-  let [csvfile,baseacct,rulesfile] = args
-  rulesstr <- readFile rulesfile
-  (fieldpositions,rules) <- parseRules rulesstr
-  parse <- parseCSVFromFile csvfile
-  let records = case parse of
+  when (length args /= 2) (error "please specify a csv data file and conversion rules file.")
+  let debug = Debug `elem` opts
+      [csvfile,rulesfile] = args
+  csvparse <- parseCSVFromFile csvfile
+  let records = case csvparse of
                  Left e -> error $ show e
-                  Right rs -> reverse rs
-  mapM_ (print_ledger_txn (Debug `elem` opts) (baseacct,fieldpositions,rules)) records
+                  Right rs -> reverse $ filter (/= [""]) rs
+  rulesstr <- readFile rulesfile
+  let rules = case parseCsvRules (takeBaseName csvfile) rulesstr of
+                  Left e -> error $ show e
+                  Right r -> r
+  when debug $ hPrintf stderr "using csv conversion rules file %s\n" rulesfile
+  when debug $ hPrintf stderr "%s\n" (show rules)
+  mapM_ (printTxn debug rules) records

+{- |
+A set of data definitions and account-matching patterns sufficient to
+convert a particular CSV data file into meaningful ledger transactions. See above.
+-}
+data CsvRules = CsvRules {
+      dateField :: Maybe FieldPosition,
+      statusField :: Maybe FieldPosition,
+      codeField :: Maybe FieldPosition,
+      descriptionField :: Maybe FieldPosition,
+      amountField :: Maybe FieldPosition,
+      currencyField :: Maybe FieldPosition,
+      baseCurrency :: Maybe String,
+      baseAccount :: AccountName,
+      accountRules :: [AccountRule]
+} deriving (Show)

-type Rule = (
-   [(String, Maybe String)] -- list of patterns and optional replacements
-  ,AccountName              -- account name to use for a matched transaction
+nullrules = CsvRules {
+      dateField=Nothing,
+      statusField=Nothing,
+      codeField=Nothing,
+      descriptionField=Nothing,
+      amountField=Nothing,
+      currencyField=Nothing,
+      baseCurrency=Nothing,
+      baseAccount="unknown",
+      accountRules=[]
+}
+
+type FieldPosition = Int
+
+type AccountRule = (
+   [(String, Maybe String)] -- list of regex match patterns with optional replacements
+  ,AccountName              -- account name to use for a transaction matching this rule
  )

-parseRules :: String -> IO ([Maybe Int],[Rule])
-parseRules s = do
-  let ls = map strip $ lines s
-  let paras = splitOn [""] ls
-  let fieldpositions = map readMay $ splitOn "," $ head $ head paras
-  let rules = [(map parsePatRepl $ init ls, last ls) | ls <- tail paras]
-  return (fieldpositions,rules)
+type CsvRecord = [String]

-parsePatRepl :: String -> (String, Maybe String)
-parsePatRepl l = case splitOn "=" l of
-                   (p:r:_) -> (p, Just r)
-                   _       -> (l, Nothing)
+-- rules file parser

-print_ledger_txn :: Bool -> (String,[Maybe Int],[Rule]) -> [String] -> IO ()
-print_ledger_txn _ (_,[],_) _ = return ()
-print_ledger_txn _ (('#':_),_,_) _ = return ()
-print_ledger_txn debug (baseacct,fieldpositions,rules) csvrecord
-    | length csvrecord < maximum (map (fromMaybe 0) fieldpositions) + 1 = return ()
-    | otherwise =
- do
-  when debug $ hPutStr stderr $ printCSV [csvrecord]
-  let date = maybe "" (csvrecord !!) (fieldpositions !! 0)
-      number = maybe "" (csvrecord !!) (fieldpositions !! 1)
-      description = maybe "" (csvrecord !!) (fieldpositions !! 2)
-      amount = maybe "" (csvrecord !!) (fieldpositions !! 3)
-      amount' = strnegate amount where strnegate ('-':s) = s
-                                       strnegate s = '-':s
-      unknownacct | (readDef 0 amount' :: Double) < 0 = "income:unknown"
+parseCsvRules :: String -> String -> Either ParseError CsvRules
+parseCsvRules basefilename s = runParser csvrulesP nullrules{baseAccount=basefilename} "" s
+
+csvrulesP :: GenParser Char CsvRules CsvRules
+csvrulesP = do
+  optional blanklines
+  many definitions
+  r <- getState
+  ars <- many accountrule
+  optional blanklines
+  eof
+  return r{accountRules=ars}
+
+-- | Real independent parser choice, even when alternative matches share a prefix.
+choice' parsers = choice $ map try (init parsers) ++ [last parsers]
+
+definitions :: GenParser Char CsvRules ()
+definitions = do
+  choice' [
+    datefield
+   ,statusfield
+   ,codefield
+   ,descriptionfield
+   ,amountfield
+   ,currencyfield
+   ,basecurrency
+   ,baseaccount
+   ] <?> "definition"
+  return ()
+
+datefield = do
+  string "date-field"
+  many1 spacenonewline
+  v <- restofline
+  r <- getState
+  setState r{dateField=readMay v}
+
+codefield = do
+  string "code-field"
+  many1 spacenonewline
+  v <- restofline
+  r <- getState
+  setState r{codeField=readMay v}
+
+statusfield = do
+  string "status-field"
+  many1 spacenonewline
+  v <- restofline
+  r <- getState
+  setState r{statusField=readMay v}
+
+descriptionfield = do
+  string "description-field"
+  many1 spacenonewline
+  v <- restofline
+  r <- getState
+  setState r{descriptionField=readMay v}
+
+amountfield = do
+  string "amount-field"
+  many1 spacenonewline
+  v <- restofline
+  r <- getState
+  setState r{amountField=readMay v}
+
+currencyfield = do
+  string "currency-field"
+  many1 spacenonewline
+  v <- restofline
+  r <- getState
+  setState r{currencyField=readMay v}
+
+basecurrency = do
+  string "currency"
+  many1 spacenonewline
+  v <- restofline
+  r <- getState
+  setState r{baseCurrency=Just v}
+
+baseaccount = do
+  string "base-account"
+  many1 spacenonewline
+  v <- ledgeraccountname
+  optional newline
+  r <- getState
+  setState r{baseAccount=v}
+
+accountrule :: GenParser Char CsvRules AccountRule
+accountrule = do
+  blanklines
+  pats <- many1 matchreplacepattern
+  guard $ length pats >= 2
+  let pats' = init pats
+      acct = either (fail.show) id $ runParser ledgeraccountname () "" $ fst $ last pats
+  return (pats',acct)
+
+blanklines = many1 blankline >> return ()
+
+blankline = many spacenonewline >> newline >> return () <?> "blank line"
+
+matchreplacepattern = do
+  matchpat <- many1 (noneOf "=\n")
+  replpat <- optionMaybe $ do {char '='; many $ noneOf "\n"}
+  newline
+  return (matchpat,replpat)
+
+printTxn :: Bool -> CsvRules -> CsvRecord -> IO ()
+printTxn debug rules rec = do
+  when debug $ hPrintf stderr "csv: %s" (printCSV [rec])
+  putStr $ show $ transactionFromCsvRecord rules rec
+
+-- csv record conversion
+
+transactionFromCsvRecord :: CsvRules -> CsvRecord -> LedgerTransaction
+transactionFromCsvRecord rules fields =
+  let 
+      date = parsedate $ normaliseDate $ maybe "1900/1/1" (fields !!) (dateField rules)
+      status = maybe False (null . strip . (fields !!)) (statusField rules)
+      code = maybe "" (fields !!) (codeField rules)
+      desc = maybe "" (fields !!) (descriptionField rules)
+      comment = ""
+      precomment = ""
+      amountstr = maybe "" (fields !!) (amountField rules)
+      amountstr' = strnegate amountstr where strnegate ('-':s) = s
+                                             strnegate s = '-':s
+      currency = maybe (fromMaybe "" $ baseCurrency rules) (fields !!) (currencyField rules)
+      amountstr'' = currency ++ amountstr'
+      amountparse = runParser someamount emptyCtx "" amountstr''
+      amount = either (const nullmixedamt) id amountparse
+      unknownacct | (readDef 0 amountstr' :: Double) < 0 = "income:unknown"
                  | otherwise = "expenses:unknown"
-      (acct,desc) = choose_acct_desc rules (unknownacct,description)
-  printf "%s%s %s\n" (fixdate date) (if not (null number) then printf " (%s)" number else "") desc
-  printf "    %-30s  %15s\n" acct (printf "$%s" amount' :: String)
-  printf "    %s\n\n" baseacct
+      (acct,newdesc) = identify (accountRules rules) unknownacct desc
+  in
+    LedgerTransaction {
+              ltdate=date,
+              lteffectivedate=Nothing,
+              ltstatus=status,
+              ltcode=code,
+              ltdescription=newdesc,
+              ltcomment=comment,
+              ltpreceding_comment_lines=precomment,
+              ltpostings=[
+                   Posting {
+                     pstatus=False,
+                     paccount=acct,
+                     pamount=amount,
+                     pcomment="",
+                     ptype=RegularPosting
+                   },
+                   Posting {
+                     pstatus=False,
+                     paccount=baseAccount rules,
+                     pamount=(-amount),
+                     pcomment="",
+                     ptype=RegularPosting
+                   }
+                  ]
+            }

-choose_acct_desc :: [Rule] -> (String,String) -> (String,String)
-choose_acct_desc rules (acct,desc) | null matchingrules = (acct,desc)
-                                   | otherwise = (a,d)
-    where
-      matchingrules = filter ismatch rules :: [Rule]
-          where ismatch = any (isJust . flip matchregex desc . fst) . fst
-      (prs,a) = head matchingrules
-      mrs = filter (isJust . fst) $ map (\(p,r) -> (matchregex p desc, r)) prs
-      (m,repl) = head mrs
-      matched = fst $ fst $ fromJust m
-      d = fromMaybe matched repl
-
-matchregex = matchRegexPR . ("(?i)" ++)
-
-fixdate :: String -> String
-fixdate s = maybe "0000/00/00" showDate $ 
+-- | Convert some date string with unknown format to YYYY/MM/DD.
+normaliseDate :: String -> String
+normaliseDate s = maybe "0000/00/00" showDate $
              firstJust
              [parseTime defaultTimeLocale "%Y/%m/%e" s
-               -- can't parse a month without leading 0, try adding onee
+               -- can't parse a month without leading 0, try adding one
              ,parseTime defaultTimeLocale "%Y/%m/%e" (take 5 s ++ "0" ++ drop 5 s)
              ,parseTime defaultTimeLocale "%Y-%m-%e" s
              ,parseTime defaultTimeLocale "%Y-%m-%e" (take 5 s ++ "0" ++ drop 5 s)
@ -102,3 +253,19 @@ fixdate s = maybe "0000/00/00" showDate $
              ,parseTime defaultTimeLocale "%m-%e-%Y" ('0':s)
              ]

+-- | Apply account matching rules to a transaction description to obtain
+-- the most appropriate account and a new description.
+identify :: [AccountRule] -> String -> String -> (String,String)
+identify rules defacct desc | null matchingrules = (defacct,desc)
+                            | otherwise = (acct,newdesc)
+    where
+      matchingrules = filter ismatch rules :: [AccountRule]
+          where ismatch = any (isJust . flip matchregex desc . fst) . fst
+      (prs,acct) = head matchingrules
+      mrs = filter (isJust . fst) $ map (\(p,r) -> (matchregex p desc, r)) prs
+      (m,repl) = head mrs
+      matched = fst $ fst $ fromJust m
+      newdesc = fromMaybe matched repl
+
+matchregex = matchRegexPR . ("(?i)" ++)
+
--- a/Options.hs
+++ b/Options.hs
@ -20,7 +20,7 @@ timeprogname  = "hours"
 usagehdr =
  "Usage: hledger [OPTIONS] [COMMAND [PATTERNS]]\n" ++
  "       hours   [OPTIONS] [COMMAND [PATTERNS]]\n" ++
-  "       hledger convert CSVFILE ACCOUNTNAME RULESFILE\n" ++
+  "       hledger convert CSVFILE RULESFILE\n" ++
  "\n" ++
  "hledger uses your ~/.ledger or $LEDGER file (or another specified with -f),\n" ++
  "while hours uses your ~/.timelog or $TIMELOG file.\n" ++
--- a/73
+++ b/73
@ -157,26 +157,35 @@ Commands
 Convert
 ,,,,,,,

-The convert command converts CSV (comma-separated value) files downloaded
-from a bank into ledger format. Doing a bulk import in this way can be a
-easier than entering each transaction by hand. The downside is you no
-longer have your own data with which to detect errors by the bank.
+The convert command reads a CSV_ file you have downloaded from your bank,
+and prints out the transactions in ledger format, suitable for adding to
+your ledger.  This can be a lot quicker than entering every transaction by
+hand. (The downside is that you are less likely to notice if your bank makes an
+error, so keep an eye on them!)

-Usage is different from the other commands, typically it looks like this::
+Use it like this::

- hledger convert FILE.csv BASEACCOUNT FILE.rules >FILE.ledger
- (review FILE.ledger, then copy and paste into main ledger)
+ hledger convert FILE.csv FILE.rules >FILE.ledger

-Ie convert the csv data and save the output into a similarly named ledger file.
-(It's not required, but convenient to use the same base filename for the
-csv, rules and output files as above.)
+This will convert the csv data in FILE.csv using the conversion hints in
+FILE.rules, and save the output into a temporary ledger file. Then you
+should review FILE.ledger for problems; update the rules and convert again
+if needed; and finally copy/paste transactions which are new into your
+main ledger.

-BASEACCOUNT is the source account for these transactions, eg
-``assets:bank:checking``.
+rules file
+''''''''''

-FILE.rules contains some rules to help convert the data. Here's an example::
+A .rules file contains some data definitions and some rules for assigning
+destination accounts to transactions. Typically you will have one csv file
+and corresponding rules file per bank account. Here's an example rules
+file for converting the csv download from a Wells Fargo checking account::

- 0,-,4,1
+ base-account assets:bank:checking
+ date-field 0
+ description-field 4
+ amount-field 1
+ currency $

 SPECTRUM
 expenses:health:gym
@ -188,21 +197,33 @@ FILE.rules contains some rules to help convert the data. Here's an example::
 (TO|FROM) SAVINGS
 assets:bank:savings

-It must contain:
+This says:

- paragraphs separated by one blank line.
+- the ledger account corresponding to this csv file is assets:bank:checking
+- the first csv field is the date, the second is the amount, the fifth is the description
+- prepend a dollar sign to the amount field
+- if description contains SPECTRUM (case-insensitive), the transaction is a gym expense
+- if description contains ITUNES or BLKBSTR, the transaction is an entertainment expense; also rewrite BLKBSTR as BLOCKBUSTER
+- if description contains TO SAVINGS or FROM SAVINGS, the transaction is a savings transfer

- The first paragraph is a single line of four comma-separated fields,
-  which are numbers indicating the (0-based) csv field positions
-  corresponding to the transaction date, code/number, description, and amount.
-  If a field does not exist in the csv, use - to specify a default value.
+Notes:

- Other paragraphs consist of one or more regular expression patterns, one
-  per line, followed by a line specifying the account to use when a
-  transaction's description matches any of these patterns. Patterns may
-  optionally have a replacement pattern specified after =, otherwise the
-  matching part is used.
+- Definitions must come first, one per line, all in one paragraph.  Each
+  is a name and a value separated by whitespace. Supported names are:
+  base-account, date-field, status-field, code-field, description-field,
+  amount-field, currency-field, currency. All are optional and will
+  use defaults if not specified.

+- The remainder of the file is account-assigning rules. Each is a
+  paragraph consisting of one or more description-matching patterns
+  (case-insensitive regular expressions), one per line, followed by the
+  account name to use when the transaction's description matches any of
+  these patterns.
+
+- A match pattern may optionally be followed by = and a replacement
+  pattern, which will become the ledger transaction's description.
+  Otherwise the matched part of the csv description is used. (To preserve
+  the full csv description, use .* before and after the match pattern.)

 Smart dates
 ...........
@ -377,3 +398,5 @@ Other differences
 .. _c++ ledger's manual:  http://joyful.com/repos/ledger/doc/ledger.html
 .. _binaries:             http://hledger.org/binaries/
 .. _Haskell Platform:     http://hackage.haskell.org/platform/
+.. _CSV:                  http://en.wikipedia.org/wiki/Comma-separated_values
+