2020-03-29 03:09:47 +03:00
--- * -*- outline-regexp:"--- \\*"; -*-
2020-03-02 09:00:39 +03:00
--- ** doc
2020-02-28 10:28:33 +03:00
-- In Emacs, use TAB on lines beginning with "-- *" to collapse/expand sections.
2012-03-11 01:55:48 +04:00
{- |
2013-03-30 02:56:55 +04:00
A reader for CSV data, using an extra rules file to help interpret the data.
2012-03-11 01:55:48 +04:00
-}
2020-02-26 04:54:16 +03:00
-- Lots of haddocks in this file are for non-exported types.
-- Here's a command that will render them:
-- stack haddock hledger-lib --fast --no-haddock-deps --haddock-arguments='--ignore-all-exports' --open
2020-03-02 09:00:39 +03:00
--- ** language
2014-11-03 08:52:12 +03:00
{-# LANGUAGE FlexibleContexts #-}
2016-09-25 22:56:28 +03:00
{-# LANGUAGE FlexibleInstances #-}
2020-02-26 04:54:16 +03:00
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE NamedFieldPuns #-}
2020-02-28 10:28:33 +03:00
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PackageImports #-}
2020-02-12 06:04:03 +03:00
{-# LANGUAGE RecordWildCards #-}
2020-02-28 10:28:33 +03:00
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE ViewPatterns #-}
2020-03-02 09:00:39 +03:00
--- ** exports
2012-03-11 01:55:48 +04:00
module Hledger.Read.CsvReader (
2012-03-24 22:08:11 +04:00
-- * Reader
2012-03-23 20:21:41 +04:00
reader ,
2012-05-30 01:00:49 +04:00
-- * Misc.
2020-02-28 10:28:33 +03:00
CSV , CsvRecord , CsvValue ,
csvFileFor ,
2012-05-30 01:00:49 +04:00
rulesFileFor ,
2013-03-30 02:56:55 +04:00
parseRulesFile ,
2018-09-07 20:12:13 +03:00
printCSV ,
2012-03-24 22:08:11 +04:00
-- * Tests
2018-09-06 23:08:26 +03:00
tests_CsvReader ,
2012-03-11 01:55:48 +04:00
)
where
2020-02-28 10:28:33 +03:00
2020-03-02 09:00:39 +03:00
--- ** imports
2019-12-01 20:31:36 +03:00
import Prelude ( )
import " base-compat-batteries " Prelude.Compat hiding ( fail )
2020-08-31 15:44:41 +03:00
import Control.Applicative ( liftA2 )
2019-09-14 04:11:40 +03:00
import Control.Exception ( IOException , handle , throw )
import Control.Monad ( liftM , unless , when )
2019-09-14 12:04:00 +03:00
import Control.Monad.Except ( ExceptT , throwError )
2020-08-15 12:14:27 +03:00
import qualified Control.Monad.Fail as Fail
2020-03-01 21:16:52 +03:00
import Control.Monad.IO.Class ( MonadIO , liftIO )
2016-07-29 19:55:02 +03:00
import Control.Monad.State.Strict ( StateT , get , modify' , evalStateT )
2019-09-14 04:11:40 +03:00
import Control.Monad.Trans.Class ( lift )
2020-08-31 15:44:41 +03:00
import Data.Char ( toLower , isDigit , isSpace , isAlphaNum , isAscii , ord )
2019-09-14 12:04:00 +03:00
import Data.Bifunctor ( first )
2018-06-05 02:28:28 +03:00
import " base-compat-batteries " Data.List.Compat
2020-06-18 02:58:49 +03:00
import qualified Data.List.Split as LS ( splitOn )
2020-08-31 15:44:41 +03:00
import Data.Maybe ( catMaybes , fromMaybe , isJust )
2020-06-22 03:08:33 +03:00
import Data.MemoUgly ( memo )
2020-08-31 15:44:41 +03:00
import Data.Ord ( comparing )
2018-05-22 01:47:56 +03:00
import qualified Data.Set as S
lib: textification: parse stream
10% more allocation, but 35% lower maximum residency, and slightly quicker.
hledger -f data/100x100x10.journal stats
<<ghc: 39327768 bytes, 77 GCs, 196834/269496 avg/max bytes residency (3 samples), 2M in use, 0.000 INIT (0.010 elapsed), 0.020 MUT (0.092 elapsed), 0.014 GC (0.119 elapsed) :ghc>>
<<ghc: 42842136 bytes, 84 GCs, 194010/270912 avg/max bytes residency (3 samples), 2M in use, 0.000 INIT (0.009 elapsed), 0.016 MUT (0.029 elapsed), 0.012 GC (0.120 elapsed) :ghc>>
hledger -f data/1000x1000x10.journal stats
<<ghc: 314291440 bytes, 612 GCs, 2070776/6628048 avg/max bytes residency (7 samples), 16M in use, 0.000 INIT (0.000 elapsed), 0.128 MUT (0.144 elapsed), 0.059 GC (0.070 elapsed) :ghc>>
<<ghc: 349558872 bytes, 681 GCs, 1397597/4106384 avg/max bytes residency (7 samples), 11M in use, 0.000 INIT (0.004 elapsed), 0.124 MUT (0.133 elapsed), 0.047 GC (0.053 elapsed) :ghc>>
hledger -f data/10000x1000x10.journal stats
<<ghc: 3070026824 bytes, 5973 GCs, 12698030/62951784 avg/max bytes residency (10 samples), 124M in use, 0.000 INIT (0.002 elapsed), 1.268 MUT (1.354 elapsed), 0.514 GC (0.587 elapsed) :ghc>>
<<ghc: 3424013128 bytes, 6658 GCs, 11405501/41071624 avg/max bytes residency (11 samples), 111M in use, 0.000 INIT (0.001 elapsed), 1.343 MUT (1.406 elapsed), 0.511 GC (0.573 elapsed) :ghc>>
hledger -f data/100000x1000x10.journal stats
<<ghc: 30753387392 bytes, 59811 GCs, 117615462/666703600 avg/max bytes residency (14 samples), 1588M in use, 0.000 INIT (0.000 elapsed), 12.068 MUT (12.238 elapsed), 6.015 GC (7.190 elapsed) :ghc>>
<<ghc: 34306530696 bytes, 66727 GCs, 76806196/414629312 avg/max bytes residency (14 samples), 1009M in use, 0.000 INIT (0.010 elapsed), 14.357 MUT (16.370 elapsed), 5.298 GC (6.534 elapsed) :ghc>>
2016-05-25 01:58:23 +03:00
import Data.Text ( Text )
lib: textification begins! account names
The first of several conversions from String to (strict) Text, hopefully
reducing space and time usage.
This one shows a small improvement, with GHC 7.10.3 and text-1.2.2.1:
hledger -f data/100x100x10.journal stats
string: <<ghc: 39471064 bytes, 77 GCs, 198421/275048 avg/max bytes residency (3 samples), 2M in use, 0.000 INIT (0.001 elapsed), 0.015 MUT (0.020 elapsed), 0.010 GC (0.014 elapsed) :ghc>>
text: <<ghc: 39268024 bytes, 77 GCs, 197018/270840 avg/max bytes residency (3 samples), 2M in use, 0.000 INIT (0.002 elapsed), 0.016 MUT (0.022 elapsed), 0.009 GC (0.011 elapsed) :ghc>>
hledger -f data/1000x100x10.journal stats
string: <<ghc: 318555920 bytes, 617 GCs, 2178997/7134472 avg/max bytes residency (7 samples), 16M in use, 0.000 INIT (0.001 elapsed), 0.129 MUT (0.136 elapsed), 0.067 GC (0.077 elapsed) :ghc>>
text: <<ghc: 314248496 bytes, 612 GCs, 2074045/6617960 avg/max bytes residency (7 samples), 16M in use, 0.000 INIT (0.003 elapsed), 0.137 MUT (0.145 elapsed), 0.067 GC (0.079 elapsed) :ghc>>
hledger -f data/10000x100x10.journal stats
string: <<ghc: 3114763608 bytes, 6026 GCs, 18858950/75552024 avg/max bytes residency (11 samples), 201M in use, 0.000 INIT (0.000 elapsed), 1.331 MUT (1.372 elapsed), 0.699 GC (0.812 elapsed) :ghc>>
text: <<ghc: 3071468920 bytes, 5968 GCs, 14120344/62951360 avg/max bytes residency (9 samples), 124M in use, 0.000 INIT (0.003 elapsed), 1.272 MUT (1.349 elapsed), 0.513 GC (0.578 elapsed) :ghc>>
hledger -f data/100000x100x10.journal stats
string: <<ghc: 31186579432 bytes, 60278 GCs, 135332581/740228992 avg/max bytes residency (13 samples), 1697M in use, 0.000 INIT (0.008 elapsed), 14.677 MUT (15.508 elapsed), 7.081 GC (8.074 elapsed) :ghc>>
text: <<ghc: 30753427672 bytes, 59763 GCs, 117595958/666457240 avg/max bytes residency (14 samples), 1588M in use, 0.000 INIT (0.008 elapsed), 13.713 MUT (13.966 elapsed), 6.220 GC (7.108 elapsed) :ghc>>
2016-05-24 04:16:21 +03:00
import qualified Data.Text as T
2018-09-07 20:12:13 +03:00
import qualified Data.Text.Encoding as T
2017-07-05 18:04:48 +03:00
import qualified Data.Text.IO as T
2013-03-30 02:56:55 +04:00
import Data.Time.Calendar ( Day )
2015-03-30 02:12:54 +03:00
import Data.Time.Format ( parseTimeM , defaultTimeLocale )
2020-08-31 15:44:41 +03:00
import Safe ( atMay , headMay , lastMay , readDef , readMay )
2012-03-11 01:55:48 +04:00
import System.Directory ( doesFileExist )
2020-08-31 15:44:41 +03:00
import System.FilePath ( ( </> ) , takeDirectory , takeExtension , takeFileName )
2018-09-07 20:12:13 +03:00
import qualified Data.Csv as Cassava
import qualified Data.Csv.Parser.Megaparsec as CassavaMP
import qualified Data.ByteString as B
2018-09-30 04:32:08 +03:00
import qualified Data.ByteString.Lazy as BL
2020-08-31 15:44:41 +03:00
import Data.Foldable ( asum , toList )
2020-08-15 12:14:27 +03:00
import Text.Megaparsec hiding ( match , parse )
2020-08-31 15:44:41 +03:00
import Text.Megaparsec.Char ( char , newline , string )
import Text.Megaparsec.Custom ( customErrorBundlePretty , parseErrorAt )
2017-07-06 16:07:46 +03:00
import Text.Printf ( printf )
2012-03-11 01:55:48 +04:00
import Hledger.Data
import Hledger.Utils
2020-11-24 20:17:01 +03:00
import Hledger.Read.Common ( aliasesFromOpts , Reader ( .. ) , InputOpts ( .. ) , amountp , statusp , genericSourcePos , journalFinalise )
2012-03-24 22:08:11 +04:00
2020-03-02 09:00:39 +03:00
--- ** doctest setup
-- $setup
-- >>> :set -XOverloadedStrings
--- ** some types
2018-09-07 20:12:13 +03:00
2020-02-28 10:28:33 +03:00
type CSV = [ CsvRecord ]
type CsvRecord = [ CsvValue ]
type CsvValue = String
2018-09-07 20:12:13 +03:00
2020-03-02 09:00:39 +03:00
--- ** reader
2018-09-07 20:12:13 +03:00
2020-03-01 21:16:52 +03:00
reader :: MonadIO m => Reader m
2016-11-19 00:24:57 +03:00
reader = Reader
{ rFormat = " csv "
2020-01-20 23:56:31 +03:00
, rExtensions = [ " csv " , " tsv " , " ssv " ]
2020-03-01 21:16:52 +03:00
, rReadFn = parse
2020-08-06 02:05:56 +03:00
, rParser = error' "sorry, CSV files can't be included yet" -- PARTIAL:
2016-11-19 00:24:57 +03:00
}
2012-03-11 01:55:48 +04:00
-- | Parse and post-process a "Journal" from CSV data, or give an error.
2019-11-08 05:58:12 +03:00
-- Does not check balance assertions.
-- XXX currently ignores the provided data, reads it from the file path instead.
2018-04-17 00:47:04 +03:00
parse :: InputOpts -> FilePath -> Text -> ExceptT String IO Journal
parse iopts f t = do
let rulesfile = mrules_file_ iopts
2020-01-05 20:56:04 +03:00
r <- liftIO $ readJournalFromCsv rulesfile f t
2019-11-08 05:58:12 +03:00
case r of Left e -> throwError e
2020-11-26 07:59:07 +03:00
Right pj ->
-- journalFinalise assumes the journal's items are
-- reversed, as produced by JournalReader's parser.
-- But here they are already properly ordered. So we'd
-- better preemptively reverse them once more. XXX inefficient
let pj' = journalReverse pj
-- apply any command line account aliases. Can fail with a bad replacement pattern.
in case journalApplyAliases ( aliasesFromOpts iopts ) pj' of
Left e -> throwError e
Right pj'' -> journalFinalise iopts { ignore_assertions_ = True } f t pj''
2012-03-11 01:55:48 +04:00
2020-03-02 09:00:39 +03:00
--- ** reading rules files
--- *** rules utilities
2013-03-30 02:56:55 +04:00
2020-02-28 10:28:33 +03:00
-- Not used by hledger; just for lib users.
-- | A pure-exception-throwing IO action that parses this file's content
-- as CSV conversion rules, interpolating any included files first,
-- and runs some extra validation checks.
parseRulesFile :: FilePath -> ExceptT String IO CsvRules
parseRulesFile f = do
  -- Read the file and splice in the content of any included rules files first.
  expanded <- liftIO $ readFilePortably f >>= expandIncludes (takeDirectory f)
  -- Parse and validate; any failure becomes an ExceptT error.
  case parseAndValidateCsvRules f expanded of
    Left e      -> throwError e
    Right rules -> return rules
2012-05-28 22:40:36 +04:00
2020-02-28 10:28:33 +03:00
-- | Given a CSV file path, what would normally be the corresponding rules file ?
2013-03-30 02:56:55 +04:00
rulesFileFor :: FilePath -> FilePath
-- Appends the ".rules" extension with no padding; must stay in sync with
-- csvFileFor, which strips exactly this suffix.
rulesFileFor = (++ ".rules")
2012-03-24 22:08:11 +04:00
2020-02-28 10:28:33 +03:00
-- | Given a CSV rules file path, what would normally be the corresponding CSV file ?
2013-03-30 02:56:55 +04:00
csvFileFor :: FilePath -> FilePath
-- Strips the trailing ".rules" suffix; the inverse of rulesFileFor.
-- The suffix length is derived rather than hard-coded as 6.
csvFileFor = reverse . drop (length ".rules") . reverse
2012-03-11 01:55:48 +04:00
2016-11-23 00:59:31 +03:00
defaultRulesText :: FilePath -> Text
defaultRulesText csvfile = T . pack $ unlines
[ " # hledger csv conversion rules for " ++ csvFileFor ( takeFileName csvfile )
2014-08-08 00:15:40 +04:00
, " # cf http://hledger.org/manual#csv-files "
2013-03-30 02:56:55 +04:00
, " "
, " account1 assets:bank:checking "
, " "
2019-10-16 00:42:55 +03:00
, " fields date, description, amount1 "
2013-03-30 02:56:55 +04:00
, " "
, " #skip 1 "
2017-07-05 17:24:17 +03:00
, " #newest-first "
2013-03-30 02:56:55 +04:00
, " "
, " #date-format %-d/%-m/%Y "
, " #date-format %-m/%-d/%Y "
, " #date-format %Y-%h-%d "
, " "
, " #currency $ "
, " "
, " if ITUNES "
, " account2 expenses:entertainment "
, " "
, " if (TO|FROM) SAVINGS "
, " account2 assets:bank:savings \ n "
]
2012-03-11 01:55:48 +04:00
2020-06-22 03:08:33 +03:00
addDirective :: ( DirectiveName , String ) -> CsvRulesParsed -> CsvRulesParsed
2020-02-28 10:28:33 +03:00
addDirective d r = r { rdirectives = d : rdirectives r }
2012-03-11 01:55:48 +04:00
2020-06-22 03:08:33 +03:00
addAssignment :: ( HledgerFieldName , FieldTemplate ) -> CsvRulesParsed -> CsvRulesParsed
2020-02-28 10:28:33 +03:00
addAssignment a r = r { rassignments = a : rassignments r }
2013-03-30 02:56:55 +04:00
2020-06-22 03:08:33 +03:00
setIndexesAndAssignmentsFromList :: [ CsvFieldName ] -> CsvRulesParsed -> CsvRulesParsed
2020-02-28 10:28:33 +03:00
setIndexesAndAssignmentsFromList fs r = addAssignmentsFromList fs . setCsvFieldIndexesFromList fs $ r
2013-03-30 02:56:55 +04:00
2020-06-22 03:08:33 +03:00
setCsvFieldIndexesFromList :: [ CsvFieldName ] -> CsvRulesParsed -> CsvRulesParsed
2020-02-28 10:28:33 +03:00
setCsvFieldIndexesFromList fs r = r { rcsvfieldindexes = zip fs [ 1 .. ] }
2013-03-30 02:56:55 +04:00
2020-06-22 03:08:33 +03:00
addAssignmentsFromList :: [ CsvFieldName ] -> CsvRulesParsed -> CsvRulesParsed
2020-02-28 10:28:33 +03:00
addAssignmentsFromList fs r = foldl' maybeAddAssignment r journalfieldnames
where
maybeAddAssignment rules f = ( maybe id addAssignmentFromIndex $ elemIndex f fs ) rules
where
addAssignmentFromIndex i = addAssignment ( f , " % " ++ show ( i + 1 ) )
2013-03-30 02:56:55 +04:00
2020-06-22 03:08:33 +03:00
addConditionalBlock :: ConditionalBlock -> CsvRulesParsed -> CsvRulesParsed
2020-02-28 10:28:33 +03:00
addConditionalBlock b r = r { rconditionalblocks = b : rconditionalblocks r }
2013-03-30 02:56:55 +04:00
2020-06-22 03:08:33 +03:00
addConditionalBlocks :: [ ConditionalBlock ] -> CsvRulesParsed -> CsvRulesParsed
2020-06-18 02:58:49 +03:00
addConditionalBlocks bs r = r { rconditionalblocks = bs ++ rconditionalblocks r }
2020-02-28 10:28:33 +03:00
-- | Look up the value of a top-level directive with the given name, if declared.
getDirective :: DirectiveName -> CsvRules -> Maybe FieldTemplate
getDirective directivename rules = lookup directivename (rdirectives rules)
2013-03-30 02:56:55 +04:00
2020-02-28 10:28:33 +03:00
instance ShowErrorComponent String where
showErrorComponent = id
2013-03-30 02:56:55 +04:00
2020-02-28 10:28:33 +03:00
-- | Inline all files referenced by include directives in this hledger CSV rules text, recursively.
-- Included file paths may be relative to the directory of the provided file path.
-- This is done as a pre-parse step to simplify the CSV rules parser.
expandIncludes :: FilePath -> Text -> IO Text
-- NOTE(review): there is no cycle detection; a file that includes itself
-- (directly or indirectly) would loop forever — confirm this is acceptable.
expandIncludes dir content = T.unlines <$> mapM expandLine (T.lines content)
  where
    -- Expand one line: an "include PATH" line is replaced by the (recursively
    -- expanded) content of PATH; any other line passes through unchanged.
    expandLine line =
      case T.stripPrefix "include " line of
        Just f  -> expandIncludes dir' =<< T.readFile f'
          where
            f'   = dir </> dropWhile isSpace (T.unpack f) -- path may be relative to the including file's directory
            dir' = takeDirectory f'                       -- nested includes resolve relative to the included file
        Nothing -> return line
2013-03-30 02:56:55 +04:00
2020-02-28 10:28:33 +03:00
-- | An error-throwing IO action that parses this text as CSV conversion rules
-- and runs some extra validation checks. The file path is used in error messages.
parseAndValidateCsvRules :: FilePath -> T.Text -> Either String CsvRules
parseAndValidateCsvRules rulesfile s =
  case parseCsvRules rulesfile s of
    Left err    -> Left $ customErrorBundlePretty err
    Right rules -> first makeFancyParseError $ validateRules rules
  where
    -- Dress a plain validation error string up as a rendered parse error
    -- at offset 0, for consistency with the parser's own error formatting.
    makeFancyParseError :: String -> String
    makeFancyParseError errorString =
      parseErrorPretty (FancyError 0 (S.singleton $ ErrorFail errorString) :: ParseError Text String)
2013-03-30 02:56:55 +04:00
2020-02-28 10:28:33 +03:00
-- | Parse this text as CSV conversion rules. The file path is for error messages.
parseCsvRules :: FilePath -> T.Text -> Either (ParseErrorBundle T.Text CustomErr) CsvRules
-- parseCsvRules rulesfile s = runParser csvrulesfile nullrules{baseAccount=takeBaseName rulesfile} rulesfile s
parseCsvRules rulesfile s =
  -- Run the rules parser with an empty initial parse state (defrules).
  runParser (evalStateT rulesp defrules) rulesfile s
2013-03-30 02:56:55 +04:00
2020-02-28 10:28:33 +03:00
-- | Return the validated rules, or an error.
validateRules :: CsvRules -> Either String CsvRules
validateRules rules = do
  -- A date assignment (top level or via conditional blocks) is the one hard
  -- requirement for usable rules.
  unless (isAssigned "date") $ Left "Please specify (at top level) the date field. Eg: date %1\n"
  Right rules
  where
    isAssigned f = isJust $ getEffectiveAssignment rules [] f
2013-03-30 02:56:55 +04:00
2020-03-02 09:00:39 +03:00
--- *** rules types
2013-03-30 02:56:55 +04:00
2020-02-28 10:28:33 +03:00
-- | A set of data definitions and account-matching patterns sufficient to
-- convert a particular CSV data file into meaningful journal transactions.
2020-06-22 03:08:33 +03:00
data CsvRules' a = CsvRules' {
2013-03-30 02:56:55 +04:00
rdirectives :: [ ( DirectiveName , String ) ] ,
2020-02-26 04:54:16 +03:00
-- ^ top-level rules, as (keyword, value) pairs
2013-03-30 02:56:55 +04:00
rcsvfieldindexes :: [ ( CsvFieldName , CsvFieldIndex ) ] ,
2020-02-26 04:54:16 +03:00
-- ^ csv field names and their column number, if declared by a fields list
2020-02-27 12:00:35 +03:00
rassignments :: [ ( HledgerFieldName , FieldTemplate ) ] ,
2020-02-26 04:54:16 +03:00
-- ^ top-level assignments to hledger fields, as (field name, value template) pairs
2020-06-22 03:08:33 +03:00
rconditionalblocks :: [ ConditionalBlock ] ,
2020-02-26 04:54:16 +03:00
-- ^ conditional blocks, which contain additional assignments/rules to apply to matched csv records
2020-06-22 03:08:33 +03:00
rblocksassigning :: a -- (String -> [ConditionalBlock])
-- ^ all conditional blocks which can potentially assign field with a given name (memoized)
}
-- | Type used by parsers. Directives, assignments and conditional blocks
-- are in the reverse order compared to what is in the file, and rblocksassigning is non-functional;
-- it cannot be used for processing CSV records yet.
type CsvRulesParsed = CsvRules' ()
-- | Type used after parsing is done. Directives, assignments and conditional blocks
-- are in the same order as they were in the input file, and rblocksassigning is functional.
-- Ready to be used for CSV record processing.
type CsvRules = CsvRules' ( String -> [ ConditionalBlock ] )
instance Eq CsvRules where
r1 == r2 = ( rdirectives r1 , rcsvfieldindexes r1 , rassignments r1 ) ==
( rdirectives r2 , rcsvfieldindexes r2 , rassignments r2 )
2013-03-30 02:56:55 +04:00
2020-11-18 03:10:57 +03:00
-- Custom Show instance used for debug output: omit the rblocksassigning field, which isn't showable.
2020-06-22 03:08:33 +03:00
instance Show CsvRules where
2020-11-18 03:10:57 +03:00
show r = " CsvRules { rdirectives = " ++ show ( rdirectives r ) ++
" , rcsvfieldindexes = " ++ show ( rcsvfieldindexes r ) ++
" , rassignments = " ++ show ( rassignments r ) ++
" , rconditionalblocks = " ++ show ( rconditionalblocks r ) ++
2020-06-22 03:08:33 +03:00
" } "
type CsvRulesParser a = StateT CsvRulesParsed SimpleTextParser a
2016-07-29 18:57:10 +03:00
2020-02-26 04:54:16 +03:00
-- | The keyword of a CSV rule - "fields", "skip", "if", etc.
2013-03-30 02:56:55 +04:00
type DirectiveName = String
2020-02-28 10:28:33 +03:00
2020-02-26 04:54:16 +03:00
-- | CSV field name.
2013-03-30 02:56:55 +04:00
type CsvFieldName = String
2020-02-28 10:28:33 +03:00
2020-02-26 04:54:16 +03:00
-- | 1-based CSV column number.
2013-03-30 02:56:55 +04:00
type CsvFieldIndex = Int
2020-02-28 10:28:33 +03:00
2020-02-26 04:54:16 +03:00
-- | Percent symbol followed by a CSV field name or column number. Eg: %date, %1.
type CsvFieldReference = String
2020-02-28 10:28:33 +03:00
2020-02-27 12:00:35 +03:00
-- | One of the standard hledger fields or pseudo-fields that can be assigned to.
-- Eg date, account1, amount, amount1-in, date-format.
type HledgerFieldName = String
2020-02-28 10:28:33 +03:00
2020-02-26 04:54:16 +03:00
-- | A text value to be assigned to a hledger field, possibly
-- containing csv field references to be interpolated.
2013-03-30 02:56:55 +04:00
type FieldTemplate = String
2020-02-28 10:28:33 +03:00
2020-02-26 04:54:16 +03:00
-- | A strptime date parsing pattern, as supported by Data.Time.Format.
2020-02-12 17:20:40 +03:00
type DateFormat = String
2020-02-28 10:28:33 +03:00
2020-07-06 22:17:35 +03:00
-- | A prefix for a matcher test, either & or none (implicit or).
data MatcherPrefix = And | None
deriving ( Show , Eq )
2020-02-12 17:20:40 +03:00
-- | A single test for matching a CSV record, in one way or another.
data Matcher =
2020-08-15 12:14:27 +03:00
RecordMatcher MatcherPrefix Regexp -- ^ match if this regexp matches the overall CSV record
| FieldMatcher MatcherPrefix CsvFieldReference Regexp -- ^ match if this regexp matches the referenced CSV field's value
2020-02-12 17:20:40 +03:00
deriving ( Show , Eq )
2020-02-12 06:04:03 +03:00
2020-02-12 17:20:40 +03:00
-- | A conditional block: a set of CSV record matchers, and a sequence
2020-02-26 04:54:16 +03:00
-- of rules which will be enabled only if one or more of the matchers
-- succeeds.
2020-02-12 17:20:40 +03:00
--
-- Three types of rule are allowed inside conditional blocks: field
2020-02-12 06:04:03 +03:00
-- assignments, skip, end. (A skip or end rule is stored as if it was
-- a field assignment, and executed in validateCsv. XXX)
data ConditionalBlock = CB {
2020-02-12 17:20:40 +03:00
cbMatchers :: [ Matcher ]
2020-02-27 12:00:35 +03:00
, cbAssignments :: [ ( HledgerFieldName , FieldTemplate ) ]
2020-02-12 06:04:03 +03:00
} deriving ( Show , Eq )
2020-06-22 03:08:33 +03:00
defrules :: CsvRulesParsed
defrules = CsvRules' {
2013-03-30 02:56:55 +04:00
rdirectives = [] ,
rcsvfieldindexes = [] ,
rassignments = [] ,
2020-06-22 03:08:33 +03:00
rconditionalblocks = [] ,
rblocksassigning = ()
}
2013-03-30 02:56:55 +04:00
2020-06-22 03:08:33 +03:00
-- | Create CsvRules from the content parsed out of the rules file
mkrules :: CsvRulesParsed -> CsvRules
mkrules rules =
2020-06-22 21:12:00 +03:00
let conditionalblocks = reverse $ rconditionalblocks rules
maybeMemo = if length conditionalblocks >= 15 then memo else id
in
2020-06-22 03:08:33 +03:00
CsvRules' {
rdirectives = reverse $ rdirectives rules ,
rcsvfieldindexes = rcsvfieldindexes rules ,
rassignments = reverse $ rassignments rules ,
rconditionalblocks = conditionalblocks ,
2020-06-22 21:12:00 +03:00
rblocksassigning = maybeMemo ( \ f -> filter ( any ( ( == f ) . fst ) . cbAssignments ) conditionalblocks )
2020-06-22 03:08:33 +03:00
}
2020-06-22 21:12:00 +03:00
2020-07-06 22:17:35 +03:00
-- | The prefix of a matcher: & (And) or none (None).
matcherPrefix :: Matcher -> MatcherPrefix
matcherPrefix m = case m of
  RecordMatcher prefix _  -> prefix
  FieldMatcher prefix _ _ -> prefix
-- | Group matchers into associative pairs based on prefix, e.g.:
-- A
-- & B
-- C
-- D
-- & E
-- => [[A, B], [C], [D, E]]
groupedMatchers :: [Matcher] -> [[Matcher]]
groupedMatchers []       = []
groupedMatchers (m:rest) = (m : andeds) : groupedMatchers rest'
  -- Each group is one leading matcher plus the run of &-prefixed matchers after it.
  where (andeds, rest') = span ((== And) . matcherPrefix) rest
2020-03-02 09:00:39 +03:00
--- *** rules parsers
2020-02-28 10:28:33 +03:00
{-
Grammar for the CSV conversion rules , more or less :
RULES : RULE *
2020-11-07 07:45:52 +03:00
RULE : ( FIELD - LIST | FIELD - ASSIGNMENT | CONDITIONAL - BLOCK | SKIP | NEWEST - FIRST | DATE - FORMAT | DECIMAL - MARK | COMMENT | BLANK ) NEWLINE
2020-02-28 10:28:33 +03:00
FIELD - LIST : fields SPACE FIELD - NAME ( SPACE ? , SPACE ? FIELD - NAME ) *
FIELD - NAME : QUOTED - FIELD - NAME | BARE - FIELD - NAME
QUOTED - FIELD - NAME : " (any CHAR except double-quote)+ "
BARE - FIELD - NAME : any CHAR except space , tab , # , ;
FIELD - ASSIGNMENT : JOURNAL - FIELD ASSIGNMENT - SEPARATOR FIELD - VALUE
JOURNAL - FIELD : date | date2 | status | code | description | comment | account1 | account2 | amount | JOURNAL - PSEUDO - FIELD
JOURNAL - PSEUDO - FIELD : amount - in | amount - out | currency
ASSIGNMENT - SEPARATOR : SPACE | ( : SPACE ? )
FIELD - VALUE : VALUE ( possibly containing CSV - FIELD - REFERENCEs )
CSV - FIELD - REFERENCE : % CSV - FIELD
CSV - FIELD : ( FIELD - NAME | FIELD - NUMBER ) ( corresponding to a CSV field )
FIELD - NUMBER : DIGIT +
CONDITIONAL - BLOCK : if ( FIELD - MATCHER NEWLINE ) + INDENTED - BLOCK
2013-03-30 02:56:55 +04:00
2020-02-28 10:28:33 +03:00
FIELD - MATCHER : ( CSV - FIELD - NAME SPACE ? ) ? ( MATCHOP SPACE ? ) ? PATTERNS
2013-03-30 02:56:55 +04:00
2020-02-28 10:28:33 +03:00
MATCHOP : ~
2013-03-30 02:56:55 +04:00
2020-02-28 10:28:33 +03:00
PATTERNS : ( NEWLINE REGEXP ) * REGEXP
2012-03-11 01:55:48 +04:00
2020-02-28 10:28:33 +03:00
INDENTED - BLOCK : ( SPACE ( FIELD - ASSIGNMENT | COMMENT ) NEWLINE ) +
2013-03-30 02:56:55 +04:00
2020-02-28 10:28:33 +03:00
REGEXP : ( NONSPACE CHAR * ) SPACE ?
2012-03-11 01:55:48 +04:00
2020-02-28 10:28:33 +03:00
VALUE : SPACE ? ( CHAR * ) SPACE ?
2012-03-11 01:55:48 +04:00
2020-02-28 10:28:33 +03:00
COMMENT : SPACE ? COMMENT - CHAR VALUE
2013-03-30 02:56:55 +04:00
2020-02-28 10:28:33 +03:00
COMMENT - CHAR : # | ;
2017-02-06 13:34:18 +03:00
2020-02-28 10:28:33 +03:00
NONSPACE : any CHAR not a SPACE - CHAR
2013-08-04 07:47:43 +04:00
2020-02-28 10:28:33 +03:00
BLANK : SPACE ?
2018-05-22 01:47:56 +03:00
2020-02-28 10:28:33 +03:00
SPACE : SPACE - CHAR +
2013-03-30 02:56:55 +04:00
2020-02-28 10:28:33 +03:00
SPACE - CHAR : space | tab
CHAR : any character except newline
2013-03-30 02:56:55 +04:00
2020-02-28 10:28:33 +03:00
DIGIT : 0 - 9
- }
2013-03-30 02:56:55 +04:00
2016-07-29 18:57:10 +03:00
rulesp :: CsvRulesParser CsvRules
2013-03-30 02:56:55 +04:00
rulesp = do
2020-06-18 02:58:49 +03:00
_ <- many $ choice
2020-06-22 21:00:00 +03:00
[ blankorcommentlinep <?> " blank or comment line "
, ( directivep >>= modify' . addDirective ) <?> " directive "
, ( fieldnamelistp >>= modify' . setIndexesAndAssignmentsFromList ) <?> " field name list "
, ( fieldassignmentp >>= modify' . addAssignment ) <?> " field assignment "
-- conditionalblockp backtracks because it shares "if" prefix with conditionaltablep.
, try ( conditionalblockp >>= modify' . addConditionalBlock ) <?> " conditional block "
-- 'reverse' is there to ensure that conditions are added in the order they listed in the file
, ( conditionaltablep >>= modify' . addConditionalBlocks . reverse ) <?> " conditional table "
2013-03-30 02:56:55 +04:00
]
eof
2016-07-29 18:57:10 +03:00
r <- get
2020-06-22 03:08:33 +03:00
return $ mkrules r
2013-03-30 02:56:55 +04:00
2016-07-29 18:57:10 +03:00
blankorcommentlinep :: CsvRulesParser ()
2020-06-15 03:17:09 +03:00
blankorcommentlinep = lift ( dbgparse 8 " trying blankorcommentlinep " ) >> choiceInState [ blanklinep , commentlinep ]
2014-11-03 08:52:12 +03:00
2016-07-29 18:57:10 +03:00
blanklinep :: CsvRulesParser ()
2020-07-20 18:09:46 +03:00
blanklinep = lift skipNonNewlineSpaces >> newline >> return () <?> " blank line "
2014-11-03 08:52:12 +03:00
2016-07-29 18:57:10 +03:00
commentlinep :: CsvRulesParser ()
2020-07-20 18:09:46 +03:00
commentlinep = lift skipNonNewlineSpaces >> commentcharp >> lift restofline >> return () <?> " comment line "
2014-11-03 08:52:12 +03:00
2016-07-29 18:57:10 +03:00
commentcharp :: CsvRulesParser Char
commentcharp = oneOf ( " ;#* " :: [ Char ] )
2013-03-30 02:56:55 +04:00
2016-07-29 18:57:10 +03:00
directivep :: CsvRulesParser ( DirectiveName , String )
2016-05-27 01:51:59 +03:00
directivep = ( do
2020-06-15 03:17:09 +03:00
lift $ dbgparse 8 " trying directive "
2018-05-22 01:47:56 +03:00
d <- fmap T . unpack $ choiceInState $ map ( lift . string . T . pack ) directives
2016-07-29 18:57:10 +03:00
v <- ( ( ( char ':' >> lift ( many spacenonewline ) ) <|> lift ( some spacenonewline ) ) >> directivevalp )
2020-07-20 18:09:46 +03:00
<|> ( optional ( char ':' ) >> lift skipNonNewlineSpaces >> lift eolof >> return " " )
2017-07-27 14:59:55 +03:00
return ( d , v )
2016-05-27 01:51:59 +03:00
) <?> " directive "
2013-03-30 02:56:55 +04:00
2020-01-02 19:26:13 +03:00
directives :: [ String ]
2013-03-30 02:56:55 +04:00
directives =
[ " date-format "
2020-11-07 07:45:52 +03:00
, " decimal-mark "
2020-01-02 19:26:59 +03:00
, " separator "
2020-02-28 10:28:33 +03:00
-- ,"default-account"
2013-03-30 02:56:55 +04:00
-- ,"default-currency"
, " skip "
2017-07-05 17:24:17 +03:00
, " newest-first "
2019-11-13 12:24:50 +03:00
, " balance-type "
2013-03-30 02:56:55 +04:00
]
2012-03-11 01:55:48 +04:00
2016-07-29 18:57:10 +03:00
directivevalp :: CsvRulesParser String
2018-09-30 04:32:08 +03:00
directivevalp = anySingle ` manyTill ` lift eolof
2013-03-30 02:56:55 +04:00
2016-07-29 18:57:10 +03:00
fieldnamelistp :: CsvRulesParser [ CsvFieldName ]
2015-10-17 21:51:45 +03:00
fieldnamelistp = ( do
2020-06-15 03:17:09 +03:00
lift $ dbgparse 8 " trying fieldnamelist "
2013-03-30 02:56:55 +04:00
string " fields "
optional $ char ':'
2020-07-20 18:09:46 +03:00
lift skipNonNewlineSpaces1
let separator = lift skipNonNewlineSpaces >> char ',' >> lift skipNonNewlineSpaces
2016-07-29 18:57:10 +03:00
f <- fromMaybe " " <$> optional fieldnamep
fs <- some $ ( separator >> fromMaybe " " <$> optional fieldnamep )
lift restofline
2013-03-30 02:56:55 +04:00
return $ map ( map toLower ) $ f : fs
) <?> " field name list "
2016-07-29 18:57:10 +03:00
fieldnamep :: CsvRulesParser String
2015-10-17 21:51:45 +03:00
fieldnamep = quotedfieldnamep <|> barefieldnamep
2013-03-30 02:56:55 +04:00
2016-07-29 18:57:10 +03:00
-- | A double-quoted field name; the quotes may enclose spaces but not
-- quotes, newlines, or the rule-syntax characters :;#~ .
quotedfieldnamep :: CsvRulesParser String
quotedfieldnamep = do
  char '"'
  f <- some $ noneOf ("\"\n:;#~" :: [Char])
  char '"'
  return f
2016-07-29 18:57:10 +03:00
-- | An unquoted field name: any run of characters that excludes
-- whitespace, comma, and the rule-syntax characters ;#~ .
barefieldnamep :: CsvRulesParser String
barefieldnamep = some $ noneOf (" \t\n,;#~" :: [Char])
2013-03-30 02:56:55 +04:00
2020-02-27 12:00:35 +03:00
-- | One field assignment: a hledger field name, then an optional
-- separator and value template (empty when the line ends immediately).
fieldassignmentp :: CsvRulesParser (HledgerFieldName, FieldTemplate)
fieldassignmentp = do
  lift $ dbgparse 8 "trying fieldassignmentp"
  f <- journalfieldnamep
  v <- choiceInState [ assignmentseparatorp >> fieldvalp
                     , lift eolof >> return ""
                     ]
  return (f,v)
  <?> "field assignment"
2016-07-29 18:57:10 +03:00
-- | Parse any one of the known hledger field (or pseudo-field) names.
journalfieldnamep :: CsvRulesParser String
journalfieldnamep = do
  lift (dbgparse 8 "trying journalfieldnamep")
  T.unpack <$> choiceInState (map (lift . string . T.pack) journalfieldnames)
2013-03-30 02:56:55 +04:00
2020-04-26 22:24:20 +03:00
-- | The highest posting number supported by the numbered
-- accountN/amountN/balanceN/commentN/currencyN rules.
maxpostings :: Int
maxpostings = 99
2018-09-07 20:12:13 +03:00
-- Transaction fields and pseudo fields for CSV conversion.
-- Names must precede any other name they contain, for the parser
-- (amount-in before amount; date2 before date). TODO: fix
journalfieldnames :: [String]
journalfieldnames =
  -- numbered variants first, from highest to lowest, so longer names
  -- (eg "amount10") are tried before their prefixes (eg "amount1")
  concat [[ "account" ++ i
          ,"amount" ++ i ++ "-in"
          ,"amount" ++ i ++ "-out"
          ,"amount" ++ i
          ,"balance" ++ i
          ,"comment" ++ i
          ,"currency" ++ i
          ] | x <- [maxpostings, (maxpostings-1)..1], let i = show x]
  ++
  ["amount-in"
  ,"amount-out"
  ,"amount"
  ,"balance"
  ,"code"
  ,"comment"
  ,"currency"
  ,"date2"
  ,"date"
  ,"description"
  ,"status"
  ,"skip" -- skip and end are not really fields, but we list it here to allow conditional rules that skip records
  ,"end"
  ]
2016-07-29 18:57:10 +03:00
-- | The separator between a field name and its value: a colon with
-- optional surrounding spaces, or just one or more spaces.
assignmentseparatorp :: CsvRulesParser ()
assignmentseparatorp = do
  lift $ dbgparse 8 "trying assignmentseparatorp"
  _ <- choiceInState [ lift skipNonNewlineSpaces >> char ':' >> lift skipNonNewlineSpaces
                     , lift skipNonNewlineSpaces1
                     ]
  return ()
2013-03-30 02:56:55 +04:00
2016-07-29 18:57:10 +03:00
-- | A field value template: the rest of the line.
fieldvalp :: CsvRulesParser String
fieldvalp = do
  lift $ dbgparse 8 "trying fieldvalp"
  anySingle `manyTill` lift eolof
2013-03-30 02:56:55 +04:00
2020-02-26 04:54:16 +03:00
-- A conditional block: one or more matchers, one per line, followed by one or more indented rules.
conditionalblockp :: CsvRulesParser ConditionalBlock
conditionalblockp = do
  lift $ dbgparse 8 "trying conditionalblockp"
  -- "if\nMATCHER" or "if \nMATCHER" or "if MATCHER"
  start <- getOffset
  string "if" >> ( (newline >> return Nothing)
                  <|> (lift skipNonNewlineSpaces1 >> optional newline))
  ms <- some matcherp
  as <- catMaybes <$>
    many (lift skipNonNewlineSpaces1 >>
          choice [ lift eolof >> return Nothing
                 , fmap Just fieldassignmentp
                 ])
  when (null as) $
    customFailure $ parseErrorAt start $ "start of conditional block found, but no assignment rules afterward\n(assignment rules in a conditional block should be indented)\n"
  return $ CB{cbMatchers=ms, cbAssignments=as}
  <?> "conditional block"
2020-06-18 02:58:49 +03:00
-- A conditional table: "if" followed by separator, followed by some field names,
-- followed by many lines, each of which has:
-- one matcher, followed by field assignments (as many as there were fields)
conditionaltablep :: CsvRulesParser [ConditionalBlock]
conditionaltablep = do
  lift $ dbgparse 8 "trying conditionaltablep"
  start <- getOffset
  string "if"
  -- the separator is the first non-alphanumeric, non-space character after "if"
  sep <- lift $ satisfy (\c -> not (isAlphaNum c || isSpace c))
  fields <- journalfieldnamep `sepBy1` (char sep)
  newline
  body <- flip manyTill (lift eolof) $ do
    off <- getOffset
    m <- matcherp' (char sep >> return ())
    vs <- LS.splitOn [sep] <$> lift restofline
    if length vs /= length fields
      -- fixed message: the old one said "has only %d" even when the line
      -- had MORE values than fields
      then customFailure $ parseErrorAt off $ ((printf "line of conditional table should have %d values, but this one has %d\n" (length fields) (length vs)) :: String)
      else return (m,vs)
  when (null body) $
    customFailure $ parseErrorAt start $ "start of conditional table found, but no assignment rules afterward\n"
  -- each table row becomes its own single-matcher conditional block
  return $ flip map body $ \(m,vs) ->
    CB{cbMatchers=[m], cbAssignments=zip fields vs}
  <?> "conditional table"
2020-02-26 04:54:16 +03:00
-- A single matcher, on one line, terminated by the given parser.
matcherp' :: CsvRulesParser () -> CsvRulesParser Matcher
matcherp' end = try (fieldmatcherp end) <|> recordmatcherp end
2020-02-12 17:20:40 +03:00
-- A single matcher terminated by end of line.
matcherp :: CsvRulesParser Matcher
matcherp = matcherp' (lift eolof)
2020-02-26 04:54:16 +03:00
-- A single whole-record matcher.
-- A pattern on the whole line, not beginning with a csv field reference.
recordmatcherp :: CsvRulesParser () -> CsvRulesParser Matcher
recordmatcherp end = do
  lift $ dbgparse 8 "trying recordmatcherp"
  -- pos <- currentPos
  -- _ <- optional (matchoperatorp >> lift skipNonNewlineSpaces >> optional newline)
  p <- matcherprefixp
  r <- regexp end
  return $ RecordMatcher p r
  -- when (null ps) $
  --   Fail.fail "start of record matcher found, but no patterns afterward\n(patterns should not be indented)\n"
  <?> "record matcher"
2020-02-26 21:25:36 +03:00
-- | A single matcher for a specific field. A csv field reference
-- (like %date or %1), and a pattern on the rest of the line,
-- optionally space-separated. Eg:
-- %description chez jacques
fieldmatcherp :: CsvRulesParser () -> CsvRulesParser Matcher
fieldmatcherp end = do
  lift $ dbgparse 8 "trying fieldmatcher"
  -- An optional fieldname (default: "all")
  -- f <- fromMaybe "all" `fmap` (optional $ do
  --        f' <- fieldnamep
  --        lift skipNonNewlineSpaces
  --        return f')
  p <- matcherprefixp
  f <- csvfieldreferencep <* lift skipNonNewlineSpaces
  -- optional operator.. just ~ (case insensitive infix regex) for now
  -- _op <- fromMaybe "~" <$> optional matchoperatorp
  lift skipNonNewlineSpaces
  r <- regexp end
  return $ FieldMatcher p f r
  <?> "field matcher"
2020-07-06 22:17:35 +03:00
-- | An optional matcher prefix: a leading "&" means AND with the
-- previous matcher; the default is no prefix.
matcherprefixp :: CsvRulesParser MatcherPrefix
matcherprefixp = do
  lift $ dbgparse 8 "trying matcherprefixp"
  (char '&' >> lift skipNonNewlineSpaces >> return And) <|> return None
2020-07-06 22:17:35 +03:00
2020-02-26 04:54:16 +03:00
-- | A CSV field reference: % followed by a field name, returned with
-- the % prefix and the name quoted if needed.
csvfieldreferencep :: CsvRulesParser CsvFieldReference
csvfieldreferencep = do
  lift $ dbgparse 8 "trying csvfieldreferencep"
  char '%'
  f <- fieldnamep
  return $ '%' : quoteIfNeeded f
2020-02-12 17:20:40 +03:00
-- A single regular expression, terminated by the given parser.
-- Must start with a non-space character; compiled case-insensitively,
-- failing the parse if the regex is invalid.
regexp :: CsvRulesParser () -> CsvRulesParser Regexp
regexp end = do
  lift $ dbgparse 8 "trying regexp"
  -- notFollowedBy matchoperatorp
  c <- lift nonspace
  cs <- anySingle `manyTill` end
  case toRegexCI . strip $ c:cs of
    Left x  -> Fail.fail $ "CSV parser: " ++ x
    Right x -> return x
2013-03-30 02:56:55 +04:00
2020-02-26 21:25:36 +03:00
-- -- A match operator, indicating the type of match to perform.
-- -- Currently just ~ meaning case insensitive infix regex match.
-- matchoperatorp :: CsvRulesParser String
-- matchoperatorp = fmap T.unpack $ choiceInState $ map string
-- ["~"
-- -- ,"!~"
-- -- ,"="
-- -- ,"!="
-- ]
2020-03-02 09:00:39 +03:00
--- ** reading csv files
2020-02-28 10:28:33 +03:00
-- | Read a Journal from the given CSV data (and filename, used for error
-- messages), or return an error. Proceed as follows:
--
-- 1. parse CSV conversion rules from the specified rules file, or from
--    the default rules file for the specified CSV file, if it exists,
--    or throw a parse error; if it doesn't exist, use built-in default rules
--
-- 2. parse the CSV data, or throw a parse error
--
-- 3. convert the CSV records to transactions using the rules
--
-- 4. if the rules file didn't exist, create it with the default rules and filename
--
-- 5. return the transactions as a Journal
--
readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> IO (Either String Journal)
readJournalFromCsv Nothing "-" _ = return $ Left "please use --rules-file when reading CSV from stdin"
readJournalFromCsv mrulesfile csvfile csvdata =
 handle (\(e::IOException) -> return $ Left $ show e) $ do
  -- make and throw an IO exception.. which we catch and convert to an Either above ?
  let throwerr = throw . userError

  -- parse the csv rules
  let rulesfile = fromMaybe (rulesFileFor csvfile) mrulesfile
  rulesfileexists <- doesFileExist rulesfile
  rulestext <-
    if rulesfileexists
    then do
      dbg6IO "using conversion rules file" rulesfile
      readFilePortably rulesfile >>= expandIncludes (takeDirectory rulesfile)
    else
      return $ defaultRulesText rulesfile
  rules <- either throwerr return $ parseAndValidateCsvRules rulesfile rulestext
  dbg6IO "csv rules" rules

  -- parse the skip directive's value, if any
  let skiplines = case getDirective "skip" rules of
                    Nothing -> 0
                    Just "" -> 1
                    Just s  -> readDef (throwerr $ "could not parse skip value: " ++ show s) s

  -- parse csv
  let
    -- parsec seems to fail if you pass it "-" here TODO: try again with megaparsec
    parsecfilename = if csvfile == "-" then "(stdin)" else csvfile
    -- the separator: explicit rule, or guessed from the file extension
    separator =
      case getDirective "separator" rules >>= parseSeparator of
        Just c           -> c
        _ | ext == "ssv" -> ';'
        _ | ext == "tsv" -> '\t'
        _                -> ','
        where
          ext = map toLower $ drop 1 $ takeExtension csvfile
  dbg6IO "using separator" separator
  records <- (either throwerr id .
              dbg7 "validateCsv" . validateCsv rules skiplines .
              dbg7 "parseCsv")
             `fmap` parseCsv separator parsecfilename csvdata
  dbg6IO "first 3 csv records" $ take 3 records

  -- identify header lines
  -- let (headerlines, datalines) = identifyHeaderLines records
  --     mfieldnames = lastMay headerlines

  let
    -- convert CSV records to transactions, numbering each record's
    -- source position with successive line numbers
    txns = dbg7 "csv txns" $ snd $ mapAccumL
             (\pos r ->
                let
                  SourcePos name line col = pos
                  line' = (mkPos . (+1) . unPos) line
                  pos' = SourcePos name line' col
                in
                  (pos, transactionFromCsvRecord pos' rules r)
             )
             (initialPos parsecfilename) records

    -- Ensure transactions are ordered chronologically.
    -- First, if the CSV records seem to be most-recent-first (because
    -- there's an explicit "newest-first" directive, or there's more
    -- than one date and the first date is more recent than the last):
    -- reverse them to get same-date transactions ordered chronologically.
    txns' =
      (if newestfirst || mdataseemsnewestfirst == Just True
       then dbg7 "reversed csv txns" . reverse else id)
      txns
      where
        newestfirst = dbg6 "newestfirst" $ isJust $ getDirective "newest-first" rules
        mdataseemsnewestfirst = dbg6 "mdataseemsnewestfirst" $
          case nub $ map tdate txns of
            ds | length ds > 1 -> Just $ head ds > last ds
            _                  -> Nothing

    -- Second, sort by date.
    txns'' = dbg7 "date-sorted csv txns" $ sortBy (comparing tdate) txns'

  -- if there was no rules file, create one with the default rules
  when (not rulesfileexists) $ do
    dbg1IO "creating conversion rules file" rulesfile
    writeFile rulesfile $ T.unpack rulestext

  return $ Right nulljournal{jtxns=txns''}
-- | Parse special separator names TAB and SPACE, or return the first
-- character. Return Nothing on empty string
parseSeparator :: String -> Maybe Char
parseSeparator s =
  case map toLower s of
    "space" -> Just ' '
    "tab"   -> Just '\t'
    (c:_)   -> Just c
    []      -> Nothing
-- | Parse CSV text into records, reading stdin when the file path is "-".
parseCsv :: Char -> FilePath -> Text -> IO (Either String CSV)
parseCsv separator filePath csvdata =
  case filePath of
    -- `fmap` replaces the old `liftM`, which is a historical synonym
    "-" -> fmap (parseCassava separator "(stdin)") T.getContents
    _   -> return $ parseCassava separator filePath csvdata
-- | Decode CSV text with cassava using the given separator,
-- pretty-printing any parse error.
parseCassava :: Char -> FilePath -> Text -> Either String CSV
parseCassava separator path content =
  either (Left . errorBundlePretty) (Right . parseResultToCsv) <$>
  CassavaMP.decodeWith (decodeOptions separator) Cassava.NoHeader path $
  BL.fromStrict $ T.encodeUtf8 content
-- | Cassava decode options using the given field delimiter.
decodeOptions :: Char -> Cassava.DecodeOptions
decodeOptions separator = Cassava.defaultDecodeOptions
  { Cassava.decDelimiter = fromIntegral (ord separator)
  }
-- | Convert cassava's parse result to lists of UTF-8-decoded Strings.
parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> CSV
parseResultToCsv = toListList . unpackFields
  where
    toListList   = toList . fmap toList
    unpackFields = (fmap . fmap) (T.unpack . T.decodeUtf8)
-- | Render CSV records as text, one record per line with each field
-- double-quoted (embedded quotes doubled). No trailing newline.
printCSV :: CSV -> String
printCSV = intercalate "\n" . map printRecord
  where printRecord = intercalate "," . map printField
        printField f = "\"" ++ concatMap escape f ++ "\""
        -- double any embedded double-quote, CSV style
        escape '"' = "\"\""
        escape x   = [x]
-- | Return the cleaned up and validated CSV data (can be empty), or an error.
validateCsv :: CsvRules -> Int -> Either String CSV -> Either String [CsvRecord]
validateCsv _ _ (Left err) = Left err
validateCsv rules numhdrlines (Right rs) =
  validate $ applyConditionalSkips $ drop numhdrlines $ filternulls rs
  where
    filternulls = filter (/=[""])
    -- how many records an "end"/"skip" assignment says to skip here, if any
    skipCount r =
      case (getEffectiveAssignment rules r "end", getEffectiveAssignment rules r "skip") of
        (Nothing, Nothing) -> Nothing
        (Just _, _)        -> Just maxBound
        (Nothing, Just "") -> Just 1
        (Nothing, Just x)  -> Just (read x)  -- NOTE(review): partial `read`; bad skip values will error at runtime
    applyConditionalSkips [] = []
    applyConditionalSkips (r:rest) =
      case skipCount r of
        Nothing  -> r : applyConditionalSkips rest
        Just cnt -> applyConditionalSkips (drop (cnt-1) rest)
    validate [] = Right []
    validate rs@(_first:_) = case lessthan2 of
      Just r  -> Left $ printf "CSV record %s has less than two fields" (show r)
      Nothing -> Right rs
      where
        lessthan2 = headMay $ filter ((<2).length) rs
-- -- | The highest (0-based) field index referenced in the field
-- -- definitions, or -1 if no fields are defined.
-- maxFieldIndex :: CsvRules -> Int
-- maxFieldIndex r = maximumDef (-1) $ catMaybes [
-- dateField r
-- ,statusField r
-- ,codeField r
-- ,amountField r
-- ,amountInField r
-- ,amountOutField r
-- ,currencyField r
-- ,accountField r
-- ,account2Field r
-- ,date2Field r
-- ]
2013-03-30 02:56:55 +04:00
2020-03-02 09:00:39 +03:00
--- ** converting csv records to transactions
2013-03-30 02:56:55 +04:00
2019-10-12 02:36:17 +03:00
-- | Render the currently effective rule for each journal field, given
-- this CSV record; used in error messages.
showRules :: CsvRules -> CsvRecord -> String
showRules rules record =
  unlines $ catMaybes [ (("the "++fld++" rule is: ")++) <$> getEffectiveAssignment rules record fld | fld <- journalfieldnames]
2020-02-27 22:19:37 +03:00
-- | Look up the value (template) of a csv rule by rule keyword.
csvRule :: CsvRules -> DirectiveName -> Maybe FieldTemplate
csvRule rules d = getDirective d rules
-- | Look up the value template assigned to a hledger field by field
-- list/field assignment rules, taking into account the current record and
-- conditional rules.
hledgerField :: CsvRules -> CsvRecord -> HledgerFieldName -> Maybe FieldTemplate
hledgerField = getEffectiveAssignment
-- | Look up the final value assigned to a hledger field, with csv field
-- references interpolated.
hledgerFieldValue :: CsvRules -> CsvRecord -> HledgerFieldName -> Maybe String
hledgerFieldValue rules record = fmap (renderTemplate rules record) . hledgerField rules record
2014-08-01 04:32:42 +04:00
-- | Convert one CSV record to one hledger transaction, using the rules.
transactionFromCsvRecord :: SourcePos -> CsvRules -> CsvRecord -> Transaction
transactionFromCsvRecord sourcepos rules record = t
  where
    ----------------------------------------------------------------------
    -- 1. Define some helpers:

    rule     = csvRule           rules        :: DirectiveName    -> Maybe FieldTemplate
    -- ruleval = csvRuleValue rules record :: DirectiveName -> Maybe String
    field    = hledgerField      rules record :: HledgerFieldName -> Maybe FieldTemplate
    fieldval = hledgerFieldValue rules record :: HledgerFieldName -> Maybe String
    parsedate = parseDateWithCustomOrDefaultFormats (rule "date-format")
    mkdateerror datefield datevalue mdateformat = unlines
      ["error: could not parse \""++datevalue++"\" as a date using date format "
        ++maybe "\"YYYY/M/D\", \"YYYY-M-D\" or \"YYYY.M.D\"" show mdateformat
      ,showRecord record
      ,"the "++datefield++" rule is:   "++(fromMaybe "required, but missing" $ field datefield)
      ,"the date-format is: "++fromMaybe "unspecified" mdateformat
      ,"you may need to "
        ++"change your "++datefield++" rule, "
        ++maybe "add a" (const "change your") mdateformat++" date-format rule, "
        ++"or "++maybe "add a" (const "change your") mskip++" skip rule"
      ,"for m/d/y or d/m/y dates, use date-format %-m/%-d/%Y or date-format %-d/%-m/%Y"
      ]
      where
        mskip = rule "skip"

    ----------------------------------------------------------------------
    -- 2. Gather values needed for the transaction itself, by evaluating the
    -- field assignment rules using the CSV record's data, and parsing a bit
    -- more where needed (dates, status).

    mdateformat = rule "date-format"
    date        = fromMaybe "" $ fieldval "date"
    -- PARTIAL:
    date'       = fromMaybe (error' $ mkdateerror "date" date mdateformat) $ parsedate date
    mdate2      = fieldval "date2"
    mdate2'     = maybe Nothing (maybe (error' $ mkdateerror "date2" (fromMaybe "" mdate2) mdateformat) Just . parsedate) mdate2
    status      =
      case fieldval "status" of
        Nothing -> Unmarked
        Just s  -> either statuserror id $ runParser (statusp <* eof) "" $ T.pack s
          where
            statuserror err = error' $ unlines
              ["error: could not parse \""++s++"\" as a cleared status (should be *, ! or empty)"
              ,"the parse error is:      "++customErrorBundlePretty err
              ]
    code        = maybe "" singleline $ fieldval "code"
    description = maybe "" singleline $ fieldval "description"
    comment     = maybe "" singleline $ fieldval "comment"
    precomment  = maybe "" singleline $ fieldval "precomment"

    ----------------------------------------------------------------------
    -- 3. Generate the postings for which an account has been assigned
    -- (possibly indirectly due to an amount or balance assignment)

    p1IsVirtual = (accountNamePostingType . T.pack <$> fieldval "account1") == Just VirtualPosting
    ps = [p | n <- [1..maxpostings]
         ,let comment  = T.pack $ fromMaybe "" $ fieldval ("comment"++show n)
         ,let currency = fromMaybe "" (fieldval ("currency"++show n) <|> fieldval "currency")
         ,let mamount  = getAmount rules record currency p1IsVirtual n
         ,let mbalance = getBalance rules record currency n
         ,Just (acct,isfinal) <- [getAccount rules record mamount mbalance n]  -- skips Nothings
          -- a default unknown account with a negative amount is more
          -- likely income than an expense
         ,let acct' | not isfinal && acct==unknownExpenseAccount &&
                      fromMaybe False (mamount >>= isNegativeMixedAmount) = unknownIncomeAccount
                    | otherwise = acct
         ,let p = nullposting{paccount          = accountNameWithoutPostingType acct'
                             ,pamount           = fromMaybe missingmixedamt mamount
                             ,ptransaction      = Just t
                             ,pbalanceassertion = mkBalanceAssertion rules record <$> mbalance
                             ,pcomment          = comment
                             ,ptype             = accountNamePostingType acct
                             }
         ]

    ----------------------------------------------------------------------
    -- 4. Build the transaction (and name it, so the postings can reference it).

    t = nulltransaction{
           tsourcepos        = genericSourcePos sourcepos  -- the CSV line number
          ,tdate             = date'
          ,tdate2            = mdate2'
          ,tstatus           = status
          ,tcode             = T.pack code
          ,tdescription      = T.pack description
          ,tcomment          = T.pack comment
          ,tprecedingcomment = T.pack precomment
          ,tpostings         = ps
          }
2020-03-12 18:52:43 +03:00
-- | Figure out the amount specified for posting N, if any.
-- A currency symbol to prepend to the amount, if any, is provided,
-- and whether posting 1 requires balancing or not.
-- This looks for a non-empty amount value assigned to "amountN", "amountN-in", or "amountN-out".
-- For postings 1 or 2 it also looks at "amount", "amount-in", "amount-out".
-- If more than one of these has a value, it looks for one that is non-zero.
-- If there's multiple non-zeros, or no non-zeros but multiple zeros, it throws an error.
getAmount :: CsvRules -> CsvRecord -> String -> Bool -> Int -> Maybe MixedAmount
getAmount rules record currency p1IsVirtual n =
  -- Warning, many tricky corner cases here.
  -- docs: hledger_csv.m4.md #### amount
  -- tests: hledger/test/csv.test ~ 13, 31-34
  let
    unnumberedfieldnames = ["amount","amount-in","amount-out"]

    -- amount field names which can affect this posting
    fieldnames = map (("amount"++show n)++) ["","-in","-out"]
                 -- For posting 1, also recognise the old amount/amount-in/amount-out names.
                 -- For posting 2, the same but only if posting 1 needs balancing.
                 ++ if n==1 || n==2 && not p1IsVirtual then unnumberedfieldnames else []

    -- assignments to any of these field names with non-empty values
    assignments = [(f,a') | f <- fieldnames
                          , Just v@(_:_) <- [strip . renderTemplate rules record <$> hledgerField rules record f]
                          , let a = parseAmount rules record currency v
                          -- With amount/amount-in/amount-out, in posting 2,
                          -- flip the sign and convert to cost, as they did before 1.17
                          , let a' = if f `elem` unnumberedfieldnames && n==2 then mixedAmountCost (-a) else a
                          ]

    -- if any of the numbered field names are present, discard all the unnumbered ones
    assignments' | any isnumbered assignments = filter isnumbered assignments
                 | otherwise                  = assignments
      where
        isnumbered (f,_) = any (flip elem ['0'..'9']) f

    -- if there's more than one value and only some are zeros, discard the zeros
    assignments''
      | length assignments' > 1 && not (null nonzeros) = nonzeros
      | otherwise                                      = assignments'
      where nonzeros = filter (not . mixedAmountLooksZero . snd) assignments'

  in case -- dbg0 ("amounts for posting "++show n)
          assignments'' of
      []      -> Nothing
      [(f,a)] | "-out" `isSuffixOf` f -> Just (-a)  -- for -out fields, flip the sign
      [(_,a)] -> Just a
      fs      -> error' $ unlines $ [  -- PARTIAL:
         "multiple non-zero amounts or multiple zero amounts assigned,"
        ,"please ensure just one. (https://hledger.org/csv.html#amount)"
        ,"  " ++ showRecord record
        ,"  for posting: " ++ show n
        ]
        ++ ["  assignment: " ++ f ++ " " ++
            fromMaybe "" (hledgerField rules record f) ++
            "\t=> value: " ++ showMixedAmount a -- XXX not sure this is showing all the right info
           | (f,a) <- fs]
2020-03-12 18:52:43 +03:00
-- | Figure out the expected balance (assertion or assignment) specified for posting N,
-- if any (and its parse position).
getBalance :: CsvRules -> CsvRecord -> String -> Int -> Maybe (Amount, GenericSourcePos)
getBalance rules record currency n = do
  v <- (fieldval ("balance"++show n)
        -- for posting 1, also recognise the old field name
        <|> if n==1 then fieldval "balance" else Nothing)
  case v of
    "" -> Nothing
    s  -> Just (
            parseBalanceAmount rules record currency n s
           ,nullsourcepos  -- parse position to show when assertion fails,
           )               -- XXX the csv record's line number would be good
  where
    fieldval = fmap strip . hledgerFieldValue rules record :: HledgerFieldName -> Maybe String
-- | Given a non-empty amount string (from CSV) to parse, along with a
-- possibly non-empty currency symbol to prepend,
-- parse as a hledger MixedAmount (as in journal format), or raise an error.
-- The whole CSV record is provided for the error message.
parseAmount :: CsvRules -> CsvRecord -> String -> String -> MixedAmount
parseAmount rules record currency s =
  either mkerror (Mixed . (:[])) $  -- PARTIAL:
  runParser (evalStateT (amountp <* eof) journalparsestate) "" $
  T.pack $ (currency++) $ simplifySign s
  where
    journalparsestate = nulljournal{jparsedecimalmark=parseDecimalMark rules}
    mkerror e = error' $ unlines
      ["error: could not parse \""++s++"\" as an amount"
      ,showRecord record
      ,showRules rules record
      -- ,"the default-currency is: "++fromMaybe "unspecified" (getDirective "default-currency" rules)
      ,"the parse error is:      "++customErrorBundlePretty e
      ,"you may need to "
        ++"change your amount*, balance*, or currency* rules, "
        ++"or add or change your skip rule"
      ]
2020-11-07 07:45:52 +03:00
-- XXX unify these ^v

-- | Almost but not quite the same as parseAmount.
-- Given a non-empty amount string (from CSV) to parse, along with a
-- possibly non-empty currency symbol to prepend,
-- parse as a hledger Amount (as in journal format), or raise an error.
-- The CSV record and the field's numeric suffix are provided for the error message.
parseBalanceAmount :: CsvRules -> CsvRecord -> String -> Int -> String -> Amount
parseBalanceAmount rules record currency n s =
  either (mkerror n s) id $
  runParser (evalStateT (amountp <* eof) journalparsestate) "" $
  T.pack $ (currency++) $ simplifySign s
  -- the csv record's line number would be good
  where
    journalparsestate = nulljournal{jparsedecimalmark=parseDecimalMark rules}
    mkerror n s e = error' $ unlines
      ["error: could not parse \""++s++"\" as balance"++show n++" amount"
      ,showRecord record
      ,showRules rules record
      -- ,"the default-currency is: "++fromMaybe "unspecified" mdefaultcurrency
      ,"the parse error is:      "++customErrorBundlePretty e
      ]
2020-02-27 11:27:51 +03:00
2020-11-07 07:45:52 +03:00
-- Read a valid decimal mark from the decimal-mark rule, if any.
-- If the rule is present with an invalid argument, raise an error.
parseDecimalMark :: CsvRules -> Maybe DecimalMark
parseDecimalMark rules =
  case rules `csvRule` "decimal-mark" of
    Nothing -> Nothing
    Just [c] | isDecimalMark c -> Just c
    Just s -> error' $ "decimal-mark's argument should be \".\" or \",\" (not \""++s++"\")"
2020-02-27 11:27:51 +03:00
-- | Make a balance assertion for the given amount, with the given parse
-- position (to be shown in assertion failures), with the assertion type
-- possibly set by a balance-type rule.
-- The CSV rules and current record are also provided, to be shown in case
-- balance-type's argument is bad (XXX refactor).
mkBalanceAssertion :: CsvRules -> CsvRecord -> (Amount, GenericSourcePos) -> BalanceAssertion
mkBalanceAssertion rules record (amt, pos) = assrt{baamount=amt, baposition=pos}
  where
    assrt =
      case getDirective "balance-type" rules of
        Nothing    -> nullassertion
        Just "="   -> nullassertion
        Just "=="  -> nullassertion{batotal=True}
        Just "=*"  -> nullassertion{bainclusive=True}
        Just "==*" -> nullassertion{batotal=True, bainclusive=True}
        Just x     -> error' $ unlines  -- PARTIAL:
          [ "balance-type \"" ++ x ++"\" is invalid. Use =, ==, =* or ==*."
          , showRecord record
          , showRules rules record
          ]
2013-03-30 02:56:55 +04:00
2020-03-12 18:52:43 +03:00
-- | Figure out the account name specified for posting N, if any.
-- And whether it is the default unknown account (which may be
-- improved later) or an explicitly set account (which may not).
getAccount :: CsvRules -> CsvRecord -> Maybe MixedAmount -> Maybe (Amount, GenericSourcePos) -> Int -> Maybe (AccountName, Bool)
getAccount rules record mamount mbalance n =
  let
    fieldval = hledgerFieldValue rules record :: HledgerFieldName -> Maybe String
    maccount = T.pack <$> fieldval ("account"++show n)
  in case maccount of
    -- accountN is set to the empty string - no posting will be generated
    Just "" -> Nothing
    -- accountN is set (possibly to "expenses:unknown"! #1192) - mark it final
    Just a  -> Just (a, True)
    -- accountN is unset
    Nothing ->
      case (mamount, mbalance) of
        -- amountN is set, or implied by balanceN - set accountN to
        -- the default unknown account ("expenses:unknown") and
        -- allow it to be improved later
        (Just _, _) -> Just (unknownExpenseAccount, False)
        (_, Just _) -> Just (unknownExpenseAccount, False)
        -- amountN is also unset - no posting will be generated
        (Nothing, Nothing) -> Nothing
-- | Default account names to use when needed.
unknownExpenseAccount, unknownIncomeAccount :: AccountName
unknownExpenseAccount = "expenses:unknown"
unknownIncomeAccount  = "income:unknown"
2013-03-30 02:56:55 +04:00
2017-05-14 12:17:56 +03:00
-- | An amount string as it appears in CSV data, eg "1", "-1", "(1)".
type CsvAmountString = String

-- | Canonicalise the sign in a CSV amount string.
-- Such strings can have a minus sign, negating parentheses,
-- or any two of these (which cancels out).
--
-- >>> simplifySign "1"
-- "1"
-- >>> simplifySign "-1"
-- "-1"
-- >>> simplifySign "(1)"
-- "-1"
-- >>> simplifySign "--1"
-- "1"
-- >>> simplifySign "-(1)"
-- "1"
-- >>> simplifySign "(-1)"
-- "1"
-- >>> simplifySign "((1))"
-- "1"
simplifySign :: CsvAmountString -> CsvAmountString
simplifySign amtstr = case amtstr of
  '(':rest     | closedByParen rest -> simplifySign $ negateStr $ init rest
  '-':'(':rest | closedByParen rest -> simplifySign $ init rest
  '-':'-':rest                      -> rest
  _                                 -> amtstr
  where
    -- does this string end with a closing parenthesis ?
    closedByParen s = not (null s) && last s == ')'

-- | Flip the sign of a numeric string: strip a leading minus sign
-- if present, otherwise prepend one.
negateStr :: String -> String
negateStr ('-':s) = s
negateStr s       = '-':s
-- | Show a (approximate) recreation of the original CSV record.
showRecord :: CsvRecord -> String
showRecord = ("record values: " ++) . intercalate "," . map show
2013-03-30 02:56:55 +04:00
2020-02-27 12:00:35 +03:00
-- | Given the conversion rules, a CSV record and a hledger field name, find
-- the value template ultimately assigned to this field, if any, by a field
-- assignment at top level or in a conditional block matching this record.
--
-- Note conditional blocks' patterns are matched against an approximation of the
-- CSV record: all the field values, without enclosing quotes, comma-separated.
--
getEffectiveAssignment :: CsvRules -> CsvRecord -> HledgerFieldName -> Maybe FieldTemplate
getEffectiveAssignment rules record f = lastMay $ map snd assignments
  where
    -- all active assignments to field f, in order
    assignments = dbg9 "csv assignments" $ filter ((==f).fst) $ toplevelassignments ++ conditionalassignments

    -- all top level field assignments
    toplevelassignments = rassignments rules

    -- all field assignments in conditional blocks assigning to field f
    -- and active for the current csv record
    conditionalassignments = concatMap cbAssignments $ filter isBlockActive $ rblocksassigning rules f

    -- does this conditional block match the current csv record ?
    -- (matchers are OR'd between groups, AND'd within a group)
    isBlockActive :: ConditionalBlock -> Bool
    isBlockActive CB{..} = any (all matcherMatches) $ groupedMatchers cbMatchers

    -- does this individual matcher match the current csv record ?
    matcherMatches :: Matcher -> Bool
    matcherMatches (RecordMatcher _ pat) = regexMatch (dbg7 "regex" pat) wholecsvline
      where
        -- A synthetic whole CSV record to match against. Note, this can be
        -- different from the original CSV data:
        -- - any whitespace surrounding field values is preserved
        -- - any quotes enclosing field values are removed
        -- - and the field separator is always comma
        -- which means that a field containing a comma will look like two fields.
        wholecsvline = dbg7 "wholecsvline" $ intercalate "," record
    matcherMatches (FieldMatcher _ csvfieldref pat) = regexMatch pat csvfieldvalue
      where
        -- the value of the referenced CSV field to match against.
        csvfieldvalue = dbg7 "csvfieldvalue" $ replaceCsvFieldReference rules record csvfieldref
2013-03-30 02:56:55 +04:00
2020-06-17 04:48:45 +03:00
-- | Render a field assignment's template, possibly interpolating referenced
-- CSV field values. Outer whitespace is removed from interpolated values.
renderTemplate :: CsvRules -> CsvRecord -> FieldTemplate -> String
renderTemplate rules record t =
  -- if the template can't be parsed at all, return it unchanged
  maybe t concat $ parseMaybe templatep t
  where
    -- a template is a sequence of literal spans and interpolated field references
    templatep  = many (literalp <|> interpolatedp)
    -- one or more characters that can't start a field reference
    literalp   = takeWhile1P Nothing (/= '%')
    -- a field reference, replaced by the referenced CSV field's value
    interpolatedp = replaceCsvFieldReference rules record <$> referencep
    -- a % followed by a field name or column number, eg "%date" or "%1"
    referencep = liftA2 (:) (char '%') (takeWhile1P (Just "reference") isDescriptorChar) :: Parsec CustomErr String String
    -- characters allowed in a field reference
    isDescriptorChar c = isAscii c && (isAlphaNum c || c == '_' || c == '-')
2020-02-26 04:54:16 +03:00
-- | Replace something that looks like a reference to a csv field ("%date" or "%1)
-- with that field's value. If it doesn't look like a field reference, or if we
-- can't find such a field, leave it unchanged.
replaceCsvFieldReference :: CsvRules -> CsvRecord -> CsvFieldReference -> String
replaceCsvFieldReference rules record s =
  case s of
    '%':fieldname -> fromMaybe s $ csvFieldValue rules record fieldname
    _             -> s
-- | Get the (whitespace-stripped) value of a CSV field, identified by its name or
-- column number, ("date" or "1"), from the given CSV record, if such a field exists.
csvFieldValue :: CsvRules -> CsvRecord -> CsvFieldName -> Maybe String
csvFieldValue rules record fieldname = do
  -- an all-digit reference is a 1-based column number; otherwise look up
  -- the (lowercased) field name among the declared csv field indexes
  fieldindex <-
    if all isDigit fieldname
      then readMay fieldname
      else lookup (map toLower fieldname) $ rcsvfieldindexes rules
  strip <$> atMay record (fieldindex - 1)
2020-02-27 23:09:39 +03:00
-- | Parse the date string using the specified date-format, or if unspecified
-- the "simple date" formats (YYYY/MM/DD, YYYY-MM-DD, YYYY.MM.DD, leading
-- zeroes optional).
parseDateWithCustomOrDefaultFormats :: Maybe DateFormat -> String -> Maybe Day
parseDateWithCustomOrDefaultFormats mformat s =
  -- return the first successful parse, if any
  asum [parseTimeM True defaultTimeLocale fmt s | fmt <- formats]
  where
    -- the formats to try: the user-specified one, or the simple date defaults
    formats = case mformat of
      Just f  -> [f]
      Nothing ->
        [ "%Y/%-m/%-d"
        , "%Y-%-m-%-d"
        , "%Y.%-m.%-d"
        -- ,"%-m/%-d/%Y"
        -- ,parseTime defaultTimeLocale "%Y/%m/%e" (take 5 s ++ "0" ++ drop 5 s)
        -- ,parseTime defaultTimeLocale "%Y-%m-%e" (take 5 s ++ "0" ++ drop 5 s)
        -- ,parseTime defaultTimeLocale "%m/%e/%Y" ('0':s)
        -- ,parseTime defaultTimeLocale "%m-%e-%Y" ('0':s)
        ]
2020-03-02 09:00:39 +03:00
--- ** tests
2013-03-30 02:56:55 +04:00
2018-09-06 23:08:26 +03:00
tests_CsvReader = tests "CsvReader" [
   tests "parseCsvRules" [
     test "empty file" $
      parseCsvRules "unknown" "" @?= Right (mkrules defrules)
   ]
  ,tests "rulesp" [
     test "trailing comments" $
      parseWithState' defrules rulesp "skip\n#\n#\n" @?= Right (mkrules $ defrules{rdirectives=[("skip","")]})

    ,test "trailing blank lines" $
      parseWithState' defrules rulesp "skip\n\n\n" @?= Right (mkrules $ defrules{rdirectives=[("skip","")]})

    ,test "no final newline" $
      parseWithState' defrules rulesp "skip" @?= Right (mkrules $ defrules{rdirectives=[("skip","")]})

    ,test "assignment with empty value" $
      parseWithState' defrules rulesp "account1 \nif foo\n  account2 foo\n" @?=
        Right (mkrules $ defrules{rassignments = [("account1","")], rconditionalblocks = [CB{cbMatchers=[RecordMatcher None (toRegex' "foo")], cbAssignments=[("account2","foo")]}]})
   ]
  ,tests "conditionalblockp" [
     test "space after conditional" $ -- #1120
      parseWithState' defrules conditionalblockp "if a\n account2 b\n \n" @?=
        (Right $ CB{cbMatchers=[RecordMatcher None $ toRegexCI' "a"], cbAssignments=[("account2","b")]})

    ,tests "csvfieldreferencep" [
      test "number" $ parseWithState' defrules csvfieldreferencep "%1" @?= Right "%1"
     ,test "name" $ parseWithState' defrules csvfieldreferencep "%date" @?= Right "%date"
     ,test "quoted name" $ parseWithState' defrules csvfieldreferencep "%\"csv date\"" @?= Right "%\"csv date\""
     ]

    ,tests "matcherp" [
      test "recordmatcherp" $
        parseWithState' defrules matcherp "A A\n" @?= (Right $ RecordMatcher None $ toRegexCI' "A A")
     ,test "recordmatcherp.starts-with-&" $
        parseWithState' defrules matcherp "& A A\n" @?= (Right $ RecordMatcher And $ toRegexCI' "A A")
     ,test "fieldmatcherp.starts-with-%" $
        parseWithState' defrules matcherp "description A A\n" @?= (Right $ RecordMatcher None $ toRegexCI' "description A A")
     ,test "fieldmatcherp" $
        parseWithState' defrules matcherp "%description A A\n" @?= (Right $ FieldMatcher None "%description" $ toRegexCI' "A A")
     ,test "fieldmatcherp.starts-with-&" $
        parseWithState' defrules matcherp "& %description A A\n" @?= (Right $ FieldMatcher And "%description" $ toRegexCI' "A A")
     -- ,test "fieldmatcherp with operator" $
     --   parseWithState' defrules matcherp "%description ~ A A\n" @?= (Right $ FieldMatcher "%description" "A A")
     ]

    ,tests "getEffectiveAssignment" [
      let rules = mkrules $ defrules{rcsvfieldindexes=[("csvdate",1)], rassignments=[("date","%csvdate")]}
      in test "toplevel" $ getEffectiveAssignment rules ["a","b"] "date" @?= Just "%csvdate"

     ,let rules = mkrules $ defrules{rcsvfieldindexes=[("csvdate",1)], rconditionalblocks=[CB [FieldMatcher None "%csvdate" $ toRegex' "a"] [("date","%csvdate")]]}
      in test "conditional" $ getEffectiveAssignment rules ["a","b"] "date" @?= Just "%csvdate"

     ,let rules = mkrules $ defrules{rcsvfieldindexes=[("csvdate",1),("description",2)], rconditionalblocks=[CB [FieldMatcher None "%csvdate" $ toRegex' "a", FieldMatcher None "%description" $ toRegex' "b"] [("date","%csvdate")]]}
      in test "conditional-with-or-a" $ getEffectiveAssignment rules ["a"] "date" @?= Just "%csvdate"

     ,let rules = mkrules $ defrules{rcsvfieldindexes=[("csvdate",1),("description",2)], rconditionalblocks=[CB [FieldMatcher None "%csvdate" $ toRegex' "a", FieldMatcher None "%description" $ toRegex' "b"] [("date","%csvdate")]]}
      in test "conditional-with-or-b" $ getEffectiveAssignment rules ["_","b"] "date" @?= Just "%csvdate"

     ,let rules = mkrules $ defrules{rcsvfieldindexes=[("csvdate",1),("description",2)], rconditionalblocks=[CB [FieldMatcher None "%csvdate" $ toRegex' "a", FieldMatcher And "%description" $ toRegex' "b"] [("date","%csvdate")]]}
      in test "conditional.with-and" $ getEffectiveAssignment rules ["a","b"] "date" @?= Just "%csvdate"

     ,let rules = mkrules $ defrules{rcsvfieldindexes=[("csvdate",1),("description",2)], rconditionalblocks=[CB [FieldMatcher None "%csvdate" $ toRegex' "a", FieldMatcher And "%description" $ toRegex' "b", FieldMatcher None "%description" $ toRegex' "c"] [("date","%csvdate")]]}
      in test "conditional.with-and-or" $ getEffectiveAssignment rules ["_","c"] "date" @?= Just "%csvdate"

     ]

   ]
  ]