hledger/hledger-lib/Hledger/Read/CsvReader.hs
Simon Michael 23bdac41d9 csv: merge lucamolteni's cassava/custom separators (squashed) (#829)
commit 5ba464de76
Author: Luca Molteni <volothamp@gmail.com>
Date:   Fri Sep 7 17:54:12 2018 +0200

    Fix CI 2

commit f060ae9449
Author: Luca Molteni <volothamp@gmail.com>
Date:   Fri Sep 7 17:30:08 2018 +0200

    Fix CI build

commit af0719a33b
Author: Luca Molteni <volothamp@gmail.com>
Date:   Fri Sep 7 17:19:01 2018 +0200

    Fix rebase

commit 1a24ddfa54
Author: Luca Molteni <volothamp@gmail.com>
Date:   Fri Aug 10 16:25:24 2018 +0200

    Fixed some GHC warnings

commit 1ac43398a3
Author: Luca Molteni <volothamp@gmail.com>
Date:   Fri Aug 10 16:14:49 2018 +0200

    Fix .cabal

commit 422456b925
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sun Jul 1 22:56:20 2018 +0200

    Removed to-do list

commit 1118b762e4
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sun Jul 1 22:53:28 2018 +0200

    Better test

commit 1146ed0941
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sun Jul 1 15:32:28 2018 +0200

    Fix parsing

commit 4fc2374b2b
Author: Luca Molteni <volothamp@gmail.com>
Date:   Thu Jun 21 22:11:11 2018 +0200

    Parsing of separator

commit f7a61737f1
Author: Luca Molteni <volothamp@gmail.com>
Date:   Thu Jun 21 14:29:23 2018 +0200

    Almost separator in options

commit ac8841cf3b
Author: Luca Molteni <volothamp@gmail.com>
Date:   Thu Jun 21 14:16:59 2018 +0200

    Separator in parseCSV

commit 92a8b9f6ba
Author: Luca Molteni <volothamp@gmail.com>
Date:   Thu Jun 21 13:30:41 2018 +0200

    separator option

commit ec417a81ae
Author: Luca Molteni <volothamp@gmail.com>
Date:   Thu Jun 21 10:45:26 2018 +0200

    Removed one qualified import

commit 8b2f386c2f
Author: Luca Molteni <volothamp@gmail.com>
Date:   Wed Jun 20 14:01:12 2018 +0200

    Removed string conversions

commit a14d0e099e
Author: Luca Molteni <volothamp@gmail.com>
Date:   Wed Jun 20 10:23:20 2018 +0200

    custom delimiter in cassava

commit 694d48e2bc
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sun Jun 10 17:51:54 2018 +0200

    Use Text.getContents - remove UTF-8 compatibility library

commit a7ada2cc60
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sun Jun 10 17:49:34 2018 +0200

    todo list

commit 58ec47d398
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sun Jun 10 17:45:22 2018 +0200

    CSV test now has unicode characters

commit b7851e94c3
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sun Jun 10 16:59:39 2018 +0200

    Use decode from Text

commit 79f59fd28c
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sun Jun 10 13:28:57 2018 +0200

    Use Text and Lazy Bytestring

commit 470c9bcb8d
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sat Jun 9 15:30:22 2018 +0200

    Use megaparsec error

commit f978848ba2
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sat Jun 9 15:22:07 2018 +0200

    Renamed qualify and remove Parsec

commit 152587fde2
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sat Jun 9 15:12:36 2018 +0200

    Use cassava mega parsec

commit cf281577a3
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sat Jun 9 14:01:47 2018 +0200

    Removed Data.Vector

commit 1272e8e758
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sat Jun 9 12:16:18 2018 +0200

    Removed Parsec ParseError

commit ae07f04313
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sat Jun 9 12:06:14 2018 +0200

    Type sinonim for ParsecError

commit 8e15b253c1
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sat Jun 9 11:16:08 2018 +0200

    Replaced with typeclasses

commit 1ed46f9c17
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sat Jun 9 11:01:33 2018 +0200

    Replaced Text/CSV with Cassava

commit 362f4111b5
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sat Jun 9 10:34:37 2018 +0200

    Use cassava parsin instead of Text/CSV

commit 83e678e371
Author: Luca Molteni <volothamp@gmail.com>
Date:   Sat Jun 9 08:22:51 2018 +0200

    Text CSV error messages

commit f922df71d2
Author: Luca Molteni <volothamp@gmail.com>
Date:   Fri Jun 8 21:45:20 2018 +0200

    Better types

commit edd130781c
Author: Luca Molteni <volothamp@gmail.com>
Date:   Fri Jun 8 21:34:59 2018 +0200

    Conversion to Text CSV type

commit 0799383214
Author: Luca Molteni <volothamp@gmail.com>
Date:   Fri Jun 8 16:06:21 2018 +0200

    First function with cassava

commit e92aeb151f
Author: Luca Molteni <volothamp@gmail.com>
Date:   Fri Jun 8 13:47:34 2018 +0200

    Added cassava as dependency

commit 5ea005c558
Author: Luca Molteni <volothamp@gmail.com>
Date:   Fri Jun 8 13:18:47 2018 +0200

    Better .gitignore for multi idea modules
2018-09-07 12:34:51 -07:00

898 lines
34 KiB
Haskell

{-|
A reader for CSV data, using an extra rules file to help interpret the data.
-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE PackageImports #-}
module Hledger.Read.CsvReader (
-- * Reader
reader,
-- * Misc.
CsvRecord,
CSV, Record, Field,
-- rules,
rulesFileFor,
parseRulesFile,
parseAndValidateCsvRules,
expandIncludes,
transactionFromCsvRecord,
printCSV,
-- * Tests
tests_CsvReader,
)
where
import Prelude ()
import "base-compat-batteries" Prelude.Compat
import Control.Exception hiding (try)
import Control.Monad
import Control.Monad.Except
import Control.Monad.State.Strict (StateT, get, modify', evalStateT)
import Data.Char (toLower, isDigit, isSpace, ord)
import "base-compat-batteries" Data.List.Compat
import Data.List.NonEmpty (fromList)
import Data.Maybe
import Data.Ord
import qualified Data.Set as S
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.IO as T
import Data.Time.Calendar (Day)
#if MIN_VERSION_time(1,5,0)
import Data.Time.Format (parseTimeM, defaultTimeLocale)
#else
import Data.Time.Format (parseTime)
import System.Locale (defaultTimeLocale)
#endif
import Safe
import System.Directory (doesFileExist)
import System.FilePath
import qualified Data.Csv as Cassava
import qualified Data.Csv.Parser.Megaparsec as CassavaMP
import qualified Data.ByteString as B
import Data.ByteString.Lazy (fromStrict)
import Data.Foldable
import Text.Megaparsec hiding (parse)
import Text.Megaparsec.Char
import Text.Printf (printf)
import Data.Word
import Hledger.Data
import Hledger.Utils
import Hledger.Read.Common (Reader(..),InputOpts(..),amountp, statusp, genericSourcePos)
type CSV = [Record]
type Record = [Field]
type Field = String
data CSVError = CSVError (ParseError Word8 CassavaMP.ConversionError)
deriving Show
reader :: Reader
reader = Reader
{rFormat = "csv"
,rExtensions = ["csv"]
,rParser = parse
,rExperimental = False
}
-- | Parse and post-process a "Journal" from CSV data, or give an error.
-- XXX currently ignores the string and reads from the file path
parse :: InputOpts -> FilePath -> Text -> ExceptT String IO Journal
parse iopts f t = do
let rulesfile = mrules_file_ iopts
let separator = separator_ iopts
r <- liftIO $ readJournalFromCsv separator rulesfile f t
case r of Left e -> throwError e
Right j -> return $ journalNumberAndTieTransactions j
-- XXX does not use parseAndFinaliseJournal like the other readers
-- | Read a Journal from the given CSV data (and filename, used for error
-- messages), or return an error. Proceed as follows:
-- @
-- 1. parse CSV conversion rules from the specified rules file, or from
-- the default rules file for the specified CSV file, if it exists,
-- or throw a parse error; if it doesn't exist, use built-in default rules
-- 2. parse the CSV data, or throw a parse error
-- 3. convert the CSV records to transactions using the rules
-- 4. if the rules file didn't exist, create it with the default rules and filename
-- 5. return the transactions as a Journal
-- @
readJournalFromCsv :: Char -> Maybe FilePath -> FilePath -> Text -> IO (Either String Journal)
readJournalFromCsv _ Nothing "-" _ = return $ Left "please use --rules-file when reading CSV from stdin"
readJournalFromCsv separator mrulesfile csvfile csvdata =
handle (\e -> return $ Left $ show (e :: IOException)) $ do
let throwerr = throw.userError
-- parse rules
let rulesfile = fromMaybe (rulesFileFor csvfile) mrulesfile
rulesfileexists <- doesFileExist rulesfile
rulestext <-
if rulesfileexists
then do
dbg1IO "using conversion rules file" rulesfile
liftIO $ (readFilePortably rulesfile >>= expandIncludes (takeDirectory rulesfile))
else return $ defaultRulesText rulesfile
rules <- liftIO (runExceptT $ parseAndValidateCsvRules rulesfile rulestext) >>= either throwerr return
dbg2IO "rules" rules
-- apply skip directive
let skip = maybe 0 oneorerror $ getDirective "skip" rules
where
oneorerror "" = 1
oneorerror s = readDef (throwerr $ "could not parse skip value: " ++ show s) s
-- parse csv
-- parsec seems to fail if you pass it "-" here XXX try again with megaparsec
let parsecfilename = if csvfile == "-" then "(stdin)" else csvfile
records <- (either throwerr id .
dbg2 "validateCsv" . validateCsv skip .
dbg2 "parseCsv")
`fmap` parseCsv separator parsecfilename csvdata
dbg1IO "first 3 csv records" $ take 3 records
-- identify header lines
-- let (headerlines, datalines) = identifyHeaderLines records
-- mfieldnames = lastMay headerlines
let
-- convert CSV records to transactions
txns = snd $ mapAccumL
(\pos r ->
let
SourcePos name line col = pos
line' = (mkPos . (+1) . unPos) line
pos' = SourcePos name line' col
in
(pos, transactionFromCsvRecord pos' rules r)
)
(initialPos parsecfilename) records
-- Ensure transactions are ordered chronologically.
-- First, reverse them to get same-date transactions ordered chronologically,
-- if the CSV records seem to be most-recent-first, ie if there's an explicit
-- "newest-first" directive, or if there's more than one date and the first date
-- is more recent than the last.
txns' =
(if newestfirst || mseemsnewestfirst == Just True then reverse else id) txns
where
newestfirst = dbg3 "newestfirst" $ isJust $ getDirective "newest-first" rules
mseemsnewestfirst = dbg3 "mseemsnewestfirst" $
case nub $ map tdate txns of
ds | length ds > 1 -> Just $ head ds > last ds
_ -> Nothing
-- Second, sort by date.
txns'' = sortBy (comparing tdate) txns'
when (not rulesfileexists) $ do
dbg1IO "creating conversion rules file" rulesfile
writeFile rulesfile $ T.unpack rulestext
return $ Right nulljournal{jtxns=txns''}
parseCsv :: Char -> FilePath -> Text -> IO (Either CSVError CSV)
parseCsv separator filePath csvdata =
case filePath of
"-" -> liftM (parseCassava separator "(stdin)") T.getContents
_ -> return $ parseCassava separator filePath csvdata
parseCassava :: Char -> FilePath -> Text -> Either CSVError CSV
parseCassava separator path content =
case parseResult of
Left msg -> Left $ CSVError msg
Right a -> Right a
where parseResult = fmap parseResultToCsv $ CassavaMP.decodeWith (decodeOptions separator) Cassava.NoHeader path lazyContent
lazyContent = fromStrict $ T.encodeUtf8 content
decodeOptions :: Char -> Cassava.DecodeOptions
decodeOptions separator = Cassava.defaultDecodeOptions {
Cassava.decDelimiter = fromIntegral (ord separator)
}
parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> CSV
parseResultToCsv = toListList . unpackFields
where
toListList = toList . fmap toList
unpackFields = (fmap . fmap) (T.unpack . T.decodeUtf8)
printCSV :: CSV -> String
printCSV records = unlined (printRecord `map` records)
where printRecord = concat . intersperse "," . map printField
printField f = "\"" ++ concatMap escape f ++ "\""
escape '"' = "\"\""
escape x = [x]
unlined = concat . intersperse "\n"
-- | Return the cleaned up and validated CSV data (can be empty), or an error.
validateCsv :: Int -> Either CSVError CSV -> Either String [CsvRecord]
validateCsv _ (Left e) = Left $ show e
validateCsv numhdrlines (Right rs) = validate $ drop numhdrlines $ filternulls rs
where
filternulls = filter (/=[""])
validate [] = Right []
validate rs@(first:_)
| isJust lessthan2 = let r = fromJust lessthan2 in Left $ printf "CSV record %s has less than two fields" (show r)
| isJust different = let r = fromJust different in Left $ printf "the first CSV record %s has %d fields but %s has %d" (show first) length1 (show r) (length r)
| otherwise = Right rs
where
length1 = length first
lessthan2 = headMay $ filter ((<2).length) rs
different = headMay $ filter ((/=length1).length) rs
-- -- | The highest (0-based) field index referenced in the field
-- -- definitions, or -1 if no fields are defined.
-- maxFieldIndex :: CsvRules -> Int
-- maxFieldIndex r = maximumDef (-1) $ catMaybes [
-- dateField r
-- ,statusField r
-- ,codeField r
-- ,amountField r
-- ,amountInField r
-- ,amountOutField r
-- ,currencyField r
-- ,accountField r
-- ,account2Field r
-- ,date2Field r
-- ]
-- rulesFileFor :: CliOpts -> FilePath -> FilePath
-- rulesFileFor CliOpts{rules_file_=Just f} _ = f
-- rulesFileFor CliOpts{rules_file_=Nothing} csvfile = replaceExtension csvfile ".rules"
rulesFileFor :: FilePath -> FilePath
rulesFileFor = (++ ".rules")
csvFileFor :: FilePath -> FilePath
csvFileFor = reverse . drop 6 . reverse
defaultRulesText :: FilePath -> Text
defaultRulesText csvfile = T.pack $ unlines
["# hledger csv conversion rules for " ++ csvFileFor (takeFileName csvfile)
,"# cf http://hledger.org/manual#csv-files"
,""
,"account1 assets:bank:checking"
,""
,"fields date, description, amount"
,""
,"#skip 1"
,"#newest-first"
,""
,"#date-format %-d/%-m/%Y"
,"#date-format %-m/%-d/%Y"
,"#date-format %Y-%h-%d"
,""
,"#currency $"
,""
,"if ITUNES"
," account2 expenses:entertainment"
,""
,"if (TO|FROM) SAVINGS"
," account2 assets:bank:savings\n"
]
--------------------------------------------------------------------------------
-- Conversion rules parsing
{-
Grammar for the CSV conversion rules, more or less:
RULES: RULE*
RULE: ( FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | NEWEST-FIRST | DATE-FORMAT | COMMENT | BLANK ) NEWLINE
FIELD-LIST: fields SPACE FIELD-NAME ( SPACE? , SPACE? FIELD-NAME )*
FIELD-NAME: QUOTED-FIELD-NAME | BARE-FIELD-NAME
QUOTED-FIELD-NAME: " (any CHAR except double-quote)+ "
BARE-FIELD-NAME: any CHAR except space, tab, #, ;
FIELD-ASSIGNMENT: JOURNAL-FIELD ASSIGNMENT-SEPARATOR FIELD-VALUE
JOURNAL-FIELD: date | date2 | status | code | description | comment | account1 | account2 | amount | JOURNAL-PSEUDO-FIELD
JOURNAL-PSEUDO-FIELD: amount-in | amount-out | currency
ASSIGNMENT-SEPARATOR: SPACE | ( : SPACE? )
FIELD-VALUE: VALUE (possibly containing CSV-FIELD-REFERENCEs)
CSV-FIELD-REFERENCE: % CSV-FIELD
CSV-FIELD: ( FIELD-NAME | FIELD-NUMBER ) (corresponding to a CSV field)
FIELD-NUMBER: DIGIT+
CONDITIONAL-BLOCK: if ( FIELD-MATCHER NEWLINE )+ INDENTED-BLOCK
FIELD-MATCHER: ( CSV-FIELD-NAME SPACE? )? ( MATCHOP SPACE? )? PATTERNS
MATCHOP: ~
PATTERNS: ( NEWLINE REGEXP )* REGEXP
INDENTED-BLOCK: ( SPACE ( FIELD-ASSIGNMENT | COMMENT ) NEWLINE )+
REGEXP: ( NONSPACE CHAR* ) SPACE?
VALUE: SPACE? ( CHAR* ) SPACE?
COMMENT: SPACE? COMMENT-CHAR VALUE
COMMENT-CHAR: # | ;
NONSPACE: any CHAR not a SPACE-CHAR
BLANK: SPACE?
SPACE: SPACE-CHAR+
SPACE-CHAR: space | tab
CHAR: any character except newline
DIGIT: 0-9
-}
{- |
A set of data definitions and account-matching patterns sufficient to
convert a particular CSV data file into meaningful journal transactions.
-}
data CsvRules = CsvRules {
rdirectives :: [(DirectiveName,String)],
rcsvfieldindexes :: [(CsvFieldName, CsvFieldIndex)],
rassignments :: [(JournalFieldName, FieldTemplate)],
rconditionalblocks :: [ConditionalBlock]
} deriving (Show, Eq)
type CsvRulesParser a = StateT CsvRules SimpleTextParser a
type DirectiveName = String
type CsvFieldName = String
type CsvFieldIndex = Int
type JournalFieldName = String
type FieldTemplate = String
type ConditionalBlock = ([RecordMatcher], [(JournalFieldName, FieldTemplate)]) -- block matches if all RecordMatchers match
type RecordMatcher = [RegexpPattern] -- match if any regexps match any of the csv fields
-- type FieldMatcher = (CsvFieldName, [RegexpPattern]) -- match if any regexps match this csv field
type DateFormat = String
type RegexpPattern = String
rules = CsvRules {
rdirectives=[],
rcsvfieldindexes=[],
rassignments=[],
rconditionalblocks=[]
}
addDirective :: (DirectiveName, String) -> CsvRules -> CsvRules
addDirective d r = r{rdirectives=d:rdirectives r}
addAssignment :: (JournalFieldName, FieldTemplate) -> CsvRules -> CsvRules
addAssignment a r = r{rassignments=a:rassignments r}
setIndexesAndAssignmentsFromList :: [CsvFieldName] -> CsvRules -> CsvRules
setIndexesAndAssignmentsFromList fs r = addAssignmentsFromList fs . setCsvFieldIndexesFromList fs $ r
setCsvFieldIndexesFromList :: [CsvFieldName] -> CsvRules -> CsvRules
setCsvFieldIndexesFromList fs r = r{rcsvfieldindexes=zip fs [1..]}
addAssignmentsFromList :: [CsvFieldName] -> CsvRules -> CsvRules
addAssignmentsFromList fs r = foldl' maybeAddAssignment r journalfieldnames
where
maybeAddAssignment rules f = (maybe id addAssignmentFromIndex $ elemIndex f fs) rules
where
addAssignmentFromIndex i = addAssignment (f, "%"++show (i+1))
addConditionalBlock :: ConditionalBlock -> CsvRules -> CsvRules
addConditionalBlock b r = r{rconditionalblocks=b:rconditionalblocks r}
getDirective :: DirectiveName -> CsvRules -> Maybe FieldTemplate
getDirective directivename = lookup directivename . rdirectives
instance ShowErrorComponent String where
showErrorComponent = id
-- | An error-throwing action that parses this file's content
-- as CSV conversion rules, interpolating any included files first,
-- and runs some extra validation checks.
parseRulesFile :: FilePath -> ExceptT String IO CsvRules
parseRulesFile f =
liftIO (readFilePortably f >>= expandIncludes (takeDirectory f)) >>= parseAndValidateCsvRules f
-- | Inline all files referenced by include directives in this hledger CSV rules text, recursively.
-- Included file paths may be relative to the directory of the provided file path.
-- This is a cheap hack to avoid rewriting the CSV rules parser.
expandIncludes dir content = mapM (expandLine dir) (T.lines content) >>= return . T.unlines
where
expandLine dir line =
case line of
(T.stripPrefix "include " -> Just f) -> expandIncludes dir' =<< T.readFile f'
where
f' = dir </> dropWhile isSpace (T.unpack f)
dir' = takeDirectory f'
_ -> return line
-- | An error-throwing action that parses this text as CSV conversion rules
-- and runs some extra validation checks. The file path is for error messages.
parseAndValidateCsvRules :: FilePath -> T.Text -> ExceptT String IO CsvRules
parseAndValidateCsvRules rulesfile s = do
let rules = parseCsvRules rulesfile s
case rules of
Left e -> ExceptT $ return $ Left $ parseErrorPretty e
Right r -> do
r_ <- liftIO $ runExceptT $ validateRules r
ExceptT $ case r_ of
Left s -> return $ Left $ parseErrorPretty $ makeParseError rulesfile s
Right r -> return $ Right r
where
makeParseError :: FilePath -> String -> ParseError Char String
makeParseError f s = FancyError (fromList [initialPos f]) (S.singleton $ ErrorFail s)
-- | Parse this text as CSV conversion rules. The file path is for error messages.
parseCsvRules :: FilePath -> T.Text -> Either (ParseError Char CustomErr) CsvRules
-- parseCsvRules rulesfile s = runParser csvrulesfile nullrules{baseAccount=takeBaseName rulesfile} rulesfile s
parseCsvRules rulesfile s =
runParser (evalStateT rulesp rules) rulesfile s
-- | Return the validated rules, or an error.
validateRules :: CsvRules -> ExceptT String IO CsvRules
validateRules rules = do
unless (isAssigned "date") $ ExceptT $ return $ Left "Please specify (at top level) the date field. Eg: date %1\n"
unless ((amount && not (amountin || amountout)) ||
(not amount && (amountin && amountout)))
$ ExceptT $ return $ Left "Please specify (at top level) either the amount field, or both the amount-in and amount-out fields. Eg: amount %2\n"
ExceptT $ return $ Right rules
where
amount = isAssigned "amount"
amountin = isAssigned "amount-in"
amountout = isAssigned "amount-out"
isAssigned f = isJust $ getEffectiveAssignment rules [] f
-- parsers
rulesp :: CsvRulesParser CsvRules
rulesp = do
many $ choiceInState
[blankorcommentlinep <?> "blank or comment line"
,(directivep >>= modify' . addDirective) <?> "directive"
,(fieldnamelistp >>= modify' . setIndexesAndAssignmentsFromList) <?> "field name list"
,(fieldassignmentp >>= modify' . addAssignment) <?> "field assignment"
,(conditionalblockp >>= modify' . addConditionalBlock) <?> "conditional block"
]
eof
r <- get
return r{rdirectives=reverse $ rdirectives r
,rassignments=reverse $ rassignments r
,rconditionalblocks=reverse $ rconditionalblocks r
}
blankorcommentlinep :: CsvRulesParser ()
blankorcommentlinep = lift (dbgparse 3 "trying blankorcommentlinep") >> choiceInState [blanklinep, commentlinep]
blanklinep :: CsvRulesParser ()
blanklinep = lift (skipMany spacenonewline) >> newline >> return () <?> "blank line"
commentlinep :: CsvRulesParser ()
commentlinep = lift (skipMany spacenonewline) >> commentcharp >> lift restofline >> return () <?> "comment line"
commentcharp :: CsvRulesParser Char
commentcharp = oneOf (";#*" :: [Char])
directivep :: CsvRulesParser (DirectiveName, String)
directivep = (do
lift $ dbgparse 3 "trying directive"
d <- fmap T.unpack $ choiceInState $ map (lift . string . T.pack) directives
v <- (((char ':' >> lift (many spacenonewline)) <|> lift (some spacenonewline)) >> directivevalp)
<|> (optional (char ':') >> lift (skipMany spacenonewline) >> lift eolof >> return "")
return (d, v)
) <?> "directive"
directives =
["date-format"
-- ,"default-account1"
-- ,"default-currency"
-- ,"skip-lines" -- old
,"skip"
,"newest-first"
-- ,"base-account"
-- ,"base-currency"
]
directivevalp :: CsvRulesParser String
directivevalp = anyChar `manyTill` lift eolof
fieldnamelistp :: CsvRulesParser [CsvFieldName]
fieldnamelistp = (do
lift $ dbgparse 3 "trying fieldnamelist"
string "fields"
optional $ char ':'
lift (skipSome spacenonewline)
let separator = lift (skipMany spacenonewline) >> char ',' >> lift (skipMany spacenonewline)
f <- fromMaybe "" <$> optional fieldnamep
fs <- some $ (separator >> fromMaybe "" <$> optional fieldnamep)
lift restofline
return $ map (map toLower) $ f:fs
) <?> "field name list"
fieldnamep :: CsvRulesParser String
fieldnamep = quotedfieldnamep <|> barefieldnamep
quotedfieldnamep :: CsvRulesParser String
quotedfieldnamep = do
char '"'
f <- some $ noneOf ("\"\n:;#~" :: [Char])
char '"'
return f
barefieldnamep :: CsvRulesParser String
barefieldnamep = some $ noneOf (" \t\n,;#~" :: [Char])
fieldassignmentp :: CsvRulesParser (JournalFieldName, FieldTemplate)
fieldassignmentp = do
lift $ dbgparse 3 "trying fieldassignmentp"
f <- journalfieldnamep
assignmentseparatorp
v <- fieldvalp
return (f,v)
<?> "field assignment"
journalfieldnamep :: CsvRulesParser String
journalfieldnamep = do
lift (dbgparse 2 "trying journalfieldnamep")
T.unpack <$> choiceInState (map (lift . string . T.pack) journalfieldnames)
-- Transaction fields and pseudo fields for CSV conversion.
-- Names must precede any other name they contain, for the parser
-- (amount-in before amount; date2 before date). TODO: fix
journalfieldnames = [
"account1"
,"account2"
,"amount-in"
,"amount-out"
,"amount"
,"balance"
,"code"
,"comment"
,"currency"
,"date2"
,"date"
,"description"
,"status"
]
assignmentseparatorp :: CsvRulesParser ()
assignmentseparatorp = do
lift $ dbgparse 3 "trying assignmentseparatorp"
choice [
-- try (lift (skipMany spacenonewline) >> oneOf ":="),
try (lift (skipMany spacenonewline) >> char ':'),
spaceChar
]
_ <- lift (skipMany spacenonewline)
return ()
fieldvalp :: CsvRulesParser String
fieldvalp = do
lift $ dbgparse 2 "trying fieldvalp"
anyChar `manyTill` lift eolof
conditionalblockp :: CsvRulesParser ConditionalBlock
conditionalblockp = do
lift $ dbgparse 3 "trying conditionalblockp"
string "if" >> lift (skipMany spacenonewline) >> optional newline
ms <- some recordmatcherp
as <- many (lift (skipSome spacenonewline) >> fieldassignmentp)
when (null as) $
fail "start of conditional block found, but no assignment rules afterward\n(assignment rules in a conditional block should be indented)\n"
return (ms, as)
<?> "conditional block"
recordmatcherp :: CsvRulesParser [String]
recordmatcherp = do
lift $ dbgparse 2 "trying recordmatcherp"
-- pos <- currentPos
_ <- optional (matchoperatorp >> lift (skipMany spacenonewline) >> optional newline)
ps <- patternsp
when (null ps) $
fail "start of record matcher found, but no patterns afterward\n(patterns should not be indented)\n"
return ps
<?> "record matcher"
matchoperatorp :: CsvRulesParser String
matchoperatorp = fmap T.unpack $ choiceInState $ map string
["~"
-- ,"!~"
-- ,"="
-- ,"!="
]
patternsp :: CsvRulesParser [String]
patternsp = do
lift $ dbgparse 3 "trying patternsp"
ps <- many regexp
return ps
regexp :: CsvRulesParser String
regexp = do
lift $ dbgparse 3 "trying regexp"
notFollowedBy matchoperatorp
c <- lift nonspace
cs <- anyChar `manyTill` lift eolof
return $ strip $ c:cs
-- fieldmatcher = do
-- dbgparse 2 "trying fieldmatcher"
-- f <- fromMaybe "all" `fmap` (optional $ do
-- f' <- fieldname
-- lift (skipMany spacenonewline)
-- return f')
-- char '~'
-- lift (skipMany spacenonewline)
-- ps <- patterns
-- let r = "(" ++ intercalate "|" ps ++ ")"
-- return (f,r)
-- <?> "field matcher"
--------------------------------------------------------------------------------
-- Converting CSV records to journal transactions
type CsvRecord = [String]
-- Convert a CSV record to a transaction using the rules, or raise an
-- error if the data can not be parsed.
transactionFromCsvRecord :: SourcePos -> CsvRules -> CsvRecord -> Transaction
transactionFromCsvRecord sourcepos rules record = t
where
mdirective = (`getDirective` rules)
mfieldtemplate = getEffectiveAssignment rules record
render = renderTemplate rules record
mskip = mdirective "skip"
mdefaultcurrency = mdirective "default-currency"
mparsedate = parseDateWithFormatOrDefaultFormats (mdirective "date-format")
-- render each field using its template and the csv record, and
-- in some cases parse the rendered string (eg dates and amounts)
mdateformat = mdirective "date-format"
date = render $ fromMaybe "" $ mfieldtemplate "date"
date' = fromMaybe (error' $ dateerror "date" date mdateformat) $ mparsedate date
mdate2 = maybe Nothing (Just . render) $ mfieldtemplate "date2"
mdate2' = maybe Nothing (maybe (error' $ dateerror "date2" (fromMaybe "" mdate2) mdateformat) Just . mparsedate) mdate2
dateerror datefield value mdateformat = unlines
["error: could not parse \""++value++"\" as a date using date format "++maybe "\"YYYY/M/D\", \"YYYY-M-D\" or \"YYYY.M.D\"" show mdateformat
,"the CSV record is: "++intercalate ", " (map show record)
,"the "++datefield++" rule is: "++(fromMaybe "required, but missing" $ mfieldtemplate datefield)
,"the date-format is: "++fromMaybe "unspecified" mdateformat
,"you may need to "
++"change your "++datefield++" rule, "
++maybe "add a" (const "change your") mdateformat++" date-format rule, "
++"or "++maybe "add a" (const "change your") mskip++" skip rule"
,"for m/d/y or d/m/y dates, use date-format %-m/%-d/%Y or date-format %-d/%-m/%Y"
]
status =
case mfieldtemplate "status" of
Nothing -> Unmarked
Just str -> either statuserror id .
runParser (statusp <* eof) "" .
T.pack $ render str
where
statuserror err = error' $ unlines
["error: could not parse \""++str++"\" as a cleared status (should be *, ! or empty)"
,"the parse error is: "++show err
]
code = maybe "" render $ mfieldtemplate "code"
description = maybe "" render $ mfieldtemplate "description"
comment = maybe "" render $ mfieldtemplate "comment"
precomment = maybe "" render $ mfieldtemplate "precomment"
currency = maybe (fromMaybe "" mdefaultcurrency) render $ mfieldtemplate "currency"
amountstr = (currency++) <$> simplifySign <$> getAmountStr rules record
maybeamount = either amounterror (Mixed . (:[])) <$> runParser (evalStateT (amountp <* eof) mempty) "" <$> T.pack <$> amountstr
amounterror err = error' $ unlines
["error: could not parse \""++fromJust amountstr++"\" as an amount"
,showRecord record
,"the amount rule is: "++(fromMaybe "" $ mfieldtemplate "amount")
,"the currency rule is: "++(fromMaybe "unspecified" $ mfieldtemplate "currency")
,"the default-currency is: "++fromMaybe "unspecified" mdefaultcurrency
,"the parse error is: "++show err
,"you may need to "
++"change your amount or currency rules, "
++"or "++maybe "add a" (const "change your") mskip++" skip rule"
]
amount1 = case maybeamount of
Just a -> a
Nothing | balance /= Nothing -> nullmixedamt
Nothing -> error' $ "amount and balance have no value\n"++showRecord record
-- convert balancing amount to cost like hledger print, so eg if
-- amount1 is "10 GBP @@ 15 USD", amount2 will be "-15 USD".
amount2 = costOfMixedAmount (-amount1)
s `or` def = if null s then def else s
defaccount1 = fromMaybe "unknown" $ mdirective "default-account1"
defaccount2 = case isNegativeMixedAmount amount2 of
Just True -> "income:unknown"
_ -> "expenses:unknown"
account1 = T.pack $ maybe "" render (mfieldtemplate "account1") `or` defaccount1
account2 = T.pack $ maybe "" render (mfieldtemplate "account2") `or` defaccount2
balance = maybe Nothing (parsebalance.render) $ mfieldtemplate "balance"
parsebalance str
| all isSpace str = Nothing
| otherwise = Just $ (either (balanceerror str) id $ runParser (evalStateT (amountp <* eof) mempty) "" $ T.pack $ (currency++) $ simplifySign str, nullsourcepos)
balanceerror str err = error' $ unlines
["error: could not parse \""++str++"\" as balance amount"
,showRecord record
,"the balance rule is: "++(fromMaybe "" $ mfieldtemplate "balance")
,"the currency rule is: "++(fromMaybe "unspecified" $ mfieldtemplate "currency")
,"the default-currency is: "++fromMaybe "unspecified" mdefaultcurrency
,"the parse error is: "++show err
]
-- build the transaction
t = nulltransaction{
tsourcepos = genericSourcePos sourcepos,
tdate = date',
tdate2 = mdate2',
tstatus = status,
tcode = T.pack code,
tdescription = T.pack description,
tcomment = T.pack comment,
tpreceding_comment_lines = T.pack precomment,
tpostings =
[posting {paccount=account1, pamount=amount1, ptransaction=Just t, pbalanceassertion=balance}
,posting {paccount=account2, pamount=amount2, ptransaction=Just t}
]
}
getAmountStr :: CsvRules -> CsvRecord -> Maybe String
getAmountStr rules record =
let
mamount = getEffectiveAssignment rules record "amount"
mamountin = getEffectiveAssignment rules record "amount-in"
mamountout = getEffectiveAssignment rules record "amount-out"
render = fmap (strip . renderTemplate rules record)
in
case (render mamount, render mamountin, render mamountout) of
(Just "", Nothing, Nothing) -> Nothing
(Just a, Nothing, Nothing) -> Just a
(Nothing, Just "", Just "") -> error' $ "neither amount-in or amount-out has a value\n"
++ " record: " ++ showRecord record
(Nothing, Just i, Just "") -> Just i
(Nothing, Just "", Just o) -> Just $ negateStr o
(Nothing, Just i, Just o) -> error' $ "both amount-in and amount-out have a value\n"
++ " amount-in: " ++ i ++ "\n"
++ " amount-out: " ++ o ++ "\n"
++ " record: " ++ showRecord record
_ -> error' $ "found values for amount and for amount-in/amount-out\n"
++ "please use either amount or amount-in/amount-out\n"
++ " record: " ++ showRecord record
type CsvAmountString = String
-- | Canonicalise the sign in a CSV amount string.
-- Such strings can have a minus sign, negating parentheses,
-- or any two of these (which cancels out).
--
-- >>> simplifySign "1"
-- "1"
-- >>> simplifySign "-1"
-- "-1"
-- >>> simplifySign "(1)"
-- "-1"
-- >>> simplifySign "--1"
-- "1"
-- >>> simplifySign "-(1)"
-- "1"
-- >>> simplifySign "(-1)"
-- "1"
-- >>> simplifySign "((1))"
-- "1"
simplifySign :: CsvAmountString -> CsvAmountString
simplifySign ('(':s) | lastMay s == Just ')' = simplifySign $ negateStr $ init s
simplifySign ('-':'(':s) | lastMay s == Just ')' = simplifySign $ init s
simplifySign ('-':'-':s) = s
simplifySign s = s
negateStr :: String -> String
negateStr ('-':s) = s
negateStr s = '-':s
-- | Show a (approximate) recreation of the original CSV record.
showRecord :: CsvRecord -> String
showRecord r = "the CSV record is: "++intercalate ", " (map show r)
-- | Given the conversion rules, a CSV record and a journal entry field name, find
-- the template value ultimately assigned to this field, either at top
-- level or in a matching conditional block. Conditional blocks'
-- patterns are matched against an approximation of the original CSV
-- record: all the field values with commas intercalated.
getEffectiveAssignment :: CsvRules -> CsvRecord -> JournalFieldName -> Maybe FieldTemplate
getEffectiveAssignment rules record f = lastMay $ assignmentsFor f
where
assignmentsFor f = map snd $ filter ((==f).fst) $ toplevelassignments ++ conditionalassignments
where
toplevelassignments = rassignments rules
conditionalassignments = concatMap snd $ filter blockMatches $ blocksAssigning f
where
blocksAssigning f = filter (any ((==f).fst) . snd) $ rconditionalblocks rules
blockMatches :: ConditionalBlock -> Bool
blockMatches (matchers,_) = all matcherMatches matchers
where
matcherMatches :: RecordMatcher -> Bool
-- matcherMatches pats = any patternMatches pats
matcherMatches pats = patternMatches $ "(" ++ intercalate "|" pats ++ ")"
where
patternMatches :: RegexpPattern -> Bool
patternMatches pat = regexMatchesCI pat csvline
where
csvline = intercalate "," record
renderTemplate :: CsvRules -> CsvRecord -> FieldTemplate -> String
renderTemplate rules record t = regexReplaceBy "%[A-z0-9]+" replace t
where
replace ('%':pat) = maybe pat (\i -> atDef "" record (i-1)) mindex
where
mindex | all isDigit pat = readMay pat
| otherwise = lookup (map toLower pat) $ rcsvfieldindexes rules
replace pat = pat
-- Parse the date string using the specified date-format, or if unspecified try these default formats:
-- YYYY/MM/DD, YYYY-MM-DD, YYYY.MM.DD, MM/DD/YYYY (month and day can be 1 or 2 digits, year must be 4).
parseDateWithFormatOrDefaultFormats :: Maybe DateFormat -> String -> Maybe Day
parseDateWithFormatOrDefaultFormats mformat s = firstJust $ map parsewith formats
where
parsetime =
#if MIN_VERSION_time(1,5,0)
parseTimeM True
#else
parseTime
#endif
parsewith = flip (parsetime defaultTimeLocale) s
formats = maybe
["%Y/%-m/%-d"
,"%Y-%-m-%-d"
,"%Y.%-m.%-d"
-- ,"%-m/%-d/%Y"
-- ,parseTime defaultTimeLocale "%Y/%m/%e" (take 5 s ++ "0" ++ drop 5 s)
-- ,parseTime defaultTimeLocale "%Y-%m-%e" (take 5 s ++ "0" ++ drop 5 s)
-- ,parseTime defaultTimeLocale "%m/%e/%Y" ('0':s)
-- ,parseTime defaultTimeLocale "%m-%e-%Y" ('0':s)
]
(:[])
mformat
--------------------------------------------------------------------------------
-- tests
tests_CsvReader = tests "CsvReader" [
tests "parseCsvRules" [
test "empty file" $
parseCsvRules "unknown" "" `is` Right rules
]
,tests "rulesp" [
test "trailing comments" $
parseWithState' rules rulesp "skip\n# \n#\n" `is` Right rules{rdirectives = [("skip","")]}
,test "trailing blank lines" $
parseWithState' rules rulesp "skip\n\n \n" `is` (Right rules{rdirectives = [("skip","")]})
,test "no final newline" $
parseWithState' rules rulesp "skip" `is` (Right rules{rdirectives=[("skip","")]})
,test "assignment with empty value" $
parseWithState' rules rulesp "account1 \nif foo\n account2 foo\n" `is`
(Right rules{rassignments = [("account1","")], rconditionalblocks = [([["foo"]],[("account2","foo")])]})
]
]