fix: csv: skip header lines before attempting to parse records (#1967)

This commit is contained in:
Simon Michael 2022-12-27 12:21:20 -10:00
parent 71bd102b4b
commit a9b63bb694
2 changed files with 32 additions and 12 deletions

View File

@ -699,12 +699,6 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
rules <- liftEither $ parseAndValidateCsvRules rulesfile rulestext rules <- liftEither $ parseAndValidateCsvRules rulesfile rulestext
dbg6IO "csv rules" rules dbg6IO "csv rules" rules
-- parse the skip directive's value, if any
skiplines <- case getDirective "skip" rules of
Nothing -> return 0
Just "" -> return 1
Just s -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s
mtzin <- case getDirective "timezone" rules of mtzin <- case getDirective "timezone" rules of
Nothing -> return Nothing Nothing -> return Nothing
Just s -> Just s ->
@ -712,6 +706,13 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
parseTimeM False defaultTimeLocale "%Z" $ T.unpack s parseTimeM False defaultTimeLocale "%Z" $ T.unpack s
tzout <- liftIO getCurrentTimeZone tzout <- liftIO getCurrentTimeZone
-- skip header lines, if there is a top-level skip rule; they are dropped as raw text lines
-- before CSV parsing, so they need not be valid CSV
skiplines <- case getDirective "skip" rules of
Nothing -> return 0
Just "" -> return 1
Just s -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s
let csvdata' = T.unlines $ drop skiplines $ T.lines csvdata
-- parse csv -- parse csv
let let
-- parsec seems to fail if you pass it "-" here TODO: try again with megaparsec -- parsec seems to fail if you pass it "-" here TODO: try again with megaparsec
@ -725,8 +726,8 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
where where
ext = map toLower $ drop 1 $ takeExtension csvfile ext = map toLower $ drop 1 $ takeExtension csvfile
dbg6IO "using separator" separator dbg6IO "using separator" separator
csv <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvdata csv <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvdata'
records <- liftEither $ dbg7 "validateCsv" <$> validateCsv rules skiplines csv records <- liftEither $ dbg7 "validateCsv" <$> validateCsv rules csv
dbg6IO "first 3 csv records" $ take 3 records dbg6IO "first 3 csv records" $ take 3 records
-- identify header lines -- identify header lines
@ -818,8 +819,8 @@ printCSV = TB.toLazyText . unlinesB . map printRecord
printField = wrap "\"" "\"" . T.replace "\"" "\"\"" printField = wrap "\"" "\"" . T.replace "\"" "\"\""
-- | Return the cleaned up and validated CSV data (can be empty), or an error. -- | Return the cleaned up and validated CSV data (can be empty), or an error.
validateCsv :: CsvRules -> Int -> CSV -> Either String [CsvRecord] validateCsv :: CsvRules -> CSV -> Either String [CsvRecord]
validateCsv rules numhdrlines = validate . applyConditionalSkips . drop numhdrlines . filternulls validateCsv rules = validate . applyConditionalSkips . filternulls
where where
filternulls = filter (/=[""]) filternulls = filter (/=[""])
skipnum r = skipnum r =

View File

@ -1033,8 +1033,7 @@ $ ./csvtest.sh
< <
"2021-12-23","caffe_siciliaexpenses:cibo:dolce","-10.5" "2021-12-23","caffe_siciliaexpenses:cibo:dolce","-10.5"
RULES file RULES
account1 assets:bank:checking account1 assets:bank:checking
fields date, description, account2, amount fields date, description, account2, amount
@ -1042,6 +1041,26 @@ $ ./csvtest.sh
>2 /transaction is unbalanced/ >2 /transaction is unbalanced/
>=1 >=1
# 52. We can't parse double quotes inside an unquoted field, or other non-RFC4180 data. (#1966)
<
2022-01-01,B"B",C
RULES
fields date, b, c
$ ./csvtest.sh
>2 /unexpected '"'/
>=1
# 53. A top-level skip directive is able to skip lines which would fail to parse as CSV. (#1967)
<
2022-01-01,B"B",C
RULES
skip 1
fields date, b, c
$ ./csvtest.sh
>=
## . ## .
#< #<
#$ ./csvtest.sh #$ ./csvtest.sh