From 6b2dfad98cb2793f604840454dfdc00ce9484207 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Sat, 9 May 2020 15:43:44 -0700 Subject: [PATCH] ;csv: clarify that whitespace is not stripped when matching --- hledger-lib/Hledger/Read/CsvReader.hs | 14 +++++++++----- hledger-lib/hledger_csv.m4.md | 9 ++++++--- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/hledger-lib/Hledger/Read/CsvReader.hs b/hledger-lib/Hledger/Read/CsvReader.hs index a346f2582..5d3c5cf0e 100644 --- a/hledger-lib/Hledger/Read/CsvReader.hs +++ b/hledger-lib/Hledger/Read/CsvReader.hs @@ -1075,12 +1075,16 @@ getEffectiveAssignment rules record f = lastMay $ map snd $ assignments where -- does this individual matcher match the current csv record ? matcherMatches :: Matcher -> Bool - matcherMatches (RecordMatcher pat) = regexMatchesCI pat wholecsvline + matcherMatches (RecordMatcher pat) = regexMatchesCI pat' wholecsvline where - -- a synthetic whole CSV record to match against; note, it has - -- no quotes enclosing fields, and is always comma-separated, - -- so may differ from the actual record, and may not be valid CSV. - wholecsvline = dbg3 "wholecsvline" $ intercalate "," record + pat' = dbg3 "regex" pat + -- A synthetic whole CSV record to match against. Note, this can be + -- different from the original CSV data: + -- - any whitespace surrounding field values is preserved + -- - any quotes enclosing field values are removed + -- - and the field separator is always comma + -- which means that a field containing a comma will look like two fields. + wholecsvline = dbg3 "wholecsvline" $ intercalate "," record -- $ map strip record ? matcherMatches (FieldMatcher csvfieldref pat) = regexMatchesCI pat csvfieldvalue where -- the value of the referenced CSV field to match against. 
diff --git a/hledger-lib/hledger_csv.m4.md b/hledger-lib/hledger_csv.m4.md index 11ee7ebb9..ad0153c24 100644 --- a/hledger-lib/hledger_csv.m4.md +++ b/hledger-lib/hledger_csv.m4.md @@ -545,9 +545,12 @@ REGEX REGEX is a case-insensitive regular expression which tries to match anywhere within the CSV record. It is a POSIX extended regular expressions with some additions (see [Regular expressions](https://hledger.org/hledger.html#regular-expressions) in the hledger manual). -Note: the "CSV record" it is matched against is not the original record, but a synthetic one, -with enclosing double quotes or whitespace removed, and always comma-separated. -(Eg, an SSV record `2020-01-01; "Acme, Inc."; 1,000` appears to REGEX as `2020-01-01,Acme, Inc.,1,000`). + +Important note: the record that is matched is not the original record, but a synthetic one, +with any enclosing double quotes (but not enclosing whitespace) removed, and always comma-separated +(which means that a field containing a comma will appear like two fields). +Eg, if the original record is `2020-01-01; "Acme, Inc."; 1,000`, +the REGEX will actually see `2020-01-01,Acme, Inc., 1,000`. Or, MATCHER can be a field matcher, like this: ```rules