From 3b24d9465b682c495b281ecc45660b614b4d3110 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Mon, 26 Sep 2022 16:38:06 -1000 Subject: [PATCH] imp: csv: new timezone rule; convert zoned date-times to local dates (#1936) Previously, CSV date-times with a different time zone from yours (with or without explicit timezones in the CSV) could give off-by-one dates, because the CSV timezone was ignored. Now, 1. you can use the `timezone` rule to indicate which other timezone a CSV is implicitly using 2. CSV date-times with a timezone - whether declared by rule or parsed with %Z - are localised to the system time zone (or another set with the TZ environment variable). --- hledger-lib/Hledger/Read/CsvReader.hs | 69 ++++++++++++++++++++++----- hledger/hledger.m4.md | 36 ++++++++++---- 2 files changed, 86 insertions(+), 19 deletions(-) diff --git a/hledger-lib/Hledger/Read/CsvReader.hs b/hledger-lib/Hledger/Read/CsvReader.hs index 57cfda9be..e0c7c55fd 100644 --- a/hledger-lib/Hledger/Read/CsvReader.hs +++ b/hledger-lib/Hledger/Read/CsvReader.hs @@ -56,8 +56,8 @@ import qualified Data.Text.Encoding as T import qualified Data.Text.IO as T import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Builder as TB -import Data.Time.Calendar (Day) -import Data.Time.Format (parseTimeM, defaultTimeLocale) +import Data.Time ( Day, TimeZone, UTCTime, LocalTime, ZonedTime(ZonedTime), + defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC) import Safe (atMay, headMay, lastMay, readMay) import System.Directory (doesFileExist) import System.FilePath ((), takeDirectory, takeExtension, takeFileName) @@ -460,6 +460,7 @@ directives = -- ,"default-account" -- ,"default-currency" ,"skip" + ,"timezone" ,"newest-first" , "balance-type" ] @@ -703,6 +704,13 @@ readJournalFromCsv mrulesfile csvfile csvdata = do Just "" -> return 1 Just s -> maybe (throwError $ "could not parse skip value: " ++ show s) return . 
readMay $ T.unpack s + mtzin <- case getDirective "timezone" rules of + Nothing -> return Nothing + Just s -> + maybe (throwError $ "could not parse time zone: " ++ T.unpack s) (return.Just) $ + parseTimeM False defaultTimeLocale "%Z" $ T.unpack s + tzout <- liftIO getCurrentTimeZone + -- parse csv let -- parsec seems to fail if you pass it "-" here TODO: try again with megaparsec @@ -733,9 +741,14 @@ readJournalFromCsv mrulesfile csvfile csvdata = do line' = (mkPos . (+1) . unPos) line pos' = SourcePos name line' col in - (pos', transactionFromCsvRecord pos rules r) + (pos', transactionFromCsvRecord timesarezoned mtzin tzout pos rules r) ) (initialPos parsecfilename) records + where + timesarezoned = + case csvRule rules "date-format" of + Just f | any (`T.isInfixOf` f) ["%Z","%z","%EZ","%Ez"] -> True + _ -> False -- Ensure transactions are ordered chronologically. -- First, if the CSV records seem to be most-recent-first (because @@ -856,8 +869,8 @@ hledgerField = getEffectiveAssignment hledgerFieldValue :: CsvRules -> CsvRecord -> HledgerFieldName -> Maybe Text hledgerFieldValue rules record = fmap (renderTemplate rules record) . hledgerField rules record -transactionFromCsvRecord :: SourcePos -> CsvRules -> CsvRecord -> Transaction -transactionFromCsvRecord sourcepos rules record = t +transactionFromCsvRecord :: Bool -> Maybe TimeZone -> TimeZone -> SourcePos -> CsvRules -> CsvRecord -> Transaction +transactionFromCsvRecord timesarezoned mtzin tzout sourcepos rules record = t where ---------------------------------------------------------------------- -- 1. 
Define some helpers: @@ -866,7 +879,8 @@ transactionFromCsvRecord sourcepos rules record = t -- ruleval = csvRuleValue rules record :: DirectiveName -> Maybe String field = hledgerField rules record :: HledgerFieldName -> Maybe FieldTemplate fieldval = hledgerFieldValue rules record :: HledgerFieldName -> Maybe Text - parsedate = parseDateWithCustomOrDefaultFormats (rule "date-format") + mdateformat = rule "date-format" + parsedate = parseDateWithCustomOrDefaultFormats timesarezoned mtzin tzout mdateformat mkdateerror datefield datevalue mdateformat' = T.unpack $ T.unlines ["error: could not parse \""<>datevalue<>"\" as a date using date format " <>maybe "\"YYYY/M/D\", \"YYYY-M-D\" or \"YYYY.M.D\"" (T.pack . show) mdateformat' @@ -887,7 +901,6 @@ transactionFromCsvRecord sourcepos rules record = t -- field assignment rules using the CSV record's data, and parsing a bit -- more where needed (dates, status). - mdateformat = rule "date-format" date = fromMaybe "" $ fieldval "date" -- PARTIAL: date' = fromMaybe (error' $ mkdateerror "date" date mdateformat) $ parsedate date @@ -1320,11 +1333,45 @@ csvFieldValue rules record fieldname = do -- | Parse the date string using the specified date-format, or if unspecified -- the "simple date" formats (YYYY/MM/DD, YYYY-MM-DD, YYYY.MM.DD, leading --- zeroes optional). -parseDateWithCustomOrDefaultFormats :: Maybe DateFormat -> Text -> Maybe Day -parseDateWithCustomOrDefaultFormats mformat s = asum $ map parsewith' formats +-- zeroes optional). If a timezone is provided, we assume the DateFormat +-- produces a zoned time and we localise that to the given timezone. 
+parseDateWithCustomOrDefaultFormats :: Bool -> Maybe TimeZone -> TimeZone -> Maybe DateFormat -> Text -> Maybe Day +parseDateWithCustomOrDefaultFormats timesarezoned mtzin tzout mformat s = localdate <$> mutctime + -- this time code can probably be simpler, I'm just happy to get out alive where - parsewith' = flip (parseTimeM True defaultTimeLocale) (T.unpack s) + localdate :: UTCTime -> Day = + localDay . + dbg7 ("time in output timezone "++show tzout) . + utcToLocalTime tzout + mutctime :: Maybe UTCTime = asum $ map parseWithFormat formats + + parseWithFormat :: String -> Maybe UTCTime + parseWithFormat fmt = + if timesarezoned + then + dbg7 "zoned CSV time, expressed as UTC" $ + parseTimeM True defaultTimeLocale fmt $ T.unpack s :: Maybe UTCTime + else + -- parse as a local day and time; then if an input timezone is provided, + -- assume it's in that, otherwise assume it's in the output timezone; + -- then convert to UTC like the above + let + mlocaltime = + fmap (dbg7 "unzoned CSV time") $ + parseTimeM True defaultTimeLocale fmt $ T.unpack s :: Maybe LocalTime + localTimeAsZonedTime tz lt = ZonedTime lt tz + in + case mtzin of + Just tzin -> + (dbg7 ("unzoned CSV time, declared as "++show tzin++ ", expressed as UTC") . + localTimeToUTC tzin) + <$> mlocaltime + Nothing -> + (dbg7 ("unzoned CSV time, treated as "++show tzout++ ", expressed as UTC") . + zonedTimeToUTC . + localTimeAsZonedTime tzout) + <$> mlocaltime + formats = map T.unpack $ maybe ["%Y/%-m/%-d" ,"%Y-%-m-%-d" diff --git a/hledger/hledger.m4.md b/hledger/hledger.m4.md index a855a3adc..b9fad4ba2 100644 --- a/hledger/hledger.m4.md +++ b/hledger/hledger.m4.md @@ -4592,8 +4592,9 @@ date-format DATEFMT ``` This is a helper for the `date` (and `date2`) fields. If your CSV dates are not formatted like `YYYY-MM-DD`, `YYYY/MM/DD` or `YYYY.MM.DD`, -you'll need to add a date-format rule describing them with a -strptime date parsing pattern, which must parse the CSV date value completely. 
+you'll need to add a date-format rule describing them with a strptime-style date parsing pattern - +see <https://hackage.haskell.org/package/time/docs/Data-Time-Format.html#v:formatTime>. +The pattern must parse the CSV date value completely. Some examples: ``` rules # MM/DD/YY @@ -4613,14 +4614,33 @@ date-format %Y-%h-%d # Note the time and junk must be fully parsed, though only the date is used. date-format %-m/%-d/%Y %l:%M %p some other junk ``` -For the supported strptime syntax, see:\ -<https://hackage.haskell.org/package/time/docs/Data-Time-Format.html#v:formatTime> -Note that although you can parse date-times which include a time zone, -that time zone is ignored; it will not change the date that is parsed. -This means when reading CSV data with times not in your local time zone, -dates can be "off by one". +### `timezone` +```rules +timezone TIMEZONE +``` + +When CSV contains date-times that are implicitly in some time zone +other than yours, but containing no explicit time zone information, +you can use this rule to declare the CSV's native time zone, +which helps prevent off-by-one dates. + +When the CSV date-times do contain time zone information, +you don't need this rule; instead, use `%Z` in `date-format` +(or `%z`, `%EZ`, `%Ez`; see the formatTime link above). + +In either of these cases, hledger will do a time-zone-aware conversion, +localising the CSV date-times to your current system time zone. +If you prefer to localise to some other time zone, eg for reproducibility, +you can (on unix at least) set the output timezone with the TZ environment variable, eg: +```shell +$ TZ=HST hledger print -f foo.csv # or TZ=HST hledger import foo.csv +``` + +`timezone` currently does not understand timezone names, except +"UTC", "GMT", "EST", "EDT", "CST", "CDT", "MST", "MDT", "PST", or "PDT". +For others, use numeric format: +HHMM or -HHMM. ### `decimal-mark`