imp: csv: new timezone rule; convert zoned date-times to local dates (#1936)

Previously, CSV date-times with a different time zone from yours
(with or without explicit timezones in the CSV) could give off-by-one
dates, because the CSV timezone was ignored.

Now,

1. you can use the `timezone` rule to indicate which other
   timezone a CSV is implicitly using

2. CSV date-times with a timezone - whether declared by rule or
   parsed with %Z - are localised to the system time zone
   (or another set with the TZ environment variable).
This commit is contained in:
Simon Michael 2022-09-26 16:38:06 -10:00
parent 7d1b1cadce
commit 3b24d9465b
2 changed files with 86 additions and 19 deletions

View File

@ -56,8 +56,8 @@ import qualified Data.Text.Encoding as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Builder as TB
import Data.Time.Calendar (Day)
import Data.Time.Format (parseTimeM, defaultTimeLocale)
import Data.Time ( Day, TimeZone, UTCTime, LocalTime, ZonedTime(ZonedTime),
defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC)
import Safe (atMay, headMay, lastMay, readMay)
import System.Directory (doesFileExist)
import System.FilePath ((</>), takeDirectory, takeExtension, takeFileName)
@ -460,6 +460,7 @@ directives =
-- ,"default-account"
-- ,"default-currency"
,"skip"
,"timezone"
,"newest-first"
, "balance-type"
]
@ -703,6 +704,13 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
Just "" -> return 1
Just s -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s
mtzin <- case getDirective "timezone" rules of
Nothing -> return Nothing
Just s ->
maybe (throwError $ "could not parse time zone: " ++ T.unpack s) (return.Just) $
parseTimeM False defaultTimeLocale "%Z" $ T.unpack s
tzout <- liftIO getCurrentTimeZone
-- parse csv
let
-- parsec seems to fail if you pass it "-" here TODO: try again with megaparsec
@ -733,9 +741,14 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
line' = (mkPos . (+1) . unPos) line
pos' = SourcePos name line' col
in
(pos', transactionFromCsvRecord pos rules r)
(pos', transactionFromCsvRecord timesarezoned mtzin tzout pos rules r)
)
(initialPos parsecfilename) records
where
timesarezoned =
case csvRule rules "date-format" of
Just f | any (`T.isInfixOf` f) ["%Z","%z","%EZ","%Ez"] -> True
_ -> False
-- Ensure transactions are ordered chronologically.
-- First, if the CSV records seem to be most-recent-first (because
@ -856,8 +869,8 @@ hledgerField = getEffectiveAssignment
hledgerFieldValue :: CsvRules -> CsvRecord -> HledgerFieldName -> Maybe Text
hledgerFieldValue rules record = fmap (renderTemplate rules record) . hledgerField rules record
transactionFromCsvRecord :: SourcePos -> CsvRules -> CsvRecord -> Transaction
transactionFromCsvRecord sourcepos rules record = t
transactionFromCsvRecord :: Bool -> Maybe TimeZone -> TimeZone -> SourcePos -> CsvRules -> CsvRecord -> Transaction
transactionFromCsvRecord timesarezoned mtzin tzout sourcepos rules record = t
where
----------------------------------------------------------------------
-- 1. Define some helpers:
@ -866,7 +879,8 @@ transactionFromCsvRecord sourcepos rules record = t
-- ruleval = csvRuleValue rules record :: DirectiveName -> Maybe String
field = hledgerField rules record :: HledgerFieldName -> Maybe FieldTemplate
fieldval = hledgerFieldValue rules record :: HledgerFieldName -> Maybe Text
parsedate = parseDateWithCustomOrDefaultFormats (rule "date-format")
mdateformat = rule "date-format"
parsedate = parseDateWithCustomOrDefaultFormats timesarezoned mtzin tzout mdateformat
mkdateerror datefield datevalue mdateformat' = T.unpack $ T.unlines
["error: could not parse \""<>datevalue<>"\" as a date using date format "
<>maybe "\"YYYY/M/D\", \"YYYY-M-D\" or \"YYYY.M.D\"" (T.pack . show) mdateformat'
@ -887,7 +901,6 @@ transactionFromCsvRecord sourcepos rules record = t
-- field assignment rules using the CSV record's data, and parsing a bit
-- more where needed (dates, status).
mdateformat = rule "date-format"
date = fromMaybe "" $ fieldval "date"
-- PARTIAL:
date' = fromMaybe (error' $ mkdateerror "date" date mdateformat) $ parsedate date
@ -1320,11 +1333,45 @@ csvFieldValue rules record fieldname = do
-- | Parse the date string using the specified date-format, or if unspecified
-- the "simple date" formats (YYYY/MM/DD, YYYY-MM-DD, YYYY.MM.DD, leading
-- zeroes optional).
parseDateWithCustomOrDefaultFormats :: Maybe DateFormat -> Text -> Maybe Day
parseDateWithCustomOrDefaultFormats mformat s = asum $ map parsewith' formats
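-- zeroes optional). If timesarezoned is true, the date format is assumed to
-- parse a zoned time. Otherwise the parsed time is assumed to be in the
-- input timezone if one is provided, or else in the output timezone.
-- Either way, the resulting time is localised to the output timezone
-- and its date is returned.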
parseDateWithCustomOrDefaultFormats :: Bool -> Maybe TimeZone -> TimeZone -> Maybe DateFormat -> Text -> Maybe Day
parseDateWithCustomOrDefaultFormats timesarezoned mtzin tzout mformat s = localdate <$> mutctime
-- this time code can probably be simpler, I'm just happy to get out alive
where
parsewith' = flip (parseTimeM True defaultTimeLocale) (T.unpack s)
localdate :: UTCTime -> Day =
localDay .
dbg7 ("time in output timezone "++show tzout) .
utcToLocalTime tzout
mutctime :: Maybe UTCTime = asum $ map parseWithFormat formats
parseWithFormat :: String -> Maybe UTCTime
parseWithFormat fmt =
if timesarezoned
then
dbg7 "zoned CSV time, expressed as UTC" $
parseTimeM True defaultTimeLocale fmt $ T.unpack s :: Maybe UTCTime
else
-- parse as a local day and time; then if an input timezone is provided,
-- assume it's in that, otherwise assume it's in the output timezone;
-- then convert to UTC like the above
let
mlocaltime =
fmap (dbg7 "unzoned CSV time") $
parseTimeM True defaultTimeLocale fmt $ T.unpack s :: Maybe LocalTime
localTimeAsZonedTime tz lt = ZonedTime lt tz
in
case mtzin of
Just tzin ->
(dbg7 ("unzoned CSV time, declared as "++show tzin++ ", expressed as UTC") .
localTimeToUTC tzin)
<$> mlocaltime
Nothing ->
(dbg7 ("unzoned CSV time, treated as "++show tzout++ ", expressed as UTC") .
zonedTimeToUTC .
localTimeAsZonedTime tzout)
<$> mlocaltime
formats = map T.unpack $ maybe
["%Y/%-m/%-d"
,"%Y-%-m-%-d"

View File

@ -4592,8 +4592,9 @@ date-format DATEFMT
```
This is a helper for the `date` (and `date2`) fields.
If your CSV dates are not formatted like `YYYY-MM-DD`, `YYYY/MM/DD` or `YYYY.MM.DD`,
you'll need to add a date-format rule describing them with a
strptime date parsing pattern, which must parse the CSV date value completely.
you'll need to add a date-format rule describing them with a strptime-style date parsing pattern -
see <https://hackage.haskell.org/package/time/docs/Data-Time-Format.html#v:formatTime>.
The pattern must parse the CSV date value completely.
Some examples:
``` rules
# MM/DD/YY
@ -4613,14 +4614,33 @@ date-format %Y-%h-%d
# Note the time and junk must be fully parsed, though only the date is used.
date-format %-m/%-d/%Y %l:%M %p some other junk
```
For the supported strptime syntax, see:\
<https://hackage.haskell.org/package/time/docs/Data-Time-Format.html#v:formatTime>
Note that although you can parse date-times which include a time zone,
that time zone is ignored; it will not change the date that is parsed.
This means when reading CSV data with times not in your local time zone,
dates can be "off by one".
### `timezone`
```rules
timezone TIMEZONE
```
When a CSV file contains date-times that are implicitly in some time zone
other than yours, but contain no explicit time zone information,
you can use this rule to declare the CSV's native time zone,
which helps prevent off-by-one dates.
When the CSV date-times do contain time zone information,
you don't need this rule; instead, use `%Z` in `date-format`
(or `%z`, `%EZ`, `%Ez`; see the formatTime link above).
In either of these cases, hledger will do a time-zone-aware conversion,
localising the CSV date-times to your current system time zone.
If you prefer to localise to some other time zone, eg for reproducibility,
you can (on unix at least) set the output timezone with the TZ environment variable, eg:
```shell
$ TZ=HST hledger print -f foo.csv # or TZ=HST hledger import foo.csv
```
`timezone` currently understands only these timezone names:
"UTC", "GMT", "EST", "EDT", "CST", "CDT", "MST", "MDT", "PST", and "PDT".
For other time zones, use the numeric format: +HHMM or -HHMM.
### `decimal-mark`