lib: simplify format detection, avoid ledger reader by default

When we don't know a file's format, instead of choosing a subset of
readers based on content sniffing, now we just try them all.
Also, LedgerReader is now used only as a last resort,
as it's not yet competitive with JournalReader.
This commit is contained in:
Simon Michael 2016-11-18 13:24:57 -08:00
parent 59ce4c987b
commit b6ff170688
7 changed files with 61 additions and 135 deletions

View File

@ -319,12 +319,19 @@ type StorageFormat = String
-- | A hledger journal reader is a triple of storage format name, a
-- detector of that format, and a parser from that format to Journal.
data Reader = Reader {
-- name of the format this reader handles
-- The canonical name of the format handled by this reader
rFormat :: StorageFormat
-- quickly check if this reader can probably handle the given file path and file content
,rDetector :: FilePath -> Text -> Bool
-- parse the given string, using the given parse rules file if any, returning a journal or error aware of the given file path
-- The file extensions recognised as containing this format
,rExtensions :: [String]
-- A text parser for this format, accepting an optional rules file,
-- assertion-checking flag, and file path for error messages,
-- producing an exception-raising IO action that returns a journal
-- or error message.
,rParser :: Maybe FilePath -> Bool -> FilePath -> Text -> ExceptT String IO Journal
}
-- Show a reader as its format name followed by " reader".
instance Show Reader where
  show = (++ " reader") . rFormat

View File

@ -12,9 +12,7 @@ readJournalFiles
readJournalFile
requireJournalFileExists
readJournal
readersFor
readerForStorageFormat
readersForPathAndData
findReader
tryReaders
@
@ -34,20 +32,9 @@ module Hledger.Read (
-- * Journal parsing
readJournal,
readersFor,
readerForStorageFormat,
readersForPathAndData,
tryReaders,
readJournal',
readFormatNames,
-- * Re-exported
-- accountnamep,
-- amountp,
-- amountp',
-- mamountp',
-- numberp,
-- codep,
accountaliasp,
postingp,
module Hledger.Read.Common,
@ -64,10 +51,11 @@ import Data.List
import Data.Maybe
import Data.Text (Text)
import qualified Data.Text as T
import Safe
import System.Directory (doesFileExist, getHomeDirectory)
import System.Environment (getEnv)
import System.Exit (exitFailure)
import System.FilePath ((</>))
import System.FilePath ((</>), takeExtension)
import System.IO (stderr)
import Test.HUnit
import Text.Printf
@ -85,20 +73,16 @@ import Prelude hiding (getContents, writeFile)
import Hledger.Utils.UTF8IOCompat (writeFile)
-- The available data file readers, each one handling a particular data
-- format. The first is also used as the default for unknown formats.
-- The available journal readers, each one handling a particular data format.
readers :: [Reader]
readers = [
JournalReader.reader
,LedgerReader.reader
,TimeclockReader.reader
,TimedotReader.reader
,CsvReader.reader
,LedgerReader.reader
]
readFormatNames :: [StorageFormat]
readFormatNames = map rFormat readers
journalEnvVar = "LEDGER_FILE"
journalEnvVar2 = "LEDGER"
journalDefaultFilename = ".hledger.journal"
@ -192,44 +176,28 @@ tests_readJournal' = [
-- | @readJournal mformat mrulesfile assrt mpath t@
--
-- Read a journal from this string, trying whatever readers seem appropriate:
--
-- - if a format is specified, try that reader only
--
-- - or if one or more readers recognises the file path and data, try those
--
-- - otherwise, try them all.
--
-- A CSV conversion rules file may also be specified for use by the CSV reader.
-- Also there is a flag specifying whether to check or ignore balance assertions in the journal.
-- Try to read a Journal from some text.
-- If a format is specified (mformat), try only that reader.
-- Otherwise if the file path is provided (mpath), and it specifies a format, try only that reader.
-- Otherwise try all readers in turn until one succeeds, or return the first error if none of them succeed.
-- A CSV conversion rules file may be specified (mrulesfile) for use by the CSV reader.
-- If the assrt flag is true, also check and enforce balance assertions in the journal.
readJournal :: Maybe StorageFormat -> Maybe FilePath -> Bool -> Maybe FilePath -> Text -> IO (Either String Journal)
readJournal mformat mrulesfile assrt mpath t =
let rs = readersFor (mformat, mpath, t)
in tryReaders rs mrulesfile assrt mpath t
let rs = maybe readers (:[]) $ findReader mformat mpath
in tryReaders rs mrulesfile assrt mpath t
-- | @readersFor (format,path,t)@
-- | @findReader mformat mpath@
--
-- Which readers are worth trying for this (possibly unspecified) format, filepath, and data ?
readersFor :: (Maybe StorageFormat, Maybe FilePath, Text) -> [Reader]
readersFor (format,path,t) =
dbg1 ("possible readers for "++show (format,path,textElideRight 30 t)) $
case format of
Just f -> case readerForStorageFormat f of Just r -> [r]
Nothing -> []
Nothing -> case path of Nothing -> readers
Just p -> case readersForPathAndData (p,t) of [] -> readers
rs -> rs
-- | Find the (first) reader which can handle the given format, if any.
readerForStorageFormat :: StorageFormat -> Maybe Reader
readerForStorageFormat s | null rs = Nothing
| otherwise = Just $ head rs
where
rs = filter ((s==).rFormat) readers :: [Reader]
-- | Find the readers which think they can handle the given file path and data, if any.
readersForPathAndData :: (FilePath,Text) -> [Reader]
readersForPathAndData (f,t) = filter (\r -> dbg1 ("try "++rFormat r++" format") $ (rDetector r) f t) readers
-- Select the single reader to use for some input, if one can be determined.
-- An explicit format name (mformat) takes precedence: the first reader
-- whose rFormat equals it is chosen and the file path is not consulted.
-- With no format but a file path (mpath), the first reader listing the
-- path's extension in its rExtensions is chosen.
-- Nothing is returned when no reader matches, or when neither a format
-- nor a path was given.
findReader :: Maybe StorageFormat -> Maybe FilePath -> Maybe Reader
findReader mformat mpath =
  case (mformat, mpath) of
    (Just fmt, _)         -> firstReader ((fmt ==) . rFormat)
    (Nothing,  Just path) -> firstReader ((extensionOf path `elem`) . rExtensions)
    (Nothing,  Nothing)   -> Nothing
  where
    -- the earliest entry of the readers list satisfying the predicate
    firstReader p = find p readers
    -- takeExtension keeps the leading dot; drop it before comparing
    extensionOf = drop 1 . takeExtension
-- | @tryReaders readers mrulesfile assrt path t@
--

View File

@ -64,18 +64,11 @@ import Hledger.Read.Common (amountp, statusp, genericSourcePos)
reader :: Reader
reader = Reader format detect parse
format :: String
format = "csv"
-- | Does the given file path and data look like something this reader can handle ?
detect :: FilePath -> Text -> Bool
detect f excerpt
-- file name known: try this reader if it has any of these extensions
| f /= "-" = takeExtension f `elem` ['.':format]
-- file name unknown: try this reader if excerpt contains two or more commas
| otherwise = T.length (T.filter (==',') excerpt) >= 2
-- The reader for CSV data.
reader = Reader
  { rFormat     = "csv"    -- canonical format name
  , rExtensions = ["csv"]  -- file extensions, without the leading dot
  , rParser     = parse    -- this module's parser
  }
-- | Parse and post-process a "Journal" from CSV data, or give an error.
-- XXX currently ignores the string and reads from the file path

View File

@ -106,21 +106,11 @@ import Hledger.Utils
--- * reader
reader :: Reader
reader = Reader format detect parse
format :: String
format = "journal"
-- | Does the given file path and data look like something this reader can handle ?
detect :: FilePath -> Text -> Bool
detect f _
-- file name known: try this reader if it has any of these extensions
| f /= "-" = takeExtension f `elem` ['.':format, ".j", ".hledger", ".ledger", ".l"]
-- file name unknown: always try this reader
| otherwise = True
-- file name unknown: try this reader if we can see something like a journal entry
-- (digits in column 0 with the next line indented)
-- otherwise = regexMatches "(^|\n)[0-9]+.*\n[ \t]+" $ T.unpack excerpt
-- The reader for hledger's journal format.
reader = Reader
  { rFormat     = "journal"                             -- canonical format name
  , rExtensions = ["journal", "j", "hledger", "ledger"] -- file extensions, without the leading dot
  , rParser     = parse                                 -- this module's parser
  }
-- | Parse and post-process a "Journal" from hledger's journal file
-- format, or give an error.

View File

@ -14,19 +14,11 @@ where
--- * imports
import Prelude ()
import Prelude.Compat hiding (readFile)
-- import qualified Control.Exception as C
import Control.Monad
import Control.Monad.IO.Class (liftIO)
import Control.Monad.Except (ExceptT(..), throwError)
-- import Control.Monad.State.Strict
-- import qualified Data.Map.Strict as M
import Data.Maybe
-- import Data.List
import Data.Text (Text, pack)
import Data.Text.Encoding (encodeUtf8)
-- import qualified Data.Text as T
-- import Data.Time.Calendar
-- import Data.Time.LocalTime
-- import Safe
import Test.HUnit
-- #ifdef TESTS
@ -35,7 +27,6 @@ import Test.HUnit
-- #endif
import Text.Megaparsec (eof)
-- import Text.Printf
import System.FilePath
import System.Time
import qualified Filesystem.Path.CurrentOS as F
@ -51,20 +42,14 @@ import Text.Trifecta.Result (Result(..))
--- * reader
reader :: Reader
reader = Reader format detect parse
-- The reader for the "ledger" format.
reader = Reader
  { rFormat     = "ledger"  -- canonical format name
  , rExtensions = []        -- no file extensions are claimed by this reader
  , rParser     = parse     -- this module's parser
  }
format :: String
format = "ledger"
-- | Does the given file path and data look like something this reader can handle ?
detect :: FilePath -> Text -> Bool
detect f _
-- file name known: try this reader if it has any of these extensions
| f /= "-" = takeExtension f `elem` ['.':format, ".l"]
-- file name unknown: don't try this reader
| otherwise = False
-- | Parse and post-process a "Journal" from ledger's journal format, or give an error.
-- | Generate an action that parses and post-processes a "Journal" from a
-- C++ Ledger journal, or raises an error.
parse :: Maybe FilePath -> Bool -> FilePath -> Text -> ExceptT String IO Journal
parse _mrulespath assrt path txt = do
let

View File

@ -61,7 +61,6 @@ import Data.Text (Text)
import qualified Data.Text as T
import Test.HUnit
import Text.Megaparsec hiding (parse)
import System.FilePath
import Hledger.Data
-- XXX too much reuse ?
@ -70,18 +69,11 @@ import Hledger.Utils
reader :: Reader
reader = Reader format detect parse
format :: String
format = "timeclock"
-- | Does the given file path and data look like something this reader can handle ?
detect :: FilePath -> Text -> Bool
detect f excerpt
-- file name known: try this reader if it has any of these extensions
| f /= "-" = takeExtension f `elem` ['.':format]
-- file name unknown: try this reader if a line starts with "i " or "o " in excerpt
| otherwise = regexMatches "(^|\n)[io] " $ T.unpack excerpt
-- The reader for the timeclock format.
reader = Reader
  { rFormat     = "timeclock"    -- canonical format name
  , rExtensions = ["timeclock"]  -- file extensions, without the leading dot
  , rParser     = parse          -- this module's parser
  }
-- | Parse and post-process a "Journal" from timeclock.el's timeclock
-- format, saving the provided file path and the current time, or give an

View File

@ -41,10 +41,8 @@ import Data.Char (isSpace)
import Data.List (foldl')
import Data.Maybe
import Data.Text (Text)
import qualified Data.Text as T
import Test.HUnit
import Text.Megaparsec hiding (parse)
import System.FilePath
import Hledger.Data
import Hledger.Read.Common
@ -56,18 +54,11 @@ import Hledger.Utils hiding (ptrace)
ptrace = return
reader :: Reader
reader = Reader format detect parse
format :: String
format = "timedot"
-- | Does the given file path and data look like something this reader can handle ?
detect :: FilePath -> Text -> Bool
detect f excerpt
-- file name known: try this reader if it has any of these extensions
| f /= "-" = takeExtension f `elem` ['.':format]
-- file name unknown: try this reader if a line starts with a number in excerpt
| otherwise = regexMatches "(^|\n)[0-9]" $ T.unpack excerpt
-- The reader for the timedot format.
reader = Reader
  { rFormat     = "timedot"    -- canonical format name
  , rExtensions = ["timedot"]  -- file extensions, without the leading dot
  , rParser     = parse        -- this module's parser
  }
-- | Parse and post-process a "Journal" from the timedot format, or give an error.
parse :: Maybe FilePath -> Bool -> FilePath -> Text -> ExceptT String IO Journal