Begin overhaul of siphon

This commit is contained in:
Andrew Martin 2017-06-09 19:28:32 -04:00
parent fca7d72085
commit 03e9e3734b
5 changed files with 660 additions and 132 deletions

1
.gitignore vendored
View File

@ -25,5 +25,6 @@ tmp/
tags
TAGS
colonnade/ex1.hs
colonnade/result
reflex-dom-colonnade/result

View File

@ -1,29 +1,22 @@
name: siphon
version: 0.6
synopsis: Encode and decode CSV files
description: Please see README.md
homepage: https://github.com/andrewthad/colonnade#readme
license: BSD3
license-file: LICENSE
author: Andrew Martin
maintainer: andrew.thaddeus@gmail.com
copyright: 2016 Andrew Martin
category: web
build-type: Simple
cabal-version: >=1.10
name: siphon
version: 0.7
synopsis: Encode and decode CSV files
description: Please see README.md
homepage: https://github.com/andrewthad/colonnade#readme
license: BSD3
license-file: LICENSE
author: Andrew Martin
maintainer: andrew.thaddeus@gmail.com
copyright: 2016 Andrew Martin
category: web
build-type: Simple
cabal-version: >=1.10
library
hs-source-dirs: src
exposed-modules:
Siphon.Text
Siphon.ByteString.Char8
Siphon
Siphon.Types
Siphon.Content
Siphon.Encoding
Siphon.Decoding
Siphon.Internal
Siphon.Internal.Text
build-depends:
base >= 4.7 && < 5
, colonnade >= 1.1 && < 1.2
@ -31,8 +24,9 @@ library
, bytestring
, contravariant
, vector
, pipes
, streaming
, attoparsec
, transformers
default-language: Haskell2010
test-suite siphon-test

View File

@ -1,11 +1,595 @@
module Siphon where
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE RankNTypes #-}
-- encode :: Pipe a (Vector c) m x
-- encode
-- decode :: Pipe (Vector c) a m x
-- {-# OPTIONS_GHC -Wall -Werr -fno-warn-unused-imports #-}
-- encode ::
module Siphon
( Siphon
, SiphonError
, Indexed(..)
, decodeHeadedChar8Csv
, humanizeSiphonError
) where
-- row :: Vector (Escaped Text) -> Text
-- row = Vector.
import Siphon.Types
import Data.Monoid
import Control.Applicative
import Control.Monad
import qualified Data.ByteString.Char8 as BC8
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.Lazy as AL
import qualified Data.Attoparsec.Zepto as Z
import qualified Data.ByteString as S
import qualified Data.ByteString.Unsafe as S
import qualified Data.Vector as V
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as LByteString
import qualified Data.ByteString.Builder as Builder
import qualified Data.Text as T
import qualified Data.List as L
import qualified Streaming as SM
import qualified Streaming.Prelude as SMP
import qualified Data.Attoparsec.Types as ATYP
import qualified Colonnade.Encode as CE
import Control.Monad.Trans.Class
import Data.ByteString.Builder (toLazyByteString,byteString)
import Data.Attoparsec.ByteString.Char8 (char, endOfInput, string)
import Data.Word (Word8)
import Data.Vector (Vector)
import Data.ByteString (ByteString)
import Data.Coerce (coerce)
import Data.Char (chr)
import Streaming (Stream,Of(..))
newtype Escaped c = Escaped { getEscaped :: c }
data Ended = EndedYes | EndedNo
data CellResult c = CellResultData !c | CellResultNewline !Ended
decodeHeadedChar8Csv :: Monad m
=> Siphon CE.Headed ByteString a
-> Stream (Of ByteString) m () -- ^ encoded csv
-> Stream (Of a) m (Maybe (SiphonError ByteString))
decodeHeadedChar8Csv headedSiphon s1 = do
e <- lift (consumeHeaderRowChar8 s1)
case e of
Left err -> return (Just err)
Right (v :> s2) -> case headedToIndexed v headedSiphon of
Left err -> return (Just err)
Right ixedSiphon -> do
let requiredLength = V.length v
consumeBodyChar8 1 requiredLength ixedSiphon s2
encodeHeadedChar8Csv :: Monad m
=> Colonnade CE.Headed ByteString a
-> Stream (Of a) m r
-> Stream (Of ByteString) m r
encodeHeadedChar8Csv headedSiphon s1 = do
yield (header siphon encoding)
pipe siphon encoding
encodeGeneralCsv :: Monad m
=> (c -> Escaped c)
-> c -- ^ separator
-> Colonnade f a c
-> Stream (Of a) m r
-> Stream (Of c) m r
encodeGeneralCsv escapeFunc separatorStr colonnade = do
Pipes.map (row siphon encoding)
encodeHeader :: Siphon c -> Colonnade Headed a c -> c
=> (c -> Escaped c)
-> c -- ^ separator
-> Colonnade f a c
-> Stream (Of c) m r
encodeHeader escapeFunc separatorStr colonnade = SMP.mapM_ $ \a -> do
let (vs,ws) = V.splitAt 1 (CE.getColonnade colonnade)
-- we only need to do this split because the first cell
-- gets treated differently than the others. It does not
-- get a separator added before it.
V.forM_ vs $ \(CE.OneColonnade _ encode) -> yield (getEscaped (escapeFunc (encode a)))
V.forM_ ws $ \(CE.OneColonnade _ encode) -> do
yield separator
yeied (getEscaped (escapeFunc (encode a)))
encodeRow ::
=> (c -> Escaped c)
-> c -- ^ separator
-> Colonnade f a c
-> Stream (Of a) m r
-> Stream (Of c) m r
encodeRow escapeFunc separatorStr colonnade = SMP.mapM_ $ \a -> do
let (vs,ws) = V.splitAt 1 (CE.getColonnade colonnade)
-- we only need to do this split because the first cell
-- gets treated differently than the others. It does not
-- get a separator added before it.
V.forM_ vs $ \(CE.OneColonnade _ encode) -> yield (getEscaped (escapeFunc (encode a)))
V.forM_ ws $ \(CE.OneColonnade _ encode) -> do
yield separator
yeied (getEscaped (escapeFunc (encode a)))
data IndexedHeader a = IndexedHeader
{ indexedHeaderIndexed :: {-# UNPACK #-} !Int
, indexedHeaderHeader :: !a
}
-- | Maps over a 'Decolonnade' that expects headers, converting these
-- expected headers into the indices of the columns that they
-- correspond to.
headedToIndexed :: forall c a. Eq c
=> Vector c -- ^ Headers in the source document
-> Siphon CE.Headed c a -- ^ Decolonnade that contains expected headers
-> Either (SiphonError c) (Siphon IndexedHeader c a)
headedToIndexed v =
mapLeft (\(HeaderErrors a b c) -> SiphonError 0 (RowErrorHeaders a b c))
. getEitherWrap
. go
where
go :: forall b.
Siphon CE.Headed c b
-> EitherWrap (HeaderErrors c) (Siphon IndexedHeader c b)
go (SiphonPure b) = EitherWrap (Right (SiphonPure b))
go (SiphonAp (CE.Headed h) decode apNext) =
let rnext = go apNext
ixs = V.elemIndices h v
ixsLen = V.length ixs
rcurrent
| ixsLen == 1 = Right (V.unsafeIndex ixs 0)
| ixsLen == 0 = Left (HeaderErrors V.empty (V.singleton h) V.empty)
| otherwise =
let dups = V.singleton (V.map (\ix -> CellError ix (V.unsafeIndex v ix)) ixs)
in Left (HeaderErrors dups V.empty V.empty)
in (\ix nextSiphon -> SiphonAp (IndexedHeader ix h) decode nextSiphon)
<$> EitherWrap rcurrent
<*> rnext
data HeaderErrors c = HeaderErrors !(Vector (Vector (CellError c))) !(Vector c) !(Vector Int)
instance Monoid (HeaderErrors c) where
mempty = HeaderErrors mempty mempty mempty
mappend (HeaderErrors a1 b1 c1) (HeaderErrors a2 b2 c2) = HeaderErrors
(mappend a1 a2) (mappend b1 b2) (mappend c1 c2)
-- byteStringChar8 :: Siphon ByteString
-- byteStringChar8 = Siphon
-- escape
-- encodeRow
-- (A.parse (row comma))
-- B.null
encodeRow :: Vector (Escaped ByteString) -> ByteString
encodeRow = id
. flip B.append (B.singleton newline)
. B.intercalate (B.singleton comma)
. V.toList
. coerce
escape :: ByteString -> Escaped ByteString
escape t = case B.find (\c -> c == newline || c == cr || c == comma || c == doubleQuote) t of
Nothing -> Escaped t
Just _ -> escapeAlways t
-- | This implementation is definitely suboptimal.
-- A better option (which would waste a little space
-- but would be much faster) would be to build the
-- new bytestring by writing to a buffer directly.
escapeAlways :: ByteString -> Escaped ByteString
escapeAlways t = Escaped $ LByteString.toStrict $ Builder.toLazyByteString $
Builder.word8 doubleQuote
<> B.foldl
(\ acc b -> acc <> if b == doubleQuote
then Builder.byteString
(B.pack [doubleQuote,doubleQuote])
else Builder.word8 b)
mempty
t
<> Builder.word8 doubleQuote
-- | Specialized version of 'sepBy1'' which is faster due to not
-- accepting an arbitrary separator.
sepByDelim1' :: AL.Parser a
-> Word8 -- ^ Field delimiter
-> AL.Parser [a]
sepByDelim1' p !delim = liftM2' (:) p loop
where
loop = do
mb <- A.peekWord8
case mb of
Just b | b == delim -> liftM2' (:) (A.anyWord8 *> p) loop
_ -> pure []
{-# INLINE sepByDelim1' #-}
-- | Parse a record, not including the terminating line separator. The
-- terminating line separate is not included as the last record in a
-- CSV file is allowed to not have a terminating line separator. You
-- most likely want to use the 'endOfLine' parser in combination with
-- this parser.
-- row :: Word8 -- ^ Field delimiter
-- -> AL.Parser (Vector ByteString)
-- row !delim = rowNoNewline delim <* endOfLine
-- {-# INLINE row #-}
--
-- rowNoNewline :: Word8 -- ^ Field delimiter
-- -> AL.Parser (Vector ByteString)
-- rowNoNewline !delim = V.fromList <$!> field delim `sepByDelim1'` delim
-- {-# INLINE rowNoNewline #-}
--
-- removeBlankLines :: [Vector ByteString] -> [Vector ByteString]
-- removeBlankLines = filter (not . blankLine)
-- | Parse a field. The field may be in either the escaped or
-- non-escaped format. The return value is unescaped. This
-- parser will consume the comma that comes after a field
-- but not a newline that follows a field. If we are positioned
-- at a newline when it starts, that newline will be consumed
-- and we return CellResultNewline.
field :: Word8 -> AL.Parser (CellResult ByteString)
field !delim = do
mb <- A.peekWord8
-- We purposely don't use <|> as we want to commit to the first
-- choice if we see a double quote.
case mb of
Just b
| b == delim -> do
bs <- escapedField delim
return (CellResultData bs)
| b == 10 || b == 13 -> do
_ <- eatNewlines
isEnd <- A.atEnd
if isEnd
then return (CellResultNewline EndedYes)
else return (CellResultNewline EndedNo)
| otherwise -> do
bs <- unescapedField delim
return (CellResultData bs)
Nothing -> return (CellResultNewline EndedYes)
{-# INLINE field #-}
eatNewlines :: AL.Parser S.ByteString
eatNewlines = A.takeWhile (\x -> x == 10 || x == 13)
escapedField :: Word8 -> AL.Parser S.ByteString
escapedField !delim = do
_ <- dquote
-- The scan state is 'True' if the previous character was a double
-- quote. We need to drop a trailing double quote left by scan.
s <- S.init <$> (A.scan False $ \s c -> if c == doubleQuote
then Just (not s)
else if s then Nothing
else Just False)
A.skip (== delim)
if doubleQuote `S.elem` s
then case Z.parse unescape s of
Right r -> return r
Left err -> fail err
else return s
-- | Consume an unescaped field. If it ends with a newline,
-- leave that in tact. If it ends with a comma, consume the comma.
unescapedField :: Word8 -> AL.Parser S.ByteString
unescapedField !delim =
( A.takeWhile $ \c ->
c /= doubleQuote &&
c /= newline &&
c /= delim &&
c /= cr
) <* A.skip (== delim)
dquote :: AL.Parser Char
dquote = char '"'
-- | This could be improved. We could avoid the builder and just
-- write to a buffer directly.
unescape :: Z.Parser S.ByteString
unescape = (LByteString.toStrict . toLazyByteString) <$!> go mempty where
go acc = do
h <- Z.takeWhile (/= doubleQuote)
let rest = do
start <- Z.take 2
if (S.unsafeHead start == doubleQuote &&
S.unsafeIndex start 1 == doubleQuote)
then go (acc `mappend` byteString h `mappend` byteString (BC8.singleton '"'))
else fail "invalid CSV escape sequence"
done <- Z.atEnd
if done
then return (acc `mappend` byteString h)
else rest
-- | Is this an empty record (i.e. a blank line)?
blankLine :: V.Vector B.ByteString -> Bool
blankLine v = V.length v == 1 && (B.null (V.head v))
-- | A version of 'liftM2' that is strict in the result of its first
-- action.
liftM2' :: (Monad m) => (a -> b -> c) -> m a -> m b -> m c
liftM2' f a b = do
!x <- a
y <- b
return (f x y)
{-# INLINE liftM2' #-}
-- | Match either a single newline character @\'\\n\'@, or a carriage
-- return followed by a newline character @\"\\r\\n\"@, or a single
-- carriage return @\'\\r\'@.
endOfLine :: A.Parser ()
endOfLine = (A.word8 newline *> return ()) <|> (string (BC8.pack "\r\n") *> return ()) <|> (A.word8 cr *> return ())
{-# INLINE endOfLine #-}
doubleQuote, newline, cr, comma :: Word8
doubleQuote = 34
newline = 10
cr = 13
comma = 44
-- | This adds one to the index because text editors consider
-- line number to be one-based, not zero-based.
humanizeSiphonError :: Eq c => (c -> String) -> SiphonError c -> String
humanizeSiphonError toStr (SiphonError ix e) = unlines
$ ("Decolonnade error on line " ++ show (ix + 1) ++ " of file.")
: ("Error Category: " ++ descr)
: map (" " ++) errDescrs
where (descr,errDescrs) = prettyRowError toStr e
prettyRowError :: Eq c => (c -> String) -> RowError c -> (String, [String])
prettyRowError toStr x = case x of
RowErrorParse -> (,) "CSV Parsing"
[ "The cells were malformed."
]
RowErrorSize reqLen actualLen -> (,) "Row Length"
[ "Expected the row to have exactly " ++ show reqLen ++ " cells."
, "The row only has " ++ show actualLen ++ " cells."
]
RowErrorHeaderSize reqLen actualLen -> (,) "Minimum Header Length"
[ "Expected the row to have at least " ++ show reqLen ++ " cells."
, "The row only has " ++ show actualLen ++ " cells."
]
RowErrorMalformed column -> (,) "Text Decolonnade"
[ "Tried to decode input input in column " ++ columnNumToLetters column ++ " text"
, "There is a mistake in the encoding of the text."
]
RowErrorHeaders dupErrs namedErrs unnamedErrs -> (,) "Missing Headers" $ concat
[ if V.length namedErrs > 0 then prettyNamedMissingHeaders toStr namedErrs else []
, if V.length unnamedErrs > 0 then ["Missing unnamed headers"] else []
, if V.length dupErrs > 0 then prettyHeadingErrors toStr dupErrs else []
]
RowErrorDecode errs -> (,) "Cell Decolonnade" (prettyCellErrors toStr errs)
prettyCellErrors :: (c -> String) -> Vector (CellError c) -> [String]
prettyCellErrors toStr errs = drop 1 $
flip concatMap errs $ \(CellError ix content) ->
let str = toStr content in
[ "-----------"
, "Column " ++ columnNumToLetters ix
, "Cell Content Length: " ++ show (Prelude.length str)
, "Cell Content: " ++ if null str
then "[empty cell]"
else str
]
prettyNamedMissingHeaders :: (c -> String) -> Vector c -> [String]
prettyNamedMissingHeaders conv missing = concat
[ concatMap (\h -> ["The header " ++ conv h ++ " was missing."]) missing
]
prettyHeadingErrors :: forall c. Eq c
=> (c -> String) -> Vector (Vector (CellError c)) -> [String]
prettyHeadingErrors conv missing = join (V.toList (fmap f missing))
where
f :: Vector (CellError c) -> [String]
f v
| not (V.null w) && V.all (== V.head w) (V.tail w) =
[ "The header ["
, conv (V.head w)
, "] appears in columns "
, L.intercalate ", " (V.toList (V.map (\(CellError ix _) -> columnNumToLetters ix) v))
]
| otherwise = multiMsg : V.toList
(V.map (\(CellError ix content) -> " Column " ++ columnNumToLetters ix ++ ": " ++ conv content) v)
where
w :: Vector c
w = V.map cellErrorContent v
multiMsg :: String
multiMsg = "Multiple headers matched the same predicate:"
columnNumToLetters :: Int -> String
columnNumToLetters i
| i >= 0 && i < 25 = [chr (i + 65)]
| otherwise = "Beyond Z. Fix this."
newtype EitherWrap a b = EitherWrap
{ getEitherWrap :: Either a b
} deriving (Functor)
instance Monoid a => Applicative (EitherWrap a) where
pure = EitherWrap . Right
EitherWrap (Left a1) <*> EitherWrap (Left a2) = EitherWrap (Left (mappend a1 a2))
EitherWrap (Left a1) <*> EitherWrap (Right _) = EitherWrap (Left a1)
EitherWrap (Right _) <*> EitherWrap (Left a2) = EitherWrap (Left a2)
EitherWrap (Right f) <*> EitherWrap (Right b) = EitherWrap (Right (f b))
mapLeft :: (a -> b) -> Either a c -> Either b c
mapLeft _ (Right a) = Right a
mapLeft f (Left a) = Left (f a)
consumeHeaderRowChar8 :: Monad m
=> Stream (Of ByteString) m ()
-> m (Either (SiphonError ByteString) (Of (Vector ByteString) (Stream (Of ByteString) m ())))
consumeHeaderRowChar8 = consumeHeaderRow (A.parse (field comma)) B.null B.empty (\() -> True)
consumeBodyChar8 :: forall m a. Monad m
=> Int -- ^ index of first row, usually zero or one
-> Int -- ^ Required row length
-> Siphon IndexedHeader ByteString a
-> Stream (Of ByteString) m ()
-> Stream (Of a) m (Maybe (SiphonError ByteString))
consumeBodyChar8 = consumeBody (A.parse (field comma)) B.null B.empty (\() -> True)
consumeHeaderRow :: forall m r c. Monad m
=> (c -> ATYP.IResult c (CellResult c))
-> (c -> Bool) -- ^ true if null string
-> c
-> (r -> Bool) -- ^ true if termination is acceptable
-> Stream (Of c) m r
-> m (Either (SiphonError c) (Of (Vector c) (Stream (Of c) m r)))
consumeHeaderRow parseCell isNull emptyStr isGood s0 = go 0 StrictListNil s0
where
go :: Int
-> StrictList c
-> Stream (Of c) m r
-> m (Either (SiphonError c) (Of (Vector c) (Stream (Of c) m r)))
go !cellsLen !cells !s1 = do
e <- skipWhile isNull s1
case e of
Left r -> return $ if isGood r
then Right (reverseVectorStrictList cellsLen cells :> return r)
else Left (SiphonError 0 RowErrorParse)
Right (c :> s2) -> handleResult cellsLen cells (parseCell c) s2
handleResult :: Int -> StrictList c
-> ATYP.IResult c (CellResult c)
-> Stream (Of c) m r
-> m (Either (SiphonError c) (Of (Vector c) (Stream (Of c) m r)))
handleResult !cellsLen !cells !result s1 = case result of
ATYP.Fail _ _ _ -> return $ Left $ SiphonError 0 RowErrorParse
ATYP.Done !c1 !res -> case res of
-- it might be wrong to ignore whether or not the stream has ended
CellResultNewline _ -> do
let v = reverseVectorStrictList cellsLen cells
return (Right (v :> (SMP.yield c1 >> s1)))
CellResultData !cd -> if isNull c1
then go (cellsLen + 1) (StrictListCons cd cells) s1
else handleResult (cellsLen + 1) (StrictListCons cd cells) (parseCell c1) s1
ATYP.Partial k -> do
e <- skipWhile isNull s1
case e of
Left r -> handleResult cellsLen cells (k emptyStr) (return r)
Right (c1 :> s2) -> handleResult cellsLen cells (k c1) s2
consumeBody :: forall m r c a. Monad m
=> (c -> ATYP.IResult c (CellResult c))
-> (c -> Bool)
-> c
-> (r -> Bool) -- ^ True if termination is acceptable. False if it is because of a decoding error.
-> Int -- ^ index of first row, usually zero or one
-> Int -- ^ Required row length
-> Siphon IndexedHeader c a
-> Stream (Of c) m r
-> Stream (Of a) m (Maybe (SiphonError c))
consumeBody parseCell isNull emptyStr isGood row0 reqLen siphon s0 = go row0 0 StrictListNil s0
where
go :: Int -> Int -> StrictList c -> Stream (Of c) m r -> Stream (Of a) m (Maybe (SiphonError c))
go !row !cellsLen !cells !s1 = do
e <- lift (skipWhile isNull s1)
case e of
Left r -> return $ if isGood r
then Nothing
else Just (SiphonError row RowErrorParse)
Right (c :> s2) -> handleResult row cellsLen cells (parseCell c) s2
handleResult :: Int -> Int -> StrictList c
-> ATYP.IResult c (CellResult c)
-> Stream (Of c) m r
-> Stream (Of a) m (Maybe (SiphonError c))
handleResult !row !cellsLen !cells !result s1 = case result of
ATYP.Fail _ _ _ -> return $ Just $ SiphonError row RowErrorParse
ATYP.Done !c1 !res -> case res of
CellResultNewline ended -> do
case decodeRow row (reverseVectorStrictList cellsLen cells) of
Left err -> return (Just err)
Right a -> do
SMP.yield a
case ended of
EndedYes -> do
e <- lift (SM.inspect s1)
case e of
Left r -> return $ if isGood r
then Nothing
else Just (SiphonError row RowErrorParse)
Right _ -> error "siphon: logical error, stream should be exhausted"
EndedNo -> if isNull c1
then go (row + 1) 0 StrictListNil s1
else handleResult (row + 1) 0 StrictListNil (parseCell c1) s1
CellResultData !cd -> if isNull c1
then go row (cellsLen + 1) (StrictListCons cd cells) s1
else handleResult row (cellsLen + 1) (StrictListCons cd cells) (parseCell c1) s1
ATYP.Partial k -> do
e <- lift (skipWhile isNull s1)
case e of
Left r -> handleResult row cellsLen cells (k emptyStr) (return r)
Right (c1 :> s2) -> handleResult row cellsLen cells (k c1) s2
decodeRow :: Int -> Vector c -> Either (SiphonError c) a
decodeRow rowIx v =
let vlen = V.length v in
if vlen /= reqLen
then Left $ SiphonError rowIx $ RowErrorSize reqLen vlen
else uncheckedRunWithRow rowIx siphon v
-- | You must pass the length of the list and as the first argument.
reverseVectorStrictList :: Int -> StrictList c -> Vector c
reverseVectorStrictList _ _ = error "write me"
skipWhile :: forall m a r. Monad m
=> (a -> Bool)
-> Stream (Of a) m r
-> m (Either r (Of a (Stream (Of a) m r)))
skipWhile f = go where
go :: Stream (Of a) m r
-> m (Either r (Of a (Stream (Of a) m r)))
go s1 = do
e <- SM.inspect s1
case e of
Left _ -> return e
Right (a :> s2) -> if f a
then go s2
else return e
-- | Strict in the spine and in the values
data StrictList a = StrictListNil | StrictListCons !a !(StrictList a)
-- | This function uses 'unsafeIndex' to access
-- elements of the 'Vector'.
uncheckedRunWithRow ::
Int
-> Siphon IndexedHeader c a
-> Vector c
-> Either (SiphonError c) a
uncheckedRunWithRow i d v = mapLeft (SiphonError i . RowErrorDecode) (uncheckedRun d v)
-- | This function does not check to make sure that the indicies in
-- the 'Decolonnade' are in the 'Vector'. Only use this if you have
-- already verified that none of the indices in the siphon are
-- out of the bounds.
uncheckedRun :: forall c a.
Siphon IndexedHeader c a
-> Vector c
-> Either (Vector (CellError c)) a
uncheckedRun dc v = getEitherWrap (go dc)
where
go :: forall b.
Siphon IndexedHeader c b
-> EitherWrap (Vector (CellError c)) b
go (SiphonPure b) = EitherWrap (Right b)
go (SiphonAp (IndexedHeader ix _) decode apNext) =
let rnext = go apNext
content = V.unsafeIndex v ix
rcurrent = maybe
(Left (V.singleton (CellError ix content)))
Right
(decode content)
in rnext <*> (EitherWrap rcurrent)
siphonLength :: forall f c a. Siphon f c a -> Int
siphonLength = go 0 where
go :: forall b. Int -> Siphon f c b -> Int
go !a (SiphonPure _) = a
go !a (SiphonAp _ _ apNext) = go (a + 1) apNext
maxIndex :: forall c a. Siphon IndexedHeader c a -> Int
maxIndex = go 0 where
go :: forall b. Int -> Siphon IndexedHeader c b -> Int
go !ix (SiphonPure _) = ix
go !ix1 (SiphonAp (IndexedHeader ix2 _) _ apNext) =
go (max ix1 ix2) apNext

View File

@ -2,121 +2,76 @@
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
module Siphon.Types where
{-# OPTIONS_GHC -Wall -Werror #-}
module Siphon.Types
( Siphon(..)
, Indexed(..)
, SiphonError(..)
, RowError(..)
, CellError(..)
) where
import Data.Vector (Vector)
import Control.Exception (Exception)
import Data.Typeable (Typeable)
import qualified Data.Vector as Vector
import qualified Data.Attoparsec.Types as Atto
newtype Escaped c = Escaped { getEscaped :: c }
data Siphon c = Siphon
{ siphonEscape :: !(c -> Escaped c)
, siphonIntercalate :: !(Vector (Escaped c) -> c)
, siphonParseRow :: c -> Atto.IResult c (Vector c)
, siphonNull :: c -> Bool
}
data DecolonnadeCellError f content = DecolonnadeCellError
{ decodingCellErrorContent :: !content
, decodingCellErrorHeader :: !(Indexed f content)
, decodingCellErrorMessage :: !String
data CellError c = CellError
{ cellErrorColumn :: !Int
, cellErrorContent :: !c
} deriving (Show,Read,Eq)
-- instance (Show (f content), Typeable content) => Exception (DecolonnadeError f content)
data Indexed f a = Indexed
{ indexedIndex :: !Int
, indexedHeading :: !(f a)
newtype Indexed a = Indexed
{ indexedIndex :: Int
} deriving (Eq,Ord,Functor,Show,Read)
newtype DecolonnadeCellErrors f content = DecolonnadeCellErrors
{ getDecolonnadeCellErrors :: Vector (DecolonnadeCellError f content)
} deriving (Monoid,Show,Read,Eq)
-- newtype ParseRowError = ParseRowError String
-- TODO: rewrite the instances for this by hand. They
-- currently use FlexibleContexts.
data DecolonnadeRowError f content = DecolonnadeRowError
{ decodingRowErrorRow :: !Int
, decodingRowErrorError :: !(RowError f content)
data SiphonError c = SiphonError
{ siphonErrorRow :: !Int
, siphonErrorCause :: !(RowError c)
} deriving (Show,Read,Eq)
-- TODO: rewrite the instances for this by hand. They
-- currently use FlexibleContexts.
data RowError f content
= RowErrorParse !String -- ^ Error occurred parsing the document into cells
| RowErrorDecode !(DecolonnadeCellErrors f content) -- ^ Error decoding the content
| RowErrorSize !Int !Int -- ^ Wrong number of cells in the row
| RowErrorHeading !(HeadingErrors content)
| RowErrorMinSize !Int !Int
| RowErrorMalformed !String -- ^ Error decoding unicode content
instance (Show c, Typeable c) => Exception (SiphonError c)
data RowError c
= RowErrorParse
-- ^ Error occurred parsing the document into cells
| RowErrorDecode !(Vector (CellError c))
-- ^ Error decoding the content
| RowErrorSize !Int !Int
-- ^ Wrong number of cells in the row
| RowErrorHeaders !(Vector (Vector (CellError c))) !(Vector c) !(Vector Int)
-- ^ Three parts:
-- (a) Multiple header cells matched the same expected cell,
-- (b) Headers that were missing,
-- (c) Missing headers that were lambdas. They cannot be
-- shown so instead their positions in the 'Siphon' are given.
| RowErrorHeaderSize !Int !Int
-- ^ Not enough cells in header, expected, actual
| RowErrorMalformed !Int
-- ^ Error decoding unicode content, column number
deriving (Show,Read,Eq)
data HeadingErrors content = HeadingErrors
{ headingErrorsMissing :: Vector content -- ^ headers that were missing
, headingErrorsDuplicate :: Vector (content,Int) -- ^ headers that occurred more than once
} deriving (Show,Read,Eq)
instance (Show content, Typeable content) => Exception (HeadingErrors content)
instance Monoid (HeadingErrors content) where
mempty = HeadingErrors Vector.empty Vector.empty
mappend (HeadingErrors a1 b1) (HeadingErrors a2 b2) = HeadingErrors
(a1 Vector.++ a2) (b1 Vector.++ b2)
-- | This just actually a specialization of the free applicative.
-- Check out @Control.Applicative.Free@ in the @free@ library to
-- learn more about this. The meanings of the fields are documented
-- slightly more in the source code. Unfortunately, haddock does not
-- play nicely with GADTs.
data Decolonnade f content a where
DecolonnadePure :: !a -- function
-> Decolonnade f content a
DecolonnadeAp :: !(f content) -- header
-> !(content -> Either String a) -- decoding function
-> !(Decolonnade f content (a -> b)) -- next decoding
-> Decolonnade f content b
data Siphon f c a where
SiphonPure ::
!a -- function
-> Siphon f c a
SiphonAp ::
!(f c) -- header
-> !(c -> Maybe a) -- decoding function
-> !(Siphon f c (a -> b)) -- next decoding
-> Siphon f c b
instance Functor (Decolonnade f content) where
fmap f (DecolonnadePure a) = DecolonnadePure (f a)
fmap f (DecolonnadeAp h c apNext) = DecolonnadeAp h c ((f .) <$> apNext)
instance Functor (Siphon f c) where
fmap f (SiphonPure a) = SiphonPure (f a)
fmap f (SiphonAp h c apNext) = SiphonAp h c ((f .) <$> apNext)
instance Applicative (Decolonnade f content) where
pure = DecolonnadePure
DecolonnadePure f <*> y = fmap f y
DecolonnadeAp h c y <*> z = DecolonnadeAp h c (flip <$> y <*> z)
-- -- | This type is provided for convenience with @pipes-text@
-- data CsvResult f c
-- = CsvResultSuccess
-- | CsvResultTextDecodeError
-- | CsvResultDecodeError (DecodingRowError f c)
-- deriving (Show,Read,Eq)
-- | Consider changing out the use of 'Vector' here
-- with the humble list instead. It might fuse away
-- better. Not sure though.
-- data SiphonX c1 c2 = SiphonX
-- { siphonXEscape :: !(c1 -> Escaped c2)
-- , siphonXIntercalate :: !(Vector (Escaped c2) -> c2)
-- }
--
-- data SiphonDecoding c1 c2 = SiphonDecoding
-- { siphonDecodingParse :: c1 -> Atto.IResult c1 (Vector c2)
-- , siphonDecodingNull :: c1 -> Bool
-- }
-- data WithEnd c = WithEnd
-- { withEndEnded :: !Bool
-- , withEndContent :: !c
-- }
-- data SiphonDecodingError
-- { clarify
-- }
instance Applicative (Siphon f c) where
pure = SiphonPure
SiphonPure f <*> y = fmap f y
SiphonAp h c y <*> z = SiphonAp h c (flip <$> y <*> z)

View File

@ -38,19 +38,13 @@ resolver: lts-8.0
packages:
- 'colonnade'
- 'yesod-colonnade'
- 'reflex-dom-colonnade'
- 'blaze-colonnade'
- 'siphon'
- 'geolite-csv'
# Dependency packages to be pulled from upstream that are not in the resolver
# (e.g., acme-missiles-0.3)
extra-deps:
- 'reflex-dom-0.3'
- 'ref-tf-0.4'
- 'reflex-0.4.0'
- 'haskell-src-exts-1.16.0.1'
- 'syb-0.5.1'
- 'ip-0.8.4'
- 'ip-0.9'
# Override default flag values for local packages and extra-deps
flags: {}