From ce3ab2cbddee7c8ba8abecc3db394c543846ffee Mon Sep 17 00:00:00 2001 From: Andrew Martin Date: Fri, 18 Mar 2022 14:07:57 -0400 Subject: [PATCH] Support decoding CSVs without headers --- siphon/CHANGELOG.md | 4 +++ siphon/siphon.cabal | 2 +- siphon/src/Siphon.hs | 63 +++++++++++++++++++++----------------------- 3 files changed, 35 insertions(+), 34 deletions(-) diff --git a/siphon/CHANGELOG.md b/siphon/CHANGELOG.md index aadb0dc..0ac9ef3 100644 --- a/siphon/CHANGELOG.md +++ b/siphon/CHANGELOG.md @@ -1,5 +1,9 @@ # Revision history for siphon +## 0.8.2.0 -- 2022-??-?? + +* Add + ## 0.8.1.2 -- 2021-10-25 * Correct handling of CRLF. diff --git a/siphon/siphon.cabal b/siphon/siphon.cabal index f05f2ee..7d87807 100644 --- a/siphon/siphon.cabal +++ b/siphon/siphon.cabal @@ -1,6 +1,6 @@ cabal-version: 3.0 name: siphon -version: 0.8.1.2 +version: 0.8.2.0 synopsis: Encode and decode CSV files description: Please see README.md homepage: https://github.com/andrewthad/colonnade#readme diff --git a/siphon/src/Siphon.hs b/siphon/src/Siphon.hs index 1c38e66..5fb13bc 100644 --- a/siphon/src/Siphon.hs +++ b/siphon/src/Siphon.hs @@ -18,6 +18,8 @@ module Siphon , encodeCsvStreamUtf8 -- * Decode CSV , decodeCsvUtf8 + , decodeHeadedCsvUtf8 + , decodeIndexedCsvUtf8 -- * Build Siphon , headed , headless @@ -81,11 +83,19 @@ data Ended = EndedYes | EndedNo data CellResult c = CellResultData !c | CellResultNewline !c !Ended deriving (Show) -decodeCsvUtf8 :: Monad m +-- | Backwards-compatibility alias for 'decodeHeadedCsvUtf8'. +decodeCsvUtf8 :: Monad m => Siphon CE.Headed ByteString a -> Stream (Of ByteString) m () -- ^ encoded csv -> Stream (Of a) m (Maybe SiphonError) -decodeCsvUtf8 headedSiphon s1 = do +decodeCsvUtf8 = decodeHeadedCsvUtf8 + +-- | Decode a CSV whose first row is contains headers identify each column. +decodeHeadedCsvUtf8 :: Monad m + => Siphon CE.Headed ByteString a + -> Stream (Of ByteString) m () -- ^ encoded csv + -> Stream (Of a) m (Maybe SiphonError) +decodeHeadedCsvUtf8 headedSiphon s1 = do e <- lift (consumeHeaderRowUtf8 s1) case e of Left err -> return (Just err) @@ -95,6 +105,15 @@ decodeCsvUtf8 headedSiphon s1 = do let requiredLength = V.length v consumeBodyUtf8 1 requiredLength ixedSiphon s2 +-- | Decode a CSV without a header. +decodeIndexedCsvUtf8 :: Monad m + => Int -- ^ How many columns are there? This number should be greater than any indices referenced by the scheme. + -> Siphon Indexed ByteString a + -> Stream (Of ByteString) m () -- ^ encoded csv + -> Stream (Of a) m (Maybe SiphonError) +decodeIndexedCsvUtf8 !requiredLength ixedSiphon s1 = do + consumeBodyUtf8 0 requiredLength ixedSiphon s1 + encodeCsvStreamUtf8 :: (Monad m, CE.Headedness h) => CE.Colonnade h a ByteString -> Stream (Of a) m r @@ -222,11 +241,6 @@ encodeRows escapeFunc separatorStr newlineStr colonnade = mapStreamM $ \a -> do SMP.yield (getEscaped (escapeFunc (encode a))) SMP.yield newlineStr -data IndexedHeader a = IndexedHeader - { indexedHeaderIndexed :: {-# UNPACK #-} !Int - , indexedHeaderHeader :: !a - } - -- | Maps over a 'Decolonnade' that expects headers, converting these -- expected headers into the indices of the columns that they -- correspond to. @@ -234,7 +248,7 @@ headedToIndexed :: forall c a. Eq c => (c -> T.Text) -> Vector c -- ^ Headers in the source document -> Siphon CE.Headed c a -- ^ Decolonnade that contains expected headers - -> Either SiphonError (Siphon IndexedHeader c a) + -> Either SiphonError (Siphon Indexed c a) headedToIndexed toStr v = mapLeft (\(HeaderErrors a b c) -> SiphonError 0 (RowErrorHeaders a b c)) . getEitherWrap @@ -242,7 +256,7 @@ headedToIndexed toStr v = where go :: forall b. Siphon CE.Headed c b - -> EitherWrap HeaderErrors (Siphon IndexedHeader c b) + -> EitherWrap HeaderErrors (Siphon Indexed c b) go (SiphonPure b) = EitherWrap (Right (SiphonPure b)) go (SiphonAp (CE.Headed h) decode apNext) = let rnext = go apNext @@ -254,7 +268,7 @@ headedToIndexed toStr v = | otherwise = let dups = V.singleton (V.map (\ix -> CellError ix (toStr (v V.! ix) {- (V.unsafeIndex v ix) -} )) ixs) in Left (HeaderErrors dups V.empty V.empty) - in (\ix nextSiphon -> SiphonAp (IndexedHeader ix h) decode nextSiphon) + in (\ix nextSiphon -> SiphonAp (Indexed ix) decode nextSiphon) <$> EitherWrap rcurrent <*> rnext @@ -444,10 +458,6 @@ unescape = (LByteString.toStrict . toLazyByteString) <$!> go mempty where then return (acc `mappend` byteString h) else rest --- | Is this an empty record (i.e. a blank line)? -blankLine :: V.Vector B.ByteString -> Bool -blankLine v = V.length v == 1 && (B.null (V.head v)) - doubleQuote, newline, cr, comma :: Word8 doubleQuote = 34 newline = 10 @@ -551,7 +561,7 @@ consumeHeaderRowUtf8 = consumeHeaderRow (A.parse (field comma)) B.null B.empty ( consumeBodyUtf8 :: forall m a. Monad m => Int -- ^ index of first row, usually zero or one -> Int -- ^ Required row length - -> Siphon IndexedHeader ByteString a + -> Siphon Indexed ByteString a -> Stream (Of ByteString) m () -> Stream (Of a) m (Maybe SiphonError) consumeBodyUtf8 = consumeBody utf8ToStr @@ -608,7 +618,7 @@ consumeBody :: forall m r c a. Monad m -> (r -> Bool) -- ^ True if termination is acceptable. False if it is because of a decoding error. -> Int -- ^ index of first row, usually zero or one -> Int -- ^ Required row length - -> Siphon IndexedHeader c a + -> Siphon Indexed c a -> Stream (Of c) m r -> Stream (Of a) m (Maybe SiphonError) consumeBody toStr parseCell isNull emptyStr isGood row0 reqLen siphon s0 = @@ -701,7 +711,7 @@ data StrictList a = StrictListNil | StrictListCons !a !(StrictList a) uncheckedRunWithRow :: (c -> T.Text) -> Int - -> Siphon IndexedHeader c a + -> Siphon Indexed c a -> Vector c -> Either SiphonError a uncheckedRunWithRow toStr i d v = @@ -713,16 +723,16 @@ uncheckedRunWithRow toStr i d v = -- out of the bounds. uncheckedRun :: forall c a. (c -> T.Text) - -> Siphon IndexedHeader c a + -> Siphon Indexed c a -> Vector c -> Either (Vector CellError) a uncheckedRun toStr dc v = getEitherWrap (go dc) where go :: forall b. - Siphon IndexedHeader c b + Siphon Indexed c b -> EitherWrap (Vector CellError) b go (SiphonPure b) = EitherWrap (Right b) - go (SiphonAp (IndexedHeader ix _) decode apNext) = + go (SiphonAp (Indexed ix) decode apNext) = let rnext = go apNext content = v V.! ix -- V.unsafeIndex v ix rcurrent = maybe @@ -731,19 +741,6 @@ uncheckedRun toStr dc v = getEitherWrap (go dc) (decode content) in rnext <*> (EitherWrap rcurrent) -siphonLength :: forall f c a. Siphon f c a -> Int -siphonLength = go 0 where - go :: forall b. Int -> Siphon f c b -> Int - go !a (SiphonPure _) = a - go !a (SiphonAp _ _ apNext) = go (a + 1) apNext - -maxIndex :: forall c a. Siphon IndexedHeader c a -> Int -maxIndex = go 0 where - go :: forall b. Int -> Siphon IndexedHeader c b -> Int - go !ix (SiphonPure _) = ix - go !ix1 (SiphonAp (IndexedHeader ix2 _) _ apNext) = - go (max ix1 ix2) apNext - -- | Uses the argument to parse a CSV column. headless :: (c -> Maybe a) -> Siphon CE.Headless c a headless f = SiphonAp CE.Headless f (SiphonPure id)