Improve errors, make geolite parsing work

This commit is contained in:
Andrew Martin 2016-07-13 17:33:38 -04:00
parent 4bcf860fbc
commit 02d616a555
13 changed files with 184 additions and 37 deletions

2
.gitignore vendored
View File

@ -35,3 +35,5 @@ tags
TAGS
docs/db/unthreat
geolite-csv/data/large

View File

@ -8,6 +8,7 @@ import Colonnade.Types
import Data.Functor.Contravariant
import Data.Vector (Vector)
import qualified Data.Vector as Vector
import Data.Char (chr)
-- | Converts the content type of a 'Decoding'. The @'Contravariant' f@
-- constraint means that @f@ can be 'Headless' but not 'Headed'.
@ -101,3 +102,59 @@ headedToIndexed v = getEitherWrap . go
<$> EitherWrap rcurrent
<*> rnext
-- | This adds one to the index because text editors consider
-- line number to be one-based, not zero-based.
prettyError :: (c -> String) -> DecodingRowError f c -> String
prettyError toStr (DecodingRowError ix e) = unlines
$ ("Decoding error on line " ++ show (ix + 1) ++ " of file.")
: ("Error Category: " ++ descr)
: map (" " ++) errDescrs
where (descr,errDescrs) = prettyRowError toStr e
prettyRowError :: (content -> String) -> RowError f content -> (String, [String])
prettyRowError toStr x = case x of
RowErrorParse err -> (,) "CSV Parsing"
[ "The line could not be parsed into cells correctly."
, "Original parser error: " ++ err
]
RowErrorSize reqLen actualLen -> (,) "Row Length"
[ "Expected the row to have exactly " ++ show reqLen ++ " cells."
, "The row only has " ++ show actualLen ++ " cells."
]
RowErrorMinSize reqLen actualLen -> (,) "Row Min Length"
[ "Expected the row to have at least " ++ show reqLen ++ " cells."
, "The row only has " ++ show actualLen ++ " cells."
]
RowErrorMalformed enc -> (,) "Text Decoding"
[ "Tried to decode the input as " ++ enc ++ " text"
, "There is a mistake in the encoding of the text."
]
RowErrorHeading errs -> (,) "Header" (prettyHeadingErrors toStr errs)
RowErrorDecode errs -> (,) "Cell Decoding" (prettyCellErrors toStr errs)
prettyCellErrors :: (c -> String) -> DecodingCellErrors f c -> [String]
prettyCellErrors toStr (DecodingCellErrors errs) = drop 1 $
flip concatMap errs $ \(DecodingCellError content (Indexed ix _) msg) ->
let str = toStr content in
[ "-----------"
, "Column " ++ columnNumToLetters ix
, "Original parse error: " ++ msg
, "Cell Content Length: " ++ show (Prelude.length str)
, "Cell Content: " ++ if null str
then "[empty cell]"
else str
]
prettyHeadingErrors :: (c -> String) -> HeadingErrors c -> [String]
prettyHeadingErrors conv (HeadingErrors missing duplicates) = concat
[ concatMap (\h -> ["The header " ++ conv h ++ " was missing."]) missing
, concatMap (\(h,n) -> ["The header " ++ conv h ++ " occurred " ++ show n ++ " times."]) duplicates
]
columnNumToLetters :: Int -> String
columnNumToLetters i
| i >= 0 && i < 25 = [chr (i + 65)]
| otherwise = "Beyond Z. Fix this."

View File

@ -1,6 +1,7 @@
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE DeriveFoldable #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE GADTs #-}
module Colonnade.Types
( Encoding(..)
, Decoding(..)
@ -24,11 +25,11 @@ import qualified Data.Vector as Vector
-- | Isomorphic to 'Identity'
newtype Headed a = Headed { getHeaded :: a }
deriving (Eq,Ord,Functor,Show,Read)
deriving (Eq,Ord,Functor,Show,Read,Foldable)
-- | Isomorphic to 'Proxy'
data Headless a = Headless
deriving (Eq,Ord,Functor,Show,Read)
deriving (Eq,Ord,Functor,Show,Read,Foldable)
data Indexed f a = Indexed
{ indexedIndex :: !Int
@ -76,6 +77,7 @@ data RowError f content
| RowErrorSize !Int !Int -- ^ Wrong number of cells in the row
| RowErrorHeading !(HeadingErrors content)
| RowErrorMinSize !Int !Int
| RowErrorMalformed !String -- ^ Error decoding unicode content
deriving (Show,Read,Eq)
-- instance (Show (f content), Typeable content) => Exception (DecodingErrors f content)

View File

@ -43,6 +43,7 @@ test-suite geolite-csv-test
, test-framework-hunit
, pipes-bytestring
, pipes-text
, directory
ghc-options: -threaded -rtsopts -with-rtsopts=-N
default-language: Haskell2010

View File

@ -0,0 +1,35 @@
#!/bin/bash
set -e
current_dir="${PWD##*/}"
echo "Current directory is: $current_dir"
if [ "$current_dir" = "colonnade" ]
then
cd ./geolite-csv
fi
new_current_dir="${PWD##*/}"
if [ "$new_current_dir" != "geolite-csv" ]
then
echo "Not currently in the geolite project directory. Exiting."
exit 1
fi
mkdir -p ./data/large
cd ./data/large
rm -f *.zip
rm -rf GeoLite2-*
curl 'http://geolite.maxmind.com/download/geoip/database/GeoLite2-City-CSV.zip' > archive.zip
unzip archive.zip -d ./
cd GeoLite2-City-CSV*
mv *.csv ../
cd ../
rm -rf GeoLite2-City-CSV*
rm archive.zip

View File

@ -39,16 +39,20 @@ decodingCity = City
decodingBlock :: Decoding Headed Text Block
decodingBlock = Block
<$> CD.headed "network" IPv4RangeText.decodeEither
<*> CD.headed "geoname_id" (CDT.map GeonameId CDT.int)
<*> CD.headed "geoname_id"
(CDT.optional $ CDT.map GeonameId CDT.int)
<*> CD.headed "registered_country_geoname_id"
(CDT.map GeonameId CDT.int)
(CDT.optional $ CDT.map GeonameId CDT.int)
<*> CD.headed "represented_country_geoname_id"
(CDT.optional $ CDT.map GeonameId CDT.int)
<*> CD.headed "is_anonymous_proxy" (CDT.trueFalse "1" "0")
<*> CD.headed "is_satellite_provider" (CDT.trueFalse "1" "0")
<*> CD.headed "postal_code" CDT.text
<*> CD.headed "latitude" (CDT.fromReader TextRead.rational)
<*> CD.headed "longitude" (CDT.fromReader TextRead.rational)
<*> CD.headed "accuracy_radius" CDT.int
<*> CD.headed "latitude"
(CDT.optional $ CDT.fromReader TextRead.rational)
<*> CD.headed "longitude"
(CDT.optional $ CDT.fromReader TextRead.rational)
<*> CD.headed "accuracy_radius"
(CDT.optional CDT.int)

View File

@ -29,14 +29,14 @@ data City = City
data Block = Block
{ blockNetwork :: IPv4Range
, blockGeonameId :: GeonameId
, blockRegisteredCountryGeonameId :: GeonameId
, blockGeonameId :: Maybe GeonameId
, blockRegisteredCountryGeonameId :: Maybe GeonameId
, blockRepresentedCountryGeonameId :: Maybe GeonameId
, blockIsAnonymousProxy :: Bool
, blockIsSatelliteProvider :: Bool
, blockPostalCode :: Text
, blockLatitude :: Fixed E4
, blockLongitude :: Fixed E4
, blockAccuracyRadius :: Int
, blockLatitude :: Maybe (Fixed E4)
, blockLongitude :: Maybe (Fixed E4)
, blockAccuracyRadius :: Maybe Int
} deriving (Show,Read,Eq,Ord)

View File

@ -2,33 +2,67 @@
module Main (main) where
import Test.HUnit (Assertion,(@?=))
import Test.Framework (defaultMain, testGroup, Test)
import Test.HUnit (Assertion,(@?=),assertBool,assertFailure)
import Test.Framework (defaultMainWithOpts, interpretArgsOrExit,
testGroup, Test)
import Test.Framework.Providers.HUnit (testCase)
import Test.Framework.Runners.TestPattern (parseTestPattern)
import Test.Framework.Runners.Options (RunnerOptions'(..))
import Geolite.Csv (cities,blocks)
import Data.Text (Text)
import Colonnade.Types
import Siphon.Types
import Data.Functor.Identity
import Control.Monad (unless)
import System.Environment (getArgs)
import System.Directory (doesDirectoryExist)
import System.IO (withFile,IOMode(ReadMode))
import qualified Data.Text as Text
import qualified Pipes.Prelude as Pipes
import qualified Pipes.ByteString as PB
import qualified Pipes.Text.Encoding as PT
import qualified Siphon.Decoding as SD
import qualified Colonnade.Decoding as Decoding
import Pipes
------------------------------------------------
-- The default behavior of this test suite is to
-- test the CSV decoders against small samples of
-- the GeoLite2 databases. These small samples are
-- included as part of this repository. If you give
-- this test suite an argument named "large", it
-- will run against the full CSVs, which are around
-- 350MB. These are not included
-- as part of the repository, so they need to be
-- downloaded. The script found in
-- scripts/load-full-databases will download the full
-- archive, decompress it, and move the files to
-- the appropriate directory for this test suite
-- to run on them.
-----------------------------------------------
main :: IO ()
main = defaultMain tests
main = do
xs <- getArgs
ropts' <- interpretArgsOrExit xs
let ropts = ropts'
{ ropt_test_patterns = case ropt_test_patterns ropts' of
Nothing -> Just [parseTestPattern "small"]
Just xs -> Just xs
}
defaultMainWithOpts tests ropts
tests :: [Test]
tests =
[ testGroup "Geolite CSV Decoding"
tests = flip concatMap ["small","large"] $ \size ->
[ testGroup size
[ testCase "Network Blocks" $ streamFileWith
"data/GeoLite2-City-Blocks-IPv4.small.csv"
("data/" ++ size ++ "/GeoLite2-City-Blocks-IPv4.csv")
blocks
, testCase "English City Locations" $ streamFileWith
"data/GeoLite2-City-Locations-en.small.csv"
("data/" ++ size ++ "/GeoLite2-City-Locations-en.csv")
cities
, testCase "Japanese City Locations" $ streamFileWith
("data/" ++ size ++ "/GeoLite2-City-Locations-ja.csv")
cities
]
]
@ -39,8 +73,19 @@ streamFileWith ::
-> Assertion
streamFileWith filename decodingPipe = do
r <- withFile filename ReadMode $ \h -> runEffect $
fmap SD.csvResultFromEither (PT.decode (PT.utf8 . PT.eof) $ PB.fromHandle h)
>-> fmap SD.csvResultFromDecodingRowError decodingPipe
fmap (SD.convertDecodeError "utf-8") (PT.decode (PT.utf8 . PT.eof) $ PB.fromHandle h)
>-> fmap Just decodingPipe
>-> Pipes.drain
r @?= CsvResultSuccess
case r of
Nothing -> assertBool "impossible" True
Just err -> assertFailure (Decoding.prettyError Text.unpack err)
-- let dirPiece = case xs of
-- ["full"] -> "large/"
-- _ -> "small/"
-- fullDirName = "data/" ++ dirPiece
-- errMsg = concat
-- [ "The "
-- , fullDirName
-- , " directory does not exist in the geolite project"
-- ]

View File

@ -27,12 +27,12 @@ mkParseError i ctxs msg = id
, "]"
]
csvResultFromEither :: Either (Producer ByteString m ()) () -> CsvResult f c
csvResultFromEither (Left _) = CsvResultTextDecodeError
csvResultFromEither (Right ()) = CsvResultSuccess
csvResultFromDecodingRowError :: DecodingRowError f c -> CsvResult f c
csvResultFromDecodingRowError = CsvResultDecodeError
-- | This is a convenience function for working with @pipes-text@.
-- It will convert a UTF-8 decoding error into a `DecodingRowError`,
-- so the pipes can be properly chained together.
convertDecodeError :: String -> Either (Producer ByteString m ()) () -> Maybe (DecodingRowError f c)
convertDecodeError encodingName (Left _) = Just (DecodingRowError 0 (RowErrorMalformed encodingName))
convertDecodeError _ (Right ()) = Nothing
-- | This is seldom useful but is included for completeness.
headlessPipe :: Monad m
@ -145,7 +145,8 @@ pipeGeneral initIx (Siphon _ _ parse isNull) wrapParseError decodeRow mleftovers
Left err -> return err
Right r -> do
yield r
if isNull c1 then go1 ix else go2 ix c1
let ixNext = ix + 1
if isNull c1 then go1 ixNext else go2 ixNext c1
Atto.Partial k -> go3 ix k
awaitSkip :: Monad m

View File

@ -13,12 +13,12 @@ data Siphon c = Siphon
, siphonNull :: c -> Bool
}
-- | This type is provided for convenience with @pipes-text@
data CsvResult f c
= CsvResultSuccess
| CsvResultTextDecodeError
| CsvResultDecodeError (DecodingRowError f c)
deriving (Show,Read,Eq)
-- -- | This type is provided for convenience with @pipes-text@
-- data CsvResult f c
-- = CsvResultSuccess
-- | CsvResultTextDecodeError
-- | CsvResultDecodeError (DecodingRowError f c)
-- deriving (Show,Read,Eq)
-- | Consider changing out the use of 'Vector' here