Improve errors, make geolite parsing work

2024-09-11 06:45:41 +03:00 · 2016-07-13 17:33:38 -04:00 · 2016-07-13 17:33:38 -04:00 · 02d616a555
commit 02d616a555
parent 4bcf860fbc
13 changed files with 184 additions and 37 deletions
--- a/.gitignore
+++ b/.gitignore
@ -35,3 +35,5 @@ tags
 TAGS

 docs/db/unthreat
+
+geolite-csv/data/large
--- a/colonnade/src/Colonnade/Decoding.hs
+++ b/colonnade/src/Colonnade/Decoding.hs
@ -8,6 +8,7 @@ import Colonnade.Types
 import Data.Functor.Contravariant
 import Data.Vector (Vector)
 import qualified Data.Vector as Vector
+import Data.Char (chr)

 -- | Converts the content type of a 'Decoding'. The @'Contravariant' f@
 -- constraint means that @f@ can be 'Headless' but not 'Headed'.
@ -101,3 +102,59 @@ headedToIndexed v = getEitherWrap . go
       <$> EitherWrap rcurrent
       <*> rnext

+-- | This adds one to the index because text editors consider
+--   line number to be one-based, not zero-based.
+prettyError :: (c -> String) -> DecodingRowError f c -> String
+prettyError toStr (DecodingRowError ix e) = unlines
+  $ ("Decoding error on line " ++ show (ix + 1) ++ " of file.")
+  : ("Error Category: " ++ descr)
+  : map ("  " ++) errDescrs
+  where (descr,errDescrs) = prettyRowError toStr e
+
+prettyRowError :: (content -> String) -> RowError f content -> (String, [String])
+prettyRowError toStr x = case x of
+  RowErrorParse err -> (,) "CSV Parsing"
+    [ "The line could not be parsed into cells correctly."
+    , "Original parser error: " ++ err
+    ]
+  RowErrorSize reqLen actualLen -> (,) "Row Length"
+    [ "Expected the row to have exactly " ++ show reqLen ++ " cells."
+    , "The row only has " ++ show actualLen ++ " cells."
+    ]
+  RowErrorMinSize reqLen actualLen -> (,) "Row Min Length"
+    [ "Expected the row to have at least " ++ show reqLen ++ " cells."
+    , "The row only has " ++ show actualLen ++ " cells."
+    ]
+  RowErrorMalformed enc -> (,) "Text Decoding"
+    [ "Tried to decode the input as " ++ enc ++ " text"
+    , "There is a mistake in the encoding of the text."
+    ]
+  RowErrorHeading errs -> (,) "Header" (prettyHeadingErrors toStr errs)
+  RowErrorDecode errs -> (,) "Cell Decoding" (prettyCellErrors toStr errs)
+
+prettyCellErrors :: (c -> String) -> DecodingCellErrors f c -> [String]
+prettyCellErrors toStr (DecodingCellErrors errs) = drop 1 $
+  flip concatMap errs $ \(DecodingCellError content (Indexed ix _) msg) ->
+    let str = toStr content in
+    [ "-----------"
+    , "Column " ++ columnNumToLetters ix
+    , "Original parse error: " ++ msg
+    , "Cell Content Length: " ++ show (Prelude.length str)
+    , "Cell Content: " ++ if null str
+        then "[empty cell]"
+        else str
+    ]
+
+prettyHeadingErrors :: (c -> String) -> HeadingErrors c -> [String]
+prettyHeadingErrors conv (HeadingErrors missing duplicates) = concat
+  [ concatMap (\h -> ["The header " ++ conv h ++ " was missing."]) missing
+  , concatMap (\(h,n) -> ["The header " ++ conv h ++ " occurred " ++ show n ++ " times."]) duplicates
+  ]
+
+columnNumToLetters :: Int -> String
+columnNumToLetters i
+  | i >= 0 && i < 25 = [chr (i + 65)]
+  | otherwise = "Beyond Z. Fix this."
+
+
+
--- a/colonnade/src/Colonnade/Types.hs
+++ b/colonnade/src/Colonnade/Types.hs
@ -1,6 +1,7 @@
-{-# LANGUAGE DeriveFunctor #-}
+{-# LANGUAGE DeriveFunctor              #-}
+{-# LANGUAGE DeriveFoldable             #-}
 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
-{-# LANGUAGE GADTs #-}
+{-# LANGUAGE GADTs                      #-}
 module Colonnade.Types
  ( Encoding(..)
  , Decoding(..)
@ -24,11 +25,11 @@ import qualified Data.Vector as Vector

 -- | Isomorphic to 'Identity'
 newtype Headed a = Headed { getHeaded :: a }
-  deriving (Eq,Ord,Functor,Show,Read)
+  deriving (Eq,Ord,Functor,Show,Read,Foldable)

 -- | Isomorphic to 'Proxy'
 data Headless a = Headless
-  deriving (Eq,Ord,Functor,Show,Read)
+  deriving (Eq,Ord,Functor,Show,Read,Foldable)

 data Indexed f a = Indexed
  { indexedIndex :: !Int
@ -76,6 +77,7 @@ data RowError f content
  | RowErrorSize !Int !Int -- ^ Wrong number of cells in the row
  | RowErrorHeading !(HeadingErrors content)
  | RowErrorMinSize !Int !Int
+  | RowErrorMalformed !String -- ^ Error decoding unicode content
  deriving (Show,Read,Eq)

 -- instance (Show (f content), Typeable content) => Exception (DecodingErrors f content)
--- a/geolite-csv/data/small/GeoLite2-City-Blocks-IPv4.csv
+++ b/geolite-csv/data/small/GeoLite2-City-Blocks-IPv4.csv
--- a/geolite-csv/data/small/GeoLite2-City-Locations-en.csv
+++ b/geolite-csv/data/small/GeoLite2-City-Locations-en.csv
--- a/geolite-csv/data/small/GeoLite2-City-Locations-ja.csv
+++ b/geolite-csv/data/small/GeoLite2-City-Locations-ja.csv
--- a/geolite-csv/geolite-csv.cabal
+++ b/geolite-csv/geolite-csv.cabal
@ -43,6 +43,7 @@ test-suite geolite-csv-test
    , test-framework-hunit
    , pipes-bytestring
    , pipes-text
+    , directory
  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
  default-language:    Haskell2010

--- a/geolite-csv/scripts/load-full-databases
+++ b/geolite-csv/scripts/load-full-databases
@ -0,0 +1,35 @@
+#!/bin/bash
+
+set -e
+
+current_dir="${PWD##*/}"
+
+echo "Current directory is: $current_dir"
+
+if [ "$current_dir" = "colonnade" ]
+then
+  cd ./geolite-csv
+fi
+
+new_current_dir="${PWD##*/}"
+if [ "$new_current_dir" != "geolite-csv" ]
+then
+  echo "Not currently in the geolite project directory. Exiting."
+  exit 1
+fi
+
+mkdir -p ./data/large
+cd ./data/large
+
+rm -f *.zip
+rm -rf GeoLite2-*
+
+curl 'http://geolite.maxmind.com/download/geoip/database/GeoLite2-City-CSV.zip' > archive.zip
+unzip archive.zip -d ./
+
+cd GeoLite2-City-CSV*
+mv *.csv ../
+cd ../
+rm -rf GeoLite2-City-CSV*
+rm archive.zip
+
--- a/geolite-csv/src/Geolite/Csv.hs
+++ b/geolite-csv/src/Geolite/Csv.hs
@ -39,16 +39,20 @@ decodingCity = City
 decodingBlock :: Decoding Headed Text Block
 decodingBlock = Block
  <$> CD.headed "network" IPv4RangeText.decodeEither
-  <*> CD.headed "geoname_id" (CDT.map GeonameId CDT.int)
+  <*> CD.headed "geoname_id"
+        (CDT.optional $ CDT.map GeonameId CDT.int)
  <*> CD.headed "registered_country_geoname_id"
-        (CDT.map GeonameId CDT.int)
+        (CDT.optional $ CDT.map GeonameId CDT.int)
  <*> CD.headed "represented_country_geoname_id"
        (CDT.optional $ CDT.map GeonameId CDT.int)
  <*> CD.headed "is_anonymous_proxy" (CDT.trueFalse "1" "0")
  <*> CD.headed "is_satellite_provider" (CDT.trueFalse "1" "0")
  <*> CD.headed "postal_code" CDT.text
-  <*> CD.headed "latitude" (CDT.fromReader TextRead.rational)
-  <*> CD.headed "longitude" (CDT.fromReader TextRead.rational)
-  <*> CD.headed "accuracy_radius" CDT.int
+  <*> CD.headed "latitude"
+        (CDT.optional $ CDT.fromReader TextRead.rational)
+  <*> CD.headed "longitude"
+        (CDT.optional $ CDT.fromReader TextRead.rational)
+  <*> CD.headed "accuracy_radius"
+        (CDT.optional CDT.int)


--- a/geolite-csv/src/Geolite/Types.hs
+++ b/geolite-csv/src/Geolite/Types.hs
@ -29,14 +29,14 @@ data City = City

 data Block = Block
  { blockNetwork :: IPv4Range
-  , blockGeonameId :: GeonameId
-  , blockRegisteredCountryGeonameId :: GeonameId
+  , blockGeonameId :: Maybe GeonameId
+  , blockRegisteredCountryGeonameId :: Maybe GeonameId
  , blockRepresentedCountryGeonameId :: Maybe GeonameId
  , blockIsAnonymousProxy :: Bool
  , blockIsSatelliteProvider :: Bool
  , blockPostalCode :: Text
-  , blockLatitude :: Fixed E4
-  , blockLongitude :: Fixed E4
-  , blockAccuracyRadius :: Int
+  , blockLatitude :: Maybe (Fixed E4)
+  , blockLongitude :: Maybe (Fixed E4)
+  , blockAccuracyRadius :: Maybe Int
  } deriving (Show,Read,Eq,Ord)

--- a/geolite-csv/test/Spec.hs
+++ b/geolite-csv/test/Spec.hs
@ -2,33 +2,67 @@

 module Main (main) where

-import Test.HUnit                           (Assertion,(@?=))
-import Test.Framework                       (defaultMain, testGroup, Test)
+import Test.HUnit                           (Assertion,(@?=),assertBool,assertFailure)
+import Test.Framework                       (defaultMainWithOpts, interpretArgsOrExit,
+                                             testGroup, Test)
 import Test.Framework.Providers.HUnit       (testCase)
+import Test.Framework.Runners.TestPattern   (parseTestPattern)
+import Test.Framework.Runners.Options       (RunnerOptions'(..))
 import Geolite.Csv                          (cities,blocks)
 import Data.Text                            (Text)
 import Colonnade.Types
 import Siphon.Types
 import Data.Functor.Identity
+import Control.Monad                        (unless)
+import System.Environment                   (getArgs)
+import System.Directory                     (doesDirectoryExist)
 import System.IO                            (withFile,IOMode(ReadMode))
 import qualified Data.Text                  as Text
 import qualified Pipes.Prelude              as Pipes
 import qualified Pipes.ByteString           as PB
 import qualified Pipes.Text.Encoding        as PT
 import qualified Siphon.Decoding            as SD
+import qualified Colonnade.Decoding         as Decoding
 import Pipes

+------------------------------------------------
+-- The default behavior of this test suite is to
+-- test the CSV decoders against small samples of
+-- the GeoLite2 databases. These small samples are
+-- included as part of this repository. If you give
+-- this test suite an argument named "large", it
+-- will run against the full CSVs, which are around
+-- 350MB. These are not included
+-- as part of the repository, so they need to be
+-- downloaded. The script found in
+-- scripts/load-full-databases will download the full
+-- archive, decompress it, and move the files to
+-- the appropriate directory for this test suite
+-- to run on them.
+-----------------------------------------------
+
 main :: IO ()
-main = defaultMain tests
+main = do
+  xs <- getArgs
+  ropts' <- interpretArgsOrExit xs
+  let ropts = ropts'
+        { ropt_test_patterns = case ropt_test_patterns ropts' of
+            Nothing -> Just [parseTestPattern "small"]
+            Just xs -> Just xs
+        }
+  defaultMainWithOpts tests ropts

 tests :: [Test]
-tests =
-  [ testGroup "Geolite CSV Decoding"
+tests = flip concatMap ["small","large"] $ \size ->
+  [ testGroup size
    [ testCase "Network Blocks" $ streamFileWith
-        "data/GeoLite2-City-Blocks-IPv4.small.csv"
+        ("data/" ++ size ++ "/GeoLite2-City-Blocks-IPv4.csv")
        blocks
    , testCase "English City Locations" $ streamFileWith
-        "data/GeoLite2-City-Locations-en.small.csv"
+        ("data/" ++ size ++ "/GeoLite2-City-Locations-en.csv")
+        cities
+    , testCase "Japanese City Locations" $ streamFileWith
+        ("data/" ++ size ++ "/GeoLite2-City-Locations-ja.csv")
        cities
    ]
  ]
@ -39,8 +73,19 @@ streamFileWith ::
  -> Assertion
 streamFileWith filename decodingPipe = do
  r <- withFile filename ReadMode $ \h -> runEffect $
-        fmap SD.csvResultFromEither (PT.decode (PT.utf8 . PT.eof) $ PB.fromHandle h)
-    >-> fmap SD.csvResultFromDecodingRowError decodingPipe
+        fmap (SD.convertDecodeError "utf-8") (PT.decode (PT.utf8 . PT.eof) $ PB.fromHandle h)
+    >-> fmap Just decodingPipe
    >-> Pipes.drain
-  r @?= CsvResultSuccess
+  case r of
+    Nothing -> assertBool "impossible" True
+    Just err -> assertFailure (Decoding.prettyError Text.unpack err)

+-- let dirPiece = case xs of
+--       ["full"] -> "large/"
+--       _        -> "small/"
+--     fullDirName = "data/" ++ dirPiece
+--     errMsg = concat
+--       [ "The "
+--       , fullDirName
+--       , " directory does not exist in the geolite project"
+--       ]
--- a/siphon/src/Siphon/Decoding.hs
+++ b/siphon/src/Siphon/Decoding.hs
@ -27,12 +27,12 @@ mkParseError i ctxs msg = id
    , "]"
    ]

-csvResultFromEither :: Either (Producer ByteString m ()) () -> CsvResult f c
-csvResultFromEither (Left _) = CsvResultTextDecodeError
-csvResultFromEither (Right ()) = CsvResultSuccess
-
-csvResultFromDecodingRowError :: DecodingRowError f c -> CsvResult f c
-csvResultFromDecodingRowError = CsvResultDecodeError
+-- | This is a convenience function for working with @pipes-text@.
+--   It will convert a UTF-8 decoding error into a `DecodingRowError`,
+--   so the pipes can be properly chained together.
+convertDecodeError :: String -> Either (Producer ByteString m ()) () -> Maybe (DecodingRowError f c)
+convertDecodeError encodingName (Left _) = Just (DecodingRowError 0 (RowErrorMalformed encodingName))
+convertDecodeError _ (Right ()) = Nothing

 -- | This is seldom useful but is included for completeness.
 headlessPipe :: Monad m
@ -145,7 +145,8 @@ pipeGeneral initIx (Siphon _ _ parse isNull) wrapParseError decodeRow mleftovers
        Left err -> return err
        Right r -> do
          yield r
-          if isNull c1 then go1 ix else go2 ix c1
+          let ixNext = ix + 1
+          if isNull c1 then go1 ixNext else go2 ixNext c1
    Atto.Partial k -> go3 ix k

 awaitSkip :: Monad m
--- a/siphon/src/Siphon/Types.hs
+++ b/siphon/src/Siphon/Types.hs
@ -13,12 +13,12 @@ data Siphon c = Siphon
  , siphonNull        :: c -> Bool
  }

-- | This type is provided for convenience with @pipes-text@
-data CsvResult f c
-  = CsvResultSuccess
-  | CsvResultTextDecodeError
-  | CsvResultDecodeError (DecodingRowError f c)
-  deriving (Show,Read,Eq)
+-- -- | This type is provided for convenience with @pipes-text@
+-- data CsvResult f c
+--   = CsvResultSuccess
+--   | CsvResultTextDecodeError
+--   | CsvResultDecodeError (DecodingRowError f c)
+--   deriving (Show,Read,Eq)


 -- | Consider changing out the use of 'Vector' here