Add unicode normalization test-suite

adithyaov 2020-09-21 19:15:02 +05:30 committed by Adithya Kumar
parent b7543dfad8
commit 08ee39b078
6 changed files with 19063 additions and 2 deletions


@@ -109,6 +109,7 @@ targets () {
INDIVIDUAL_TARGETS="\
Data.Unfold \
Unicode.Stream \
Unicode.Char \
Unicode.Utf8 \
FileSystem.Handle \
`test_only FileSystem.Event` \


@@ -97,11 +97,13 @@ extra-source-files:
    test/Streamly/Test/Network/Inet/TCP.hs
    test/Streamly/Test/Prelude.hs
    test/Streamly/Test/Prelude/*.hs
    test/Streamly/Test/Unicode/Stream.hs
    test/Streamly/Test/Unicode/*.hs
    test/lib/Streamly/Test/Common.hs
    test/lib/Streamly/Test/Prelude/Common.hs
    test/streamly-tests.cabal
    test/version-bounds.hs
    test/Streamly/Test/Unicode/ucd/NormalizationTest.txt
    test/Streamly/Test/Unicode/extra/NormalizationTest.txt
extra-doc-files:
    CONTRIBUTING.md


@@ -0,0 +1,136 @@
{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}

-- |
-- Copyright : (c) 2020 Composewell Technologies
--
-- License : BSD-3-Clause
-- Maintainer : streamly@composewell.com
-- Stability : experimental
-- Portability : GHC
--
module Streamly.Test.Unicode.Char (main) where

import Control.Monad (when)
import Data.Char (chr, isSpace, ord, toUpper)
import Data.List (intercalate, isPrefixOf)
#if MIN_VERSION_base(4,8,0)
import Data.Function ((&))
#endif
import Streamly.Internal.Unicode.Char
    ( NormalizationMode(NFC, NFD, NFKC, NFKD)
    , normalize
    )
import Streamly.Internal.Data.Stream.IsStream (SerialT)
import Text.Printf (printf)
import System.FilePath.Posix ((</>))

import qualified Streamly.Internal.Data.Stream.IsStream as S
import qualified Streamly.Internal.Data.Fold as FL
import qualified System.Directory as Dir

#if !MIN_VERSION_base(4,8,0)
(&) :: a -> (a -> b) -> b
x & f = f x
#endif

-- | Render a character as an upper case, zero padded, 4-digit hex code point.
chrToHex :: Char -> [Char]
chrToHex = map toUpper . printf "%.4x" . ord

strToHex :: [Char] -> String
strToHex = unwords . map chrToHex

type Text = SerialT IO Char

checkEqual :: String -> (Text -> Text) -> (Text, Text) -> IO Bool
checkEqual opName op (mc1, mc2) = do
    c1 <- S.toList mc1
    c2 <- S.toList mc2
    opc2 <- S.toList $ op mc2
    if c1 /= opc2
    then do
        putStrLn $ opName ++ " " ++ strToHex c2
            ++ " = " ++ strToHex opc2
            ++ "; Expected: " ++ strToHex c1
        return False
    else return True

checkOp :: String -> NormalizationMode -> [(Text, Text)] -> IO Bool
checkOp name op pairs = do
    res <- mapM (checkEqual name (normalize op)) pairs
    return $ all (== True) res

-- The checks below encode the invariants stated for the columns
-- (c1; c2; c3; c4; c5) = (source; NFC; NFD; NFKC; NFKD) of NormalizationTest.txt:
--
--   NFC:  c2 == toNFC(c1) == toNFC(c2) == toNFC(c3), c4 == toNFC(c4) == toNFC(c5)
--   NFD:  c3 == toNFD(c1) == toNFD(c2) == toNFD(c3), c5 == toNFD(c4) == toNFD(c5)
--   NFKC: c4 == toNFKC(cN) for N in 1..5
--   NFKD: c5 == toNFKD(cN) for N in 1..5

checkNFC :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFC (c1, c2, c3, c4, c5) =
    checkOp "toNFC" NFC $ map (c2,) [c1, c2, c3] ++ map (c4,) [c4, c5]

checkNFD :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFD (c1, c2, c3, c4, c5) =
    checkOp "toNFD" NFD $ map (c3,) [c1, c2, c3] ++ map (c5,) [c4, c5]

checkNFKC :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFKC (c1, c2, c3, c4, c5) =
    checkOp "toNFKC" NFKC $ map (c4,) [c1, c2, c3, c4, c5]

checkNFKD :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFKD (c1, c2, c3, c4, c5) =
    checkOp "toNFKD" NFKD $ map (c5,) [c1, c2, c3, c4, c5]

checkAllTestCases :: Int -> String -> IO ()
checkAllTestCases lineno line = do
    cs <- S.toList $ S.splitOn (== ';') FL.toList $ S.fromList line
    case cs of
        c1 : c2 : c3 : c4 : c5 : _ -> do
            let cps = map cpToText [c1, c2, c3, c4, c5]
            mapM_ (checkOneTestCase cps)
                [checkNFD, checkNFKD, checkNFC, checkNFKC]
        _ -> error $ "Unrecognized line: " ++ line

    where

    -- A column is a space separated list of hex code points, e.g. "1100 1161"
    cpToText xs = S.fromList $ map (chr . read . ("0x" ++)) (words xs)

    checkOneTestCase cps f = do
        res <- f (tuplify cps)
        when (not res) $ do
            strs <- mapM S.toList cps
            let codes = intercalate ";" $ map strToHex strs
                txt = intercalate "; " strs
            putStrLn ("Failed at line: " ++ show lineno)
            putStrLn line
            putStrLn $ codes ++ "; # (" ++ txt
            error "Bailing out"

    tuplify [c1, c2, c3, c4, c5] = (c1, c2, c3, c4, c5)
    tuplify _ = error "tuplify bad arguments"

checkLine :: (Int, String) -> IO ()
checkLine (lineno, line) = do
    -- marker lines indicating a test block start with @
    if "@" `isPrefixOf` line
    then putStrLn line
    else checkAllTestCases lineno line

testNormalize :: FilePath -> IO ()
testNormalize file = do
    contents <- readFile file
    let ls = lines contents                               -- split into lines
                & map (dropWhile isSpace)                  -- trim leading spaces
                & zip [1..]                                -- add numbering
                & filter (not . null . snd)                -- remove blank lines
                & filter (not . ("#" `isPrefixOf`) . snd)  -- remove comments
    checkAll ls

    where

    checkAll (x:xs) = checkLine x >> checkAll xs
    checkAll [] = return ()

filesDir :: String
filesDir = "Streamly/Test/Unicode"

main :: IO ()
main = do
    cdir <- Dir.getCurrentDirectory
    -- The unicode standard's normalization conformance test data
    testNormalize $ cdir </> filesDir </> "ucd/NormalizationTest.txt"
    -- Additional test cases not in the unicode standard suite
    testNormalize $ cdir </> filesDir </> "extra/NormalizationTest.txt"
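For orientation, here is a minimal editorial usage sketch of the normalize API exercised above (not part of this commit). It assumes only the modules the test module already imports and the SerialT IO Char type it already uses, and checks the <LV,T> Hangul composition case listed in the extra test file below.

module Main (main) where

import Streamly.Internal.Data.Stream.IsStream (SerialT)
import Streamly.Internal.Unicode.Char (NormalizationMode(NFC), normalize)

import qualified Streamly.Internal.Data.Stream.IsStream as S

main :: IO ()
main = do
    -- Editorial sketch: NFC should compose the <LV,T> sequence
    -- <U+AC00, U+11A8> into the precomposed syllable <U+AC01>, the NFC
    -- column of the "AC00 11A8;AC01;..." row in the extra test file.
    out <- S.toList (normalize NFC (S.fromList "\xAC00\x11A8") :: SerialT IO Char)
    print (out == "\xAC01") -- expected to print True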


@@ -0,0 +1,9 @@
# Columns (c1, c2,...) are separated by semicolons
# They have the following meaning:
# source; NFC; NFD; NFKC; NFKD
@Extra test cases not in the unicode standard test suite
AC00 11A7;AC00 11A7;1100 1161 11A7;AC00 11A7;1100 1161 11A7; # 11A7 is not a valid T and should not be combined with LV AC00 to give AC00 as NFC.
AC00 11A8;AC01;1100 1161 11A8;AC01;1100 1161 11A8; # Combine precomposed LV and a T i.e <LV,T> sequence.
1100 11A8;1100 11A8;1100 11A8;1100 11A8;1100 11A8; # Invalid <L,T> sequence
2FA1E;2FA1E;2FA1E;2FA1E;2FA1E; # Beyond decomposable max
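As a worked note on the column format above (editorial, not part of the commit): each column is a space separated list of hex code points, which the test module's cpToText turns into characters via read's 0x-prefix support. A standalone equivalent, with a hypothetical name:

import Data.Char (chr)

-- Hypothetical standalone version of the test module's cpToText helper.
decodeColumn :: String -> String
decodeColumn = map (chr . read . ("0x" ++)) . words

-- decodeColumn "1100 1161 11A8" == "\x1100\x1161\x11A8", i.e. the NFD column
-- listed for the "AC00 11A8" source in the second data row above.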

File diff suppressed because it is too large


@@ -158,7 +158,6 @@ library
    if flag(limit-build-mem)
      ghc-options: +RTS -M1500M -RTS

-------------------------------------------------------------------------------
-- Test suite options
-------------------------------------------------------------------------------

@@ -367,6 +366,12 @@ test-suite Unicode.Stream
    main-is: Streamly/Test/Unicode/Stream.hs
    ghc-options: -main-is Streamly.Test.Unicode.Stream.main

test-suite Unicode.Char
    import: test-options
    type: exitcode-stdio-1.0
    main-is: Streamly/Test/Unicode/Char.hs
    ghc-options: -main-is Streamly.Test.Unicode.Char.main

test-suite version-bounds
    import: test-options
    type: exitcode-stdio-1.0