Add unicode normalization test-suite

adithyaov 2020-09-21 19:15:02 +05:30 committed by Adithya Kumar
parent b7543dfad8
commit 08ee39b078
6 changed files with 19063 additions and 2 deletions


@@ -109,6 +109,7 @@ targets () {
INDIVIDUAL_TARGETS="\
Data.Unfold \
Unicode.Stream \
Unicode.Char \
Unicode.Utf8 \
FileSystem.Handle \
`test_only FileSystem.Event` \


@@ -97,11 +97,13 @@ extra-source-files:
    test/Streamly/Test/Network/Inet/TCP.hs
    test/Streamly/Test/Prelude.hs
    test/Streamly/Test/Prelude/*.hs
    test/Streamly/Test/Unicode/Stream.hs
    test/Streamly/Test/Unicode/*.hs
    test/lib/Streamly/Test/Common.hs
    test/lib/Streamly/Test/Prelude/Common.hs
    test/streamly-tests.cabal
    test/version-bounds.hs
    test/Streamly/Test/Unicode/ucd/NormalizationTest.txt
    test/Streamly/Test/Unicode/extra/NormalizationTest.txt
extra-doc-files:
    CONTRIBUTING.md


@@ -0,0 +1,136 @@
{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}

-- |
-- Copyright : (c) 2020 Composewell Technologies
--
-- License : BSD-3-Clause
-- Maintainer : streamly@composewell.com
-- Stability : experimental
-- Portability : GHC
--
module Streamly.Test.Unicode.Char (main) where

import Control.Monad (when)
import Data.Char (chr, isSpace, ord, toUpper)
import Data.List (intercalate, isPrefixOf)
#if MIN_VERSION_base(4,8,0)
import Data.Function ((&))
#endif
import Streamly.Internal.Unicode.Char
    ( NormalizationMode(NFC, NFD, NFKC, NFKD)
    , normalize
    )
import Streamly.Internal.Data.Stream.IsStream (SerialT)
import Text.Printf (printf)
import System.FilePath.Posix ((</>))

import qualified Streamly.Internal.Data.Stream.IsStream as S
import qualified Streamly.Internal.Data.Fold as FL
import qualified System.Directory as Dir

#if !MIN_VERSION_base(4,8,0)
(&) :: a -> (a -> b) -> b
x & f = f x
#endif

-- | Render a character as an upper case, zero padded, 4-digit hex code point.
chrToHex :: Char -> [Char]
chrToHex = map toUpper . printf "%.4x" . ord

strToHex :: [Char] -> String
strToHex = unwords . map chrToHex

type Text = SerialT IO Char

checkEqual :: String -> (Text -> Text) -> (Text, Text) -> IO Bool
checkEqual opName op (mc1, mc2) = do
    c1 <- S.toList mc1
    c2 <- S.toList mc2
    opc2 <- S.toList $ op mc2
    if c1 /= opc2
    then do
        putStrLn $ opName ++ " " ++ strToHex c2
            ++ " = " ++ strToHex opc2
            ++ "; Expected: " ++ strToHex c1
        return False
    else return True

checkOp :: String -> NormalizationMode -> [(Text, Text)] -> IO Bool
checkOp name op pairs = do
    res <- mapM (checkEqual name (normalize op)) pairs
    return $ all (== True) res

-- The checks below encode the invariants stated for the columns
-- (c1; c2; c3; c4; c5) = (source; NFC; NFD; NFKC; NFKD) of NormalizationTest.txt:
--
--   NFC:  c2 == toNFC(c1) == toNFC(c2) == toNFC(c3), c4 == toNFC(c4) == toNFC(c5)
--   NFD:  c3 == toNFD(c1) == toNFD(c2) == toNFD(c3), c5 == toNFD(c4) == toNFD(c5)
--   NFKC: c4 == toNFKC(cN) for N in 1..5
--   NFKD: c5 == toNFKD(cN) for N in 1..5

checkNFC :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFC (c1, c2, c3, c4, c5) =
    checkOp "toNFC" NFC $ map (c2,) [c1, c2, c3] ++ map (c4,) [c4, c5]

checkNFD :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFD (c1, c2, c3, c4, c5) =
    checkOp "toNFD" NFD $ map (c3,) [c1, c2, c3] ++ map (c5,) [c4, c5]

checkNFKC :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFKC (c1, c2, c3, c4, c5) =
    checkOp "toNFKC" NFKC $ map (c4,) [c1, c2, c3, c4, c5]

checkNFKD :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFKD (c1, c2, c3, c4, c5) =
    checkOp "toNFKD" NFKD $ map (c5,) [c1, c2, c3, c4, c5]

checkAllTestCases :: Int -> String -> IO ()
checkAllTestCases lineno line = do
    cs <- S.toList $ S.splitOn (== ';') FL.toList $ S.fromList line
    case cs of
        c1 : c2 : c3 : c4 : c5 : _ -> do
            let cps = map cpToText [c1, c2, c3, c4, c5]
            mapM_ (checkOneTestCase cps)
                [checkNFD, checkNFKD, checkNFC, checkNFKC]
        _ -> error $ "Unrecognized line: " ++ line

    where

    -- A column is a space separated list of hex code points, e.g. "1100 1161"
    cpToText xs = S.fromList $ map (chr . read . ("0x" ++)) (words xs)

    checkOneTestCase cps f = do
        res <- f (tuplify cps)
        when (not res) $ do
            strs <- mapM S.toList cps
            let codes = intercalate ";" $ map strToHex strs
                txt = intercalate "; " strs
            putStrLn ("Failed at line: " ++ show lineno)
            putStrLn line
            putStrLn $ codes ++ "; # (" ++ txt
            error "Bailing out"

    tuplify [c1, c2, c3, c4, c5] = (c1, c2, c3, c4, c5)
    tuplify _ = error "tuplify bad arguments"

checkLine :: (Int, String) -> IO ()
checkLine (lineno, line) = do
    -- marker lines indicating a test block start with @
    if "@" `isPrefixOf` line
    then putStrLn line
    else checkAllTestCases lineno line

testNormalize :: FilePath -> IO ()
testNormalize file = do
    contents <- readFile file
    let ls = lines contents                               -- split into lines
                & map (dropWhile isSpace)                  -- trim leading spaces
                & zip [1..]                                -- add numbering
                & filter (not . null . snd)                -- remove blank lines
                & filter (not . ("#" `isPrefixOf`) . snd)  -- remove comments
    checkAll ls

    where

    checkAll (x:xs) = checkLine x >> checkAll xs
    checkAll [] = return ()

filesDir :: String
filesDir = "Streamly/Test/Unicode"

main :: IO ()
main = do
    cdir <- Dir.getCurrentDirectory
    -- The unicode standard's normalization conformance test data
    testNormalize $ cdir </> filesDir </> "ucd/NormalizationTest.txt"
    -- Additional test cases not in the unicode standard suite
    testNormalize $ cdir </> filesDir </> "extra/NormalizationTest.txt"
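For orientation, here is a minimal editorial usage sketch of the normalize API exercised above (not part of this commit). It assumes only the modules the test module already imports and the SerialT IO Char type it already uses, and checks the <LV,T> Hangul composition case listed in the extra test file below.

module Main (main) where

import Streamly.Internal.Data.Stream.IsStream (SerialT)
import Streamly.Internal.Unicode.Char (NormalizationMode(NFC), normalize)

import qualified Streamly.Internal.Data.Stream.IsStream as S

main :: IO ()
main = do
    -- Editorial sketch: NFC should compose the <LV,T> sequence
    -- <U+AC00, U+11A8> into the precomposed syllable <U+AC01>, the NFC
    -- column of the "AC00 11A8;AC01;..." row in the extra test file.
    out <- S.toList (normalize NFC (S.fromList "\xAC00\x11A8") :: SerialT IO Char)
    print (out == "\xAC01") -- expected to print True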


@@ -0,0 +1,9 @@
# Columns (c1, c2,...) are separated by semicolons
# They have the following meaning:
# source; NFC; NFD; NFKC; NFKD
@Extra test cases not in the unicode standard test suite
AC00 11A7;AC00 11A7;1100 1161 11A7;AC00 11A7;1100 1161 11A7; # 11A7 is not a valid T and should not be combined with LV AC00 to give AC00 as NFC.
AC00 11A8;AC01;1100 1161 11A8;AC01;1100 1161 11A8; # Combine precomposed LV and a T i.e <LV,T> sequence.
1100 11A8;1100 11A8;1100 11A8;1100 11A8;1100 11A8; # Invalid <L,T> sequence
2FA1E;2FA1E;2FA1E;2FA1E;2FA1E; # Beyond decomposable max
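As a worked note on the column format above (editorial, not part of the commit): each column is a space separated list of hex code points, which the test module's cpToText turns into characters via read's 0x-prefix support. A standalone equivalent, with a hypothetical name:

import Data.Char (chr)

-- Hypothetical standalone version of the test module's cpToText helper.
decodeColumn :: String -> String
decodeColumn = map (chr . read . ("0x" ++)) . words

-- decodeColumn "1100 1161 11A8" == "\x1100\x1161\x11A8", i.e. the NFD column
-- listed for the "AC00 11A8" source in the second data row above.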

File diff suppressed because it is too large


@@ -158,7 +158,6 @@ library
    if flag(limit-build-mem)
      ghc-options: +RTS -M1500M -RTS

-------------------------------------------------------------------------------
-- Test suite options
-------------------------------------------------------------------------------

@@ -367,6 +366,12 @@ test-suite Unicode.Stream
    main-is: Streamly/Test/Unicode/Stream.hs
    ghc-options: -main-is Streamly.Test.Unicode.Stream.main

test-suite Unicode.Char
    import: test-options
    type: exitcode-stdio-1.0
    main-is: Streamly/Test/Unicode/Char.hs
    ghc-options: -main-is Streamly.Test.Unicode.Char.main

test-suite version-bounds
    import: test-options
    type: exitcode-stdio-1.0