From 8cb77a43c7775c8c734af9b2096b3c37e9a59908 Mon Sep 17 00:00:00 2001
From: Daniel Cartwright <chessai@fb.com>
Date: Fri, 4 Jun 2021 12:46:29 -0700
Subject: [PATCH] Add custom isRangeValid implementation for ZH

Summary: Fixes #313

Reviewed By: stroxler

Differential Revision: D28364035

fbshipit-source-id: 7fe3dba75410d217747a0d7a6f7df611ac26ec70
---
 CHANGELOG.md                          |  1 +
 Duckling/Distance/ZH/Corpus.hs        |  1 -
 Duckling/Ranking/Classifiers/ZH_CN.hs |  6 ++--
 Duckling/Ranking/Classifiers/ZH_HK.hs |  6 ++--
 Duckling/Ranking/Classifiers/ZH_MO.hs |  6 ++--
 Duckling/Ranking/Classifiers/ZH_TW.hs |  6 ++--
 Duckling/Ranking/Classifiers/ZH_XX.hs |  6 ++--
 Duckling/Types/Document.hs            | 50 ++++++++++++++++++++++++---
 8 files changed, 61 insertions(+), 21 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7e63236d..65cf42aa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 ### Core
 * Make `isRangeValid` behave differently based on lang
+* Add custom `isRangeValid` implementation for ZH
 
 ### Rulesets
 * CA (Catalan)
diff --git a/Duckling/Distance/ZH/Corpus.hs b/Duckling/Distance/ZH/Corpus.hs
index 79b3cae0..062a05c3 100644
--- a/Duckling/Distance/ZH/Corpus.hs
+++ b/Duckling/Distance/ZH/Corpus.hs
@@ -57,7 +57,6 @@ allExamples = concat
   , examples (simple Inch 4)
              [ "4 inch"
              , "4 inches"
-             , "4''"
              , "4英寸"
              , "4英吋"
              , "四吋"
diff --git a/Duckling/Ranking/Classifiers/ZH_CN.hs b/Duckling/Ranking/Classifiers/ZH_CN.hs
index 2c79b41a..c7b18f9c 100644
--- a/Duckling/Ranking/Classifiers/ZH_CN.hs
+++ b/Duckling/Ranking/Classifiers/ZH_CN.hs
@@ -1357,11 +1357,11 @@ classifiers
                                likelihoods = HashMap.fromList [], n = 0}}),
        ("integer (0..10)",
         Classifier{okData =
-                     ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119,
+                     ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119,
                                likelihoods = HashMap.fromList [("", 0.0)], n = 221},
                    koData =
-                     ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021,
-                               likelihoods = HashMap.fromList [("", 0.0)], n = 178}}),
+                     ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795,
+                               likelihoods = HashMap.fromList [("", 0.0)], n = 180}}),
        ("last n <cycle>",
         Classifier{okData =
                      ClassData{prior = 0.0, unseen = -4.605170185988091,
diff --git a/Duckling/Ranking/Classifiers/ZH_HK.hs b/Duckling/Ranking/Classifiers/ZH_HK.hs
index 907d805c..24950f41 100644
--- a/Duckling/Ranking/Classifiers/ZH_HK.hs
+++ b/Duckling/Ranking/Classifiers/ZH_HK.hs
@@ -1357,11 +1357,11 @@ classifiers
                                likelihoods = HashMap.fromList [], n = 0}}),
        ("integer (0..10)",
         Classifier{okData =
-                     ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119,
+                     ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119,
                                likelihoods = HashMap.fromList [("", 0.0)], n = 221},
                    koData =
-                     ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021,
-                               likelihoods = HashMap.fromList [("", 0.0)], n = 178}}),
+                     ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795,
+                               likelihoods = HashMap.fromList [("", 0.0)], n = 180}}),
        ("last n <cycle>",
         Classifier{okData =
                      ClassData{prior = 0.0, unseen = -4.605170185988091,
diff --git a/Duckling/Ranking/Classifiers/ZH_MO.hs b/Duckling/Ranking/Classifiers/ZH_MO.hs
index 6eb4e05f..21bd0d2e 100644
--- a/Duckling/Ranking/Classifiers/ZH_MO.hs
+++ b/Duckling/Ranking/Classifiers/ZH_MO.hs
@@ -1357,11 +1357,11 @@ classifiers
                                likelihoods = HashMap.fromList [], n = 0}}),
        ("integer (0..10)",
         Classifier{okData =
-                     ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119,
+                     ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119,
                                likelihoods = HashMap.fromList [("", 0.0)], n = 221},
                    koData =
-                     ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021,
-                               likelihoods = HashMap.fromList [("", 0.0)], n = 178}}),
+                     ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795,
+                               likelihoods = HashMap.fromList [("", 0.0)], n = 180}}),
        ("last n <cycle>",
         Classifier{okData =
                      ClassData{prior = 0.0, unseen = -4.605170185988091,
diff --git a/Duckling/Ranking/Classifiers/ZH_TW.hs b/Duckling/Ranking/Classifiers/ZH_TW.hs
index da746134..4eb4cac0 100644
--- a/Duckling/Ranking/Classifiers/ZH_TW.hs
+++ b/Duckling/Ranking/Classifiers/ZH_TW.hs
@@ -1357,11 +1357,11 @@ classifiers
                                likelihoods = HashMap.fromList [], n = 0}}),
        ("integer (0..10)",
         Classifier{okData =
-                     ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119,
+                     ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119,
                                likelihoods = HashMap.fromList [("", 0.0)], n = 221},
                    koData =
-                     ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021,
-                               likelihoods = HashMap.fromList [("", 0.0)], n = 178}}),
+                     ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795,
+                               likelihoods = HashMap.fromList [("", 0.0)], n = 180}}),
        ("last n <cycle>",
         Classifier{okData =
                      ClassData{prior = 0.0, unseen = -4.605170185988091,
diff --git a/Duckling/Ranking/Classifiers/ZH_XX.hs b/Duckling/Ranking/Classifiers/ZH_XX.hs
index 87d985f2..693ebe3d 100644
--- a/Duckling/Ranking/Classifiers/ZH_XX.hs
+++ b/Duckling/Ranking/Classifiers/ZH_XX.hs
@@ -1347,11 +1347,11 @@ classifiers
                                likelihoods = HashMap.fromList [], n = 0}}),
        ("integer (0..10)",
         Classifier{okData =
-                     ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119,
+                     ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119,
                                likelihoods = HashMap.fromList [("", 0.0)], n = 221},
                    koData =
-                     ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021,
-                               likelihoods = HashMap.fromList [("", 0.0)], n = 178}}),
+                     ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795,
+                               likelihoods = HashMap.fromList [("", 0.0)], n = 180}}),
        ("last n <cycle>",
         Classifier{okData =
                      ClassData{prior = 0.0, unseen = -4.605170185988091,
diff --git a/Duckling/Types/Document.hs b/Duckling/Types/Document.hs
index a6357910..d81a0e5a 100644
--- a/Duckling/Types/Document.hs
+++ b/Duckling/Types/Document.hs
@@ -13,6 +13,7 @@ module Duckling.Types.Document
   ( Document -- abstract
   , fromText
   , (!)
+  , (!?)
   , length
   , byteStringFromPos
   , isAdjacent
@@ -114,12 +115,44 @@ fromText rawInput = Document{..}
     where
     w = UText.ord c
 
--- As regexes are matched without whitespace delimitator, we need to check
+-- | Coarse class of a character, compared by 'isRangeValid' across the
+-- edges of a match: any letter is 'Alpha', any digit is 'Digit', and
+-- 'Self' keeps the character itself for everything else.
+data CharClass = Alpha | Digit | Self {-# UNPACK #-} !Char
+  deriving (Eq, Ord, Show)
+
+-- As regexes are matched without whitespace delimiter, we need to check
 -- the reasonability of the match to actually be a word.
 isRangeValid :: Lang -> Document -> Int -> Int -> Bool
 isRangeValid = \case
+  ZH -> zhIsRangeValid
   _ -> defaultIsRangeValid
   where
+    zhIsRangeValid :: Document -> Int -> Int -> Bool
+    zhIsRangeValid doc start end =
+      (start == 0 ||
+        isDifferent (doc !? (start - 1)) (doc !? start)) &&
+      (end == length doc ||
+        isDifferent (doc !? (end - 1)) (doc !? end))
+
+      -- Each edge of the range is accepted when it touches the start/end of
+      -- the document or when the characters on either side of it differ in
+      -- class. Two characters that are neither letters nor digits always
+      -- count as different, so any position between them is a valid edge.
+      where
+        charClass :: Char -> Maybe CharClass
+        charClass c
+          | Char.isLower c || Char.isUpper c = Just Alpha
+          | Char.isDigit c = Just Digit
+          | otherwise = Nothing
+        isDifferent :: Maybe Char -> Maybe Char -> Bool
+        isDifferent Nothing Nothing = False
+        isDifferent Nothing _       = True
+        isDifferent _       Nothing = True
+        isDifferent (Just c1) (Just c2) = case (charClass c1, charClass c2) of
+          (Nothing, Nothing) -> True
+          (cc1, cc2)         -> cc1 /= cc2
+
     defaultIsRangeValid :: Document -> Int -> Int -> Bool
     defaultIsRangeValid doc start end =
       (start == 0 ||
@@ -127,11 +160,11 @@ isRangeValid = \case
       (end == length doc ||
         isDifferent (doc ! (end - 1)) (doc ! end))
       where
-        charClass :: Char -> Char
+        charClass :: Char -> CharClass
         charClass c
-          | Char.isLower c || Char.isUpper c = 'c'
-          | Char.isDigit c = 'd'
-          | otherwise = c
+          | Char.isLower c || Char.isUpper c = Alpha
+          | Char.isDigit c = Digit
+          | otherwise = Self c
         isDifferent :: Char -> Char -> Bool
         isDifferent a b = charClass a /= charClass b
 
@@ -146,6 +179,13 @@ isAdjacentSeparator c = elem c [' ', '\t']
 (!) :: Document -> Int -> Char
 (!) Document { indexable = s } ix = s Array.! ix
 
+-- | Bounds-checked variant of '(!)': 'Nothing' when @ix@ is out of range.
+(!?) :: Document -> Int -> Maybe Char
+(!?) Document { indexable = chars } ix
+  | lo <= ix && ix <= hi = Just (chars Array.! ix)
+  | otherwise = Nothing
+  where (lo, hi) = Array.bounds chars
+
 length :: Document -> Int
 length Document { indexable = s } = Array.rangeSize $ Array.bounds s