Add custom isRangeValid implementation for ZH

Summary: Fixes #313 Reviewed By: stroxler Differential Revision: D28364035 fbshipit-source-id: 7fe3dba75410d217747a0d7a6f7df611ac26ec70
2024-11-24 07:23:03 +03:00 · 2021-06-04 12:46:29 -07:00 · 2021-06-04 12:46:29 -07:00 · 8cb77a43c7
commit 8cb77a43c7
parent 4878820294
8 changed files with 61 additions and 21 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -4,6 +4,7 @@
 ### Core
 * Make `isRangeValid` behave differently based on lang
 * Add custom `isRangeValid` implementation for ZH
 ### Rulesets
 * CA (Catalan)
--- a/Duckling/Distance/ZH/Corpus.hs
+++ b/Duckling/Distance/ZH/Corpus.hs
@ -57,7 +57,6 @@ allExamples = concat
  , examples (simple Inch 4)
             [ "4 inch"
             , "4 inches"
             , "4''"
             , "4英寸"
             , "4英吋"
             , "四吋"
--- a/Duckling/Ranking/Classifiers/ZH_CN.hs
+++ b/Duckling/Ranking/Classifiers/ZH_CN.hs
@ -1357,11 +1357,11 @@ classifiers
                               likelihoods = HashMap.fromList [], n = 0}}),
       ("integer (0..10)",
        Classifier{okData =
-                     ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119,
+                     ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119,
                               likelihoods = HashMap.fromList [("", 0.0)], n = 221},
                   koData =
-                     ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021,
+                     ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795,
-                               likelihoods = HashMap.fromList [("", 0.0)], n = 178}}),
+                               likelihoods = HashMap.fromList [("", 0.0)], n = 180}}),
       ("last n <cycle>",
        Classifier{okData =
                     ClassData{prior = 0.0, unseen = -4.605170185988091,
--- a/Duckling/Ranking/Classifiers/ZH_HK.hs
+++ b/Duckling/Ranking/Classifiers/ZH_HK.hs
@ -1357,11 +1357,11 @@ classifiers
                               likelihoods = HashMap.fromList [], n = 0}}),
       ("integer (0..10)",
        Classifier{okData =
-                     ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119,
+                     ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119,
                               likelihoods = HashMap.fromList [("", 0.0)], n = 221},
                   koData =
-                     ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021,
+                     ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795,
-                               likelihoods = HashMap.fromList [("", 0.0)], n = 178}}),
+                               likelihoods = HashMap.fromList [("", 0.0)], n = 180}}),
       ("last n <cycle>",
        Classifier{okData =
                     ClassData{prior = 0.0, unseen = -4.605170185988091,
--- a/Duckling/Ranking/Classifiers/ZH_MO.hs
+++ b/Duckling/Ranking/Classifiers/ZH_MO.hs
@ -1357,11 +1357,11 @@ classifiers
                               likelihoods = HashMap.fromList [], n = 0}}),
       ("integer (0..10)",
        Classifier{okData =
-                     ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119,
+                     ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119,
                               likelihoods = HashMap.fromList [("", 0.0)], n = 221},
                   koData =
-                     ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021,
+                     ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795,
-                               likelihoods = HashMap.fromList [("", 0.0)], n = 178}}),
+                               likelihoods = HashMap.fromList [("", 0.0)], n = 180}}),
       ("last n <cycle>",
        Classifier{okData =
                     ClassData{prior = 0.0, unseen = -4.605170185988091,
--- a/Duckling/Ranking/Classifiers/ZH_TW.hs
+++ b/Duckling/Ranking/Classifiers/ZH_TW.hs
@ -1357,11 +1357,11 @@ classifiers
                               likelihoods = HashMap.fromList [], n = 0}}),
       ("integer (0..10)",
        Classifier{okData =
-                     ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119,
+                     ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119,
                               likelihoods = HashMap.fromList [("", 0.0)], n = 221},
                   koData =
-                     ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021,
+                     ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795,
-                               likelihoods = HashMap.fromList [("", 0.0)], n = 178}}),
+                               likelihoods = HashMap.fromList [("", 0.0)], n = 180}}),
       ("last n <cycle>",
        Classifier{okData =
                     ClassData{prior = 0.0, unseen = -4.605170185988091,
--- a/Duckling/Ranking/Classifiers/ZH_XX.hs
+++ b/Duckling/Ranking/Classifiers/ZH_XX.hs
@ -1347,11 +1347,11 @@ classifiers
                               likelihoods = HashMap.fromList [], n = 0}}),
       ("integer (0..10)",
        Classifier{okData =
-                     ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119,
+                     ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119,
                               likelihoods = HashMap.fromList [("", 0.0)], n = 221},
                   koData =
-                     ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021,
+                     ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795,
-                               likelihoods = HashMap.fromList [("", 0.0)], n = 178}}),
+                               likelihoods = HashMap.fromList [("", 0.0)], n = 180}}),
       ("last n <cycle>",
        Classifier{okData =
                     ClassData{prior = 0.0, unseen = -4.605170185988091,
--- a/Duckling/Types/Document.hs
+++ b/Duckling/Types/Document.hs
@ -13,6 +13,7 @@ module Duckling.Types.Document
  ( Document -- abstract
  , fromText
  , (!)
  , (!?)
  , length
  , byteStringFromPos
  , isAdjacent
@ -114,12 +115,44 @@ fromText rawInput = Document{..}
    where
    w = UText.ord c
-- As regexes are matched without whitespace delimitator, we need to check
+data CharClass
  -- Coarse character category; the range-validity checks compare the
  -- categories of two neighbouring characters to decide whether a match
  -- boundary falls between them.
  -- Constructor order matters: the derived Ord instance follows it.
  = Alpha
  -- ^^ Alpha: assigned to characters satisfying Char.isLower or Char.isUpper.
  | Digit
  -- ^^ Digit: assigned to characters satisfying Char.isDigit.
  | Self {-# unpack #-} !Char
  -- ^^ Self: any other character, carried as itself so that two such
  -- characters compare equal only when they are the same character.
  deriving (Eq, Ord, Show)
 -- Regexes are matched without whitespace delimiters, so we need to check
 -- that the match can reasonably stand as a word of its own.
 isRangeValid :: Lang -> Document -> Int -> Int -> Bool
 isRangeValid = \case
  ZH -> zhIsRangeValid
  _ -> defaultIsRangeValid
  where
    -- Range check used for ZH: the span [start, end) is accepted when both
    -- of its edges sit on a character-class boundary (or on a document edge).
    zhIsRangeValid :: Document -> Int -> Int -> Bool
    zhIsRangeValid doc start end = leftEdgeOk && rightEdgeOk
      where
        -- The left edge is fine at the very start of the document, or when
        -- the character before the span differs in class from its first one.
        leftEdgeOk =
          start == 0 || isDifferent (doc !? (start - 1)) (doc !? start)
        -- Same test for the right edge: last character inside the span
        -- versus the first character after it.
        rightEdgeOk =
          end == length doc || isDifferent (doc !? (end - 1)) (doc !? end)
        -- Only cased letters and digits receive a class here; every other
        -- character is left unclassified (Nothing).
        -- NOTE(review): presumably this is what lets Chinese characters
        -- fall through as unclassified -- confirm against Data.Char.
        charClass :: Char -> Maybe CharClass
        charClass ch
          | isCasedLetter = Just Alpha
          | Char.isDigit ch = Just Digit
          | otherwise = Nothing
          where
            isCasedLetter = Char.isLower ch || Char.isUpper ch
        -- Decides whether a boundary exists between two neighbouring
        -- positions, either of which may lie outside the document.
        isDifferent :: Maybe Char -> Maybe Char -> Bool
        -- Both positions are out of bounds: no boundary.
        isDifferent Nothing Nothing = False
        isDifferent (Just before) (Just after) =
          case (charClass before, charClass after) of
            -- Two classified characters: a boundary only if classes differ.
            (Just classBefore, Just classAfter) -> classBefore /= classAfter
            -- At least one unclassified character: always a boundary, even
            -- when both are unclassified.
            _ -> True
        -- Exactly one position is out of bounds: treat as a boundary.
        isDifferent _ _ = True
    defaultIsRangeValid :: Document -> Int -> Int -> Bool
    defaultIsRangeValid doc start end =
      (start == 0 ||
@ -127,11 +160,11 @@ isRangeValid = \case
      (end == length doc ||
        isDifferent (doc ! (end - 1)) (doc ! end))
      where
-        charClass :: Char -> Char
+        charClass :: Char -> CharClass
        charClass c
-          | Char.isLower c || Char.isUpper c = 'c'
+          | Char.isLower c || Char.isUpper c = Alpha
-          | Char.isDigit c = 'd'
+          | Char.isDigit c = Digit
-          | otherwise = c
+          | otherwise = Self c
        isDifferent :: Char -> Char -> Bool
        isDifferent a b = charClass a /= charClass b
@ -146,6 +179,13 @@ isAdjacentSeparator c = elem c [' ', '\t']
 -- Character at position ix of the document's indexable array. The index is
 -- passed straight to Array.! without any bounds check in this function.
 (!) :: Document -> Int -> Char
 doc ! ix = indexable doc Array.! ix
 -- Bounds-checked lookup: Just the character at ix when ix lies within the
 -- bounds of the document's indexable array, Nothing otherwise.
 (!?) :: Document -> Int -> Maybe Char
 (!?) Document { indexable = arr } ix
   | lo <= ix && ix <= hi = Just (arr Array.! ix)
   | otherwise = Nothing
   where
     (lo, hi) = Array.bounds arr
 -- Number of positions in the document, computed as the size of the index
 -- range of its indexable array.
 length :: Document -> Int
 length = Array.rangeSize . Array.bounds . indexable