From 8cb77a43c7775c8c734af9b2096b3c37e9a59908 Mon Sep 17 00:00:00 2001 From: Daniel Cartwright Date: Fri, 4 Jun 2021 12:46:29 -0700 Subject: [PATCH] Add custom isRangeValid implementation for ZH Summary: Fixes #313 Reviewed By: stroxler Differential Revision: D28364035 fbshipit-source-id: 7fe3dba75410d217747a0d7a6f7df611ac26ec70 --- CHANGELOG.md | 1 + Duckling/Distance/ZH/Corpus.hs | 1 - Duckling/Ranking/Classifiers/ZH_CN.hs | 6 ++-- Duckling/Ranking/Classifiers/ZH_HK.hs | 6 ++-- Duckling/Ranking/Classifiers/ZH_MO.hs | 6 ++-- Duckling/Ranking/Classifiers/ZH_TW.hs | 6 ++-- Duckling/Ranking/Classifiers/ZH_XX.hs | 6 ++-- Duckling/Types/Document.hs | 50 ++++++++++++++++++++++++--- 8 files changed, 61 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e63236d..65cf42aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Core * Make `isRangeValid` behave differently based on lang +* Add custom `isRangeValid` implementation for ZH ### Rulesets * CA (Catalan) diff --git a/Duckling/Distance/ZH/Corpus.hs b/Duckling/Distance/ZH/Corpus.hs index 79b3cae0..062a05c3 100644 --- a/Duckling/Distance/ZH/Corpus.hs +++ b/Duckling/Distance/ZH/Corpus.hs @@ -57,7 +57,6 @@ allExamples = concat , examples (simple Inch 4) [ "4 inch" , "4 inches" - , "4''" , "4英寸" , "4英吋" , "四吋" diff --git a/Duckling/Ranking/Classifiers/ZH_CN.hs b/Duckling/Ranking/Classifiers/ZH_CN.hs index 2c79b41a..c7b18f9c 100644 --- a/Duckling/Ranking/Classifiers/ZH_CN.hs +++ b/Duckling/Ranking/Classifiers/ZH_CN.hs @@ -1357,11 +1357,11 @@ classifiers likelihoods = HashMap.fromList [], n = 0}}), ("integer (0..10)", Classifier{okData = - ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119, + ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119, likelihoods = HashMap.fromList [("", 0.0)], n = 221}, koData = - ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021, - likelihoods = HashMap.fromList [("", 0.0)], n = 178}}), + ClassData{prior = 
-0.8010045764163588, unseen = -5.204006687076795, + likelihoods = HashMap.fromList [("", 0.0)], n = 180}}), ("last n ", Classifier{okData = ClassData{prior = 0.0, unseen = -4.605170185988091, diff --git a/Duckling/Ranking/Classifiers/ZH_HK.hs b/Duckling/Ranking/Classifiers/ZH_HK.hs index 907d805c..24950f41 100644 --- a/Duckling/Ranking/Classifiers/ZH_HK.hs +++ b/Duckling/Ranking/Classifiers/ZH_HK.hs @@ -1357,11 +1357,11 @@ classifiers likelihoods = HashMap.fromList [], n = 0}}), ("integer (0..10)", Classifier{okData = - ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119, + ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119, likelihoods = HashMap.fromList [("", 0.0)], n = 221}, koData = - ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021, - likelihoods = HashMap.fromList [("", 0.0)], n = 178}}), + ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795, + likelihoods = HashMap.fromList [("", 0.0)], n = 180}}), ("last n ", Classifier{okData = ClassData{prior = 0.0, unseen = -4.605170185988091, diff --git a/Duckling/Ranking/Classifiers/ZH_MO.hs b/Duckling/Ranking/Classifiers/ZH_MO.hs index 6eb4e05f..21bd0d2e 100644 --- a/Duckling/Ranking/Classifiers/ZH_MO.hs +++ b/Duckling/Ranking/Classifiers/ZH_MO.hs @@ -1357,11 +1357,11 @@ classifiers likelihoods = HashMap.fromList [], n = 0}}), ("integer (0..10)", Classifier{okData = - ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119, + ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119, likelihoods = HashMap.fromList [("", 0.0)], n = 221}, koData = - ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021, - likelihoods = HashMap.fromList [("", 0.0)], n = 178}}), + ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795, + likelihoods = HashMap.fromList [("", 0.0)], n = 180}}), ("last n ", Classifier{okData = ClassData{prior = 0.0, unseen = -4.605170185988091, diff --git a/Duckling/Ranking/Classifiers/ZH_TW.hs 
b/Duckling/Ranking/Classifiers/ZH_TW.hs index da746134..4eb4cac0 100644 --- a/Duckling/Ranking/Classifiers/ZH_TW.hs +++ b/Duckling/Ranking/Classifiers/ZH_TW.hs @@ -1357,11 +1357,11 @@ classifiers likelihoods = HashMap.fromList [], n = 0}}), ("integer (0..10)", Classifier{okData = - ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119, + ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119, likelihoods = HashMap.fromList [("", 0.0)], n = 221}, koData = - ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021, - likelihoods = HashMap.fromList [("", 0.0)], n = 178}}), + ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795, + likelihoods = HashMap.fromList [("", 0.0)], n = 180}}), ("last n ", Classifier{okData = ClassData{prior = 0.0, unseen = -4.605170185988091, diff --git a/Duckling/Ranking/Classifiers/ZH_XX.hs b/Duckling/Ranking/Classifiers/ZH_XX.hs index 87d985f2..693ebe3d 100644 --- a/Duckling/Ranking/Classifiers/ZH_XX.hs +++ b/Duckling/Ranking/Classifiers/ZH_XX.hs @@ -1347,11 +1347,11 @@ classifiers likelihoods = HashMap.fromList [], n = 0}}), ("integer (0..10)", Classifier{okData = - ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119, + ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119, likelihoods = HashMap.fromList [("", 0.0)], n = 221}, koData = - ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021, - likelihoods = HashMap.fromList [("", 0.0)], n = 178}}), + ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795, + likelihoods = HashMap.fromList [("", 0.0)], n = 180}}), ("last n ", Classifier{okData = ClassData{prior = 0.0, unseen = -4.605170185988091, diff --git a/Duckling/Types/Document.hs b/Duckling/Types/Document.hs index a6357910..d81a0e5a 100644 --- a/Duckling/Types/Document.hs +++ b/Duckling/Types/Document.hs @@ -13,6 +13,7 @@ module Duckling.Types.Document ( Document -- abstract , fromText , (!) + , (!?) 
, length , byteStringFromPos , isAdjacent @@ -114,12 +115,44 @@ fromText rawInput = Document{..} where w = UText.ord c --- As regexes are matched without whitespace delimitator, we need to check +data CharClass + = Alpha + | Digit + | Self {-# UNPACK #-} !Char + deriving (Eq, Ord, Show) + +-- As regexes are matched without whitespace delimiter, we need to check -- the reasonability of the match to actually be a word. isRangeValid :: Lang -> Document -> Int -> Int -> Bool isRangeValid = \case + ZH -> zhIsRangeValid _ -> defaultIsRangeValid where + -- ZH variant: a boundary is valid when the characters on either side + -- fall in different classes. Two characters that are neither letters + -- (isLower/isUpper) nor digits always count as different, so they may + -- be adjacent without a delimiter. A position outside the document + -- differs from any character inside it. + zhIsRangeValid :: Document -> Int -> Int -> Bool + zhIsRangeValid doc start end = + (start == 0 || + isDifferent (doc !? (start - 1)) (doc !? start)) && + (end == length doc || + isDifferent (doc !? (end - 1)) (doc !? end)) + where + charClass :: Char -> Maybe CharClass + charClass c + | Char.isLower c || Char.isUpper c = Just Alpha + | Char.isDigit c = Just Digit + | otherwise = Nothing + isDifferent :: Maybe Char -> Maybe Char -> Bool + isDifferent Nothing Nothing = False + isDifferent Nothing _ = True + isDifferent _ Nothing = True + isDifferent (Just c1) (Just c2) = case (charClass c1, charClass c2) of + (Nothing, Nothing) -> True + (cc1, cc2) -> cc1 /= cc2 + defaultIsRangeValid :: Document -> Int -> Int -> Bool defaultIsRangeValid doc start end = (start == 0 || @@ -127,11 +160,11 @@ isRangeValid = \case (end == length doc || isDifferent (doc ! (end - 1)) (doc !
end)) where - charClass :: Char -> Char + charClass :: Char -> CharClass charClass c - | Char.isLower c || Char.isUpper c = 'c' - | Char.isDigit c = 'd' - | otherwise = c + | Char.isLower c || Char.isUpper c = Alpha + | Char.isDigit c = Digit + | otherwise = Self c isDifferent :: Char -> Char -> Bool isDifferent a b = charClass a /= charClass b @@ -146,6 +179,13 @@ isAdjacentSeparator c = elem c [' ', '\t'] (!) :: Document -> Int -> Char (!) Document { indexable = s } ix = s Array.! ix +-- | Like '(!)', but returns 'Nothing' when the index is outside the array bounds. +(!?) :: Document -> Int -> Maybe Char +(!?) Document { indexable = s } ix + | lo <= ix && ix <= hi = Just $ s Array.! ix + | otherwise = Nothing + where (lo, hi) = Array.bounds s + length :: Document -> Int length Document { indexable = s } = Array.rangeSize $ Array.bounds s