Add custom isRangeValid implementation for ZH

Summary: Fixes #313

Reviewed By: stroxler

Differential Revision: D28364035

fbshipit-source-id: 7fe3dba75410d217747a0d7a6f7df611ac26ec70
This commit is contained in:
Daniel Cartwright 2021-06-04 12:46:29 -07:00 committed by Facebook GitHub Bot
parent 4878820294
commit 8cb77a43c7
8 changed files with 61 additions and 21 deletions

View File

@ -4,6 +4,7 @@
### Core
* Make `isRangeValid` behave differently based on lang
* Add custom `isRangeValid` implementation for ZH
### Rulesets
* CA (Catalan)

View File

@ -57,7 +57,6 @@ allExamples = concat
, examples (simple Inch 4)
[ "4 inch"
, "4 inches"
, "4''"
, "4英寸"
, "4英吋"
, "四吋"

View File

@ -1357,11 +1357,11 @@ classifiers
likelihoods = HashMap.fromList [], n = 0}}),
("integer (0..10)",
Classifier{okData =
ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119,
ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119,
likelihoods = HashMap.fromList [("", 0.0)], n = 221},
koData =
ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021,
likelihoods = HashMap.fromList [("", 0.0)], n = 178}}),
ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795,
likelihoods = HashMap.fromList [("", 0.0)], n = 180}}),
("last n <cycle>",
Classifier{okData =
ClassData{prior = 0.0, unseen = -4.605170185988091,

View File

@ -1357,11 +1357,11 @@ classifiers
likelihoods = HashMap.fromList [], n = 0}}),
("integer (0..10)",
Classifier{okData =
ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119,
ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119,
likelihoods = HashMap.fromList [("", 0.0)], n = 221},
koData =
ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021,
likelihoods = HashMap.fromList [("", 0.0)], n = 178}}),
ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795,
likelihoods = HashMap.fromList [("", 0.0)], n = 180}}),
("last n <cycle>",
Classifier{okData =
ClassData{prior = 0.0, unseen = -4.605170185988091,

View File

@ -1357,11 +1357,11 @@ classifiers
likelihoods = HashMap.fromList [], n = 0}}),
("integer (0..10)",
Classifier{okData =
ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119,
ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119,
likelihoods = HashMap.fromList [("", 0.0)], n = 221},
koData =
ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021,
likelihoods = HashMap.fromList [("", 0.0)], n = 178}}),
ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795,
likelihoods = HashMap.fromList [("", 0.0)], n = 180}}),
("last n <cycle>",
Classifier{okData =
ClassData{prior = 0.0, unseen = -4.605170185988091,

View File

@ -1357,11 +1357,11 @@ classifiers
likelihoods = HashMap.fromList [], n = 0}}),
("integer (0..10)",
Classifier{okData =
ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119,
ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119,
likelihoods = HashMap.fromList [("", 0.0)], n = 221},
koData =
ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021,
likelihoods = HashMap.fromList [("", 0.0)], n = 178}}),
ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795,
likelihoods = HashMap.fromList [("", 0.0)], n = 180}}),
("last n <cycle>",
Classifier{okData =
ClassData{prior = 0.0, unseen = -4.605170185988091,

View File

@ -1347,11 +1347,11 @@ classifiers
likelihoods = HashMap.fromList [], n = 0}}),
("integer (0..10)",
Classifier{okData =
ClassData{prior = -0.5907987153721106, unseen = -5.407171771460119,
ClassData{prior = -0.5957987257888164, unseen = -5.407171771460119,
likelihoods = HashMap.fromList [("", 0.0)], n = 221},
koData =
ClassData{prior = -0.8071778665977783, unseen = -5.19295685089021,
likelihoods = HashMap.fromList [("", 0.0)], n = 178}}),
ClassData{prior = -0.8010045764163588, unseen = -5.204006687076795,
likelihoods = HashMap.fromList [("", 0.0)], n = 180}}),
("last n <cycle>",
Classifier{okData =
ClassData{prior = 0.0, unseen = -4.605170185988091,

View File

@ -13,6 +13,7 @@ module Duckling.Types.Document
( Document -- abstract
, fromText
, (!)
, (!?)
, length
, byteStringFromPos
, isAdjacent
@ -114,12 +115,44 @@ fromText rawInput = Document{..}
where
w = UText.ord c
-- As regexes are matched without whitespace delimitator, we need to check
-- | Coarse character category compared by the 'isRangeValid' boundary checks.
-- Cased letters collapse to 'Alpha' and digits to 'Digit'; the default check
-- keeps any other character as itself via 'Self', so two such characters are
-- in the same class only when they are the same character.
data CharClass
= Alpha
| Digit
| Self {-# unpack #-} !Char
deriving (Eq, Ord, Show)
-- As regexes are matched without whitespace delimiters, we need to check
-- that the match plausibly is a whole word, i.e. that it starts and ends
-- on a character-class boundary (or on an edge of the document).
isRangeValid :: Lang -> Document -> Int -> Int -> Bool
isRangeValid = \case
ZH -> zhIsRangeValid
_ -> defaultIsRangeValid
where
-- ZH-specific validity check for the span from @start@ (inclusive) to @end@
-- (exclusive). The span is accepted when each of its two edges is either an
-- edge of the document or sits between two characters that 'isDifferent'
-- considers different.
zhIsRangeValid :: Document -> Int -> Int -> Bool
zhIsRangeValid doc start end =
-- Left edge: compare the character just before the span with its first one.
(start == 0 ||
isDifferent (doc !? (start - 1)) (doc !? start)) &&
-- Right edge: compare the span's last character with the one just after it.
(end == length doc ||
isDifferent (doc !? (end - 1)) (doc !? end))
-- Lookups go through '!?', so an out-of-range index yields Nothing instead
-- of an indexing error.
where
-- Letters with a case map to Alpha and digits to Digit; every other
-- character has no class (Nothing).
-- NOTE(review): presumably CJK ideographs land in the Nothing case since
-- they are neither lower- nor upper-case -- confirm.
charClass :: Char -> Maybe CharClass
charClass c
| Char.isLower c || Char.isUpper c = Just Alpha
| Char.isDigit c = Just Digit
| otherwise = Nothing
-- Decides whether a boundary exists between two (possibly missing)
-- neighbouring characters.
isDifferent :: Maybe Char -> Maybe Char -> Bool
-- Both positions are outside the document: no boundary.
isDifferent Nothing Nothing = False
-- Exactly one position is outside the document: always a boundary.
isDifferent Nothing _ = True
isDifferent _ Nothing = True
isDifferent (Just c1) (Just c2) = case (charClass c1, charClass c2) of
-- Two class-less characters always count as different, even when they are
-- the same character; this is where the ZH check departs from the default
-- one, which compares such characters by identity.
(Nothing, Nothing) -> True
-- Otherwise there is a boundary only when the classes differ.
(cc1, cc2) -> cc1 /= cc2
defaultIsRangeValid :: Document -> Int -> Int -> Bool
defaultIsRangeValid doc start end =
(start == 0 ||
@ -127,11 +160,11 @@ isRangeValid = \case
(end == length doc ||
isDifferent (doc ! (end - 1)) (doc ! end))
where
charClass :: Char -> Char
charClass :: Char -> CharClass
charClass c
| Char.isLower c || Char.isUpper c = 'c'
| Char.isDigit c = 'd'
| otherwise = c
| Char.isLower c || Char.isUpper c = Alpha
| Char.isDigit c = Digit
| otherwise = Self c
isDifferent :: Char -> Char -> Bool
isDifferent a b = charClass a /= charClass b
@ -146,6 +179,13 @@ isAdjacentSeparator c = elem c [' ', '\t']
-- | Index into the document's underlying character array. No bounds check is
-- done here beyond what @Array.!@ itself does; use '!?' when the index may
-- fall outside the document.
(!) :: Document -> Int -> Char
doc ! ix = indexable doc Array.! ix
-- | Bounds-checked variant of '!': returns the character at @ix@, or
-- 'Nothing' when @ix@ lies outside the bounds of the underlying array.
(!?) :: Document -> Int -> Maybe Char
(!?) Document { indexable = chars } ix
  | lower <= ix && ix <= upper = Just (chars Array.! ix)
  | otherwise = Nothing
  where
    (lower, upper) = Array.bounds chars
-- | Number of indexable positions in the document, computed from the bounds
-- of its underlying array.
length :: Document -> Int
length = Array.rangeSize . Array.bounds . indexable