Make isRangeValid take Lang as input

Summary: There are different implementations of isRangeValid that work well for different languages, thus it makes sense to facilitate having different implementations based on the language.

Reviewed By: patapizza

Differential Revision: D28362777

fbshipit-source-id: 5f2991d54af3095c8e95cf534e2dd3b4a34dee3a
This commit is contained in:
Daniel Cartwright 2021-05-17 13:08:57 -07:00 committed by Facebook GitHub Bot
parent 7762af850a
commit 69d951220e
4 changed files with 66 additions and 56 deletions

View File

@ -3,6 +3,7 @@
## 0.2.X.X ## 0.2.X.X
### Core ### Core
* Make `isRangeValid` behave differently based on lang
### Rulesets ### Rulesets
* CA (Catalan) * CA (Catalan)

View File

@ -27,6 +27,7 @@ import qualified Data.List as L
import qualified Text.Regex.PCRE as PCRE import qualified Text.Regex.PCRE as PCRE
import Duckling.Dimensions.Types import Duckling.Dimensions.Types
import Duckling.Locale
import Duckling.Regex.Types import Duckling.Regex.Types
import Duckling.Resolve import Duckling.Resolve
import Duckling.Types hiding (regex) import Duckling.Types hiding (regex)
@ -40,11 +41,11 @@ import qualified Duckling.Types.Stash as Stash
-- Engine -- Engine
parseAndResolve :: [Rule] -> Text -> Context -> Options -> [ResolvedToken] parseAndResolve :: [Rule] -> Text -> Context -> Options -> [ResolvedToken]
parseAndResolve rules input context options = parseAndResolve rules input context@Context{locale = Locale lang _} options =
mapMaybe mapMaybe
(resolveNode context options) (resolveNode context options)
$ force $ Stash.toPosOrderedList $ force $ Stash.toPosOrderedList
$ runDuckling $ parseString rules (Document.fromText input) $ runDuckling $ parseString lang rules (Document.fromText input)
type Duckling a = Identity a type Duckling a = Identity a
@ -66,45 +67,45 @@ resolveNode context options n@Node{token = (Token dim dd), nodeRange = r}
, isLatent = latent , isLatent = latent
} }
parseString :: [Rule] -> Document -> Duckling Stash parseString :: Lang -> [Rule] -> Document -> Duckling Stash
parseString rules sentence = do parseString lang rules sentence = do
(new, partialMatches) <- (new, partialMatches) <-
-- One the first pass we try all the rules -- One the first pass we try all the rules
parseString1 rules sentence Stash.empty Stash.empty [] parseString1 lang rules sentence Stash.empty Stash.empty []
if Stash.null new if Stash.null new
then return Stash.empty then return Stash.empty
else else
-- For subsequent passes, we only try rules starting with a predicate. -- For subsequent passes, we only try rules starting with a predicate.
saturateParseString headPredicateRules sentence new new partialMatches saturateParseString lang headPredicateRules sentence new new partialMatches
where where
headPredicateRules = headPredicateRules =
[ rule | rule@Rule{pattern = (Predicate _ : _)} <- rules ] [ rule | rule@Rule{pattern = (Predicate _ : _)} <- rules ]
-- | Produces all tokens recursively. -- | Produces all tokens recursively.
saturateParseString saturateParseString
:: [Rule] -> Document -> Stash -> Stash -> [Match] -> Duckling Stash :: Lang -> [Rule] -> Document -> Stash -> Stash -> [Match] -> Duckling Stash
saturateParseString rules sentence stash new matches = do saturateParseString lang rules sentence stash new matches = do
(new', matches') <- parseString1 rules sentence stash new matches (new', matches') <- parseString1 lang rules sentence stash new matches
let stash' = Stash.union stash new' let stash' = Stash.union stash new'
if Stash.null new' if Stash.null new'
then return stash then return stash
else saturateParseString rules sentence stash' new' matches' else saturateParseString lang rules sentence stash' new' matches'
-- | Finds new matches resulting from newly added tokens. -- | Finds new matches resulting from newly added tokens.
-- Produces new tokens from full matches. -- Produces new tokens from full matches.
parseString1 parseString1
:: [Rule] -> Document -> Stash -> Stash -> [Match] :: Lang -> [Rule] -> Document -> Stash -> Stash -> [Match]
-> Duckling (Stash, [Match]) -> Duckling (Stash, [Match])
parseString1 rules sentence stash new matches = do parseString1 lang rules sentence stash new matches = do
-- Recursively match patterns. -- Recursively match patterns.
-- Find which `matches` can advance because of `new`. -- Find which `matches` can advance because of `new`.
newPartial <- concatMapM (matchFirst sentence new) matches newPartial <- concatMapM (matchFirst sentence lang new) matches
-- Find new matches resulting from newly added tokens (`new`) -- Find new matches resulting from newly added tokens (`new`)
newMatches <- concatMapM (matchFirstAnywhere sentence new) rules newMatches <- concatMapM (matchFirstAnywhere sentence lang new) rules
(full, partial) <- L.partition (\(Rule {pattern}, _, _) -> null pattern) (full, partial) <- L.partition (\(Rule {pattern}, _, _) -> null pattern)
<$> matchAll sentence stash (newPartial ++ newMatches) <$> matchAll sentence lang stash (newPartial ++ newMatches)
-- Produce full matches as new tokens -- Produce full matches as new tokens
return ( Stash.fromList $ mapMaybe produce full return ( Stash.fromList $ mapMaybe produce full
@ -113,13 +114,13 @@ parseString1 rules sentence stash new matches = do
-- | Recursively augments `matches`. -- | Recursively augments `matches`.
-- Discards partial matches stuck by a regex. -- Discards partial matches stuck by a regex.
matchAll :: Document -> Stash -> [Match] -> Duckling [Match] matchAll :: Document -> Lang -> Stash -> [Match] -> Duckling [Match]
matchAll sentence stash matches = concatMapM mkNextMatches matches matchAll sentence lang stash matches = concatMapM mkNextMatches matches
where where
mkNextMatches :: Match -> Duckling [Match] mkNextMatches :: Match -> Duckling [Match]
mkNextMatches match@(Rule {pattern = []}, _, _) = return [ match ] mkNextMatches match@(Rule {pattern = []}, _, _) = return [ match ]
mkNextMatches match@(Rule {pattern = p:_}, _, _) = do mkNextMatches match@(Rule {pattern = p:_}, _, _) = do
nextMatches <- matchAll sentence stash =<< matchFirst sentence stash match nextMatches <- matchAll sentence lang stash =<< matchFirst sentence lang stash match
return $ case p of return $ case p of
Regex _ -> nextMatches Regex _ -> nextMatches
Predicate _ -> match:nextMatches Predicate _ -> match:nextMatches
@ -140,37 +141,37 @@ produce (Rule name _ production, _, etuor@(Node {nodeRange = Range _ e}:_)) = do
-- | Returns all matches matching the first pattern item of `match`, -- | Returns all matches matching the first pattern item of `match`,
-- resuming from a Match position -- resuming from a Match position
matchFirst :: Document -> Stash -> Match -> Duckling [Match] matchFirst :: Document -> Lang -> Stash -> Match -> Duckling [Match]
matchFirst _ _ (Rule {pattern = []}, _, _) = return [] matchFirst _ _ _ (Rule {pattern = []}, _, _) = return []
matchFirst sentence stash (rule@Rule{pattern = p : ps}, position, route) = matchFirst sentence lang stash (rule@Rule{pattern = p : ps}, position, route) =
map (mkMatch route newRule) <$> lookupItem sentence p stash position map (mkMatch route newRule) <$> lookupItem sentence lang p stash position
where where
newRule = rule { pattern = ps } newRule = rule { pattern = ps }
-- | Returns all matches matching the first pattern item of `match`, -- | Returns all matches matching the first pattern item of `match`,
-- starting anywhere -- starting anywhere
matchFirstAnywhere :: Document -> Stash -> Rule -> Duckling [Match] matchFirstAnywhere :: Document -> Lang -> Stash -> Rule -> Duckling [Match]
matchFirstAnywhere _sentence _stash Rule {pattern = []} = return [] matchFirstAnywhere _sentence _lang _stash Rule {pattern = []} = return []
matchFirstAnywhere sentence stash rule@Rule{pattern = p : ps} = matchFirstAnywhere sentence lang stash rule@Rule{pattern = p : ps} =
map (mkMatch [] newRule) <$> lookupItemAnywhere sentence p stash map (mkMatch [] newRule) <$> lookupItemAnywhere sentence lang p stash
where where
newRule = rule { pattern = ps } newRule = rule { pattern = ps }
-- | Handle one PatternItem at a given position -- | Handle one PatternItem at a given position
lookupItem :: Document -> PatternItem -> Stash -> Int -> Duckling [Node] lookupItem :: Document -> Lang -> PatternItem -> Stash -> Int -> Duckling [Node]
lookupItem doc (Regex re) _ position = lookupItem doc lang (Regex re) _ position =
filter (isPositionValid position doc) <$> filter (isPositionValid position doc) <$>
lookupRegex doc re position lookupRegex doc lang re position
lookupItem doc (Predicate p) stash position = lookupItem doc _lang (Predicate p) stash position =
return $ return $
filter (p . token) $ filter (p . token) $
takeWhile (isPositionValid position doc) $ takeWhile (isPositionValid position doc) $
Stash.toPosOrderedListFrom stash position Stash.toPosOrderedListFrom stash position
-- | Handle one PatternItem anywhere in the text -- | Handle one PatternItem anywhere in the text
lookupItemAnywhere :: Document -> PatternItem -> Stash -> Duckling [Node] lookupItemAnywhere :: Document -> Lang -> PatternItem -> Stash -> Duckling [Node]
lookupItemAnywhere doc (Regex re) _ = lookupRegexAnywhere doc re lookupItemAnywhere doc lang (Regex re) _ = lookupRegexAnywhere doc lang re
lookupItemAnywhere _doc (Predicate p) stash = lookupItemAnywhere _doc _lang (Predicate p) stash =
return $ filter (p . token) $ Stash.toPosOrderedList stash return $ filter (p . token) $ Stash.toPosOrderedList stash
isPositionValid :: Int -> Document -> Node -> Bool isPositionValid :: Int -> Document -> Node -> Bool
@ -184,25 +185,26 @@ mkMatch route newRule node@Node{nodeRange = Range _ pos'} =
where newRoute = node:route where newRoute = node:route
-- | Handle a regex match at a given position -- | Handle a regex match at a given position
lookupRegex :: Document -> PCRE.Regex -> Int -> Duckling [Node] lookupRegex :: Document -> Lang -> PCRE.Regex -> Int -> Duckling [Node]
lookupRegex doc _regex position | position >= Document.length doc = return [] lookupRegex doc _lang _regex position | position >= Document.length doc = return []
lookupRegex doc regex position = lookupRegex doc lang regex position =
lookupRegexCommon doc regex position Regex.matchOnce lookupRegexCommon doc lang regex position Regex.matchOnce
-- | Handle a regex match anywhere in the text -- | Handle a regex match anywhere in the text
lookupRegexAnywhere :: Document -> PCRE.Regex -> Duckling [Node] lookupRegexAnywhere :: Document -> Lang -> PCRE.Regex -> Duckling [Node]
lookupRegexAnywhere doc regex = lookupRegexCommon doc regex 0 Regex.matchAll lookupRegexAnywhere doc lang regex = lookupRegexCommon doc lang regex 0 Regex.matchAll
{-# INLINE lookupRegexCommon #-} {-# INLINE lookupRegexCommon #-}
-- INLINE bloats the code a bit, but the code is better -- INLINE bloats the code a bit, but the code is better
lookupRegexCommon lookupRegexCommon
:: Foldable t :: Foldable t
=> Document => Document
-> Lang
-> PCRE.Regex -> PCRE.Regex
-> Int -> Int
-> (PCRE.Regex -> ByteString -> t PCRE.MatchArray) -> (PCRE.Regex -> ByteString -> t PCRE.MatchArray)
-> Duckling [Node] -> Duckling [Node]
lookupRegexCommon doc regex position matchFun = return nodes lookupRegexCommon doc lang regex position matchFun = return nodes
where where
-- See Note [Regular expressions and Text] from Document.hs to understand -- See Note [Regular expressions and Text] from Document.hs to understand
-- what's going on here -- what's going on here
@ -215,7 +217,7 @@ lookupRegexCommon doc regex position matchFun = return nodes
f [] = Nothing f [] = Nothing
f ((0,0):_) = Nothing f ((0,0):_) = Nothing
f ((bsStart, bsLen):groups) = f ((bsStart, bsLen):groups) =
if Document.isRangeValid doc start end if Document.isRangeValid lang doc start end
then Just node then Just node
else Nothing else Nothing
where where

View File

@ -6,6 +6,7 @@
{-# LANGUAGE BangPatterns #-} {-# LANGUAGE BangPatterns #-}
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE RecordWildCards #-} {-# LANGUAGE RecordWildCards #-}
module Duckling.Types.Document module Duckling.Types.Document
@ -32,6 +33,7 @@ import qualified Data.Text.Encoding as Text
import qualified Data.Text as Text import qualified Data.Text as Text
import qualified Data.Text.Internal.Unsafe.Char as UText import qualified Data.Text.Internal.Unsafe.Char as UText
import Duckling.Locale (Lang(..))
data Document = Document data Document = Document
{ rawInput :: !Text { rawInput :: !Text
@ -114,20 +116,24 @@ fromText rawInput = Document{..}
-- As regexes are matched without whitespace delimitator, we need to check -- As regexes are matched without whitespace delimitator, we need to check
-- the reasonability of the match to actually be a word. -- the reasonability of the match to actually be a word.
isRangeValid :: Document -> Int -> Int -> Bool isRangeValid :: Lang -> Document -> Int -> Int -> Bool
isRangeValid doc start end = isRangeValid = \case
(start == 0 || _ -> defaultIsRangeValid
isDifferent (doc ! (start - 1)) (doc ! start)) &&
(end == length doc ||
isDifferent (doc ! (end - 1)) (doc ! end))
where where
charClass :: Char -> Char defaultIsRangeValid :: Document -> Int -> Int -> Bool
charClass c defaultIsRangeValid doc start end =
| Char.isLower c || Char.isUpper c = 'c' (start == 0 ||
| Char.isDigit c = 'd' isDifferent (doc ! (start - 1)) (doc ! start)) &&
| otherwise = c (end == length doc ||
isDifferent :: Char -> Char -> Bool isDifferent (doc ! (end - 1)) (doc ! end))
isDifferent a b = charClass a /= charClass b where
charClass :: Char -> Char
charClass c
| Char.isLower c || Char.isUpper c = 'c'
| Char.isDigit c = 'd'
| otherwise = c
isDifferent :: Char -> Char -> Bool
isDifferent a b = charClass a /= charClass b
-- True iff a is followed by whitespaces and b. -- True iff a is followed by whitespaces and b.
isAdjacent :: Document -> Int -> Int -> Bool isAdjacent :: Document -> Int -> Int -> Bool

View File

@ -17,6 +17,7 @@ import Test.Tasty
import Test.Tasty.HUnit import Test.Tasty.HUnit
import Duckling.Engine import Duckling.Engine
import Duckling.Locale (Lang(..))
import Duckling.Types import Duckling.Types
import Duckling.Regex.Types import Duckling.Regex.Types
@ -30,7 +31,7 @@ emptyRegexTest :: TestTree
emptyRegexTest = testCase "Empty Regex Test" $ emptyRegexTest = testCase "Empty Regex Test" $
case regex "()" of case regex "()" of
Regex regex -> assertEqual "empty result" [] $ Regex regex -> assertEqual "empty result" [] $
runDuckling $ lookupRegexAnywhere "hey" regex runDuckling $ lookupRegexAnywhere "hey" EN regex
_ -> assertFailure "expected a regex" _ -> assertFailure "expected a regex"
unicodeAndRegexTest :: TestTree unicodeAndRegexTest :: TestTree
@ -38,7 +39,7 @@ unicodeAndRegexTest = testCase "Unicode and Regex Test" $
case regex "\\$([0-9]*)" of case regex "\\$([0-9]*)" of
Regex regex -> do -- Regex regex -> do --
assertEqual "" expected $ assertEqual "" expected $
runDuckling $ lookupRegexAnywhere "\128526 $35" regex runDuckling $ lookupRegexAnywhere "\128526 $35" EN regex
_ -> assertFailure "expected a regex" _ -> assertFailure "expected a regex"
where where
expected = expected =