mirror of
https://github.com/facebook/duckling.git
synced 2024-10-26 21:59:44 +03:00
Make isRangeValid take Lang as input
Summary: There are different implementations of isRangeValid that work well for different languages, thus it makes sense to facilitate having different implementations based on the language. Reviewed By: patapizza Differential Revision: D28362777 fbshipit-source-id: 5f2991d54af3095c8e95cf534e2dd3b4a34dee3a
This commit is contained in:
parent
7762af850a
commit
69d951220e
@ -3,6 +3,7 @@
|
|||||||
## 0.2.X.X
|
## 0.2.X.X
|
||||||
|
|
||||||
### Core
|
### Core
|
||||||
|
* Make `isRangeValid` behave differently based on lang
|
||||||
|
|
||||||
### Rulesets
|
### Rulesets
|
||||||
* CA (Catalan)
|
* CA (Catalan)
|
||||||
|
@ -27,6 +27,7 @@ import qualified Data.List as L
|
|||||||
import qualified Text.Regex.PCRE as PCRE
|
import qualified Text.Regex.PCRE as PCRE
|
||||||
|
|
||||||
import Duckling.Dimensions.Types
|
import Duckling.Dimensions.Types
|
||||||
|
import Duckling.Locale
|
||||||
import Duckling.Regex.Types
|
import Duckling.Regex.Types
|
||||||
import Duckling.Resolve
|
import Duckling.Resolve
|
||||||
import Duckling.Types hiding (regex)
|
import Duckling.Types hiding (regex)
|
||||||
@ -40,11 +41,11 @@ import qualified Duckling.Types.Stash as Stash
|
|||||||
-- Engine
|
-- Engine
|
||||||
|
|
||||||
parseAndResolve :: [Rule] -> Text -> Context -> Options -> [ResolvedToken]
|
parseAndResolve :: [Rule] -> Text -> Context -> Options -> [ResolvedToken]
|
||||||
parseAndResolve rules input context options =
|
parseAndResolve rules input context@Context{locale = Locale lang _} options =
|
||||||
mapMaybe
|
mapMaybe
|
||||||
(resolveNode context options)
|
(resolveNode context options)
|
||||||
$ force $ Stash.toPosOrderedList
|
$ force $ Stash.toPosOrderedList
|
||||||
$ runDuckling $ parseString rules (Document.fromText input)
|
$ runDuckling $ parseString lang rules (Document.fromText input)
|
||||||
|
|
||||||
type Duckling a = Identity a
|
type Duckling a = Identity a
|
||||||
|
|
||||||
@ -66,45 +67,45 @@ resolveNode context options n@Node{token = (Token dim dd), nodeRange = r}
|
|||||||
, isLatent = latent
|
, isLatent = latent
|
||||||
}
|
}
|
||||||
|
|
||||||
parseString :: [Rule] -> Document -> Duckling Stash
|
parseString :: Lang -> [Rule] -> Document -> Duckling Stash
|
||||||
parseString rules sentence = do
|
parseString lang rules sentence = do
|
||||||
(new, partialMatches) <-
|
(new, partialMatches) <-
|
||||||
-- One the first pass we try all the rules
|
-- One the first pass we try all the rules
|
||||||
parseString1 rules sentence Stash.empty Stash.empty []
|
parseString1 lang rules sentence Stash.empty Stash.empty []
|
||||||
if Stash.null new
|
if Stash.null new
|
||||||
then return Stash.empty
|
then return Stash.empty
|
||||||
else
|
else
|
||||||
-- For subsequent passes, we only try rules starting with a predicate.
|
-- For subsequent passes, we only try rules starting with a predicate.
|
||||||
saturateParseString headPredicateRules sentence new new partialMatches
|
saturateParseString lang headPredicateRules sentence new new partialMatches
|
||||||
where
|
where
|
||||||
headPredicateRules =
|
headPredicateRules =
|
||||||
[ rule | rule@Rule{pattern = (Predicate _ : _)} <- rules ]
|
[ rule | rule@Rule{pattern = (Predicate _ : _)} <- rules ]
|
||||||
|
|
||||||
-- | Produces all tokens recursively.
|
-- | Produces all tokens recursively.
|
||||||
saturateParseString
|
saturateParseString
|
||||||
:: [Rule] -> Document -> Stash -> Stash -> [Match] -> Duckling Stash
|
:: Lang -> [Rule] -> Document -> Stash -> Stash -> [Match] -> Duckling Stash
|
||||||
saturateParseString rules sentence stash new matches = do
|
saturateParseString lang rules sentence stash new matches = do
|
||||||
(new', matches') <- parseString1 rules sentence stash new matches
|
(new', matches') <- parseString1 lang rules sentence stash new matches
|
||||||
let stash' = Stash.union stash new'
|
let stash' = Stash.union stash new'
|
||||||
if Stash.null new'
|
if Stash.null new'
|
||||||
then return stash
|
then return stash
|
||||||
else saturateParseString rules sentence stash' new' matches'
|
else saturateParseString lang rules sentence stash' new' matches'
|
||||||
|
|
||||||
-- | Finds new matches resulting from newly added tokens.
|
-- | Finds new matches resulting from newly added tokens.
|
||||||
-- Produces new tokens from full matches.
|
-- Produces new tokens from full matches.
|
||||||
parseString1
|
parseString1
|
||||||
:: [Rule] -> Document -> Stash -> Stash -> [Match]
|
:: Lang -> [Rule] -> Document -> Stash -> Stash -> [Match]
|
||||||
-> Duckling (Stash, [Match])
|
-> Duckling (Stash, [Match])
|
||||||
parseString1 rules sentence stash new matches = do
|
parseString1 lang rules sentence stash new matches = do
|
||||||
-- Recursively match patterns.
|
-- Recursively match patterns.
|
||||||
-- Find which `matches` can advance because of `new`.
|
-- Find which `matches` can advance because of `new`.
|
||||||
newPartial <- concatMapM (matchFirst sentence new) matches
|
newPartial <- concatMapM (matchFirst sentence lang new) matches
|
||||||
|
|
||||||
-- Find new matches resulting from newly added tokens (`new`)
|
-- Find new matches resulting from newly added tokens (`new`)
|
||||||
newMatches <- concatMapM (matchFirstAnywhere sentence new) rules
|
newMatches <- concatMapM (matchFirstAnywhere sentence lang new) rules
|
||||||
|
|
||||||
(full, partial) <- L.partition (\(Rule {pattern}, _, _) -> null pattern)
|
(full, partial) <- L.partition (\(Rule {pattern}, _, _) -> null pattern)
|
||||||
<$> matchAll sentence stash (newPartial ++ newMatches)
|
<$> matchAll sentence lang stash (newPartial ++ newMatches)
|
||||||
|
|
||||||
-- Produce full matches as new tokens
|
-- Produce full matches as new tokens
|
||||||
return ( Stash.fromList $ mapMaybe produce full
|
return ( Stash.fromList $ mapMaybe produce full
|
||||||
@ -113,13 +114,13 @@ parseString1 rules sentence stash new matches = do
|
|||||||
|
|
||||||
-- | Recursively augments `matches`.
|
-- | Recursively augments `matches`.
|
||||||
-- Discards partial matches stuck by a regex.
|
-- Discards partial matches stuck by a regex.
|
||||||
matchAll :: Document -> Stash -> [Match] -> Duckling [Match]
|
matchAll :: Document -> Lang -> Stash -> [Match] -> Duckling [Match]
|
||||||
matchAll sentence stash matches = concatMapM mkNextMatches matches
|
matchAll sentence lang stash matches = concatMapM mkNextMatches matches
|
||||||
where
|
where
|
||||||
mkNextMatches :: Match -> Duckling [Match]
|
mkNextMatches :: Match -> Duckling [Match]
|
||||||
mkNextMatches match@(Rule {pattern = []}, _, _) = return [ match ]
|
mkNextMatches match@(Rule {pattern = []}, _, _) = return [ match ]
|
||||||
mkNextMatches match@(Rule {pattern = p:_}, _, _) = do
|
mkNextMatches match@(Rule {pattern = p:_}, _, _) = do
|
||||||
nextMatches <- matchAll sentence stash =<< matchFirst sentence stash match
|
nextMatches <- matchAll sentence lang stash =<< matchFirst sentence lang stash match
|
||||||
return $ case p of
|
return $ case p of
|
||||||
Regex _ -> nextMatches
|
Regex _ -> nextMatches
|
||||||
Predicate _ -> match:nextMatches
|
Predicate _ -> match:nextMatches
|
||||||
@ -140,37 +141,37 @@ produce (Rule name _ production, _, etuor@(Node {nodeRange = Range _ e}:_)) = do
|
|||||||
|
|
||||||
-- | Returns all matches matching the first pattern item of `match`,
|
-- | Returns all matches matching the first pattern item of `match`,
|
||||||
-- resuming from a Match position
|
-- resuming from a Match position
|
||||||
matchFirst :: Document -> Stash -> Match -> Duckling [Match]
|
matchFirst :: Document -> Lang -> Stash -> Match -> Duckling [Match]
|
||||||
matchFirst _ _ (Rule {pattern = []}, _, _) = return []
|
matchFirst _ _ _ (Rule {pattern = []}, _, _) = return []
|
||||||
matchFirst sentence stash (rule@Rule{pattern = p : ps}, position, route) =
|
matchFirst sentence lang stash (rule@Rule{pattern = p : ps}, position, route) =
|
||||||
map (mkMatch route newRule) <$> lookupItem sentence p stash position
|
map (mkMatch route newRule) <$> lookupItem sentence lang p stash position
|
||||||
where
|
where
|
||||||
newRule = rule { pattern = ps }
|
newRule = rule { pattern = ps }
|
||||||
|
|
||||||
-- | Returns all matches matching the first pattern item of `match`,
|
-- | Returns all matches matching the first pattern item of `match`,
|
||||||
-- starting anywhere
|
-- starting anywhere
|
||||||
matchFirstAnywhere :: Document -> Stash -> Rule -> Duckling [Match]
|
matchFirstAnywhere :: Document -> Lang -> Stash -> Rule -> Duckling [Match]
|
||||||
matchFirstAnywhere _sentence _stash Rule {pattern = []} = return []
|
matchFirstAnywhere _sentence _lang _stash Rule {pattern = []} = return []
|
||||||
matchFirstAnywhere sentence stash rule@Rule{pattern = p : ps} =
|
matchFirstAnywhere sentence lang stash rule@Rule{pattern = p : ps} =
|
||||||
map (mkMatch [] newRule) <$> lookupItemAnywhere sentence p stash
|
map (mkMatch [] newRule) <$> lookupItemAnywhere sentence lang p stash
|
||||||
where
|
where
|
||||||
newRule = rule { pattern = ps }
|
newRule = rule { pattern = ps }
|
||||||
|
|
||||||
-- | Handle one PatternItem at a given position
|
-- | Handle one PatternItem at a given position
|
||||||
lookupItem :: Document -> PatternItem -> Stash -> Int -> Duckling [Node]
|
lookupItem :: Document -> Lang -> PatternItem -> Stash -> Int -> Duckling [Node]
|
||||||
lookupItem doc (Regex re) _ position =
|
lookupItem doc lang (Regex re) _ position =
|
||||||
filter (isPositionValid position doc) <$>
|
filter (isPositionValid position doc) <$>
|
||||||
lookupRegex doc re position
|
lookupRegex doc lang re position
|
||||||
lookupItem doc (Predicate p) stash position =
|
lookupItem doc _lang (Predicate p) stash position =
|
||||||
return $
|
return $
|
||||||
filter (p . token) $
|
filter (p . token) $
|
||||||
takeWhile (isPositionValid position doc) $
|
takeWhile (isPositionValid position doc) $
|
||||||
Stash.toPosOrderedListFrom stash position
|
Stash.toPosOrderedListFrom stash position
|
||||||
|
|
||||||
-- | Handle one PatternItem anywhere in the text
|
-- | Handle one PatternItem anywhere in the text
|
||||||
lookupItemAnywhere :: Document -> PatternItem -> Stash -> Duckling [Node]
|
lookupItemAnywhere :: Document -> Lang -> PatternItem -> Stash -> Duckling [Node]
|
||||||
lookupItemAnywhere doc (Regex re) _ = lookupRegexAnywhere doc re
|
lookupItemAnywhere doc lang (Regex re) _ = lookupRegexAnywhere doc lang re
|
||||||
lookupItemAnywhere _doc (Predicate p) stash =
|
lookupItemAnywhere _doc _lang (Predicate p) stash =
|
||||||
return $ filter (p . token) $ Stash.toPosOrderedList stash
|
return $ filter (p . token) $ Stash.toPosOrderedList stash
|
||||||
|
|
||||||
isPositionValid :: Int -> Document -> Node -> Bool
|
isPositionValid :: Int -> Document -> Node -> Bool
|
||||||
@ -184,25 +185,26 @@ mkMatch route newRule node@Node{nodeRange = Range _ pos'} =
|
|||||||
where newRoute = node:route
|
where newRoute = node:route
|
||||||
|
|
||||||
-- | Handle a regex match at a given position
|
-- | Handle a regex match at a given position
|
||||||
lookupRegex :: Document -> PCRE.Regex -> Int -> Duckling [Node]
|
lookupRegex :: Document -> Lang -> PCRE.Regex -> Int -> Duckling [Node]
|
||||||
lookupRegex doc _regex position | position >= Document.length doc = return []
|
lookupRegex doc _lang _regex position | position >= Document.length doc = return []
|
||||||
lookupRegex doc regex position =
|
lookupRegex doc lang regex position =
|
||||||
lookupRegexCommon doc regex position Regex.matchOnce
|
lookupRegexCommon doc lang regex position Regex.matchOnce
|
||||||
|
|
||||||
-- | Handle a regex match anywhere in the text
|
-- | Handle a regex match anywhere in the text
|
||||||
lookupRegexAnywhere :: Document -> PCRE.Regex -> Duckling [Node]
|
lookupRegexAnywhere :: Document -> Lang -> PCRE.Regex -> Duckling [Node]
|
||||||
lookupRegexAnywhere doc regex = lookupRegexCommon doc regex 0 Regex.matchAll
|
lookupRegexAnywhere doc lang regex = lookupRegexCommon doc lang regex 0 Regex.matchAll
|
||||||
|
|
||||||
{-# INLINE lookupRegexCommon #-}
|
{-# INLINE lookupRegexCommon #-}
|
||||||
-- INLINE bloats the code a bit, but the code is better
|
-- INLINE bloats the code a bit, but the code is better
|
||||||
lookupRegexCommon
|
lookupRegexCommon
|
||||||
:: Foldable t
|
:: Foldable t
|
||||||
=> Document
|
=> Document
|
||||||
|
-> Lang
|
||||||
-> PCRE.Regex
|
-> PCRE.Regex
|
||||||
-> Int
|
-> Int
|
||||||
-> (PCRE.Regex -> ByteString -> t PCRE.MatchArray)
|
-> (PCRE.Regex -> ByteString -> t PCRE.MatchArray)
|
||||||
-> Duckling [Node]
|
-> Duckling [Node]
|
||||||
lookupRegexCommon doc regex position matchFun = return nodes
|
lookupRegexCommon doc lang regex position matchFun = return nodes
|
||||||
where
|
where
|
||||||
-- See Note [Regular expressions and Text] from Document.hs to understand
|
-- See Note [Regular expressions and Text] from Document.hs to understand
|
||||||
-- what's going on here
|
-- what's going on here
|
||||||
@ -215,7 +217,7 @@ lookupRegexCommon doc regex position matchFun = return nodes
|
|||||||
f [] = Nothing
|
f [] = Nothing
|
||||||
f ((0,0):_) = Nothing
|
f ((0,0):_) = Nothing
|
||||||
f ((bsStart, bsLen):groups) =
|
f ((bsStart, bsLen):groups) =
|
||||||
if Document.isRangeValid doc start end
|
if Document.isRangeValid lang doc start end
|
||||||
then Just node
|
then Just node
|
||||||
else Nothing
|
else Nothing
|
||||||
where
|
where
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
{-# LANGUAGE BangPatterns #-}
|
{-# LANGUAGE BangPatterns #-}
|
||||||
{-# LANGUAGE FlexibleContexts #-}
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
{-# LANGUAGE LambdaCase #-}
|
||||||
{-# LANGUAGE RecordWildCards #-}
|
{-# LANGUAGE RecordWildCards #-}
|
||||||
|
|
||||||
module Duckling.Types.Document
|
module Duckling.Types.Document
|
||||||
@ -32,6 +33,7 @@ import qualified Data.Text.Encoding as Text
|
|||||||
import qualified Data.Text as Text
|
import qualified Data.Text as Text
|
||||||
import qualified Data.Text.Internal.Unsafe.Char as UText
|
import qualified Data.Text.Internal.Unsafe.Char as UText
|
||||||
|
|
||||||
|
import Duckling.Locale (Lang(..))
|
||||||
|
|
||||||
data Document = Document
|
data Document = Document
|
||||||
{ rawInput :: !Text
|
{ rawInput :: !Text
|
||||||
@ -114,20 +116,24 @@ fromText rawInput = Document{..}
|
|||||||
|
|
||||||
-- As regexes are matched without whitespace delimitator, we need to check
|
-- As regexes are matched without whitespace delimitator, we need to check
|
||||||
-- the reasonability of the match to actually be a word.
|
-- the reasonability of the match to actually be a word.
|
||||||
isRangeValid :: Document -> Int -> Int -> Bool
|
isRangeValid :: Lang -> Document -> Int -> Int -> Bool
|
||||||
isRangeValid doc start end =
|
isRangeValid = \case
|
||||||
(start == 0 ||
|
_ -> defaultIsRangeValid
|
||||||
isDifferent (doc ! (start - 1)) (doc ! start)) &&
|
|
||||||
(end == length doc ||
|
|
||||||
isDifferent (doc ! (end - 1)) (doc ! end))
|
|
||||||
where
|
where
|
||||||
charClass :: Char -> Char
|
defaultIsRangeValid :: Document -> Int -> Int -> Bool
|
||||||
charClass c
|
defaultIsRangeValid doc start end =
|
||||||
| Char.isLower c || Char.isUpper c = 'c'
|
(start == 0 ||
|
||||||
| Char.isDigit c = 'd'
|
isDifferent (doc ! (start - 1)) (doc ! start)) &&
|
||||||
| otherwise = c
|
(end == length doc ||
|
||||||
isDifferent :: Char -> Char -> Bool
|
isDifferent (doc ! (end - 1)) (doc ! end))
|
||||||
isDifferent a b = charClass a /= charClass b
|
where
|
||||||
|
charClass :: Char -> Char
|
||||||
|
charClass c
|
||||||
|
| Char.isLower c || Char.isUpper c = 'c'
|
||||||
|
| Char.isDigit c = 'd'
|
||||||
|
| otherwise = c
|
||||||
|
isDifferent :: Char -> Char -> Bool
|
||||||
|
isDifferent a b = charClass a /= charClass b
|
||||||
|
|
||||||
-- True iff a is followed by whitespaces and b.
|
-- True iff a is followed by whitespaces and b.
|
||||||
isAdjacent :: Document -> Int -> Int -> Bool
|
isAdjacent :: Document -> Int -> Int -> Bool
|
||||||
|
@ -17,6 +17,7 @@ import Test.Tasty
|
|||||||
import Test.Tasty.HUnit
|
import Test.Tasty.HUnit
|
||||||
|
|
||||||
import Duckling.Engine
|
import Duckling.Engine
|
||||||
|
import Duckling.Locale (Lang(..))
|
||||||
import Duckling.Types
|
import Duckling.Types
|
||||||
import Duckling.Regex.Types
|
import Duckling.Regex.Types
|
||||||
|
|
||||||
@ -30,7 +31,7 @@ emptyRegexTest :: TestTree
|
|||||||
emptyRegexTest = testCase "Empty Regex Test" $
|
emptyRegexTest = testCase "Empty Regex Test" $
|
||||||
case regex "()" of
|
case regex "()" of
|
||||||
Regex regex -> assertEqual "empty result" [] $
|
Regex regex -> assertEqual "empty result" [] $
|
||||||
runDuckling $ lookupRegexAnywhere "hey" regex
|
runDuckling $ lookupRegexAnywhere "hey" EN regex
|
||||||
_ -> assertFailure "expected a regex"
|
_ -> assertFailure "expected a regex"
|
||||||
|
|
||||||
unicodeAndRegexTest :: TestTree
|
unicodeAndRegexTest :: TestTree
|
||||||
@ -38,7 +39,7 @@ unicodeAndRegexTest = testCase "Unicode and Regex Test" $
|
|||||||
case regex "\\$([0-9]*)" of
|
case regex "\\$([0-9]*)" of
|
||||||
Regex regex -> do --
|
Regex regex -> do --
|
||||||
assertEqual "" expected $
|
assertEqual "" expected $
|
||||||
runDuckling $ lookupRegexAnywhere "\128526 $35" regex
|
runDuckling $ lookupRegexAnywhere "\128526 $35" EN regex
|
||||||
_ -> assertFailure "expected a regex"
|
_ -> assertFailure "expected a regex"
|
||||||
where
|
where
|
||||||
expected =
|
expected =
|
||||||
|
Loading…
Reference in New Issue
Block a user