Make isRangeValid take Lang as input

Summary: There are different implementations of isRangeValid that work well for different languages, thus it makes sense to facilitate having different implementations based on the language. Reviewed By: patapizza Differential Revision: D28362777 fbshipit-source-id: 5f2991d54af3095c8e95cf534e2dd3b4a34dee3a
2024-10-26 21:59:44 +03:00 · 2021-05-17 13:08:57 -07:00 · 2021-05-17 13:08:57 -07:00 · 69d951220e
commit 69d951220e
parent 7762af850a
4 changed files with 66 additions and 56 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -3,6 +3,7 @@
 ## 0.2.X.X
 ### Core
 * Make `isRangeValid` behave differently based on lang
 ### Rulesets
 * CA (Catalan)
--- a/Duckling/Engine.hs
+++ b/Duckling/Engine.hs
@ -27,6 +27,7 @@ import qualified Data.List as L
 import qualified Text.Regex.PCRE as PCRE
 import Duckling.Dimensions.Types
 import Duckling.Locale
 import Duckling.Regex.Types
 import Duckling.Resolve
 import Duckling.Types hiding (regex)
@ -40,11 +41,11 @@ import qualified Duckling.Types.Stash as Stash
 -- Engine
 parseAndResolve :: [Rule] -> Text -> Context -> Options -> [ResolvedToken]
-parseAndResolve rules input context options =
+parseAndResolve rules input context@Context{locale = Locale lang _} options =
  mapMaybe
    (resolveNode context options)
    $ force $ Stash.toPosOrderedList
-      $ runDuckling $ parseString rules (Document.fromText input)
+      $ runDuckling $ parseString lang rules (Document.fromText input)
 type Duckling a = Identity a
@ -66,45 +67,45 @@ resolveNode context options n@Node{token = (Token dim dd), nodeRange = r}
    , isLatent = latent
    }
-parseString :: [Rule] -> Document -> Duckling Stash
+parseString :: Lang -> [Rule] -> Document -> Duckling Stash
-parseString rules sentence = do
+parseString lang rules sentence = do
  (new, partialMatches) <-
    -- One the first pass we try all the rules
-    parseString1 rules sentence Stash.empty Stash.empty []
+    parseString1 lang rules sentence Stash.empty Stash.empty []
  if Stash.null new
    then return Stash.empty
    else
    -- For subsequent passes, we only try rules starting with a predicate.
-    saturateParseString headPredicateRules sentence new new partialMatches
+    saturateParseString lang headPredicateRules sentence new new partialMatches
  where
  headPredicateRules =
    [ rule | rule@Rule{pattern = (Predicate _ : _)} <- rules ]
 -- | Produces all tokens recursively.
 saturateParseString
-  :: [Rule] -> Document -> Stash -> Stash -> [Match] -> Duckling Stash
+  :: Lang -> [Rule] -> Document -> Stash -> Stash -> [Match] -> Duckling Stash
-saturateParseString rules sentence stash new matches = do
+saturateParseString lang rules sentence stash new matches = do
-  (new', matches') <- parseString1 rules sentence stash new matches
+  (new', matches') <- parseString1 lang rules sentence stash new matches
  let stash' = Stash.union stash new'
  if Stash.null new'
    then return stash
-    else saturateParseString rules sentence stash' new' matches'
+    else saturateParseString lang rules sentence stash' new' matches'
 -- | Finds new matches resulting from newly added tokens.
 -- Produces new tokens from full matches.
 parseString1
-  :: [Rule] -> Document -> Stash -> Stash -> [Match]
+  :: Lang -> [Rule] -> Document -> Stash -> Stash -> [Match]
  -> Duckling (Stash, [Match])
-parseString1 rules sentence stash new matches = do
+parseString1 lang rules sentence stash new matches = do
  -- Recursively match patterns.
  -- Find which `matches` can advance because of `new`.
-  newPartial <- concatMapM (matchFirst sentence new) matches
+  newPartial <- concatMapM (matchFirst sentence lang new) matches
  -- Find new matches resulting from newly added tokens (`new`)
-  newMatches <- concatMapM (matchFirstAnywhere sentence new) rules
+  newMatches <- concatMapM (matchFirstAnywhere sentence lang new) rules
  (full, partial) <- L.partition (\(Rule {pattern}, _, _) -> null pattern)
-    <$> matchAll sentence stash (newPartial ++ newMatches)
+    <$> matchAll sentence lang stash (newPartial ++ newMatches)
  -- Produce full matches as new tokens
  return ( Stash.fromList $ mapMaybe produce full
@ -113,13 +114,13 @@ parseString1 rules sentence stash new matches = do
 -- | Recursively augments `matches`.
 -- Discards partial matches stuck by a regex.
-matchAll :: Document -> Stash -> [Match] -> Duckling [Match]
+matchAll :: Document -> Lang -> Stash -> [Match] -> Duckling [Match]
-matchAll sentence stash matches = concatMapM mkNextMatches matches
+matchAll sentence lang stash matches = concatMapM mkNextMatches matches
  where
    mkNextMatches :: Match -> Duckling [Match]
    mkNextMatches match@(Rule {pattern = []}, _, _) = return [ match ]
    mkNextMatches match@(Rule {pattern = p:_}, _, _) = do
-      nextMatches <- matchAll sentence stash =<< matchFirst sentence stash match
+      nextMatches <- matchAll sentence lang stash =<< matchFirst sentence lang stash match
      return $ case p of
        Regex _ -> nextMatches
        Predicate _ -> match:nextMatches
@ -140,37 +141,37 @@ produce (Rule name _ production, _, etuor@(Node {nodeRange = Range _ e}:_)) = do
 -- | Returns all matches matching the first pattern item of `match`,
 -- resuming from a Match position
-matchFirst :: Document -> Stash -> Match -> Duckling [Match]
+matchFirst :: Document -> Lang -> Stash -> Match -> Duckling [Match]
-matchFirst _ _ (Rule {pattern = []}, _, _) = return []
+matchFirst _ _ _ (Rule {pattern = []}, _, _) = return []
-matchFirst sentence stash (rule@Rule{pattern = p : ps}, position, route) =
+matchFirst sentence lang stash (rule@Rule{pattern = p : ps}, position, route) =
-  map (mkMatch route newRule) <$> lookupItem sentence p stash position
+  map (mkMatch route newRule) <$> lookupItem sentence lang p stash position
  where
  newRule = rule { pattern = ps }
 -- | Returns all matches matching the first pattern item of `match`,
 -- starting anywhere
-matchFirstAnywhere :: Document -> Stash -> Rule -> Duckling [Match]
+matchFirstAnywhere :: Document -> Lang -> Stash -> Rule -> Duckling [Match]
-matchFirstAnywhere _sentence _stash Rule {pattern = []} = return []
+matchFirstAnywhere _sentence _lang _stash Rule {pattern = []} = return []
-matchFirstAnywhere sentence stash rule@Rule{pattern = p : ps} =
+matchFirstAnywhere sentence lang stash rule@Rule{pattern = p : ps} =
-  map (mkMatch [] newRule) <$> lookupItemAnywhere sentence p stash
+  map (mkMatch [] newRule) <$> lookupItemAnywhere sentence lang p stash
  where
  newRule = rule { pattern = ps }
 -- | Handle one PatternItem at a given position
-lookupItem :: Document -> PatternItem -> Stash -> Int -> Duckling [Node]
+lookupItem :: Document -> Lang -> PatternItem -> Stash -> Int -> Duckling [Node]
-lookupItem doc (Regex re) _ position =
+lookupItem doc lang (Regex re) _ position =
  filter (isPositionValid position doc) <$>
-  lookupRegex doc re position
+  lookupRegex doc lang re position
-lookupItem doc (Predicate p) stash position =
+lookupItem doc _lang (Predicate p) stash position =
  return $
  filter (p . token) $
  takeWhile (isPositionValid position doc) $
  Stash.toPosOrderedListFrom stash position
 -- | Handle one PatternItem anywhere in the text
-lookupItemAnywhere :: Document -> PatternItem -> Stash -> Duckling [Node]
+lookupItemAnywhere :: Document -> Lang -> PatternItem -> Stash -> Duckling [Node]
-lookupItemAnywhere doc (Regex re) _ = lookupRegexAnywhere doc re
+lookupItemAnywhere doc lang (Regex re) _ = lookupRegexAnywhere doc lang re
-lookupItemAnywhere _doc (Predicate p) stash =
+lookupItemAnywhere _doc _lang (Predicate p) stash =
  return $ filter (p . token) $ Stash.toPosOrderedList stash
 isPositionValid :: Int -> Document -> Node -> Bool
@ -184,25 +185,26 @@ mkMatch route newRule node@Node{nodeRange = Range _ pos'} =
  where newRoute = node:route
 -- | Handle a regex match at a given position
-lookupRegex :: Document -> PCRE.Regex -> Int -> Duckling [Node]
+lookupRegex :: Document -> Lang -> PCRE.Regex -> Int -> Duckling [Node]
-lookupRegex doc _regex position | position >= Document.length doc = return []
+lookupRegex doc _lang _regex position | position >= Document.length doc = return []
-lookupRegex doc regex position =
+lookupRegex doc lang regex position =
-  lookupRegexCommon doc regex position Regex.matchOnce
+  lookupRegexCommon doc lang regex position Regex.matchOnce
 -- | Handle a regex match anywhere in the text
-lookupRegexAnywhere :: Document -> PCRE.Regex -> Duckling [Node]
+lookupRegexAnywhere :: Document -> Lang -> PCRE.Regex -> Duckling [Node]
-lookupRegexAnywhere doc regex = lookupRegexCommon doc regex 0 Regex.matchAll
+lookupRegexAnywhere doc lang regex = lookupRegexCommon doc lang regex 0 Regex.matchAll
 {-# INLINE lookupRegexCommon #-}
 -- INLINE bloats the code a bit, but the code is better
 lookupRegexCommon
  :: Foldable t
  => Document
  -> Lang
  -> PCRE.Regex
  -> Int
  -> (PCRE.Regex -> ByteString -> t PCRE.MatchArray)
  -> Duckling [Node]
-lookupRegexCommon doc regex position matchFun = return nodes
+lookupRegexCommon doc lang regex position matchFun = return nodes
  where
  -- See Note [Regular expressions and Text] from Document.hs to understand
  -- what's going on here
@ -215,7 +217,7 @@ lookupRegexCommon doc regex position matchFun = return nodes
  f [] = Nothing
  f ((0,0):_) = Nothing
  f ((bsStart, bsLen):groups) =
-    if Document.isRangeValid doc start end
+    if Document.isRangeValid lang doc start end
      then Just node
      else Nothing
    where
--- a/Duckling/Types/Document.hs
+++ b/Duckling/Types/Document.hs
@ -6,6 +6,7 @@
 {-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE FlexibleContexts #-}
 {-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE RecordWildCards #-}
 module Duckling.Types.Document
@ -32,6 +33,7 @@ import qualified Data.Text.Encoding as Text
 import qualified Data.Text as Text
 import qualified Data.Text.Internal.Unsafe.Char as UText
 import Duckling.Locale (Lang(..))
 data Document = Document
  { rawInput :: !Text
@ -114,20 +116,24 @@ fromText rawInput = Document{..}
 -- As regexes are matched without whitespace delimitator, we need to check
 -- the reasonability of the match to actually be a word.
-isRangeValid :: Document -> Int -> Int -> Bool
+isRangeValid :: Lang -> Document -> Int -> Int -> Bool
-isRangeValid doc start end =
+isRangeValid = \case
-  (start == 0 ||
+  _ -> defaultIsRangeValid
      isDifferent (doc ! (start - 1)) (doc ! start)) &&
  (end == length doc ||
      isDifferent (doc ! (end - 1)) (doc ! end))
  where
-    charClass :: Char -> Char
+    defaultIsRangeValid :: Document -> Int -> Int -> Bool
-    charClass c
+    defaultIsRangeValid doc start end =
-      | Char.isLower c || Char.isUpper c = 'c'
+      (start == 0 ||
-      | Char.isDigit c = 'd'
+        isDifferent (doc ! (start - 1)) (doc ! start)) &&
-      | otherwise = c
+      (end == length doc ||
-    isDifferent :: Char -> Char -> Bool
+        isDifferent (doc ! (end - 1)) (doc ! end))
-    isDifferent a b = charClass a /= charClass b
+      where
        charClass :: Char -> Char
        charClass c
          | Char.isLower c || Char.isUpper c = 'c'
          | Char.isDigit c = 'd'
          | otherwise = c
        isDifferent :: Char -> Char -> Bool
        isDifferent a b = charClass a /= charClass b
 -- True iff a is followed by whitespaces and b.
 isAdjacent :: Document -> Int -> Int -> Bool
--- a/tests/Duckling/Engine/Tests.hs
+++ b/tests/Duckling/Engine/Tests.hs
@ -17,6 +17,7 @@ import Test.Tasty
 import Test.Tasty.HUnit
 import Duckling.Engine
 import Duckling.Locale (Lang(..))
 import Duckling.Types
 import Duckling.Regex.Types
@ -30,7 +31,7 @@ emptyRegexTest :: TestTree
 emptyRegexTest = testCase "Empty Regex Test" $
  case regex "()" of
    Regex regex -> assertEqual "empty result" [] $
-      runDuckling $ lookupRegexAnywhere "hey" regex
+      runDuckling $ lookupRegexAnywhere "hey" EN regex
    _ -> assertFailure "expected a regex"
 unicodeAndRegexTest :: TestTree
@ -38,7 +39,7 @@ unicodeAndRegexTest = testCase "Unicode and Regex Test" $
  case regex "\\$([0-9]*)" of
    Regex regex -> do --
      assertEqual "" expected $
-        runDuckling $ lookupRegexAnywhere "\128526 $35" regex
+        runDuckling $ lookupRegexAnywhere "\128526 $35" EN regex
    _ -> assertFailure "expected a regex"
  where
  expected =