mirror of
https://github.com/facebook/duckling.git
synced 2025-01-07 14:29:37 +03:00
bf89e34365
Reviewed By: JoelMarcey Differential Revision: D15439223 fbshipit-source-id: c5af3cb06318748142fe503945b38beffadfc28a
230 lines
7.8 KiB
Haskell
230 lines
7.8 KiB
Haskell
-- Copyright (c) 2016-present, Facebook, Inc.
|
|
-- All rights reserved.
|
|
--
|
|
-- This source code is licensed under the BSD-style license found in the
|
|
-- LICENSE file in the root directory of this source tree.
|
|
|
|
|
|
{-# LANGUAGE GADTs #-}
|
|
{-# LANGUAGE NamedFieldPuns #-}
|
|
{-# LANGUAGE NoRebindableSyntax #-}
|
|
|
|
|
|
module Duckling.Engine
|
|
( parseAndResolve
|
|
, lookupRegexAnywhere
|
|
, runDuckling
|
|
) where
|
|
|
|
import Control.DeepSeq
|
|
import Control.Monad.Extra
|
|
import Data.Aeson (toJSON)
|
|
import Data.ByteString (ByteString)
|
|
import Data.Functor.Identity
|
|
import Data.Maybe
|
|
import Data.Text (Text)
|
|
import Prelude
|
|
import qualified Data.Array as Array
|
|
import qualified Data.Foldable as Foldable
|
|
import qualified Data.List as L
|
|
import qualified Text.Regex.PCRE as PCRE
|
|
|
|
import Duckling.Dimensions.Types
|
|
import Duckling.Regex.Types
|
|
import Duckling.Resolve
|
|
import Duckling.Types
|
|
import Duckling.Types.Document (Document)
|
|
import Duckling.Types.Stash (Stash)
|
|
import qualified Duckling.Engine.Regex as Regex
|
|
import qualified Duckling.Types.Document as Document
|
|
import qualified Duckling.Types.Stash as Stash
|
|
|
|
-- -----------------------------------------------------------------
|
|
-- Engine
|
|
|
|
-- | The context in which engine computations run. It is a plain alias for
-- 'Identity', so running a computation simply unwraps its result.
type Duckling a = Identity a

-- | Extract the result of a 'Duckling' computation.
runDuckling :: Duckling a -> a
runDuckling = runIdentity
|
|
|
|
-- | Parse @input@ with @rules@ and resolve the nodes that were found.
--
-- The final stash is flattened with 'Stash.toPosOrderedList' and fully
-- evaluated before resolution; nodes for which 'resolveNode' returns
-- 'Nothing' are dropped.
parseAndResolve :: [Rule] -> Text -> Context -> Options -> [ResolvedToken]
parseAndResolve rules input context options =
  mapMaybe (resolveNode context options) (force orderedNodes)
  where
    document = Document.fromText input
    finalStash = runDuckling (parseString rules document)
    orderedNodes = Stash.toPosOrderedList finalStash
|
|
|
|
-- | Turn a match into a new node by running the rule's production on the
-- tokens of its route (in matching order).
--
-- Yields 'Nothing' when the route is empty or when the production itself
-- yields 'Nothing'. The resulting node spans from the start of the first
-- route node to the end of the last one, and keeps the route as children.
produce :: Match -> Maybe Node
produce (_, _, []) = Nothing
produce (Rule name _ production, _, reversedRoute@(Node{nodeRange = Range _ end} : _)) =
  force (production (map token route)) >>= build
  where
    -- The match stores its route most-recent-first; restore matching order.
    route = reverse reversedRoute

    build produced = case route of
      Node{nodeRange = Range begin _} : _ ->
        Just Node
          { nodeRange = Range begin end
          , token = produced
          , children = route
          , rule = Just name
          }
      [] -> Nothing
|
|
|
|
-- | Handle a regex match at a given position.
--
-- Yields no nodes when @position@ is at or past the document length;
-- otherwise matches once via 'Regex.matchOnce'.
lookupRegex :: Document -> PCRE.Regex -> Int -> Duckling [Node]
lookupRegex doc regex position
  | position >= Document.length doc = return []
  | otherwise = lookupRegexCommon doc regex position Regex.matchOnce
|
|
|
|
-- | Handle a regex match anywhere in the text, i.e. collect every match
-- reported by 'Regex.matchAll' starting from position 0.
lookupRegexAnywhere :: Document -> PCRE.Regex -> Duckling [Node]
lookupRegexAnywhere doc regex =
  lookupRegexCommon doc regex startOfDocument Regex.matchAll
  where
    startOfDocument = 0 :: Int
|
|
|
|
{-# INLINE lookupRegexCommon #-}
-- INLINE bloats the code a bit, but the code is better

-- | Shared implementation of the regex lookups: run @matchFun@ on the part
-- of the document starting at @position@ and convert every match into a
-- 'RegexMatch' node.
--
-- A match is dropped when it has no groups at all, when its whole-match
-- entry is @(0, 0)@, or when its translated range is rejected by
-- 'Document.isRangeValid'.
lookupRegexCommon
  :: Foldable t
  => Document
  -> PCRE.Regex
  -> Int
  -> (PCRE.Regex -> ByteString -> t PCRE.MatchArray)
  -> Duckling [Node]
lookupRegexCommon doc regex position matchFun =
  return (mapMaybe (toNode . Array.elems) rawMatches)
  where
    -- See Note [Regular expressions and Text] to understand what's going
    -- on here
    (substring, rangeToText, translateRange) =
      Document.byteStringFromPos doc position

    rawMatches = Foldable.toList (matchFun regex substring)

    -- The first entry describes the whole match, the rest the capture groups.
    toNode :: [(Int, Int)] -> Maybe Node
    toNode [] = Nothing
    toNode ((0, 0) : _) = Nothing
    toNode ((bsStart, bsLen) : groups)
      | Document.isRangeValid doc start end =
          Just Node
            { nodeRange = Range start end
            , token = Token RegexMatch (GroupMatch (map rangeToText groups))
            , children = []
            , rule = Nothing
            }
      | otherwise = Nothing
      where
        (start, end) = translateRange bsStart bsLen
|
|
|
|
-- | Handle one PatternItem at a given position.
--
-- A regex item is matched against the document at @position@; a predicate
-- item is looked up among the stash nodes listed from @position@ onwards.
-- In both cases only nodes accepted by 'isPositionValid' are considered.
lookupItem :: Document -> PatternItem -> Stash -> Int -> Duckling [Node]
lookupItem doc item stash position = case item of
  Regex re -> filter startsHere <$> lookupRegex doc re position
  Predicate p ->
    return
      [ candidate
      | candidate <-
          takeWhile startsHere (Stash.toPosOrderedListFrom stash position)
      , p (token candidate)
      ]
  where
    startsHere = isPositionValid position doc
|
|
|
|
-- | Handle one PatternItem anywhere in the text.
--
-- A regex item is searched for in the whole document; a predicate item
-- selects every stash node whose token satisfies the predicate.
lookupItemAnywhere :: Document -> PatternItem -> Stash -> Duckling [Node]
lookupItemAnywhere doc item stash = case item of
  Regex re -> lookupRegexAnywhere doc re
  Predicate p ->
    return [found | found <- Stash.toPosOrderedList stash, p (token found)]
|
|
|
|
-- | Whether a node can be used at @position@: checks, through
-- 'Document.isAdjacent', that the start of the node's range is adjacent to
-- @position@ in the document.
isPositionValid :: Int -> Document -> Node -> Bool
isPositionValid position doc Node{nodeRange = Range nodeStart _} =
  Document.isAdjacent doc position nodeStart
|
|
|
|
-- | A match is full if its rule pattern is empty.
|
|
-- (rule, endPosition, reversedRoute)
|
|
-- The rule component keeps only the pattern items that are still to be
-- matched, and the route lists the nodes consumed so far, most recent first.
type Match = (Rule, Int, [Node])
|
|
|
|
-- | Recursively augments `matches`.
-- Discards partial matches stuck by a regex.
--
-- A match whose pattern is exhausted is returned unchanged. Otherwise it is
-- advanced as far as possible; the match itself is also kept when its next
-- pattern item is a predicate, but not when it is a regex.
matchAll :: Document -> Stash -> [Match] -> Duckling [Match]
matchAll sentence stash matches = concatMapM advance matches
  where
    advance :: Match -> Duckling [Match]
    advance match@(Rule{pattern = []}, _, _) = return [match]
    advance match@(Rule{pattern = nextItem : _}, _, _) = do
      oneStep <- matchFirst sentence stash match
      deeper <- matchAll sentence stash oneStep
      return $ case nextItem of
        Regex _ -> deeper
        Predicate _ -> match : deeper
|
|
|
|
-- | Returns all matches matching the first pattern item of `match`,
-- resuming from a Match position.
--
-- Each returned match carries the rule with that first item removed and the
-- route extended by the node that was found. A match with an empty pattern
-- yields nothing.
matchFirst :: Document -> Stash -> Match -> Duckling [Match]
matchFirst _ _ (Rule{pattern = []}, _, _) = return []
matchFirst sentence stash (rule@Rule{pattern = item : rest}, position, route) = do
  found <- lookupItem sentence item stash position
  return [mkMatch route remainingRule node | node <- found]
  where
    remainingRule = rule{pattern = rest}
|
|
|
|
-- | Returns all matches matching the first pattern item of `match`,
-- starting anywhere.
--
-- Each returned match starts a fresh route with the node that was found and
-- carries the rule with its first item removed. A rule with an empty
-- pattern yields nothing.
matchFirstAnywhere :: Document -> Stash -> Rule -> Duckling [Match]
matchFirstAnywhere _ _ Rule{pattern = []} = return []
matchFirstAnywhere sentence stash rule@Rule{pattern = item : rest} = do
  found <- lookupItemAnywhere sentence item stash
  return [mkMatch [] remainingRule node | node <- found]
  where
    remainingRule = rule{pattern = rest}
|
|
|
|
{-# INLINE mkMatch #-}
-- | Build a match from @newRule@ by pushing @node@ in front of @route@.
-- The match position becomes the end of the node's range. The extended
-- route is evaluated to weak head normal form before the tuple is returned.
mkMatch :: [Node] -> Rule -> Node -> Match
mkMatch route newRule node@Node{nodeRange = Range _ endPos} =
  extended `seq` (newRule, endPos, extended)
  where
    extended = node : route
|
|
|
|
-- | Finds new matches resulting from newly added tokens.
-- Produces new tokens from full matches.
--
-- Returns the stash of nodes produced during this pass together with the
-- partial matches to carry over (the new partial ones followed by the
-- incoming @matches@).
parseString1
  :: [Rule] -> Document -> Stash -> Stash -> [Match]
  -> Duckling (Stash, [Match])
parseString1 rules sentence stash new matches = do
  -- Existing partial matches that can advance because of `new`.
  advanced <- concatMapM (matchFirst sentence new) matches

  -- Matches whose first pattern item is found via `new` (or via a regex).
  started <- concatMapM (matchFirstAnywhere sentence new) rules

  -- Recursively match the remaining pattern items against `stash`, then
  -- separate the matches whose pattern is exhausted from the others.
  (full, partial) <-
    L.partition isFull <$> matchAll sentence stash (advanced ++ started)

  -- Produce full matches as new tokens
  return (Stash.fromList (mapMaybe produce full), partial ++ matches)
  where
    isFull (Rule{pattern = remaining}, _, _) = null remaining
|
|
|
|
-- | Produces all tokens recursively.
--
-- Repeats 'parseString1' passes, merging the nodes produced by each pass
-- into the stash, until a pass produces no node; the stash accumulated up
-- to that point is returned.
saturateParseString
  :: [Rule] -> Document -> Stash -> Stash -> [Match] -> Duckling Stash
saturateParseString rules sentence stash new matches = loop stash new matches
  where
    loop accumulated latest pending = do
      (produced, pending') <-
        parseString1 rules sentence accumulated latest pending
      if Stash.null produced
        then return accumulated
        else loop (Stash.union accumulated produced) produced pending'
|
|
|
|
-- | Run the rules over a document and return the stash of all nodes found.
--
-- Returns an empty stash when the first pass produces nothing.
parseString :: [Rule] -> Document -> Duckling Stash
parseString rules sentence = do
  -- On the first pass we try all the rules.
  (firstPass, partialMatches) <-
    parseString1 rules sentence Stash.empty Stash.empty []
  if Stash.null firstPass
    then return Stash.empty
    -- For subsequent passes, we only try rules starting with a predicate.
    else
      saturateParseString
        headPredicateRules sentence firstPass firstPass partialMatches
  where
    headPredicateRules = filter startsWithPredicate rules

    startsWithPredicate Rule{pattern = Predicate _ : _} = True
    startsWithPredicate _ = False
|
|
|
|
-- | Resolve the token of a node in the given context.
--
-- Yields 'Nothing' when 'resolve' fails for the token's data; otherwise the
-- resolved token keeps the node, its range, the resolved value tagged with
-- the token's dimension, and the latent flag reported by 'resolve'.
resolveNode :: Context -> Options -> Node -> Maybe ResolvedToken
resolveNode context options original@Node{token = Token dimension tokenData, nodeRange = span'} =
  case resolve context options tokenData of
    Nothing -> Nothing
    Just (value, latent) ->
      Just Resolved
        { range = span'
        , node = original
        , rval = RVal dimension value
        , isLatent = latent
        }
|