duckling/Duckling/Engine.hs

232 lines
8.1 KiB
Haskell
Raw Normal View History

-- Copyright (c) 2016-present, Facebook, Inc.
-- All rights reserved.
--
-- This source code is licensed under the BSD-style license found in the
-- LICENSE file in the root directory of this source tree.
{-# LANGUAGE GADTs #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE NoRebindableSyntax #-}
module Duckling.Engine
( parseAndResolve
, lookupRegexAnywhere
, runDuckling
) where
import Control.DeepSeq
import Control.Monad.Extra
import Data.ByteString (ByteString)
import Data.Functor.Identity
import Data.Maybe
import Data.Text (Text)
import Prelude
Locales support Summary: * Locales support for the library, following `<Lang>_<Region>` with ISO 639-1 code for `<Lang>` and ISO 3166-1 alpha-2 code for `<Region>` (#33) * `Locale` opaque type (composite of `Lang` and `Region`) with `makeLocale` smart constructor to only allow valid `(Lang, Region)` combinations * API: `Context`'s `lang` parameter has been replaced by `locale`, with optional `Region` and backward compatibility. * `Rules/<Lang>.hs` exposes - `langRules`: cross-locale rules for `<Lang>`, from `<Dimension>/<Lang>/Rules.hs` - `localeRules`: locale-specific rules, from `<Dimension>/<Lang>/<Region>/Rules.hs` - `defaultRules`: `langRules` + specific rules from select locales to ensure backward-compatibility * Corpus, tests & classifiers - 1 classifier per locale, with default classifier (`<Lang>_XX`) when no locale provided (backward-compatible) - Default classifiers are built on existing corpus - Locale classifiers are built on - `<Dimension>/<Lang>/Corpus.hs` exposes a common `corpus` to all locales of `<Lang>` - `<Dimension>/<Lang>/<Region>/Corpus.hs` exposes `allExamples`: a list of examples specific to the locale (following `<Dimension>/<Lang>/<Region>/Rules.hs`). - Locale classifiers use the language corpus extended with the locale examples as training set. - Locale examples need to use the same `Context` (i.e. reference time) as the language corpus. - For backward compatibility, `<Dimension>/<Lang>/Corpus.hs` can expose also `defaultCorpus`, which is `corpus` augmented with specific examples. This is controlled by `getDefaultCorpusForLang` in `Duckling.Ranking.Generate`. - Tests run against each classifier to make sure runtime works as expected. * MM/DD (en_US) vs DD/MM (en_GB) example to illustrate Reviewed By: JonCoens, blandinw Differential Revision: D6038096 fbshipit-source-id: f29c28d
2017-10-13 18:15:32 +03:00
import qualified Data.Array as Array
import qualified Data.Foldable as Foldable
import qualified Data.List as L
import qualified Text.Regex.PCRE as PCRE
import Duckling.Dimensions.Types
import Duckling.Locale
import Duckling.Regex.Types
import Duckling.Resolve
import Duckling.Types hiding (regex)
import Duckling.Types.Document (Document)
import Duckling.Types.Stash (Stash)
Locales support Summary: * Locales support for the library, following `<Lang>_<Region>` with ISO 639-1 code for `<Lang>` and ISO 3166-1 alpha-2 code for `<Region>` (#33) * `Locale` opaque type (composite of `Lang` and `Region`) with `makeLocale` smart constructor to only allow valid `(Lang, Region)` combinations * API: `Context`'s `lang` parameter has been replaced by `locale`, with optional `Region` and backward compatibility. * `Rules/<Lang>.hs` exposes - `langRules`: cross-locale rules for `<Lang>`, from `<Dimension>/<Lang>/Rules.hs` - `localeRules`: locale-specific rules, from `<Dimension>/<Lang>/<Region>/Rules.hs` - `defaultRules`: `langRules` + specific rules from select locales to ensure backward-compatibility * Corpus, tests & classifiers - 1 classifier per locale, with default classifier (`<Lang>_XX`) when no locale provided (backward-compatible) - Default classifiers are built on existing corpus - Locale classifiers are built on - `<Dimension>/<Lang>/Corpus.hs` exposes a common `corpus` to all locales of `<Lang>` - `<Dimension>/<Lang>/<Region>/Corpus.hs` exposes `allExamples`: a list of examples specific to the locale (following `<Dimension>/<Lang>/<Region>/Rules.hs`). - Locale classifiers use the language corpus extended with the locale examples as training set. - Locale examples need to use the same `Context` (i.e. reference time) as the language corpus. - For backward compatibility, `<Dimension>/<Lang>/Corpus.hs` can expose also `defaultCorpus`, which is `corpus` augmented with specific examples. This is controlled by `getDefaultCorpusForLang` in `Duckling.Ranking.Generate`. - Tests run against each classifier to make sure runtime works as expected. * MM/DD (en_US) vs DD/MM (en_GB) example to illustrate Reviewed By: JonCoens, blandinw Differential Revision: D6038096 fbshipit-source-id: f29c28d
2017-10-13 18:15:32 +03:00
import qualified Duckling.Engine.Regex as Regex
import qualified Duckling.Types.Document as Document
import qualified Duckling.Types.Stash as Stash
-- -----------------------------------------------------------------
-- Engine
parseAndResolve :: [Rule] -> Text -> Context -> Options -> [ResolvedToken]
parseAndResolve rules input context@Context{locale = Locale lang _} options =
mapMaybe
(resolveNode context options)
$ force $ Stash.toPosOrderedList
$ runDuckling $ parseString lang rules (Document.fromText input)
type Duckling a = Identity a
runDuckling :: Duckling a -> a
runDuckling ma = runIdentity ma
-- | A match is full if its rule pattern is empty.
-- (rule, endPosition, reversedRoute)
type Match = (Rule, Int, [Node])
resolveNode :: Context -> Options -> Node -> Maybe ResolvedToken
resolveNode context options n@Node{token = (Token dim dd), nodeRange = r}
= do
(val, latent) <- resolve context options dd
Just Resolved
{ range = r
, node = n
, rval = RVal dim val
, isLatent = latent
}
parseString :: Lang -> [Rule] -> Document -> Duckling Stash
parseString lang rules sentence = do
(new, partialMatches) <-
-- One the first pass we try all the rules
parseString1 lang rules sentence Stash.empty Stash.empty []
if Stash.null new
then return Stash.empty
else
-- For subsequent passes, we only try rules starting with a predicate.
saturateParseString lang headPredicateRules sentence new new partialMatches
where
headPredicateRules =
[ rule | rule@Rule{pattern = (Predicate _ : _)} <- rules ]
-- | Produces all tokens recursively.
saturateParseString
:: Lang -> [Rule] -> Document -> Stash -> Stash -> [Match] -> Duckling Stash
saturateParseString lang rules sentence stash new matches = do
(new', matches') <- parseString1 lang rules sentence stash new matches
let stash' = Stash.union stash new'
if Stash.null new'
then return stash
else saturateParseString lang rules sentence stash' new' matches'
-- | Finds new matches resulting from newly added tokens.
-- Produces new tokens from full matches.
parseString1
:: Lang -> [Rule] -> Document -> Stash -> Stash -> [Match]
-> Duckling (Stash, [Match])
parseString1 lang rules sentence stash new matches = do
-- Recursively match patterns.
-- Find which `matches` can advance because of `new`.
newPartial <- concatMapM (matchFirst sentence lang new) matches
-- Find new matches resulting from newly added tokens (`new`)
newMatches <- concatMapM (matchFirstAnywhere sentence lang new) rules
(full, partial) <- L.partition (\(Rule {pattern}, _, _) -> null pattern)
<$> matchAll sentence lang stash (newPartial ++ newMatches)
-- Produce full matches as new tokens
return ( Stash.fromList $ mapMaybe produce full
, partial ++ matches
)
-- | Recursively augments `matches`.
-- Discards partial matches stuck by a regex.
matchAll :: Document -> Lang -> Stash -> [Match] -> Duckling [Match]
matchAll sentence lang stash matches = concatMapM mkNextMatches matches
where
mkNextMatches :: Match -> Duckling [Match]
mkNextMatches match@(Rule {pattern = []}, _, _) = return [ match ]
mkNextMatches match@(Rule {pattern = p:_}, _, _) = do
nextMatches <- matchAll sentence lang stash =<< matchFirst sentence lang stash match
return $ case p of
Regex _ -> nextMatches
Predicate _ -> match:nextMatches
produce :: Match -> Maybe Node
produce (_, _, []) = Nothing
produce (Rule name _ production, _, etuor@(Node {nodeRange = Range _ e}:_)) = do
let route = reverse etuor
token <- force $ production $ map token route
case route of
(Node {nodeRange = Range p _}:_) -> Just Node
{ nodeRange = Range p e
, token = token
, children = route
, rule = Just name
}
[] -> Nothing
-- | Returns all matches matching the first pattern item of `match`,
-- resuming from a Match position
matchFirst :: Document -> Lang -> Stash -> Match -> Duckling [Match]
matchFirst _ _ _ (Rule {pattern = []}, _, _) = return []
matchFirst sentence lang stash (rule@Rule{pattern = p : ps}, position, route) =
map (mkMatch route newRule) <$> lookupItem sentence lang p stash position
where
newRule = rule { pattern = ps }
-- | Returns all matches matching the first pattern item of `match`,
-- starting anywhere
matchFirstAnywhere :: Document -> Lang -> Stash -> Rule -> Duckling [Match]
matchFirstAnywhere _sentence _lang _stash Rule {pattern = []} = return []
matchFirstAnywhere sentence lang stash rule@Rule{pattern = p : ps} =
map (mkMatch [] newRule) <$> lookupItemAnywhere sentence lang p stash
where
newRule = rule { pattern = ps }
-- | Handle one PatternItem at a given position
lookupItem :: Document -> Lang -> PatternItem -> Stash -> Int -> Duckling [Node]
lookupItem doc lang (Regex re) _ position =
filter (isPositionValid position doc) <$>
lookupRegex doc lang re position
lookupItem doc _lang (Predicate p) stash position =
return $
filter (p . token) $
takeWhile (isPositionValid position doc) $
Stash.toPosOrderedListFrom stash position
-- | Handle one PatternItem anywhere in the text
lookupItemAnywhere :: Document -> Lang -> PatternItem -> Stash -> Duckling [Node]
lookupItemAnywhere doc lang (Regex re) _ = lookupRegexAnywhere doc lang re
lookupItemAnywhere _doc _lang (Predicate p) stash =
return $ filter (p . token) $ Stash.toPosOrderedList stash
isPositionValid :: Int -> Document -> Node -> Bool
isPositionValid position sentence Node{nodeRange = Range start _} =
Document.isAdjacent sentence position start
{-# INLINE mkMatch #-}
mkMatch :: [Node] -> Rule -> Node -> Match
mkMatch route newRule node@Node{nodeRange = Range _ pos'} =
newRoute `seq` (newRule, pos', newRoute)
where newRoute = node:route
-- | Handle a regex match at a given position
lookupRegex :: Document -> Lang -> PCRE.Regex -> Int -> Duckling [Node]
lookupRegex doc _lang _regex position | position >= Document.length doc = return []
lookupRegex doc lang regex position =
lookupRegexCommon doc lang regex position Regex.matchOnce
-- | Handle a regex match anywhere in the text
lookupRegexAnywhere :: Document -> Lang -> PCRE.Regex -> Duckling [Node]
lookupRegexAnywhere doc lang regex = lookupRegexCommon doc lang regex 0 Regex.matchAll
{-# INLINE lookupRegexCommon #-}
-- INLINE bloats the code a bit, but the code is better
lookupRegexCommon
:: Foldable t
=> Document
-> Lang
-> PCRE.Regex
-> Int
-> (PCRE.Regex -> ByteString -> t PCRE.MatchArray)
-> Duckling [Node]
lookupRegexCommon doc lang regex position matchFun = return nodes
where
-- See Note [Regular expressions and Text] from Document.hs to understand
-- what's going on here
(substring, rangeToText, translateRange) =
Document.byteStringFromPos doc position
nodes = mapMaybe (f . Array.elems)
$ Foldable.toList
$ matchFun regex substring
f :: [(Int, Int)] -> Maybe Node
f [] = Nothing
f ((0,0):_) = Nothing
f ((bsStart, bsLen):groups) =
if Document.isRangeValid lang doc start end
then Just node
else Nothing
where
textGroups = map rangeToText groups
(start, end) = translateRange bsStart bsLen
node = Node
{ nodeRange = Range start end
, token = Token RegexMatch (GroupMatch textGroups)
, children = []
, rule = Nothing
}