2017-03-08 21:33:55 +03:00
|
|
|
-- Copyright (c) 2016-present, Facebook, Inc.
|
|
|
|
-- All rights reserved.
|
|
|
|
--
|
|
|
|
-- This source code is licensed under the BSD-style license found in the
|
2019-05-22 20:36:43 +03:00
|
|
|
-- LICENSE file in the root directory of this source tree.
|
2017-03-08 21:33:55 +03:00
|
|
|
|
|
|
|
{-# LANGUAGE GADTs #-}
|
|
|
|
{-# LANGUAGE NamedFieldPuns #-}
|
|
|
|
{-# LANGUAGE NoRebindableSyntax #-}
|
2018-01-30 03:41:23 +03:00
|
|
|
|
2017-03-08 21:33:55 +03:00
|
|
|
module Duckling.Engine
|
|
|
|
( parseAndResolve
|
2017-04-28 18:58:31 +03:00
|
|
|
, lookupRegexAnywhere
|
2017-04-01 00:05:57 +03:00
|
|
|
, runDuckling
|
2017-03-08 21:33:55 +03:00
|
|
|
) where
|
|
|
|
|
|
|
|
import Control.DeepSeq
|
2017-04-01 00:05:57 +03:00
|
|
|
import Control.Monad.Extra
|
2018-03-20 20:14:37 +03:00
|
|
|
import Data.Aeson (toJSON)
|
2017-04-28 18:58:31 +03:00
|
|
|
import Data.ByteString (ByteString)
|
2017-04-01 00:05:57 +03:00
|
|
|
import Data.Functor.Identity
|
2017-03-08 21:33:55 +03:00
|
|
|
import Data.Maybe
|
|
|
|
import Data.Text (Text)
|
|
|
|
import Prelude
|
Locales support
Summary:
* Locales support for the library, following `<Lang>_<Region>` with ISO 639-1 code for `<Lang>` and ISO 3166-1 alpha-2 code for `<Region>` (#33)
* `Locale` opaque type (composite of `Lang` and `Region`) with `makeLocale` smart constructor to only allow valid `(Lang, Region)` combinations
* API: `Context`'s `lang` parameter has been replaced by `locale`, with optional `Region` and backward compatibility.
* `Rules/<Lang>.hs` exposes
- `langRules`: cross-locale rules for `<Lang>`, from `<Dimension>/<Lang>/Rules.hs`
- `localeRules`: locale-specific rules, from `<Dimension>/<Lang>/<Region>/Rules.hs`
- `defaultRules`: `langRules` + specific rules from select locales to ensure backward-compatibility
* Corpus, tests & classifiers
- 1 classifier per locale, with default classifier (`<Lang>_XX`) when no locale provided (backward-compatible)
- Default classifiers are built on existing corpus
- Locale classifiers are built on
- `<Dimension>/<Lang>/Corpus.hs` exposes a common `corpus` to all locales of `<Lang>`
- `<Dimension>/<Lang>/<Region>/Corpus.hs` exposes `allExamples`: a list of examples specific to the locale (following `<Dimension>/<Lang>/<Region>/Rules.hs`).
- Locale classifiers use the language corpus extended with the locale examples as training set.
- Locale examples need to use the same `Context` (i.e. reference time) as the language corpus.
- For backward compatibility, `<Dimension>/<Lang>/Corpus.hs` can expose also `defaultCorpus`, which is `corpus` augmented with specific examples. This is controlled by `getDefaultCorpusForLang` in `Duckling.Ranking.Generate`.
- Tests run against each classifier to make sure runtime works as expected.
* MM/DD (en_US) vs DD/MM (en_GB) example to illustrate
Reviewed By: JonCoens, blandinw
Differential Revision: D6038096
fbshipit-source-id: f29c28d
2017-10-13 18:15:32 +03:00
|
|
|
import qualified Data.Array as Array
|
|
|
|
import qualified Data.Foldable as Foldable
|
|
|
|
import qualified Data.List as L
|
2017-03-08 21:33:55 +03:00
|
|
|
import qualified Text.Regex.PCRE as PCRE
|
|
|
|
|
|
|
|
import Duckling.Dimensions.Types
|
|
|
|
import Duckling.Regex.Types
|
|
|
|
import Duckling.Resolve
|
|
|
|
import Duckling.Types
|
2017-04-26 02:23:07 +03:00
|
|
|
import Duckling.Types.Document (Document)
|
2017-04-26 02:23:08 +03:00
|
|
|
import Duckling.Types.Stash (Stash)
|
Locales support
Summary:
* Locales support for the library, following `<Lang>_<Region>` with ISO 639-1 code for `<Lang>` and ISO 3166-1 alpha-2 code for `<Region>` (#33)
* `Locale` opaque type (composite of `Lang` and `Region`) with `makeLocale` smart constructor to only allow valid `(Lang, Region)` combinations
* API: `Context`'s `lang` parameter has been replaced by `locale`, with optional `Region` and backward compatibility.
* `Rules/<Lang>.hs` exposes
- `langRules`: cross-locale rules for `<Lang>`, from `<Dimension>/<Lang>/Rules.hs`
- `localeRules`: locale-specific rules, from `<Dimension>/<Lang>/<Region>/Rules.hs`
- `defaultRules`: `langRules` + specific rules from select locales to ensure backward-compatibility
* Corpus, tests & classifiers
- 1 classifier per locale, with default classifier (`<Lang>_XX`) when no locale provided (backward-compatible)
- Default classifiers are built on existing corpus
- Locale classifiers are built on
- `<Dimension>/<Lang>/Corpus.hs` exposes a common `corpus` to all locales of `<Lang>`
- `<Dimension>/<Lang>/<Region>/Corpus.hs` exposes `allExamples`: a list of examples specific to the locale (following `<Dimension>/<Lang>/<Region>/Rules.hs`).
- Locale classifiers use the language corpus extended with the locale examples as training set.
- Locale examples need to use the same `Context` (i.e. reference time) as the language corpus.
- For backward compatibility, `<Dimension>/<Lang>/Corpus.hs` can expose also `defaultCorpus`, which is `corpus` augmented with specific examples. This is controlled by `getDefaultCorpusForLang` in `Duckling.Ranking.Generate`.
- Tests run against each classifier to make sure runtime works as expected.
* MM/DD (en_US) vs DD/MM (en_GB) example to illustrate
Reviewed By: JonCoens, blandinw
Differential Revision: D6038096
fbshipit-source-id: f29c28d
2017-10-13 18:15:32 +03:00
|
|
|
import qualified Duckling.Engine.Regex as Regex
|
|
|
|
import qualified Duckling.Types.Document as Document
|
|
|
|
import qualified Duckling.Types.Stash as Stash
|
2017-03-08 21:33:55 +03:00
|
|
|
|
|
|
|
-- -----------------------------------------------------------------
|
|
|
|
-- Engine
|
|
|
|
|
2017-04-01 00:05:57 +03:00
|
|
|
type Duckling a = Identity a
|
|
|
|
|
|
|
|
runDuckling :: Duckling a -> a
|
|
|
|
runDuckling ma = runIdentity ma
|
|
|
|
|
2018-03-20 00:34:58 +03:00
|
|
|
parseAndResolve :: [Rule] -> Text -> Context -> Options -> [ResolvedToken]
|
|
|
|
parseAndResolve rules input context options =
|
|
|
|
mapMaybe (resolveNode context options) . force $ Stash.toPosOrderedList $
|
|
|
|
runDuckling $ parseString rules (Document.fromText input)
|
2017-03-08 21:33:55 +03:00
|
|
|
|
|
|
|
produce :: Match -> Maybe Node
|
|
|
|
produce (_, _, []) = Nothing
|
|
|
|
produce (Rule name _ production, _, etuor@(Node {nodeRange = Range _ e}:_)) = do
|
|
|
|
let route = reverse etuor
|
|
|
|
token <- force $ production $ map token route
|
|
|
|
case route of
|
|
|
|
(Node {nodeRange = Range p _}:_) -> Just Node
|
|
|
|
{ nodeRange = Range p e
|
|
|
|
, token = token
|
|
|
|
, children = route
|
|
|
|
, rule = Just name
|
|
|
|
}
|
|
|
|
[] -> Nothing
|
|
|
|
|
2017-04-28 18:58:31 +03:00
|
|
|
-- | Handle a regex match at a given position
|
|
|
|
lookupRegex :: Document -> PCRE.Regex -> Int -> Duckling [Node]
|
|
|
|
lookupRegex doc _regex position | position >= Document.length doc = return []
|
|
|
|
lookupRegex doc regex position =
|
|
|
|
lookupRegexCommon doc regex position Regex.matchOnce
|
|
|
|
|
|
|
|
-- | Handle a regex match anywhere in the text
|
|
|
|
lookupRegexAnywhere :: Document -> PCRE.Regex -> Duckling [Node]
|
|
|
|
lookupRegexAnywhere doc regex = lookupRegexCommon doc regex 0 Regex.matchAll
|
|
|
|
|
|
|
|
{-# INLINE lookupRegexCommon #-}
|
|
|
|
-- INLINE bloats the code a bit, but the code is better
|
|
|
|
lookupRegexCommon
|
|
|
|
:: Foldable t
|
|
|
|
=> Document
|
|
|
|
-> PCRE.Regex
|
|
|
|
-> Int
|
|
|
|
-> (PCRE.Regex -> ByteString -> t PCRE.MatchArray)
|
|
|
|
-> Duckling [Node]
|
|
|
|
lookupRegexCommon doc regex position matchFun = return nodes
|
2017-03-08 21:33:55 +03:00
|
|
|
where
|
2017-04-28 18:58:31 +03:00
|
|
|
-- See Note [Regular expressions and Text] to understand what's going
|
|
|
|
-- on here
|
|
|
|
(substring, rangeToText, translateRange) =
|
|
|
|
Document.byteStringFromPos doc position
|
2020-11-13 00:44:37 +03:00
|
|
|
nodes = mapMaybe (f . Array.elems)
|
|
|
|
$ Foldable.toList
|
|
|
|
$ matchFun regex substring
|
2017-04-28 18:58:31 +03:00
|
|
|
f :: [(Int, Int)] -> Maybe Node
|
|
|
|
f [] = Nothing
|
|
|
|
f ((0,0):_) = Nothing
|
|
|
|
f ((bsStart, bsLen):groups) =
|
|
|
|
if Document.isRangeValid doc start end
|
|
|
|
then Just node
|
|
|
|
else Nothing
|
|
|
|
where
|
|
|
|
textGroups = map rangeToText groups
|
|
|
|
(start, end) = translateRange bsStart bsLen
|
|
|
|
node = Node
|
|
|
|
{ nodeRange = Range start end
|
|
|
|
, token = Token RegexMatch (GroupMatch textGroups)
|
|
|
|
, children = []
|
|
|
|
, rule = Nothing
|
|
|
|
}
|
2017-03-08 21:33:55 +03:00
|
|
|
|
2017-04-28 18:58:31 +03:00
|
|
|
-- | Handle one PatternItem at a given position
|
2017-04-01 00:05:57 +03:00
|
|
|
lookupItem :: Document -> PatternItem -> Stash -> Int -> Duckling [Node]
|
2017-04-28 18:58:31 +03:00
|
|
|
lookupItem doc (Regex re) _ position =
|
|
|
|
filter (isPositionValid position doc) <$>
|
|
|
|
lookupRegex doc re position
|
|
|
|
lookupItem doc (Predicate p) stash position =
|
2017-04-01 00:05:57 +03:00
|
|
|
return $
|
2017-03-08 21:33:55 +03:00
|
|
|
filter (p . token) $
|
2017-04-28 18:58:31 +03:00
|
|
|
takeWhile (isPositionValid position doc) $
|
2017-03-08 21:33:55 +03:00
|
|
|
Stash.toPosOrderedListFrom stash position
|
|
|
|
|
2017-04-28 18:58:31 +03:00
|
|
|
-- | Handle one PatternItem anywhere in the text
|
|
|
|
lookupItemAnywhere :: Document -> PatternItem -> Stash -> Duckling [Node]
|
|
|
|
lookupItemAnywhere doc (Regex re) _ = lookupRegexAnywhere doc re
|
|
|
|
lookupItemAnywhere _doc (Predicate p) stash =
|
|
|
|
return $ filter (p . token) $ Stash.toPosOrderedList stash
|
|
|
|
|
2017-03-21 06:36:42 +03:00
|
|
|
isPositionValid :: Int -> Document -> Node -> Bool
|
2018-01-30 03:41:23 +03:00
|
|
|
isPositionValid position sentence Node{nodeRange = Range start _} =
|
2017-04-28 18:58:31 +03:00
|
|
|
Document.isAdjacent sentence position start
|
2017-03-08 21:33:55 +03:00
|
|
|
|
|
|
|
-- | A match is full if its rule pattern is empty.
|
|
|
|
-- (rule, endPosition, reversedRoute)
|
|
|
|
type Match = (Rule, Int, [Node])
|
|
|
|
|
|
|
|
-- | Recursively augments `matches`.
|
|
|
|
-- Discards partial matches stuck by a regex.
|
2017-04-01 00:05:57 +03:00
|
|
|
matchAll :: Document -> Stash -> [Match] -> Duckling [Match]
|
|
|
|
matchAll sentence stash matches = concatMapM mkNextMatches matches
|
2017-03-08 21:33:55 +03:00
|
|
|
where
|
2017-04-01 00:05:57 +03:00
|
|
|
mkNextMatches :: Match -> Duckling [Match]
|
|
|
|
mkNextMatches match@(Rule {pattern = []}, _, _) = return [ match ]
|
|
|
|
mkNextMatches match@(Rule {pattern = p:_}, _, _) = do
|
|
|
|
nextMatches <- matchAll sentence stash =<< matchFirst sentence stash match
|
|
|
|
return $ case p of
|
2017-03-08 21:33:55 +03:00
|
|
|
Regex _ -> nextMatches
|
|
|
|
Predicate _ -> match:nextMatches
|
|
|
|
|
2017-04-28 18:58:31 +03:00
|
|
|
-- | Returns all matches matching the first pattern item of `match`,
|
|
|
|
-- resuming from a Match position
|
2017-04-01 00:05:57 +03:00
|
|
|
matchFirst :: Document -> Stash -> Match -> Duckling [Match]
|
|
|
|
matchFirst _ _ (Rule {pattern = []}, _, _) = return []
|
2018-01-30 03:41:23 +03:00
|
|
|
matchFirst sentence stash (rule@Rule{pattern = p : ps}, position, route) =
|
2017-04-28 18:58:31 +03:00
|
|
|
map (mkMatch route newRule) <$> lookupItem sentence p stash position
|
|
|
|
where
|
|
|
|
newRule = rule { pattern = ps }
|
|
|
|
|
|
|
|
-- | Returns all matches matching the first pattern item of `match`,
|
|
|
|
-- starting anywhere
|
|
|
|
matchFirstAnywhere :: Document -> Stash -> Rule -> Duckling [Match]
|
|
|
|
matchFirstAnywhere _sentence _stash Rule {pattern = []} = return []
|
2018-01-30 03:41:23 +03:00
|
|
|
matchFirstAnywhere sentence stash rule@Rule{pattern = p : ps} =
|
2017-04-28 18:58:31 +03:00
|
|
|
map (mkMatch [] newRule) <$> lookupItemAnywhere sentence p stash
|
2017-04-01 00:05:57 +03:00
|
|
|
where
|
|
|
|
newRule = rule { pattern = ps }
|
2017-03-08 21:33:55 +03:00
|
|
|
|
2017-04-28 18:58:31 +03:00
|
|
|
{-# INLINE mkMatch #-}
|
|
|
|
mkMatch :: [Node] -> Rule -> Node -> Match
|
|
|
|
mkMatch route newRule (node@Node {nodeRange = Range _ pos'}) =
|
|
|
|
newRoute `seq` (newRule, pos', newRoute)
|
|
|
|
where newRoute = node:route
|
|
|
|
|
2017-03-08 21:33:55 +03:00
|
|
|
-- | Finds new matches resulting from newly added tokens.
|
|
|
|
-- Produces new tokens from full matches.
|
2017-03-21 06:36:42 +03:00
|
|
|
parseString1
|
2017-04-01 00:05:57 +03:00
|
|
|
:: [Rule] -> Document -> Stash -> Stash -> [Match]
|
|
|
|
-> Duckling (Stash, [Match])
|
|
|
|
parseString1 rules sentence stash new matches = do
|
|
|
|
-- Recursively match patterns.
|
|
|
|
-- Find which `matches` can advance because of `new`.
|
|
|
|
newPartial <- concatMapM (matchFirst sentence new) matches
|
|
|
|
|
|
|
|
-- Find new matches resulting from newly added tokens (`new`)
|
2017-04-28 18:58:31 +03:00
|
|
|
newMatches <- concatMapM (matchFirstAnywhere sentence new) rules
|
2017-04-01 00:05:57 +03:00
|
|
|
|
|
|
|
(full, partial) <- L.partition (\(Rule {pattern}, _, _) -> null pattern)
|
|
|
|
<$> matchAll sentence stash (newPartial ++ newMatches)
|
|
|
|
|
2017-03-08 21:33:55 +03:00
|
|
|
-- Produce full matches as new tokens
|
2017-04-01 00:05:57 +03:00
|
|
|
return ( Stash.fromList $ mapMaybe produce full
|
2017-04-20 21:19:11 +03:00
|
|
|
, partial ++ matches
|
2017-04-01 00:05:57 +03:00
|
|
|
)
|
2017-03-08 21:33:55 +03:00
|
|
|
|
|
|
|
-- | Produces all tokens recursively.
|
2017-04-20 21:19:11 +03:00
|
|
|
saturateParseString
|
|
|
|
:: [Rule] -> Document -> Stash -> Stash -> [Match] -> Duckling Stash
|
|
|
|
saturateParseString rules sentence stash new matches = do
|
2017-04-01 00:05:57 +03:00
|
|
|
(new', matches') <- parseString1 rules sentence stash new matches
|
2017-04-20 21:19:11 +03:00
|
|
|
let stash' = Stash.union stash new'
|
2017-04-01 00:05:57 +03:00
|
|
|
if Stash.null new'
|
|
|
|
then return stash
|
2017-04-20 21:19:11 +03:00
|
|
|
else saturateParseString rules sentence stash' new' matches'
|
|
|
|
|
|
|
|
parseString :: [Rule] -> Document -> Duckling Stash
|
|
|
|
parseString rules sentence = do
|
|
|
|
(new, partialMatches) <-
|
|
|
|
-- One the first pass we try all the rules
|
|
|
|
parseString1 rules sentence Stash.empty Stash.empty []
|
|
|
|
if Stash.null new
|
|
|
|
then return Stash.empty
|
|
|
|
else
|
|
|
|
-- For subsequent passes, we only try rules starting with a predicate.
|
|
|
|
saturateParseString headPredicateRules sentence new new partialMatches
|
|
|
|
where
|
|
|
|
headPredicateRules =
|
2018-01-30 03:41:23 +03:00
|
|
|
[ rule | rule@Rule{pattern = (Predicate _ : _)} <- rules ]
|
2017-03-08 21:33:55 +03:00
|
|
|
|
2018-03-20 00:34:58 +03:00
|
|
|
resolveNode :: Context -> Options -> Node -> Maybe ResolvedToken
|
2018-04-21 00:08:39 +03:00
|
|
|
resolveNode context options n@Node{token = (Token dim dd), nodeRange = r}
|
2018-03-20 00:34:58 +03:00
|
|
|
= do
|
|
|
|
(val, latent) <- resolve context options dd
|
2017-03-08 21:33:55 +03:00
|
|
|
Just Resolved
|
2018-04-21 00:08:39 +03:00
|
|
|
{ range = r
|
2017-03-08 21:33:55 +03:00
|
|
|
, node = n
|
2018-04-21 00:08:39 +03:00
|
|
|
, rval = RVal dim val
|
2018-03-20 00:34:58 +03:00
|
|
|
, isLatent = latent
|
2017-03-08 21:33:55 +03:00
|
|
|
}
|