mirror of
https://github.com/facebook/duckling.git
synced 2024-12-01 08:19:36 +03:00
58bf36b9f4
Summary: `isAdjacent` was doing a ton of useless copies and redundant work. But pre-computing a `firstNonAdjacent` table we can answer every `isAdjacent` query in `O(1)` time and (almost?) no allocations. It may be a symptom of algorithmic problems, but we shouldn't make it more expensive than it needs to be. Reviewed By: patapizza Differential Revision: D4744172 fbshipit-source-id: dd70be2
191 lines
5.4 KiB
Haskell
191 lines
5.4 KiB
Haskell
-- Copyright (c) 2016-present, Facebook, Inc.
|
|
-- All rights reserved.
|
|
--
|
|
-- This source code is licensed under the BSD-style license found in the
|
|
-- LICENSE file in the root directory of this source tree. An additional grant
|
|
-- of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
|
|
{-# LANGUAGE DeriveAnyClass #-}
|
|
{-# LANGUAGE DeriveGeneric #-}
|
|
{-# LANGUAGE ExistentialQuantification #-}
|
|
{-# LANGUAGE FlexibleContexts #-}
|
|
{-# LANGUAGE GADTs #-}
|
|
{-# LANGUAGE NoRebindableSyntax #-}
|
|
{-# LANGUAGE RankNTypes #-}
|
|
{-# LANGUAGE RecordWildCards #-}
|
|
{-# LANGUAGE StandaloneDeriving #-}
|
|
|
|
module Duckling.Types where
|
|
|
|
import Control.DeepSeq
|
|
import Data.Aeson
|
|
import qualified Data.ByteString.Lazy as LB
|
|
import Data.GADT.Compare
|
|
import Data.Hashable
|
|
import Data.Maybe
|
|
import Data.String
|
|
import Data.Text (Text)
|
|
import Data.Vector.Primitive (Vector)
|
|
import qualified Data.Text.Encoding as Text
|
|
import qualified Data.Text as Text
|
|
import qualified Data.Vector.Primitive as Vector
|
|
import Data.Typeable ((:~:)(Refl), Typeable)
|
|
import GHC.Generics
|
|
import Prelude
|
|
import qualified Text.Regex.Base as R
|
|
import qualified Text.Regex.PCRE as PCRE
|
|
|
|
import Duckling.Dimensions.Types
|
|
import Duckling.Resolve
|
|
|
|
data Document = Document
|
|
{ rawInput :: !Text
|
|
, indexable :: !(Vector Char) -- for O(1) indexing pos -> Char
|
|
, firstNonAdjacent :: !(Vector Int)
|
|
-- for a given index 'i' it keeps a first index 'j' greater or equal 'i'
|
|
-- such that isAdjacentSeparator (indexable ! j) == False
|
|
-- eg. " a document " :: Document
|
|
-- firstNonAdjacent = [1,1,3,3,4,5,6,7,8,9,10,12]
|
|
-- Note that in this case 12 is the length of the vector, hence not a
|
|
-- valid index inside the array, this is intentional.
|
|
} deriving (Show)
|
|
|
|
instance IsString Document where
|
|
fromString = mkDocument . fromString
|
|
|
|
mkDocument :: Text -> Document
|
|
mkDocument s = Document s v fna
|
|
where
|
|
v = Vector.fromList $ Text.unpack s
|
|
fna = Vector.fromList $ snd $ Vector.ifoldr' gen (Vector.length v, []) v
|
|
-- go from the end keeping track of the first nonAdjacent (best)
|
|
gen ix elem (best, acc)
|
|
| isAdjacentSeparator elem = (best, best:acc)
|
|
| otherwise = (ix, ix:acc)
|
|
|
|
-- True iff a is followed by whitespaces and b.
|
|
isAdjacent :: Int -> Int -> Document -> Bool
|
|
isAdjacent a b Document{..} =
|
|
b >= a && (firstNonAdjacent Vector.! a >= b)
|
|
|
|
isAdjacentSeparator :: Char -> Bool
|
|
isAdjacentSeparator c = elem c [' ', '\t', '-']
|
|
|
|
-- -----------------------------------------------------------------
|
|
-- Token
|
|
|
|
data Token = forall a . (Resolve a, Eq a, Hashable a, Show a, NFData a) =>
|
|
Token (Dimension a) a
|
|
|
|
deriving instance Show Token
|
|
instance Eq Token where
|
|
Token d1 v1 == Token d2 v2 = case geq d1 d2 of
|
|
Just Refl -> v1 == v2
|
|
Nothing -> False
|
|
|
|
instance Hashable Token where
|
|
hashWithSalt s (Token dim v) = hashWithSalt s (dim, v)
|
|
|
|
instance NFData Token where
|
|
rnf (Token _ v) = rnf v
|
|
|
|
isDimension :: Dimension a -> Token -> Bool
|
|
isDimension dim (Token dim' _) = isJust $ geq dim dim'
|
|
|
|
data Node = Node
|
|
{ nodeRange :: Range
|
|
, token :: Token
|
|
, children :: [Node]
|
|
, rule :: Maybe Text
|
|
} deriving (Eq, Generic, Hashable, Show, NFData)
|
|
|
|
data ResolvedToken = Resolved
|
|
{ range :: Range
|
|
, node :: Node
|
|
, jsonValue :: Value
|
|
} deriving (Eq, Show)
|
|
|
|
instance Ord ResolvedToken where
|
|
compare (Resolved range1 _ json1) (Resolved range2 _ json2) =
|
|
case compare range1 range2 of
|
|
EQ -> compare (toJText json1) (toJText json2)
|
|
z -> z
|
|
|
|
data Candidate = Candidate ResolvedToken Double Bool
|
|
deriving (Eq, Show)
|
|
|
|
instance Ord Candidate where
|
|
compare (Candidate (Resolved{range = Range s1 e1, node = Node{token = Token d1 _}}) score1 t1)
|
|
(Candidate (Resolved{range = Range s2 e2, node = Node{token = tok2}}) score2 t2)
|
|
| isDimension d1 tok2 = case starts of
|
|
EQ -> case ends of
|
|
EQ -> compare score1 score2
|
|
z -> z
|
|
LT -> case ends of
|
|
LT -> EQ
|
|
_ -> GT
|
|
GT -> case ends of
|
|
GT -> EQ
|
|
_ -> LT
|
|
| t1 == t2 = compRange
|
|
| t1 && compRange == GT = GT
|
|
| t2 && compRange == LT = LT
|
|
| otherwise = EQ
|
|
where
|
|
starts = compare s1 s2
|
|
ends = compare e1 e2
|
|
-- a > b if a recovers b
|
|
compRange = case starts of
|
|
EQ -> ends
|
|
LT -> case ends of
|
|
LT -> EQ
|
|
_ -> GT
|
|
GT -> case ends of
|
|
GT -> EQ
|
|
_ -> LT
|
|
|
|
data Range = Range Int Int
|
|
deriving (Eq, Ord, Generic, Hashable, Show, NFData)
|
|
|
|
type Production = [Token] -> Maybe Token
|
|
type Predicate = Token -> Bool
|
|
data PatternItem = Regex PCRE.Regex | Predicate Predicate
|
|
|
|
type Pattern = [PatternItem]
|
|
|
|
data Rule = Rule
|
|
{ name :: Text
|
|
, pattern :: Pattern
|
|
, prod :: Production
|
|
}
|
|
|
|
instance Show Rule where
|
|
show (Rule name _ _) = show name
|
|
|
|
data Entity = Entity
|
|
{ dim :: Text
|
|
, body :: Text
|
|
, value :: Text
|
|
, start :: Int
|
|
, end :: Int
|
|
} deriving (Eq, Generic, Show)
|
|
|
|
instance ToJSON Entity where
|
|
toEncoding = genericToEncoding defaultOptions
|
|
|
|
toJText :: ToJSON x => x -> Text
|
|
toJText j = Text.decodeUtf8 $ LB.toStrict $ encode j
|
|
|
|
-- -----------------------------------------------------------------
|
|
-- Predicates helpers
|
|
|
|
regex :: String -> PatternItem
|
|
regex = Regex . R.makeRegexOpts compOpts execOpts
|
|
where
|
|
compOpts = PCRE.defaultCompOpt + PCRE.compCaseless
|
|
execOpts = PCRE.defaultExecOpt
|
|
|
|
dimension :: Typeable a => Dimension a -> PatternItem
|
|
dimension value = Predicate $ isDimension value
|