duckling/Duckling/Types.hs
Bartosz Nitka 58bf36b9f4 Optimize isAdjacent
Summary:
`isAdjacent` was doing a ton of useless copies and
redundant work. But pre-computing a `firstNonAdjacent` table
we can answer every `isAdjacent` query in `O(1)` time and
(almost?) no allocations.

It may be a symptom of algorithmic problems, but we shouldn't
make it more expensive than it needs to be.

Reviewed By: patapizza

Differential Revision: D4744172

fbshipit-source-id: dd70be2
2017-03-21 07:34:24 -07:00

191 lines
5.4 KiB
Haskell

-- Copyright (c) 2016-present, Facebook, Inc.
-- All rights reserved.
--
-- This source code is licensed under the BSD-style license found in the
-- LICENSE file in the root directory of this source tree. An additional grant
-- of patent rights can be found in the PATENTS file in the same directory.
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE ExistentialQuantification #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE NoRebindableSyntax #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE StandaloneDeriving #-}
module Duckling.Types where
import Control.DeepSeq
import Data.Aeson
import qualified Data.ByteString.Lazy as LB
import Data.GADT.Compare
import Data.Hashable
import Data.Maybe
import Data.String
import Data.Text (Text)
import Data.Vector.Primitive (Vector)
import qualified Data.Text.Encoding as Text
import qualified Data.Text as Text
import qualified Data.Vector.Primitive as Vector
import Data.Typeable ((:~:)(Refl), Typeable)
import GHC.Generics
import Prelude
import qualified Text.Regex.Base as R
import qualified Text.Regex.PCRE as PCRE
import Duckling.Dimensions.Types
import Duckling.Resolve
data Document = Document
{ rawInput :: !Text
, indexable :: !(Vector Char) -- for O(1) indexing pos -> Char
, firstNonAdjacent :: !(Vector Int)
-- for a given index 'i' it keeps a first index 'j' greater or equal 'i'
-- such that isAdjacentSeparator (indexable ! j) == False
-- eg. " a document " :: Document
-- firstNonAdjacent = [1,1,3,3,4,5,6,7,8,9,10,12]
-- Note that in this case 12 is the length of the vector, hence not a
-- valid index inside the array, this is intentional.
} deriving (Show)
instance IsString Document where
fromString = mkDocument . fromString
mkDocument :: Text -> Document
mkDocument s = Document s v fna
where
v = Vector.fromList $ Text.unpack s
fna = Vector.fromList $ snd $ Vector.ifoldr' gen (Vector.length v, []) v
-- go from the end keeping track of the first nonAdjacent (best)
gen ix elem (best, acc)
| isAdjacentSeparator elem = (best, best:acc)
| otherwise = (ix, ix:acc)
-- True iff a is followed by whitespaces and b.
isAdjacent :: Int -> Int -> Document -> Bool
isAdjacent a b Document{..} =
b >= a && (firstNonAdjacent Vector.! a >= b)
isAdjacentSeparator :: Char -> Bool
isAdjacentSeparator c = elem c [' ', '\t', '-']
-- -----------------------------------------------------------------
-- Token
data Token = forall a . (Resolve a, Eq a, Hashable a, Show a, NFData a) =>
Token (Dimension a) a
deriving instance Show Token
instance Eq Token where
Token d1 v1 == Token d2 v2 = case geq d1 d2 of
Just Refl -> v1 == v2
Nothing -> False
instance Hashable Token where
hashWithSalt s (Token dim v) = hashWithSalt s (dim, v)
instance NFData Token where
rnf (Token _ v) = rnf v
isDimension :: Dimension a -> Token -> Bool
isDimension dim (Token dim' _) = isJust $ geq dim dim'
data Node = Node
{ nodeRange :: Range
, token :: Token
, children :: [Node]
, rule :: Maybe Text
} deriving (Eq, Generic, Hashable, Show, NFData)
data ResolvedToken = Resolved
{ range :: Range
, node :: Node
, jsonValue :: Value
} deriving (Eq, Show)
instance Ord ResolvedToken where
compare (Resolved range1 _ json1) (Resolved range2 _ json2) =
case compare range1 range2 of
EQ -> compare (toJText json1) (toJText json2)
z -> z
data Candidate = Candidate ResolvedToken Double Bool
deriving (Eq, Show)
instance Ord Candidate where
compare (Candidate (Resolved{range = Range s1 e1, node = Node{token = Token d1 _}}) score1 t1)
(Candidate (Resolved{range = Range s2 e2, node = Node{token = tok2}}) score2 t2)
| isDimension d1 tok2 = case starts of
EQ -> case ends of
EQ -> compare score1 score2
z -> z
LT -> case ends of
LT -> EQ
_ -> GT
GT -> case ends of
GT -> EQ
_ -> LT
| t1 == t2 = compRange
| t1 && compRange == GT = GT
| t2 && compRange == LT = LT
| otherwise = EQ
where
starts = compare s1 s2
ends = compare e1 e2
-- a > b if a recovers b
compRange = case starts of
EQ -> ends
LT -> case ends of
LT -> EQ
_ -> GT
GT -> case ends of
GT -> EQ
_ -> LT
data Range = Range Int Int
deriving (Eq, Ord, Generic, Hashable, Show, NFData)
type Production = [Token] -> Maybe Token
type Predicate = Token -> Bool
data PatternItem = Regex PCRE.Regex | Predicate Predicate
type Pattern = [PatternItem]
data Rule = Rule
{ name :: Text
, pattern :: Pattern
, prod :: Production
}
instance Show Rule where
show (Rule name _ _) = show name
data Entity = Entity
{ dim :: Text
, body :: Text
, value :: Text
, start :: Int
, end :: Int
} deriving (Eq, Generic, Show)
instance ToJSON Entity where
toEncoding = genericToEncoding defaultOptions
toJText :: ToJSON x => x -> Text
toJText j = Text.decodeUtf8 $ LB.toStrict $ encode j
-- -----------------------------------------------------------------
-- Predicates helpers
regex :: String -> PatternItem
regex = Regex . R.makeRegexOpts compOpts execOpts
where
compOpts = PCRE.defaultCompOpt + PCRE.compCaseless
execOpts = PCRE.defaultExecOpt
dimension :: Typeable a => Dimension a -> PatternItem
dimension value = Predicate $ isDimension value