duckling/Duckling/Types.hs

-- Copyright (c) 2016-present, Facebook, Inc.
-- All rights reserved.
--
-- This source code is licensed under the BSD-style license found in the
-- LICENSE file in the root directory of this source tree. An additional grant
-- of patent rights can be found in the PATENTS file in the same directory.


{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE ExistentialQuantification #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE NoRebindableSyntax #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE StandaloneDeriving #-}

module Duckling.Types where

import Control.DeepSeq
import Data.Aeson
import qualified Data.ByteString.Lazy as LB
import Data.GADT.Compare
import Data.Hashable
import Data.Maybe
import Data.String
import Data.Text (Text)
import Data.Vector.Primitive (Vector)
import qualified Data.Text.Encoding as Text
import qualified Data.Text as Text
import qualified Data.Vector.Primitive as Vector
import Data.Typeable ((:~:)(Refl), Typeable)
import GHC.Generics
import Prelude
import qualified Text.Regex.Base as R
import qualified Text.Regex.PCRE as PCRE

import Duckling.Dimensions.Types
import Duckling.Resolve

data Document = Document
  { rawInput :: !Text
  , indexable :: !(Vector Char) -- for O(1) indexing pos -> Char
  , firstNonAdjacent :: !(Vector Int)
    -- for a given index 'i' it keeps a first index 'j' greater or equal 'i'
    -- such that isAdjacentSeparator (indexable ! j) == False
    -- eg. " a document " :: Document
    --     firstNonAdjacent = [1,1,3,3,4,5,6,7,8,9,10,12]
    -- Note that in this case 12 is the length of the vector, hence not a
    -- valid index inside the array, this is intentional.
  } deriving (Show)

instance IsString Document where
  fromString = mkDocument . fromString

mkDocument :: Text -> Document
mkDocument s = Document s v fna
  where
  v = Vector.fromList $ Text.unpack s
  fna = Vector.fromList $ snd $ Vector.ifoldr' gen (Vector.length v, []) v
  -- go from the end keeping track of the first nonAdjacent (best)
  gen ix elem (best, acc)
    | isAdjacentSeparator elem = (best, best:acc)
    | otherwise = (ix, ix:acc)

-- True iff a is followed by whitespaces and b.
isAdjacent :: Int -> Int -> Document -> Bool
isAdjacent a b Document{..} =
  b >= a && (firstNonAdjacent Vector.! a >= b)

isAdjacentSeparator :: Char -> Bool
isAdjacentSeparator c = elem c [' ', '\t', '-']

-- -----------------------------------------------------------------
-- Token

data Token = forall a . (Resolve a, Eq a, Hashable a, Show a, NFData a) =>
  Token (Dimension a) a

deriving instance Show Token
instance Eq Token where
  Token d1 v1 == Token d2 v2 = case geq d1 d2 of
    Just Refl -> v1 == v2
    Nothing   -> False

instance Hashable Token where
  hashWithSalt s (Token dim v) = hashWithSalt s (dim, v)

instance NFData Token where
  rnf (Token _ v) = rnf v

isDimension :: Dimension a -> Token -> Bool
isDimension dim (Token dim' _) = isJust $ geq dim dim'

data Node = Node
  { nodeRange :: Range
  , token     :: Token
  , children  :: [Node]
  , rule      :: Maybe Text
  } deriving (Eq, Generic, Hashable, Show, NFData)

data ResolvedToken = Resolved
  { range :: Range
  , node :: Node
  , jsonValue :: Value
  } deriving (Eq, Show)

instance Ord ResolvedToken where
  compare (Resolved range1 _ json1) (Resolved range2 _ json2) =
    case compare range1 range2 of
      EQ -> compare (toJText json1) (toJText json2)
      z  -> z

data Candidate = Candidate ResolvedToken Double Bool
  deriving (Eq, Show)

instance Ord Candidate where
  compare (Candidate (Resolved{range = Range s1 e1, node = Node{token = Token d1 _}}) score1 t1)
          (Candidate (Resolved{range = Range s2 e2, node = Node{token = tok2}}) score2 t2)
    | isDimension d1 tok2 = case starts of
        EQ -> case ends of
          EQ -> compare score1 score2
          z -> z
        LT -> case ends of
          LT -> EQ
          _ -> GT
        GT -> case ends of
          GT -> EQ
          _ -> LT
    | t1 == t2 = compRange
    | t1 && compRange == GT = GT
    | t2 && compRange == LT = LT
    | otherwise = EQ
      where
        starts = compare s1 s2
        ends = compare e1 e2
        -- a > b if a recovers b
        compRange = case starts of
          EQ -> ends
          LT -> case ends of
            LT -> EQ
            _  -> GT
          GT -> case ends of
            GT -> EQ
            _  -> LT

data Range = Range Int Int
  deriving (Eq, Ord, Generic, Hashable, Show, NFData)

type Production = [Token] -> Maybe Token
type Predicate = Token -> Bool
data PatternItem = Regex PCRE.Regex | Predicate Predicate

type Pattern = [PatternItem]

data Rule = Rule
  { name :: Text
  , pattern :: Pattern
  , prod :: Production
  }

instance Show Rule where
  show (Rule name _ _) = show name

data Entity = Entity
  { dim   :: Text
  , body  :: Text
  , value :: Text
  , start :: Int
  , end   :: Int
  } deriving (Eq, Generic, Show)

instance ToJSON Entity where
  toEncoding = genericToEncoding defaultOptions

toJText :: ToJSON x => x -> Text
toJText j = Text.decodeUtf8 $ LB.toStrict $ encode j

-- -----------------------------------------------------------------
-- Predicates helpers

regex :: String -> PatternItem
regex = Regex . R.makeRegexOpts compOpts execOpts
  where
    compOpts = PCRE.defaultCompOpt + PCRE.compCaseless
    execOpts = PCRE.defaultExecOpt

dimension :: Typeable a => Dimension a -> PatternItem
dimension value = Predicate $ isDimension value