Move Document and helpers to a fresh module

Summary: Document had its internal details leaked over 2 files. This consolidates it. It took a long time to make this perf neutral (now it's even a tiny win), for reasons I don't completely understand. The INLINE pragma on byteStringFromPos I semi-understand, but I also had to move isRangeValid to Document and that's a bit of a mystery. Reviewed By: patapizza Differential Revision: D4948449 fbshipit-source-id: ffb251a
2024-09-11 21:27:13 +03:00 · 2017-04-25 16:23:07 -07:00 · 2017-04-25 16:23:07 -07:00 · 8db73688d7
commit 8db73688d7
parent 924516103b
4 changed files with 213 additions and 165 deletions
--- a/Duckling/Engine.hs
+++ b/Duckling/Engine.hs
@ -10,7 +10,6 @@
 {-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE NoRebindableSyntax #-}
 {-# LANGUAGE OverloadedStrings #-}
-{-# LANGUAGE BangPatterns #-}

 module Duckling.Engine
  ( parseAndResolve
@ -22,20 +21,11 @@ import Control.DeepSeq
 import Control.Monad.Extra
 import Data.Aeson
 import qualified Data.Array as Array
-import qualified Data.Array.Unboxed as UArray
-import Data.ByteString (ByteString)
-import qualified Data.ByteString as BS
-import qualified Data.Char as Char
 import Data.Functor.Identity
 import Data.Maybe
 import Data.Text (Text)
-import qualified Data.Text as Text
-import qualified Data.Text.Encoding as Text
-import qualified Data.Text.Unsafe as UText
 import qualified Data.List as L
-import qualified Data.Vector.Unboxed as Vector
 import Prelude
-import qualified Text.Regex.Base as R
 import qualified Text.Regex.PCRE as PCRE

 import Duckling.Dimensions.Types
@ -43,6 +33,8 @@ import qualified Duckling.Engine.Regex as Regex
 import Duckling.Regex.Types
 import Duckling.Resolve
 import Duckling.Types
+import qualified Duckling.Types.Document as Document
+import Duckling.Types.Document (Document)
 import qualified Duckling.Stash as Stash
 import Duckling.Stash (Stash)

@ -57,7 +49,7 @@ runDuckling ma = runIdentity ma
 parseAndResolve :: [Rule] -> Text -> Context -> [ResolvedToken]
 parseAndResolve rules input context = mapMaybe (resolveNode context) .
  force $ Stash.toPosOrderedList $ runDuckling $
-  parseString rules (mkDocument input)
+  parseString rules (Document.fromText input)

 produce :: Match -> Maybe Node
 produce (_, _, []) = Nothing
@ -73,39 +65,15 @@ produce (Rule name _ production, _, etuor@(Node {nodeRange = Range _ e}:_)) = do
      }
    [] -> Nothing

-- As regexes are matched without whitespace delimitator, we need to check
-- the reasonability of the match to actually be a word.
-isRangeValid :: Document -> Range -> Bool
-isRangeValid Document { indexable = s } (Range start end) =
-  (start == 0 || isDifferent (s UArray.! (start - 1)) (s UArray.! start)) &&
-  (end == arraySize s ||
-      isDifferent (s UArray.! (end - 1)) (s UArray.! end))
-  where
-    charClass :: Char -> Char
-    charClass c
-      | Char.isLower c = 'l'
-      | Char.isUpper c = 'u'
-      | Char.isDigit c = 'd'
-      | otherwise = c
-    isDifferent :: Char -> Char -> Bool
-    isDifferent a b = charClass a /= charClass b
-
 lookupRegex :: PCRE.Regex -> Int -> Document -> Duckling [Node]
-lookupRegex _regex position Document{ indexable = indexable }
-  | position >= arraySize indexable = return []
-lookupRegex regex position
-  Document { rawInput = rawInput
-           , utf8Encoded = utf8Encoded
-           , tDropToBSDrop = tDropToBSDrop
-           , bsDropToTDrop = bsDropToTDrop
-           , tDropToUtf16Drop = tDropToUtf16Drop
-           } = return nodes
+lookupRegex _regex position doc
+  | position >= Document.length doc = return []
+lookupRegex regex position doc = return nodes
  where
    -- See Note [Regular expressions and Text] to understand what's going
    -- on here
-    utf8Position = tDropToBSDrop UArray.! position
-    substring :: ByteString
-    substring = BS.drop utf8Position utf8Encoded
+    (substring, rangeToText, translateRange) =
+      Document.byteStringFromPos doc position
    nodes = L.foldl' f [] $ map Array.elems $ Regex.matchAll regex substring
    f :: [Node] -> [(Int, Int)] -> [Node]
    f nodes [] = nodes
@ -119,31 +87,12 @@ lookupRegex regex position
        , children = []
        , rule = Nothing
        }
-    -- get a subrange of Text reusing the underlying buffer using
-    -- utf16 start and end positions
-    rangeToText :: (Int, Int) -> Text
-    rangeToText (-1, _) = ""
-    -- this is what regexec from Text.Regex.PCRE.ByteString does
-    rangeToText r = UText.takeWord16 (end16Pos - start16Pos) $
-      UText.dropWord16 start16Pos rawInput
-      where
-      start16Pos = tDropToUtf16Drop UArray.! startPos
-      end16Pos = tDropToUtf16Drop UArray.! endPos
-      (startPos, endPos) = uncurry translateRange r
-    -- from utf8 offset and length to Text character start and end position
-    translateRange :: Int -> Int -> (Int, Int)
-    translateRange !bsStart !bsLen = startPos `seq` endPos `seq` res
-      where
-      res = (startPos, endPos)
-      realBsStart = utf8Position + bsStart
-      realBsEnd = realBsStart + bsLen
-      startPos = bsDropToTDrop UArray.! realBsStart
-      endPos = bsDropToTDrop UArray.! realBsEnd

 lookupItem :: Document -> PatternItem -> Stash -> Int -> Duckling [Node]
 lookupItem s (Regex re) _ position =
-  filter (\node -> isRangeValid s (nodeRange node) &&
-                   isPositionValid position s node) <$>
+  filter (\node@Node { nodeRange = Range start end } ->
+            Document.isRangeValid s start end &&
+              isPositionValid position s node) <$>
  lookupRegex re position s
 lookupItem s (Predicate p) stash position =
  return $
@ -153,7 +102,7 @@ lookupItem s (Predicate p) stash position =

 isPositionValid :: Int -> Document -> Node -> Bool
 isPositionValid position sentence (Node {nodeRange = Range start _}) =
-  position == 0 || isAdjacent position start sentence
+  position == 0 || Document.isAdjacent sentence position start

 -- | A match is full if its rule pattern is empty.
 -- (rule, endPosition, reversedRoute)
--- a/Duckling/Types.hs
+++ b/Duckling/Types.hs
@ -5,7 +5,7 @@
 -- LICENSE file in the root directory of this source tree. An additional grant
 -- of patent rights can be found in the PATENTS file in the same directory.

-{-# LANGUAGE BangPatterns #-}
+
 {-# LANGUAGE DeriveAnyClass #-}
 {-# LANGUAGE DeriveGeneric #-}
 {-# LANGUAGE ExistentialQuantification #-}
@ -13,29 +13,19 @@
 {-# LANGUAGE GADTs #-}
 {-# LANGUAGE NoRebindableSyntax #-}
 {-# LANGUAGE RankNTypes #-}
-{-# LANGUAGE RecordWildCards #-}
 {-# LANGUAGE StandaloneDeriving #-}

 module Duckling.Types where

 import Control.DeepSeq
-import qualified Data.Array.Unboxed as Array
-import Data.Array.Unboxed (UArray, IArray)
 import Data.Aeson
-import Data.ByteString (ByteString)
-import qualified Data.ByteString as BS
 import qualified Data.ByteString.Lazy as LB
 import Data.GADT.Compare
 import Data.Hashable
 import Data.Maybe
 import Data.String
 import Data.Text (Text)
-import Data.List (scanl', foldl', foldr)
-import Data.Vector.Unboxed (Vector)
 import qualified Data.Text.Encoding as Text
-import qualified Data.Text as Text
-import qualified Data.Text.Internal.Unsafe.Char as UText
-import qualified Data.Vector.Unboxed as Vector
 import Data.Typeable ((:~:)(Refl), Typeable)
 import GHC.Generics
 import Prelude
@ -45,97 +35,6 @@ import qualified Text.Regex.PCRE as PCRE
 import Duckling.Dimensions.Types
 import Duckling.Resolve

-data Document = Document
-  { rawInput :: !Text
-  , utf8Encoded :: ByteString
-  , indexable :: UArray Int Char -- for O(1) indexing pos -> Char
-  , firstNonAdjacent :: UArray Int Int
-    -- for a given index 'i' it keeps a first index 'j' greater or equal 'i'
-    -- such that isAdjacentSeparator (indexable ! j) == False
-    -- eg. " a document " :: Document
-    --     firstNonAdjacent = [1,1,3,3,4,5,6,7,8,9,10,12]
-    -- Note that in this case 12 is the length of the vector, hence not a
-    -- valid index inside the array, this is intentional.
-  , tDropToBSDrop :: UArray Int Int
-    -- how many bytes to BS.drop from a utf8 encoded ByteString to
-    -- reach the same position as Text.drop would
-  , bsDropToTDrop :: UArray Int Int
-    -- the inverse of tDropToBSDrop, rounds down for bytes that are
-    -- not on character boundary
-    -- for "żółty" :: Document
-    --   tDropToBSDrop = [0,2,4,6,7,8]
-    --   bsDropToTDrop = [0,1,1,2,2,3,3,4,5]
-    --   tDropToUtf16Drop = [0,1,2,3,4,5]
-  , tDropToUtf16Drop :: UArray Int Int
-    -- translate Text.drop to Data.Text.Unsafe.dropWord16
-  } deriving (Show)
-
-{-
-Note [Regular expressions and Text]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Text is UTF-16 encoded internally and PCRE operates on UTF-8 encoded
-ByteStrings. Because we do a lot of regexp matching on the same Text,
-it pays off to cache UTF-8 the encoded ByteString. That's the utf8Ecoded
-field in Document.
-
-Moreover we do regexp matching with capture, where the captured groups
-are returned as ByteString and we want them as Text. But all of the
-captured groups are just a substrings of the original Text.
-Fortunately PCRE has an API that returns a MatchArray - a structure with
-just the ByteString indices and ByteString lengths of the matched fragments.
-If we play with indices right we can translate them to offsets into the
-original Text, share the underlying Text buffer and avoid all of
-UTF-8 and UTF-16 encoding and new ByteString and Text allocation.
-}
-
-instance IsString Document where
-  fromString = mkDocument . fromString
-
-mkDocument :: Text -> Document
-mkDocument rawInput = Document{..}
-  where
-  utf8Encoded = Text.encodeUtf8 rawInput
-  rawInputLength = Text.length rawInput
-  unpacked = Text.unpack rawInput
-  indexable = Array.listArray (0, rawInputLength - 1) unpacked
-  firstNonAdjacent = Array.listArray (0, rawInputLength - 1) $ snd $
-   foldr gen (rawInputLength, []) $ zip [0..] unpacked
-  -- go from the end keeping track of the first nonAdjacent (best)
-  gen (ix, elem) (best, !acc)
-    | isAdjacentSeparator elem = (best, best:acc)
-    | otherwise = (ix, ix:acc)
-  tDropToBSDropList = scanl' (\acc a -> acc + utf8CharWidth a) 0 unpacked
-  tDropToBSDrop = Array.listArray (0, rawInputLength) tDropToBSDropList
-  tDropToUtf16Drop = Array.listArray (0, rawInputLength) $
-    scanl' (\acc a -> acc + utf16CharWidth a) 0 unpacked
-  bsDropToTDrop = Array.listArray (0, BS.length utf8Encoded) $
-    reverse $ snd $ foldl' fun (-1, []) $ zip [0..] tDropToBSDropList
-  fun (lastPos, !acc) (ix, elem) = (elem, replicate (elem - lastPos) ix ++ acc)
-  utf8CharWidth c
-    | w <= 0x7F = 1
-    | w <= 0x7FF = 2
-    | w <= 0xFFFF = 3
-    | otherwise = 4
-    where
-    w = UText.ord c
-  utf16CharWidth c
-    | w < 0x10000 = 1
-    | otherwise = 2
-    where
-    w = UText.ord c
-
-
-- True iff a is followed by whitespaces and b.
-isAdjacent :: Int -> Int -> Document -> Bool
-isAdjacent a b Document{..} =
-  b >= a && (firstNonAdjacent Array.! a >= b)
-
-isAdjacentSeparator :: Char -> Bool
-isAdjacentSeparator c = elem c [' ', '\t', '-']
-
-arraySize :: IArray UArray a => UArray Int a -> Int
-arraySize = Array.rangeSize . Array.bounds
-
 -- -----------------------------------------------------------------
 -- Token

--- a/Duckling/Types/Document.hs
+++ b/Duckling/Types/Document.hs
@ -0,0 +1,199 @@
+-- Copyright (c) 2016-present, Facebook, Inc.
+-- All rights reserved.
+--
+-- This source code is licensed under the BSD-style license found in the
+-- LICENSE file in the root directory of this source tree. An additional grant
+-- of patent rights can be found in the PATENTS file in the same directory.
+
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE RecordWildCards #-}
+
+module Duckling.Types.Document
+  ( Document -- abstract
+  , fromText
+  , (!)
+  , length
+  , byteStringFromPos
+  , isAdjacent
+  , isRangeValid
+  ) where
+
+import qualified Data.Array.Unboxed as Array
+import Data.Array.Unboxed (UArray)
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as BS
+import qualified Data.Char as Char
+import Data.List (scanl', foldl', foldr)
+import Data.String
+import Data.Text (Text)
+import qualified Data.Text.Unsafe as UText
+import qualified Data.Text.Encoding as Text
+import qualified Data.Text as Text
+import qualified Data.Text.Internal.Unsafe.Char as UText
+import Prelude hiding (length)
+
+
+data Document = Document
+  { rawInput :: !Text
+  , utf8Encoded :: ByteString
+  , indexable :: UArray Int Char -- for O(1) indexing pos -> Char
+  , firstNonAdjacent :: UArray Int Int
+    -- for a given index 'i' it keeps a first index 'j' greater or equal 'i'
+    -- such that isAdjacentSeparator (indexable ! j) == False
+    -- eg. " a document " :: Document
+    --     firstNonAdjacent = [1,1,3,3,4,5,6,7,8,9,10,12]
+    -- Note that in this case 12 is the length of the vector, hence not a
+    -- valid index inside the array, this is intentional.
+  , tDropToBSDrop :: UArray Int Int
+    -- how many bytes to BS.drop from a utf8 encoded ByteString to
+    -- reach the same position as Text.drop would
+  , bsDropToTDrop :: UArray Int Int
+    -- the inverse of tDropToBSDrop, rounds down for bytes that are
+    -- not on character boundary
+    -- for "żółty" :: Document
+    --   tDropToBSDrop = [0,2,4,6,7,8]
+    --   bsDropToTDrop = [0,1,1,2,2,3,3,4,5]
+    --   tDropToUtf16Drop = [0,1,2,3,4,5]
+  , tDropToUtf16Drop :: UArray Int Int
+    -- translate Text.drop to Data.Text.Unsafe.dropWord16
+  } deriving (Show)
+
+{-
+Note [Regular expressions and Text]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Text is UTF-16 encoded internally and PCRE operates on UTF-8 encoded
+ByteStrings. Because we do a lot of regexp matching on the same Text,
+it pays off to cache UTF-8 the encoded ByteString. That's the utf8Ecoded
+field in Document.
+
+Moreover we do regexp matching with capture, where the captured groups
+are returned as ByteString and we want them as Text. But all of the
+captured groups are just a substrings of the original Text.
+Fortunately PCRE has an API that returns a MatchArray - a structure with
+just the ByteString indices and ByteString lengths of the matched fragments.
+If we play with indices right we can translate them to offsets into the
+original Text, share the underlying Text buffer and avoid all of
+UTF-8 and UTF-16 encoding and new ByteString and Text allocation.
+-}
+
+instance IsString Document where
+  fromString = fromText . fromString
+
+fromText :: Text -> Document
+fromText rawInput = Document{..}
+  where
+  utf8Encoded = Text.encodeUtf8 rawInput
+  rawInputLength = Text.length rawInput
+  unpacked = Text.unpack rawInput
+  indexable = Array.listArray (0, rawInputLength - 1) unpacked
+  firstNonAdjacent = Array.listArray (0, rawInputLength - 1) $ snd $
+   foldr gen (rawInputLength, []) $ zip [0..] unpacked
+  -- go from the end keeping track of the first nonAdjacent (best)
+  gen (ix, elem) (best, !acc)
+    | isAdjacentSeparator elem = (best, best:acc)
+    | otherwise = (ix, ix:acc)
+  tDropToBSDropList = scanl' (\acc a -> acc + utf8CharWidth a) 0 unpacked
+  tDropToBSDrop = Array.listArray (0, rawInputLength) tDropToBSDropList
+  tDropToUtf16Drop = Array.listArray (0, rawInputLength) $
+    scanl' (\acc a -> acc + utf16CharWidth a) 0 unpacked
+  bsDropToTDrop = Array.listArray (0, BS.length utf8Encoded) $
+    reverse $ snd $ foldl' fun (-1, []) $ zip [0..] tDropToBSDropList
+  fun (lastPos, !acc) (ix, elem) = (elem, replicate (elem - lastPos) ix ++ acc)
+  utf8CharWidth c
+    | w <= 0x7F = 1
+    | w <= 0x7FF = 2
+    | w <= 0xFFFF = 3
+    | otherwise = 4
+    where
+    w = UText.ord c
+  utf16CharWidth c
+    | w < 0x10000 = 1
+    | otherwise = 2
+    where
+    w = UText.ord c
+
+-- As regexes are matched without whitespace delimitator, we need to check
+-- the reasonability of the match to actually be a word.
+isRangeValid :: Document -> Int -> Int -> Bool
+isRangeValid doc start end =
+  (start == 0 ||
+      isDifferent (doc ! (start - 1)) (doc ! start)) &&
+  (end == length doc ||
+      isDifferent (doc ! (end - 1)) (doc ! end))
+  where
+    charClass :: Char -> Char
+    charClass c
+      | Char.isLower c = 'l'
+      | Char.isUpper c = 'u'
+      | Char.isDigit c = 'd'
+      | otherwise = c
+    isDifferent :: Char -> Char -> Bool
+    isDifferent a b = charClass a /= charClass b
+
+-- True iff a is followed by whitespaces and b.
+isAdjacent :: Document -> Int -> Int -> Bool
+isAdjacent Document{..} a b =
+  b >= a && (firstNonAdjacent Array.! a >= b)
+
+isAdjacentSeparator :: Char -> Bool
+isAdjacentSeparator c = elem c [' ', '\t', '-']
+
+(!) :: Document -> Int -> Char
+(!) Document { indexable = s } ix = s Array.! ix
+
+length :: Document -> Int
+length Document { indexable = s } = Array.rangeSize $ Array.bounds s
+
+-- | Given a document and an offset (think Text.drop offset),
+-- returns a utf8 encoded substring of Document at that offset
+-- and 2 translation functions:
+--   rangeToText - given a range in the returned ByteString, gives
+--     a corresponding subrange of the Document as Text
+--   translateRange - given a start and a length of a range in the returned
+--     ByteString, gives a corresponding subrange in the Document as pair
+--     of (start, end) of Text.drop offsets
+{-# INLINE byteStringFromPos #-}
+-- if we don't inline we seem to pay for the tuple, there might be
+-- an easier way
+byteStringFromPos
+  :: Document
+  -> Int
+  -> ( ByteString
+     , (Int, Int) -> Text
+     , Int -> Int -> (Int, Int)
+     )
+byteStringFromPos
+  Document { rawInput = rawInput
+           , utf8Encoded = utf8Encoded
+           , tDropToBSDrop = tDropToBSDrop
+           , bsDropToTDrop = bsDropToTDrop
+           , tDropToUtf16Drop = tDropToUtf16Drop
+           }
+  position = (substring, rangeToText, translateRange)
+  where
+  -- See Note [Regular expressions and Text] to understand what's going
+  -- on here
+  utf8Position = tDropToBSDrop Array.! position
+  substring :: ByteString
+  substring = BS.drop utf8Position utf8Encoded
+  -- get a subrange of Text reusing the underlying buffer using
+  -- utf16 start and end positions
+  rangeToText :: (Int, Int) -> Text
+  rangeToText (-1, _) = ""
+  -- this is what regexec from Text.Regex.PCRE.ByteString does
+  rangeToText r = UText.takeWord16 (end16Pos - start16Pos) $
+    UText.dropWord16 start16Pos rawInput
+    where
+    start16Pos = tDropToUtf16Drop Array.! startPos
+    end16Pos = tDropToUtf16Drop Array.! endPos
+    (startPos, endPos) = uncurry translateRange r
+  -- from utf8 offset and length to Text character start and end position
+  translateRange :: Int -> Int -> (Int, Int)
+  translateRange !bsStart !bsLen = startPos `seq` endPos `seq` res
+    where
+    res = (startPos, endPos)
+    realBsStart = utf8Position + bsStart
+    realBsEnd = realBsStart + bsLen
+    startPos = bsDropToTDrop Array.! realBsStart
+    endPos = bsDropToTDrop Array.! realBsEnd
--- a/duckling.cabal
+++ b/duckling.cabal
@ -31,6 +31,7 @@ library
                     , Duckling.Resolve
                     , Duckling.Stash
                     , Duckling.Types
+                     , Duckling.Types.Document

  -- ------------------------------------------------------------------
  -- Rules