Avoid allocations and encoding in regexp matching

Summary: The rationale is explained in a new Note. Reviewed By: patapizza Differential Revision: D4884104 fbshipit-source-id: 81f36ee
2024-09-11 21:27:13 +03:00 · 2017-04-14 12:05:33 -07:00 · 2017-04-14 12:05:33 -07:00 · e7aeef5436
commit e7aeef5436
parent 3d18cf5ea9
4 changed files with 136 additions and 46 deletions
--- a/Duckling/Engine.hs
+++ b/Duckling/Engine.hs
@ -10,6 +10,7 @@
 {-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE NoRebindableSyntax #-}
 {-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE BangPatterns #-}

 module Duckling.Engine
  ( parseAndResolve
@ -20,21 +21,25 @@ module Duckling.Engine
 import Control.DeepSeq
 import Control.Monad.Extra
 import Data.Aeson
+import qualified Data.Array as Array
+import qualified Data.Array.Unboxed as UArray
 import Data.ByteString (ByteString)
+import qualified Data.ByteString as BS
 import qualified Data.Char as Char
 import Data.Functor.Identity
 import Data.Maybe
 import Data.Text (Text)
 import qualified Data.Text as Text
 import qualified Data.Text.Encoding as Text
+import qualified Data.Text.Unsafe as UText
 import qualified Data.List as L
-import qualified Data.Vector.Primitive as Vector
+import qualified Data.Vector.Unboxed as Vector
 import Prelude
 import qualified Text.Regex.Base as R
 import qualified Text.Regex.PCRE as PCRE

 import Duckling.Dimensions.Types
-import Duckling.Engine.Regex
+import qualified Duckling.Engine.Regex as Regex
 import Duckling.Regex.Types
 import Duckling.Resolve
 import Duckling.Types
@ -72,9 +77,9 @@ produce (Rule name _ production, _, etuor@(Node {nodeRange = Range _ e}:_)) = do
 -- the reasonability of the match to actually be a word.
 isRangeValid :: Document -> Range -> Bool
 isRangeValid Document { indexable = s } (Range start end) =
-  (start == 0 || isDifferent (s Vector.! (start - 1)) (s Vector.! start)) &&
-  (end == Vector.length s ||
-      isDifferent (s Vector.! (end - 1)) (s Vector.! end))
+  (start == 0 || isDifferent (s UArray.! (start - 1)) (s UArray.! start)) &&
+  (end == arraySize s ||
+      isDifferent (s UArray.! (end - 1)) (s UArray.! end))
  where
    charClass :: Char -> Char
    charClass c
@ -86,26 +91,54 @@ isRangeValid Document { indexable = s } (Range start end) =
    isDifferent a b = charClass a /= charClass b

 lookupRegex :: PCRE.Regex -> Int -> Document -> Duckling [Node]
-lookupRegex regex position Document { rawInput = s } =
-  return nodes
+lookupRegex _regex position Document{ indexable = indexable }
+  | position >= arraySize indexable = return []
+lookupRegex regex position
+  Document { rawInput = rawInput
+           , utf8Encoded = utf8Encoded
+           , tDropToBSDrop = tDropToBSDrop
+           , bsDropToTDrop = bsDropToTDrop
+           , tDropToUtf16Drop = tDropToUtf16Drop
+           } = return nodes
  where
-    ss = Text.drop position s
-    (nodes, _, _) = L.foldl' f ([], ss, position) $ match regex ss
-    f (nodes, s, offset) [] = (reverse nodes, s, offset)
-    f (nodes, s, offset) ("":_) = (reverse nodes, s, offset)
-    f (nodes, s, offset) (text:group) = (node:nodes, s', newOffset)
+    -- See Note [Regular expressions and Text] to understand what's going
+    -- on here
+    utf8Position = tDropToBSDrop UArray.! position
+    substring :: ByteString
+    substring = BS.drop utf8Position utf8Encoded
+    nodes = L.foldl' f [] $ map Array.elems $ Regex.matchAll regex substring
+    f :: [Node] -> [(Int, Int)] -> [Node]
+    f nodes [] = nodes
+    f nodes ((0,0):_) = nodes
+    f nodes ((bsStart, bsLen):groups) = node:nodes
      where
-        (x,xs) = Text.breakOn text s
-        m = offset + Text.length x
-        n = Text.length text
-        s' = Text.drop n xs
-        newOffset = m + n
-        node = Node
-          { nodeRange = Range m newOffset
-          , token = Token RegexMatch (GroupMatch group)
-          , children = []
-          , rule = Nothing
-          }
+      textGroups = map rangeToText groups
+      node = Node
+        { nodeRange = uncurry Range $ translateRange bsStart bsLen
+        , token = Token RegexMatch (GroupMatch textGroups)
+        , children = []
+        , rule = Nothing
+        }
+    -- get a subrange of Text reusing the underlying buffer using
+    -- utf16 start and end positions
+    rangeToText :: (Int, Int) -> Text
+    rangeToText (-1, _) = ""
+    -- this is what regexec from Text.Regex.PCRE.ByteString does
+    rangeToText r = UText.takeWord16 (end16Pos - start16Pos) $
+      UText.dropWord16 start16Pos rawInput
+      where
+      start16Pos = tDropToUtf16Drop UArray.! startPos
+      end16Pos = tDropToUtf16Drop UArray.! endPos
+      (startPos, endPos) = uncurry translateRange r
+    -- from utf8 offset and length to Text character start and end position
+    translateRange :: Int -> Int -> (Int, Int)
+    translateRange !bsStart !bsLen = startPos `seq` endPos `seq` res
+      where
+      res = (startPos, endPos)
+      realBsStart = utf8Position + bsStart
+      realBsEnd = realBsStart + bsLen
+      startPos = bsDropToTDrop UArray.! realBsStart
+      endPos = bsDropToTDrop UArray.! realBsEnd

 lookupItem :: Document -> PatternItem -> Stash -> Int -> Duckling [Node]
 lookupItem s (Regex re) _ position =
--- a/Duckling/Engine/Regex.hs
+++ b/Duckling/Engine/Regex.hs
@ -7,18 +7,7 @@


 module Duckling.Engine.Regex
-  ( match
+  ( matchAll
  ) where

-import Data.ByteString (ByteString)
-import Data.Text (Text)
-import qualified Data.Text.Encoding as Text
-import Prelude
-import qualified Text.Regex.Base as R
-import qualified Text.Regex.PCRE as PCRE
-
-match :: PCRE.Regex -> Text -> [[Text]]
-match regex s = map (map Text.decodeUtf8) bss
-  where
-    bss :: [[ByteString]]
-    bss = R.match regex $ Text.encodeUtf8 s
+import Text.Regex.Base (matchAll)
--- a/Duckling/Types.hs
+++ b/Duckling/Types.hs
@ -5,7 +5,7 @@
 -- LICENSE file in the root directory of this source tree. An additional grant
 -- of patent rights can be found in the PATENTS file in the same directory.

-
+{-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE DeriveAnyClass #-}
 {-# LANGUAGE DeriveGeneric #-}
 {-# LANGUAGE ExistentialQuantification #-}
@ -19,17 +19,23 @@
 module Duckling.Types where

 import Control.DeepSeq
+import qualified Data.Array.Unboxed as Array
+import Data.Array.Unboxed (UArray, IArray)
 import Data.Aeson
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as BS
 import qualified Data.ByteString.Lazy as LB
 import Data.GADT.Compare
 import Data.Hashable
 import Data.Maybe
 import Data.String
 import Data.Text (Text)
-import Data.Vector.Primitive (Vector)
+import Data.List (scanl', foldl', foldr)
+import Data.Vector.Unboxed (Vector)
 import qualified Data.Text.Encoding as Text
 import qualified Data.Text as Text
-import qualified Data.Vector.Primitive as Vector
+import qualified Data.Text.Internal.Unsafe.Char as UText
+import qualified Data.Vector.Unboxed as Vector
 import Data.Typeable ((:~:)(Refl), Typeable)
 import GHC.Generics
 import Prelude
@ -41,37 +47,98 @@ import Duckling.Resolve

 data Document = Document
  { rawInput :: !Text
-  , indexable :: !(Vector Char) -- for O(1) indexing pos -> Char
-  , firstNonAdjacent :: !(Vector Int)
+  , utf8Encoded :: ByteString
+  , indexable :: UArray Int Char -- for O(1) indexing pos -> Char
+  , firstNonAdjacent :: UArray Int Int
    -- for a given index 'i' it keeps a first index 'j' greater or equal 'i'
    -- such that isAdjacentSeparator (indexable ! j) == False
    -- eg. " a document " :: Document
    --     firstNonAdjacent = [1,1,3,3,4,5,6,7,8,9,10,12]
    -- Note that in this case 12 is the length of the vector, hence not a
    -- valid index inside the array, this is intentional.
+  , tDropToBSDrop :: UArray Int Int
+    -- how many bytes to BS.drop from a utf8 encoded ByteString to
+    -- reach the same position as Text.drop would
+  , bsDropToTDrop :: UArray Int Int
+    -- the inverse of tDropToBSDrop, rounds down for bytes that are
+    -- not on character boundary
+    -- for "żółty" :: Document
+    --   tDropToBSDrop = [0,2,4,6,7,8]
+    --   bsDropToTDrop = [0,1,1,2,2,3,3,4,5]
+    --   tDropToUtf16Drop = [0,1,2,3,4,5]
+  , tDropToUtf16Drop :: UArray Int Int
+    -- translate Text.drop to Data.Text.Unsafe.dropWord16
  } deriving (Show)

+{-
+Note [Regular expressions and Text]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Text is UTF-16 encoded internally and PCRE operates on UTF-8 encoded
+ByteStrings. Because we do a lot of regexp matching on the same Text,
+it pays off to cache UTF-8 the encoded ByteString. That's the utf8Ecoded
+field in Document.
+
+Moreover we do regexp matching with capture, where the captured groups
+are returned as ByteString and we want them as Text. But all of the
+captured groups are just a substrings of the original Text.
+Fortunately PCRE has an API that returns a MatchArray - a structure with
+just the ByteString indices and ByteString lengths of the matched fragments.
+If we play with indices right we can translate them to offsets into the
+original Text, share the underlying Text buffer and avoid all of
+UTF-8 and UTF-16 encoding and new ByteString and Text allocation.
+-}
+
 instance IsString Document where
  fromString = mkDocument . fromString

 mkDocument :: Text -> Document
-mkDocument s = Document s v fna
+mkDocument rawInput = Document{..}
  where
-  v = Vector.fromList $ Text.unpack s
-  fna = Vector.fromList $ snd $ Vector.ifoldr' gen (Vector.length v, []) v
+  utf8Encoded = Text.encodeUtf8 rawInput
+  rawInputLength = Text.length rawInput
+  unpacked = Text.unpack rawInput
+  indexable = Array.listArray (0, rawInputLength) unpacked
+  firstNonAdjacent = Array.listArray (0, rawInputLength) $ snd $
+   foldr gen (rawInputLength, []) $ zip [0..] unpacked
  -- go from the end keeping track of the first nonAdjacent (best)
-  gen ix elem (best, acc)
+  gen (ix, elem) (best, !acc)
    | isAdjacentSeparator elem = (best, best:acc)
    | otherwise = (ix, ix:acc)
+  tDropToBSDropList = scanl' (\acc a -> acc + utf8CharWidth a) 0 unpacked
+  tDropToBSDrop = Array.listArray (0, rawInputLength + 1) tDropToBSDropList
+  tDropToUtf16Drop = Array.listArray (0, rawInputLength + 1) $
+    scanl' (\acc a -> acc + utf16CharWidth a) 0 unpacked
+  bsDropToTDrop = Array.listArray (0, BS.length utf8Encoded + 1) $
+    reverse $ snd $ foldl' fun (-1, []) $ zip [0..] tDropToBSDropList
+  fun (lastPos, !acc) (ix, elem) = (elem, replicate (elem - lastPos) ix ++ acc)
+  -- copied from Data.Text.Encoding
+  utf8CharWidth c
+    | w <= 0x7F = 1
+    | w <= 0x7FF = 2
+    | 0xD800 <= w && w <= 0xDBFF = 4
+    | otherwise = 3
+    where
+    w = UText.ord c
+  utf16CharWidth c
+    | w < 0x10000 = 1
+    | otherwise = 2
+    where
+    w = UText.ord c
+

 -- True iff a is followed by whitespaces and b.
 isAdjacent :: Int -> Int -> Document -> Bool
 isAdjacent a b Document{..} =
-  b >= a && (firstNonAdjacent Vector.! a >= b)
+  b >= a && (firstNonAdjacent Array.! a >= b)

 isAdjacentSeparator :: Char -> Bool
 isAdjacentSeparator c = elem c [' ', '\t', '-']

+arraySize :: IArray UArray a => UArray Int a -> Int
+arraySize arr = r - l
+  where
+  (l, r) = Array.bounds arr
+
 -- -----------------------------------------------------------------
 -- Token

--- a/duckling.cabal
+++ b/duckling.cabal
@ -432,6 +432,7 @@ library
                     , Duckling.Volume.Rules
                     , Duckling.Volume.Types
  build-depends:       base                  >= 4.8.2 && < 5.0
+                     , array                 >= 0.5.1.1 && < 0.6
                     , attoparsec            >= 0.13.1.0 && < 0.14
                     , aeson                 >= 0.11.3.0 && < 1.1
                     , bytestring            >= 0.10.6.0 && < 0.11