Avoid allocations and encoding in regexp matching

Summary: The rationale is explained in a new Note.

Reviewed By: patapizza

Differential Revision: D4884104

fbshipit-source-id: 81f36ee
This commit is contained in:
Bartosz Nitka 2017-04-14 12:05:33 -07:00 committed by Facebook Github Bot
parent 3d18cf5ea9
commit e7aeef5436
4 changed files with 136 additions and 46 deletions

View File

@ -10,6 +10,7 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE NoRebindableSyntax #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns #-}
module Duckling.Engine
( parseAndResolve
@ -20,21 +21,25 @@ module Duckling.Engine
import Control.DeepSeq
import Control.Monad.Extra
import Data.Aeson
import qualified Data.Array as Array
import qualified Data.Array.Unboxed as UArray
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import qualified Data.Char as Char
import Data.Functor.Identity
import Data.Maybe
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Text
import qualified Data.Text.Unsafe as UText
import qualified Data.List as L
import qualified Data.Vector.Primitive as Vector
import qualified Data.Vector.Unboxed as Vector
import Prelude
import qualified Text.Regex.Base as R
import qualified Text.Regex.PCRE as PCRE
import Duckling.Dimensions.Types
import Duckling.Engine.Regex
import qualified Duckling.Engine.Regex as Regex
import Duckling.Regex.Types
import Duckling.Resolve
import Duckling.Types
@ -72,9 +77,9 @@ produce (Rule name _ production, _, etuor@(Node {nodeRange = Range _ e}:_)) = do
-- the reasonability of the match to actually be a word.
isRangeValid :: Document -> Range -> Bool
isRangeValid Document { indexable = s } (Range start end) =
(start == 0 || isDifferent (s Vector.! (start - 1)) (s Vector.! start)) &&
(end == Vector.length s ||
isDifferent (s Vector.! (end - 1)) (s Vector.! end))
(start == 0 || isDifferent (s UArray.! (start - 1)) (s UArray.! start)) &&
(end == arraySize s ||
isDifferent (s UArray.! (end - 1)) (s UArray.! end))
where
charClass :: Char -> Char
charClass c
@ -86,26 +91,54 @@ isRangeValid Document { indexable = s } (Range start end) =
isDifferent a b = charClass a /= charClass b
lookupRegex :: PCRE.Regex -> Int -> Document -> Duckling [Node]
lookupRegex regex position Document { rawInput = s } =
return nodes
lookupRegex _regex position Document{ indexable = indexable }
| position >= arraySize indexable = return []
lookupRegex regex position
Document { rawInput = rawInput
, utf8Encoded = utf8Encoded
, tDropToBSDrop = tDropToBSDrop
, bsDropToTDrop = bsDropToTDrop
, tDropToUtf16Drop = tDropToUtf16Drop
} = return nodes
where
ss = Text.drop position s
(nodes, _, _) = L.foldl' f ([], ss, position) $ match regex ss
f (nodes, s, offset) [] = (reverse nodes, s, offset)
f (nodes, s, offset) ("":_) = (reverse nodes, s, offset)
f (nodes, s, offset) (text:group) = (node:nodes, s', newOffset)
-- See Note [Regular expressions and Text] to understand what's going
-- on here
utf8Position = tDropToBSDrop UArray.! position
substring :: ByteString
substring = BS.drop utf8Position utf8Encoded
nodes = L.foldl' f [] $ map Array.elems $ Regex.matchAll regex substring
f :: [Node] -> [(Int, Int)] -> [Node]
f nodes [] = nodes
f nodes ((0,0):_) = nodes
f nodes ((bsStart, bsLen):groups) = node:nodes
where
(x,xs) = Text.breakOn text s
m = offset + Text.length x
n = Text.length text
s' = Text.drop n xs
newOffset = m + n
node = Node
{ nodeRange = Range m newOffset
, token = Token RegexMatch (GroupMatch group)
, children = []
, rule = Nothing
}
textGroups = map rangeToText groups
node = Node
{ nodeRange = uncurry Range $ translateRange bsStart bsLen
, token = Token RegexMatch (GroupMatch textGroups)
, children = []
, rule = Nothing
}
-- get a subrange of Text reusing the underlying buffer using
-- utf16 start and end positions
rangeToText :: (Int, Int) -> Text
rangeToText (-1, _) = ""
-- this is what regexec from Text.Regex.PCRE.ByteString does
rangeToText r = UText.takeWord16 (end16Pos - start16Pos) $
UText.dropWord16 start16Pos rawInput
where
start16Pos = tDropToUtf16Drop UArray.! startPos
end16Pos = tDropToUtf16Drop UArray.! endPos
(startPos, endPos) = uncurry translateRange r
-- from utf8 offset and length to Text character start and end position
translateRange :: Int -> Int -> (Int, Int)
translateRange !bsStart !bsLen = startPos `seq` endPos `seq` res
where
res = (startPos, endPos)
realBsStart = utf8Position + bsStart
realBsEnd = realBsStart + bsLen
startPos = bsDropToTDrop UArray.! realBsStart
endPos = bsDropToTDrop UArray.! realBsEnd
lookupItem :: Document -> PatternItem -> Stash -> Int -> Duckling [Node]
lookupItem s (Regex re) _ position =

View File

@ -7,18 +7,7 @@
module Duckling.Engine.Regex
( match
( matchAll
) where
import Data.ByteString (ByteString)
import Data.Text (Text)
import qualified Data.Text.Encoding as Text
import Prelude
import qualified Text.Regex.Base as R
import qualified Text.Regex.PCRE as PCRE
match :: PCRE.Regex -> Text -> [[Text]]
match regex s = map (map Text.decodeUtf8) bss
where
bss :: [[ByteString]]
bss = R.match regex $ Text.encodeUtf8 s
import Text.Regex.Base (matchAll)

View File

@ -5,7 +5,7 @@
-- LICENSE file in the root directory of this source tree. An additional grant
-- of patent rights can be found in the PATENTS file in the same directory.
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE ExistentialQuantification #-}
@ -19,17 +19,23 @@
module Duckling.Types where
import Control.DeepSeq
import qualified Data.Array.Unboxed as Array
import Data.Array.Unboxed (UArray, IArray)
import Data.Aeson
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LB
import Data.GADT.Compare
import Data.Hashable
import Data.Maybe
import Data.String
import Data.Text (Text)
import Data.Vector.Primitive (Vector)
import Data.List (scanl', foldl', foldr)
import Data.Vector.Unboxed (Vector)
import qualified Data.Text.Encoding as Text
import qualified Data.Text as Text
import qualified Data.Vector.Primitive as Vector
import qualified Data.Text.Internal.Unsafe.Char as UText
import qualified Data.Vector.Unboxed as Vector
import Data.Typeable ((:~:)(Refl), Typeable)
import GHC.Generics
import Prelude
@ -41,37 +47,98 @@ import Duckling.Resolve
data Document = Document
{ rawInput :: !Text
, indexable :: !(Vector Char) -- for O(1) indexing pos -> Char
, firstNonAdjacent :: !(Vector Int)
, utf8Encoded :: ByteString
, indexable :: UArray Int Char -- for O(1) indexing pos -> Char
, firstNonAdjacent :: UArray Int Int
-- for a given index 'i' it keeps a first index 'j' greater or equal 'i'
-- such that isAdjacentSeparator (indexable ! j) == False
-- eg. " a document " :: Document
-- firstNonAdjacent = [1,1,3,3,4,5,6,7,8,9,10,12]
-- Note that in this case 12 is the length of the vector, hence not a
-- valid index inside the array, this is intentional.
, tDropToBSDrop :: UArray Int Int
-- how many bytes to BS.drop from a utf8 encoded ByteString to
-- reach the same position as Text.drop would
, bsDropToTDrop :: UArray Int Int
-- the inverse of tDropToBSDrop, rounds down for bytes that are
-- not on character boundary
-- for "żółty" :: Document
-- tDropToBSDrop = [0,2,4,6,7,8]
-- bsDropToTDrop = [0,1,1,2,2,3,3,4,5]
-- tDropToUtf16Drop = [0,1,2,3,4,5]
, tDropToUtf16Drop :: UArray Int Int
-- translate Text.drop to Data.Text.Unsafe.dropWord16
} deriving (Show)
{-
Note [Regular expressions and Text]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Text is UTF-16 encoded internally and PCRE operates on UTF-8 encoded
ByteStrings. Because we do a lot of regexp matching on the same Text,
it pays off to cache UTF-8 the encoded ByteString. That's the utf8Ecoded
field in Document.
Moreover we do regexp matching with capture, where the captured groups
are returned as ByteString and we want them as Text. But all of the
captured groups are just a substrings of the original Text.
Fortunately PCRE has an API that returns a MatchArray - a structure with
just the ByteString indices and ByteString lengths of the matched fragments.
If we play with indices right we can translate them to offsets into the
original Text, share the underlying Text buffer and avoid all of
UTF-8 and UTF-16 encoding and new ByteString and Text allocation.
-}
instance IsString Document where
fromString = mkDocument . fromString
mkDocument :: Text -> Document
mkDocument s = Document s v fna
mkDocument rawInput = Document{..}
where
v = Vector.fromList $ Text.unpack s
fna = Vector.fromList $ snd $ Vector.ifoldr' gen (Vector.length v, []) v
utf8Encoded = Text.encodeUtf8 rawInput
rawInputLength = Text.length rawInput
unpacked = Text.unpack rawInput
indexable = Array.listArray (0, rawInputLength) unpacked
firstNonAdjacent = Array.listArray (0, rawInputLength) $ snd $
foldr gen (rawInputLength, []) $ zip [0..] unpacked
-- go from the end keeping track of the first nonAdjacent (best)
gen ix elem (best, acc)
gen (ix, elem) (best, !acc)
| isAdjacentSeparator elem = (best, best:acc)
| otherwise = (ix, ix:acc)
tDropToBSDropList = scanl' (\acc a -> acc + utf8CharWidth a) 0 unpacked
tDropToBSDrop = Array.listArray (0, rawInputLength + 1) tDropToBSDropList
tDropToUtf16Drop = Array.listArray (0, rawInputLength + 1) $
scanl' (\acc a -> acc + utf16CharWidth a) 0 unpacked
bsDropToTDrop = Array.listArray (0, BS.length utf8Encoded + 1) $
reverse $ snd $ foldl' fun (-1, []) $ zip [0..] tDropToBSDropList
fun (lastPos, !acc) (ix, elem) = (elem, replicate (elem - lastPos) ix ++ acc)
-- copied from Data.Text.Encoding
utf8CharWidth c
| w <= 0x7F = 1
| w <= 0x7FF = 2
| 0xD800 <= w && w <= 0xDBFF = 4
| otherwise = 3
where
w = UText.ord c
utf16CharWidth c
| w < 0x10000 = 1
| otherwise = 2
where
w = UText.ord c
-- True iff a is followed by whitespaces and b.
isAdjacent :: Int -> Int -> Document -> Bool
isAdjacent a b Document{..} =
b >= a && (firstNonAdjacent Vector.! a >= b)
b >= a && (firstNonAdjacent Array.! a >= b)
isAdjacentSeparator :: Char -> Bool
isAdjacentSeparator c = elem c [' ', '\t', '-']
arraySize :: IArray UArray a => UArray Int a -> Int
arraySize arr = r - l
where
(l, r) = Array.bounds arr
-- -----------------------------------------------------------------
-- Token

View File

@ -432,6 +432,7 @@ library
, Duckling.Volume.Rules
, Duckling.Volume.Types
build-depends: base >= 4.8.2 && < 5.0
, array >= 0.5.1.1 && < 0.6
, attoparsec >= 0.13.1.0 && < 0.14
, aeson >= 0.11.3.0 && < 1.1
, bytestring >= 0.10.6.0 && < 0.11