mirror of
https://github.com/facebook/duckling.git
synced 2024-09-11 21:27:13 +03:00
Avoid allocations and encoding in regexp matching
Summary: The rationale is explained in a new Note. Reviewed By: patapizza Differential Revision: D4884104 fbshipit-source-id: 81f36ee
This commit is contained in:
parent
3d18cf5ea9
commit
e7aeef5436
@ -10,6 +10,7 @@
|
||||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE NoRebindableSyntax #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
|
||||
module Duckling.Engine
|
||||
( parseAndResolve
|
||||
@ -20,21 +21,25 @@ module Duckling.Engine
|
||||
import Control.DeepSeq
|
||||
import Control.Monad.Extra
|
||||
import Data.Aeson
|
||||
import qualified Data.Array as Array
|
||||
import qualified Data.Array.Unboxed as UArray
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS
|
||||
import qualified Data.Char as Char
|
||||
import Data.Functor.Identity
|
||||
import Data.Maybe
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as Text
|
||||
import qualified Data.Text.Encoding as Text
|
||||
import qualified Data.Text.Unsafe as UText
|
||||
import qualified Data.List as L
|
||||
import qualified Data.Vector.Primitive as Vector
|
||||
import qualified Data.Vector.Unboxed as Vector
|
||||
import Prelude
|
||||
import qualified Text.Regex.Base as R
|
||||
import qualified Text.Regex.PCRE as PCRE
|
||||
|
||||
import Duckling.Dimensions.Types
|
||||
import Duckling.Engine.Regex
|
||||
import qualified Duckling.Engine.Regex as Regex
|
||||
import Duckling.Regex.Types
|
||||
import Duckling.Resolve
|
||||
import Duckling.Types
|
||||
@ -72,9 +77,9 @@ produce (Rule name _ production, _, etuor@(Node {nodeRange = Range _ e}:_)) = do
|
||||
-- the reasonability of the match to actually be a word.
|
||||
isRangeValid :: Document -> Range -> Bool
|
||||
isRangeValid Document { indexable = s } (Range start end) =
|
||||
(start == 0 || isDifferent (s Vector.! (start - 1)) (s Vector.! start)) &&
|
||||
(end == Vector.length s ||
|
||||
isDifferent (s Vector.! (end - 1)) (s Vector.! end))
|
||||
(start == 0 || isDifferent (s UArray.! (start - 1)) (s UArray.! start)) &&
|
||||
(end == arraySize s ||
|
||||
isDifferent (s UArray.! (end - 1)) (s UArray.! end))
|
||||
where
|
||||
charClass :: Char -> Char
|
||||
charClass c
|
||||
@ -86,26 +91,54 @@ isRangeValid Document { indexable = s } (Range start end) =
|
||||
isDifferent a b = charClass a /= charClass b
|
||||
|
||||
lookupRegex :: PCRE.Regex -> Int -> Document -> Duckling [Node]
|
||||
lookupRegex regex position Document { rawInput = s } =
|
||||
return nodes
|
||||
lookupRegex _regex position Document{ indexable = indexable }
|
||||
| position >= arraySize indexable = return []
|
||||
lookupRegex regex position
|
||||
Document { rawInput = rawInput
|
||||
, utf8Encoded = utf8Encoded
|
||||
, tDropToBSDrop = tDropToBSDrop
|
||||
, bsDropToTDrop = bsDropToTDrop
|
||||
, tDropToUtf16Drop = tDropToUtf16Drop
|
||||
} = return nodes
|
||||
where
|
||||
ss = Text.drop position s
|
||||
(nodes, _, _) = L.foldl' f ([], ss, position) $ match regex ss
|
||||
f (nodes, s, offset) [] = (reverse nodes, s, offset)
|
||||
f (nodes, s, offset) ("":_) = (reverse nodes, s, offset)
|
||||
f (nodes, s, offset) (text:group) = (node:nodes, s', newOffset)
|
||||
-- See Note [Regular expressions and Text] to understand what's going
|
||||
-- on here
|
||||
utf8Position = tDropToBSDrop UArray.! position
|
||||
substring :: ByteString
|
||||
substring = BS.drop utf8Position utf8Encoded
|
||||
nodes = L.foldl' f [] $ map Array.elems $ Regex.matchAll regex substring
|
||||
f :: [Node] -> [(Int, Int)] -> [Node]
|
||||
f nodes [] = nodes
|
||||
f nodes ((0,0):_) = nodes
|
||||
f nodes ((bsStart, bsLen):groups) = node:nodes
|
||||
where
|
||||
(x,xs) = Text.breakOn text s
|
||||
m = offset + Text.length x
|
||||
n = Text.length text
|
||||
s' = Text.drop n xs
|
||||
newOffset = m + n
|
||||
node = Node
|
||||
{ nodeRange = Range m newOffset
|
||||
, token = Token RegexMatch (GroupMatch group)
|
||||
, children = []
|
||||
, rule = Nothing
|
||||
}
|
||||
textGroups = map rangeToText groups
|
||||
node = Node
|
||||
{ nodeRange = uncurry Range $ translateRange bsStart bsLen
|
||||
, token = Token RegexMatch (GroupMatch textGroups)
|
||||
, children = []
|
||||
, rule = Nothing
|
||||
}
|
||||
-- get a subrange of Text reusing the underlying buffer using
|
||||
-- utf16 start and end positions
|
||||
rangeToText :: (Int, Int) -> Text
|
||||
rangeToText (-1, _) = ""
|
||||
-- this is what regexec from Text.Regex.PCRE.ByteString does
|
||||
rangeToText r = UText.takeWord16 (end16Pos - start16Pos) $
|
||||
UText.dropWord16 start16Pos rawInput
|
||||
where
|
||||
start16Pos = tDropToUtf16Drop UArray.! startPos
|
||||
end16Pos = tDropToUtf16Drop UArray.! endPos
|
||||
(startPos, endPos) = uncurry translateRange r
|
||||
-- from utf8 offset and length to Text character start and end position
|
||||
translateRange :: Int -> Int -> (Int, Int)
|
||||
translateRange !bsStart !bsLen = startPos `seq` endPos `seq` res
|
||||
where
|
||||
res = (startPos, endPos)
|
||||
realBsStart = utf8Position + bsStart
|
||||
realBsEnd = realBsStart + bsLen
|
||||
startPos = bsDropToTDrop UArray.! realBsStart
|
||||
endPos = bsDropToTDrop UArray.! realBsEnd
|
||||
|
||||
lookupItem :: Document -> PatternItem -> Stash -> Int -> Duckling [Node]
|
||||
lookupItem s (Regex re) _ position =
|
||||
|
@ -7,18 +7,7 @@
|
||||
|
||||
|
||||
module Duckling.Engine.Regex
|
||||
( match
|
||||
( matchAll
|
||||
) where
|
||||
|
||||
import Data.ByteString (ByteString)
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text.Encoding as Text
|
||||
import Prelude
|
||||
import qualified Text.Regex.Base as R
|
||||
import qualified Text.Regex.PCRE as PCRE
|
||||
|
||||
match :: PCRE.Regex -> Text -> [[Text]]
|
||||
match regex s = map (map Text.decodeUtf8) bss
|
||||
where
|
||||
bss :: [[ByteString]]
|
||||
bss = R.match regex $ Text.encodeUtf8 s
|
||||
import Text.Regex.Base (matchAll)
|
||||
|
@ -5,7 +5,7 @@
|
||||
-- LICENSE file in the root directory of this source tree. An additional grant
|
||||
-- of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
{-# LANGUAGE DeriveAnyClass #-}
|
||||
{-# LANGUAGE DeriveGeneric #-}
|
||||
{-# LANGUAGE ExistentialQuantification #-}
|
||||
@ -19,17 +19,23 @@
|
||||
module Duckling.Types where
|
||||
|
||||
import Control.DeepSeq
|
||||
import qualified Data.Array.Unboxed as Array
|
||||
import Data.Array.Unboxed (UArray, IArray)
|
||||
import Data.Aeson
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS
|
||||
import qualified Data.ByteString.Lazy as LB
|
||||
import Data.GADT.Compare
|
||||
import Data.Hashable
|
||||
import Data.Maybe
|
||||
import Data.String
|
||||
import Data.Text (Text)
|
||||
import Data.Vector.Primitive (Vector)
|
||||
import Data.List (scanl', foldl', foldr)
|
||||
import Data.Vector.Unboxed (Vector)
|
||||
import qualified Data.Text.Encoding as Text
|
||||
import qualified Data.Text as Text
|
||||
import qualified Data.Vector.Primitive as Vector
|
||||
import qualified Data.Text.Internal.Unsafe.Char as UText
|
||||
import qualified Data.Vector.Unboxed as Vector
|
||||
import Data.Typeable ((:~:)(Refl), Typeable)
|
||||
import GHC.Generics
|
||||
import Prelude
|
||||
@ -41,37 +47,98 @@ import Duckling.Resolve
|
||||
|
||||
data Document = Document
|
||||
{ rawInput :: !Text
|
||||
, indexable :: !(Vector Char) -- for O(1) indexing pos -> Char
|
||||
, firstNonAdjacent :: !(Vector Int)
|
||||
, utf8Encoded :: ByteString
|
||||
, indexable :: UArray Int Char -- for O(1) indexing pos -> Char
|
||||
, firstNonAdjacent :: UArray Int Int
|
||||
-- for a given index 'i' it keeps a first index 'j' greater or equal 'i'
|
||||
-- such that isAdjacentSeparator (indexable ! j) == False
|
||||
-- eg. " a document " :: Document
|
||||
-- firstNonAdjacent = [1,1,3,3,4,5,6,7,8,9,10,12]
|
||||
-- Note that in this case 12 is the length of the vector, hence not a
|
||||
-- valid index inside the array, this is intentional.
|
||||
, tDropToBSDrop :: UArray Int Int
|
||||
-- how many bytes to BS.drop from a utf8 encoded ByteString to
|
||||
-- reach the same position as Text.drop would
|
||||
, bsDropToTDrop :: UArray Int Int
|
||||
-- the inverse of tDropToBSDrop, rounds down for bytes that are
|
||||
-- not on character boundary
|
||||
-- for "żółty" :: Document
|
||||
-- tDropToBSDrop = [0,2,4,6,7,8]
|
||||
-- bsDropToTDrop = [0,1,1,2,2,3,3,4,5]
|
||||
-- tDropToUtf16Drop = [0,1,2,3,4,5]
|
||||
, tDropToUtf16Drop :: UArray Int Int
|
||||
-- translate Text.drop to Data.Text.Unsafe.dropWord16
|
||||
} deriving (Show)
|
||||
|
||||
{-
|
||||
Note [Regular expressions and Text]
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Text is UTF-16 encoded internally and PCRE operates on UTF-8 encoded
|
||||
ByteStrings. Because we do a lot of regexp matching on the same Text,
|
||||
it pays off to cache UTF-8 the encoded ByteString. That's the utf8Ecoded
|
||||
field in Document.
|
||||
|
||||
Moreover we do regexp matching with capture, where the captured groups
|
||||
are returned as ByteString and we want them as Text. But all of the
|
||||
captured groups are just a substrings of the original Text.
|
||||
Fortunately PCRE has an API that returns a MatchArray - a structure with
|
||||
just the ByteString indices and ByteString lengths of the matched fragments.
|
||||
If we play with indices right we can translate them to offsets into the
|
||||
original Text, share the underlying Text buffer and avoid all of
|
||||
UTF-8 and UTF-16 encoding and new ByteString and Text allocation.
|
||||
-}
|
||||
|
||||
instance IsString Document where
|
||||
fromString = mkDocument . fromString
|
||||
|
||||
mkDocument :: Text -> Document
|
||||
mkDocument s = Document s v fna
|
||||
mkDocument rawInput = Document{..}
|
||||
where
|
||||
v = Vector.fromList $ Text.unpack s
|
||||
fna = Vector.fromList $ snd $ Vector.ifoldr' gen (Vector.length v, []) v
|
||||
utf8Encoded = Text.encodeUtf8 rawInput
|
||||
rawInputLength = Text.length rawInput
|
||||
unpacked = Text.unpack rawInput
|
||||
indexable = Array.listArray (0, rawInputLength) unpacked
|
||||
firstNonAdjacent = Array.listArray (0, rawInputLength) $ snd $
|
||||
foldr gen (rawInputLength, []) $ zip [0..] unpacked
|
||||
-- go from the end keeping track of the first nonAdjacent (best)
|
||||
gen ix elem (best, acc)
|
||||
gen (ix, elem) (best, !acc)
|
||||
| isAdjacentSeparator elem = (best, best:acc)
|
||||
| otherwise = (ix, ix:acc)
|
||||
tDropToBSDropList = scanl' (\acc a -> acc + utf8CharWidth a) 0 unpacked
|
||||
tDropToBSDrop = Array.listArray (0, rawInputLength + 1) tDropToBSDropList
|
||||
tDropToUtf16Drop = Array.listArray (0, rawInputLength + 1) $
|
||||
scanl' (\acc a -> acc + utf16CharWidth a) 0 unpacked
|
||||
bsDropToTDrop = Array.listArray (0, BS.length utf8Encoded + 1) $
|
||||
reverse $ snd $ foldl' fun (-1, []) $ zip [0..] tDropToBSDropList
|
||||
fun (lastPos, !acc) (ix, elem) = (elem, replicate (elem - lastPos) ix ++ acc)
|
||||
-- copied from Data.Text.Encoding
|
||||
utf8CharWidth c
|
||||
| w <= 0x7F = 1
|
||||
| w <= 0x7FF = 2
|
||||
| 0xD800 <= w && w <= 0xDBFF = 4
|
||||
| otherwise = 3
|
||||
where
|
||||
w = UText.ord c
|
||||
utf16CharWidth c
|
||||
| w < 0x10000 = 1
|
||||
| otherwise = 2
|
||||
where
|
||||
w = UText.ord c
|
||||
|
||||
|
||||
-- True iff a is followed by whitespaces and b.
|
||||
isAdjacent :: Int -> Int -> Document -> Bool
|
||||
isAdjacent a b Document{..} =
|
||||
b >= a && (firstNonAdjacent Vector.! a >= b)
|
||||
b >= a && (firstNonAdjacent Array.! a >= b)
|
||||
|
||||
isAdjacentSeparator :: Char -> Bool
|
||||
isAdjacentSeparator c = elem c [' ', '\t', '-']
|
||||
|
||||
arraySize :: IArray UArray a => UArray Int a -> Int
|
||||
arraySize arr = r - l
|
||||
where
|
||||
(l, r) = Array.bounds arr
|
||||
|
||||
-- -----------------------------------------------------------------
|
||||
-- Token
|
||||
|
||||
|
@ -432,6 +432,7 @@ library
|
||||
, Duckling.Volume.Rules
|
||||
, Duckling.Volume.Types
|
||||
build-depends: base >= 4.8.2 && < 5.0
|
||||
, array >= 0.5.1.1 && < 0.6
|
||||
, attoparsec >= 0.13.1.0 && < 0.14
|
||||
, aeson >= 0.11.3.0 && < 1.1
|
||||
, bytestring >= 0.10.6.0 && < 0.11
|
||||
|
Loading…
Reference in New Issue
Block a user