mirror of
https://github.com/facebook/duckling.git
synced 2024-09-11 21:27:13 +03:00
Move Document and helpers to a fresh module
Summary: Document had its internal details leaked over 2 files. This consolidates it. It took a long time to make this perf neutral (now it's even a tiny win), for reasons I don't completely understand. The INLINE pragma on byteStringFromPos I semi-understand, but I also had to move isRangeValid to Document and that's a bit of a mystery. Reviewed By: patapizza Differential Revision: D4948449 fbshipit-source-id: ffb251a
This commit is contained in:
parent
924516103b
commit
8db73688d7
@ -10,7 +10,6 @@
|
||||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE NoRebindableSyntax #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
|
||||
module Duckling.Engine
|
||||
( parseAndResolve
|
||||
@ -22,20 +21,11 @@ import Control.DeepSeq
|
||||
import Control.Monad.Extra
|
||||
import Data.Aeson
|
||||
import qualified Data.Array as Array
|
||||
import qualified Data.Array.Unboxed as UArray
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS
|
||||
import qualified Data.Char as Char
|
||||
import Data.Functor.Identity
|
||||
import Data.Maybe
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as Text
|
||||
import qualified Data.Text.Encoding as Text
|
||||
import qualified Data.Text.Unsafe as UText
|
||||
import qualified Data.List as L
|
||||
import qualified Data.Vector.Unboxed as Vector
|
||||
import Prelude
|
||||
import qualified Text.Regex.Base as R
|
||||
import qualified Text.Regex.PCRE as PCRE
|
||||
|
||||
import Duckling.Dimensions.Types
|
||||
@ -43,6 +33,8 @@ import qualified Duckling.Engine.Regex as Regex
|
||||
import Duckling.Regex.Types
|
||||
import Duckling.Resolve
|
||||
import Duckling.Types
|
||||
import qualified Duckling.Types.Document as Document
|
||||
import Duckling.Types.Document (Document)
|
||||
import qualified Duckling.Stash as Stash
|
||||
import Duckling.Stash (Stash)
|
||||
|
||||
@ -57,7 +49,7 @@ runDuckling ma = runIdentity ma
|
||||
parseAndResolve :: [Rule] -> Text -> Context -> [ResolvedToken]
|
||||
parseAndResolve rules input context = mapMaybe (resolveNode context) .
|
||||
force $ Stash.toPosOrderedList $ runDuckling $
|
||||
parseString rules (mkDocument input)
|
||||
parseString rules (Document.fromText input)
|
||||
|
||||
produce :: Match -> Maybe Node
|
||||
produce (_, _, []) = Nothing
|
||||
@ -73,39 +65,15 @@ produce (Rule name _ production, _, etuor@(Node {nodeRange = Range _ e}:_)) = do
|
||||
}
|
||||
[] -> Nothing
|
||||
|
||||
-- As regexes are matched without whitespace delimitator, we need to check
|
||||
-- the reasonability of the match to actually be a word.
|
||||
isRangeValid :: Document -> Range -> Bool
|
||||
isRangeValid Document { indexable = s } (Range start end) =
|
||||
(start == 0 || isDifferent (s UArray.! (start - 1)) (s UArray.! start)) &&
|
||||
(end == arraySize s ||
|
||||
isDifferent (s UArray.! (end - 1)) (s UArray.! end))
|
||||
where
|
||||
charClass :: Char -> Char
|
||||
charClass c
|
||||
| Char.isLower c = 'l'
|
||||
| Char.isUpper c = 'u'
|
||||
| Char.isDigit c = 'd'
|
||||
| otherwise = c
|
||||
isDifferent :: Char -> Char -> Bool
|
||||
isDifferent a b = charClass a /= charClass b
|
||||
|
||||
lookupRegex :: PCRE.Regex -> Int -> Document -> Duckling [Node]
|
||||
lookupRegex _regex position Document{ indexable = indexable }
|
||||
| position >= arraySize indexable = return []
|
||||
lookupRegex regex position
|
||||
Document { rawInput = rawInput
|
||||
, utf8Encoded = utf8Encoded
|
||||
, tDropToBSDrop = tDropToBSDrop
|
||||
, bsDropToTDrop = bsDropToTDrop
|
||||
, tDropToUtf16Drop = tDropToUtf16Drop
|
||||
} = return nodes
|
||||
lookupRegex _regex position doc
|
||||
| position >= Document.length doc = return []
|
||||
lookupRegex regex position doc = return nodes
|
||||
where
|
||||
-- See Note [Regular expressions and Text] to understand what's going
|
||||
-- on here
|
||||
utf8Position = tDropToBSDrop UArray.! position
|
||||
substring :: ByteString
|
||||
substring = BS.drop utf8Position utf8Encoded
|
||||
(substring, rangeToText, translateRange) =
|
||||
Document.byteStringFromPos doc position
|
||||
nodes = L.foldl' f [] $ map Array.elems $ Regex.matchAll regex substring
|
||||
f :: [Node] -> [(Int, Int)] -> [Node]
|
||||
f nodes [] = nodes
|
||||
@ -119,31 +87,12 @@ lookupRegex regex position
|
||||
, children = []
|
||||
, rule = Nothing
|
||||
}
|
||||
-- get a subrange of Text reusing the underlying buffer using
|
||||
-- utf16 start and end positions
|
||||
rangeToText :: (Int, Int) -> Text
|
||||
rangeToText (-1, _) = ""
|
||||
-- this is what regexec from Text.Regex.PCRE.ByteString does
|
||||
rangeToText r = UText.takeWord16 (end16Pos - start16Pos) $
|
||||
UText.dropWord16 start16Pos rawInput
|
||||
where
|
||||
start16Pos = tDropToUtf16Drop UArray.! startPos
|
||||
end16Pos = tDropToUtf16Drop UArray.! endPos
|
||||
(startPos, endPos) = uncurry translateRange r
|
||||
-- from utf8 offset and length to Text character start and end position
|
||||
translateRange :: Int -> Int -> (Int, Int)
|
||||
translateRange !bsStart !bsLen = startPos `seq` endPos `seq` res
|
||||
where
|
||||
res = (startPos, endPos)
|
||||
realBsStart = utf8Position + bsStart
|
||||
realBsEnd = realBsStart + bsLen
|
||||
startPos = bsDropToTDrop UArray.! realBsStart
|
||||
endPos = bsDropToTDrop UArray.! realBsEnd
|
||||
|
||||
lookupItem :: Document -> PatternItem -> Stash -> Int -> Duckling [Node]
|
||||
lookupItem s (Regex re) _ position =
|
||||
filter (\node -> isRangeValid s (nodeRange node) &&
|
||||
isPositionValid position s node) <$>
|
||||
filter (\node@Node { nodeRange = Range start end } ->
|
||||
Document.isRangeValid s start end &&
|
||||
isPositionValid position s node) <$>
|
||||
lookupRegex re position s
|
||||
lookupItem s (Predicate p) stash position =
|
||||
return $
|
||||
@ -153,7 +102,7 @@ lookupItem s (Predicate p) stash position =
|
||||
|
||||
isPositionValid :: Int -> Document -> Node -> Bool
|
||||
isPositionValid position sentence (Node {nodeRange = Range start _}) =
|
||||
position == 0 || isAdjacent position start sentence
|
||||
position == 0 || Document.isAdjacent sentence position start
|
||||
|
||||
-- | A match is full if its rule pattern is empty.
|
||||
-- (rule, endPosition, reversedRoute)
|
||||
|
@ -5,7 +5,7 @@
|
||||
-- LICENSE file in the root directory of this source tree. An additional grant
|
||||
-- of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
|
||||
{-# LANGUAGE DeriveAnyClass #-}
|
||||
{-# LANGUAGE DeriveGeneric #-}
|
||||
{-# LANGUAGE ExistentialQuantification #-}
|
||||
@ -13,29 +13,19 @@
|
||||
{-# LANGUAGE GADTs #-}
|
||||
{-# LANGUAGE NoRebindableSyntax #-}
|
||||
{-# LANGUAGE RankNTypes #-}
|
||||
{-# LANGUAGE RecordWildCards #-}
|
||||
{-# LANGUAGE StandaloneDeriving #-}
|
||||
|
||||
module Duckling.Types where
|
||||
|
||||
import Control.DeepSeq
|
||||
import qualified Data.Array.Unboxed as Array
|
||||
import Data.Array.Unboxed (UArray, IArray)
|
||||
import Data.Aeson
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS
|
||||
import qualified Data.ByteString.Lazy as LB
|
||||
import Data.GADT.Compare
|
||||
import Data.Hashable
|
||||
import Data.Maybe
|
||||
import Data.String
|
||||
import Data.Text (Text)
|
||||
import Data.List (scanl', foldl', foldr)
|
||||
import Data.Vector.Unboxed (Vector)
|
||||
import qualified Data.Text.Encoding as Text
|
||||
import qualified Data.Text as Text
|
||||
import qualified Data.Text.Internal.Unsafe.Char as UText
|
||||
import qualified Data.Vector.Unboxed as Vector
|
||||
import Data.Typeable ((:~:)(Refl), Typeable)
|
||||
import GHC.Generics
|
||||
import Prelude
|
||||
@ -45,97 +35,6 @@ import qualified Text.Regex.PCRE as PCRE
|
||||
import Duckling.Dimensions.Types
|
||||
import Duckling.Resolve
|
||||
|
||||
data Document = Document
|
||||
{ rawInput :: !Text
|
||||
, utf8Encoded :: ByteString
|
||||
, indexable :: UArray Int Char -- for O(1) indexing pos -> Char
|
||||
, firstNonAdjacent :: UArray Int Int
|
||||
-- for a given index 'i' it keeps a first index 'j' greater or equal 'i'
|
||||
-- such that isAdjacentSeparator (indexable ! j) == False
|
||||
-- eg. " a document " :: Document
|
||||
-- firstNonAdjacent = [1,1,3,3,4,5,6,7,8,9,10,12]
|
||||
-- Note that in this case 12 is the length of the vector, hence not a
|
||||
-- valid index inside the array, this is intentional.
|
||||
, tDropToBSDrop :: UArray Int Int
|
||||
-- how many bytes to BS.drop from a utf8 encoded ByteString to
|
||||
-- reach the same position as Text.drop would
|
||||
, bsDropToTDrop :: UArray Int Int
|
||||
-- the inverse of tDropToBSDrop, rounds down for bytes that are
|
||||
-- not on character boundary
|
||||
-- for "żółty" :: Document
|
||||
-- tDropToBSDrop = [0,2,4,6,7,8]
|
||||
-- bsDropToTDrop = [0,1,1,2,2,3,3,4,5]
|
||||
-- tDropToUtf16Drop = [0,1,2,3,4,5]
|
||||
, tDropToUtf16Drop :: UArray Int Int
|
||||
-- translate Text.drop to Data.Text.Unsafe.dropWord16
|
||||
} deriving (Show)
|
||||
|
||||
{-
|
||||
Note [Regular expressions and Text]
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Text is UTF-16 encoded internally and PCRE operates on UTF-8 encoded
|
||||
ByteStrings. Because we do a lot of regexp matching on the same Text,
|
||||
it pays off to cache UTF-8 the encoded ByteString. That's the utf8Ecoded
|
||||
field in Document.
|
||||
|
||||
Moreover we do regexp matching with capture, where the captured groups
|
||||
are returned as ByteString and we want them as Text. But all of the
|
||||
captured groups are just a substrings of the original Text.
|
||||
Fortunately PCRE has an API that returns a MatchArray - a structure with
|
||||
just the ByteString indices and ByteString lengths of the matched fragments.
|
||||
If we play with indices right we can translate them to offsets into the
|
||||
original Text, share the underlying Text buffer and avoid all of
|
||||
UTF-8 and UTF-16 encoding and new ByteString and Text allocation.
|
||||
-}
|
||||
|
||||
instance IsString Document where
|
||||
fromString = mkDocument . fromString
|
||||
|
||||
mkDocument :: Text -> Document
|
||||
mkDocument rawInput = Document{..}
|
||||
where
|
||||
utf8Encoded = Text.encodeUtf8 rawInput
|
||||
rawInputLength = Text.length rawInput
|
||||
unpacked = Text.unpack rawInput
|
||||
indexable = Array.listArray (0, rawInputLength - 1) unpacked
|
||||
firstNonAdjacent = Array.listArray (0, rawInputLength - 1) $ snd $
|
||||
foldr gen (rawInputLength, []) $ zip [0..] unpacked
|
||||
-- go from the end keeping track of the first nonAdjacent (best)
|
||||
gen (ix, elem) (best, !acc)
|
||||
| isAdjacentSeparator elem = (best, best:acc)
|
||||
| otherwise = (ix, ix:acc)
|
||||
tDropToBSDropList = scanl' (\acc a -> acc + utf8CharWidth a) 0 unpacked
|
||||
tDropToBSDrop = Array.listArray (0, rawInputLength) tDropToBSDropList
|
||||
tDropToUtf16Drop = Array.listArray (0, rawInputLength) $
|
||||
scanl' (\acc a -> acc + utf16CharWidth a) 0 unpacked
|
||||
bsDropToTDrop = Array.listArray (0, BS.length utf8Encoded) $
|
||||
reverse $ snd $ foldl' fun (-1, []) $ zip [0..] tDropToBSDropList
|
||||
fun (lastPos, !acc) (ix, elem) = (elem, replicate (elem - lastPos) ix ++ acc)
|
||||
utf8CharWidth c
|
||||
| w <= 0x7F = 1
|
||||
| w <= 0x7FF = 2
|
||||
| w <= 0xFFFF = 3
|
||||
| otherwise = 4
|
||||
where
|
||||
w = UText.ord c
|
||||
utf16CharWidth c
|
||||
| w < 0x10000 = 1
|
||||
| otherwise = 2
|
||||
where
|
||||
w = UText.ord c
|
||||
|
||||
|
||||
-- True iff a is followed by whitespaces and b.
|
||||
isAdjacent :: Int -> Int -> Document -> Bool
|
||||
isAdjacent a b Document{..} =
|
||||
b >= a && (firstNonAdjacent Array.! a >= b)
|
||||
|
||||
isAdjacentSeparator :: Char -> Bool
|
||||
isAdjacentSeparator c = elem c [' ', '\t', '-']
|
||||
|
||||
arraySize :: IArray UArray a => UArray Int a -> Int
|
||||
arraySize = Array.rangeSize . Array.bounds
|
||||
|
||||
-- -----------------------------------------------------------------
|
||||
-- Token
|
||||
|
||||
|
199
Duckling/Types/Document.hs
Normal file
199
Duckling/Types/Document.hs
Normal file
@ -0,0 +1,199 @@
|
||||
-- Copyright (c) 2016-present, Facebook, Inc.
|
||||
-- All rights reserved.
|
||||
--
|
||||
-- This source code is licensed under the BSD-style license found in the
|
||||
-- LICENSE file in the root directory of this source tree. An additional grant
|
||||
-- of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
{-# LANGUAGE RecordWildCards #-}
|
||||
|
||||
module Duckling.Types.Document
|
||||
( Document -- abstract
|
||||
, fromText
|
||||
, (!)
|
||||
, length
|
||||
, byteStringFromPos
|
||||
, isAdjacent
|
||||
, isRangeValid
|
||||
) where
|
||||
|
||||
import qualified Data.Array.Unboxed as Array
|
||||
import Data.Array.Unboxed (UArray)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS
|
||||
import qualified Data.Char as Char
|
||||
import Data.List (scanl', foldl', foldr)
|
||||
import Data.String
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text.Unsafe as UText
|
||||
import qualified Data.Text.Encoding as Text
|
||||
import qualified Data.Text as Text
|
||||
import qualified Data.Text.Internal.Unsafe.Char as UText
|
||||
import Prelude hiding (length)
|
||||
|
||||
|
||||
data Document = Document
|
||||
{ rawInput :: !Text
|
||||
, utf8Encoded :: ByteString
|
||||
, indexable :: UArray Int Char -- for O(1) indexing pos -> Char
|
||||
, firstNonAdjacent :: UArray Int Int
|
||||
-- for a given index 'i' it keeps a first index 'j' greater or equal 'i'
|
||||
-- such that isAdjacentSeparator (indexable ! j) == False
|
||||
-- eg. " a document " :: Document
|
||||
-- firstNonAdjacent = [1,1,3,3,4,5,6,7,8,9,10,12]
|
||||
-- Note that in this case 12 is the length of the vector, hence not a
|
||||
-- valid index inside the array, this is intentional.
|
||||
, tDropToBSDrop :: UArray Int Int
|
||||
-- how many bytes to BS.drop from a utf8 encoded ByteString to
|
||||
-- reach the same position as Text.drop would
|
||||
, bsDropToTDrop :: UArray Int Int
|
||||
-- the inverse of tDropToBSDrop, rounds down for bytes that are
|
||||
-- not on character boundary
|
||||
-- for "żółty" :: Document
|
||||
-- tDropToBSDrop = [0,2,4,6,7,8]
|
||||
-- bsDropToTDrop = [0,1,1,2,2,3,3,4,5]
|
||||
-- tDropToUtf16Drop = [0,1,2,3,4,5]
|
||||
, tDropToUtf16Drop :: UArray Int Int
|
||||
-- translate Text.drop to Data.Text.Unsafe.dropWord16
|
||||
} deriving (Show)
|
||||
|
||||
{-
|
||||
Note [Regular expressions and Text]
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Text is UTF-16 encoded internally and PCRE operates on UTF-8 encoded
|
||||
ByteStrings. Because we do a lot of regexp matching on the same Text,
|
||||
it pays off to cache UTF-8 the encoded ByteString. That's the utf8Ecoded
|
||||
field in Document.
|
||||
|
||||
Moreover we do regexp matching with capture, where the captured groups
|
||||
are returned as ByteString and we want them as Text. But all of the
|
||||
captured groups are just a substrings of the original Text.
|
||||
Fortunately PCRE has an API that returns a MatchArray - a structure with
|
||||
just the ByteString indices and ByteString lengths of the matched fragments.
|
||||
If we play with indices right we can translate them to offsets into the
|
||||
original Text, share the underlying Text buffer and avoid all of
|
||||
UTF-8 and UTF-16 encoding and new ByteString and Text allocation.
|
||||
-}
|
||||
|
||||
instance IsString Document where
|
||||
fromString = fromText . fromString
|
||||
|
||||
fromText :: Text -> Document
|
||||
fromText rawInput = Document{..}
|
||||
where
|
||||
utf8Encoded = Text.encodeUtf8 rawInput
|
||||
rawInputLength = Text.length rawInput
|
||||
unpacked = Text.unpack rawInput
|
||||
indexable = Array.listArray (0, rawInputLength - 1) unpacked
|
||||
firstNonAdjacent = Array.listArray (0, rawInputLength - 1) $ snd $
|
||||
foldr gen (rawInputLength, []) $ zip [0..] unpacked
|
||||
-- go from the end keeping track of the first nonAdjacent (best)
|
||||
gen (ix, elem) (best, !acc)
|
||||
| isAdjacentSeparator elem = (best, best:acc)
|
||||
| otherwise = (ix, ix:acc)
|
||||
tDropToBSDropList = scanl' (\acc a -> acc + utf8CharWidth a) 0 unpacked
|
||||
tDropToBSDrop = Array.listArray (0, rawInputLength) tDropToBSDropList
|
||||
tDropToUtf16Drop = Array.listArray (0, rawInputLength) $
|
||||
scanl' (\acc a -> acc + utf16CharWidth a) 0 unpacked
|
||||
bsDropToTDrop = Array.listArray (0, BS.length utf8Encoded) $
|
||||
reverse $ snd $ foldl' fun (-1, []) $ zip [0..] tDropToBSDropList
|
||||
fun (lastPos, !acc) (ix, elem) = (elem, replicate (elem - lastPos) ix ++ acc)
|
||||
utf8CharWidth c
|
||||
| w <= 0x7F = 1
|
||||
| w <= 0x7FF = 2
|
||||
| w <= 0xFFFF = 3
|
||||
| otherwise = 4
|
||||
where
|
||||
w = UText.ord c
|
||||
utf16CharWidth c
|
||||
| w < 0x10000 = 1
|
||||
| otherwise = 2
|
||||
where
|
||||
w = UText.ord c
|
||||
|
||||
-- As regexes are matched without whitespace delimitator, we need to check
|
||||
-- the reasonability of the match to actually be a word.
|
||||
isRangeValid :: Document -> Int -> Int -> Bool
|
||||
isRangeValid doc start end =
|
||||
(start == 0 ||
|
||||
isDifferent (doc ! (start - 1)) (doc ! start)) &&
|
||||
(end == length doc ||
|
||||
isDifferent (doc ! (end - 1)) (doc ! end))
|
||||
where
|
||||
charClass :: Char -> Char
|
||||
charClass c
|
||||
| Char.isLower c = 'l'
|
||||
| Char.isUpper c = 'u'
|
||||
| Char.isDigit c = 'd'
|
||||
| otherwise = c
|
||||
isDifferent :: Char -> Char -> Bool
|
||||
isDifferent a b = charClass a /= charClass b
|
||||
|
||||
-- True iff a is followed by whitespaces and b.
|
||||
isAdjacent :: Document -> Int -> Int -> Bool
|
||||
isAdjacent Document{..} a b =
|
||||
b >= a && (firstNonAdjacent Array.! a >= b)
|
||||
|
||||
isAdjacentSeparator :: Char -> Bool
|
||||
isAdjacentSeparator c = elem c [' ', '\t', '-']
|
||||
|
||||
(!) :: Document -> Int -> Char
|
||||
(!) Document { indexable = s } ix = s Array.! ix
|
||||
|
||||
length :: Document -> Int
|
||||
length Document { indexable = s } = Array.rangeSize $ Array.bounds s
|
||||
|
||||
-- | Given a document and an offset (think Text.drop offset),
|
||||
-- returns a utf8 encoded substring of Document at that offset
|
||||
-- and 2 translation functions:
|
||||
-- rangeToText - given a range in the returned ByteString, gives
|
||||
-- a corresponding subrange of the Document as Text
|
||||
-- translateRange - given a start and a length of a range in the returned
|
||||
-- ByteString, gives a corresponding subrange in the Document as pair
|
||||
-- of (start, end) of Text.drop offsets
|
||||
{-# INLINE byteStringFromPos #-}
|
||||
-- if we don't inline we seem to pay for the tuple, there might be
|
||||
-- an easier way
|
||||
byteStringFromPos
|
||||
:: Document
|
||||
-> Int
|
||||
-> ( ByteString
|
||||
, (Int, Int) -> Text
|
||||
, Int -> Int -> (Int, Int)
|
||||
)
|
||||
byteStringFromPos
|
||||
Document { rawInput = rawInput
|
||||
, utf8Encoded = utf8Encoded
|
||||
, tDropToBSDrop = tDropToBSDrop
|
||||
, bsDropToTDrop = bsDropToTDrop
|
||||
, tDropToUtf16Drop = tDropToUtf16Drop
|
||||
}
|
||||
position = (substring, rangeToText, translateRange)
|
||||
where
|
||||
-- See Note [Regular expressions and Text] to understand what's going
|
||||
-- on here
|
||||
utf8Position = tDropToBSDrop Array.! position
|
||||
substring :: ByteString
|
||||
substring = BS.drop utf8Position utf8Encoded
|
||||
-- get a subrange of Text reusing the underlying buffer using
|
||||
-- utf16 start and end positions
|
||||
rangeToText :: (Int, Int) -> Text
|
||||
rangeToText (-1, _) = ""
|
||||
-- this is what regexec from Text.Regex.PCRE.ByteString does
|
||||
rangeToText r = UText.takeWord16 (end16Pos - start16Pos) $
|
||||
UText.dropWord16 start16Pos rawInput
|
||||
where
|
||||
start16Pos = tDropToUtf16Drop Array.! startPos
|
||||
end16Pos = tDropToUtf16Drop Array.! endPos
|
||||
(startPos, endPos) = uncurry translateRange r
|
||||
-- from utf8 offset and length to Text character start and end position
|
||||
translateRange :: Int -> Int -> (Int, Int)
|
||||
translateRange !bsStart !bsLen = startPos `seq` endPos `seq` res
|
||||
where
|
||||
res = (startPos, endPos)
|
||||
realBsStart = utf8Position + bsStart
|
||||
realBsEnd = realBsStart + bsLen
|
||||
startPos = bsDropToTDrop Array.! realBsStart
|
||||
endPos = bsDropToTDrop Array.! realBsEnd
|
@ -31,6 +31,7 @@ library
|
||||
, Duckling.Resolve
|
||||
, Duckling.Stash
|
||||
, Duckling.Types
|
||||
, Duckling.Types.Document
|
||||
|
||||
-- ------------------------------------------------------------------
|
||||
-- Rules
|
||||
|
Loading…
Reference in New Issue
Block a user