use utf8 instead of utf16 for Document internals

This commit is contained in:
L0neGamer 2024-02-05 16:51:55 +00:00
parent 53bbf2aa53
commit 341350af60
No known key found for this signature in database
GPG Key ID: 6C316E1973F17D45
2 changed files with 11 additions and 17 deletions

View File

@ -56,8 +56,8 @@ data Document = Document
-- for "żółty" :: Document
-- tDropToBSDrop = [0,2,4,6,7,8]
-- bsDropToTDrop = [0,1,1,2,2,3,3,4,5]
-- tDropToUtf16Drop = [0,1,2,3,4,5]
, tDropToUtf16Drop :: UArray Int Int
-- tDropToUtf8Drop = [0,1,2,3,4,5] -- out of date
, tDropToUtf8Drop :: UArray Int Int
-- translate Text.drop to Data.Text.Unsafe.dropWord16
} deriving (Show)
@ -97,8 +97,8 @@ fromText rawInput = Document{..}
| otherwise = (ix, ix:acc)
tDropToBSDropList = scanl' (\acc a -> acc + utf8CharWidth a) 0 unpacked
tDropToBSDrop = Array.listArray (0, rawInputLength) tDropToBSDropList
tDropToUtf16Drop = Array.listArray (0, rawInputLength) $
scanl' (\acc a -> acc + utf16CharWidth a) 0 unpacked
tDropToUtf8Drop = Array.listArray (0, rawInputLength) $
scanl' (\acc a -> acc + utf8CharWidth a) 0 unpacked
bsDropToTDrop = Array.listArray (0, BS.length utf8Encoded) $
reverse $ snd $ foldl' fun (-1, []) $ zip [0..] tDropToBSDropList
fun (lastPos, !acc) (ix, elem) = (elem, replicate (elem - lastPos) ix ++ acc)
@ -109,11 +109,6 @@ fromText rawInput = Document{..}
| otherwise = 4
where
w = UText.ord c
utf16CharWidth c
| w < 0x10000 = 1
| otherwise = 2
where
w = UText.ord c
data CharClass
= Alpha
@ -262,7 +257,7 @@ byteStringFromPos
, utf8Encoded = utf8Encoded
, tDropToBSDrop = tDropToBSDrop
, bsDropToTDrop = bsDropToTDrop
, tDropToUtf16Drop = tDropToUtf16Drop
, tDropToUtf8Drop = tDropToUtf8Drop
}
position = (substring, rangeToText, translateRange)
where
@ -271,16 +266,15 @@ byteStringFromPos
utf8Position = tDropToBSDrop Array.! position
substring :: ByteString
substring = BS.drop utf8Position utf8Encoded
-- get a subrange of Text reusing the underlying buffer using
-- utf16 start and end positions
-- get a subrange of Text reusing the underlying buffer
rangeToText :: (Int, Int) -> Text
rangeToText (-1, _) = ""
-- this is what regexec from Text.Regex.PCRE.ByteString does
rangeToText r = UText.takeWord16 (end16Pos - start16Pos) $
UText.dropWord16 start16Pos rawInput
rangeToText r = UText.takeWord8 (end8Pos - start8Pos) $
UText.dropWord8 start8Pos rawInput
where
start16Pos = tDropToUtf16Drop Array.! startPos
end16Pos = tDropToUtf16Drop Array.! endPos
start8Pos = tDropToUtf8Drop Array.! startPos
end8Pos = tDropToUtf8Drop Array.! endPos
(startPos, endPos) = uncurry translateRange r
-- from utf8 offset and length to Text character start and end position
translateRange :: Int -> Int -> (Int, Int)

View File

@ -887,7 +887,7 @@ library
, extra >= 1.4.10 && < 1.8
, hashable >= 1.2.4.0 && < 1.5
, regex-base >= 0.93.2 && < 0.95
, text >= 1.2.2.1 && < 1.3
, text >= 2 && < 2.2
, text-show >= 2.1.2 && < 3.10
, time >= 1.5.0.1 && < 2
, timezone-series >= 0.1.5.1 && < 0.2