Speed up unmangleIdentifier (#4810)

* Speed up unmangleIdentifier

On my (admittedly not super scientific) benchmark, this brings the
time used to convert from the low-level proto Haskell AST to the
high-level AST from 20s down to 16s on a certain DALF that we are all
too familiar with.

changelog_begin
changelog_end

* address review comments
This commit is contained in:
Moritz Kiefer 2020-03-04 19:48:03 +01:00 committed by GitHub
parent a40190d695
commit c866ac5132
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -6,10 +6,11 @@ module DA.Daml.LF.Mangling (mangleIdentifier, unmangleIdentifier) where
import Data.Bits
import Data.Char
import qualified Data.Set as Set
import Data.Either (fromRight)
import qualified Data.Text as T
import qualified Data.Text.Internal as T (text)
import qualified Data.Text.Array as TA
import qualified Data.Text.Internal as T (text)
import qualified Data.Text.Read as T
import Data.Word
-- DAML-LF talks about *identifier* to build up different kind of
@ -108,39 +109,40 @@ mangleIdentifier txt = case T.foldl' f (MangledSize 0 0) txt of
unmangleIdentifier :: T.Text -> Either String T.Text
unmangleIdentifier txt = do
chs <- goStart (T.unpack txt)
if null chs
then Left "Empty identifier"
else Right (T.pack chs)
case T.uncons txt of
Nothing -> Left "Empty identifier"
Just (c, _)
| isAllowedStart c || c == '$' -> go txt
| otherwise -> Left ("Invalid start character: " <> show c)
where
go :: (Char -> Bool) -> (String -> Either String String) -> String -> Either String String
go isAllowed followUp = \case
[] -> Right []
['$'] -> Left "Got control character $ with nothing after it. It should be followed by $, u, or U"
'$' : ctrlCh : chs0 -> case ctrlCh of
'$' -> ('$' :) <$> followUp chs0
'u' -> do
(ch, chs1) <- readEscaped "$u" 4 chs0
(ch :) <$> followUp chs1
'U' -> do
(ch, chs1) <- readEscaped "$U" 8 chs0
(ch :) <$> followUp chs1
ch -> Left ("Control character $ should be followed by $, u, or U, but got " ++ show ch)
ch : chs -> if isAllowed ch
then (ch :) <$> followUp chs
else Left ("Unexpected unescaped character " ++ show ch)
goStart = go isAllowedStart goPart
goPart = go isAllowedPart goPart
go :: T.Text -> Either String T.Text
go s = case T.uncons s of
Nothing -> Right T.empty
Just ('$', s) ->
case T.uncons s of
Just ('$', s) -> T.cons '$' <$> go s
Just ('u', s) -> do
(ch, s') <- readEscaped "$u" 4 s
T.cons ch <$> go s'
Just ('U', s) -> do
(ch, s') <- readEscaped "$U" 8 s
T.cons ch <$> go s'
_ -> Left "Control character $ should be followed by $, u or U"
Just (char, _) -> case T.span isAllowedPart s of
(prefix, suffix)
| T.null prefix -> Left ("Unexpected unescaped character " ++ show char)
| otherwise -> fmap (prefix <>) (go suffix)
readEscaped what n chs0 = let
(escaped, chs) = splitAt n chs0
(escaped, chs) = T.splitAt n chs0
in if
| length escaped < n ->
Left ("Expected " ++ show n ++ " characters after " ++ what ++ ", but got " ++ show (length escaped))
| not (all (`Set.member` escapeSequencesChars) escaped) ->
| T.length escaped < n ->
Left ("Expected " ++ show n ++ " characters after " ++ what ++ ", but got " ++ show (T.length escaped))
| not (T.all isEscapeSequenceChars escaped) ->
Left ("Expected only lowercase hex code in escape sequences, but got " ++ show escaped)
| otherwise -> Right (toEnum (read ("0x" ++ escaped)), chs)
| otherwise ->
-- Weve already done the validation so fromRight is safe.
Right (toEnum (fst $ fromRight (error $ "Internal error in unmangleIdentifier: " <> show escaped) $ T.hexadecimal escaped), chs)
-- only lowercase, as per printf
escapeSequencesChars = Set.fromList (['a'..'f'] ++ ['0'..'9'])
isEscapeSequenceChars c = isDigit c || c >= 'a' && c <= 'f'