From c866ac51323882c26c65a9af11823e7d25a8dcfd Mon Sep 17 00:00:00 2001 From: Moritz Kiefer Date: Wed, 4 Mar 2020 19:48:03 +0100 Subject: [PATCH] Speed up unmangleIdentifier (#4810) * Speed up unmangleIdentifier On my (admittedly not super scientific) benchmark, this brings the time used to convert from the low-level proto Haskell AST to the high-level AST from 20s down to 16s on a certain DALF that we are all too familiar with. changelog_begin changelog_end * address review comments --- .../daml-lf-proto/src/DA/Daml/LF/Mangling.hs | 64 ++++++++++--------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/compiler/daml-lf-proto/src/DA/Daml/LF/Mangling.hs b/compiler/daml-lf-proto/src/DA/Daml/LF/Mangling.hs index cdbf55b339..340918fe67 100644 --- a/compiler/daml-lf-proto/src/DA/Daml/LF/Mangling.hs +++ b/compiler/daml-lf-proto/src/DA/Daml/LF/Mangling.hs @@ -6,10 +6,11 @@ module DA.Daml.LF.Mangling (mangleIdentifier, unmangleIdentifier) where import Data.Bits import Data.Char -import qualified Data.Set as Set +import Data.Either (fromRight) import qualified Data.Text as T -import qualified Data.Text.Internal as T (text) import qualified Data.Text.Array as TA +import qualified Data.Text.Internal as T (text) +import qualified Data.Text.Read as T import Data.Word -- DAML-LF talks about *identifier* to build up different kind of @@ -108,39 +109,40 @@ mangleIdentifier txt = case T.foldl' f (MangledSize 0 0) txt of unmangleIdentifier :: T.Text -> Either String T.Text unmangleIdentifier txt = do - chs <- goStart (T.unpack txt) - if null chs - then Left "Empty identifier" - else Right (T.pack chs) + case T.uncons txt of + Nothing -> Left "Empty identifier" + Just (c, _) + | isAllowedStart c || c == '$' -> go txt + | otherwise -> Left ("Invalid start character: " <> show c) where - go :: (Char -> Bool) -> (String -> Either String String) -> String -> Either String String - go isAllowed followUp = \case - [] -> Right [] - ['$'] -> Left "Got control character $ with nothing after it. It should be followed by $, u, or U" - '$' : ctrlCh : chs0 -> case ctrlCh of - '$' -> ('$' :) <$> followUp chs0 - 'u' -> do - (ch, chs1) <- readEscaped "$u" 4 chs0 - (ch :) <$> followUp chs1 - 'U' -> do - (ch, chs1) <- readEscaped "$U" 8 chs0 - (ch :) <$> followUp chs1 - ch -> Left ("Control character $ should be followed by $, u, or U, but got " ++ show ch) - ch : chs -> if isAllowed ch - then (ch :) <$> followUp chs - else Left ("Unexpected unescaped character " ++ show ch) - - goStart = go isAllowedStart goPart - goPart = go isAllowedPart goPart + go :: T.Text -> Either String T.Text + go s = case T.uncons s of + Nothing -> Right T.empty + Just ('$', s) -> + case T.uncons s of + Just ('$', s) -> T.cons '$' <$> go s + Just ('u', s) -> do + (ch, s') <- readEscaped "$u" 4 s + T.cons ch <$> go s' + Just ('U', s) -> do + (ch, s') <- readEscaped "$U" 8 s + T.cons ch <$> go s' + _ -> Left "Control character $ should be followed by $, u or U" + Just (char, _) -> case T.span isAllowedPart s of + (prefix, suffix) + | T.null prefix -> Left ("Unexpected unescaped character " ++ show char) + | otherwise -> fmap (prefix <>) (go suffix) readEscaped what n chs0 = let - (escaped, chs) = splitAt n chs0 + (escaped, chs) = T.splitAt n chs0 in if - | length escaped < n -> - Left ("Expected " ++ show n ++ " characters after " ++ what ++ ", but got " ++ show (length escaped)) - | not (all (`Set.member` escapeSequencesChars) escaped) -> + | T.length escaped < n -> + Left ("Expected " ++ show n ++ " characters after " ++ what ++ ", but got " ++ show (T.length escaped)) + | not (T.all isEscapeSequenceChars escaped) -> Left ("Expected only lowercase hex code in escape sequences, but got " ++ show escaped) - | otherwise -> Right (toEnum (read ("0x" ++ escaped)), chs) + | otherwise -> + -- We’ve already done the validation so fromRight is safe. + Right (toEnum (fst $ fromRight (error $ "Internal error in unmangleIdentifier: " <> show escaped) $ T.hexadecimal escaped), chs) -- only lowercase, as per printf - escapeSequencesChars = Set.fromList (['a'..'f'] ++ ['0'..'9']) + isEscapeSequenceChars c = isDigit c || c >= 'a' && c <= 'f'