Auto-backtracking for ‘tokens’ and friends

2024-12-28 18:54:34 +03:00 · 2016-02-17 23:32:48 +06:00 · 2016-02-17 23:32:48 +06:00 · 3baa263eab
commit 3baa263eab
parent 4c5aae7098
8 changed files with 78 additions and 60 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -11,6 +11,15 @@
 * The `count` combinator now works with `Applicative` instances (previously
  it worked only with instances of `Alternative`). It's now also faster.

+* `tokens` and parsers built upon it (such as `string` and `string'`)
+  backtrack automatically on failure now, that is, when they fail, they
+  never consume any input. This is done to make their consumption model
+  match how error messages are reported (which becomes an important thing as
+  user gets more control with primitives like `withRecovery`). This means,
+  in particular, that it's no longer necessary to use `try` with
+  `tokens`-based parsers. This new feature *does not* affect performance in
+  any way.
+
 * New primitive parser `withRecovery` added. The parser allows to recover
  from parse errors “on-the-fly” and continue parsing. Once parsing is
  finished, several parse errors may be reported or ignored altogether.
--- a/Text/Megaparsec/Combinator.hs
+++ b/Text/Megaparsec/Combinator.hs
@ -100,10 +100,7 @@ endBy1 p sep = some (p <* sep)
 -- parser @end@ succeeds. Returns the list of values returned by @p@. This
 -- parser can be used to scan comments:
 --
-- > simpleComment = string "<!--" >> manyTill anyChar (try $ string "-->")
--
-- Note that we need to use 'try' since parsers @anyChar@ and @string
-- \"-->\"@ overlap and @string \"-->\"@ could consume input before failing.
+-- > simpleComment = string "<!--" >> manyTill anyChar (string "-->")

 manyTill :: Alternative m => m a -> m end -> m [a]
 manyTill p end = ([] <$ end) <|> someTill p end
--- a/Text/Megaparsec/Expr.hs
+++ b/Text/Megaparsec/Expr.hs
@ -73,9 +73,6 @@ data Operator m a
 -- > binary  name f = InfixL  (reservedOp name >> return f)
 -- > prefix  name f = Prefix  (reservedOp name >> return f)
 -- > postfix name f = Postfix (reservedOp name >> return f)
--
-- Please note that multi-character operators should use 'try' in order to
-- be reported correctly in error messages.

 makeExprParser :: MonadParsec s m t => m a -> [[Operator m a]] -> m a
 makeExprParser = foldl addPrecLevel
--- a/Text/Megaparsec/Lexer.hs
+++ b/Text/Megaparsec/Lexer.hs
@ -148,7 +148,7 @@ skipLineComment :: MonadParsec s m Char
  => String            -- ^ Line comment prefix
  -> m ()
 skipLineComment prefix = p >> void (manyTill C.anyChar n)
-  where p = try (C.string prefix)
+  where p = C.string prefix
        n = lookAhead C.newline

 -- | @skipBlockComment start end@ skips non-nested block comment starting
@ -159,8 +159,8 @@ skipBlockComment :: MonadParsec s m Char
  -> String            -- ^ End of block comment
  -> m ()
 skipBlockComment start end = p >> void (manyTill C.anyChar n)
-  where p = try (C.string start)
-        n = try (C.string end)
+  where p = C.string start
+        n = C.string end

 -- Indentation

--- a/Text/Megaparsec/Prim.hs
+++ b/Text/Megaparsec/Prim.hs
@ -458,6 +458,11 @@ class (A.Alternative m, Monad m, Stream s t)
  -- 1:1:
  -- unexpected "le"
  -- expecting "let" or "lexical"
+  --
+  -- Please note that as of Megaparsec 4.4.0, 'string' backtracks
+  -- automatically (see 'tokens'), so it does not need 'try'. However, the
+  -- examples above demonstrate the idea behind 'try' so well that it was
+  -- decided to keep them.

  try :: m a -> m a

@ -524,6 +529,22 @@ class (A.Alternative m, Monad m, Stream s t)
  -- This can be used for example to write 'Text.Megaparsec.Char.string':
  --
  -- > string = tokens updatePosString (==)
+  --
+  -- Note that beginning from Megaparsec 4.4.0, this is an auto-backtracking
+  -- primitive, which means that if it fails, it never consumes any
+  -- input. This is done to make its consumption model match how error
+  -- messages for this primitive are reported (which becomes an important
+  -- thing as user gets more control with primitives like 'withRecovery'):
+  --
+  -- >>> parseTest (string "abc") "abd"
+  -- 1:1:
+  -- unexpected "abd"
+  -- expecting "abc"
+  --
+  -- This means, in particular, that it's no longer necessary to use 'try'
+  -- with 'tokens'-based parsers, such as 'Text.Megaparsec.Char.string' and
+  -- 'Text.Megaparsec.Char.string''. This new feature /does not/ affect
+  -- performance in any way.

  tokens :: Eq t
    => (Int -> SourcePos -> [t] -> SourcePos)
@ -568,7 +589,8 @@ pLabel l p = ParsecT $ \s cok cerr eok eerr ->
  in unParser p s cok' cerr eok' eerr'

 pTry :: ParsecT s m a -> ParsecT s m a
-pTry p = ParsecT $ \s cok _ eok eerr -> unParser p s cok eerr eok eerr
+pTry p = ParsecT $ \s cok _ eok eerr ->
+  unParser p s cok eerr eok eerr
 {-# INLINE pTry #-}

 pLookAhead :: ParsecT s m a -> ParsecT s m a
@ -632,7 +654,7 @@ pTokens :: Stream s t
  -> [t]
  -> ParsecT s m [t]
 pTokens _ _ [] = ParsecT $ \s _ _ eok _ -> eok [] s mempty
-pTokens nextpos test tts = ParsecT $ \s@(State input pos w) cok cerr _ eerr ->
+pTokens nextpos test tts = ParsecT $ \s@(State input pos w) cok _ _ eerr ->
  let r = showToken . reverse
      errExpect x = setErrorMessage (Expected $ showToken tts)
        (newErrorMessage (Unexpected x) pos)
@ -641,13 +663,12 @@ pTokens nextpos test tts = ParsecT $ \s@(State input pos w) cok cerr _ eerr ->
            s'   = State rs pos' w
        in cok (reverse is) s' mempty
      walk (t:ts) is rs =
-        let errorCont = if null is then eerr else cerr
-            what      = if null is then eoi  else r is
+        let what = if null is then eoi  else r is
        in case uncons rs of
-             Nothing -> errorCont (errExpect what) s
+             Nothing -> eerr (errExpect what) s
             Just (x,xs)
               | test t x  -> walk ts (x:is) xs
-               | otherwise -> errorCont (errExpect $ r (x:is)) s
+               | otherwise -> eerr (errExpect $ r (x:is)) s
  in walk tts [] input
 {-# INLINE pTokens #-}

--- a/old-tests/Bugs/Bug9.hs
+++ b/old-tests/Bugs/Bug9.hs
@ -1,4 +1,3 @@
-
 module Bugs.Bug9 (main) where

 import Control.Applicative (empty)
@ -41,7 +40,7 @@ integer :: Parser Integer
 integer = lexeme L.integer

 operator :: String -> Parser String
-operator = try . L.symbol sc
+operator = L.symbol sc

 parseTopLevel :: Parser Expr
 parseTopLevel = parseExpr <* eof
--- a/tests/Char.hs
+++ b/tests/Char.hs
@ -131,8 +131,7 @@ prop_eol s = checkParser eol r s
          | "\r\n" `isPrefixOf` s = posErr 2 s [uneCh (s !! 2), exEof]
          | otherwise   = posErr 0 s [ uneStr (take 2 s)
                                     , uneCh '\r'
-                                     , exSpec "crlf newline"
-                                     , exSpec "newline" ]
+                                     , exSpec "end of line" ]

 prop_tab :: String -> Property
 prop_tab = checkChar tab (== '\t') (Just "tab")
--- a/tests/Prim.hs
+++ b/tests/Prim.hs
@ -34,7 +34,7 @@ import Control.Applicative
 import Data.Char (isLetter, toUpper)
 import Data.Foldable (asum)
 import Data.List (isPrefixOf)
-import Data.Maybe (maybeToList, fromMaybe)
+import Data.Maybe (maybeToList)

 import Control.Monad.Cont
 import Control.Monad.Except
@ -152,7 +152,7 @@ prop_alternative_1 s0 s1
  | s0 `isPrefixOf` s1 =
      checkParser p (posErr s0l s1 [uneCh (s1 !! s0l), exEof]) s1
  | otherwise = checkParser p (Right s0) s0 .&&. checkParser p (Right s1) s1
-    where p   = try (string s0) <|> string s1
+    where p   = string s0 <|> string s1
          s0l = length s0

 prop_alternative_2 :: Char -> Char -> Char -> Bool -> Property
@ -166,8 +166,8 @@ prop_alternative_2 a b c l = checkParser p r s

 prop_alternative_3 :: Property
 prop_alternative_3 = checkParser p r s
-  where p  = asum [empty, try (string ">>>"), empty, return "foo"] <?> "bar"
-        p' = bsum [empty, try (string ">>>"), empty, return "foo"] <?> "bar"
+  where p  = asum [empty, string ">>>", empty, return "foo"] <?> "bar"
+        p' = bsum [empty, string ">>>", empty, return "foo"] <?> "bar"
        bsum = foldl (<|>) empty
        r = simpleParse p' s
        s = ">>"
@ -333,24 +333,23 @@ prop_hidden_0 a' b' c' = checkParser p r s
          | otherwise = Right s
        s = abcRow a b c

-prop_hidden_1 :: String -> NonEmptyList Char -> String -> Property
-prop_hidden_1 a c' s = checkParser p r s
+prop_hidden_1 :: NonEmptyList Char -> String -> Property
+prop_hidden_1 c' s = checkParser p r s
  where c = getNonEmpty c'
-        p = fromMaybe a <$> optional (hidden $ string c)
-        r | null s = Right a
-          | c == s = Right s
-          | head c /= head s = posErr 0 s [uneCh (head s), exEof]
-          | otherwise = simpleParse (string c) s
+        cn = length c
+        p = optional (hidden $ string c)
+        r | null s           = Right Nothing
+          | c == s           = Right (Just s)
+          | c `isPrefixOf` s = posErr cn s [uneCh (s !! cn), exEof]
+          | otherwise        = posErr 0 s [uneCh (head s), exEof]

-prop_try :: String -> String -> String -> Property
-prop_try pre s1' s2' = checkParser p r s
-  where s1 = pre ++ s1'
-        s2 = pre ++ s2'
-        p = try (string s1) <|> string s2
-        r | s == s1 || s == s2 = Right s
-          | otherwise = posErr 0 s $ (if null s then uneEof else uneStr pre)
-                        : [uneStr pre, exStr s1, exStr s2]
-        s = pre
+prop_try :: Char -> Char -> Char -> Property
+prop_try pre ch1 ch2 = checkParser p r s
+  where s1 = sequence [char pre, char ch1]
+        s2 = sequence [char pre, char ch2]
+        p = try s1 <|> s2
+        r = posErr 1 s [uneEof, exCh ch1, exCh ch2]
+        s = [pre]

 prop_lookAhead_0 :: Bool -> Bool -> Bool -> Property
 prop_lookAhead_0 a b c = checkParser p r s
@ -523,15 +522,13 @@ stateFromInput s = State s (initialPos "") defaultTabWidth

 -- IdentityT instance of MonadParsec

-prop_IdentityT_try :: String -> String -> String -> Property
-prop_IdentityT_try pre s1' s2' = checkParser (runIdentityT p) r s
-  where s1 = pre ++ s1'
-        s2 = pre ++ s2'
-        p = try (string s1) <|> string s2
-        r | s == s1 || s == s2 = Right s
-          | otherwise = posErr 0 s $ (if null s then uneEof else uneStr pre)
-                        : [uneStr pre, exStr s1, exStr s2]
-        s = pre
+prop_IdentityT_try :: Char -> Char -> Char -> Property
+prop_IdentityT_try pre ch1 ch2 = checkParser (runIdentityT p) r s
+  where s1 = sequence [char pre, char ch1]
+        s2 = sequence [char pre, char ch2]
+        p = try s1 <|> s2
+        r = posErr 1 s [uneEof, exCh ch1, exCh ch2]
+        s = [pre]

 prop_IdentityT_notFollowedBy :: NonNegative Int -> NonNegative Int
                             -> NonNegative Int -> Property
@ -544,17 +541,16 @@ prop_IdentityT_notFollowedBy a' b' c' = checkParser (runIdentityT p) r s

 -- ReaderT instance of MonadParsec

-prop_ReaderT_try :: String -> String -> String -> Property
-prop_ReaderT_try pre s1' s2' = checkParser (runReaderT p (s1', s2')) r s
-  where s1 = pre ++ s1'
-        s2 = pre ++ s2'
-        getS1 = asks ((pre ++) . fst)
-        getS2 = asks ((pre ++) . snd)
-        p = try (string =<< getS1) <|> (string =<< getS2)
-        r | s == s1 || s == s2 = Right s
-          | otherwise = posErr 0 s $ (if null s then uneEof else uneStr pre)
-                        : [uneStr pre, exStr s1, exStr s2]
-        s = pre
+prop_ReaderT_try :: Char -> Char -> Char -> Property
+prop_ReaderT_try pre ch1 ch2 = checkParser (runReaderT p (s1, s2)) r s
+  where s1 = pre : [ch1]
+        s2 = pre : [ch2]
+        getS1 = asks fst
+        getS2 = asks snd
+        p = try (g =<< getS1) <|> (g =<< getS2)
+        g = sequence . fmap char
+        r = posErr 1 s [uneEof, exCh ch1, exCh ch2]
+        s = [pre]

 prop_ReaderT_notFollowedBy :: NonNegative Int -> NonNegative Int
                           -> NonNegative Int -> Property