megaparsec/Text/Megaparsec/Prim.hs

-- |
-- Module      :  Text.Megaparsec.Prim
-- Copyright   :  © 2015 Megaparsec contributors
--                © 2007 Paolo Martini
--                © 1999–2001 Daan Leijen
-- License     :  BSD3
--
-- Maintainer  :  Mark Karpov <markkarpov@opmbx.org>
-- Stability   :  experimental
-- Portability :  portable
--
-- The primitive parser combinators.

{-# OPTIONS_HADDOCK not-home #-}

module Text.Megaparsec.Prim
  ( -- * Used data-types
    State (..)
  , Stream (..)
  , Consumed (..)
  , Reply (..)
  , Parsec
  , ParsecT
    -- * Primitive combinators
  , MonadParsec (..)
    -- * Parser state combinators
  , getPosition
  , setPosition
  , getInput
  , setInput
  , setParserState
    -- * Running parser
  , runParser
  , runParserT
  , parse
  , parse'
  , parseTest )
where

import Data.Bool (bool)
import Data.Monoid

import Control.Monad
import Control.Monad.Cont.Class
import Control.Monad.Error.Class
import Control.Monad.Identity
import Control.Monad.Reader.Class
import Control.Monad.State.Class hiding (state)
import Control.Monad.Trans
import qualified Control.Applicative as A

import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy.Char8 as BL

import qualified Data.Text as T
import qualified Data.Text.Lazy as TL

import Text.Megaparsec.Error
import Text.Megaparsec.Pos
import Text.Megaparsec.ShowToken

-- | This is Megaparsec state, it's parametrized over stream type @s@.

data State s = State
  { stateInput :: s
  , statePos   :: !SourcePos }
  deriving (Show, Eq)

-- | An instance of @Stream s t@ has stream type @s@, and token type @t@
-- determined by the stream.
--
-- Some rough guidelines for a “correct” instance of Stream:
--
--     * @unfoldM uncons@ gives the @[t]@ corresponding to the stream.
--     * A @Stream@ instance is responsible for maintaining the “position
--       within the stream” in the stream state @s@. This is trivial unless
--       you are using the monad in a non-trivial way.

class (ShowToken t, ShowToken [t]) => Stream s t | s -> t where
  uncons :: s -> Maybe (t, s)

instance (ShowToken t, ShowToken [t]) => Stream [t] t where
  uncons []     = Nothing
  uncons (t:ts) = Just (t, ts)
  {-# INLINE uncons #-}

instance Stream B.ByteString Char where
  uncons = B.uncons
  {-# INLINE uncons #-}

instance Stream BL.ByteString Char where
  uncons = BL.uncons
  {-# INLINE uncons #-}

instance Stream T.Text Char where
  uncons = T.uncons
  {-# INLINE uncons #-}

instance Stream TL.Text Char where
  uncons = TL.uncons
  {-# INLINE uncons #-}

-- | This data structure represents an aspect of result of parser's
-- work. The two constructors have the following meaning:
--
--     * @Cosumed@ is a wrapper for result when some part of input stream
--       was consumed.
--     * @Empty@ is a wrapper for result when no input was consumed.
--
-- See also: 'Reply'.

data Consumed a = Consumed a | Empty !a

-- | This data structure represents an aspect of result of parser's
-- work. The two constructors have the following meaning:
--
--     * @Ok@ for successfully run parser.
--     * @Error@ for failed parser.
--
-- See also 'Consumed'.

data Reply s a = Ok a !(State s) | Error ParseError

-- | 'Hints' represent collection of strings to be included into 'ParserError'
-- as “expected” messages when a parser fails without consuming input right
-- after successful parser that produced the hints.
--
-- For example, without hints you could get:
--
-- >>> parseTest (many (char 'r') <* eof) "ra"
-- parse error at line 1, column 2:
-- unexpected 'a'
-- expecting end of input
--
-- we're getting better error messages with help of hints:
--
-- >>> parseTest (many (char 'r') <* eof) "ra"
-- parse error at line 1, column 2:
-- unexpected 'a'
-- expecting 'r' or end of input

newtype Hints = Hints [[String]] deriving Monoid

-- | Convert 'ParseError' record into 'Hints'.

toHints :: ParseError -> Hints
toHints err = Hints hints
  where hints = if null msgs then [] else [messageString <$> msgs]
        msgs  = filter ((== 1) . fromEnum) $ errorMessages err

-- | @withHints hs c@ makes “error” continuation @c@ use given hints @hs@.

withHints :: Hints -> (ParseError -> m b) -> ParseError -> m b
withHints (Hints xs) c = c . addHints
  where addHints err = foldr addErrorMessage err (Expected <$> concat xs)

-- | @accHints hs c@ results in “OK” continuation that will add given hints
-- @hs@ to third argument of original continuation @c@.

accHints :: Hints -> (a -> State s -> Hints -> m b) ->
            a -> State s -> Hints -> m b
accHints hs1 c x s hs2 = c x s (hs1 <> hs2)

-- | Replace most recent group of hints (if any) with given string. Used in
-- 'label' combinator.

refreshLastHint :: Hints -> String -> Hints
refreshLastHint (Hints [])     _  = Hints []
refreshLastHint (Hints (_:xs)) "" = Hints xs
refreshLastHint (Hints (_:xs)) l  = Hints ([l]:xs)

-- If you're reading this, you may be interested in how Megaparsec works on
-- lower level. That's quite simple. 'ParsecT' is a wrapper around function
-- that takes five arguments:
--
--     * State. It includes input stream, position in input stream and
--     user's backtracking state.
--
--     * “Consumed-OK” continuation (cok). This is just a function that
--     takes three arguments: result of parsing, state after parsing, and
--     hints (see their description above). This continuation is called when
--     something has been consumed during parsing and result is OK (no error
--     occurred).
--
--     * “Consumed-error” continuation (cerr). This function is called when
--     some part of input stream has been consumed and parsing resulted in
--     an error. When error happens, parsing stops and we're only interested
--     in error message, so this continuation takes 'ParseError' as its only
--     argument.
--
--     * “Empty-OK” continuation (eok). The function takes the same
--     arguments as “consumed-OK” continuation. “Empty-OK” is called when no
--     input has been consumed and no error occurred.
--
--     * “Empty-error” continuation (eerr). The function is called when no
--     input has been consumed, but nonetheless parsing resulted in an
--     error. Just like “consumed-error”, the continuation take single
--     argument — 'ParseError' record.
--
-- You call specific continuation when you want to proceed in that specific
-- branch of control flow.

-- | @Parsec@ is non-transformer variant of more general @ParsecT@
-- monad-transformer.

type Parsec s = ParsecT s Identity

-- | @ParsecT s m a@ is a parser with stream type @s@, underlying monad @m@
-- and return type @a@.

newtype ParsecT s m a = ParsecT
  { unParser :: forall b. State s
             -> (a -> State s -> Hints -> m b) -- consumed-OK
             -> (ParseError -> m b)            -- consumed-error
             -> (a -> State s -> Hints -> m b) -- empty-OK
             -> (ParseError -> m b)            -- empty-error
             -> m b }

instance Functor (ParsecT s m) where
  fmap = pMap

pMap :: (a -> b) -> ParsecT s m a -> ParsecT s m b
pMap f p = ParsecT $ \s cok cerr eok eerr ->
  unParser p s (cok . f) cerr (eok . f) eerr
{-# INLINE pMap #-}

instance A.Applicative (ParsecT s m) where
  pure     = return
  (<*>)    = ap
  (*>)     = (>>)
  p1 <* p2 = do { x1 <- p1 ; void p2 ; return x1 }

instance A.Alternative (ParsecT s m) where
  empty  = mzero
  (<|>)  = mplus
  many p = reverse <$> manyAcc p

manyAcc :: ParsecT s m a -> ParsecT s m [a]
manyAcc p = ParsecT $ \s cok cerr eok _ ->
  let errToHints c err = c (toHints err)
      walk xs x s' _ =
        unParser p s'
        (seq xs $ walk $ x:xs)       -- consumed-OK
        cerr                         -- consumed-error
        manyErr                      -- empty-OK
        (errToHints $ cok (x:xs) s') -- empty-error
  in unParser p s (walk []) cerr manyErr (errToHints $ eok [] s)

manyErr :: a
manyErr = error
    "Text.Megaparsec.Prim.many: combinator 'many' is applied to a parser \
    \that accepts an empty string."

instance Monad (ParsecT s m) where
  return = pReturn
  (>>=)  = pBind
  fail   = pFail

pReturn :: a -> ParsecT s m a
pReturn x = ParsecT $ \s _ _ eok _ -> eok x s mempty
{-# INLINE pReturn #-}

pBind :: ParsecT s m a -> (a -> ParsecT s m b) -> ParsecT s m b
pBind m k = ParsecT $ \s cok cerr eok eerr ->
  let mcok x s' hs = unParser (k x) s' cok cerr
                     (accHints hs cok) (withHints hs cerr)
      meok x s' hs = unParser (k x) s' cok cerr
                     (accHints hs eok) (withHints hs eerr)
  in unParser m s mcok cerr meok eerr
{-# INLINE pBind #-}

pFail :: String -> ParsecT s m a
pFail msg = ParsecT $ \s _ _ _ eerr ->
  eerr $ newErrorMessage (Message msg) (statePos s)
{-# INLINE pFail #-}

-- | Low-level creation of the ParsecT type.

mkPT :: Monad m => (State s -> m (Consumed (m (Reply s a)))) -> ParsecT s m a
mkPT k = ParsecT $ \s cok cerr eok eerr -> do
  cons <- k s
  case cons of
    Consumed mrep -> do
      rep <- mrep
      case rep of
        Ok x s'   -> cok x s' mempty
        Error err -> cerr err
    Empty mrep -> do
      rep <- mrep
      case rep of
        Ok x s'   -> eok x s' mempty
        Error err -> eerr err

instance MonadIO m => MonadIO (ParsecT s m) where
  liftIO = lift . liftIO

instance MonadReader r m => MonadReader r (ParsecT s m) where
  ask       = lift ask
  local f p = mkPT $ \s -> local f (runParsecT p s)

instance MonadState s m => MonadState s (ParsecT s' m) where
  get = lift get
  put = lift . put

instance MonadCont m => MonadCont (ParsecT s m) where
  callCC f = mkPT $ \s ->
     callCC $ \c ->
       runParsecT (f (\a -> mkPT $ \s' -> c (pack s' a))) s
    where pack s a = Empty $ return (Ok a s)

instance MonadError e m => MonadError e (ParsecT s m) where
  throwError = lift . throwError
  p `catchError` h = mkPT $ \s ->
      runParsecT p s `catchError` \e ->
          runParsecT (h e) s

instance MonadPlus (ParsecT s m) where
  mzero = pZero
  mplus = pPlus

pZero :: ParsecT s m a
pZero = ParsecT $ \(State _ pos) _ _ _ eerr -> eerr $ newErrorUnknown pos

pPlus :: ParsecT s m a -> ParsecT s m a -> ParsecT s m a
pPlus m n = ParsecT $ \s cok cerr eok eerr ->
  let meerr err =
        let ncerr   err' = cerr (mergeError err' err)
            neok x s' hs = eok x s' (toHints err <> hs)
            neerr   err' = eerr (mergeError err' err)
        in unParser n s cok ncerr neok neerr
  in unParser m s cok cerr eok meerr
{-# INLINE pPlus #-}

instance MonadTrans (ParsecT s) where
  lift amb = ParsecT $ \s _ _ eok _ -> amb >>= \a -> eok a s mempty

-- Primitive combinators

infix 0 <?>

-- | Type class describing parsers independent of input type.

class (A.Alternative m, Monad m, Stream s t) =>
      MonadParsec s m t | m -> s t where

  -- | The parser @unexpected msg@ always fails with an unexpected error
  -- message @msg@ without consuming any input.
  --
  -- The parsers 'fail', ('<?>') and @unexpected@ are the three parsers used
  -- to generate error messages. Of these, only ('<?>') is commonly used.

  unexpected :: String -> m a

  -- | The parser @p \<?> msg@ behaves as parser @p@, but whenever the
  -- parser @p@ fails /without consuming any input/, it replaces expect
  -- error messages with the expect error message @msg@.
  --
  -- This is normally used at the end of a set alternatives where we want to
  -- return an error message in terms of a higher level construct rather
  -- than returning all possible characters. For example, if the @expr@
  -- parser from the “try” example would fail, the error message is: “…:
  -- expecting expression”. Without the @(\<?>)@ combinator, the message
  -- would be like “…: expecting \"let\" or letter”, which is less friendly.

  (<?>) :: m a -> String -> m a
  (<?>) = flip label

  -- | A synonym for @(\<?>)@, but as a function instead of an operator.

  label :: String -> m a -> m a
  label = flip (<?>)

  -- | @hidden p@ behaves just like parser @p@, but it doesn't show any
  -- “expected” tokens in error message when @p@ fails.

  hidden :: m a -> m a
  hidden = label ""

  -- | The parser @try p@ behaves like parser @p@, except that it
  -- pretends that it hasn't consumed any input when an error occurs.
  --
  -- This combinator is used whenever arbitrary look ahead is needed. Since
  -- it pretends that it hasn't consumed any input when @p@ fails, the
  -- ('A.<|>') combinator will try its second alternative even when the
  -- first parser failed while consuming input.
  --
  -- For example, here is a parser that will /try/ (sorry for the pun) to
  -- parse word “let” or “lexical”:
  --
  -- >>> parseTest (string "let" <|> string "lexical") "lexical"
  -- parse error at line 1, column 1:
  -- unexpected "lex"
  -- expecting "let"
  --
  -- First parser consumed “le” and failed, @string "lexical"@ couldn't
  -- succeed with “xical” as its input! Things get much better with help of
  -- @try@:
  --
  -- >>> parseTest (try (string "let") <|> string "lexical") "lexical"
  -- "lexical"
  --
  -- @try@ also improves error messages in case of overlapping alternatives,
  -- because Megaparsec's hint system can be used:
  --
  -- >>> parseTest (try (string "let") <|> string "lexical") "le"
  -- parse error at line 1, column 1:
  -- unexpected "le"
  -- expecting "let" or "lexical"

  try :: m a -> m a

  -- | @lookAhead p@ parses @p@ without consuming any input.
  --
  -- If @p@ fails and consumes some input, so does @lookAhead@. Combine with
  -- 'try' if this is undesirable.

  lookAhead :: m a -> m a

  -- | @notFollowedBy p@ only succeeds when parser @p@ fails. This parser
  -- does not consume any input and can be used to implement the “longest
  -- match” rule.

  notFollowedBy :: m a -> m ()

  -- | This parser only succeeds at the end of the input.

  eof :: m ()

  -- | The parser @token nextPos testTok@ accepts a token @t@ with result
  -- @x@ when the function @testTok t@ returns @'Just' x@. The position of
  -- the /next/ token should be returned when @nextPos@ is called with the
  -- current source position @pos@, the current token @t@ and the rest of
  -- the tokens @toks@, @nextPos pos t toks@.
  --
  -- This is the most primitive combinator for accepting tokens. For
  -- example, the 'Text.Megaparsec.Char.char' parser could be implemented
  -- as:
  --
  -- > char c = token nextPos testChar
  -- >   where testChar x       = if x == c then Just x else Nothing
  -- >         nextPos pos x xs = updatePosChar pos x

  token :: (SourcePos -> t -> s -> SourcePos) -- ^ Next position calculating function.
        -> (t -> Either [Message] a) -- ^ Matching function for the token to parse.
        -> m a

  -- | The parser @tokens posFromTok test@ parses list of tokens and returns
  -- it. The resulting parser will use 'showToken' to pretty-print the
  -- collection of tokens. Supplied predicate @test@ is used to check
  -- equality of given and parsed tokens.
  --
  -- This can be used to example to write 'Text.Megaparsec.Char.string':
  --
  -- > string = tokens updatePosString (==)

  tokens :: Eq t =>
            (SourcePos -> [t] -> SourcePos) -- ^ Computes position of tokens.
         -> (t -> t -> Bool)      -- ^ Predicate to check equality of tokens.
         -> [t]                   -- ^ List of tokens to parse
         -> m [t]

  -- | Returns the full parser state as a 'State' record.

  getParserState :: m (State s)

  -- | @updateParserState f@ applies function @f@ to the parser state.

  updateParserState :: (State s -> State s) -> m ()

instance Stream s t => MonadParsec s (ParsecT s m) t where
  unexpected        = pUnexpected
  label             = pLabel
  try               = pTry
  lookAhead         = pLookAhead
  notFollowedBy     = pNotFollowedBy
  eof               = pEof
  token             = pToken
  tokens            = pTokens
  getParserState    = pGetParserState
  updateParserState = pUpdateParserState

pUnexpected :: String -> ParsecT s m a
pUnexpected msg = ParsecT $ \(State _ pos) _ _ _ eerr ->
  eerr $ newErrorMessage (Unexpected msg) pos

pLabel :: String -> ParsecT s m a -> ParsecT s m a
pLabel l p = ParsecT $ \s cok cerr eok eerr ->
  let cok' x s' hs = cok x s' $ refreshLastHint hs l
      eok' x s' hs = eok x s' $ refreshLastHint hs l
      eerr'    err = eerr $ setErrorMessage (Expected l) err
  in unParser p s cok' cerr eok' eerr'

pTry :: ParsecT s m a -> ParsecT s m a
pTry p = ParsecT $ \s cok _ eok eerr -> unParser p s cok eerr eok eerr
{-# INLINE pTry #-}

pLookAhead :: ParsecT s m a -> ParsecT s m a
pLookAhead p = ParsecT $ \s _ cerr eok eerr ->
  let eok' a _ _ = eok a s mempty
  in unParser p s eok' cerr eok' eerr
{-# INLINE pLookAhead #-}

pNotFollowedBy :: Stream s t => ParsecT s m a -> ParsecT s m ()
pNotFollowedBy p = ParsecT $ \s@(State input pos) _ _ eok eerr ->
  let l = maybe eoi (showToken . fst) (uncons input)
      cok' _ _ _ = eerr $ unexpectedErr l pos
      cerr'    _ = eok () s mempty
      eok' _ _ _ = eerr $ unexpectedErr l pos
      eerr'    _ = eok () s mempty
  in unParser p s cok' cerr' eok' eerr'

pEof :: Stream s t => ParsecT s m ()
pEof = label eoi $ ParsecT $ \s@(State input pos) _ _ eok eerr ->
  case uncons input of
    Nothing    -> eok () s mempty
    Just (x,_) -> eerr $ unexpectedErr (showToken x) pos
{-# INLINE pEof #-}

pToken :: Stream s t =>
          (SourcePos -> t -> s -> SourcePos)
       -> (t -> Either [Message] a)
       -> ParsecT s m a
pToken nextpos test = ParsecT $ \(State input pos) cok _ _ eerr ->
    case uncons input of
      Nothing     -> eerr $ unexpectedErr eoi pos
      Just (c,cs) ->
        case test c of
          Left ms -> eerr $ foldr addErrorMessage (newErrorUnknown pos) ms
          Right x -> let newpos = nextpos pos c cs
                         newstate = State cs newpos
                     in seq newpos $ seq newstate $ cok x newstate mempty
{-# INLINE pToken #-}

pTokens :: Stream s t =>
           (SourcePos -> [t] -> SourcePos)
        -> (t -> t -> Bool)
        -> [t]
        -> ParsecT s m [t]
pTokens _ _ [] = ParsecT $ \s _ _ eok _ -> eok [] s mempty
pTokens nextpos test tts = ParsecT $ \(State input pos) cok cerr _ eerr ->
  let errExpect x = setErrorMessage (Expected $ showToken tts)
                    (newErrorMessage (Unexpected x) pos)
      walk [] _ rs = let pos' = nextpos pos tts
                         s'   = State rs pos'
                     in cok tts s' mempty
      walk (t:ts) is rs =
        let errorCont = if null is then eerr else cerr
            what = bool (showToken $ reverse is) "end of input" (null is)
        in case uncons rs of
             Nothing -> errorCont . errExpect $ what
             Just (x,xs)
               | test t x  -> walk ts (x:is) xs
               | otherwise -> errorCont . errExpect . showToken $ reverse (x:is)
  in walk tts [] input
{-# INLINE pTokens #-}

pGetParserState :: ParsecT s m (State s)
pGetParserState = ParsecT $ \s _ _ eok _ -> eok s s mempty
{-# INLINE pGetParserState #-}

pUpdateParserState :: (State s -> State s) -> ParsecT s m ()
pUpdateParserState f = ParsecT $ \s _ _ eok _ -> eok () (f s) mempty
{-# INLINE pUpdateParserState #-}

unexpectedErr :: String -> SourcePos -> ParseError
unexpectedErr msg = newErrorMessage (Unexpected msg)

eoi :: String
eoi = "end of input"

-- Parser state combinators

-- | Returns the current source position. See also 'SourcePos'.

getPosition :: MonadParsec s m t => m SourcePos
getPosition = statePos <$> getParserState

-- | @setPosition pos@ sets the current source position to @pos@.

setPosition :: MonadParsec s m t => SourcePos -> m ()
setPosition pos = updateParserState (\(State s _) -> State s pos)

-- | Returns the current input.

getInput :: MonadParsec s m t => m s
getInput = stateInput <$> getParserState

-- | @setInput input@ continues parsing with @input@. The 'getInput' and
-- @setInput@ functions can for example be used to deal with #include files.

setInput :: MonadParsec s m t => s -> m ()
setInput s = updateParserState (\(State _ pos) -> State s pos)

-- | @setParserState st@ set the full parser state to @st@.

setParserState :: MonadParsec s m t => State s -> m ()
setParserState st = updateParserState (const st)

-- Running a parser

-- | @parse p file input@ runs a parser @p@ over 'Identity'. The
-- @file@ is only used in error messages and may be the empty
-- string. Returns either a 'ParseError' ('Left') or a value of type @a@
-- ('Right'). This is a synonym for 'runParser'.
--
-- > main = case (parse numbers "" "11, 2, 43") of
-- >          Left err -> print err
-- >          Right xs -> print (sum xs)
-- >
-- > numbers = commaSep integer

parse :: Stream s t => Parsec s a -> SourceName -> s -> Either ParseError a
parse = runParser

-- | @parse' p input@ runs parser @p@ on @input@ and returns result
-- inside 'Just' on success and 'Nothing' on failure. This function also
-- parses 'eof', so all input should be consumed by the parser @p@.
--
-- The function is supposed to be useful for lightweight parsing, where
-- error messages (and thus file name) are not important and entire input
-- should be parsed. For example it can be used when parsing of single
-- number according to specification of its format is desired.

parse' :: Stream s t => Parsec s a -> s -> Maybe a
parse' p s =
  case parse (p <* eof) "" s of
    Left  _ -> Nothing
    Right x -> Just x

-- | The expression @parseTest p input@ applies a parser @p@ against
-- input @input@ and prints the result to stdout. Used for testing.

parseTest :: (Stream s t, Show a) => Parsec s a -> s -> IO ()
parseTest p input =
  case parse p "" input of
    Left err -> putStr "parse error at " >> print err
    Right x  -> print x

-- | The most general way to run a parser over the 'Identity' monad.
-- @runParser p file input@ runs parser @p@ on the input list of tokens
-- @input@, obtained from source @file@. The @file@ is only used in error
-- messages and may be the empty string. Returns either a 'ParseError'
-- ('Left') or a value of type @a@ ('Right').
--
-- > parseFromFile p file = runParser p file <$> readFile file

runParser :: Stream s t => Parsec s a -> SourceName -> s -> Either ParseError a
runParser p name s = runIdentity $ runParserT p name s

-- | The most general way to run a parser. @runParserT p file input@ runs
-- parser @p@ on the input list of tokens @input@, obtained from source
-- @file@. The @file@ is only used in error messages and may be the empty
-- string. Returns a computation in the underlying monad @m@ that return
-- either a 'ParseError' ('Left') or a value of type @a@ ('Right').

runParserT :: (Monad m, Stream s t) =>
              ParsecT s m a -> SourceName -> s -> m (Either ParseError a)
runParserT p name s = do
  res <- runParsecT p $ State s (initialPos name)
  r <- parserReply res
  case r of
    Ok x _    -> return $ Right x
    Error err -> return $ Left err
  where parserReply res =
          case res of
            Consumed r -> r
            Empty    r -> r

-- | Low-level unpacking of the ParsecT type. 'runParserT' and 'runParser'
-- are built upon this.

runParsecT :: Monad m => ParsecT s m a -> State s -> m (Consumed (m (Reply s a)))
runParsecT p s = unParser p s cok cerr eok eerr
  where cok a s' _ = return . Consumed . return $ Ok a s'
        cerr err   = return . Consumed . return $ Error err
        eok a s' _ = return . Empty    . return $ Ok a s'
        eerr err   = return . Empty    . return $ Error err