Merge pull request #76 from mrkkrp/advanced-indent

Advanced combinators for indentation-sensitive parsing
This commit is contained in:
Mark Karpov 2016-01-09 17:42:03 +05:00
commit 61dc477eea
4 changed files with 305 additions and 98 deletions

View File

@ -1,3 +1,18 @@
## Megaparsec 4.3.0
* Canonicalized `Applicative`/`Monad` instances. Thanks to Herbert Valerio
Riedel.
* Custom messages in `ParseError` are printed each on its own line.
* Now accumulated hints are not used with `ParseError` records that have
only custom messages in them (created with `Message` constructor, as
opposed to `Unexpected` or `Expected`). This strips “expected” line from
custom error messages where it's unlikely to be relevant anyway.
* Added higher-level combinators for indentation-sensitive grammars:
`indentLevel`, `nonIndented`, and `indentBlock`.
## Megaparsec 4.2.0
* Made `newPos` constructor and other functions in `Text.Megaparsec.Pos`

View File

@ -19,14 +19,19 @@
-- > import qualified Text.Megaparsec.Lexer as L
module Text.Megaparsec.Lexer
( -- * White space and indentation
( -- * White space
space
, lexeme
, symbol
, symbol'
, indentGuard
, skipLineComment
, skipBlockComment
-- * Indentation
, indentLevel
, indentGuard
, nonIndented
, IndentOpt (..)
, indentBlock
-- * Character and string literals
, charLiteral
-- * Numbers
@ -40,10 +45,10 @@ module Text.Megaparsec.Lexer
, signed )
where
import Control.Applicative ((<|>), some)
import Control.Applicative ((<|>), some, optional)
import Control.Monad (void)
import Data.Char (readLitChar)
import Data.Maybe (listToMaybe)
import Data.Maybe (listToMaybe, fromMaybe)
import Prelude hiding (negate)
import qualified Prelude
@ -85,10 +90,10 @@ import Control.Applicative ((<$>), (<*), (*>), (<*>), pure)
-- of the file).
space :: MonadParsec s m Char
=> m () -- ^ A parser for a space character (e.g. 'C.spaceChar')
-> m () -- ^ A parser for a line comment (e.g. 'skipLineComment')
-> m () -- ^ A parser for a block comment (e.g. 'skipBlockComment')
-> m ()
=> m () -- ^ A parser for a space character (e.g. 'C.spaceChar')
-> m () -- ^ A parser for a line comment (e.g. 'skipLineComment')
-> m () -- ^ A parser for a block comment (e.g. 'skipBlockComment')
-> m ()
space ch line block = hidden . skipMany $ choice [ch, line, block]
-- | This is wrapper for lexemes. Typical usage is to supply first argument
@ -98,7 +103,10 @@ space ch line block = hidden . skipMany $ choice [ch, line, block]
-- > lexeme = L.lexeme spaceConsumer
-- > integer = lexeme L.integer
lexeme :: MonadParsec s m Char => m () -> m a -> m a
lexeme :: MonadParsec s m Char
=> m () -- ^ How to consume white space after lexeme
-> m a -- ^ How to parse actual lexeme
-> m a
lexeme spc p = p <* spc
-- | This is a helper to parse symbols, i.e. verbatim strings. You pass the
@ -116,15 +124,55 @@ lexeme spc p = p <* spc
-- > colon = symbol ":"
-- > dot = symbol "."
symbol :: MonadParsec s m Char => m () -> String -> m String
symbol :: MonadParsec s m Char
=> m () -- ^ How to consume white space after lexeme
-> String -- ^ String to parse
-> m String
symbol spc = lexeme spc . C.string
-- | Case-insensitive version of 'symbol'. This may be helpful if you're
-- working with case-insensitive languages.
symbol' :: MonadParsec s m Char => m () -> String -> m String
symbol' :: MonadParsec s m Char
=> m () -- ^ How to consume white space after lexeme
-> String -- ^ String to parse (case-insensitive)
-> m String
symbol' spc = lexeme spc . C.string'
-- | Given comment prefix this function returns parser that skips line
-- comments. Note that it stops just before newline character but doesn't
-- consume the newline. Newline is either supposed to be consumed by 'space'
-- parser or picked up manually.
skipLineComment :: MonadParsec s m Char
=> String -- ^ Line comment prefix
-> m ()
skipLineComment prefix = p >> void (manyTill C.anyChar n)
where p = try (C.string prefix)
n = lookAhead C.newline
-- | @skipBlockComment start end@ skips non-nested block comment starting
-- with @start@ and ending with @end@.
skipBlockComment :: MonadParsec s m Char
=> String -- ^ Start of block comment
-> String -- ^ End of block comment
-> m ()
skipBlockComment start end = p >> void (manyTill C.anyChar n)
where p = try (C.string start)
n = try (C.string end)
-- Indentation
-- | Return current indentation level.
--
-- The function is a simple shortcut defined as:
--
-- > indentLevel = sourceColumn <$> getPosition
indentLevel :: MonadParsec s m t => m Int
indentLevel = sourceColumn <$> getPosition
-- | @indentGuard spaceConsumer test@ first consumes all white space
-- (indentation) with @spaceConsumer@ parser, then it checks column
-- position. It should satisfy supplied predicate @test@, otherwise the
@ -136,31 +184,88 @@ symbol' spc = lexeme spc . C.string'
-- indentation. Use returned value to check indentation on every subsequent
-- line according to syntax of your language.
indentGuard :: MonadParsec s m Char => m () -> (Int -> Bool) -> m Int
indentGuard :: MonadParsec s m Char
=> m () -- ^ How to consume indentation (white space)
-> (Int -> Bool) -- ^ Predicate checking indentation level
-> m Int -- ^ Current column (indentation level)
indentGuard spc p = do
spc
pos <- sourceColumn <$> getPosition
if p pos
then return pos
else fail "incorrect indentation"
lvl <- indentLevel
if p lvl
then return lvl
else fail ii
-- | Given comment prefix this function returns parser that skips line
-- comments. Note that it stops just before newline character but doesn't
-- consume the newline. Newline is either supposed to be consumed by 'space'
-- parser or picked up manually.
-- | Parse non-indented construction. This ensures that there is no
-- indentation before actual data. Useful, for example, as a wrapper for
-- top-level function definitions.
skipLineComment :: MonadParsec s m Char => String -> m ()
skipLineComment prefix = p >> void (manyTill C.anyChar n)
where p = try $ C.string prefix
n = lookAhead C.newline
nonIndented :: MonadParsec s m Char
=> m () -- ^ How to consume indentation (white space)
-> m a -- ^ How to parse actual data
-> m a
nonIndented sc p = indentGuard sc (== 1) *> p
-- | @skipBlockComment start end@ skips non-nested block comment starting
-- with @start@ and ending with @end@.
-- | The data type represents available behaviors for parsing of indented
-- tokens. This is used in 'indentBlock', which see.
skipBlockComment :: MonadParsec s m Char => String -> String -> m ()
skipBlockComment start end = p >> void (manyTill C.anyChar n)
where p = try $ C.string start
n = try $ C.string end
data IndentOpt m a b
= IndentNone a
-- ^ Parse no indented tokens, just return the value
| IndentMany (Maybe Int) ([b] -> m a) (m b)
-- ^ Parse many indented tokens (possibly zero), use given indentation
-- level (if 'Nothing', use level of the first indented token); the
-- second argument tells how to get final result, and third argument
-- describes how to parse indented token
| IndentSome (Maybe Int) ([b] -> m a) (m b)
-- ^ Just like 'ManyIndent', but requires at least one indented token to
-- be present
-- | Parse a “reference” token and a number of other tokens that have
-- greater (but the same) level of indentation than that of “reference”
-- token. Reference token can influence parsing, see 'IndentOpt' for more
-- information.
--
-- Tokens /must not/ consume newlines after them. On the other hand, the
-- first argument of this function /should/ consume newlines among other
-- white space characters.
indentBlock :: MonadParsec String m Char
=> m () -- ^ How to consume indentation (white space)
-> m (IndentOpt m a b) -- ^ How to parse “reference” token
-> m a
indentBlock sc r = do
ref <- indentGuard sc (const True)
a <- r
case a of
IndentNone x -> return x
IndentMany indent f p -> do
mlvl <- lookAhead . optional . try $ C.eol *> indentGuard sc (> ref)
case mlvl of
Nothing -> sc *> f []
Just lvl -> indentedItems ref (fromMaybe lvl indent) sc p >>= f
IndentSome indent f p -> do
lvl <- lookAhead . try $ C.eol *> indentGuard sc (> ref)
indentedItems ref (fromMaybe lvl indent) sc p >>= f
-- | Grab indented items. This is a helper for 'indentBlock', it's not a
-- part of public API.
indentedItems :: MonadParsec String m Char
=> Int -- ^ Reference indentation level
-> Int -- ^ Level of the first indented item ('lookAhead'ed)
-> m () -- ^ How to consume indentation (white space)
-> m b -- ^ How to parse indented tokens
-> m [b]
indentedItems ref lvl sc p = go
where
go = (sc *> indentLevel) >>= re
re pos
| pos <= ref = return []
| pos == lvl = (:) <$> p <*> go
| otherwise = fail ii
ii :: String
ii = "incorrect indentation"
-- Character and string literals
@ -256,21 +361,18 @@ nump prefix baseDigit = read . (prefix ++) <$> some baseDigit
-- If you need to parse signed floats, see 'signed'.
float :: MonadParsec s m Char => m Double
float = label "float" $ read <$> f
where f = do
d <- some C.digitChar
rest <- fraction <|> fExp
return $ d ++ rest
float = label "float" (read <$> f)
where f = (++) <$> some C.digitChar <*> (fraction <|> fExp)
-- | This is a helper for 'float' parser. It parses fractional part of
-- floating point number, that is, dot and everything after it.
fraction :: MonadParsec s m Char => m String
fraction = do
void $ C.char '.'
void (C.char '.')
d <- some C.digitChar
e <- option "" fExp
return $ '.' : d ++ e
return ('.' : d ++ e)
-- | This helper parses exponent of floating point numbers.
@ -279,7 +381,7 @@ fExp = do
expChar <- C.char' 'e'
signStr <- option "" (pure <$> choice (C.char <$> "+-"))
d <- some C.digitChar
return $ expChar : signStr ++ d
return (expChar : signStr ++ d)
-- | Parse a number: either integer or floating point. The parser can handle
-- overlapping grammars graciously.

View File

@ -136,9 +136,18 @@ toHints err = Hints hints
msgs = filter ((== 1) . fromEnum) $ errorMessages err
-- | @withHints hs c@ makes “error” continuation @c@ use given hints @hs@.
--
-- Note that if resulting continuation gets 'ParseError' where all messages
-- are created with 'Message' constructor, hints are ignored.
withHints :: Hints -> (ParseError -> m b) -> ParseError -> m b
withHints (Hints xs) c = c . addErrorMessages (Expected <$> concat xs)
withHints (Hints xs) c e =
let isMessage (Message _) = True
isMessage _ = False
in (if all isMessage (errorMessages e)
then c
else c . addErrorMessages (Expected <$> concat xs))
e
-- | @accHints hs c@ results in “OK” continuation that will add given hints
-- @hs@ to third argument of original continuation @c@.
@ -183,7 +192,7 @@ instance Stream TL.Text Char where
{-# INLINE uncons #-}
-- | @StorableStream@ abstracts ability of some streams to be stored in a
-- file. This is used by polymorphic function 'readFromFile'.
-- file. This is used by the polymorphic function 'parseFromFile'.
class Stream s t => StorableStream s t where
@ -265,7 +274,7 @@ pMap f p = ParsecT $ \s cok cerr eok eerr ->
instance A.Applicative (ParsecT s m) where
pure = pPure
(<*>) = ap
p1 *> p2 = p1 `pBind` \_ -> p2
p1 *> p2 = p1 `pBind` const p2
p1 <* p2 = do { x1 <- p1 ; void p2 ; return x1 }
instance A.Alternative (ParsecT s m) where

View File

@ -27,6 +27,8 @@
-- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-- POSSIBILITY OF SUCH DAMAGE.
{-# LANGUAGE TupleSections #-}
module Lexer (tests) where
import Control.Applicative (empty)
@ -39,7 +41,7 @@ import Data.Char
, isSpace
, toLower )
import Data.List (findIndices, isInfixOf, find)
import Data.Maybe (listToMaybe, maybeToList, isNothing, fromJust)
import Data.Maybe
import Numeric (showInt, showHex, showOct, showSigned)
import Test.Framework
@ -53,10 +55,11 @@ import Text.Megaparsec.Prim
import Text.Megaparsec.String
import qualified Text.Megaparsec.Char as C
import Prim () -- 'Arbitrary' instance for 'SourcePos'
import Util
#if !MIN_VERSION_base(4,8,0)
import Control.Applicative ((<$>), (<*), (<*>))
import Control.Applicative ((<$>), (<*), (<*>), (<$))
#endif
tests :: Test
@ -64,7 +67,11 @@ tests = testGroup "Lexer"
[ testProperty "space combinator" prop_space
, testProperty "symbol combinator" prop_symbol
, testProperty "symbol' combinator" prop_symbol'
, testProperty "indentLevel" prop_indentLevel
, testProperty "indentGuard combinator" prop_indentGuard
, testProperty "nonIndented combinator" prop_nonIndented
, testProperty "indentBlock combinator" prop_indentBlock
, testProperty "indentBlock (many)" prop_indentMany
, testProperty "charLiteral" prop_charLiteral
, testProperty "integer" prop_integer
, testProperty "decimal" prop_decimal
@ -77,25 +84,25 @@ tests = testGroup "Lexer"
, testProperty "number 2 (signed)" prop_number_2
, testProperty "signed" prop_signed ]
newtype WhiteSpace = WhiteSpace
{ getWhiteSpace :: String }
deriving (Show, Eq)
-- White space
instance Arbitrary WhiteSpace where
arbitrary = WhiteSpace . concat <$> listOf whiteUnit
mkWhiteSpace :: Gen String
mkWhiteSpace = concat <$> listOf whiteUnit
where whiteUnit = oneof [whiteChars, whiteLine, whiteBlock]
newtype Symbol = Symbol
{ getSymbol :: String }
deriving (Show, Eq)
mkSymbol :: Gen String
mkSymbol = (++) <$> symbolName <*> whiteChars
instance Arbitrary Symbol where
arbitrary = Symbol <$> ((++) <$> symbolName <*> whiteChars)
whiteUnit :: Gen String
whiteUnit = oneof [whiteChars, whiteLine, whiteBlock]
mkIndent :: String -> Int -> Gen String
mkIndent x n = concat <$> sequence [spc, sym, tra, eol]
where spc = frequency [(5, vectorOf n itm), (1, listOf itm)]
tra = listOf itm
itm = elements " \t"
sym = return x
eol = frequency [(5, return "\n"), (1, listOf1 (return '\n'))]
whiteChars :: Gen String
whiteChars = listOf $ elements "\t\n "
whiteChars = listOf (elements "\t\n ")
whiteLine :: Gen String
whiteLine = commentOut <$> arbitrary `suchThat` goodEnough
@ -118,52 +125,124 @@ sc = space (void C.spaceChar) l b
sc' :: Parser ()
sc' = space (void $ C.oneOf " \t") empty empty
prop_space :: WhiteSpace -> Property
prop_space w = checkParser p r s
prop_space :: Property
prop_space = forAll mkWhiteSpace (checkParser p r)
where p = sc
r = Right ()
s = getWhiteSpace w
prop_symbol :: Symbol -> Maybe Char -> Property
prop_symbol = parseSymbol (symbol sc) id
prop_symbol :: Maybe Char -> Property
prop_symbol t = forAll mkSymbol $ \s ->
parseSymbol (symbol sc) id s t
prop_symbol' :: Symbol -> Maybe Char -> Property
prop_symbol' = parseSymbol (symbol' sc) (fmap toLower)
prop_symbol' :: Maybe Char -> Property
prop_symbol' t = forAll mkSymbol $ \s ->
parseSymbol (symbol' sc) (fmap toLower) s t
parseSymbol :: (String -> Parser String) -> (String -> String)
-> Symbol -> Maybe Char -> Property
parseSymbol
:: (String -> Parser String)
-> (String -> String)
-> String
-> Maybe Char
-> Property
parseSymbol p' f s' t = checkParser p r s
where p = p' (f g)
r | g == s || isSpace (last s) = Right g
| otherwise = posErr (length s - 1) s [uneCh (last s), exEof]
g = takeWhile (not . isSpace) s
s = getSymbol s' ++ maybeToList t
s = s' ++ maybeToList t
newtype IndLine = IndLine
{ getIndLine :: String }
deriving (Show, Eq)
-- Indentation
instance Arbitrary IndLine where
arbitrary = IndLine . concat <$> sequence [spc, sym, spc, eol]
where spc = listOf (elements " \t")
sym = return "xxx"
eol = return "\n"
prop_indentLevel :: SourcePos -> Property
prop_indentLevel pos = p /=\ sourceColumn pos
where p = setPosition pos >> indentLevel
prop_indentGuard :: IndLine -> IndLine -> IndLine -> Property
prop_indentGuard l0 l1 l2 = checkParser p r s
where p = ip (> 1) >>= \x -> sp >> ip (== x) >> sp >> ip (> x) >> sp
ip = indentGuard sc'
sp = void $ symbol sc' "xxx" <* C.eol
r | f' l0 <= 1 = posErr 0 s msg'
| f' l1 /= f' l0 = posErr (f l1 + g [l0]) s msg'
| f' l2 <= f' l0 = posErr (f l2 + g [l0, l1]) s msg'
prop_indentGuard :: NonNegative Int -> Property
prop_indentGuard n =
forAll ((,,) <$> mki <*> mki <*> mki) $ \(l0,l1,l2) ->
let r | getCol l0 <= 1 = posErr 0 s ii
| getCol l1 /= getCol l0 = posErr (getIndent l1 + g 1) s ii
| getCol l2 <= getCol l0 = posErr (getIndent l2 + g 2) s ii
| otherwise = Right ()
msg' = [msg "incorrect indentation"]
f = length . takeWhile isSpace . getIndLine
f' x = sourceColumn $ updatePosString defaultTabWidth (initialPos "") $
take (f x) (getIndLine x)
g xs = sum $ length . getIndLine <$> xs
s = concat $ getIndLine <$> [l0, l1, l2]
fragments = [l0,l1,l2]
g x = sum (length <$> take x fragments)
s = concat fragments
in checkParser p r s
where mki = mkIndent sbla (getNonNegative n)
p = ip (> 1) >>= \x -> sp >> ip (== x) >> sp >> ip (> x) >> sp >> sc
ip = indentGuard sc
sp = void (symbol sc' sbla <* C.eol)
prop_nonIndented :: Property
prop_nonIndented = forAll (mkIndent sbla 0) $ \s ->
let i = getIndent s
r | i == 0 = Right sbla
| otherwise = posErr i s ii
in checkParser p r s
where p = nonIndented sc (symbol sc sbla)
prop_indentBlock :: Maybe (Positive Int) -> Property
prop_indentBlock mn' = forAll mkBlock $ \(l0,l1,l2,l3,l4) ->
let r | getCol l1 <= getCol l0 =
posErr (getIndent l1 + g 1) s [uneCh (head sblb), exEof]
| isJust mn && getCol l1 /= ib =
posErr (getIndent l1 + g 1) s ii
| getCol l2 <= getCol l1 =
posErr (getIndent l2 + g 2) s ii
| getCol l3 == getCol l2 =
posErr (getIndent l3 + g 3) s [uneCh (head sblb), exStr sblc]
| getCol l3 <= getCol l0 =
posErr (getIndent l3 + g 3) s [uneCh (head sblb), exEof]
| getCol l3 /= getCol l1 =
posErr (getIndent l3 + g 3) s ii
| getCol l4 <= getCol l3 =
posErr (getIndent l4 + g 4) s ii
| otherwise = Right (sbla, [(sblb, [sblc]), (sblb, [sblc])])
fragments = [l0,l1,l2,l3,l4]
g x = sum (length <$> take x fragments)
s = concat fragments
in checkParser p r s
where mkBlock = do
l0 <- mkIndent sbla 0
l1 <- mkIndent sblb ib
l2 <- mkIndent sblc (ib + 2)
l3 <- mkIndent sblb ib
l4 <- mkIndent sblc (ib + 2)
return (l0,l1,l2,l3,l4)
p = lvla
lvla = indentBlock sc $ IndentMany mn (l sbla) lvlb <$ b sbla
lvlb = indentBlock sc $ IndentSome Nothing (l sblb) lvlc <$ b sblb
lvlc = indentBlock sc $ IndentNone sblc <$ b sblc
b = symbol sc'
l x = return . (x,)
mn = getPositive <$> mn'
ib = fromMaybe 2 mn
prop_indentMany :: Property
prop_indentMany = forAll (mkIndent "xxx" 0) (checkParser p r)
where r = Right (sbla, [])
p = lvla
lvla = indentBlock sc $ IndentMany Nothing (l sbla) lvlb <$ b sbla
lvlb = b sblb
b = symbol sc'
l x = return . (x,)
getIndent :: String -> Int
getIndent = length . takeWhile isSpace
getCol :: String -> Int
getCol x = sourceColumn $
updatePosString defaultTabWidth (initialPos "") $ take (getIndent x) x
sbla, sblb, sblc :: String
sbla = "xxx"
sblb = "yyy"
sblc = "zzz"
ii :: [Message]
ii = [msg "incorrect indentation"]
-- Character and string literals
prop_charLiteral :: String -> Bool -> Property
prop_charLiteral t i = checkParser charLiteral r s
@ -176,6 +255,8 @@ prop_charLiteral t i = checkParser charLiteral r s
l = length s - length g
s = if null t || i then t else showLitChar (head t) (tail t)
-- Numbers
prop_integer :: NonNegative Integer -> Int -> Property
prop_integer n' i = checkParser integer r s
where (r, s) = quasiCorrupted n' i showInt "integer"
@ -227,11 +308,11 @@ prop_signed n i plus = checkParser p r s
where p = signed (hidden C.space) integer
r | i > length z = Right n
| otherwise = posErr i s $ uneCh '?' :
(if i <= 0 then [exCh '+', exCh '-'] else []) ++
[exSpec $ if isNothing . find isDigit $ take i s
then "integer"
else "rest of integer"]
++ [exEof | i > head (findIndices isDigit s)]
(if i <= 0 then [exCh '+', exCh '-'] else []) ++
[exSpec $ if isNothing . find isDigit $ take i s
then "integer"
else "rest of integer"] ++
[exEof | i > head (findIndices isDigit s)]
z = let bar = showSigned showInt 0 n ""
in if n < 0 || plus then bar else '+' : bar
s = if i <= length z then take i z ++ "?" ++ drop i z else z
@ -243,9 +324,9 @@ quasiCorrupted n' i shower l = (r, s)
where n = getNonNegative n'
r | i > length z = Right n
| otherwise = posErr i s $ uneCh '?' :
[ exEof | i > 0 ] ++
[if i <= 0 || null l
then exSpec l
else exSpec $ "rest of " ++ l]
[ exEof | i > 0 ] ++
[if i <= 0 || null l
then exSpec l
else exSpec $ "rest of " ++ l]
z = shower n ""
s = if i <= length z then take i z ++ "?" ++ drop i z else z