1
1
mirror of https://github.com/github/semantic.git synced 2024-12-20 13:21:59 +03:00

Merge pull request #1183 from github/fast-markdown-assignment

Fast Markdown assignment
This commit is contained in:
Rob Rix 2017-06-26 12:23:46 -04:00 committed by GitHub
commit fc4760335f
5 changed files with 58 additions and 62 deletions

View File

@ -21,12 +21,15 @@ module Data.Source
-- Conversion
, spanToRange
, spanToRangeInLineRanges
, sourceLineRangesByLineNumber
, rangeToSpan
-- Listable
, ListableByteString(..)
) where
import Data.Array
import qualified Data.ByteString as B
import Data.Char (ord)
import Data.List (span)
import Data.Range
import Data.Span
@ -35,7 +38,7 @@ import qualified Data.Text as T
import Prologue
import Test.LeanCheck
-- | The contents of a source file, represented as a ByteString.
-- | The contents of a source file, represented as a 'ByteString'.
newtype Source = Source { sourceBytes :: B.ByteString }
deriving (Eq, IsString, Show)
@ -90,54 +93,44 @@ takeSource i = Source . take . sourceBytes
-- Splitting
-- | Split the source into the longest prefix of elements that do not satisfy the predicate and the rest without copying.
breakSource :: (Word8 -> Bool) -> Source -> (Source, Source)
breakSource predicate (Source text) = let (start, remainder) = B.break predicate text in (Source start, Source remainder)
-- | Split the contents of the source after newlines.
sourceLines :: Source -> [Source]
sourceLines source
| nullSource source = [ source ]
| otherwise = case breakSource (== toEnum (fromEnum '\n')) source of
(line, rest)
| nullSource rest -> [ line ]
| otherwise -> (line <> "\n") : sourceLines (dropSource 1 rest)
sourceLines source = (`slice` source) <$> sourceLineRanges source
-- | Compute the 'Range's of each line in a 'Source'.
sourceLineRanges :: Source -> [Range]
sourceLineRanges = drop 1 . scanl toRange (Range 0 0) . sourceLines
where toRange previous source = Range (end previous) $ end previous + sourceLength source
sourceLineRanges source = sourceLineRangesWithin (totalRange source) source
-- | Compute the 'Range's of each line in a 'Range' of a 'Source'.
sourceLineRangesWithin :: Range -> Source -> [Range]
sourceLineRangesWithin range = drop 1 . scanl toRange (Range (start range) (start range)) . sourceLines . slice range
where toRange previous source = Range (end previous) $ end previous + sourceLength source
sourceLineRangesWithin range = uncurry (zipWith Range) . ((start range:) &&& (<> [ end range ])) . fmap (+ succ (start range)) . B.elemIndices (toEnum (ord '\n')) . sourceBytes . slice range
-- Conversion
-- | Compute the byte 'Range' corresponding to a given 'Span' in a 'Source'.
spanToRange :: Source -> Span -> Range
spanToRange source = spanToRangeInLineRanges (sourceLineRanges source)
spanToRange = spanToRangeInLineRanges . sourceLineRangesByLineNumber
spanToRangeInLineRanges :: [Range] -> Span -> Range
spanToRangeInLineRanges lineRanges Span{..} = Range start end
where start = pred (sumLengths leadingRanges + posColumn spanStart)
end = start + sumLengths (take (posLine spanEnd - posLine spanStart) remainingRanges) + (posColumn spanEnd - posColumn spanStart)
(leadingRanges, remainingRanges) = splitAt (pred (posLine spanStart)) lineRanges
sumLengths = sum . fmap rangeLength
spanToRangeInLineRanges :: Array Int Range -> Span -> Range
spanToRangeInLineRanges lineRanges Span{..} = Range
(start (lineRanges ! posLine spanStart) + pred (posColumn spanStart))
(start (lineRanges ! posLine spanEnd) + pred (posColumn spanEnd))
sourceLineRangesByLineNumber :: Source -> Array Int Range
sourceLineRangesByLineNumber source = listArray (1, length lineRanges) lineRanges
where lineRanges = sourceLineRanges source
-- | Compute the 'Span' corresponding to a given byte 'Range' in a 'Source'.
rangeToSpan :: Source -> Range -> Span
rangeToSpan source (Range rangeStart rangeEnd) = Span startPos endPos
where startPos = Pos (firstLine + 1) (rangeStart - start firstRange + 1)
where startPos = Pos (firstLine + 1) (rangeStart - start firstRange + 1)
endPos = Pos (firstLine + length lineRanges) (rangeEnd - start lastRange + 1)
firstLine = length before
(before, rest) = span ((< rangeStart) . end) (sourceLineRanges source)
(lineRanges, _) = span ((<= rangeEnd) . start) rest
Just firstRange = getFirst (foldMap (First . Just) lineRanges)
Just lastRange = getLast (foldMap (Last . Just) lineRanges)
Just lastRange = getLast (foldMap (Last . Just) lineRanges)
-- Instances

View File

@ -115,6 +115,7 @@ data AssignmentF ast grammar a where
Source :: HasCallStack => AssignmentF ast grammar ByteString
Children :: HasCallStack => Assignment ast grammar a -> AssignmentF ast grammar a
Choose :: HasCallStack => IntMap.IntMap a -> AssignmentF ast grammar a
Many :: HasCallStack => Assignment ast grammar a -> AssignmentF ast grammar [a]
Alt :: HasCallStack => a -> a -> AssignmentF ast grammar a
Empty :: HasCallStack => AssignmentF ast grammar a
Throw :: HasCallStack => Error grammar -> AssignmentF ast grammar a
@ -128,13 +129,13 @@ location = Location `Then` return
-- | Zero-width projection of the current node.
--
-- Since this is zero-width, care must be taken not to repeat it without chaining on other rules. I.e. 'many (project f *> b)' is fine, but 'many (project f)' is not.
-- Since this is zero-width, care must be taken not to repeat it without chaining on other rules. I.e. @many (project f *> b)@ is fine, but @many (project f)@ is not.
project :: HasCallStack => (forall x. Base ast x -> a) -> Assignment ast grammar a
project projection = Project projection `Then` return
-- | Zero-width match of a node with the given symbol, producing the current nodes location.
--
-- Since this is zero-width, care must be taken not to repeat it without chaining on other rules. I.e. 'many (symbol A *> b)' is fine, but 'many (symbol A)' is not.
-- Since this is zero-width, care must be taken not to repeat it without chaining on other rules. I.e. @many (symbol A *> b)@ is fine, but @many (symbol A)@ is not.
symbol :: (Enum grammar, Eq grammar, HasCallStack) => grammar -> Assignment ast grammar (Record Location)
symbol s = withFrozenCallStack $ Choose (IntMap.singleton (fromEnum s) ()) `Then` (const location)
@ -182,12 +183,10 @@ data ErrorCause grammar
-- | Pretty-print an Error with reference to the source where it occurred.
printError :: Show grammar => Source.Source -> Error grammar -> IO ()
printError source error@Error{..}
= do
withSGRCode [SetConsoleIntensity BoldIntensity] . putStrErr . (showPos Nothing errorPos) . showString ": " $ ""
withSGRCode [SetColor Foreground Vivid Red] . putStrErr . (showString "error") . showString ": " . showExpectation error . showChar '\n' . showString (toS context) . (if isSuffixOf "\n" context then identity else showChar '\n') . showString (replicate (succ (Info.posColumn errorPos + lineNumberDigits)) ' ') $ ""
withSGRCode [SetColor Foreground Vivid Green] . putStrErr . (showChar '^') . showChar '\n' . showString (prettyCallStack callStack) $ ""
printError source error@Error{..} = do
withSGRCode [SetConsoleIntensity BoldIntensity] . putStrErr . showPos Nothing errorPos . showString ": " $ ""
withSGRCode [SetColor Foreground Vivid Red] . putStrErr . showString "error" . showString ": " . showExpectation error . showChar '\n' . showString (toS context) . (if isSuffixOf "\n" context then identity else showChar '\n') . showString (replicate (succ (Info.posColumn errorPos + lineNumberDigits)) ' ') $ ""
withSGRCode [SetColor Foreground Vivid Green] . putStrErr . showChar '^' . showChar '\n' . showString (prettyCallStack callStack) $ ""
where context = maybe "\n" (Source.sourceBytes . sconcat) (nonEmpty [ Source.fromBytes (toS (showLineNumber i)) <> Source.fromBytes ": " <> l | (i, l) <- zip [1..] (Source.sourceLines source), inRange (Info.posLine errorPos - 2, Info.posLine errorPos) i ])
showLineNumber n = let s = show n in replicate (lineNumberDigits - length s) ' ' <> s
lineNumberDigits = succ (floor (logBase 10 (fromIntegral (Info.posLine errorPos) :: Double)))
@ -248,7 +247,9 @@ runAssignment toRecord = iterFreer run . fmap ((pure .) . (,))
Result _ (Just (a, state')) -> yield a (advanceState (rtail . toRecord) state' { stateNodes = stateNodes })
Result err Nothing -> Result err Nothing
(Choose choices, node : _) | Just symbol :. _ <- toRecord (F.project node), Just a <- IntMap.lookup (fromEnum symbol) choices -> yield a state
-- Nullability: some rules, e.g. 'pure a' and 'many a', should match at the end of input. Either side of an alternation may be nullable, ergo Alt can match at the end of input.
(Many _, []) -> yield [] state
(Many rule, _) -> uncurry yield (runMany rule state)
-- Nullability: some rules, e.g. @pure a@ and @many a@, should match at the end of input. Either side of an alternation may be nullable, ergo Alt can match at the end of input.
(Alt a b, _) -> yield a state <|> yield b state
(Throw e, _) -> Result (Just e) Nothing
(Catch during handler, _) -> case yield during state of
@ -263,6 +264,10 @@ runAssignment toRecord = iterFreer run . fmap ((pure .) . (,))
Choose choices -> choiceSymbols choices
_ -> []
choiceSymbols choices = (toEnum :: Int -> grammar) <$> IntMap.keys choices
runMany :: Assignment ast grammar v -> AssignmentState ast -> ([v], AssignmentState ast)
runMany rule state = case runAssignment toRecord rule state of
Result _ (Just (a, state')) -> first (a :) (runMany rule state')
_ -> ([], state)
dropAnonymous :: (Symbol grammar, Recursive ast) => (forall x. Base ast x -> Maybe grammar) -> AssignmentState ast -> AssignmentState ast
dropAnonymous toSymbol state = state { stateNodes = dropWhile ((`notElem` [Just Regular, Nothing]) . fmap symbolType . toSymbol . F.project) (stateNodes state) }
@ -284,7 +289,7 @@ data AssignmentState ast = AssignmentState
deriving (Eq, Show)
makeState :: Source.Source -> [ast] -> AssignmentState ast
makeState source nodes = AssignmentState 0 (Info.Pos 1 1) source nodes
makeState = AssignmentState 0 (Info.Pos 1 1)
-- Instances
@ -293,11 +298,16 @@ instance Enum grammar => Alternative (Assignment ast grammar) where
empty :: HasCallStack => Assignment ast grammar a
empty = Empty `Then` return
(<|>) :: HasCallStack => Assignment ast grammar a -> Assignment ast grammar a -> Assignment ast grammar a
a <|> b = case (a, b) of
(_, Empty `Then` _) -> a
(Empty `Then` _, _) -> b
(Choose choices1 `Then` continue1, Choose choices2 `Then` continue2) -> Choose (IntMap.union (fmap continue1 choices1) (fmap continue2 choices2)) `Then` identity
_ -> wrap $ Alt a b
Return a <|> _ = Return a
a <|> b | Just c <- (liftA2 (<>) `on` choices) a b = Choose c `Then` identity
| otherwise = wrap $ Alt a b
where choices :: Assignment ast grammar a -> Maybe (IntMap (Assignment ast grammar a))
choices (Choose choices `Then` continue) = Just (continue <$> choices)
choices (Empty `Then` _) = Just mempty
choices (Many rule `Then` continue) = fmap (const (Many rule `Then` continue)) <$> choices rule
choices _ = Nothing
many :: HasCallStack => Assignment ast grammar a -> Assignment ast grammar [a]
many a = Many a `Then` return
instance Show grammar => Show1 (AssignmentF ast grammar) where
liftShowsPrec sp sl d a = case a of
@ -306,6 +316,7 @@ instance Show grammar => Show1 (AssignmentF ast grammar) where
Source -> showString "Source" . showChar ' ' . sp d ""
Children a -> showsUnaryWith (liftShowsPrec sp sl) "Children" d a
Choose choices -> showsUnaryWith (liftShowsPrec (liftShowsPrec sp sl) (liftShowList sp sl)) "Choose" d (IntMap.toList choices)
Many a -> showsUnaryWith (liftShowsPrec (\ d a -> sp d [a]) (sl . pure)) "Many" d a
Alt a b -> showsBinaryWith sp sp "Alt" d a b
Empty -> showString "Empty"
Throw e -> showsUnaryWith showsPrec "Throw" d e

View File

@ -44,9 +44,9 @@ cmarkParser source = toTerm (totalRange source) (totalSpan source) $ commonmarkT
span = maybe withinSpan toSpan position
in cofree $ (t :. range :. span :. Nil) :< (toTerm range span <$> children)
toSpan PosInfo{..} = Span (Pos startLine startColumn) (Pos endLine (succ endColumn))
toSpan PosInfo{..} = Span (Pos startLine startColumn) (Pos (max startLine endLine) (succ (if endLine <= startLine then max startColumn endColumn else endColumn)))
lineRanges = sourceLineRanges source
lineRanges = sourceLineRangesByLineNumber source
toGrammar :: NodeType -> Grammar
toGrammar DOCUMENT{} = Document

View File

@ -62,7 +62,7 @@ paragraph :: Assignment
paragraph = makeTerm <$> symbol Paragraph <*> children (Markup.Paragraph <$> many inlineElement)
list :: Assignment
list = (cofree .) . (:<) <$> symbol List <*> (project (\ (((CMark.LIST CMark.ListAttributes{..}) :. _) :< _) -> case listType of
list = (cofree .) . (:<) <$> symbol List <*> (project (\ ((CMark.LIST CMark.ListAttributes{..} :. _) :< _) -> case listType of
CMark.BULLET_LIST -> inj . Markup.UnorderedList
CMark.ORDERED_LIST -> inj . Markup.OrderedList) <*> children (many item))
@ -71,7 +71,7 @@ item = makeTerm <$> symbol Item <*> children (many blockElement)
section :: Assignment
section = makeTerm <$> symbol Heading <*> (heading >>= \ headingTerm -> Markup.Section headingTerm <$> while (((<) `on` level) headingTerm) blockElement)
where heading = makeTerm <$> symbol Heading <*> (Markup.Heading <$> project (\ ((CMark.HEADING level :. _) :< _) -> level) <*> children (many inlineElement))
where heading = makeTerm <$> symbol Heading <*> (project (\ ((CMark.HEADING level :. _) :< _) -> Markup.Heading level) <*> children (many inlineElement))
level term = case term of
_ | Just section <- prj (unwrap term) -> level (Markup.sectionHeading section)
_ | Just heading <- prj (unwrap term) -> Markup.headingLevel heading
@ -81,10 +81,10 @@ blockQuote :: Assignment
blockQuote = makeTerm <$> symbol BlockQuote <*> children (Markup.BlockQuote <$> many blockElement)
codeBlock :: Assignment
codeBlock = makeTerm <$> symbol CodeBlock <*> (Markup.Code <$> project (\ (((CMark.CODE_BLOCK language _) :. _) :< _) -> nullText language) <*> source)
codeBlock = makeTerm <$> symbol CodeBlock <*> (project (\ ((CMark.CODE_BLOCK language _ :. _) :< _) -> Markup.Code (nullText language)) <*> source)
thematicBreak :: Assignment
thematicBreak = makeTerm <$> symbol ThematicBreak <*> (Markup.ThematicBreak <$ source)
thematicBreak = makeTerm <$> symbol ThematicBreak <*> pure Markup.ThematicBreak <* source
htmlBlock :: Assignment
htmlBlock = makeTerm <$> symbol HTMLBlock <*> (Markup.HTMLBlock <$> source)
@ -105,19 +105,19 @@ text :: Assignment
text = makeTerm <$> symbol Text <*> (Markup.Text <$> source)
link :: Assignment
link = makeTerm <$> symbol Link <*> (uncurry Markup.Link <$> project (\ (((CMark.LINK url title) :. _) :< _) -> (toS url, nullText title))) <* source
link = makeTerm <$> symbol Link <*> project (\ ((CMark.LINK url title :. _) :< _) -> Markup.Link (toS url) (nullText title)) <* source
image :: Assignment
image = makeTerm <$> symbol Image <*> (uncurry Markup.Image <$> project (\ (((CMark.IMAGE url title) :. _) :< _) -> (toS url, nullText title))) <* source
image = makeTerm <$> symbol Image <*> project (\ ((CMark.IMAGE url title :. _) :< _) -> Markup.Image (toS url) (nullText title)) <* source
code :: Assignment
code = makeTerm <$> symbol Code <*> (Markup.Code Nothing <$> source)
lineBreak :: Assignment
lineBreak = makeTerm <$> symbol LineBreak <*> (Markup.LineBreak <$ source)
lineBreak = makeTerm <$> symbol LineBreak <*> pure Markup.LineBreak <* source
softBreak :: Assignment
softBreak = makeTerm <$> symbol SoftBreak <*> (Markup.LineBreak <$ source)
softBreak = makeTerm <$> symbol SoftBreak <*> pure Markup.LineBreak <* source
-- Implementation details

View File

@ -16,7 +16,6 @@ import Data.Record
import Data.Source as Source
import qualified Data.Syntax as Syntax
import Data.Syntax.Assignment
import qualified Data.Text as T
import Data.Union
import Info hiding (Empty, Go)
import Language
@ -93,7 +92,7 @@ runParser parser = case parser of
Nothing -> pure (errorTerm source err)
TreeSitterParser language tslanguage -> treeSitterParser language tslanguage
MarkdownParser -> pure . cmarkParser
LineByLineParser -> lineByLineParser
LineByLineParser -> pure . lineByLineParser
errorTerm :: Syntax.Error (Error grammar) :< fs => Source -> Maybe (Error grammar) -> Term (Union fs) (Record Location)
errorTerm source err = cofree ((totalRange source :. totalSpan source :. Nil) :< inj (Syntax.Error (fromMaybe (Error (Pos 0 0) (UnexpectedEndOfInput [])) err)))
@ -104,13 +103,6 @@ termErrors = cata $ \ (_ :< s) -> case s of
_ -> fold s
-- | A fallback parser that treats a file simply as rows of strings.
lineByLineParser :: Source -> IO (SyntaxTerm Text DefaultFields)
lineByLineParser source = pure . cofree . root $ case foldl' annotateLeaves ([], 0) lines of
(leaves, _) -> cofree <$> leaves
where
lines = sourceLines source
root children = (sourceRange :. Program :. rangeToSpan source sourceRange :. Nil) :< Indexed children
sourceRange = totalRange source
leaf byteIndex line = (Range byteIndex (byteIndex + T.length line) :. Program :. rangeToSpan source (Range byteIndex (byteIndex + T.length line)) :. Nil) :< Leaf line
annotateLeaves (accum, byteIndex) line =
(accum <> [ leaf byteIndex (toText line) ] , byteIndex + sourceLength line)
lineByLineParser :: Source -> SyntaxTerm Text DefaultFields
lineByLineParser source = cofree $ (totalRange source :. Program :. totalSpan source :. Nil) :< Indexed (zipWith toLine [1..] (sourceLineRanges source))
where toLine line range = cofree $ (range :. Program :. Span (Pos line 1) (Pos line (end range)) :. Nil) :< Leaf (toText (slice range source))