mirror of
https://github.com/github/semantic.git
synced 2024-12-20 13:21:59 +03:00
Merge pull request #1183 from github/fast-markdown-assignment
Fast Markdown assignment
This commit is contained in:
commit
fc4760335f
@ -21,12 +21,15 @@ module Data.Source
|
||||
-- Conversion
|
||||
, spanToRange
|
||||
, spanToRangeInLineRanges
|
||||
, sourceLineRangesByLineNumber
|
||||
, rangeToSpan
|
||||
-- Listable
|
||||
, ListableByteString(..)
|
||||
) where
|
||||
|
||||
import Data.Array
|
||||
import qualified Data.ByteString as B
|
||||
import Data.Char (ord)
|
||||
import Data.List (span)
|
||||
import Data.Range
|
||||
import Data.Span
|
||||
@ -35,7 +38,7 @@ import qualified Data.Text as T
|
||||
import Prologue
|
||||
import Test.LeanCheck
|
||||
|
||||
-- | The contents of a source file, represented as a ByteString.
|
||||
-- | The contents of a source file, represented as a 'ByteString'.
|
||||
newtype Source = Source { sourceBytes :: B.ByteString }
|
||||
deriving (Eq, IsString, Show)
|
||||
|
||||
@ -90,54 +93,44 @@ takeSource i = Source . take . sourceBytes
|
||||
|
||||
-- Splitting
|
||||
|
||||
-- | Split the source into the longest prefix of elements that do not satisfy the predicate and the rest without copying.
|
||||
breakSource :: (Word8 -> Bool) -> Source -> (Source, Source)
|
||||
breakSource predicate (Source text) = let (start, remainder) = B.break predicate text in (Source start, Source remainder)
|
||||
|
||||
|
||||
-- | Split the contents of the source after newlines.
|
||||
sourceLines :: Source -> [Source]
|
||||
sourceLines source
|
||||
| nullSource source = [ source ]
|
||||
| otherwise = case breakSource (== toEnum (fromEnum '\n')) source of
|
||||
(line, rest)
|
||||
| nullSource rest -> [ line ]
|
||||
| otherwise -> (line <> "\n") : sourceLines (dropSource 1 rest)
|
||||
sourceLines source = (`slice` source) <$> sourceLineRanges source
|
||||
|
||||
-- | Compute the 'Range's of each line in a 'Source'.
|
||||
sourceLineRanges :: Source -> [Range]
|
||||
sourceLineRanges = drop 1 . scanl toRange (Range 0 0) . sourceLines
|
||||
where toRange previous source = Range (end previous) $ end previous + sourceLength source
|
||||
sourceLineRanges source = sourceLineRangesWithin (totalRange source) source
|
||||
|
||||
-- | Compute the 'Range's of each line in a 'Range' of a 'Source'.
|
||||
sourceLineRangesWithin :: Range -> Source -> [Range]
|
||||
sourceLineRangesWithin range = drop 1 . scanl toRange (Range (start range) (start range)) . sourceLines . slice range
|
||||
where toRange previous source = Range (end previous) $ end previous + sourceLength source
|
||||
sourceLineRangesWithin range = uncurry (zipWith Range) . ((start range:) &&& (<> [ end range ])) . fmap (+ succ (start range)) . B.elemIndices (toEnum (ord '\n')) . sourceBytes . slice range
|
||||
|
||||
|
||||
-- Conversion
|
||||
|
||||
-- | Compute the byte 'Range' corresponding to a given 'Span' in a 'Source'.
|
||||
spanToRange :: Source -> Span -> Range
|
||||
spanToRange source = spanToRangeInLineRanges (sourceLineRanges source)
|
||||
spanToRange = spanToRangeInLineRanges . sourceLineRangesByLineNumber
|
||||
|
||||
spanToRangeInLineRanges :: [Range] -> Span -> Range
|
||||
spanToRangeInLineRanges lineRanges Span{..} = Range start end
|
||||
where start = pred (sumLengths leadingRanges + posColumn spanStart)
|
||||
end = start + sumLengths (take (posLine spanEnd - posLine spanStart) remainingRanges) + (posColumn spanEnd - posColumn spanStart)
|
||||
(leadingRanges, remainingRanges) = splitAt (pred (posLine spanStart)) lineRanges
|
||||
sumLengths = sum . fmap rangeLength
|
||||
spanToRangeInLineRanges :: Array Int Range -> Span -> Range
|
||||
spanToRangeInLineRanges lineRanges Span{..} = Range
|
||||
(start (lineRanges ! posLine spanStart) + pred (posColumn spanStart))
|
||||
(start (lineRanges ! posLine spanEnd) + pred (posColumn spanEnd))
|
||||
|
||||
sourceLineRangesByLineNumber :: Source -> Array Int Range
|
||||
sourceLineRangesByLineNumber source = listArray (1, length lineRanges) lineRanges
|
||||
where lineRanges = sourceLineRanges source
|
||||
|
||||
-- | Compute the 'Span' corresponding to a given byte 'Range' in a 'Source'.
|
||||
rangeToSpan :: Source -> Range -> Span
|
||||
rangeToSpan source (Range rangeStart rangeEnd) = Span startPos endPos
|
||||
where startPos = Pos (firstLine + 1) (rangeStart - start firstRange + 1)
|
||||
where startPos = Pos (firstLine + 1) (rangeStart - start firstRange + 1)
|
||||
endPos = Pos (firstLine + length lineRanges) (rangeEnd - start lastRange + 1)
|
||||
firstLine = length before
|
||||
(before, rest) = span ((< rangeStart) . end) (sourceLineRanges source)
|
||||
(lineRanges, _) = span ((<= rangeEnd) . start) rest
|
||||
Just firstRange = getFirst (foldMap (First . Just) lineRanges)
|
||||
Just lastRange = getLast (foldMap (Last . Just) lineRanges)
|
||||
Just lastRange = getLast (foldMap (Last . Just) lineRanges)
|
||||
|
||||
|
||||
-- Instances
|
||||
|
@ -115,6 +115,7 @@ data AssignmentF ast grammar a where
|
||||
Source :: HasCallStack => AssignmentF ast grammar ByteString
|
||||
Children :: HasCallStack => Assignment ast grammar a -> AssignmentF ast grammar a
|
||||
Choose :: HasCallStack => IntMap.IntMap a -> AssignmentF ast grammar a
|
||||
Many :: HasCallStack => Assignment ast grammar a -> AssignmentF ast grammar [a]
|
||||
Alt :: HasCallStack => a -> a -> AssignmentF ast grammar a
|
||||
Empty :: HasCallStack => AssignmentF ast grammar a
|
||||
Throw :: HasCallStack => Error grammar -> AssignmentF ast grammar a
|
||||
@ -128,13 +129,13 @@ location = Location `Then` return
|
||||
|
||||
-- | Zero-width projection of the current node.
|
||||
--
|
||||
-- Since this is zero-width, care must be taken not to repeat it without chaining on other rules. I.e. 'many (project f *> b)' is fine, but 'many (project f)' is not.
|
||||
-- Since this is zero-width, care must be taken not to repeat it without chaining on other rules. I.e. @many (project f *> b)@ is fine, but @many (project f)@ is not.
|
||||
project :: HasCallStack => (forall x. Base ast x -> a) -> Assignment ast grammar a
|
||||
project projection = Project projection `Then` return
|
||||
|
||||
-- | Zero-width match of a node with the given symbol, producing the current node’s location.
|
||||
--
|
||||
-- Since this is zero-width, care must be taken not to repeat it without chaining on other rules. I.e. 'many (symbol A *> b)' is fine, but 'many (symbol A)' is not.
|
||||
-- Since this is zero-width, care must be taken not to repeat it without chaining on other rules. I.e. @many (symbol A *> b)@ is fine, but @many (symbol A)@ is not.
|
||||
symbol :: (Enum grammar, Eq grammar, HasCallStack) => grammar -> Assignment ast grammar (Record Location)
|
||||
symbol s = withFrozenCallStack $ Choose (IntMap.singleton (fromEnum s) ()) `Then` (const location)
|
||||
|
||||
@ -182,12 +183,10 @@ data ErrorCause grammar
|
||||
|
||||
-- | Pretty-print an Error with reference to the source where it occurred.
|
||||
printError :: Show grammar => Source.Source -> Error grammar -> IO ()
|
||||
printError source error@Error{..}
|
||||
= do
|
||||
withSGRCode [SetConsoleIntensity BoldIntensity] . putStrErr . (showPos Nothing errorPos) . showString ": " $ ""
|
||||
withSGRCode [SetColor Foreground Vivid Red] . putStrErr . (showString "error") . showString ": " . showExpectation error . showChar '\n' . showString (toS context) . (if isSuffixOf "\n" context then identity else showChar '\n') . showString (replicate (succ (Info.posColumn errorPos + lineNumberDigits)) ' ') $ ""
|
||||
withSGRCode [SetColor Foreground Vivid Green] . putStrErr . (showChar '^') . showChar '\n' . showString (prettyCallStack callStack) $ ""
|
||||
|
||||
printError source error@Error{..} = do
|
||||
withSGRCode [SetConsoleIntensity BoldIntensity] . putStrErr . showPos Nothing errorPos . showString ": " $ ""
|
||||
withSGRCode [SetColor Foreground Vivid Red] . putStrErr . showString "error" . showString ": " . showExpectation error . showChar '\n' . showString (toS context) . (if isSuffixOf "\n" context then identity else showChar '\n') . showString (replicate (succ (Info.posColumn errorPos + lineNumberDigits)) ' ') $ ""
|
||||
withSGRCode [SetColor Foreground Vivid Green] . putStrErr . showChar '^' . showChar '\n' . showString (prettyCallStack callStack) $ ""
|
||||
where context = maybe "\n" (Source.sourceBytes . sconcat) (nonEmpty [ Source.fromBytes (toS (showLineNumber i)) <> Source.fromBytes ": " <> l | (i, l) <- zip [1..] (Source.sourceLines source), inRange (Info.posLine errorPos - 2, Info.posLine errorPos) i ])
|
||||
showLineNumber n = let s = show n in replicate (lineNumberDigits - length s) ' ' <> s
|
||||
lineNumberDigits = succ (floor (logBase 10 (fromIntegral (Info.posLine errorPos) :: Double)))
|
||||
@ -248,7 +247,9 @@ runAssignment toRecord = iterFreer run . fmap ((pure .) . (,))
|
||||
Result _ (Just (a, state')) -> yield a (advanceState (rtail . toRecord) state' { stateNodes = stateNodes })
|
||||
Result err Nothing -> Result err Nothing
|
||||
(Choose choices, node : _) | Just symbol :. _ <- toRecord (F.project node), Just a <- IntMap.lookup (fromEnum symbol) choices -> yield a state
|
||||
-- Nullability: some rules, e.g. 'pure a' and 'many a', should match at the end of input. Either side of an alternation may be nullable, ergo Alt can match at the end of input.
|
||||
(Many _, []) -> yield [] state
|
||||
(Many rule, _) -> uncurry yield (runMany rule state)
|
||||
-- Nullability: some rules, e.g. @pure a@ and @many a@, should match at the end of input. Either side of an alternation may be nullable, ergo Alt can match at the end of input.
|
||||
(Alt a b, _) -> yield a state <|> yield b state
|
||||
(Throw e, _) -> Result (Just e) Nothing
|
||||
(Catch during handler, _) -> case yield during state of
|
||||
@ -263,6 +264,10 @@ runAssignment toRecord = iterFreer run . fmap ((pure .) . (,))
|
||||
Choose choices -> choiceSymbols choices
|
||||
_ -> []
|
||||
choiceSymbols choices = (toEnum :: Int -> grammar) <$> IntMap.keys choices
|
||||
runMany :: Assignment ast grammar v -> AssignmentState ast -> ([v], AssignmentState ast)
|
||||
runMany rule state = case runAssignment toRecord rule state of
|
||||
Result _ (Just (a, state')) -> first (a :) (runMany rule state')
|
||||
_ -> ([], state)
|
||||
|
||||
dropAnonymous :: (Symbol grammar, Recursive ast) => (forall x. Base ast x -> Maybe grammar) -> AssignmentState ast -> AssignmentState ast
|
||||
dropAnonymous toSymbol state = state { stateNodes = dropWhile ((`notElem` [Just Regular, Nothing]) . fmap symbolType . toSymbol . F.project) (stateNodes state) }
|
||||
@ -284,7 +289,7 @@ data AssignmentState ast = AssignmentState
|
||||
deriving (Eq, Show)
|
||||
|
||||
makeState :: Source.Source -> [ast] -> AssignmentState ast
|
||||
makeState source nodes = AssignmentState 0 (Info.Pos 1 1) source nodes
|
||||
makeState = AssignmentState 0 (Info.Pos 1 1)
|
||||
|
||||
|
||||
-- Instances
|
||||
@ -293,11 +298,16 @@ instance Enum grammar => Alternative (Assignment ast grammar) where
|
||||
empty :: HasCallStack => Assignment ast grammar a
|
||||
empty = Empty `Then` return
|
||||
(<|>) :: HasCallStack => Assignment ast grammar a -> Assignment ast grammar a -> Assignment ast grammar a
|
||||
a <|> b = case (a, b) of
|
||||
(_, Empty `Then` _) -> a
|
||||
(Empty `Then` _, _) -> b
|
||||
(Choose choices1 `Then` continue1, Choose choices2 `Then` continue2) -> Choose (IntMap.union (fmap continue1 choices1) (fmap continue2 choices2)) `Then` identity
|
||||
_ -> wrap $ Alt a b
|
||||
Return a <|> _ = Return a
|
||||
a <|> b | Just c <- (liftA2 (<>) `on` choices) a b = Choose c `Then` identity
|
||||
| otherwise = wrap $ Alt a b
|
||||
where choices :: Assignment ast grammar a -> Maybe (IntMap (Assignment ast grammar a))
|
||||
choices (Choose choices `Then` continue) = Just (continue <$> choices)
|
||||
choices (Empty `Then` _) = Just mempty
|
||||
choices (Many rule `Then` continue) = fmap (const (Many rule `Then` continue)) <$> choices rule
|
||||
choices _ = Nothing
|
||||
many :: HasCallStack => Assignment ast grammar a -> Assignment ast grammar [a]
|
||||
many a = Many a `Then` return
|
||||
|
||||
instance Show grammar => Show1 (AssignmentF ast grammar) where
|
||||
liftShowsPrec sp sl d a = case a of
|
||||
@ -306,6 +316,7 @@ instance Show grammar => Show1 (AssignmentF ast grammar) where
|
||||
Source -> showString "Source" . showChar ' ' . sp d ""
|
||||
Children a -> showsUnaryWith (liftShowsPrec sp sl) "Children" d a
|
||||
Choose choices -> showsUnaryWith (liftShowsPrec (liftShowsPrec sp sl) (liftShowList sp sl)) "Choose" d (IntMap.toList choices)
|
||||
Many a -> showsUnaryWith (liftShowsPrec (\ d a -> sp d [a]) (sl . pure)) "Many" d a
|
||||
Alt a b -> showsBinaryWith sp sp "Alt" d a b
|
||||
Empty -> showString "Empty"
|
||||
Throw e -> showsUnaryWith showsPrec "Throw" d e
|
||||
|
@ -44,9 +44,9 @@ cmarkParser source = toTerm (totalRange source) (totalSpan source) $ commonmarkT
|
||||
span = maybe withinSpan toSpan position
|
||||
in cofree $ (t :. range :. span :. Nil) :< (toTerm range span <$> children)
|
||||
|
||||
toSpan PosInfo{..} = Span (Pos startLine startColumn) (Pos endLine (succ endColumn))
|
||||
toSpan PosInfo{..} = Span (Pos startLine startColumn) (Pos (max startLine endLine) (succ (if endLine <= startLine then max startColumn endColumn else endColumn)))
|
||||
|
||||
lineRanges = sourceLineRanges source
|
||||
lineRanges = sourceLineRangesByLineNumber source
|
||||
|
||||
toGrammar :: NodeType -> Grammar
|
||||
toGrammar DOCUMENT{} = Document
|
||||
|
@ -62,7 +62,7 @@ paragraph :: Assignment
|
||||
paragraph = makeTerm <$> symbol Paragraph <*> children (Markup.Paragraph <$> many inlineElement)
|
||||
|
||||
list :: Assignment
|
||||
list = (cofree .) . (:<) <$> symbol List <*> (project (\ (((CMark.LIST CMark.ListAttributes{..}) :. _) :< _) -> case listType of
|
||||
list = (cofree .) . (:<) <$> symbol List <*> (project (\ ((CMark.LIST CMark.ListAttributes{..} :. _) :< _) -> case listType of
|
||||
CMark.BULLET_LIST -> inj . Markup.UnorderedList
|
||||
CMark.ORDERED_LIST -> inj . Markup.OrderedList) <*> children (many item))
|
||||
|
||||
@ -71,7 +71,7 @@ item = makeTerm <$> symbol Item <*> children (many blockElement)
|
||||
|
||||
section :: Assignment
|
||||
section = makeTerm <$> symbol Heading <*> (heading >>= \ headingTerm -> Markup.Section headingTerm <$> while (((<) `on` level) headingTerm) blockElement)
|
||||
where heading = makeTerm <$> symbol Heading <*> (Markup.Heading <$> project (\ ((CMark.HEADING level :. _) :< _) -> level) <*> children (many inlineElement))
|
||||
where heading = makeTerm <$> symbol Heading <*> (project (\ ((CMark.HEADING level :. _) :< _) -> Markup.Heading level) <*> children (many inlineElement))
|
||||
level term = case term of
|
||||
_ | Just section <- prj (unwrap term) -> level (Markup.sectionHeading section)
|
||||
_ | Just heading <- prj (unwrap term) -> Markup.headingLevel heading
|
||||
@ -81,10 +81,10 @@ blockQuote :: Assignment
|
||||
blockQuote = makeTerm <$> symbol BlockQuote <*> children (Markup.BlockQuote <$> many blockElement)
|
||||
|
||||
codeBlock :: Assignment
|
||||
codeBlock = makeTerm <$> symbol CodeBlock <*> (Markup.Code <$> project (\ (((CMark.CODE_BLOCK language _) :. _) :< _) -> nullText language) <*> source)
|
||||
codeBlock = makeTerm <$> symbol CodeBlock <*> (project (\ ((CMark.CODE_BLOCK language _ :. _) :< _) -> Markup.Code (nullText language)) <*> source)
|
||||
|
||||
thematicBreak :: Assignment
|
||||
thematicBreak = makeTerm <$> symbol ThematicBreak <*> (Markup.ThematicBreak <$ source)
|
||||
thematicBreak = makeTerm <$> symbol ThematicBreak <*> pure Markup.ThematicBreak <* source
|
||||
|
||||
htmlBlock :: Assignment
|
||||
htmlBlock = makeTerm <$> symbol HTMLBlock <*> (Markup.HTMLBlock <$> source)
|
||||
@ -105,19 +105,19 @@ text :: Assignment
|
||||
text = makeTerm <$> symbol Text <*> (Markup.Text <$> source)
|
||||
|
||||
link :: Assignment
|
||||
link = makeTerm <$> symbol Link <*> (uncurry Markup.Link <$> project (\ (((CMark.LINK url title) :. _) :< _) -> (toS url, nullText title))) <* source
|
||||
link = makeTerm <$> symbol Link <*> project (\ ((CMark.LINK url title :. _) :< _) -> Markup.Link (toS url) (nullText title)) <* source
|
||||
|
||||
image :: Assignment
|
||||
image = makeTerm <$> symbol Image <*> (uncurry Markup.Image <$> project (\ (((CMark.IMAGE url title) :. _) :< _) -> (toS url, nullText title))) <* source
|
||||
image = makeTerm <$> symbol Image <*> project (\ ((CMark.IMAGE url title :. _) :< _) -> Markup.Image (toS url) (nullText title)) <* source
|
||||
|
||||
code :: Assignment
|
||||
code = makeTerm <$> symbol Code <*> (Markup.Code Nothing <$> source)
|
||||
|
||||
lineBreak :: Assignment
|
||||
lineBreak = makeTerm <$> symbol LineBreak <*> (Markup.LineBreak <$ source)
|
||||
lineBreak = makeTerm <$> symbol LineBreak <*> pure Markup.LineBreak <* source
|
||||
|
||||
softBreak :: Assignment
|
||||
softBreak = makeTerm <$> symbol SoftBreak <*> (Markup.LineBreak <$ source)
|
||||
softBreak = makeTerm <$> symbol SoftBreak <*> pure Markup.LineBreak <* source
|
||||
|
||||
|
||||
-- Implementation details
|
||||
|
@ -16,7 +16,6 @@ import Data.Record
|
||||
import Data.Source as Source
|
||||
import qualified Data.Syntax as Syntax
|
||||
import Data.Syntax.Assignment
|
||||
import qualified Data.Text as T
|
||||
import Data.Union
|
||||
import Info hiding (Empty, Go)
|
||||
import Language
|
||||
@ -93,7 +92,7 @@ runParser parser = case parser of
|
||||
Nothing -> pure (errorTerm source err)
|
||||
TreeSitterParser language tslanguage -> treeSitterParser language tslanguage
|
||||
MarkdownParser -> pure . cmarkParser
|
||||
LineByLineParser -> lineByLineParser
|
||||
LineByLineParser -> pure . lineByLineParser
|
||||
|
||||
errorTerm :: Syntax.Error (Error grammar) :< fs => Source -> Maybe (Error grammar) -> Term (Union fs) (Record Location)
|
||||
errorTerm source err = cofree ((totalRange source :. totalSpan source :. Nil) :< inj (Syntax.Error (fromMaybe (Error (Pos 0 0) (UnexpectedEndOfInput [])) err)))
|
||||
@ -104,13 +103,6 @@ termErrors = cata $ \ (_ :< s) -> case s of
|
||||
_ -> fold s
|
||||
|
||||
-- | A fallback parser that treats a file simply as rows of strings.
|
||||
lineByLineParser :: Source -> IO (SyntaxTerm Text DefaultFields)
|
||||
lineByLineParser source = pure . cofree . root $ case foldl' annotateLeaves ([], 0) lines of
|
||||
(leaves, _) -> cofree <$> leaves
|
||||
where
|
||||
lines = sourceLines source
|
||||
root children = (sourceRange :. Program :. rangeToSpan source sourceRange :. Nil) :< Indexed children
|
||||
sourceRange = totalRange source
|
||||
leaf byteIndex line = (Range byteIndex (byteIndex + T.length line) :. Program :. rangeToSpan source (Range byteIndex (byteIndex + T.length line)) :. Nil) :< Leaf line
|
||||
annotateLeaves (accum, byteIndex) line =
|
||||
(accum <> [ leaf byteIndex (toText line) ] , byteIndex + sourceLength line)
|
||||
lineByLineParser :: Source -> SyntaxTerm Text DefaultFields
|
||||
lineByLineParser source = cofree $ (totalRange source :. Program :. totalSpan source :. Nil) :< Indexed (zipWith toLine [1..] (sourceLineRanges source))
|
||||
where toLine line range = cofree $ (range :. Program :. Span (Pos line 1) (Pos line (end range)) :. Nil) :< Leaf (toText (slice range source))
|
||||
|
Loading…
Reference in New Issue
Block a user