1
1
mirror of https://github.com/github/semantic.git synced 2024-12-22 22:31:36 +03:00

Add Parse module

This commit is contained in:
Rick Winfrey 2016-11-02 20:12:04 -05:00
parent 3bd78d19b9
commit 2abd18e227

88
src/Parse.hs Normal file
View File

@ -0,0 +1,88 @@
{-# LANGUAGE DataKinds, RankNTypes, TypeOperators #-}
module Parse where
import Arguments
import Category
import Data.Aeson.Encode.Pretty
import qualified Data.ByteString.Char8 as B1
import qualified Data.Text.ICU.Convert as Convert
import qualified Data.Text.ICU.Detect as Detect
import Data.Record
import qualified Data.Text as T
import Info
import Language
import Language.Markdown
import Parser
import Prologue
import Range
import Source
import SourceSpan
import Syntax
import System.FilePath
import Term
import Term.Instances
import TreeSitter
import Text.Parser.TreeSitter.Language
run :: Arguments -> IO ()
run args@Arguments{..} = do
sources <- sequence $ readAndTranscodeFile <$> filePaths
let sourceBlobs = Source.SourceBlob <$> sources <*> pure mempty <*> filePaths <*> pure (Just Source.defaultPlainBlob)
let parsers = parserForFilepath <$> filePaths
let parsersAndBlobs = zip parsers sourceBlobs
terms <- traverse (\(parser, sourceBlob) -> parser sourceBlob) parsersAndBlobs
putStrLn $ encodePretty terms
pure ()
-- | Return the parser that should be used for a given path.
parserForFilepath :: FilePath -> Parser (Syntax Text) (Record '[Cost, Range, Category, SourceSpan])
parserForFilepath path blob = decorateTerm termCostDecorator <$> parserForType (toS (takeExtension path)) blob
-- | Return a parser based on the file extension (including the ".").
parserForType :: Text -> Parser (Syntax Text) (Record '[Range, Category, SourceSpan])
parserForType mediaType = case languageForType mediaType of
Just C -> treeSitterParser C ts_language_c
Just JavaScript -> treeSitterParser JavaScript ts_language_javascript
Just Markdown -> cmarkParser
Just Ruby -> treeSitterParser Ruby ts_language_ruby
_ -> lineByLineParser
-- | Decorate a 'Term' using a function to compute the annotation values at every node.
decorateTerm :: Functor f => TermDecorator f fields field -> Term f (Record fields) -> Term f (Record (field ': fields))
decorateTerm decorator = cata $ \ c -> cofree ((decorator (extract <$> c) .: headF c) :< tailF c)
-- | A function computing a value to decorate terms with. This can be used to cache synthesized attributes on terms.
type TermDecorator f fields field = TermF f (Record fields) (Record (field ': fields)) -> field
-- | Term decorator computing the cost of an unpacked term.
termCostDecorator :: (Foldable f, Functor f) => TermDecorator f a Cost
termCostDecorator c = 1 + sum (cost <$> tailF c)
-- | A fallback parser that treats a file simply as rows of strings.
lineByLineParser :: Parser (Syntax Text) (Record '[Range, Category, SourceSpan])
lineByLineParser SourceBlob{..} = pure . cofree . root $ case foldl' annotateLeaves ([], 0) lines of
(leaves, _) -> cofree <$> leaves
where
lines = actualLines source
root children = (Range 0 (length source) .: Program .: rangeToSourceSpan source (Range 0 (length source)) .: RNil) :< Indexed children
leaf charIndex line = (Range charIndex (charIndex + T.length line) .: Program .: rangeToSourceSpan source (Range charIndex (charIndex + T.length line)) .: RNil) :< Leaf line
annotateLeaves (accum, charIndex) line =
(accum <> [ leaf charIndex (toText line) ] , charIndex + length line)
toText = T.pack . Source.toString
-- | Read the file and convert it to Unicode.
readAndTranscodeFile :: FilePath -> IO (Source Char)
readAndTranscodeFile path = do
text <- B1.readFile path
transcode text
-- | Transcode a file to a unicode source.
transcode :: B1.ByteString -> IO (Source Char)
transcode text = fromText <$> do
match <- Detect.detectCharset text
converter <- Convert.open match Nothing
pure $ Convert.toUnicode converter text