1
1
mirror of https://github.com/github/semantic.git synced 2025-01-02 04:10:29 +03:00
semantic/src/TreeSitter.hs

184 lines
7.8 KiB
Haskell
Raw Normal View History

2017-06-07 22:19:32 +03:00
{-# LANGUAGE DataKinds, ScopedTypeVariables, TypeOperators #-}
2017-01-20 21:04:36 +03:00
module TreeSitter
( treeSitterParser
, parseToAST
2017-01-20 21:04:36 +03:00
, defaultTermAssignment
) where
import Prologue hiding (Constructor)
2016-02-09 22:34:23 +03:00
import Category
import Data.Blob
import Data.ByteString.Unsafe (unsafeUseAsCStringLen)
import Data.Functor.Foldable hiding (Nil)
2017-06-24 16:46:28 +03:00
import Data.Range
2016-08-16 00:25:56 +03:00
import Data.Record
import Data.Source
import Data.Span
import qualified Data.Syntax.Assignment as A
2016-02-11 01:30:14 +03:00
import Language
import qualified Language.C as C
import qualified Language.Go as Go
import qualified Language.TypeScript as TS
2016-10-19 18:22:10 +03:00
import qualified Language.Ruby as Ruby
import qualified Syntax
import Foreign
2017-02-10 19:02:38 +03:00
import Foreign.C.String (peekCString)
2017-03-27 22:48:24 +03:00
import Foreign.Marshal.Array (allocaArray)
2017-01-20 21:03:52 +03:00
import qualified Syntax as S
import Term
import Text.Parser.TreeSitter hiding (Language(..))
import qualified Text.Parser.TreeSitter as TS
import qualified Text.Parser.TreeSitter.C as TS
import qualified Text.Parser.TreeSitter.Go as TS
import qualified Text.Parser.TreeSitter.Ruby as TS
import qualified Text.Parser.TreeSitter.TypeScript as TS
2016-08-23 00:56:48 +03:00
import Info
2016-02-11 02:07:27 +03:00
-- | Returns a TreeSitter parser for the given language and TreeSitter grammar.
treeSitterParser :: Ptr TS.Language -> Blob -> IO (Term (Syntax.Syntax Text) (Record DefaultFields))
treeSitterParser language blob = bracket ts_document_new ts_document_free $ \ document -> do
ts_document_set_language document language
unsafeUseAsCStringLen (sourceBytes (blobSource blob)) $ \ (sourceBytes, len) -> do
2017-06-24 17:54:03 +03:00
ts_document_set_input_string_with_length document sourceBytes len
ts_document_parse_halt_on_error document
term <- documentToTerm language document blob
pure term
-- | Parse 'Source' with the given 'TS.Language' and return its AST.
2017-07-12 22:40:54 +03:00
parseToAST :: (Bounded grammar, Enum grammar) => Ptr TS.Language -> Source -> IO (A.AST grammar)
parseToAST language source = bracket ts_document_new ts_document_free $ \ document -> do
ts_document_set_language document language
root <- unsafeUseAsCStringLen (sourceBytes source) $ \ (source, len) -> do
ts_document_set_input_string_with_length document source len
ts_document_parse_halt_on_error document
alloca (\ rootPtr -> do
ts_document_root_node_p document rootPtr
peek rootPtr)
anaM toAST root
toAST :: forall grammar . (Bounded grammar, Enum grammar) => Node -> IO (Base (A.AST grammar) Node)
2017-05-15 22:34:53 +03:00
toAST node@Node{..} = do
let count = fromIntegral nodeChildCount
children <- allocaArray count $ \ childNodesPtr -> do
_ <- with nodeTSNode (\ nodePtr -> ts_node_copy_child_nodes nullPtr nodePtr childNodesPtr (fromIntegral count))
peekArray count childNodesPtr
pure $! A.Node (toEnum (min (fromIntegral nodeSymbol) (fromEnum (maxBound :: grammar)))) (nodeRange node) (nodeSpan node) :< children
2017-05-15 22:34:53 +03:00
2017-05-15 22:34:45 +03:00
anaM :: (Corecursive t, Monad m, Traversable (Base t)) => (a -> m (Base t a)) -> a -> m t
anaM g = a where a = pure . embed <=< traverse a <=< g
2016-06-03 06:06:09 +03:00
-- | Return a parser for a tree sitter language & document.
documentToTerm :: Ptr TS.Language -> Ptr Document -> Blob -> IO (Term (Syntax.Syntax Text) (Record DefaultFields))
documentToTerm language document Blob{..} = do
2017-03-27 22:54:34 +03:00
root <- alloca (\ rootPtr -> do
ts_document_root_node_p document rootPtr
peek rootPtr)
toTerm root
where toTerm :: Node -> IO (Term (Syntax.Syntax Text) (Record DefaultFields))
toTerm node = do
let source = slice (nodeRange node) blobSource
2017-03-27 22:48:24 +03:00
name <- peekCString (nodeType node)
children <- getChildren (fromIntegral (nodeNamedChildCount node)) copyNamed
let allChildren = getChildren (fromIntegral (nodeChildCount node)) copyAll
2017-03-27 23:04:43 +03:00
assignTerm language source (range :. categoryForLanguageProductionName language (toS name) :. nodeSpan node :. Nil) children allChildren
where getChildren count copy = do
nodes <- allocaArray count $ \ childNodesPtr -> do
_ <- with (nodeTSNode node) (\ nodePtr -> copy nodePtr childNodesPtr (fromIntegral count))
peekArray count childNodesPtr
children <- traverse toTerm nodes
return $! filter isNonEmpty children
range = nodeRange node
copyNamed = ts_node_copy_named_child_nodes document
copyAll = ts_node_copy_child_nodes document
2017-02-13 19:41:10 +03:00
isNonEmpty :: HasField fields Category => SyntaxTerm Text fields -> Bool
isNonEmpty = (/= Empty) . category . extract
2017-03-27 22:48:24 +03:00
nodeRange :: Node -> Range
nodeRange Node{..} = Range (fromIntegral nodeStartByte) (fromIntegral nodeEndByte)
nodeSpan :: Node -> Span
nodeSpan Node{..} = nodeStartPoint `seq` nodeEndPoint `seq` Span (pointPos nodeStartPoint) (pointPos nodeEndPoint)
where pointPos TSPoint{..} = pointRow `seq` pointColumn `seq` Pos (1 + fromIntegral pointRow) (1 + fromIntegral pointColumn)
assignTerm :: Ptr TS.Language -> Source -> Record DefaultFields -> [ SyntaxTerm Text DefaultFields ] -> IO [ SyntaxTerm Text DefaultFields ] -> IO (SyntaxTerm Text DefaultFields)
2017-01-20 23:03:40 +03:00
assignTerm language source annotation children allChildren =
cofree . (annotation :<) <$> case assignTermByLanguage source (category annotation) children of
2017-01-20 22:54:03 +03:00
Just a -> pure a
_ -> defaultTermAssignment source (category annotation) children allChildren
where assignTermByLanguage :: Source -> Category -> [ SyntaxTerm Text DefaultFields ] -> Maybe (S.Syntax Text (SyntaxTerm Text DefaultFields))
assignTermByLanguage = case languageForTSLanguage language of
Just C -> C.termAssignment
Just Language.Go -> Go.termAssignment
Just Ruby -> Ruby.termAssignment
Just TypeScript -> TS.termAssignment
2017-01-20 23:03:40 +03:00
_ -> \ _ _ _ -> Nothing
defaultTermAssignment :: Source -> Category -> [ SyntaxTerm Text DefaultFields ] -> IO [ SyntaxTerm Text DefaultFields ] -> IO (S.Syntax Text (SyntaxTerm Text DefaultFields))
defaultTermAssignment source category children allChildren
| category `elem` operatorCategories = S.Operator <$> allChildren
| otherwise = pure $! case (category, children) of
2017-01-25 11:09:53 +03:00
(ParseError, children) -> S.ParseError children
(Comment, _) -> S.Comment (toText source)
2017-01-21 00:49:45 +03:00
(Pair, [key, value]) -> S.Pair key value
2017-01-21 00:49:45 +03:00
-- Control flow statements
(If, condition : body) -> S.If condition body
(Switch, _) -> uncurry S.Switch (Prologue.break ((== Case) . Info.category . extract) children)
2017-01-23 23:12:38 +03:00
(Case, expr : body) -> S.Case expr body
(While, expr : rest) -> S.While expr rest
2017-01-21 00:49:45 +03:00
-- Statements
(Return, _) -> S.Return children
(Yield, _) -> S.Yield children
(Throw, [expr]) -> S.Throw expr
(Break, [label]) -> S.Break (Just label)
(Break, []) -> S.Break Nothing
(Continue, [label]) -> S.Continue (Just label)
(Continue, []) -> S.Continue Nothing
(_, []) -> S.Leaf (toText source)
(_, children) -> S.Indexed children
where operatorCategories =
[ Operator
, Binary
, Unary
, RangeExpression
, ScopeOperator
, BooleanOperator
, MathOperator
, RelationalOperator
, BitwiseOperator
]
2017-01-20 21:03:52 +03:00
categoryForLanguageProductionName :: Ptr TS.Language -> Text -> Category
2017-03-28 22:32:45 +03:00
categoryForLanguageProductionName = withDefaults . byLanguage
where
withDefaults productionMap name = case name of
"ERROR" -> ParseError
s -> productionMap s
byLanguage language = case languageForTSLanguage language of
Just C -> C.categoryForCProductionName
Just Ruby -> Ruby.categoryForRubyName
Just Language.Go -> Go.categoryForGoName
Just TypeScript -> TS.categoryForTypeScriptName
_ -> Other
languageForTSLanguage :: Ptr TS.Language -> Maybe Language
languageForTSLanguage language
= if language == TS.tree_sitter_c then Just C
else if language == TS.tree_sitter_go then Just Language.Go
else if language == TS.tree_sitter_ruby then Just Ruby
else if language == TS.tree_sitter_typescript then Just TypeScript
else Nothing