semantic/src/TreeSitter.hs

module TreeSitter where

import Prologue hiding (Constructor)
import Data.String
import Category
import Info
import Language
import Parser
import Range
import Source
import Foreign
import Foreign.C.String
import Text.Parser.TreeSitter hiding (Language(..))
import qualified Control.Exception as Exception
import qualified Text.Parser.TreeSitter as TS

-- | Returns a TreeSitter parser for the given language and TreeSitter grammar.
--
-- The document is acquired with 'ts_document_make' and released with
-- 'ts_document_free' through 'Exception.bracket', so it is freed even when
-- configuring the document, parsing, or building the term throws an exception.
treeSitterParser :: Language -> Ptr TS.Language -> Parser
treeSitterParser language grammar contents =
  Exception.bracket ts_document_make ts_document_free $ \document -> do
    ts_document_set_language document grammar
    -- The C string only lives for the duration of 'withCString', so both the
    -- parse and the conversion to a term happen inside its callback.
    withCString (toString contents) $ \source -> do
      ts_document_set_input_string document source
      ts_document_parse document
      documentToTerm language document contents

-- | Pick the category for a node name, taking the language into account.
-- Language-specific names are matched first; anything else is delegated to
-- 'defaultCategoryForNodeName'.
categoriesForLanguage :: Language -> String -> Category
categoriesForLanguage JavaScript "object" = DictionaryLiteral
-- "rel_op" is a relational operator, e.g. >, <, <=, >=, ==, !=
categoriesForLanguage JavaScript "rel_op" = BinaryOperator
categoriesForLanguage Ruby "hash" = DictionaryLiteral
categoriesForLanguage _ nodeName = defaultCategoryForNodeName nodeName

-- | Map a TreeSitter node name to its language-independent category.
-- Names without a dedicated category are wrapped in 'Other'.
defaultCategoryForNodeName :: String -> Category
defaultCategoryForNodeName "program" = Program
defaultCategoryForNodeName "ERROR" = Error
defaultCategoryForNodeName "function_call" = FunctionCall
defaultCategoryForNodeName "pair" = Pair
defaultCategoryForNodeName "string" = StringLiteral
defaultCategoryForNodeName "integer" = IntegerLiteral
defaultCategoryForNodeName "symbol" = SymbolLiteral
defaultCategoryForNodeName "array" = ArrayLiteral
defaultCategoryForNodeName unrecognised = Other unrecognised

-- | Return a parser for a tree sitter language & document.
--
-- Starting from the document's root node, every named child is visited
-- recursively and turned into a term with 'termConstructor'. The 'Info' attached
-- to each term holds the node's character range, the category that
-- 'categoriesForLanguage' assigns to the node's name, and a size of one plus the
-- sum of the children's sizes (passed for both trailing 'Info' fields).
documentToTerm :: Language -> Ptr Document -> Parser
documentToTerm language document contents = alloca $ \ root -> do
  -- `root` is a temporary buffer that only lives for the duration of this `alloca` callback.
  ts_document_root_node_p document root
  toTerm root
  -- `toTerm` builds the term for the node behind the given pointer; `getChild` fetches
  -- one named child into a fresh temporary buffer and recurses into it.
  where toTerm node = do
          -- The first `name` is a C string; it is shadowed by its Haskell 'String' copy.
          name <- ts_node_p_name node document
          name <- peekCString name
          count <- ts_node_p_named_child_count node
          -- One `alloca` per child index in [0, count): each child is converted while its buffer is still allocated.
          children <- traverse (alloca . getChild node) $ take (fromIntegral count) [0..]
          -- Note: The strict application here is semantically important. Without it, we may not evaluate the range until after we’ve exited the scope that `node` was allocated within, meaning `alloca` will free it & other stack data may overwrite it.
          range <- pure $! Range { start = fromIntegral $ ts_node_p_start_char node, end = fromIntegral $ ts_node_p_end_char node }

          -- This node counts as 1, plus the sizes already recorded on its children.
          let size' = 1 + sum (size . extract <$> children)
          let info = Info range (categoriesForLanguage language name) size' size'
          -- Force the constructed term to weak head normal form before returning it.
          pure $! termConstructor contents info children
        getChild node n out = do
          -- The call's result is discarded; presumably it writes the n-th named child of `node` into `out` — TODO confirm against the binding.
          _ <- ts_node_p_named_child node n out
          toTerm out