diff --git a/semantic-diff.cabal b/semantic-diff.cabal index 1a350f90d..75b658116 100644 --- a/semantic-diff.cabal +++ b/semantic-diff.cabal @@ -18,6 +18,8 @@ library , Arguments , Category , Command + , Command.Files + , Command.Git , Command.Parse , Data.Align.Generic , Data.Functor.Both @@ -50,6 +52,7 @@ library , Language.Ruby , Language.Ruby.Syntax , Parser + , Parser.Language , Patch , Paths_semantic_diff , Prologue diff --git a/src/Command.hs b/src/Command.hs index 638f41fba..368d73910 100644 --- a/src/Command.hs +++ b/src/Command.hs @@ -23,6 +23,7 @@ module Command , runCommand ) where +import Command.Files import Command.Parse import qualified Control.Concurrent.Async.Pool as Async import Control.Exception (catch) @@ -51,6 +52,7 @@ import Info import Interpreter import Language import Patch +import Parser.Language import Prologue hiding (concurrently, Concurrently, readFile) import qualified Renderer as R import qualified Renderer.SExpression as R diff --git a/src/Command/Files.hs b/src/Command/Files.hs new file mode 100644 index 000000000..fc9769b8c --- /dev/null +++ b/src/Command/Files.hs @@ -0,0 +1,46 @@ +module Command.Files +( sourceBlobsFromPaths +, transcode +) where + +import Prologue +import Source +import qualified Data.ByteString as B +import System.IO +import Control.Exception (catch, IOException) +import qualified Data.Text.ICU.Convert as Convert +import qualified Data.Text.ICU.Detect as Detect + +-- | For the given absolute file paths, retrieves their source blobs. +sourceBlobsFromPaths :: [FilePath] -> IO [SourceBlob] +sourceBlobsFromPaths filePaths = + for filePaths (\filePath -> do + source <- readAndTranscodeFile filePath + pure $ Source.SourceBlob source mempty filePath (Just Source.defaultPlainBlob)) + +-- | Read the file and convert it to Unicode. +readAndTranscodeFile :: FilePath -> IO Source +readAndTranscodeFile path = do + size <- fileSize path + text <- case size of + 0 -> pure B.empty + _ -> B.readFile path + transcode text + +-- Based on https://github.com/haskell/bytestring/pull/79/files +-- Neccessary to be able to handle /dev/null as a file. +fileSize :: FilePath -> IO Integer +fileSize f = withBinaryFile f ReadMode $ \h -> do + -- hFileSize fails if file is not regular file (like /dev/null). Catch + -- exception and return 0 in that case. + filesz <- catch (hFileSize h) useZeroIfNotRegularFile + pure $ fromIntegral filesz `max` 0 + where useZeroIfNotRegularFile :: IOException -> IO Integer + useZeroIfNotRegularFile _ = pure 0 + +-- | Transcode a file to a unicode source. +transcode :: B.ByteString -> IO Source +transcode text = fromText <$> do + match <- Detect.detectCharset text + converter <- Convert.open match Nothing + pure $ Convert.toUnicode converter text diff --git a/src/Command/Git.hs b/src/Command/Git.hs new file mode 100644 index 000000000..45b089d21 --- /dev/null +++ b/src/Command/Git.hs @@ -0,0 +1,41 @@ +module Command.Git (sourceBlobsFromSha) where + +import Prologue +import Source +import Command.Files +import Data.String +import Git.Blob +import Git.Libgit2 +import Git.Repository +import Git.Types +import qualified Git + +-- | For the given sha, git repo path, and file paths, retrieves the source blobs. +sourceBlobsFromSha :: String -> String -> [FilePath] -> IO [SourceBlob] +sourceBlobsFromSha commitSha gitDir filePaths = do + maybeBlobs <- withRepository lgFactory gitDir $ do + repo <- getRepository + object <- parseObjOid (toS commitSha) + commit <- lookupCommit object + tree <- lookupTree (commitTree commit) + lift $ runReaderT (traverse (toSourceBlob tree) filePaths) repo + + pure $ catMaybes maybeBlobs + + where + toSourceBlob :: Git.Tree LgRepo -> FilePath -> ReaderT LgRepo IO (Maybe SourceBlob) + toSourceBlob tree filePath = do + entry <- treeEntry tree (toS filePath) + case entry of + Just (BlobEntry entryOid entryKind) -> do + blob <- lookupBlob entryOid + bytestring <- blobToByteString blob + let oid = renderObjOid $ blobOid blob + s <- liftIO $ transcode bytestring + pure . Just $ SourceBlob s (toS oid) filePath (Just (toSourceKind entryKind)) + _ -> pure Nothing + where + toSourceKind :: Git.BlobKind -> SourceKind + toSourceKind (Git.PlainBlob mode) = Source.PlainBlob mode + toSourceKind (Git.ExecutableBlob mode) = Source.ExecutableBlob mode + toSourceKind (Git.SymlinkBlob mode) = Source.SymlinkBlob mode diff --git a/src/Command/Parse.hs b/src/Command/Parse.hs index 5a1d6cc5b..da25ec8e8 100644 --- a/src/Command/Parse.hs +++ b/src/Command/Parse.hs @@ -1,18 +1,8 @@ {-# LANGUAGE DataKinds, GADTs, GeneralizedNewtypeDeriving, ScopedTypeVariables, TypeFamilies, TypeOperators #-} module Command.Parse -( --- Render parser trees - jsonParseTree +( jsonParseTree , jsonIndexParseTree , sExpressionParseTree --- Parsers -, parserForLanguage -, parserForFilePath -- TODO: Remove. Langauge detection should not come from file extension. --- Files and transcoding -, transcode -, sourceBlobsFromPaths --- Git -, sourceBlobsFromSha ) where import Category @@ -20,71 +10,15 @@ import Data.Aeson (ToJSON, toJSON, encode, object, (.=)) import Data.Aeson.Types (Pair) import Data.Functor.Foldable hiding (Nil) import Data.Record -import Data.String -import qualified Data.ByteString as B -import qualified Data.Text as T -import Git.Blob -import Git.Libgit2 -import Git.Repository -import Git.Types -import qualified Git import Info -import Language -import Language.Markdown import Parser +import Parser.Language import Prologue import Source import Syntax -import System.FilePath import Term -import TreeSitter import Renderer.JSON() import Renderer.SExpression -import Text.Parser.TreeSitter.C -import Text.Parser.TreeSitter.Go -import Text.Parser.TreeSitter.JavaScript -import Text.Parser.TreeSitter.Ruby -import Text.Parser.TreeSitter.TypeScript -import System.IO -import Control.Exception (catch, IOException) -import qualified Data.Text.ICU.Convert as Convert -import qualified Data.Text.ICU.Detect as Detect - --- TODO: This doesn't belong in here - --- | Return a parser for a given langauge or the lineByLineParser parser. -parserForLanguage :: Maybe Language -> Parser (Syntax Text) (Record DefaultFields) -parserForLanguage Nothing = lineByLineParser -parserForLanguage (Just language) = case language of - C -> treeSitterParser C tree_sitter_c - JavaScript -> treeSitterParser JavaScript tree_sitter_javascript - TypeScript -> treeSitterParser TypeScript tree_sitter_typescript - Markdown -> cmarkParser - Ruby -> treeSitterParser Ruby tree_sitter_ruby - Language.Go -> treeSitterParser Language.Go tree_sitter_go - --- | Return a parser based on the file extension (including the "."). --- | TODO: Remove. -parserForType :: String -> Parser (Syntax Text) (Record DefaultFields) -parserForType = parserForLanguage . languageForType - --- | Return the parser that should be used for a given path. --- | TODO: Remove. -parserForFilePath :: FilePath -> Parser (Syntax Text) (Record DefaultFields) -parserForFilePath = parserForType . toS . takeExtension - --- | A fallback parser that treats a file simply as rows of strings. -lineByLineParser :: Parser (Syntax Text) (Record DefaultFields) -lineByLineParser SourceBlob{..} = pure . cofree . root $ case foldl' annotateLeaves ([], 0) lines of - (leaves, _) -> cofree <$> leaves - where - lines = actualLines source - root children = (sourceRange :. Program :. rangeToSourceSpan source sourceRange :. Nil) :< Indexed children - sourceRange = Source.totalRange source - leaf charIndex line = (Range charIndex (charIndex + T.length line) :. Program :. rangeToSourceSpan source (Range charIndex (charIndex + T.length line)) :. Nil) :< Leaf line - annotateLeaves (accum, charIndex) line = - (accum <> [ leaf charIndex (Source.toText line) ] , charIndex + Source.length line) - data ParseTreeFile = ParseTreeFile { parseTreeFilePath :: FilePath, node :: Rose ParseNode } deriving (Show) @@ -119,6 +53,15 @@ parseNodeToJSONFields ParseNode{..} = <> [ "sourceText" .= sourceText | isJust sourceText ] <> [ "identifier" .= identifier | isJust identifier ] + +-- | Constructs ParseTreeFile nodes for the provided arguments and encodes them to JSON. +jsonParseTree :: Bool -> [SourceBlob] -> IO ByteString +jsonParseTree debug = fmap (toS . encode) . parseRoot debug ParseTreeFile Rose + +-- | Constructs IndexFile nodes for the provided arguments and encodes them to JSON. +jsonIndexParseTree :: Bool -> [SourceBlob] -> IO ByteString +jsonIndexParseTree debug = fmap (toS . encode) . parseRoot debug IndexFile (\ node siblings -> node : concat siblings) + -- | Parses file contents into an SExpression format for the provided arguments. sExpressionParseTree :: Bool -> [SourceBlob] -> IO ByteString sExpressionParseTree _ blobs = @@ -135,56 +78,11 @@ parseRoot debug construct combine blobs = for blobs (\ sourceBlob@SourceBlob{..} makeNode (head :. range :. category :. sourceSpan :. Nil) syntax = ParseNode (toS category) range head sourceSpan (identifierFor syntax) --- | Constructs IndexFile nodes for the provided arguments and encodes them to JSON. -jsonIndexParseTree :: Bool -> [SourceBlob] -> IO ByteString -jsonIndexParseTree debug = fmap (toS . encode) . parseRoot debug IndexFile (\ node siblings -> node : concat siblings) - --- | Constructs ParseTreeFile nodes for the provided arguments and encodes them to JSON. -jsonParseTree :: Bool -> [SourceBlob] -> IO ByteString -jsonParseTree debug = fmap (toS . encode) . parseRoot debug ParseTreeFile Rose - -- | Determines the term decorator to use when parsing. parseDecorator :: (Functor f, HasField fields Range) => Bool -> (Source -> TermDecorator f fields (Maybe SourceText)) parseDecorator True = termSourceTextDecorator parseDecorator False = const . const Nothing --- | For the given absolute file paths, retrieves their source blobs. -sourceBlobsFromPaths :: [FilePath] -> IO [SourceBlob] -sourceBlobsFromPaths filePaths = - for filePaths (\filePath -> do - source <- readAndTranscodeFile filePath - pure $ Source.SourceBlob source mempty filePath (Just Source.defaultPlainBlob)) - --- | For the given sha, git repo path, and file paths, retrieves the source blobs. -sourceBlobsFromSha :: String -> String -> [FilePath] -> IO [SourceBlob] -sourceBlobsFromSha commitSha gitDir filePaths = do - maybeBlobs <- withRepository lgFactory gitDir $ do - repo <- getRepository - object <- parseObjOid (toS commitSha) - commit <- lookupCommit object - tree <- lookupTree (commitTree commit) - lift $ runReaderT (traverse (toSourceBlob tree) filePaths) repo - - pure $ catMaybes maybeBlobs - - where - toSourceBlob :: Git.Tree LgRepo -> FilePath -> ReaderT LgRepo IO (Maybe SourceBlob) - toSourceBlob tree filePath = do - entry <- treeEntry tree (toS filePath) - case entry of - Just (BlobEntry entryOid entryKind) -> do - blob <- lookupBlob entryOid - bytestring <- blobToByteString blob - let oid = renderObjOid $ blobOid blob - s <- liftIO $ transcode bytestring - pure . Just $ SourceBlob s (toS oid) filePath (Just (toSourceKind entryKind)) - _ -> pure Nothing - where - toSourceKind :: Git.BlobKind -> SourceKind - toSourceKind (Git.PlainBlob mode) = Source.PlainBlob mode - toSourceKind (Git.ExecutableBlob mode) = Source.ExecutableBlob mode - toSourceKind (Git.SymlinkBlob mode) = Source.SymlinkBlob mode - -- | Returns a Just identifier text if the given Syntax term contains an identifier (leaf) syntax. Otherwise returns Nothing. identifierFor :: (HasField fields (Maybe SourceText), HasField fields Category, StringConv leaf Text) => Syntax leaf (Term (Syntax leaf) (Record fields)) -> Maybe Text identifierFor = fmap toS . extractLeafValue . unwrap <=< maybeIdentifier @@ -207,34 +105,6 @@ termSourceTextDecorator source (ann :< _) = Just (SourceText (toText (Source.sli newtype Identifier = Identifier Text deriving (Eq, Show, ToJSON) --- | Read the file and convert it to Unicode. -readAndTranscodeFile :: FilePath -> IO Source -readAndTranscodeFile path = do - size <- fileSize path - text <- case size of - 0 -> pure B.empty - _ -> B.readFile path - transcode text - --- Based on https://github.com/haskell/bytestring/pull/79/files --- Neccessary to be able to handle /dev/null as a file. -fileSize :: FilePath -> IO Integer -fileSize f = withBinaryFile f ReadMode $ \h -> do - -- hFileSize fails if file is not regular file (like /dev/null). Catch - -- exception and return 0 in that case. - filesz <- catch (hFileSize h) useZeroIfNotRegularFile - pure $ fromIntegral filesz `max` 0 - where useZeroIfNotRegularFile :: IOException -> IO Integer - useZeroIfNotRegularFile _ = pure 0 - --- | Transcode a file to a unicode source. -transcode :: B.ByteString -> IO Source -transcode text = fromText <$> do - match <- Detect.detectCharset text - converter <- Convert.open match Nothing - pure $ Convert.toUnicode converter text - - data RoseF a b = RoseF a [b] deriving (Eq, Functor, Show) diff --git a/src/Parser/Language.hs b/src/Parser/Language.hs new file mode 100644 index 000000000..a4f2c2d17 --- /dev/null +++ b/src/Parser/Language.hs @@ -0,0 +1,51 @@ +module Parser.Language +( parserForLanguage +, parserForFilePath -- TODO: Remove. Langauge detection should not come from file extension. +) where + +import Data.Record +import qualified Data.Text as T +import Parser +import Source +import Prologue +import Info +import Language +import Language.Markdown +import Syntax +import TreeSitter +import Text.Parser.TreeSitter.C +import Text.Parser.TreeSitter.Go +import Text.Parser.TreeSitter.JavaScript +import Text.Parser.TreeSitter.Ruby +import Text.Parser.TreeSitter.TypeScript + +-- TODO: Shouldn't need to depend on System.FilePath in here +import System.FilePath + + -- | Return a parser for a given langauge or the lineByLineParser parser. +parserForLanguage :: Maybe Language -> Parser (Syntax Text) (Record DefaultFields) +parserForLanguage Nothing = lineByLineParser +parserForLanguage (Just language) = case language of + C -> treeSitterParser C tree_sitter_c + JavaScript -> treeSitterParser JavaScript tree_sitter_javascript + TypeScript -> treeSitterParser TypeScript tree_sitter_typescript + Markdown -> cmarkParser + Ruby -> treeSitterParser Ruby tree_sitter_ruby + Language.Go -> treeSitterParser Language.Go tree_sitter_go + +-- | Return a parser based on the FilePath's extension (including the "."). +-- | TODO: Remove. +parserForFilePath :: FilePath -> Parser (Syntax Text) (Record DefaultFields) +parserForFilePath = parserForLanguage . languageForType . toS . takeExtension + +-- | A fallback parser that treats a file simply as rows of strings. +lineByLineParser :: Parser (Syntax Text) (Record DefaultFields) +lineByLineParser SourceBlob{..} = pure . cofree . root $ case foldl' annotateLeaves ([], 0) lines of + (leaves, _) -> cofree <$> leaves + where + lines = actualLines source + root children = (sourceRange :. Program :. rangeToSourceSpan source sourceRange :. Nil) :< Indexed children + sourceRange = Source.totalRange source + leaf charIndex line = (Range charIndex (charIndex + T.length line) :. Program :. rangeToSourceSpan source (Range charIndex (charIndex + T.length line)) :. Nil) :< Leaf line + annotateLeaves (accum, charIndex) line = + (accum <> [ leaf charIndex (Source.toText line) ] , charIndex + Source.length line) diff --git a/src/Semantic.hs b/src/Semantic.hs index ccbd8a87b..d0a1d2369 100644 --- a/src/Semantic.hs +++ b/src/Semantic.hs @@ -3,6 +3,8 @@ module Semantic (main, runDiff, runParse) where import Arguments import Command +import Command.Files +import Command.Git import Command.Parse import Data.Functor.Both import Data.List.Split (splitWhen) diff --git a/test/Command/Parse/Spec.hs b/test/Command/Parse/Spec.hs index 6070d95ae..99f927e00 100644 --- a/test/Command/Parse/Spec.hs +++ b/test/Command/Parse/Spec.hs @@ -1,5 +1,6 @@ module Command.Parse.Spec where +import Command.Files import Command.Parse import Control.Monad import Prelude diff --git a/test/SpecHelpers.hs b/test/SpecHelpers.hs index fa3cd7ca8..5360a06bd 100644 --- a/test/SpecHelpers.hs +++ b/test/SpecHelpers.hs @@ -5,7 +5,15 @@ import qualified Data.ByteString as B import qualified Data.Text.ICU.Convert as Convert import qualified Data.Text.ICU.Detect as Detect import Source +import Info +import Data.Record +import Syntax +import Language +import Parser +import Parser.Language +import System.FilePath +-- TODO: Write helper functions for parse file and diff files that don't depend on Command. -- | Read a file and convert it to Unicode. readFileToUnicode :: FilePath -> IO Source @@ -17,3 +25,10 @@ readFileToUnicode path = B.readFile path >>= transcode match <- Detect.detectCharset text converter <- Convert.open match Nothing pure $ Convert.toUnicode converter text + +-- | Return a parser based on the FilePath's extension (including the "."). | +-- NB: This is intentionally duplicated from Parser.Language because our tests +-- will always need to be able to select language from file extention whereas +-- the semantic project should eventually depend on exernal language detection. +parserForFilePath :: FilePath -> Parser (Syntax Text) (Record DefaultFields) +parserForFilePath = parserForLanguage . languageForType . toS . takeExtension