1
1
mirror of https://github.com/github/semantic.git synced 2025-01-01 11:46:14 +03:00

Rework diffing for max parallelization

This commit is contained in:
Timothy Clem 2017-03-13 16:25:18 -07:00
parent 94dc2ac506
commit 652be339c6
3 changed files with 55 additions and 80 deletions

View File

@ -1,3 +1,4 @@
{-# LANGUAGE DataKinds #-}
{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-} {-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
module DiffCommand where module DiffCommand where
@ -5,12 +6,10 @@ import Data.Aeson hiding (json)
import Data.Functor.Both as Both import Data.Functor.Both as Both
import Data.List ((\\)) import Data.List ((\\))
import Data.String import Data.String
import Data.Text.Encoding (encodeUtf8)
import GHC.Conc (numCapabilities) import GHC.Conc (numCapabilities)
import Prologue hiding (fst, snd, null) import Prologue hiding (fst, snd, null)
import qualified Control.Concurrent.Async.Pool as Async import qualified Control.Concurrent.Async.Pool as Async
import System.FilePath.Posix (hasExtension) import System.FilePath.Posix (hasExtension)
import System.Timeout (timeout)
import Git.Blob import Git.Blob
import Git.Libgit2 import Git.Libgit2
import Git.Libgit2.Backend import Git.Libgit2.Backend
@ -22,6 +21,7 @@ import Category
import Data.RandomWalkSimilarity import Data.RandomWalkSimilarity
import Data.Record import Data.Record
import Info import Info
import Diff
import Interpreter import Interpreter
import ParseCommand (parserForFilepath) import ParseCommand (parserForFilepath)
import Parser import Parser
@ -35,7 +35,7 @@ import Renderer.Summary
import Renderer.TOC import Renderer.TOC
import Source import Source
import Syntax import Syntax
import Term import Debug.Trace
diff :: Arguments -> IO ByteString diff :: Arguments -> IO ByteString
diff args@Arguments{..} = case diffMode of diff args@Arguments{..} = case diffMode of
@ -45,11 +45,8 @@ diff args@Arguments{..} = case diffMode of
-- | Compare changes between two commits. -- | Compare changes between two commits.
diffCommits :: Arguments -> IO ByteString diffCommits :: Arguments -> IO ByteString
diffCommits args@Arguments{..} = do diffCommits args@Arguments{..} = do
ts <- fetchTerms args outputs <- fetchDiffs args
pure $ maybe mempty concatOutputs ts pure $! concatOutputs outputs
where fetchTerms args = if developmentMode
then Just <$> fetchDiffs args
else timeout timeoutInMicroseconds (fetchDiffs args)
-- | Compare two paths on the filesystem (compariable to git diff --no-index). -- | Compare two paths on the filesystem (compariable to git diff --no-index).
diffPaths :: Arguments -> Both FilePath -> IO ByteString diffPaths :: Arguments -> Both FilePath -> IO ByteString
@ -66,28 +63,27 @@ fetchDiffs args@Arguments{..} = do
([], Join (Just a, Just b)) -> pathsToDiff args (both a b) ([], Join (Just a, Just b)) -> pathsToDiff args (both a b)
(ps, _) -> pure ps (ps, _) -> pure ps
Async.withTaskGroup numCapabilities $ \p -> Async.mapTasks p (fetchDiff args <$> paths) diffs <- Async.withTaskGroup numCapabilities . flip Async.mapTasks $
fetchDiff args <$> paths
fetchDiff :: Arguments -> FilePath -> IO Output pure $ uncurry (renderDiff args) <$> diffs
fetchDiff :: Arguments -> FilePath -> IO (Both SourceBlob, SyntaxDiff Text '[Range, Category, SourceSpan])
fetchDiff args@Arguments{..} filepath = withRepository lgFactory gitDir $ do fetchDiff args@Arguments{..} filepath = withRepository lgFactory gitDir $ do
repo <- getRepository repo <- getRepository
for_ alternateObjectDirs (liftIO . odbBackendAddPath repo . toS) for_ alternateObjectDirs (liftIO . odbBackendAddPath repo . toS)
lift $ runReaderT (fetchDiff' args filepath) repo lift $ runReaderT (fetchDiff' args filepath) repo
fetchDiff' :: Arguments -> FilePath -> ReaderT LgRepo IO Output fetchDiff' :: Arguments -> FilePath -> ReaderT LgRepo IO (Both SourceBlob, SyntaxDiff Text '[Range, Category, SourceSpan])
fetchDiff' args@Arguments{..} filepath = do fetchDiff' Arguments{..} filepath = do
liftIO $ traceEventIO ("START fetchDiff: " <> filepath)
sourcesAndOids <- sequence $ traverse (getSourceBlob filepath) <$> shaRange sourcesAndOids <- sequence $ traverse (getSourceBlob filepath) <$> shaRange
let sources = fromMaybe (emptySourceBlob filepath) <$> sourcesAndOids let sources = fromMaybe (emptySourceBlob filepath) <$> sourcesAndOids
let sourceBlobs = idOrEmptySourceBlob <$> sources let sourceBlobs = idOrEmptySourceBlob <$> sources
text <- liftIO . render $ textDiff (parserForFilepath filepath) args sourceBlobs diff <- liftIO $ diffFiles (parserForFilepath filepath) sourceBlobs
truncatedPatch <- liftIO $ truncatedDiff args sourceBlobs pure $! traceEvent ("END fetchDiff: " <> filepath) (sourceBlobs, diff)
pure $ fromMaybe truncatedPatch text
where
render output = if developmentMode
then Just <$> output
else timeout timeoutInMicroseconds output
pathsToDiff :: Arguments -> Both String -> IO [FilePath] pathsToDiff :: Arguments -> Both String -> IO [FilePath]
pathsToDiff Arguments{..} shas = withRepository lgFactory gitDir $ do pathsToDiff Arguments{..} shas = withRepository lgFactory gitDir $ do
@ -136,20 +132,19 @@ getSourceBlob path sha = do
toSourceKind (Git.ExecutableBlob mode) = Source.ExecutableBlob mode toSourceKind (Git.ExecutableBlob mode) = Source.ExecutableBlob mode
toSourceKind (Git.SymlinkBlob mode) = Source.SymlinkBlob mode toSourceKind (Git.SymlinkBlob mode) = Source.SymlinkBlob mode
-- | Given a parser and renderer, diff two sources and return the rendered -- | Given a parser, diff two sources and return a SyntaxDiff.
-- | result. -- | Returns the rendered result strictly, so it's always fully evaluated with respect to other IO actions.
-- | Returns the rendered result strictly, so it's always fully evaluated diffFiles :: (HasField fields Category, NFData (Record fields))
-- | with respect to other IO actions.
diffFiles :: HasField fields Category
=> Parser (Syntax Text) (Record fields) => Parser (Syntax Text) (Record fields)
-> Renderer (Record fields)
-> Both SourceBlob -> Both SourceBlob
-> IO Output -> IO (SyntaxDiff Text fields)
diffFiles parse render sourceBlobs = do diffFiles parse sourceBlobs = do
traceEventIO $ "diffFiles@SEMANTIC-DIFF START parse terms: " <> paths
terms <- Async.withTaskGroup numCapabilities . flip Async.mapTasks $ terms <- Async.withTaskGroup numCapabilities . flip Async.mapTasks $
(fmap (defaultFeatureVectorDecorator getLabel) . parse) <$> sourceBlobs (fmap (defaultFeatureVectorDecorator getLabel) . parse) <$> sourceBlobs
pure $! render sourceBlobs (stripDiff (diffTerms' terms)) traceEventIO $ "diffFiles@SEMANTIC-DIFF END parse terms: " <> paths
traceEventIO $ "diffFiles@SEMANTIC-DIFF START diff terms: " <> paths
traceEvent ("diffFiles@SEMANTIC-DIFF END diff terms: " <> paths) pure $!! stripDiff (diffTerms' terms)
where where
diffTerms' terms = case runBothWith areNullOids sourceBlobs of diffTerms' terms = case runBothWith areNullOids sourceBlobs of
(True, False) -> pure $ Insert (snd terms) (True, False) -> pure $ Insert (snd terms)
@ -158,19 +153,18 @@ diffFiles parse render sourceBlobs = do
runBothWith diffTerms terms runBothWith diffTerms terms
areNullOids a b = (hasNullOid a, hasNullOid b) areNullOids a b = (hasNullOid a, hasNullOid b)
hasNullOid blob = oid blob == nullOid || null (source blob) hasNullOid blob = oid blob == nullOid || null (source blob)
-- For trace events
paths = runBothWith (\a b -> fileAtSha a <> " -> " <> fileAtSha b) sourceBlobs
fileAtSha x = path x <> "@" <> toS (oid x)
getLabel :: HasField fields Category => CofreeF (Syntax leaf) (Record fields) b -> (Category, Maybe leaf) getLabel :: HasField fields Category => CofreeF (Syntax leaf) (Record fields) b -> (Category, Maybe leaf)
getLabel (h :< t) = (category h, case t of getLabel (h :< t) = (category h, case t of
Leaf s -> Just s Leaf s -> Just s
_ -> Nothing) _ -> Nothing)
-- | Determine whether two terms are comparable based on the equality of their categories. -- | Returns a rendered diff given arguments and two source blobs.
compareCategoryEq :: Functor f => HasField fields Category => Term f (Record fields) -> Term f (Record fields) -> Bool renderDiff :: (ToJSON (Record fields), NFData (Record fields), DefaultFields fields) => Arguments -> Both SourceBlob -> SyntaxDiff Text fields -> Output
compareCategoryEq = (==) `on` category . extract renderDiff args = case format args of
-- | Returns a rendered diff given a parser, diff arguments and two source blobs.
textDiff :: (ToJSON (Record fields), DefaultFields fields) => Parser (Syntax Text) (Record fields) -> Arguments -> Both SourceBlob -> IO Output
textDiff parser arguments = diffFiles parser $ case format arguments of
Split -> split Split -> split
Patch -> patch Patch -> patch
SExpression -> sExpression TreeOnly SExpression -> sExpression TreeOnly
@ -178,24 +172,8 @@ textDiff parser arguments = diffFiles parser $ case format arguments of
Summary -> summary Summary -> summary
TOC -> toc TOC -> toc
-- | Returns a truncated diff given diff arguments and two source blobs. -- | Prints a rendered diff to stdio or a filepath given a parser, arguments and two source blobs.
truncatedDiff :: Arguments -> Both SourceBlob -> IO Output printDiff :: (ToJSON (Record fields), NFData (Record fields), DefaultFields fields) => Parser (Syntax Text) (Record fields) -> Arguments -> Both SourceBlob -> IO ByteString
truncatedDiff Arguments{..} sources = pure $ case format of printDiff parser args sources = do
Split -> SplitOutput mempty diff <- diffFiles parser sources
Patch -> PatchOutput (truncatePatch sources) pure $! concatOutputs [renderDiff args sources diff]
SExpression -> SExpressionOutput mempty
JSON -> JSONOutput mempty
Summary -> SummaryOutput mempty
TOC -> TOCOutput mempty
-- | Prints a rendered diff to stdio or a filepath given a parser, diff arguments and two source blobs.
printDiff :: (ToJSON (Record fields), DefaultFields fields) => Parser (Syntax Text) (Record fields) -> Arguments -> Both SourceBlob -> IO ByteString
printDiff parser arguments sources = do
rendered <- textDiff parser arguments sources
pure $ case rendered of
SplitOutput text -> encodeUtf8 text
PatchOutput text -> encodeUtf8 text
SExpressionOutput text -> text
JSONOutput series -> toS $ encode series
SummaryOutput summaries -> toS $ encode summaries
TOCOutput summaries -> toS $ encode summaries

View File

@ -115,11 +115,12 @@ testParse path expectedOutput = do
actual `shouldBe` expected actual `shouldBe` expected
testDiff :: Renderer (Record '[Range, Category, SourceSpan]) -> Both FilePath -> FilePath -> Expectation testDiff :: Renderer (Record '[Range, Category, SourceSpan]) -> Both FilePath -> FilePath -> Expectation
testDiff renderer paths diff = do testDiff renderer paths expectedOutput = do
sources <- traverse readAndTranscodeFile' paths sources <- traverse readAndTranscodeFile' paths
diff' <- diffFiles parser renderer (sourceBlobs sources) diff <- diffFiles parser (sourceBlobs sources)
let actual = (Verbatim . stripWhitespace. concatOutputs . pure) diff' let diffOutput = renderer (sourceBlobs sources) diff
expected <- (Verbatim . stripWhitespace) <$> B.readFile diff let actual = (Verbatim . stripWhitespace. concatOutputs . pure) diffOutput
expected <- (Verbatim . stripWhitespace) <$> B.readFile expectedOutput
actual `shouldBe` expected actual `shouldBe` expected
where where
parser = parserForFilepath filePath parser = parserForFilepath filePath

View File

@ -110,19 +110,23 @@ spec = parallel $ do
describe "diffFiles" $ do describe "diffFiles" $ do
it "encodes to final JSON" $ do it "encodes to final JSON" $ do
sourceBlobs <- blobsForPaths (both "ruby/methods.A.rb" "ruby/methods.B.rb") sourceBlobs <- blobsForPaths (both "ruby/methods.A.rb" "ruby/methods.B.rb")
let parser = parserForFilepath (path (fst sourceBlobs)) output <- diffOutput sourceBlobs
output <- diffFiles parser toc sourceBlobs output `shouldBe` "{\"changes\":{\"ruby/methods.A.rb -> ruby/methods.B.rb\":[{\"span\":{\"start\":[1,1],\"end\":[2,4]},\"category\":\"Method\",\"term\":\"self.foo\",\"changeType\":\"added\"},{\"span\":{\"start\":[4,1],\"end\":[6,4]},\"category\":\"Method\",\"term\":\"bar\",\"changeType\":\"modified\"},{\"span\":{\"start\":[4,1],\"end\":[5,4]},\"category\":\"Method\",\"term\":\"baz\",\"changeType\":\"removed\"}]},\"errors\":{}}"
concatOutputs (pure output) `shouldBe` ("{\"changes\":{\"ruby/methods.A.rb -> ruby/methods.B.rb\":[{\"span\":{\"start\":[1,1],\"end\":[2,4]},\"category\":\"Method\",\"term\":\"self.foo\",\"changeType\":\"added\"},{\"span\":{\"start\":[4,1],\"end\":[6,4]},\"category\":\"Method\",\"term\":\"bar\",\"changeType\":\"modified\"},{\"span\":{\"start\":[4,1],\"end\":[5,4]},\"category\":\"Method\",\"term\":\"baz\",\"changeType\":\"removed\"}]},\"errors\":{}}" )
it "encodes to final JSON if there are parse errors" $ do it "encodes to final JSON if there are parse errors" $ do
sourceBlobs <- blobsForPaths (both "ruby/methods.A.rb" "ruby/methods.X.rb") sourceBlobs <- blobsForPaths (both "ruby/methods.A.rb" "ruby/methods.X.rb")
let parser = parserForFilepath (path (fst sourceBlobs)) output <- diffOutput sourceBlobs
output <- diffFiles parser toc sourceBlobs output `shouldBe` "{\"changes\":{},\"errors\":{\"ruby/methods.A.rb -> ruby/methods.X.rb\":[{\"span\":{\"start\":[1,1],\"end\":[3,1]},\"error\":\"def bar\\nen\\n\"}]}}"
concatOutputs (pure output) `shouldBe` ("{\"changes\":{},\"errors\":{\"ruby/methods.A.rb -> ruby/methods.X.rb\":[{\"span\":{\"start\":[1,1],\"end\":[3,1]},\"error\":\"def bar\\nen\\n\"}]}}" )
type Diff' = SyntaxDiff String '[Range, Category, SourceSpan] type Diff' = SyntaxDiff String '[Range, Category, SourceSpan]
type Term' = SyntaxTerm String '[Range, Category, SourceSpan] type Term' = SyntaxTerm String '[Range, Category, SourceSpan]
diffOutput :: Both SourceBlob -> IO ByteString
diffOutput sourceBlobs = do
let parser = parserForFilepath (path (fst sourceBlobs))
diff <- diffFiles parser sourceBlobs
pure $ concatOutputs [toc sourceBlobs diff]
numTocSummaries :: Diff' -> Int numTocSummaries :: Diff' -> Int
numTocSummaries diff = Prologue.length $ filter (not . isErrorSummary) (diffTOC blankDiffBlobs diff) numTocSummaries diff = Prologue.length $ filter (not . isErrorSummary) (diffTOC blankDiffBlobs diff)
@ -184,17 +188,9 @@ isMethodOrFunction a = case runCofree (unListableF a) of
_ -> False _ -> False
testDiff :: Both SourceBlob -> IO (Diff (Syntax Text) (Record '[Range, Category, SourceSpan])) testDiff :: Both SourceBlob -> IO (Diff (Syntax Text) (Record '[Range, Category, SourceSpan]))
testDiff sourceBlobs = do testDiff sourceBlobs = diffFiles parser sourceBlobs
terms <- traverse (fmap (defaultFeatureVectorDecorator getLabel) . parser) sourceBlobs
pure $! stripDiff (diffTerms' terms sourceBlobs)
where where
parser = parserForFilepath (path . fst $ sourceBlobs) parser = parserForFilepath (path . fst $ sourceBlobs)
diffTerms' terms blobs = case runBothWith areNullOids blobs of
(True, False) -> pure $ Insert (snd terms)
(False, True) -> pure $ Delete (fst terms)
(_, _) -> runBothWith diffTerms terms
areNullOids a b = (hasNullOid a, hasNullOid b)
hasNullOid blob = oid blob == nullOid || Source.null (source blob)
blobsForPaths :: Both FilePath -> IO (Both SourceBlob) blobsForPaths :: Both FilePath -> IO (Both SourceBlob)
blobsForPaths paths = do blobsForPaths paths = do