mirror of
https://github.com/github/semantic.git
synced 2024-12-24 23:42:31 +03:00
Merge branch 'master' into precompute-p,q-grams
# Conflicts: # src/Diffing.hs # src/Parser.hs # src/TreeSitter.hs
This commit is contained in:
commit
25f4d2e751
@ -30,7 +30,6 @@ library
|
||||
, Info
|
||||
, Interpreter
|
||||
, Language
|
||||
, Operation
|
||||
, Parser
|
||||
, Patch
|
||||
, Patch.Arbitrary
|
||||
|
@ -1,7 +1,36 @@
|
||||
module Algorithm where
|
||||
|
||||
import Control.Monad.Trans.Free
|
||||
import Operation
|
||||
import Control.Monad.Free.Church
|
||||
import Prologue
|
||||
|
||||
-- | A lazily-produced AST for diffing.
|
||||
type Algorithm a annotation = Free (Operation a annotation)
|
||||
-- | A single step in a diffing algorithm.
|
||||
--
|
||||
-- 'term' is the type of terms.
|
||||
-- 'diff' is the type of diffs.
|
||||
-- 'f' represents the continuation after diffing. Often 'Algorithm'.
|
||||
data AlgorithmF term diff f
|
||||
-- | Recursively diff two terms and pass the result to the continuation.
|
||||
= Recursive term term (diff -> f)
|
||||
-- | Diff two lists by each element’s position, and pass the resulting list of diffs to the continuation.
|
||||
| ByIndex [term] [term] ([diff] -> f)
|
||||
-- | Diff two lists by each element’s similarity and pass the resulting list of diffs to the continuation.
|
||||
| BySimilarity [term] [term] ([diff] -> f)
|
||||
deriving Functor
|
||||
|
||||
-- | The free monad for 'AlgorithmF'. This enables us to construct diff values using do-notation. We use the Church-encoded free monad 'F' for efficiency.
|
||||
type Algorithm term diff = F (AlgorithmF term diff)
|
||||
|
||||
|
||||
-- DSL
|
||||
|
||||
-- | Constructs a 'Recursive' diff of two terms.
|
||||
recursively :: term -> term -> Algorithm term diff diff
|
||||
recursively a b = wrap (Recursive a b pure)
|
||||
|
||||
-- | Constructs a 'ByIndex' diff of two lists of terms.
|
||||
byIndex :: [term] -> [term] -> Algorithm term diff [diff]
|
||||
byIndex a b = wrap (ByIndex a b pure)
|
||||
|
||||
-- | Constructs a 'BySimilarity' diff of two lists of terms.
|
||||
bySimilarity :: [term] -> [term] -> Algorithm term diff [diff]
|
||||
bySimilarity a b = wrap (BySimilarity a b pure)
|
||||
|
@ -26,36 +26,31 @@ import Test.QuickCheck hiding (Fixed)
|
||||
import Test.QuickCheck.Random
|
||||
|
||||
-- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf).
|
||||
rws :: (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double)) =>
|
||||
-- | A function which comapres a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared.
|
||||
(Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) ->
|
||||
-- | The old list of terms.
|
||||
[Cofree f (Record fields)] ->
|
||||
-- | The new list of terms.
|
||||
[Cofree f (Record fields)] ->
|
||||
[Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))]
|
||||
rws :: (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double))
|
||||
=> (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function which comapres a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared.
|
||||
-> [Cofree f (Record fields)] -- ^ The list of old terms.
|
||||
-> [Cofree f (Record fields)] -- ^ The list of new terms.
|
||||
-> [Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))]
|
||||
rws compare as bs
|
||||
| null as, null bs = []
|
||||
| null as = insert <$> bs
|
||||
| null bs = delete <$> as
|
||||
| null as = inserting <$> bs
|
||||
| null bs = deleting <$> as
|
||||
| otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, fas)) $ traverse findNearestNeighbourTo fbs
|
||||
where insert = pure . Insert
|
||||
delete = pure . Delete
|
||||
fas = zipWith featurize [0..] as
|
||||
where fas = zipWith featurize [0..] as
|
||||
fbs = zipWith featurize [0..] bs
|
||||
kdas = KdTree.build (Vector.toList . feature) fas
|
||||
featurize index term = UnmappedTerm index (getField (extract term)) term
|
||||
findNearestNeighbourTo kv@(UnmappedTerm _ _ v) = do
|
||||
(previous, unmapped) <- get
|
||||
let (UnmappedTerm i _ _) = KdTree.nearest kdas kv
|
||||
fromMaybe (pure (negate 1, insert v)) $ do
|
||||
fromMaybe (pure (negate 1, inserting v)) $ do
|
||||
found <- find ((== i) . termIndex) unmapped
|
||||
guard (i >= previous)
|
||||
compared <- compare (term found) v
|
||||
pure $! do
|
||||
put (i, List.delete found unmapped)
|
||||
pure (i, compared)
|
||||
deleteRemaining diffs (_, unmapped) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& delete . term) <$> unmapped)
|
||||
deleteRemaining diffs (_, unmapped) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmapped)
|
||||
|
||||
-- | A term which has not yet been mapped by `rws`, along with its feature vector summary & index.
|
||||
data UnmappedTerm a = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature :: !(Vector.Vector Double), term :: !a }
|
||||
|
@ -1,7 +1,7 @@
|
||||
{-# LANGUAGE RankNTypes #-}
|
||||
module Interpreter (Comparable, DiffConstructor, diffTerms) where
|
||||
|
||||
import Algorithm
|
||||
import Category
|
||||
import Data.Align.Generic
|
||||
import Data.Functor.Foldable
|
||||
import Data.Functor.Both
|
||||
@ -10,12 +10,12 @@ import Data.Record
|
||||
import Data.These
|
||||
import qualified Data.Vector as Vector
|
||||
import Diff
|
||||
import qualified Control.Monad.Free.Church as F
|
||||
import Info
|
||||
import Operation
|
||||
import Patch
|
||||
import Prologue hiding (lookup)
|
||||
import SES
|
||||
import Syntax
|
||||
import Syntax as S
|
||||
import Term
|
||||
|
||||
-- | Returns whether two terms are comparable
|
||||
@ -24,35 +24,63 @@ type Comparable leaf annotation = Term leaf annotation -> Term leaf annotation -
|
||||
-- | Constructs a diff from the CofreeF containing its annotation and syntax. This function has the opportunity to, for example, cache properties in the annotation.
|
||||
type DiffConstructor leaf annotation = CofreeF (Syntax leaf) (Both annotation) (Diff leaf annotation) -> Diff leaf annotation
|
||||
|
||||
-- | Diff two terms, given a function that determines whether two terms can be compared and a cost function.
|
||||
diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double)) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Diff leaf (Record fields)
|
||||
diffTerms construct comparable cost a b = fromMaybe (pure $ Replace a b) $ constructAndRun construct comparable cost a b
|
||||
|
||||
-- | Constructs an algorithm and runs it
|
||||
constructAndRun :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double)) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields))
|
||||
constructAndRun construct comparable cost t1 t2
|
||||
| not $ comparable t1 t2 = Nothing
|
||||
| (category <$> t1) == (category <$> t2) = hylo construct runCofree <$> zipTerms t1 t2
|
||||
| otherwise =
|
||||
run construct comparable cost $ algorithm a b where
|
||||
algorithm (Indexed a') (Indexed b') = wrap $! ByIndex a' b' (annotate . Indexed)
|
||||
algorithm (Leaf a') (Leaf b') | a' == b' = annotate $ Leaf b'
|
||||
algorithm a' b' = wrap $! Recursive (cofree (annotation1 :< a')) (cofree (annotation2 :< b')) pure
|
||||
(annotation1 :< a, annotation2 :< b) = (runCofree t1, runCofree t2)
|
||||
annotate = pure . construct . (both annotation1 annotation2 :<)
|
||||
-- | Diff two terms recursively, given functions characterizing the diffing.
|
||||
diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double))
|
||||
=> DiffConstructor leaf (Record fields) -- ^ A function to wrap up & possibly annotate every produced diff.
|
||||
-> Comparable leaf (Record fields) -- ^ A function to determine whether or not two terms should even be compared.
|
||||
-> SES.Cost (Diff leaf (Record fields)) -- ^ A function to compute the cost of a given diff node.
|
||||
-> Term leaf (Record fields) -- ^ A term representing the old state.
|
||||
-> Term leaf (Record fields) -- ^ A term representing the new state.
|
||||
-> Diff leaf (Record fields)
|
||||
diffTerms construct comparable cost a b = fromMaybe (replacing a b) $ diffComparableTerms construct comparable cost a b
|
||||
|
||||
-- | Runs the diff algorithm
|
||||
run :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double)) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Algorithm leaf (Record fields) (Diff leaf (Record fields)) -> Maybe (Diff leaf (Record fields))
|
||||
run construct comparable cost algorithm = case runFree algorithm of
|
||||
Pure diff -> Just diff
|
||||
Free (Recursive t1 t2 f) -> run construct comparable cost . f $ recur a b where
|
||||
(annotation1 :< a, annotation2 :< b) = (runCofree t1, runCofree t2)
|
||||
annotate = construct . (both annotation1 annotation2 :<)
|
||||
-- | Diff two terms recursively, given functions characterizing the diffing. If the terms are incomparable, returns 'Nothing'.
|
||||
diffComparableTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double)) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields))
|
||||
diffComparableTerms construct comparable cost = recur
|
||||
where recur a b
|
||||
| (category <$> a) == (category <$> b) = hylo construct runCofree <$> zipTerms a b
|
||||
| comparable a b = runAlgorithm construct recur cost (Just <$> algorithmWithTerms construct a b)
|
||||
| otherwise = Nothing
|
||||
|
||||
recur a b = maybe (pure (Replace t1 t2)) (annotate . fmap diffThese) (galign a b)
|
||||
-- | Construct an algorithm to diff a pair of terms.
|
||||
algorithmWithTerms :: (TermF leaf (Both a) diff -> diff) -> Term leaf a -> Term leaf a -> Algorithm (Term leaf a) diff diff
|
||||
algorithmWithTerms construct t1 t2 = case (unwrap t1, unwrap t2) of
|
||||
(Indexed a, Indexed b) -> byIndex Indexed a b
|
||||
(S.FunctionCall identifierA argsA, S.FunctionCall identifierB argsB) -> do
|
||||
identifier <- recursively identifierA identifierB
|
||||
byIndex (S.FunctionCall identifier) argsA argsB
|
||||
(S.Switch exprA casesA, S.Switch exprB casesB) -> do
|
||||
expr <- recursively exprA exprB
|
||||
byIndex (S.Switch expr) casesA casesB
|
||||
(S.Object a, S.Object b) -> byIndex S.Object a b
|
||||
(Commented commentsA a, Commented commentsB b) -> do
|
||||
wrapped <- sequenceA (recursively <$> a <*> b)
|
||||
byIndex (`Commented` wrapped) commentsA commentsB
|
||||
(Array a, Array b) -> byIndex Array a b
|
||||
(S.Class identifierA paramsA expressionsA, S.Class identifierB paramsB expressionsB) -> do
|
||||
identifier <- recursively identifierA identifierB
|
||||
params <- sequenceA (recursively <$> paramsA <*> paramsB)
|
||||
byIndex (S.Class identifier params) expressionsA expressionsB
|
||||
(S.Method identifierA paramsA expressionsA, S.Method identifierB paramsB expressionsB) -> do
|
||||
identifier <- recursively identifierA identifierB
|
||||
params <- Algorithm.byIndex paramsA paramsB
|
||||
expressions <- Algorithm.byIndex expressionsA expressionsB
|
||||
annotate $! S.Method identifier params expressions
|
||||
_ -> recursively t1 t2
|
||||
where annotate = pure . construct . (both (extract t1) (extract t2) :<)
|
||||
byIndex constructor a b = Algorithm.byIndex a b >>= annotate . constructor
|
||||
|
||||
diffThese = these (pure . Delete) (pure . Insert) (diffTerms construct comparable cost)
|
||||
|
||||
Free (ByIndex a b f) -> run construct comparable cost . f $ ses (constructAndRun construct comparable cost) cost a b
|
||||
|
||||
Free (ByRandomWalkSimilarity a b f) -> run construct comparable cost . f $ rws (constructAndRun construct comparable cost) a b
|
||||
-- | Run an algorithm, given functions characterizing the evaluation.
|
||||
runAlgorithm :: (Functor f, GAlign f, Eq a, Eq (Record fields), Eq (f (Cofree f (Record fields))), Prologue.Foldable f, Traversable f, HasField fields (Vector.Vector Double))
|
||||
=> (CofreeF f (Both (Record fields)) (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -> Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to wrap up & possibly annotate every produced diff.
|
||||
-> (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function to diff two subterms recursively, if they are comparable, or else return 'Nothing'.
|
||||
-> SES.Cost (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to compute the cost of a given diff node.
|
||||
-> Algorithm (Cofree f (Record fields)) (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) a -- ^ The algorithm to run.
|
||||
-> a
|
||||
runAlgorithm construct recur cost = F.iter $ \case
|
||||
Recursive a b f -> f (maybe (replacing a b) (construct . (both (extract a) (extract b) :<)) $ do
|
||||
aligned <- galign (unwrap a) (unwrap b)
|
||||
traverse (these (Just . deleting) (Just . inserting) recur) aligned)
|
||||
ByIndex as bs f -> f (ses recur cost as bs)
|
||||
BySimilarity as bs f -> f (rws recur as bs)
|
||||
|
@ -1,18 +0,0 @@
|
||||
module Operation where
|
||||
|
||||
import Prologue
|
||||
import Diff
|
||||
import Term
|
||||
|
||||
-- | A single step in a diffing algorithm.
|
||||
data Operation
|
||||
a -- ^ The type of leaves in the syntax tree, typically String, but possibly some datatype representing different leaves more precisely.
|
||||
annotation -- ^ The type of annotations.
|
||||
f -- ^ The type representing another level of the diffing algorithm. Often Algorithm.
|
||||
-- | Recursively diff two terms and pass the result to the continuation.
|
||||
= Recursive (Term a annotation) (Term a annotation) (Diff a annotation -> f)
|
||||
-- | Diff two dictionaries and pass the result to the continuation.
|
||||
-- | Diff two arrays and pass the result to the continuation.
|
||||
| ByIndex [Term a annotation] [Term a annotation] ([Diff a annotation] -> f)
|
||||
| ByRandomWalkSimilarity [Term a annotation] [Term a annotation] ([Diff a annotation] -> f)
|
||||
deriving Functor
|
19
src/Patch.hs
19
src/Patch.hs
@ -1,5 +1,8 @@
|
||||
module Patch
|
||||
( Patch(..)
|
||||
, replacing
|
||||
, inserting
|
||||
, deleting
|
||||
, after
|
||||
, before
|
||||
, unPatch
|
||||
@ -18,6 +21,22 @@ data Patch a
|
||||
| Delete a
|
||||
deriving (Eq, Foldable, Functor, Generic, Ord, Show, Traversable)
|
||||
|
||||
|
||||
-- DSL
|
||||
|
||||
-- | Constructs the replacement of one value by another in an Applicative context.
|
||||
replacing :: Applicative f => a -> a -> f (Patch a)
|
||||
replacing = (pure .) . Replace
|
||||
|
||||
-- | Constructs the insertion of a value in an Applicative context.
|
||||
inserting :: Applicative f => a -> f (Patch a)
|
||||
inserting = pure . Insert
|
||||
|
||||
-- | Constructs the deletion of a value in an Applicative context.
|
||||
deleting :: Applicative f => a -> f (Patch a)
|
||||
deleting = pure . Delete
|
||||
|
||||
|
||||
-- | Return the item from the after side of the patch.
|
||||
after :: Patch a -> Maybe a
|
||||
after = maybeSnd . unPatch
|
||||
|
@ -38,8 +38,8 @@ diffAt diffTerms cost (i, j) as bs
|
||||
| null bs = pure $ foldr delete [] as
|
||||
| otherwise = pure []
|
||||
where
|
||||
delete = consWithCost cost . pure . Delete
|
||||
insert = consWithCost cost . pure . Insert
|
||||
delete = consWithCost cost . deleting
|
||||
insert = consWithCost cost . inserting
|
||||
costOf [] = 0
|
||||
costOf ((_, c) : _) = c
|
||||
best = minimumBy (comparing costOf)
|
||||
|
@ -8,12 +8,12 @@ import Test.QuickCheck hiding (Fixed)
|
||||
import SourceSpan
|
||||
|
||||
-- | A node in an abstract syntax tree.
|
||||
data Syntax
|
||||
a -- ^ The type of leaves in the syntax tree, typically String, but possibly some datatype representing different leaves more precisely.
|
||||
f -- ^ The type representing another level of the tree, e.g. the children of branches. Often Cofree or Fix or similar.
|
||||
=
|
||||
--
|
||||
-- 'a' is the type of leaves in the syntax tree, typically 'Text', but possibly some datatype representing different leaves more precisely.
|
||||
-- 'f' is the type representing another level of the tree, e.g. the children of branches. Often 'Cofree', 'Free' or similar.
|
||||
data Syntax a f
|
||||
-- | A terminal syntax node, e.g. an identifier, or atomic literal.
|
||||
Leaf a
|
||||
= Leaf a
|
||||
-- | An ordered branch of child nodes, expected to be variadic in the grammar, e.g. a list of statements or uncurried function parameters.
|
||||
| Indexed [f]
|
||||
-- | An ordered branch of child nodes, expected to be of fixed length in the grammar, e.g. a binary operator & its operands.
|
||||
|
Loading…
Reference in New Issue
Block a user