1
1
mirror of https://github.com/github/semantic.git synced 2024-12-24 23:42:31 +03:00

Merge branch 'master' into precompute-p,q-grams

# Conflicts:
#	src/Diffing.hs
#	src/Parser.hs
#	src/TreeSitter.hs
This commit is contained in:
Rob Rix 2016-08-05 13:13:36 -04:00
commit 25f4d2e751
8 changed files with 128 additions and 76 deletions

View File

@ -30,7 +30,6 @@ library
, Info
, Interpreter
, Language
, Operation
, Parser
, Patch
, Patch.Arbitrary

View File

@ -1,7 +1,36 @@
module Algorithm where
import Control.Monad.Trans.Free
import Operation
import Control.Monad.Free.Church
import Prologue
-- | A lazily-produced AST for diffing.
type Algorithm a annotation = Free (Operation a annotation)
-- | A single step in a diffing algorithm.
--
-- 'term' is the type of terms.
-- 'diff' is the type of diffs.
-- 'f' represents the continuation after diffing. Often 'Algorithm'.
data AlgorithmF term diff f
-- | Recursively diff two terms and pass the result to the continuation.
= Recursive term term (diff -> f)
-- | Diff two lists by each elements position, and pass the resulting list of diffs to the continuation.
| ByIndex [term] [term] ([diff] -> f)
-- | Diff two lists by each elements similarity and pass the resulting list of diffs to the continuation.
| BySimilarity [term] [term] ([diff] -> f)
deriving Functor
-- | The free monad for 'AlgorithmF'. This enables us to construct diff values using do-notation. We use the Church-encoded free monad 'F' for efficiency.
type Algorithm term diff = F (AlgorithmF term diff)
-- DSL
-- | Constructs a 'Recursive' diff of two terms.
recursively :: term -> term -> Algorithm term diff diff
recursively a b = wrap (Recursive a b pure)
-- | Constructs a 'ByIndex' diff of two lists of terms.
byIndex :: [term] -> [term] -> Algorithm term diff [diff]
byIndex a b = wrap (ByIndex a b pure)
-- | Constructs a 'BySimilarity' diff of two lists of terms.
bySimilarity :: [term] -> [term] -> Algorithm term diff [diff]
bySimilarity a b = wrap (BySimilarity a b pure)

View File

@ -26,36 +26,31 @@ import Test.QuickCheck hiding (Fixed)
import Test.QuickCheck.Random
-- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf).
rws :: (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double)) =>
-- | A function which comapres a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared.
(Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) ->
-- | The old list of terms.
[Cofree f (Record fields)] ->
-- | The new list of terms.
[Cofree f (Record fields)] ->
[Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))]
rws :: (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double))
=> (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function which comapres a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared.
-> [Cofree f (Record fields)] -- ^ The list of old terms.
-> [Cofree f (Record fields)] -- ^ The list of new terms.
-> [Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))]
rws compare as bs
| null as, null bs = []
| null as = insert <$> bs
| null bs = delete <$> as
| null as = inserting <$> bs
| null bs = deleting <$> as
| otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, fas)) $ traverse findNearestNeighbourTo fbs
where insert = pure . Insert
delete = pure . Delete
fas = zipWith featurize [0..] as
where fas = zipWith featurize [0..] as
fbs = zipWith featurize [0..] bs
kdas = KdTree.build (Vector.toList . feature) fas
featurize index term = UnmappedTerm index (getField (extract term)) term
findNearestNeighbourTo kv@(UnmappedTerm _ _ v) = do
(previous, unmapped) <- get
let (UnmappedTerm i _ _) = KdTree.nearest kdas kv
fromMaybe (pure (negate 1, insert v)) $ do
fromMaybe (pure (negate 1, inserting v)) $ do
found <- find ((== i) . termIndex) unmapped
guard (i >= previous)
compared <- compare (term found) v
pure $! do
put (i, List.delete found unmapped)
pure (i, compared)
deleteRemaining diffs (_, unmapped) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& delete . term) <$> unmapped)
deleteRemaining diffs (_, unmapped) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmapped)
-- | A term which has not yet been mapped by `rws`, along with its feature vector summary & index.
data UnmappedTerm a = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature :: !(Vector.Vector Double), term :: !a }

View File

@ -1,7 +1,7 @@
{-# LANGUAGE RankNTypes #-}
module Interpreter (Comparable, DiffConstructor, diffTerms) where
import Algorithm
import Category
import Data.Align.Generic
import Data.Functor.Foldable
import Data.Functor.Both
@ -10,12 +10,12 @@ import Data.Record
import Data.These
import qualified Data.Vector as Vector
import Diff
import qualified Control.Monad.Free.Church as F
import Info
import Operation
import Patch
import Prologue hiding (lookup)
import SES
import Syntax
import Syntax as S
import Term
-- | Returns whether two terms are comparable
@ -24,35 +24,63 @@ type Comparable leaf annotation = Term leaf annotation -> Term leaf annotation -
-- | Constructs a diff from the CofreeF containing its annotation and syntax. This function has the opportunity to, for example, cache properties in the annotation.
type DiffConstructor leaf annotation = CofreeF (Syntax leaf) (Both annotation) (Diff leaf annotation) -> Diff leaf annotation
-- | Diff two terms, given a function that determines whether two terms can be compared and a cost function.
diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double)) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Diff leaf (Record fields)
diffTerms construct comparable cost a b = fromMaybe (pure $ Replace a b) $ constructAndRun construct comparable cost a b
-- | Constructs an algorithm and runs it
constructAndRun :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double)) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields))
constructAndRun construct comparable cost t1 t2
| not $ comparable t1 t2 = Nothing
| (category <$> t1) == (category <$> t2) = hylo construct runCofree <$> zipTerms t1 t2
| otherwise =
run construct comparable cost $ algorithm a b where
algorithm (Indexed a') (Indexed b') = wrap $! ByIndex a' b' (annotate . Indexed)
algorithm (Leaf a') (Leaf b') | a' == b' = annotate $ Leaf b'
algorithm a' b' = wrap $! Recursive (cofree (annotation1 :< a')) (cofree (annotation2 :< b')) pure
(annotation1 :< a, annotation2 :< b) = (runCofree t1, runCofree t2)
annotate = pure . construct . (both annotation1 annotation2 :<)
-- | Diff two terms recursively, given functions characterizing the diffing.
diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double))
=> DiffConstructor leaf (Record fields) -- ^ A function to wrap up & possibly annotate every produced diff.
-> Comparable leaf (Record fields) -- ^ A function to determine whether or not two terms should even be compared.
-> SES.Cost (Diff leaf (Record fields)) -- ^ A function to compute the cost of a given diff node.
-> Term leaf (Record fields) -- ^ A term representing the old state.
-> Term leaf (Record fields) -- ^ A term representing the new state.
-> Diff leaf (Record fields)
diffTerms construct comparable cost a b = fromMaybe (replacing a b) $ diffComparableTerms construct comparable cost a b
-- | Runs the diff algorithm
run :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double)) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Algorithm leaf (Record fields) (Diff leaf (Record fields)) -> Maybe (Diff leaf (Record fields))
run construct comparable cost algorithm = case runFree algorithm of
Pure diff -> Just diff
Free (Recursive t1 t2 f) -> run construct comparable cost . f $ recur a b where
(annotation1 :< a, annotation2 :< b) = (runCofree t1, runCofree t2)
annotate = construct . (both annotation1 annotation2 :<)
-- | Diff two terms recursively, given functions characterizing the diffing. If the terms are incomparable, returns 'Nothing'.
diffComparableTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double)) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields))
diffComparableTerms construct comparable cost = recur
where recur a b
| (category <$> a) == (category <$> b) = hylo construct runCofree <$> zipTerms a b
| comparable a b = runAlgorithm construct recur cost (Just <$> algorithmWithTerms construct a b)
| otherwise = Nothing
recur a b = maybe (pure (Replace t1 t2)) (annotate . fmap diffThese) (galign a b)
-- | Construct an algorithm to diff a pair of terms.
algorithmWithTerms :: (TermF leaf (Both a) diff -> diff) -> Term leaf a -> Term leaf a -> Algorithm (Term leaf a) diff diff
algorithmWithTerms construct t1 t2 = case (unwrap t1, unwrap t2) of
(Indexed a, Indexed b) -> byIndex Indexed a b
(S.FunctionCall identifierA argsA, S.FunctionCall identifierB argsB) -> do
identifier <- recursively identifierA identifierB
byIndex (S.FunctionCall identifier) argsA argsB
(S.Switch exprA casesA, S.Switch exprB casesB) -> do
expr <- recursively exprA exprB
byIndex (S.Switch expr) casesA casesB
(S.Object a, S.Object b) -> byIndex S.Object a b
(Commented commentsA a, Commented commentsB b) -> do
wrapped <- sequenceA (recursively <$> a <*> b)
byIndex (`Commented` wrapped) commentsA commentsB
(Array a, Array b) -> byIndex Array a b
(S.Class identifierA paramsA expressionsA, S.Class identifierB paramsB expressionsB) -> do
identifier <- recursively identifierA identifierB
params <- sequenceA (recursively <$> paramsA <*> paramsB)
byIndex (S.Class identifier params) expressionsA expressionsB
(S.Method identifierA paramsA expressionsA, S.Method identifierB paramsB expressionsB) -> do
identifier <- recursively identifierA identifierB
params <- Algorithm.byIndex paramsA paramsB
expressions <- Algorithm.byIndex expressionsA expressionsB
annotate $! S.Method identifier params expressions
_ -> recursively t1 t2
where annotate = pure . construct . (both (extract t1) (extract t2) :<)
byIndex constructor a b = Algorithm.byIndex a b >>= annotate . constructor
diffThese = these (pure . Delete) (pure . Insert) (diffTerms construct comparable cost)
Free (ByIndex a b f) -> run construct comparable cost . f $ ses (constructAndRun construct comparable cost) cost a b
Free (ByRandomWalkSimilarity a b f) -> run construct comparable cost . f $ rws (constructAndRun construct comparable cost) a b
-- | Run an algorithm, given functions characterizing the evaluation.
runAlgorithm :: (Functor f, GAlign f, Eq a, Eq (Record fields), Eq (f (Cofree f (Record fields))), Prologue.Foldable f, Traversable f, HasField fields (Vector.Vector Double))
=> (CofreeF f (Both (Record fields)) (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -> Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to wrap up & possibly annotate every produced diff.
-> (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function to diff two subterms recursively, if they are comparable, or else return 'Nothing'.
-> SES.Cost (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to compute the cost of a given diff node.
-> Algorithm (Cofree f (Record fields)) (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) a -- ^ The algorithm to run.
-> a
runAlgorithm construct recur cost = F.iter $ \case
Recursive a b f -> f (maybe (replacing a b) (construct . (both (extract a) (extract b) :<)) $ do
aligned <- galign (unwrap a) (unwrap b)
traverse (these (Just . deleting) (Just . inserting) recur) aligned)
ByIndex as bs f -> f (ses recur cost as bs)
BySimilarity as bs f -> f (rws recur as bs)

View File

@ -1,18 +0,0 @@
module Operation where
import Prologue
import Diff
import Term
-- | A single step in a diffing algorithm.
data Operation
a -- ^ The type of leaves in the syntax tree, typically String, but possibly some datatype representing different leaves more precisely.
annotation -- ^ The type of annotations.
f -- ^ The type representing another level of the diffing algorithm. Often Algorithm.
-- | Recursively diff two terms and pass the result to the continuation.
= Recursive (Term a annotation) (Term a annotation) (Diff a annotation -> f)
-- | Diff two dictionaries and pass the result to the continuation.
-- | Diff two arrays and pass the result to the continuation.
| ByIndex [Term a annotation] [Term a annotation] ([Diff a annotation] -> f)
| ByRandomWalkSimilarity [Term a annotation] [Term a annotation] ([Diff a annotation] -> f)
deriving Functor

View File

@ -1,5 +1,8 @@
module Patch
( Patch(..)
, replacing
, inserting
, deleting
, after
, before
, unPatch
@ -18,6 +21,22 @@ data Patch a
| Delete a
deriving (Eq, Foldable, Functor, Generic, Ord, Show, Traversable)
-- DSL
-- | Constructs the replacement of one value by another in an Applicative context.
replacing :: Applicative f => a -> a -> f (Patch a)
replacing = (pure .) . Replace
-- | Constructs the insertion of a value in an Applicative context.
inserting :: Applicative f => a -> f (Patch a)
inserting = pure . Insert
-- | Constructs the deletion of a value in an Applicative context.
deleting :: Applicative f => a -> f (Patch a)
deleting = pure . Delete
-- | Return the item from the after side of the patch.
after :: Patch a -> Maybe a
after = maybeSnd . unPatch

View File

@ -38,8 +38,8 @@ diffAt diffTerms cost (i, j) as bs
| null bs = pure $ foldr delete [] as
| otherwise = pure []
where
delete = consWithCost cost . pure . Delete
insert = consWithCost cost . pure . Insert
delete = consWithCost cost . deleting
insert = consWithCost cost . inserting
costOf [] = 0
costOf ((_, c) : _) = c
best = minimumBy (comparing costOf)

View File

@ -8,12 +8,12 @@ import Test.QuickCheck hiding (Fixed)
import SourceSpan
-- | A node in an abstract syntax tree.
data Syntax
a -- ^ The type of leaves in the syntax tree, typically String, but possibly some datatype representing different leaves more precisely.
f -- ^ The type representing another level of the tree, e.g. the children of branches. Often Cofree or Fix or similar.
=
--
-- 'a' is the type of leaves in the syntax tree, typically 'Text', but possibly some datatype representing different leaves more precisely.
-- 'f' is the type representing another level of the tree, e.g. the children of branches. Often 'Cofree', 'Free' or similar.
data Syntax a f
-- | A terminal syntax node, e.g. an identifier, or atomic literal.
Leaf a
= Leaf a
-- | An ordered branch of child nodes, expected to be variadic in the grammar, e.g. a list of statements or uncurried function parameters.
| Indexed [f]
-- | An ordered branch of child nodes, expected to be of fixed length in the grammar, e.g. a binary operator & its operands.