mirror of
https://github.com/github/semantic.git
synced 2025-01-05 14:11:33 +03:00
Keep a hash for every node instead of a vector
This commit is contained in:
parent
daba1fb2ef
commit
d16cdefe34
@ -34,7 +34,7 @@ import Test.QuickCheck.Random
|
|||||||
import qualified Data.PQueue.Prio.Max as PQueue
|
import qualified Data.PQueue.Prio.Max as PQueue
|
||||||
|
|
||||||
-- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf).
|
-- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf).
|
||||||
rws :: forall f fields. (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int))
|
rws :: forall f fields. (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double), HasField fields Int)
|
||||||
=> (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function which compares a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared.
|
=> (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function which compares a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared.
|
||||||
-> [Cofree f (Record fields)] -- ^ The list of old terms.
|
-> [Cofree f (Record fields)] -- ^ The list of old terms.
|
||||||
-> [Cofree f (Record fields)] -- ^ The list of new terms.
|
-> [Cofree f (Record fields)] -- ^ The list of new terms.
|
||||||
@ -53,7 +53,7 @@ rws compare as bs
|
|||||||
fmap snd
|
fmap snd
|
||||||
-- Modified xydiff + RWS
|
-- Modified xydiff + RWS
|
||||||
-- 1. Annotate each node with a unique key top down based off its categories and termIndex?
|
-- 1. Annotate each node with a unique key top down based off its categories and termIndex?
|
||||||
-- 1. Construct two priority queues of hash values for each node ordered by max weight
|
-- 2. Construct two priority queues of hash values for each node ordered by max weight
|
||||||
-- 3. Try to find matchings starting with the heaviest nodes
|
-- 3. Try to find matchings starting with the heaviest nodes
|
||||||
-- 4. Use structure to propagate matchings?
|
-- 4. Use structure to propagate matchings?
|
||||||
-- 5. Compute the diff
|
-- 5. Compute the diff
|
||||||
@ -62,14 +62,14 @@ rws compare as bs
|
|||||||
queueBs = PQueue.fromList (zipWith hashabilize [0..] bs)
|
queueBs = PQueue.fromList (zipWith hashabilize [0..] bs)
|
||||||
(fas, fbs) = dropEqualTerms queueAs queueBs
|
(fas, fbs) = dropEqualTerms queueAs queueBs
|
||||||
|
|
||||||
dropEqualTerms :: PQueue.MaxPQueue (Vector.Vector Int) (UnmappedHashTerm (Cofree f (Record fields))) -> PQueue.MaxPQueue (Vector.Vector Int) (UnmappedHashTerm (Cofree f (Record fields))) -> ([UnmappedTerm (Cofree f (Record fields))], [UnmappedTerm (Cofree f (Record fields))])
|
dropEqualTerms :: PQueue.MaxPQueue Int (UnmappedHashTerm (Cofree f (Record fields))) -> PQueue.MaxPQueue Int (UnmappedHashTerm (Cofree f (Record fields))) -> ([UnmappedTerm (Cofree f (Record fields))], [UnmappedTerm (Cofree f (Record fields))])
|
||||||
dropEqualTerms as bs = (unmappedAs, unmappedBs)
|
dropEqualTerms as bs = (unmappedAs, unmappedBs)
|
||||||
where
|
where
|
||||||
(unmappedAs, unmappedBs, _, _) = dropEqualTerms' ([], [], as, bs)
|
(unmappedAs, unmappedBs, _, _) = dropEqualTerms' ([], [], as, bs)
|
||||||
|
|
||||||
dropEqualTerms' (as', bs', queueA, queueB) = case (PQueue.maxViewWithKey queueA, PQueue.maxViewWithKey queueB) of
|
dropEqualTerms' (as', bs', queueA, queueB) = case (PQueue.maxViewWithKey queueA, PQueue.maxViewWithKey queueB) of
|
||||||
(Just ((kA, a), queueA'), Just ((kB, b), queueB')) ->
|
(Just ((kA, a), queueA'), Just ((kB, b), queueB')) ->
|
||||||
if kA /= kB
|
if hashInt a /= hashInt b
|
||||||
then dropEqualTerms' (toUnmappedTerm a : as', toUnmappedTerm b : bs', queueA', queueB')
|
then dropEqualTerms' (toUnmappedTerm a : as', toUnmappedTerm b : bs', queueA', queueB')
|
||||||
else dropEqualTerms' (as', bs', queueA', queueB')
|
else dropEqualTerms' (as', bs', queueA', queueB')
|
||||||
(Just ((_, a), queueA'), Nothing) -> (toUnmappedTerm a : as', bs', queueA', queueB)
|
(Just ((_, a), queueA'), Nothing) -> (toUnmappedTerm a : as', bs', queueA', queueB)
|
||||||
@ -77,9 +77,9 @@ rws compare as bs
|
|||||||
(Nothing, Nothing) -> (as', bs', queueA, queueB)
|
(Nothing, Nothing) -> (as', bs', queueA, queueB)
|
||||||
toUnmappedTerm (UnmappedHashTerm index _ term) = UnmappedTerm index (getField (extract term)) term
|
toUnmappedTerm (UnmappedHashTerm index _ term) = UnmappedTerm index (getField (extract term)) term
|
||||||
|
|
||||||
hashabilize :: (HasField fields (Vector.Vector Int)) => Int -> Cofree f (Record fields) -> (Vector.Vector Int, UnmappedHashTerm (Cofree f (Record fields)))
|
hashabilize :: (HasField fields Int) => Int -> Cofree f (Record fields) -> (Int, UnmappedHashTerm (Cofree f (Record fields)))
|
||||||
hashabilize index term = (hash, UnmappedHashTerm index hash term)
|
hashabilize index term = (termSize term, UnmappedHashTerm index hash term)
|
||||||
where hash = (getField (extract term) :: Vector.Vector Int)
|
where hash = (getField (extract term) :: Int)
|
||||||
|
|
||||||
kdas = KdTree.build (Vector.toList . feature) fas
|
kdas = KdTree.build (Vector.toList . feature) fas
|
||||||
kdbs = KdTree.build (Vector.toList . feature) fbs
|
kdbs = KdTree.build (Vector.toList . feature) fbs
|
||||||
@ -142,7 +142,7 @@ defaultM = 10
|
|||||||
data UnmappedTerm a = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature :: !(Vector.Vector Double), term :: !a }
|
data UnmappedTerm a = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature :: !(Vector.Vector Double), term :: !a }
|
||||||
deriving Eq
|
deriving Eq
|
||||||
|
|
||||||
data UnmappedHashTerm a = UnmappedHashTerm { hashTermIndex :: {-# UNPACK #-} !Int, hashVector :: !(Vector.Vector Int), hashTerm :: !a }
|
data UnmappedHashTerm a = UnmappedHashTerm { hashTermIndex :: {-# UNPACK #-} !Int, hashInt :: !Int, hashTerm :: !a }
|
||||||
deriving Eq
|
deriving Eq
|
||||||
|
|
||||||
|
|
||||||
@ -151,9 +151,10 @@ data Gram label = Gram { stem :: [Maybe label], base :: [Maybe label] }
|
|||||||
deriving (Eq, Show)
|
deriving (Eq, Show)
|
||||||
|
|
||||||
-- -- | Annotates a term with it's hash at each node.
|
-- -- | Annotates a term with it's hash at each node.
|
||||||
hashDecorator :: (Hashable hash, Traversable f) => (forall b. CofreeF f (Record fields) b -> hash) -> Cofree f (Record fields) -> Cofree f (Record (Vector.Vector Int ': fields))
|
hashDecorator :: forall hash f fields. (Hashable (f Int), Hashable hash, Traversable f) => (forall b. CofreeF f (Record fields) b -> hash) -> Cofree f (Record fields) -> Cofree f (Record (Int ': fields))
|
||||||
hashDecorator getHash = cata $ \case
|
hashDecorator getLabel = cata $ \case
|
||||||
term@(record :< functor) -> cofree ((foldr (Vector.zipWith (+) . getField . extract) (Vector.singleton . hash $ getHash term) functor .: record) :< functor)
|
term@(record :< functor) ->
|
||||||
|
cofree ((hash (getLabel term, fmap (getField . extract) functor :: f Int) .: record) :< functor)
|
||||||
|
|
||||||
-- | Annotates a term with a feature vector at each node, using the default values for the p, q, and d parameters.
|
-- | Annotates a term with a feature vector at each node, using the default values for the p, q, and d parameters.
|
||||||
defaultFeatureVectorDecorator :: (Hashable label, Traversable f) => (forall b. CofreeF f (Record fields) b -> label) -> Cofree f (Record fields) -> Cofree f (Record (Vector.Vector Double ': fields))
|
defaultFeatureVectorDecorator :: (Hashable label, Traversable f) => (forall b. CofreeF f (Record fields) b -> label) -> Cofree f (Record fields) -> Cofree f (Record (Vector.Vector Double ': fields))
|
||||||
|
@ -44,9 +44,9 @@ import Data.Hashable
|
|||||||
-- | result.
|
-- | result.
|
||||||
-- | Returns the rendered result strictly, so it's always fully evaluated
|
-- | Returns the rendered result strictly, so it's always fully evaluated
|
||||||
-- | with respect to other IO actions.
|
-- | with respect to other IO actions.
|
||||||
diffFiles :: (HasField fields Category, HasField fields Cost, HasField fields Range, Eq (Record fields)) => Parser (Syntax Text) (Record fields) -> Renderer (Record (Vector.Vector Double ': Vector.Vector Int ': fields)) -> Both SourceBlob -> IO Output
|
diffFiles :: (HasField fields Category, HasField fields Cost, HasField fields Range, Eq (Record fields)) => Parser (Syntax Text) (Record fields) -> Renderer (Record (Vector.Vector Double ': Int ': fields)) -> Both SourceBlob -> IO Output
|
||||||
diffFiles parser renderer sourceBlobs = do
|
diffFiles parser renderer sourceBlobs = do
|
||||||
terms <- traverse (fmap (defaultFeatureVectorDecorator getLabel . hashDecorator getHash) . parser) sourceBlobs
|
terms <- traverse (fmap (defaultFeatureVectorDecorator getLabel . hashDecorator getLabel) . parser) sourceBlobs
|
||||||
|
|
||||||
let areNullOids = runBothWith (\a b -> (oid a == nullOid || null (source a), oid b == nullOid || null (source b))) sourceBlobs
|
let areNullOids = runBothWith (\a b -> (oid a == nullOid || null (source a), oid b == nullOid || null (source b))) sourceBlobs
|
||||||
let textDiff = case areNullOids of
|
let textDiff = case areNullOids of
|
||||||
@ -64,8 +64,8 @@ diffFiles parser renderer sourceBlobs = do
|
|||||||
Pure patch -> uncurry both (fromThese 0 0 (unPatch (cost . extract <$> patch)))
|
Pure patch -> uncurry both (fromThese 0 0 (unPatch (cost . extract <$> patch)))
|
||||||
getLabel (h :< t) = (category h, case t of
|
getLabel (h :< t) = (category h, case t of
|
||||||
Leaf s -> Just s
|
Leaf s -> Just s
|
||||||
|
Syntax.Comment s -> Just s
|
||||||
_ -> Nothing)
|
_ -> Nothing)
|
||||||
getHash (h :< _) = (category h)
|
|
||||||
|
|
||||||
-- | Return a parser based on the file extension (including the ".").
|
-- | Return a parser based on the file extension (including the ".").
|
||||||
parserForType :: Text -> Parser (Syntax Text) (Record '[Range, Category])
|
parserForType :: Text -> Parser (Syntax Text) (Record '[Range, Category])
|
||||||
|
@ -25,7 +25,7 @@ type Comparable leaf annotation = Term leaf annotation -> Term leaf annotation -
|
|||||||
type DiffConstructor leaf annotation = CofreeF (Syntax leaf) (Both annotation) (Diff leaf annotation) -> Diff leaf annotation
|
type DiffConstructor leaf annotation = CofreeF (Syntax leaf) (Both annotation) (Diff leaf annotation) -> Diff leaf annotation
|
||||||
|
|
||||||
-- | Diff two terms recursively, given functions characterizing the diffing.
|
-- | Diff two terms recursively, given functions characterizing the diffing.
|
||||||
diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int))
|
diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields Int)
|
||||||
=> DiffConstructor leaf (Record fields) -- ^ A function to wrap up & possibly annotate every produced diff.
|
=> DiffConstructor leaf (Record fields) -- ^ A function to wrap up & possibly annotate every produced diff.
|
||||||
-> Comparable leaf (Record fields) -- ^ A function to determine whether or not two terms should even be compared.
|
-> Comparable leaf (Record fields) -- ^ A function to determine whether or not two terms should even be compared.
|
||||||
-> SES.Cost (Diff leaf (Record fields)) -- ^ A function to compute the cost of a given diff node.
|
-> SES.Cost (Diff leaf (Record fields)) -- ^ A function to compute the cost of a given diff node.
|
||||||
@ -35,7 +35,7 @@ diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fi
|
|||||||
diffTerms construct comparable cost a b = fromMaybe (replacing a b) $ diffComparableTerms construct comparable cost a b
|
diffTerms construct comparable cost a b = fromMaybe (replacing a b) $ diffComparableTerms construct comparable cost a b
|
||||||
|
|
||||||
-- | Diff two terms recursively, given functions characterizing the diffing. If the terms are incomparable, returns 'Nothing'.
|
-- | Diff two terms recursively, given functions characterizing the diffing. If the terms are incomparable, returns 'Nothing'.
|
||||||
diffComparableTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int)) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields))
|
diffComparableTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields Int) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields))
|
||||||
diffComparableTerms construct comparable cost = recur
|
diffComparableTerms construct comparable cost = recur
|
||||||
where recur a b
|
where recur a b
|
||||||
| (category <$> a) == (category <$> b) = hylo construct runCofree <$> zipTerms a b
|
| (category <$> a) == (category <$> b) = hylo construct runCofree <$> zipTerms a b
|
||||||
@ -71,7 +71,7 @@ algorithmWithTerms construct t1 t2 = case (unwrap t1, unwrap t2) of
|
|||||||
branch constructor a b = bySimilarity a b >>= annotate . constructor
|
branch constructor a b = bySimilarity a b >>= annotate . constructor
|
||||||
|
|
||||||
-- | Run an algorithm, given functions characterizing the evaluation.
|
-- | Run an algorithm, given functions characterizing the evaluation.
|
||||||
runAlgorithm :: (Functor f, GAlign f, Eq a, Eq (Record fields), Eq (f (Cofree f (Record fields))), Prologue.Foldable f, Traversable f, HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int))
|
runAlgorithm :: (Functor f, GAlign f, Eq a, Eq (Record fields), Eq (f (Cofree f (Record fields))), Prologue.Foldable f, Traversable f, HasField fields (Vector.Vector Double), HasField fields Int)
|
||||||
=> (CofreeF f (Both (Record fields)) (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -> Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to wrap up & possibly annotate every produced diff.
|
=> (CofreeF f (Both (Record fields)) (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -> Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to wrap up & possibly annotate every produced diff.
|
||||||
-> (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function to diff two subterms recursively, if they are comparable, or else return 'Nothing'.
|
-> (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function to diff two subterms recursively, if they are comparable, or else return 'Nothing'.
|
||||||
-> SES.Cost (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to compute the cost of a given diff node.
|
-> SES.Cost (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to compute the cost of a given diff node.
|
||||||
|
@ -6,6 +6,7 @@ import GHC.Generics
|
|||||||
import Prologue
|
import Prologue
|
||||||
import Test.QuickCheck hiding (Fixed)
|
import Test.QuickCheck hiding (Fixed)
|
||||||
import SourceSpan
|
import SourceSpan
|
||||||
|
import Data.Hashable
|
||||||
|
|
||||||
-- | A node in an abstract syntax tree.
|
-- | A node in an abstract syntax tree.
|
||||||
--
|
--
|
||||||
@ -72,7 +73,7 @@ data Syntax a f
|
|||||||
-- | A method definition with an identifier, params, and a list of expressions.
|
-- | A method definition with an identifier, params, and a list of expressions.
|
||||||
| Method f [f] [f]
|
| Method f [f] [f]
|
||||||
| If f f (Maybe f)
|
| If f f (Maybe f)
|
||||||
deriving (Eq, Foldable, Functor, Generic, Generic1, Mergeable, Ord, Show, Traversable)
|
deriving (Eq, Foldable, Functor, Generic, Generic1, Mergeable, Ord, Show, Traversable, Hashable)
|
||||||
|
|
||||||
-- Instances
|
-- Instances
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user