From 3e94a6a9dda7bbd5a41875d1d112ab4cc753a5c1 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 17:28:35 -0400 Subject: [PATCH 01/10] Construct IntMaps of the lists of inputs. --- src/Data/RandomWalkSimilarity.hs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 19e86afaa..d1d8d5510 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -20,6 +20,7 @@ import Control.Monad.State import Data.Functor.Both hiding (fst, snd) import Data.Functor.Foldable as Foldable import Data.Hashable +import qualified Data.IntMap as IntMap import qualified Data.KdTree.Static as KdTree import qualified Data.List as List import Data.Record @@ -42,11 +43,11 @@ rws compare as bs | null as, null bs = [] | null as = inserting <$> bs | null bs = deleting <$> as - | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, fas, fbs)) $ traverse findNearestNeighbourTo fbs - where fas = zipWith featurize [0..] as - fbs = zipWith featurize [0..] bs - kdas = KdTree.build (Vector.toList . feature) fas - kdbs = KdTree.build (Vector.toList . feature) fbs + | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, (toList fas), (toList fbs))) $ traverse findNearestNeighbourTo (toList fbs) + where fas = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] as + fbs = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] bs + kdas = KdTree.build (Vector.toList . feature) (toList fas) + kdbs = KdTree.build (Vector.toList . feature) (toList fbs) featurize index term = UnmappedTerm index (getField (extract term)) term findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do (previous, unmappedA, unmappedB) <- get From bff81035d8ae3a39236d61e0ef8868e3ecdb5cdf Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 17:41:27 -0400 Subject: [PATCH 02/10] Traverse & delete remaining unmapped terms in IntMap. --- src/Data/RandomWalkSimilarity.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index d1d8d5510..335946fbb 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -43,7 +43,7 @@ rws compare as bs | null as, null bs = [] | null as = inserting <$> bs | null bs = deleting <$> as - | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, (toList fas), (toList fbs))) $ traverse findNearestNeighbourTo (toList fbs) + | otherwise = toList . uncurry deleteRemaining . first IntMap.fromList . (`runState` (negate 1, (toList fas), (toList fbs))) . fmap toList $ traverse findNearestNeighbourTo fbs where fas = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] as fbs = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] bs kdas = KdTree.build (Vector.toList . feature) (toList fas) @@ -72,7 +72,7 @@ rws compare as bs put (previous, unmappedA, List.delete kv unmappedB) pure (negate 1, inserting b) - deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmappedA) + deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (uncurry IntMap.insert)) diffs ((termIndex &&& deleting . term) <$> unmappedA) -- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost. editDistanceUpTo :: (Prologue.Foldable f, Functor f) => Integer -> Free (CofreeF f (Both a)) (Patch (Cofree f a)) -> Int From 124e7b51da8719cecc666ab5cd269804014d259a Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 17:49:33 -0400 Subject: [PATCH 03/10] Revert "Traverse & delete remaining unmapped terms in IntMap." This reverts commit 7f3f842657b8cd9b1e5c60ae81e5fcaa44f1b1e4. --- src/Data/RandomWalkSimilarity.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 335946fbb..d1d8d5510 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -43,7 +43,7 @@ rws compare as bs | null as, null bs = [] | null as = inserting <$> bs | null bs = deleting <$> as - | otherwise = toList . uncurry deleteRemaining . first IntMap.fromList . (`runState` (negate 1, (toList fas), (toList fbs))) . fmap toList $ traverse findNearestNeighbourTo fbs + | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, (toList fas), (toList fbs))) $ traverse findNearestNeighbourTo (toList fbs) where fas = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] as fbs = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] bs kdas = KdTree.build (Vector.toList . feature) (toList fas) @@ -72,7 +72,7 @@ rws compare as bs put (previous, unmappedA, List.delete kv unmappedB) pure (negate 1, inserting b) - deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (uncurry IntMap.insert)) diffs ((termIndex &&& deleting . term) <$> unmappedA) + deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmappedA) -- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost. editDistanceUpTo :: (Prologue.Foldable f, Functor f) => Integer -> Free (CofreeF f (Both a)) (Patch (Cofree f a)) -> Int From 10ca6f60b679b9865510c233e4a663f62bb0a744 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 17:49:40 -0400 Subject: [PATCH 04/10] Revert "Construct IntMaps of the lists of inputs." This reverts commit 8d62477ee805d45603d6c84fdeea59b341153c00. --- src/Data/RandomWalkSimilarity.hs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index d1d8d5510..19e86afaa 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -20,7 +20,6 @@ import Control.Monad.State import Data.Functor.Both hiding (fst, snd) import Data.Functor.Foldable as Foldable import Data.Hashable -import qualified Data.IntMap as IntMap import qualified Data.KdTree.Static as KdTree import qualified Data.List as List import Data.Record @@ -43,11 +42,11 @@ rws compare as bs | null as, null bs = [] | null as = inserting <$> bs | null bs = deleting <$> as - | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, (toList fas), (toList fbs))) $ traverse findNearestNeighbourTo (toList fbs) - where fas = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] as - fbs = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] bs - kdas = KdTree.build (Vector.toList . feature) (toList fas) - kdbs = KdTree.build (Vector.toList . feature) (toList fbs) + | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, fas, fbs)) $ traverse findNearestNeighbourTo fbs + where fas = zipWith featurize [0..] as + fbs = zipWith featurize [0..] bs + kdas = KdTree.build (Vector.toList . feature) fas + kdbs = KdTree.build (Vector.toList . feature) fbs featurize index term = UnmappedTerm index (getField (extract term)) term findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do (previous, unmappedA, unmappedB) <- get From 1b2271e47957aed832a1382cc2f94b34a7b5a4ec Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 17:53:17 -0400 Subject: [PATCH 05/10] Maintain IntMaps of unmapped elements. --- src/Data/RandomWalkSimilarity.hs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 19e86afaa..33f433d64 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -20,6 +20,7 @@ import Control.Monad.State import Data.Functor.Both hiding (fst, snd) import Data.Functor.Foldable as Foldable import Data.Hashable +import qualified Data.IntMap as IntMap import qualified Data.KdTree.Static as KdTree import qualified Data.List as List import Data.Record @@ -42,22 +43,23 @@ rws compare as bs | null as, null bs = [] | null as = inserting <$> bs | null bs = deleting <$> as - | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, fas, fbs)) $ traverse findNearestNeighbourTo fbs + | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, toMap fas, toMap fbs)) $ traverse findNearestNeighbourTo fbs where fas = zipWith featurize [0..] as fbs = zipWith featurize [0..] bs kdas = KdTree.build (Vector.toList . feature) fas kdbs = KdTree.build (Vector.toList . feature) fbs featurize index term = UnmappedTerm index (getField (extract term)) term + toMap = IntMap.fromList . fmap (termIndex &&& identity) findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do (previous, unmappedA, unmappedB) <- get fromMaybe (insertion previous unmappedA unmappedB kv) $ do - foundA@(UnmappedTerm i _ a) <- nearestUnmapped unmappedA kdas kv - foundB@(UnmappedTerm j' _ _) <- nearestUnmapped unmappedB kdbs foundA + foundA@(UnmappedTerm i _ a) <- nearestUnmapped (toList unmappedA) kdas kv + foundB@(UnmappedTerm j' _ _) <- nearestUnmapped (toList unmappedB) kdbs foundA guard (j == j') guard (previous <= i && i <= previous + defaultMoveBound) compared <- compare a b pure $! do - put (i, List.delete foundA unmappedA, List.delete foundB unmappedB) + put (i, IntMap.delete i unmappedA, IntMap.delete j unmappedB) pure (i, compared) -- | Finds the most-similar unmapped term to the passed-in term, if any. @@ -67,8 +69,8 @@ rws compare as bs -- cf §4.2 of RWS-Diff nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo defaultM) . compare (term key) . term) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree defaultL key))) - insertion previous unmappedA unmappedB kv@(UnmappedTerm _ _ b) = do - put (previous, unmappedA, List.delete kv unmappedB) + insertion previous unmappedA unmappedB (UnmappedTerm j _ b) = do + put (previous, unmappedA, IntMap.delete j unmappedB) pure (negate 1, inserting b) deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmappedA) From 1627ab0e268d761869a285afe44f4f2f60657908 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 17:53:29 -0400 Subject: [PATCH 06/10] :fire: a redundant binding. --- src/Data/RandomWalkSimilarity.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 33f433d64..5a40fa4ef 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -54,7 +54,7 @@ rws compare as bs (previous, unmappedA, unmappedB) <- get fromMaybe (insertion previous unmappedA unmappedB kv) $ do foundA@(UnmappedTerm i _ a) <- nearestUnmapped (toList unmappedA) kdas kv - foundB@(UnmappedTerm j' _ _) <- nearestUnmapped (toList unmappedB) kdbs foundA + UnmappedTerm j' _ _ <- nearestUnmapped (toList unmappedB) kdbs foundA guard (j == j') guard (previous <= i && i <= previous + defaultMoveBound) compared <- compare a b From 1385e9193fe1f94b45f50fa000574d06f63790be Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Fri, 19 Aug 2016 11:35:32 -0400 Subject: [PATCH 07/10] :fire: a redundant import. --- src/Data/RandomWalkSimilarity.hs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 5a40fa4ef..0c4f8fedf 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -27,7 +27,6 @@ import Data.Record import qualified Data.Vector as Vector import Patch import Prologue -import Term () import Test.QuickCheck hiding (Fixed) import Test.QuickCheck.Random import Data.List (intersectBy) From db6679a9125dea169852e76005ec1399e89fa8cd Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Fri, 19 Aug 2016 11:36:46 -0400 Subject: [PATCH 08/10] Use IntMap intersections to define nearestUnmapped. --- src/Data/RandomWalkSimilarity.hs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 0c4f8fedf..4bdcf2763 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -27,10 +27,9 @@ import Data.Record import qualified Data.Vector as Vector import Patch import Prologue +import Term (termSize) import Test.QuickCheck hiding (Fixed) import Test.QuickCheck.Random -import Data.List (intersectBy) -import Term (termSize) -- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf). rws :: (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double)) @@ -52,8 +51,8 @@ rws compare as bs findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do (previous, unmappedA, unmappedB) <- get fromMaybe (insertion previous unmappedA unmappedB kv) $ do - foundA@(UnmappedTerm i _ a) <- nearestUnmapped (toList unmappedA) kdas kv - UnmappedTerm j' _ _ <- nearestUnmapped (toList unmappedB) kdbs foundA + foundA@(UnmappedTerm i _ a) <- nearestUnmapped unmappedA kdas kv + UnmappedTerm j' _ _ <- nearestUnmapped unmappedB kdbs foundA guard (j == j') guard (previous <= i && i <= previous + defaultMoveBound) compared <- compare a b @@ -66,7 +65,7 @@ rws compare as bs -- RWS can produce false positives in the case of e.g. hash collisions. Therefore, we find the _l_ nearest candidates, filter out any which have already been mapped, and select the minimum of the remaining by (a constant-time approximation of) edit distance. -- -- cf §4.2 of RWS-Diff - nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo defaultM) . compare (term key) . term) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree defaultL key))) + nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo defaultM) . compare (term key) . term) (toList (IntMap.intersection unmapped (toMap (KdTree.kNearest tree defaultL key))))) insertion previous unmappedA unmappedB (UnmappedTerm j _ b) = do put (previous, unmappedA, IntMap.delete j unmappedB) From b5c67da74b19a740c26ecaa425c342330924d8c4 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Fri, 19 Aug 2016 13:10:35 -0400 Subject: [PATCH 09/10] Add a type signature for nearestUnmapped. --- src/Data/RandomWalkSimilarity.hs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 4bdcf2763..2dbf778c1 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE DataKinds, GADTs, RankNTypes, TypeOperators #-} +{-# LANGUAGE DataKinds, GADTs, RankNTypes, ScopedTypeVariables, TypeOperators #-} module Data.RandomWalkSimilarity ( rws , pqGramDecorator @@ -32,7 +32,7 @@ import Test.QuickCheck hiding (Fixed) import Test.QuickCheck.Random -- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf). -rws :: (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double)) +rws :: forall f fields. (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double)) => (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function which compares a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared. -> [Cofree f (Record fields)] -- ^ The list of old terms. -> [Cofree f (Record fields)] -- ^ The list of new terms. @@ -65,6 +65,7 @@ rws compare as bs -- RWS can produce false positives in the case of e.g. hash collisions. Therefore, we find the _l_ nearest candidates, filter out any which have already been mapped, and select the minimum of the remaining by (a constant-time approximation of) edit distance. -- -- cf §4.2 of RWS-Diff + nearestUnmapped :: IntMap (UnmappedTerm (Cofree f (Record fields))) -> KdTree.KdTree Double (UnmappedTerm (Cofree f (Record fields))) -> UnmappedTerm (Cofree f (Record fields)) -> Maybe (UnmappedTerm (Cofree f (Record fields))) nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo defaultM) . compare (term key) . term) (toList (IntMap.intersection unmapped (toMap (KdTree.kNearest tree defaultL key))))) insertion previous unmappedA unmappedB (UnmappedTerm j _ b) = do From b0c8f93d700087095567ae308ee0715a11534e3e Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Fri, 19 Aug 2016 13:12:59 -0400 Subject: [PATCH 10/10] Add a type signature for findNearestNeighbourTo. --- src/Data/RandomWalkSimilarity.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 2dbf778c1..aa382df86 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -48,6 +48,7 @@ rws compare as bs kdbs = KdTree.build (Vector.toList . feature) fbs featurize index term = UnmappedTerm index (getField (extract term)) term toMap = IntMap.fromList . fmap (termIndex &&& identity) + findNearestNeighbourTo :: UnmappedTerm (Cofree f (Record fields)) -> State (Int, IntMap (UnmappedTerm (Cofree f (Record fields))), IntMap (UnmappedTerm (Cofree f (Record fields)))) (Int, Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do (previous, unmappedA, unmappedB) <- get fromMaybe (insertion previous unmappedA unmappedB kv) $ do