From 3e94a6a9dda7bbd5a41875d1d112ab4cc753a5c1 Mon Sep 17 00:00:00 2001
From: Rob Rix <rob.rix@github.com>
Date: Thu, 18 Aug 2016 17:28:35 -0400
Subject: [PATCH 01/10] Construct IntMaps of the lists of inputs.

---
 src/Data/RandomWalkSimilarity.hs | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs
index 19e86afaa..d1d8d5510 100644
--- a/src/Data/RandomWalkSimilarity.hs
+++ b/src/Data/RandomWalkSimilarity.hs
@@ -20,6 +20,7 @@ import Control.Monad.State
 import Data.Functor.Both hiding (fst, snd)
 import Data.Functor.Foldable as Foldable
 import Data.Hashable
+import qualified Data.IntMap as IntMap
 import qualified Data.KdTree.Static as KdTree
 import qualified Data.List as List
 import Data.Record
@@ -42,11 +43,11 @@ rws compare as bs
   | null as, null bs = []
   | null as = inserting <$> bs
   | null bs = deleting <$> as
-  | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, fas, fbs)) $ traverse findNearestNeighbourTo fbs
-  where fas = zipWith featurize [0..] as
-        fbs = zipWith featurize [0..] bs
-        kdas = KdTree.build (Vector.toList . feature) fas
-        kdbs = KdTree.build (Vector.toList . feature) fbs
+  | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, (toList fas), (toList fbs))) $ traverse findNearestNeighbourTo (toList fbs)
+  where fas = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] as
+        fbs = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] bs
+        kdas = KdTree.build (Vector.toList . feature) (toList fas)
+        kdbs = KdTree.build (Vector.toList . feature) (toList fbs)
         featurize index term = UnmappedTerm index (getField (extract term)) term
         findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do
           (previous, unmappedA, unmappedB) <- get

From bff81035d8ae3a39236d61e0ef8868e3ecdb5cdf Mon Sep 17 00:00:00 2001
From: Rob Rix <rob.rix@github.com>
Date: Thu, 18 Aug 2016 17:41:27 -0400
Subject: [PATCH 02/10] Traverse & delete remaining unmapped terms in IntMap.

---
 src/Data/RandomWalkSimilarity.hs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs
index d1d8d5510..335946fbb 100644
--- a/src/Data/RandomWalkSimilarity.hs
+++ b/src/Data/RandomWalkSimilarity.hs
@@ -43,7 +43,7 @@ rws compare as bs
   | null as, null bs = []
   | null as = inserting <$> bs
   | null bs = deleting <$> as
-  | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, (toList fas), (toList fbs))) $ traverse findNearestNeighbourTo (toList fbs)
+  | otherwise = toList . uncurry deleteRemaining . first IntMap.fromList . (`runState` (negate 1, (toList fas), (toList fbs))) . fmap toList $ traverse findNearestNeighbourTo fbs
   where fas = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] as
         fbs = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] bs
         kdas = KdTree.build (Vector.toList . feature) (toList fas)
@@ -72,7 +72,7 @@ rws compare as bs
           put (previous, unmappedA, List.delete kv unmappedB)
           pure (negate 1, inserting b)
 
-        deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmappedA)
+        deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (uncurry IntMap.insert)) diffs ((termIndex &&& deleting . term) <$> unmappedA)
 
 -- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost.
 editDistanceUpTo :: (Prologue.Foldable f, Functor f) => Integer -> Free (CofreeF f (Both a)) (Patch (Cofree f a)) -> Int

From 124e7b51da8719cecc666ab5cd269804014d259a Mon Sep 17 00:00:00 2001
From: Rob Rix <rob.rix@github.com>
Date: Thu, 18 Aug 2016 17:49:33 -0400
Subject: [PATCH 03/10] Revert "Traverse & delete remaining unmapped terms in
 IntMap."

This reverts commit 7f3f842657b8cd9b1e5c60ae81e5fcaa44f1b1e4.
---
 src/Data/RandomWalkSimilarity.hs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs
index 335946fbb..d1d8d5510 100644
--- a/src/Data/RandomWalkSimilarity.hs
+++ b/src/Data/RandomWalkSimilarity.hs
@@ -43,7 +43,7 @@ rws compare as bs
   | null as, null bs = []
   | null as = inserting <$> bs
   | null bs = deleting <$> as
-  | otherwise = toList . uncurry deleteRemaining . first IntMap.fromList . (`runState` (negate 1, (toList fas), (toList fbs))) . fmap toList $ traverse findNearestNeighbourTo fbs
+  | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, (toList fas), (toList fbs))) $ traverse findNearestNeighbourTo (toList fbs)
   where fas = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] as
         fbs = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] bs
         kdas = KdTree.build (Vector.toList . feature) (toList fas)
@@ -72,7 +72,7 @@ rws compare as bs
           put (previous, unmappedA, List.delete kv unmappedB)
           pure (negate 1, inserting b)
 
-        deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (uncurry IntMap.insert)) diffs ((termIndex &&& deleting . term) <$> unmappedA)
+        deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmappedA)
 
 -- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost.
 editDistanceUpTo :: (Prologue.Foldable f, Functor f) => Integer -> Free (CofreeF f (Both a)) (Patch (Cofree f a)) -> Int

From 10ca6f60b679b9865510c233e4a663f62bb0a744 Mon Sep 17 00:00:00 2001
From: Rob Rix <rob.rix@github.com>
Date: Thu, 18 Aug 2016 17:49:40 -0400
Subject: [PATCH 04/10] Revert "Construct IntMaps of the lists of inputs."

This reverts commit 8d62477ee805d45603d6c84fdeea59b341153c00.
---
 src/Data/RandomWalkSimilarity.hs | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs
index d1d8d5510..19e86afaa 100644
--- a/src/Data/RandomWalkSimilarity.hs
+++ b/src/Data/RandomWalkSimilarity.hs
@@ -20,7 +20,6 @@ import Control.Monad.State
 import Data.Functor.Both hiding (fst, snd)
 import Data.Functor.Foldable as Foldable
 import Data.Hashable
-import qualified Data.IntMap as IntMap
 import qualified Data.KdTree.Static as KdTree
 import qualified Data.List as List
 import Data.Record
@@ -43,11 +42,11 @@ rws compare as bs
   | null as, null bs = []
   | null as = inserting <$> bs
   | null bs = deleting <$> as
-  | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, (toList fas), (toList fbs))) $ traverse findNearestNeighbourTo (toList fbs)
-  where fas = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] as
-        fbs = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] bs
-        kdas = KdTree.build (Vector.toList . feature) (toList fas)
-        kdbs = KdTree.build (Vector.toList . feature) (toList fbs)
+  | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, fas, fbs)) $ traverse findNearestNeighbourTo fbs
+  where fas = zipWith featurize [0..] as
+        fbs = zipWith featurize [0..] bs
+        kdas = KdTree.build (Vector.toList . feature) fas
+        kdbs = KdTree.build (Vector.toList . feature) fbs
         featurize index term = UnmappedTerm index (getField (extract term)) term
         findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do
           (previous, unmappedA, unmappedB) <- get

From 1b2271e47957aed832a1382cc2f94b34a7b5a4ec Mon Sep 17 00:00:00 2001
From: Rob Rix <rob.rix@github.com>
Date: Thu, 18 Aug 2016 17:53:17 -0400
Subject: [PATCH 05/10] Maintain IntMaps of unmapped elements.

---
 src/Data/RandomWalkSimilarity.hs | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs
index 19e86afaa..33f433d64 100644
--- a/src/Data/RandomWalkSimilarity.hs
+++ b/src/Data/RandomWalkSimilarity.hs
@@ -20,6 +20,7 @@ import Control.Monad.State
 import Data.Functor.Both hiding (fst, snd)
 import Data.Functor.Foldable as Foldable
 import Data.Hashable
+import qualified Data.IntMap as IntMap
 import qualified Data.KdTree.Static as KdTree
 import qualified Data.List as List
 import Data.Record
@@ -42,22 +43,23 @@ rws compare as bs
   | null as, null bs = []
   | null as = inserting <$> bs
   | null bs = deleting <$> as
-  | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, fas, fbs)) $ traverse findNearestNeighbourTo fbs
+  | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, toMap fas, toMap fbs)) $ traverse findNearestNeighbourTo fbs
   where fas = zipWith featurize [0..] as
         fbs = zipWith featurize [0..] bs
         kdas = KdTree.build (Vector.toList . feature) fas
         kdbs = KdTree.build (Vector.toList . feature) fbs
         featurize index term = UnmappedTerm index (getField (extract term)) term
+        toMap = IntMap.fromList . fmap (termIndex &&& identity)
         findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do
           (previous, unmappedA, unmappedB) <- get
           fromMaybe (insertion previous unmappedA unmappedB kv) $ do
-            foundA@(UnmappedTerm i _ a) <- nearestUnmapped unmappedA kdas kv
-            foundB@(UnmappedTerm j' _ _) <- nearestUnmapped unmappedB kdbs foundA
+            foundA@(UnmappedTerm i _ a) <- nearestUnmapped (toList unmappedA) kdas kv
+            foundB@(UnmappedTerm j' _ _) <- nearestUnmapped (toList unmappedB) kdbs foundA
             guard (j == j')
             guard (previous <= i && i <= previous + defaultMoveBound)
             compared <- compare a b
             pure $! do
-              put (i, List.delete foundA unmappedA, List.delete foundB unmappedB)
+              put (i, IntMap.delete i unmappedA, IntMap.delete j unmappedB)
               pure (i, compared)
 
         -- | Finds the most-similar unmapped term to the passed-in term, if any.
@@ -67,8 +69,8 @@ rws compare as bs
         -- cf §4.2 of RWS-Diff
         nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo defaultM) . compare (term key) . term) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree defaultL key)))
 
-        insertion previous unmappedA unmappedB kv@(UnmappedTerm _ _ b) = do
-          put (previous, unmappedA, List.delete kv unmappedB)
+        insertion previous unmappedA unmappedB (UnmappedTerm j _ b) = do
+          put (previous, unmappedA, IntMap.delete j unmappedB)
           pure (negate 1, inserting b)
 
         deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmappedA)

From 1627ab0e268d761869a285afe44f4f2f60657908 Mon Sep 17 00:00:00 2001
From: Rob Rix <rob.rix@github.com>
Date: Thu, 18 Aug 2016 17:53:29 -0400
Subject: [PATCH 06/10] :fire: a redundant binding.

---
 src/Data/RandomWalkSimilarity.hs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs
index 33f433d64..5a40fa4ef 100644
--- a/src/Data/RandomWalkSimilarity.hs
+++ b/src/Data/RandomWalkSimilarity.hs
@@ -54,7 +54,7 @@ rws compare as bs
           (previous, unmappedA, unmappedB) <- get
           fromMaybe (insertion previous unmappedA unmappedB kv) $ do
             foundA@(UnmappedTerm i _ a) <- nearestUnmapped (toList unmappedA) kdas kv
-            foundB@(UnmappedTerm j' _ _) <- nearestUnmapped (toList unmappedB) kdbs foundA
+            UnmappedTerm j' _ _ <- nearestUnmapped (toList unmappedB) kdbs foundA
             guard (j == j')
             guard (previous <= i && i <= previous + defaultMoveBound)
             compared <- compare a b

From 1385e9193fe1f94b45f50fa000574d06f63790be Mon Sep 17 00:00:00 2001
From: Rob Rix <rob.rix@github.com>
Date: Fri, 19 Aug 2016 11:35:32 -0400
Subject: [PATCH 07/10] :fire: a redundant import.

---
 src/Data/RandomWalkSimilarity.hs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs
index 5a40fa4ef..0c4f8fedf 100644
--- a/src/Data/RandomWalkSimilarity.hs
+++ b/src/Data/RandomWalkSimilarity.hs
@@ -27,7 +27,6 @@ import Data.Record
 import qualified Data.Vector as Vector
 import Patch
 import Prologue
-import Term ()
 import Test.QuickCheck hiding (Fixed)
 import Test.QuickCheck.Random
 import Data.List (intersectBy)

From db6679a9125dea169852e76005ec1399e89fa8cd Mon Sep 17 00:00:00 2001
From: Rob Rix <rob.rix@github.com>
Date: Fri, 19 Aug 2016 11:36:46 -0400
Subject: [PATCH 08/10] Use IntMap intersections to define nearestUnmapped.

---
 src/Data/RandomWalkSimilarity.hs | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs
index 0c4f8fedf..4bdcf2763 100644
--- a/src/Data/RandomWalkSimilarity.hs
+++ b/src/Data/RandomWalkSimilarity.hs
@@ -27,10 +27,9 @@ import Data.Record
 import qualified Data.Vector as Vector
 import Patch
 import Prologue
+import Term (termSize)
 import Test.QuickCheck hiding (Fixed)
 import Test.QuickCheck.Random
-import Data.List (intersectBy)
-import Term (termSize)
 
 -- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf).
 rws :: (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double))
@@ -52,8 +51,8 @@ rws compare as bs
         findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do
           (previous, unmappedA, unmappedB) <- get
           fromMaybe (insertion previous unmappedA unmappedB kv) $ do
-            foundA@(UnmappedTerm i _ a) <- nearestUnmapped (toList unmappedA) kdas kv
-            UnmappedTerm j' _ _ <- nearestUnmapped (toList unmappedB) kdbs foundA
+            foundA@(UnmappedTerm i _ a) <- nearestUnmapped unmappedA kdas kv
+            UnmappedTerm j' _ _ <- nearestUnmapped unmappedB kdbs foundA
             guard (j == j')
             guard (previous <= i && i <= previous + defaultMoveBound)
             compared <- compare a b
@@ -66,7 +65,7 @@ rws compare as bs
         -- RWS can produce false positives in the case of e.g. hash collisions. Therefore, we find the _l_ nearest candidates, filter out any which have already been mapped, and select the minimum of the remaining by (a constant-time approximation of) edit distance.
         --
         -- cf §4.2 of RWS-Diff
-        nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo defaultM) . compare (term key) . term) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree defaultL key)))
+        nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo defaultM) . compare (term key) . term) (toList (IntMap.intersection unmapped (toMap (KdTree.kNearest tree defaultL key)))))
 
         insertion previous unmappedA unmappedB (UnmappedTerm j _ b) = do
           put (previous, unmappedA, IntMap.delete j unmappedB)

From b5c67da74b19a740c26ecaa425c342330924d8c4 Mon Sep 17 00:00:00 2001
From: Rob Rix <rob.rix@github.com>
Date: Fri, 19 Aug 2016 13:10:35 -0400
Subject: [PATCH 09/10] Add a type signature for nearestUnmapped.

---
 src/Data/RandomWalkSimilarity.hs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs
index 4bdcf2763..2dbf778c1 100644
--- a/src/Data/RandomWalkSimilarity.hs
+++ b/src/Data/RandomWalkSimilarity.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE DataKinds, GADTs, RankNTypes, TypeOperators #-}
+{-# LANGUAGE DataKinds, GADTs, RankNTypes, ScopedTypeVariables, TypeOperators #-}
 module Data.RandomWalkSimilarity
 ( rws
 , pqGramDecorator
@@ -32,7 +32,7 @@ import Test.QuickCheck hiding (Fixed)
 import Test.QuickCheck.Random
 
 -- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf).
-rws :: (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double))
+rws :: forall f fields. (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double))
   => (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function which compares a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared.
   -> [Cofree f (Record fields)] -- ^ The list of old terms.
   -> [Cofree f (Record fields)] -- ^ The list of new terms.
@@ -65,6 +65,7 @@ rws compare as bs
         -- RWS can produce false positives in the case of e.g. hash collisions. Therefore, we find the _l_ nearest candidates, filter out any which have already been mapped, and select the minimum of the remaining by (a constant-time approximation of) edit distance.
         --
         -- cf §4.2 of RWS-Diff
+        nearestUnmapped :: IntMap (UnmappedTerm (Cofree f (Record fields))) -> KdTree.KdTree Double (UnmappedTerm (Cofree f (Record fields))) -> UnmappedTerm (Cofree f (Record fields)) -> Maybe (UnmappedTerm (Cofree f (Record fields)))
         nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo defaultM) . compare (term key) . term) (toList (IntMap.intersection unmapped (toMap (KdTree.kNearest tree defaultL key)))))
 
         insertion previous unmappedA unmappedB (UnmappedTerm j _ b) = do

From b0c8f93d700087095567ae308ee0715a11534e3e Mon Sep 17 00:00:00 2001
From: Rob Rix <rob.rix@github.com>
Date: Fri, 19 Aug 2016 13:12:59 -0400
Subject: [PATCH 10/10] Add a type signature for findNearestNeighbourTo.

---
 src/Data/RandomWalkSimilarity.hs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs
index 2dbf778c1..aa382df86 100644
--- a/src/Data/RandomWalkSimilarity.hs
+++ b/src/Data/RandomWalkSimilarity.hs
@@ -48,6 +48,7 @@ rws compare as bs
         kdbs = KdTree.build (Vector.toList . feature) fbs
         featurize index term = UnmappedTerm index (getField (extract term)) term
         toMap = IntMap.fromList . fmap (termIndex &&& identity)
+        findNearestNeighbourTo :: UnmappedTerm (Cofree f (Record fields)) -> State (Int, IntMap (UnmappedTerm (Cofree f (Record fields))), IntMap (UnmappedTerm (Cofree f (Record fields)))) (Int, Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))
         findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do
           (previous, unmappedA, unmappedB) <- get
           fromMaybe (insertion previous unmappedA unmappedB kv) $ do