From d16cdefe348285bde2a4819a9159c89c0b47b514 Mon Sep 17 00:00:00 2001
From: joshvera <josh@joshvera.com>
Date: Thu, 1 Sep 2016 16:58:24 -0400
Subject: [PATCH] Keep a hash for every node instead of a vector

---
 src/Data/RandomWalkSimilarity.hs | 23 ++++++++++++-----------
 src/Diffing.hs                   |  6 +++---
 src/Interpreter.hs               |  6 +++---
 src/Syntax.hs                    |  3 ++-
 4 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs
index c2e9d3460..81256c14e 100644
--- a/src/Data/RandomWalkSimilarity.hs
+++ b/src/Data/RandomWalkSimilarity.hs
@@ -34,7 +34,7 @@ import Test.QuickCheck.Random
 import qualified Data.PQueue.Prio.Max as PQueue
 
 -- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf).
-rws :: forall f fields. (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int))
+rws :: forall f fields. (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double), HasField fields Int)
   => (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function which compares a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared.
   -> [Cofree f (Record fields)] -- ^ The list of old terms.
   -> [Cofree f (Record fields)] -- ^ The list of new terms.
@@ -53,7 +53,7 @@ rws compare as bs
     fmap snd
     -- Modified xydiff + RWS
     -- 1. Annotate each node with a unique key top down based off its categories and termIndex?
-    -- 1. Construct two priority queues of hash values for each node ordered by max weight
+    -- 2. Construct two priority queues of hash values for each node ordered by max weight
     -- 3. Try to find matchings starting with the heaviest nodes
     -- 4. Use structure to propagate matchings?
     -- 5. Compute the diff
@@ -62,14 +62,14 @@ rws compare as bs
         queueBs = PQueue.fromList (zipWith hashabilize [0..] bs)
         (fas, fbs) = dropEqualTerms queueAs queueBs
 
-        dropEqualTerms :: PQueue.MaxPQueue (Vector.Vector Int) (UnmappedHashTerm (Cofree f (Record fields))) -> PQueue.MaxPQueue (Vector.Vector Int) (UnmappedHashTerm (Cofree f (Record fields))) ->  ([UnmappedTerm (Cofree f (Record fields))], [UnmappedTerm (Cofree f (Record fields))])
+        dropEqualTerms :: PQueue.MaxPQueue Int (UnmappedHashTerm (Cofree f (Record fields))) -> PQueue.MaxPQueue Int (UnmappedHashTerm (Cofree f (Record fields))) ->  ([UnmappedTerm (Cofree f (Record fields))], [UnmappedTerm (Cofree f (Record fields))])
         dropEqualTerms as bs = (unmappedAs, unmappedBs)
           where
             (unmappedAs, unmappedBs, _, _) = dropEqualTerms' ([], [], as, bs)
 
             dropEqualTerms' (as', bs', queueA, queueB) = case (PQueue.maxViewWithKey queueA, PQueue.maxViewWithKey queueB) of
               (Just ((kA, a), queueA'), Just ((kB, b), queueB')) ->
-                if kA /= kB
+                if hashInt a /= hashInt b
                 then dropEqualTerms' (toUnmappedTerm a : as', toUnmappedTerm b : bs', queueA', queueB')
                 else dropEqualTerms' (as', bs', queueA', queueB')
               (Just ((_, a), queueA'), Nothing) -> (toUnmappedTerm a : as', bs', queueA', queueB)
@@ -77,9 +77,9 @@ rws compare as bs
               (Nothing, Nothing) -> (as', bs', queueA, queueB)
             toUnmappedTerm (UnmappedHashTerm index _ term) = UnmappedTerm index (getField (extract term)) term
 
-        hashabilize :: (HasField fields (Vector.Vector Int)) => Int -> Cofree f (Record fields) -> (Vector.Vector Int, UnmappedHashTerm (Cofree f (Record fields)))
-        hashabilize index term = (hash, UnmappedHashTerm index hash term)
-          where hash = (getField (extract term) :: Vector.Vector Int)
+        hashabilize :: (HasField fields Int) => Int -> Cofree f (Record fields) -> (Int, UnmappedHashTerm (Cofree f (Record fields)))
+        hashabilize index term = (termSize term, UnmappedHashTerm index hash term)
+          where hash = (getField (extract term) :: Int)
 
         kdas = KdTree.build (Vector.toList . feature) fas
         kdbs = KdTree.build (Vector.toList . feature) fbs
@@ -142,7 +142,7 @@ defaultM = 10
 data UnmappedTerm a = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature :: !(Vector.Vector Double), term :: !a }
   deriving Eq
 
-data UnmappedHashTerm a = UnmappedHashTerm { hashTermIndex :: {-# UNPACK #-} !Int, hashVector :: !(Vector.Vector Int), hashTerm :: !a }
+data UnmappedHashTerm a = UnmappedHashTerm { hashTermIndex :: {-# UNPACK #-} !Int, hashInt :: !Int, hashTerm :: !a }
   deriving Eq
 
 
@@ -151,9 +151,10 @@ data Gram label = Gram { stem :: [Maybe label], base :: [Maybe label] }
   deriving (Eq, Show)
 
 -- -- | Annotates a term with it's hash at each node.
-hashDecorator :: (Hashable hash, Traversable f) => (forall b. CofreeF f (Record fields) b -> hash) -> Cofree f (Record fields) -> Cofree f (Record (Vector.Vector Int ': fields))
-hashDecorator getHash = cata $ \case
-  term@(record :< functor) -> cofree ((foldr (Vector.zipWith (+) . getField . extract) (Vector.singleton . hash $ getHash term) functor .: record) :< functor)
+hashDecorator :: forall hash f fields. (Hashable (f Int), Hashable hash, Traversable f) => (forall b. CofreeF f (Record fields) b -> hash) -> Cofree f (Record fields) -> Cofree f (Record (Int ': fields))
+hashDecorator getLabel = cata $ \case
+  term@(record :< functor) ->
+    cofree ((hash (getLabel term, fmap (getField . extract) functor :: f Int) .: record) :< functor)
 
 -- | Annotates a term with a feature vector at each node, using the default values for the p, q, and d parameters.
 defaultFeatureVectorDecorator :: (Hashable label, Traversable f) => (forall b. CofreeF f (Record fields) b -> label) -> Cofree f (Record fields) -> Cofree f (Record (Vector.Vector Double ': fields))
diff --git a/src/Diffing.hs b/src/Diffing.hs
index 9c161d324..ab4ccb6b8 100644
--- a/src/Diffing.hs
+++ b/src/Diffing.hs
@@ -44,9 +44,9 @@ import Data.Hashable
 -- | result.
 -- | Returns the rendered result strictly, so it's always fully evaluated
 -- | with respect to other IO actions.
-diffFiles :: (HasField fields Category, HasField fields Cost, HasField fields Range, Eq (Record fields)) => Parser (Syntax Text) (Record fields) -> Renderer (Record (Vector.Vector Double ': Vector.Vector Int ': fields)) -> Both SourceBlob -> IO Output
+diffFiles :: (HasField fields Category, HasField fields Cost, HasField fields Range, Eq (Record fields)) => Parser (Syntax Text) (Record fields) -> Renderer (Record (Vector.Vector Double ': Int ': fields)) -> Both SourceBlob -> IO Output
 diffFiles parser renderer sourceBlobs = do
-  terms <- traverse (fmap (defaultFeatureVectorDecorator getLabel . hashDecorator getHash) . parser) sourceBlobs
+  terms <- traverse (fmap (defaultFeatureVectorDecorator getLabel . hashDecorator getLabel) . parser) sourceBlobs
 
   let areNullOids = runBothWith (\a b -> (oid a == nullOid || null (source a), oid b == nullOid || null (source b))) sourceBlobs
   let textDiff = case areNullOids of
@@ -64,8 +64,8 @@ diffFiles parser renderer sourceBlobs = do
           Pure patch -> uncurry both (fromThese 0 0 (unPatch (cost . extract <$> patch)))
         getLabel (h :< t) = (category h, case t of
           Leaf s -> Just s
+          Syntax.Comment s -> Just s
           _ -> Nothing)
-        getHash (h :< _) = (category h)
 
 -- | Return a parser based on the file extension (including the ".").
 parserForType :: Text -> Parser (Syntax Text) (Record '[Range, Category])
diff --git a/src/Interpreter.hs b/src/Interpreter.hs
index 20f144ae3..985560d1f 100644
--- a/src/Interpreter.hs
+++ b/src/Interpreter.hs
@@ -25,7 +25,7 @@ type Comparable leaf annotation = Term leaf annotation -> Term leaf annotation -
 type DiffConstructor leaf annotation = CofreeF (Syntax leaf) (Both annotation) (Diff leaf annotation) -> Diff leaf annotation
 
 -- | Diff two terms recursively, given functions characterizing the diffing.
-diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int))
+diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields Int)
   => DiffConstructor leaf (Record fields) -- ^ A function to wrap up & possibly annotate every produced diff.
   -> Comparable leaf (Record fields) -- ^ A function to determine whether or not two terms should even be compared.
   -> SES.Cost (Diff leaf (Record fields)) -- ^ A function to compute the cost of a given diff node.
@@ -35,7 +35,7 @@ diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fi
 diffTerms construct comparable cost a b = fromMaybe (replacing a b) $ diffComparableTerms construct comparable cost a b
 
 -- | Diff two terms recursively, given functions characterizing the diffing. If the terms are incomparable, returns 'Nothing'.
-diffComparableTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int)) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields))
+diffComparableTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields Int) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields))
 diffComparableTerms construct comparable cost = recur
   where recur a b
           | (category <$> a) == (category <$> b) = hylo construct runCofree <$> zipTerms a b
@@ -71,7 +71,7 @@ algorithmWithTerms construct t1 t2 = case (unwrap t1, unwrap t2) of
         branch constructor a b = bySimilarity a b >>= annotate . constructor
 
 -- | Run an algorithm, given functions characterizing the evaluation.
-runAlgorithm :: (Functor f, GAlign f, Eq a, Eq (Record fields), Eq (f (Cofree f (Record fields))), Prologue.Foldable f, Traversable f, HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int))
+runAlgorithm :: (Functor f, GAlign f, Eq a, Eq (Record fields), Eq (f (Cofree f (Record fields))), Prologue.Foldable f, Traversable f, HasField fields (Vector.Vector Double), HasField fields Int)
   => (CofreeF f (Both (Record fields)) (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -> Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to wrap up & possibly annotate every produced diff.
   -> (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function to diff two subterms recursively, if they are comparable, or else return 'Nothing'.
   -> SES.Cost (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to compute the cost of a given diff node.
diff --git a/src/Syntax.hs b/src/Syntax.hs
index ed6e7be32..91e74fd0d 100644
--- a/src/Syntax.hs
+++ b/src/Syntax.hs
@@ -6,6 +6,7 @@ import GHC.Generics
 import Prologue
 import Test.QuickCheck hiding (Fixed)
 import SourceSpan
+import Data.Hashable
 
 -- | A node in an abstract syntax tree.
 --
@@ -72,7 +73,7 @@ data Syntax a f
   -- | A method definition with an identifier, params, and a list of expressions.
   | Method f [f] [f]
   | If f f (Maybe f)
-  deriving (Eq, Foldable, Functor, Generic, Generic1, Mergeable, Ord, Show, Traversable)
+  deriving (Eq, Foldable, Functor, Generic, Generic1, Mergeable, Ord, Show, Traversable, Hashable)
 
 -- Instances