From d16cdefe348285bde2a4819a9159c89c0b47b514 Mon Sep 17 00:00:00 2001 From: joshvera Date: Thu, 1 Sep 2016 16:58:24 -0400 Subject: [PATCH] Keep a hash for every node instead of a vector --- src/Data/RandomWalkSimilarity.hs | 23 ++++++++++++----------- src/Diffing.hs | 6 +++--- src/Interpreter.hs | 6 +++--- src/Syntax.hs | 3 ++- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index c2e9d3460..81256c14e 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -34,7 +34,7 @@ import Test.QuickCheck.Random import qualified Data.PQueue.Prio.Max as PQueue -- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf). -rws :: forall f fields. (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int)) +rws :: forall f fields. (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double), HasField fields Int) => (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function which compares a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared. -> [Cofree f (Record fields)] -- ^ The list of old terms. -> [Cofree f (Record fields)] -- ^ The list of new terms. @@ -53,7 +53,7 @@ rws compare as bs fmap snd -- Modified xydiff + RWS -- 1. Annotate each node with a unique key top down based off its categories and termIndex? - -- 1. Construct two priority queues of hash values for each node ordered by max weight + -- 2. Construct two priority queues of hash values for each node ordered by max weight -- 3. Try to find matchings starting with the heaviest nodes -- 4. Use structure to propagate matchings? -- 5. Compute the diff @@ -62,14 +62,14 @@ rws compare as bs queueBs = PQueue.fromList (zipWith hashabilize [0..] bs) (fas, fbs) = dropEqualTerms queueAs queueBs - dropEqualTerms :: PQueue.MaxPQueue (Vector.Vector Int) (UnmappedHashTerm (Cofree f (Record fields))) -> PQueue.MaxPQueue (Vector.Vector Int) (UnmappedHashTerm (Cofree f (Record fields))) -> ([UnmappedTerm (Cofree f (Record fields))], [UnmappedTerm (Cofree f (Record fields))]) + dropEqualTerms :: PQueue.MaxPQueue Int (UnmappedHashTerm (Cofree f (Record fields))) -> PQueue.MaxPQueue Int (UnmappedHashTerm (Cofree f (Record fields))) -> ([UnmappedTerm (Cofree f (Record fields))], [UnmappedTerm (Cofree f (Record fields))]) dropEqualTerms as bs = (unmappedAs, unmappedBs) where (unmappedAs, unmappedBs, _, _) = dropEqualTerms' ([], [], as, bs) dropEqualTerms' (as', bs', queueA, queueB) = case (PQueue.maxViewWithKey queueA, PQueue.maxViewWithKey queueB) of (Just ((kA, a), queueA'), Just ((kB, b), queueB')) -> - if kA /= kB + if hashInt a /= hashInt b then dropEqualTerms' (toUnmappedTerm a : as', toUnmappedTerm b : bs', queueA', queueB') else dropEqualTerms' (as', bs', queueA', queueB') (Just ((_, a), queueA'), Nothing) -> (toUnmappedTerm a : as', bs', queueA', queueB) @@ -77,9 +77,9 @@ rws compare as bs (Nothing, Nothing) -> (as', bs', queueA, queueB) toUnmappedTerm (UnmappedHashTerm index _ term) = UnmappedTerm index (getField (extract term)) term - hashabilize :: (HasField fields (Vector.Vector Int)) => Int -> Cofree f (Record fields) -> (Vector.Vector Int, UnmappedHashTerm (Cofree f (Record fields))) - hashabilize index term = (hash, UnmappedHashTerm index hash term) - where hash = (getField (extract term) :: Vector.Vector Int) + hashabilize :: (HasField fields Int) => Int -> Cofree f (Record fields) -> (Int, UnmappedHashTerm (Cofree f (Record fields))) + hashabilize index term = (termSize term, UnmappedHashTerm index hash term) + where hash = (getField (extract term) :: Int) kdas = KdTree.build (Vector.toList . feature) fas kdbs = KdTree.build (Vector.toList . feature) fbs @@ -142,7 +142,7 @@ defaultM = 10 data UnmappedTerm a = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature :: !(Vector.Vector Double), term :: !a } deriving Eq -data UnmappedHashTerm a = UnmappedHashTerm { hashTermIndex :: {-# UNPACK #-} !Int, hashVector :: !(Vector.Vector Int), hashTerm :: !a } +data UnmappedHashTerm a = UnmappedHashTerm { hashTermIndex :: {-# UNPACK #-} !Int, hashInt :: !Int, hashTerm :: !a } deriving Eq @@ -151,9 +151,10 @@ data Gram label = Gram { stem :: [Maybe label], base :: [Maybe label] } deriving (Eq, Show) -- -- | Annotates a term with it's hash at each node. -hashDecorator :: (Hashable hash, Traversable f) => (forall b. CofreeF f (Record fields) b -> hash) -> Cofree f (Record fields) -> Cofree f (Record (Vector.Vector Int ': fields)) -hashDecorator getHash = cata $ \case - term@(record :< functor) -> cofree ((foldr (Vector.zipWith (+) . getField . extract) (Vector.singleton . hash $ getHash term) functor .: record) :< functor) +hashDecorator :: forall hash f fields. (Hashable (f Int), Hashable hash, Traversable f) => (forall b. CofreeF f (Record fields) b -> hash) -> Cofree f (Record fields) -> Cofree f (Record (Int ': fields)) +hashDecorator getLabel = cata $ \case + term@(record :< functor) -> + cofree ((hash (getLabel term, fmap (getField . extract) functor :: f Int) .: record) :< functor) -- | Annotates a term with a feature vector at each node, using the default values for the p, q, and d parameters. defaultFeatureVectorDecorator :: (Hashable label, Traversable f) => (forall b. CofreeF f (Record fields) b -> label) -> Cofree f (Record fields) -> Cofree f (Record (Vector.Vector Double ': fields)) diff --git a/src/Diffing.hs b/src/Diffing.hs index 9c161d324..ab4ccb6b8 100644 --- a/src/Diffing.hs +++ b/src/Diffing.hs @@ -44,9 +44,9 @@ import Data.Hashable -- | result. -- | Returns the rendered result strictly, so it's always fully evaluated -- | with respect to other IO actions. -diffFiles :: (HasField fields Category, HasField fields Cost, HasField fields Range, Eq (Record fields)) => Parser (Syntax Text) (Record fields) -> Renderer (Record (Vector.Vector Double ': Vector.Vector Int ': fields)) -> Both SourceBlob -> IO Output +diffFiles :: (HasField fields Category, HasField fields Cost, HasField fields Range, Eq (Record fields)) => Parser (Syntax Text) (Record fields) -> Renderer (Record (Vector.Vector Double ': Int ': fields)) -> Both SourceBlob -> IO Output diffFiles parser renderer sourceBlobs = do - terms <- traverse (fmap (defaultFeatureVectorDecorator getLabel . hashDecorator getHash) . parser) sourceBlobs + terms <- traverse (fmap (defaultFeatureVectorDecorator getLabel . hashDecorator getLabel) . parser) sourceBlobs let areNullOids = runBothWith (\a b -> (oid a == nullOid || null (source a), oid b == nullOid || null (source b))) sourceBlobs let textDiff = case areNullOids of @@ -64,8 +64,8 @@ diffFiles parser renderer sourceBlobs = do Pure patch -> uncurry both (fromThese 0 0 (unPatch (cost . extract <$> patch))) getLabel (h :< t) = (category h, case t of Leaf s -> Just s + Syntax.Comment s -> Just s _ -> Nothing) - getHash (h :< _) = (category h) -- | Return a parser based on the file extension (including the "."). parserForType :: Text -> Parser (Syntax Text) (Record '[Range, Category]) diff --git a/src/Interpreter.hs b/src/Interpreter.hs index 20f144ae3..985560d1f 100644 --- a/src/Interpreter.hs +++ b/src/Interpreter.hs @@ -25,7 +25,7 @@ type Comparable leaf annotation = Term leaf annotation -> Term leaf annotation - type DiffConstructor leaf annotation = CofreeF (Syntax leaf) (Both annotation) (Diff leaf annotation) -> Diff leaf annotation -- | Diff two terms recursively, given functions characterizing the diffing. -diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int)) +diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields Int) => DiffConstructor leaf (Record fields) -- ^ A function to wrap up & possibly annotate every produced diff. -> Comparable leaf (Record fields) -- ^ A function to determine whether or not two terms should even be compared. -> SES.Cost (Diff leaf (Record fields)) -- ^ A function to compute the cost of a given diff node. @@ -35,7 +35,7 @@ diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fi diffTerms construct comparable cost a b = fromMaybe (replacing a b) $ diffComparableTerms construct comparable cost a b -- | Diff two terms recursively, given functions characterizing the diffing. If the terms are incomparable, returns 'Nothing'. -diffComparableTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int)) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields)) +diffComparableTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields Int) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields)) diffComparableTerms construct comparable cost = recur where recur a b | (category <$> a) == (category <$> b) = hylo construct runCofree <$> zipTerms a b @@ -71,7 +71,7 @@ algorithmWithTerms construct t1 t2 = case (unwrap t1, unwrap t2) of branch constructor a b = bySimilarity a b >>= annotate . constructor -- | Run an algorithm, given functions characterizing the evaluation. -runAlgorithm :: (Functor f, GAlign f, Eq a, Eq (Record fields), Eq (f (Cofree f (Record fields))), Prologue.Foldable f, Traversable f, HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int)) +runAlgorithm :: (Functor f, GAlign f, Eq a, Eq (Record fields), Eq (f (Cofree f (Record fields))), Prologue.Foldable f, Traversable f, HasField fields (Vector.Vector Double), HasField fields Int) => (CofreeF f (Both (Record fields)) (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -> Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to wrap up & possibly annotate every produced diff. -> (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function to diff two subterms recursively, if they are comparable, or else return 'Nothing'. -> SES.Cost (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to compute the cost of a given diff node. diff --git a/src/Syntax.hs b/src/Syntax.hs index ed6e7be32..91e74fd0d 100644 --- a/src/Syntax.hs +++ b/src/Syntax.hs @@ -6,6 +6,7 @@ import GHC.Generics import Prologue import Test.QuickCheck hiding (Fixed) import SourceSpan +import Data.Hashable -- | A node in an abstract syntax tree. -- @@ -72,7 +73,7 @@ data Syntax a f -- | A method definition with an identifier, params, and a list of expressions. | Method f [f] [f] | If f f (Maybe f) - deriving (Eq, Foldable, Functor, Generic, Generic1, Mergeable, Ord, Show, Traversable) + deriving (Eq, Foldable, Functor, Generic, Generic1, Mergeable, Ord, Show, Traversable, Hashable) -- Instances