1
1
mirror of https://github.com/github/semantic.git synced 2025-01-03 13:02:37 +03:00

Keep a hash for every node instead of a vector

This commit is contained in:
joshvera 2016-09-01 16:58:24 -04:00
parent daba1fb2ef
commit d16cdefe34
4 changed files with 20 additions and 18 deletions

View File

@ -34,7 +34,7 @@ import Test.QuickCheck.Random
import qualified Data.PQueue.Prio.Max as PQueue
-- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf).
rws :: forall f fields. (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int))
rws :: forall f fields. (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double), HasField fields Int)
=> (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function which compares a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared.
-> [Cofree f (Record fields)] -- ^ The list of old terms.
-> [Cofree f (Record fields)] -- ^ The list of new terms.
@ -53,7 +53,7 @@ rws compare as bs
fmap snd
-- Modified xydiff + RWS
-- 1. Annotate each node with a unique key top down based off its categories and termIndex?
-- 1. Construct two priority queues of hash values for each node ordered by max weight
-- 2. Construct two priority queues of hash values for each node ordered by max weight
-- 3. Try to find matchings starting with the heaviest nodes
-- 4. Use structure to propagate matchings?
-- 5. Compute the diff
@ -62,14 +62,14 @@ rws compare as bs
queueBs = PQueue.fromList (zipWith hashabilize [0..] bs)
(fas, fbs) = dropEqualTerms queueAs queueBs
dropEqualTerms :: PQueue.MaxPQueue (Vector.Vector Int) (UnmappedHashTerm (Cofree f (Record fields))) -> PQueue.MaxPQueue (Vector.Vector Int) (UnmappedHashTerm (Cofree f (Record fields))) -> ([UnmappedTerm (Cofree f (Record fields))], [UnmappedTerm (Cofree f (Record fields))])
dropEqualTerms :: PQueue.MaxPQueue Int (UnmappedHashTerm (Cofree f (Record fields))) -> PQueue.MaxPQueue Int (UnmappedHashTerm (Cofree f (Record fields))) -> ([UnmappedTerm (Cofree f (Record fields))], [UnmappedTerm (Cofree f (Record fields))])
dropEqualTerms as bs = (unmappedAs, unmappedBs)
where
(unmappedAs, unmappedBs, _, _) = dropEqualTerms' ([], [], as, bs)
dropEqualTerms' (as', bs', queueA, queueB) = case (PQueue.maxViewWithKey queueA, PQueue.maxViewWithKey queueB) of
(Just ((kA, a), queueA'), Just ((kB, b), queueB')) ->
if kA /= kB
if hashInt a /= hashInt b
then dropEqualTerms' (toUnmappedTerm a : as', toUnmappedTerm b : bs', queueA', queueB')
else dropEqualTerms' (as', bs', queueA', queueB')
(Just ((_, a), queueA'), Nothing) -> (toUnmappedTerm a : as', bs', queueA', queueB)
@ -77,9 +77,9 @@ rws compare as bs
(Nothing, Nothing) -> (as', bs', queueA, queueB)
toUnmappedTerm (UnmappedHashTerm index _ term) = UnmappedTerm index (getField (extract term)) term
hashabilize :: (HasField fields (Vector.Vector Int)) => Int -> Cofree f (Record fields) -> (Vector.Vector Int, UnmappedHashTerm (Cofree f (Record fields)))
hashabilize index term = (hash, UnmappedHashTerm index hash term)
where hash = (getField (extract term) :: Vector.Vector Int)
hashabilize :: (HasField fields Int) => Int -> Cofree f (Record fields) -> (Int, UnmappedHashTerm (Cofree f (Record fields)))
hashabilize index term = (termSize term, UnmappedHashTerm index hash term)
where hash = (getField (extract term) :: Int)
kdas = KdTree.build (Vector.toList . feature) fas
kdbs = KdTree.build (Vector.toList . feature) fbs
@ -142,7 +142,7 @@ defaultM = 10
data UnmappedTerm a = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature :: !(Vector.Vector Double), term :: !a }
deriving Eq
data UnmappedHashTerm a = UnmappedHashTerm { hashTermIndex :: {-# UNPACK #-} !Int, hashVector :: !(Vector.Vector Int), hashTerm :: !a }
data UnmappedHashTerm a = UnmappedHashTerm { hashTermIndex :: {-# UNPACK #-} !Int, hashInt :: !Int, hashTerm :: !a }
deriving Eq
@ -151,9 +151,10 @@ data Gram label = Gram { stem :: [Maybe label], base :: [Maybe label] }
deriving (Eq, Show)
-- -- | Annotates a term with it's hash at each node.
hashDecorator :: (Hashable hash, Traversable f) => (forall b. CofreeF f (Record fields) b -> hash) -> Cofree f (Record fields) -> Cofree f (Record (Vector.Vector Int ': fields))
hashDecorator getHash = cata $ \case
term@(record :< functor) -> cofree ((foldr (Vector.zipWith (+) . getField . extract) (Vector.singleton . hash $ getHash term) functor .: record) :< functor)
hashDecorator :: forall hash f fields. (Hashable (f Int), Hashable hash, Traversable f) => (forall b. CofreeF f (Record fields) b -> hash) -> Cofree f (Record fields) -> Cofree f (Record (Int ': fields))
hashDecorator getLabel = cata $ \case
term@(record :< functor) ->
cofree ((hash (getLabel term, fmap (getField . extract) functor :: f Int) .: record) :< functor)
-- | Annotates a term with a feature vector at each node, using the default values for the p, q, and d parameters.
defaultFeatureVectorDecorator :: (Hashable label, Traversable f) => (forall b. CofreeF f (Record fields) b -> label) -> Cofree f (Record fields) -> Cofree f (Record (Vector.Vector Double ': fields))

View File

@ -44,9 +44,9 @@ import Data.Hashable
-- | result.
-- | Returns the rendered result strictly, so it's always fully evaluated
-- | with respect to other IO actions.
diffFiles :: (HasField fields Category, HasField fields Cost, HasField fields Range, Eq (Record fields)) => Parser (Syntax Text) (Record fields) -> Renderer (Record (Vector.Vector Double ': Vector.Vector Int ': fields)) -> Both SourceBlob -> IO Output
diffFiles :: (HasField fields Category, HasField fields Cost, HasField fields Range, Eq (Record fields)) => Parser (Syntax Text) (Record fields) -> Renderer (Record (Vector.Vector Double ': Int ': fields)) -> Both SourceBlob -> IO Output
diffFiles parser renderer sourceBlobs = do
terms <- traverse (fmap (defaultFeatureVectorDecorator getLabel . hashDecorator getHash) . parser) sourceBlobs
terms <- traverse (fmap (defaultFeatureVectorDecorator getLabel . hashDecorator getLabel) . parser) sourceBlobs
let areNullOids = runBothWith (\a b -> (oid a == nullOid || null (source a), oid b == nullOid || null (source b))) sourceBlobs
let textDiff = case areNullOids of
@ -64,8 +64,8 @@ diffFiles parser renderer sourceBlobs = do
Pure patch -> uncurry both (fromThese 0 0 (unPatch (cost . extract <$> patch)))
getLabel (h :< t) = (category h, case t of
Leaf s -> Just s
Syntax.Comment s -> Just s
_ -> Nothing)
getHash (h :< _) = (category h)
-- | Return a parser based on the file extension (including the ".").
parserForType :: Text -> Parser (Syntax Text) (Record '[Range, Category])

View File

@ -25,7 +25,7 @@ type Comparable leaf annotation = Term leaf annotation -> Term leaf annotation -
type DiffConstructor leaf annotation = CofreeF (Syntax leaf) (Both annotation) (Diff leaf annotation) -> Diff leaf annotation
-- | Diff two terms recursively, given functions characterizing the diffing.
diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int))
diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields Int)
=> DiffConstructor leaf (Record fields) -- ^ A function to wrap up & possibly annotate every produced diff.
-> Comparable leaf (Record fields) -- ^ A function to determine whether or not two terms should even be compared.
-> SES.Cost (Diff leaf (Record fields)) -- ^ A function to compute the cost of a given diff node.
@ -35,7 +35,7 @@ diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fi
diffTerms construct comparable cost a b = fromMaybe (replacing a b) $ diffComparableTerms construct comparable cost a b
-- | Diff two terms recursively, given functions characterizing the diffing. If the terms are incomparable, returns 'Nothing'.
diffComparableTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int)) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields))
diffComparableTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields Int) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields))
diffComparableTerms construct comparable cost = recur
where recur a b
| (category <$> a) == (category <$> b) = hylo construct runCofree <$> zipTerms a b
@ -71,7 +71,7 @@ algorithmWithTerms construct t1 t2 = case (unwrap t1, unwrap t2) of
branch constructor a b = bySimilarity a b >>= annotate . constructor
-- | Run an algorithm, given functions characterizing the evaluation.
runAlgorithm :: (Functor f, GAlign f, Eq a, Eq (Record fields), Eq (f (Cofree f (Record fields))), Prologue.Foldable f, Traversable f, HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int))
runAlgorithm :: (Functor f, GAlign f, Eq a, Eq (Record fields), Eq (f (Cofree f (Record fields))), Prologue.Foldable f, Traversable f, HasField fields (Vector.Vector Double), HasField fields Int)
=> (CofreeF f (Both (Record fields)) (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -> Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to wrap up & possibly annotate every produced diff.
-> (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function to diff two subterms recursively, if they are comparable, or else return 'Nothing'.
-> SES.Cost (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to compute the cost of a given diff node.

View File

@ -6,6 +6,7 @@ import GHC.Generics
import Prologue
import Test.QuickCheck hiding (Fixed)
import SourceSpan
import Data.Hashable
-- | A node in an abstract syntax tree.
--
@ -72,7 +73,7 @@ data Syntax a f
-- | A method definition with an identifier, params, and a list of expressions.
| Method f [f] [f]
| If f f (Maybe f)
deriving (Eq, Foldable, Functor, Generic, Generic1, Mergeable, Ord, Show, Traversable)
deriving (Eq, Foldable, Functor, Generic, Generic1, Mergeable, Ord, Show, Traversable, Hashable)
-- Instances