mirror of
https://github.com/github/semantic.git
synced 2025-01-03 13:02:37 +03:00
Keep a hash for every node instead of a vector
This commit is contained in:
parent
daba1fb2ef
commit
d16cdefe34
@ -34,7 +34,7 @@ import Test.QuickCheck.Random
|
||||
import qualified Data.PQueue.Prio.Max as PQueue
|
||||
|
||||
-- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf).
|
||||
rws :: forall f fields. (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int))
|
||||
rws :: forall f fields. (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double), HasField fields Int)
|
||||
=> (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function which compares a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared.
|
||||
-> [Cofree f (Record fields)] -- ^ The list of old terms.
|
||||
-> [Cofree f (Record fields)] -- ^ The list of new terms.
|
||||
@ -53,7 +53,7 @@ rws compare as bs
|
||||
fmap snd
|
||||
-- Modified xydiff + RWS
|
||||
-- 1. Annotate each node with a unique key top down based off its categories and termIndex?
|
||||
-- 1. Construct two priority queues of hash values for each node ordered by max weight
|
||||
-- 2. Construct two priority queues of hash values for each node ordered by max weight
|
||||
-- 3. Try to find matchings starting with the heaviest nodes
|
||||
-- 4. Use structure to propagate matchings?
|
||||
-- 5. Compute the diff
|
||||
@ -62,14 +62,14 @@ rws compare as bs
|
||||
queueBs = PQueue.fromList (zipWith hashabilize [0..] bs)
|
||||
(fas, fbs) = dropEqualTerms queueAs queueBs
|
||||
|
||||
dropEqualTerms :: PQueue.MaxPQueue (Vector.Vector Int) (UnmappedHashTerm (Cofree f (Record fields))) -> PQueue.MaxPQueue (Vector.Vector Int) (UnmappedHashTerm (Cofree f (Record fields))) -> ([UnmappedTerm (Cofree f (Record fields))], [UnmappedTerm (Cofree f (Record fields))])
|
||||
dropEqualTerms :: PQueue.MaxPQueue Int (UnmappedHashTerm (Cofree f (Record fields))) -> PQueue.MaxPQueue Int (UnmappedHashTerm (Cofree f (Record fields))) -> ([UnmappedTerm (Cofree f (Record fields))], [UnmappedTerm (Cofree f (Record fields))])
|
||||
dropEqualTerms as bs = (unmappedAs, unmappedBs)
|
||||
where
|
||||
(unmappedAs, unmappedBs, _, _) = dropEqualTerms' ([], [], as, bs)
|
||||
|
||||
dropEqualTerms' (as', bs', queueA, queueB) = case (PQueue.maxViewWithKey queueA, PQueue.maxViewWithKey queueB) of
|
||||
(Just ((kA, a), queueA'), Just ((kB, b), queueB')) ->
|
||||
if kA /= kB
|
||||
if hashInt a /= hashInt b
|
||||
then dropEqualTerms' (toUnmappedTerm a : as', toUnmappedTerm b : bs', queueA', queueB')
|
||||
else dropEqualTerms' (as', bs', queueA', queueB')
|
||||
(Just ((_, a), queueA'), Nothing) -> (toUnmappedTerm a : as', bs', queueA', queueB)
|
||||
@ -77,9 +77,9 @@ rws compare as bs
|
||||
(Nothing, Nothing) -> (as', bs', queueA, queueB)
|
||||
toUnmappedTerm (UnmappedHashTerm index _ term) = UnmappedTerm index (getField (extract term)) term
|
||||
|
||||
hashabilize :: (HasField fields (Vector.Vector Int)) => Int -> Cofree f (Record fields) -> (Vector.Vector Int, UnmappedHashTerm (Cofree f (Record fields)))
|
||||
hashabilize index term = (hash, UnmappedHashTerm index hash term)
|
||||
where hash = (getField (extract term) :: Vector.Vector Int)
|
||||
hashabilize :: (HasField fields Int) => Int -> Cofree f (Record fields) -> (Int, UnmappedHashTerm (Cofree f (Record fields)))
|
||||
hashabilize index term = (termSize term, UnmappedHashTerm index hash term)
|
||||
where hash = (getField (extract term) :: Int)
|
||||
|
||||
kdas = KdTree.build (Vector.toList . feature) fas
|
||||
kdbs = KdTree.build (Vector.toList . feature) fbs
|
||||
@ -142,7 +142,7 @@ defaultM = 10
|
||||
data UnmappedTerm a = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature :: !(Vector.Vector Double), term :: !a }
|
||||
deriving Eq
|
||||
|
||||
data UnmappedHashTerm a = UnmappedHashTerm { hashTermIndex :: {-# UNPACK #-} !Int, hashVector :: !(Vector.Vector Int), hashTerm :: !a }
|
||||
data UnmappedHashTerm a = UnmappedHashTerm { hashTermIndex :: {-# UNPACK #-} !Int, hashInt :: !Int, hashTerm :: !a }
|
||||
deriving Eq
|
||||
|
||||
|
||||
@ -151,9 +151,10 @@ data Gram label = Gram { stem :: [Maybe label], base :: [Maybe label] }
|
||||
deriving (Eq, Show)
|
||||
|
||||
-- -- | Annotates a term with it's hash at each node.
|
||||
hashDecorator :: (Hashable hash, Traversable f) => (forall b. CofreeF f (Record fields) b -> hash) -> Cofree f (Record fields) -> Cofree f (Record (Vector.Vector Int ': fields))
|
||||
hashDecorator getHash = cata $ \case
|
||||
term@(record :< functor) -> cofree ((foldr (Vector.zipWith (+) . getField . extract) (Vector.singleton . hash $ getHash term) functor .: record) :< functor)
|
||||
hashDecorator :: forall hash f fields. (Hashable (f Int), Hashable hash, Traversable f) => (forall b. CofreeF f (Record fields) b -> hash) -> Cofree f (Record fields) -> Cofree f (Record (Int ': fields))
|
||||
hashDecorator getLabel = cata $ \case
|
||||
term@(record :< functor) ->
|
||||
cofree ((hash (getLabel term, fmap (getField . extract) functor :: f Int) .: record) :< functor)
|
||||
|
||||
-- | Annotates a term with a feature vector at each node, using the default values for the p, q, and d parameters.
|
||||
defaultFeatureVectorDecorator :: (Hashable label, Traversable f) => (forall b. CofreeF f (Record fields) b -> label) -> Cofree f (Record fields) -> Cofree f (Record (Vector.Vector Double ': fields))
|
||||
|
@ -44,9 +44,9 @@ import Data.Hashable
|
||||
-- | result.
|
||||
-- | Returns the rendered result strictly, so it's always fully evaluated
|
||||
-- | with respect to other IO actions.
|
||||
diffFiles :: (HasField fields Category, HasField fields Cost, HasField fields Range, Eq (Record fields)) => Parser (Syntax Text) (Record fields) -> Renderer (Record (Vector.Vector Double ': Vector.Vector Int ': fields)) -> Both SourceBlob -> IO Output
|
||||
diffFiles :: (HasField fields Category, HasField fields Cost, HasField fields Range, Eq (Record fields)) => Parser (Syntax Text) (Record fields) -> Renderer (Record (Vector.Vector Double ': Int ': fields)) -> Both SourceBlob -> IO Output
|
||||
diffFiles parser renderer sourceBlobs = do
|
||||
terms <- traverse (fmap (defaultFeatureVectorDecorator getLabel . hashDecorator getHash) . parser) sourceBlobs
|
||||
terms <- traverse (fmap (defaultFeatureVectorDecorator getLabel . hashDecorator getLabel) . parser) sourceBlobs
|
||||
|
||||
let areNullOids = runBothWith (\a b -> (oid a == nullOid || null (source a), oid b == nullOid || null (source b))) sourceBlobs
|
||||
let textDiff = case areNullOids of
|
||||
@ -64,8 +64,8 @@ diffFiles parser renderer sourceBlobs = do
|
||||
Pure patch -> uncurry both (fromThese 0 0 (unPatch (cost . extract <$> patch)))
|
||||
getLabel (h :< t) = (category h, case t of
|
||||
Leaf s -> Just s
|
||||
Syntax.Comment s -> Just s
|
||||
_ -> Nothing)
|
||||
getHash (h :< _) = (category h)
|
||||
|
||||
-- | Return a parser based on the file extension (including the ".").
|
||||
parserForType :: Text -> Parser (Syntax Text) (Record '[Range, Category])
|
||||
|
@ -25,7 +25,7 @@ type Comparable leaf annotation = Term leaf annotation -> Term leaf annotation -
|
||||
type DiffConstructor leaf annotation = CofreeF (Syntax leaf) (Both annotation) (Diff leaf annotation) -> Diff leaf annotation
|
||||
|
||||
-- | Diff two terms recursively, given functions characterizing the diffing.
|
||||
diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int))
|
||||
diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields Int)
|
||||
=> DiffConstructor leaf (Record fields) -- ^ A function to wrap up & possibly annotate every produced diff.
|
||||
-> Comparable leaf (Record fields) -- ^ A function to determine whether or not two terms should even be compared.
|
||||
-> SES.Cost (Diff leaf (Record fields)) -- ^ A function to compute the cost of a given diff node.
|
||||
@ -35,7 +35,7 @@ diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fi
|
||||
diffTerms construct comparable cost a b = fromMaybe (replacing a b) $ diffComparableTerms construct comparable cost a b
|
||||
|
||||
-- | Diff two terms recursively, given functions characterizing the diffing. If the terms are incomparable, returns 'Nothing'.
|
||||
diffComparableTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int)) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields))
|
||||
diffComparableTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double), HasField fields Int) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields))
|
||||
diffComparableTerms construct comparable cost = recur
|
||||
where recur a b
|
||||
| (category <$> a) == (category <$> b) = hylo construct runCofree <$> zipTerms a b
|
||||
@ -71,7 +71,7 @@ algorithmWithTerms construct t1 t2 = case (unwrap t1, unwrap t2) of
|
||||
branch constructor a b = bySimilarity a b >>= annotate . constructor
|
||||
|
||||
-- | Run an algorithm, given functions characterizing the evaluation.
|
||||
runAlgorithm :: (Functor f, GAlign f, Eq a, Eq (Record fields), Eq (f (Cofree f (Record fields))), Prologue.Foldable f, Traversable f, HasField fields (Vector.Vector Double), HasField fields (Vector.Vector Int))
|
||||
runAlgorithm :: (Functor f, GAlign f, Eq a, Eq (Record fields), Eq (f (Cofree f (Record fields))), Prologue.Foldable f, Traversable f, HasField fields (Vector.Vector Double), HasField fields Int)
|
||||
=> (CofreeF f (Both (Record fields)) (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -> Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to wrap up & possibly annotate every produced diff.
|
||||
-> (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function to diff two subterms recursively, if they are comparable, or else return 'Nothing'.
|
||||
-> SES.Cost (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to compute the cost of a given diff node.
|
||||
|
@ -6,6 +6,7 @@ import GHC.Generics
|
||||
import Prologue
|
||||
import Test.QuickCheck hiding (Fixed)
|
||||
import SourceSpan
|
||||
import Data.Hashable
|
||||
|
||||
-- | A node in an abstract syntax tree.
|
||||
--
|
||||
@ -72,7 +73,7 @@ data Syntax a f
|
||||
-- | A method definition with an identifier, params, and a list of expressions.
|
||||
| Method f [f] [f]
|
||||
| If f f (Maybe f)
|
||||
deriving (Eq, Foldable, Functor, Generic, Generic1, Mergeable, Ord, Show, Traversable)
|
||||
deriving (Eq, Foldable, Functor, Generic, Generic1, Mergeable, Ord, Show, Traversable, Hashable)
|
||||
|
||||
-- Instances
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user