mirror of
https://github.com/github/semantic.git
synced 2024-11-28 10:15:55 +03:00
Merge pull request #639 from github/generalize-rws-over-the-functor
Generalize rws over the functor
This commit is contained in:
commit
224332fb86
@ -1,24 +1,33 @@
|
||||
{-# LANGUAGE RankNTypes #-}
|
||||
module Data.RandomWalkSimilarity where
|
||||
|
||||
import Control.Arrow ((&&&))
|
||||
import Control.Monad.Random
|
||||
import Control.Monad.State
|
||||
import qualified Data.DList as DList
|
||||
import Data.Functor.Both hiding (fst, snd)
|
||||
import Data.Functor.Foldable as Foldable
|
||||
import Data.Hashable
|
||||
import qualified Data.KdTree.Static as KdTree
|
||||
import qualified Data.List as List
|
||||
import qualified Data.Vector as Vector
|
||||
import Diff
|
||||
import Patch
|
||||
import Prologue
|
||||
import Syntax
|
||||
import Term
|
||||
import Term ()
|
||||
import Test.QuickCheck hiding (Fixed)
|
||||
import Test.QuickCheck.Random
|
||||
|
||||
-- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an annotation, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf).
|
||||
rws :: (Hashable label, Hashable leaf, Eq leaf, Eq annotation) => (Term leaf annotation -> Term leaf annotation -> Maybe (Diff leaf annotation)) -> (annotation -> label) -> [Term leaf annotation] -> [Term leaf annotation] -> [Diff leaf annotation]
|
||||
-- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf).
|
||||
rws :: (Hashable label, Eq annotation, Prologue.Foldable f, Functor f, Eq (f (Cofree f annotation))) =>
|
||||
-- | A function which comapres a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared.
|
||||
(Cofree f annotation -> Cofree f annotation -> Maybe (Free (CofreeF f (Both annotation)) (Patch (Cofree f annotation)))) ->
|
||||
-- | A function to compute a label for an unpacked term.
|
||||
(forall b. CofreeF f annotation b -> label) ->
|
||||
-- | The old list of terms.
|
||||
[Cofree f annotation] ->
|
||||
-- | The new list of terms.
|
||||
[Cofree f annotation] ->
|
||||
[Free (CofreeF f (Both annotation)) (Patch (Cofree f annotation))]
|
||||
rws compare getLabel as bs
|
||||
| null as, null bs = []
|
||||
| null as = insert <$> bs
|
||||
@ -44,29 +53,24 @@ rws compare getLabel as bs
|
||||
deleteRemaining diffs (_, unmapped) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& delete . term) <$> unmapped)
|
||||
|
||||
-- | A term which has not yet been mapped by `rws`, along with its feature vector summary & index.
|
||||
data UnmappedTerm leaf annotation = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature :: !(Vector.Vector Double), term :: !(Term leaf annotation) }
|
||||
data UnmappedTerm a = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature :: !(Vector.Vector Double), term :: !a }
|
||||
deriving Eq
|
||||
|
||||
|
||||
-- | A `Gram` is a fixed-size view of some portion of a tree, consisting of a `stem` of _p_ labels for parent nodes, and a `base` of _q_ labels of sibling nodes. Collectively, the bag of `Gram`s for each node of a tree (e.g. as computed by `pqGrams`) form a summary of the tree.
|
||||
data Gram label = Gram { stem :: [Maybe label], base :: [Maybe label] }
|
||||
deriving (Eq, Show)
|
||||
|
||||
-- | Compute the bag of grams with stems of length _p_ and bases of length _q_, with labels computed from annotations, which summarize the entire subtree of a term.
|
||||
pqGrams :: Int -> Int -> (annotation -> label) -> Cofree (Syntax leaf) annotation -> DList.DList (Gram (label, Maybe leaf))
|
||||
pqGrams p q getLabel = cata merge . setRootBase . setRootStem . hylo go project
|
||||
where go (annotation :< functor) = cofree (Gram [] [ Just (getLabel annotation, leafValue functor) ] :< (assignParent (Just (getLabel annotation, leafValue functor)) p <$> functor))
|
||||
leafValue (Leaf s) = Just s
|
||||
leafValue _ = Nothing
|
||||
merge (head :< tail) = DList.cons head (Prologue.fold tail)
|
||||
pqGrams :: (Prologue.Foldable f, Functor f) => Int -> Int -> (forall b. CofreeF f annotation b -> label) -> Cofree f annotation -> DList.DList (Gram label)
|
||||
pqGrams p q getLabel = uncurry DList.cons . cata merge . setRootBase . setRootStem . cata go
|
||||
where go c = cofree (Gram [] [ Just (getLabel c) ] :< (assignParent (Just (getLabel c)) p <$> tailF c))
|
||||
merge (head :< tail) = let tail' = toList tail in (head, DList.fromList (windowed q setBases [] (fst <$> tail')) <> foldMap snd tail')
|
||||
assignParent parentLabel n tree
|
||||
| n > 0 = let gram :< functor = runCofree tree in cofree $ prependParent parentLabel gram :< assignSiblings (assignParent parentLabel (pred n) <$> functor)
|
||||
| n > 0 = let gram :< functor = runCofree tree in cofree $ prependParent parentLabel gram :< (assignParent parentLabel (pred n) <$> functor)
|
||||
| otherwise = tree
|
||||
prependParent parentLabel gram = gram { stem = parentLabel : stem gram }
|
||||
assignSiblings functor = case functor of
|
||||
Leaf a -> Leaf a
|
||||
Indexed a -> Indexed $ windowed q setBases [] a
|
||||
Fixed a -> Fixed $ windowed q setBases [] a
|
||||
setBases child siblings rest = let (gram :< further) = (runCofree child) in cofree (setBase gram (siblings >>= base . extract) :< further) : rest
|
||||
setBases gram siblings rest = setBase gram (siblings >>= base) : rest
|
||||
setBase gram newBase = gram { base = take q (newBase <> repeat Nothing) }
|
||||
setRootBase term = let (a :< f) = runCofree term in cofree (setBase a (base a) :< f)
|
||||
setRootStem = foldr (\ p rest -> assignParent Nothing p . rest) identity [0..p]
|
||||
|
@ -56,4 +56,6 @@ run construct comparable cost algorithm = case runFree algorithm of
|
||||
Free (ByIndex a b f) -> run construct comparable cost . f $ ses (constructAndRun construct comparable cost) cost a b
|
||||
|
||||
Free (ByRandomWalkSimilarity a b f) -> run construct comparable cost . f $ rws (constructAndRun construct comparable cost) getLabel a b
|
||||
where getLabel = category
|
||||
where getLabel (h :< t) = (category h, case t of
|
||||
Leaf s -> Just s
|
||||
_ -> Nothing)
|
||||
|
@ -17,10 +17,10 @@ spec :: Spec
|
||||
spec = parallel $ do
|
||||
describe "pqGrams" $ do
|
||||
prop "produces grams with stems of the specified length" . forAll (arbitrary `suchThat` (\ (_, p, q) -> p > 0 && q > 0)) $
|
||||
\ (term, p, q) -> pqGrams p q identity (toTerm term :: Term Text Text) `shouldSatisfy` all ((== p) . length . stem)
|
||||
\ (term, p, q) -> pqGrams p q headF (toTerm term :: Term Text Text) `shouldSatisfy` all ((== p) . length . stem)
|
||||
|
||||
prop "produces grams with bases of the specified length" . forAll (arbitrary `suchThat` (\ (_, p, q) -> p > 0 && q > 0)) $
|
||||
\ (term, p, q) -> pqGrams p q identity (toTerm term :: Term Text Text) `shouldSatisfy` all ((== q) . length . base)
|
||||
\ (term, p, q) -> pqGrams p q headF (toTerm term :: Term Text Text) `shouldSatisfy` all ((== q) . length . base)
|
||||
|
||||
describe "featureVector" $ do
|
||||
prop "produces a vector of the specified dimension" . forAll (arbitrary `suchThat` ((> 0) . Prologue.snd)) $
|
||||
@ -31,5 +31,5 @@ spec = parallel $ do
|
||||
prop "produces correct diffs" . forAll (scale (`div` 4) arbitrary) $
|
||||
\ (as, bs) -> let tas = toTerm <$> as
|
||||
tbs = toTerm <$> bs
|
||||
diff = free (Free (pure Program :< Indexed (rws compare identity tas tbs :: [Diff Text Category]))) in
|
||||
diff = free (Free (pure Program :< Indexed (rws compare headF tas tbs :: [Diff Text Category]))) in
|
||||
(beforeTerm diff, afterTerm diff) `shouldBe` (Just (cofree (Program :< Indexed tas)), Just (cofree (Program :< Indexed tbs)))
|
||||
|
Loading…
Reference in New Issue
Block a user