1
1
mirror of https://github.com/github/semantic.git synced 2024-11-24 17:04:47 +03:00

Merge pull request #639 from github/generalize-rws-over-the-functor

Generalize rws over the functor
This commit is contained in:
Josh Vera 2016-07-22 11:24:00 -04:00 committed by GitHub
commit 224332fb86
3 changed files with 28 additions and 22 deletions

View File

@ -1,24 +1,33 @@
{-# LANGUAGE RankNTypes #-}
module Data.RandomWalkSimilarity where
import Control.Arrow ((&&&))
import Control.Monad.Random
import Control.Monad.State
import qualified Data.DList as DList
import Data.Functor.Both hiding (fst, snd)
import Data.Functor.Foldable as Foldable
import Data.Hashable
import qualified Data.KdTree.Static as KdTree
import qualified Data.List as List
import qualified Data.Vector as Vector
import Diff
import Patch
import Prologue
import Syntax
import Term
import Term ()
import Test.QuickCheck hiding (Fixed)
import Test.QuickCheck.Random
-- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an annotation, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf).
rws :: (Hashable label, Hashable leaf, Eq leaf, Eq annotation) => (Term leaf annotation -> Term leaf annotation -> Maybe (Diff leaf annotation)) -> (annotation -> label) -> [Term leaf annotation] -> [Term leaf annotation] -> [Diff leaf annotation]
-- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf).
rws :: (Hashable label, Eq annotation, Prologue.Foldable f, Functor f, Eq (f (Cofree f annotation))) =>
-- | A function which comapres a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared.
(Cofree f annotation -> Cofree f annotation -> Maybe (Free (CofreeF f (Both annotation)) (Patch (Cofree f annotation)))) ->
-- | A function to compute a label for an unpacked term.
(forall b. CofreeF f annotation b -> label) ->
-- | The old list of terms.
[Cofree f annotation] ->
-- | The new list of terms.
[Cofree f annotation] ->
[Free (CofreeF f (Both annotation)) (Patch (Cofree f annotation))]
rws compare getLabel as bs
| null as, null bs = []
| null as = insert <$> bs
@ -44,29 +53,24 @@ rws compare getLabel as bs
deleteRemaining diffs (_, unmapped) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& delete . term) <$> unmapped)
-- | A term which has not yet been mapped by `rws`, along with its feature vector summary & index.
data UnmappedTerm leaf annotation = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature :: !(Vector.Vector Double), term :: !(Term leaf annotation) }
data UnmappedTerm a = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature :: !(Vector.Vector Double), term :: !a }
deriving Eq
-- | A `Gram` is a fixed-size view of some portion of a tree, consisting of a `stem` of _p_ labels for parent nodes, and a `base` of _q_ labels of sibling nodes. Collectively, the bag of `Gram`s for each node of a tree (e.g. as computed by `pqGrams`) form a summary of the tree.
data Gram label = Gram { stem :: [Maybe label], base :: [Maybe label] }
deriving (Eq, Show)
-- | Compute the bag of grams with stems of length _p_ and bases of length _q_, with labels computed from annotations, which summarize the entire subtree of a term.
pqGrams :: Int -> Int -> (annotation -> label) -> Cofree (Syntax leaf) annotation -> DList.DList (Gram (label, Maybe leaf))
pqGrams p q getLabel = cata merge . setRootBase . setRootStem . hylo go project
where go (annotation :< functor) = cofree (Gram [] [ Just (getLabel annotation, leafValue functor) ] :< (assignParent (Just (getLabel annotation, leafValue functor)) p <$> functor))
leafValue (Leaf s) = Just s
leafValue _ = Nothing
merge (head :< tail) = DList.cons head (Prologue.fold tail)
pqGrams :: (Prologue.Foldable f, Functor f) => Int -> Int -> (forall b. CofreeF f annotation b -> label) -> Cofree f annotation -> DList.DList (Gram label)
pqGrams p q getLabel = uncurry DList.cons . cata merge . setRootBase . setRootStem . cata go
where go c = cofree (Gram [] [ Just (getLabel c) ] :< (assignParent (Just (getLabel c)) p <$> tailF c))
merge (head :< tail) = let tail' = toList tail in (head, DList.fromList (windowed q setBases [] (fst <$> tail')) <> foldMap snd tail')
assignParent parentLabel n tree
| n > 0 = let gram :< functor = runCofree tree in cofree $ prependParent parentLabel gram :< assignSiblings (assignParent parentLabel (pred n) <$> functor)
| n > 0 = let gram :< functor = runCofree tree in cofree $ prependParent parentLabel gram :< (assignParent parentLabel (pred n) <$> functor)
| otherwise = tree
prependParent parentLabel gram = gram { stem = parentLabel : stem gram }
assignSiblings functor = case functor of
Leaf a -> Leaf a
Indexed a -> Indexed $ windowed q setBases [] a
Fixed a -> Fixed $ windowed q setBases [] a
setBases child siblings rest = let (gram :< further) = (runCofree child) in cofree (setBase gram (siblings >>= base . extract) :< further) : rest
setBases gram siblings rest = setBase gram (siblings >>= base) : rest
setBase gram newBase = gram { base = take q (newBase <> repeat Nothing) }
setRootBase term = let (a :< f) = runCofree term in cofree (setBase a (base a) :< f)
setRootStem = foldr (\ p rest -> assignParent Nothing p . rest) identity [0..p]

View File

@ -56,4 +56,6 @@ run construct comparable cost algorithm = case runFree algorithm of
Free (ByIndex a b f) -> run construct comparable cost . f $ ses (constructAndRun construct comparable cost) cost a b
Free (ByRandomWalkSimilarity a b f) -> run construct comparable cost . f $ rws (constructAndRun construct comparable cost) getLabel a b
where getLabel = category
where getLabel (h :< t) = (category h, case t of
Leaf s -> Just s
_ -> Nothing)

View File

@ -17,10 +17,10 @@ spec :: Spec
spec = parallel $ do
describe "pqGrams" $ do
prop "produces grams with stems of the specified length" . forAll (arbitrary `suchThat` (\ (_, p, q) -> p > 0 && q > 0)) $
\ (term, p, q) -> pqGrams p q identity (toTerm term :: Term Text Text) `shouldSatisfy` all ((== p) . length . stem)
\ (term, p, q) -> pqGrams p q headF (toTerm term :: Term Text Text) `shouldSatisfy` all ((== p) . length . stem)
prop "produces grams with bases of the specified length" . forAll (arbitrary `suchThat` (\ (_, p, q) -> p > 0 && q > 0)) $
\ (term, p, q) -> pqGrams p q identity (toTerm term :: Term Text Text) `shouldSatisfy` all ((== q) . length . base)
\ (term, p, q) -> pqGrams p q headF (toTerm term :: Term Text Text) `shouldSatisfy` all ((== q) . length . base)
describe "featureVector" $ do
prop "produces a vector of the specified dimension" . forAll (arbitrary `suchThat` ((> 0) . Prologue.snd)) $
@ -31,5 +31,5 @@ spec = parallel $ do
prop "produces correct diffs" . forAll (scale (`div` 4) arbitrary) $
\ (as, bs) -> let tas = toTerm <$> as
tbs = toTerm <$> bs
diff = free (Free (pure Program :< Indexed (rws compare identity tas tbs :: [Diff Text Category]))) in
diff = free (Free (pure Program :< Indexed (rws compare headF tas tbs :: [Diff Text Category]))) in
(beforeTerm diff, afterTerm diff) `shouldBe` (Just (cofree (Program :< Indexed tas)), Just (cofree (Program :< Indexed tbs)))