diff --git a/semantic-diff.cabal b/semantic-diff.cabal index c8d8899de..63d92519d 100644 --- a/semantic-diff.cabal +++ b/semantic-diff.cabal @@ -18,6 +18,7 @@ library , Category , Data.Bifunctor.Join.Arbitrary , Data.Functor.Both + , Data.RandomWalkSimilarity , Data.OrderedMap , Data.Record , Data.These.Arbitrary @@ -55,8 +56,12 @@ library , bytestring , containers , directory + , dlist , filepath + , hashable + , kdt , mtl + , MonadRandom , pointed , QuickCheck >= 2.8.1 , quickcheck-text @@ -71,7 +76,7 @@ library , comonad , protolude default-language: Haskell2010 - default-extensions: DeriveFunctor, DeriveFoldable, DeriveTraversable, DeriveGeneric, FlexibleInstances, OverloadedStrings, NoImplicitPrelude, RecordWildCards + default-extensions: DeriveFunctor, DeriveFoldable, DeriveTraversable, DeriveGeneric, FlexibleContexts, FlexibleInstances, OverloadedStrings, NoImplicitPrelude, RecordWildCards ghc-options: -Wall -fno-warn-name-shadowing -O2 -threaded -fprof-auto "-with-rtsopts=-N -p -s -h -i0.1" -j benchmark semantic-diff-bench @@ -95,18 +100,20 @@ test-suite semantic-diff-test hs-source-dirs: test main-is: Spec.hs other-modules: AlignmentSpec - , ArbitraryTerm , CorpusSpec + , Data.RandomWalkSimilarity.Spec + , Diff.Spec + , DiffSummarySpec , InterpreterSpec , OrderedMapSpec , PatchOutputSpec , TermSpec - , DiffSummarySpec build-depends: base , bifunctors , bytestring , containers , deepseq + , dlist , filepath , Glob , hspec >= 2.1.10 diff --git a/src/Alignment.hs b/src/Alignment.hs index c6bd1561a..2e16e99dc 100644 --- a/src/Alignment.hs +++ b/src/Alignment.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE FlexibleContexts, RankNTypes, ScopedTypeVariables #-} +{-# LANGUAGE RankNTypes, ScopedTypeVariables #-} module Alignment ( hasChanges , numberedRows diff --git a/src/Category.hs b/src/Category.hs index f621177dc..5dd35d817 100644 --- a/src/Category.hs +++ b/src/Category.hs @@ -1,7 +1,9 @@ module Category where import Prologue +import Data.Hashable import Data.String +import Test.QuickCheck -- | A standardized category of AST node. Used to determine the semantics for -- | semantic diffing and define comparability of nodes. @@ -28,4 +30,27 @@ data Category | ArrayLiteral -- | A non-standard category, which can be used for comparability. | Other String - deriving (Eq, Show, Ord) + deriving (Eq, Generic, Ord, Show) + + +-- Instances + +instance Hashable Category + +instance Arbitrary Category where + arbitrary = oneof + [ pure Program + , pure Error + , pure BinaryOperator + , pure DictionaryLiteral + , pure Pair + , pure FunctionCall + , pure StringLiteral + , pure IntegerLiteral + , pure SymbolLiteral + , pure ArrayLiteral + , Other <$> arbitrary + ] + + shrink (Other s) = Other <$> shrink s + shrink _ = [] diff --git a/src/Data/Functor/Both.hs b/src/Data/Functor/Both.hs index 3e4cbfae8..544b7223a 100644 --- a/src/Data/Functor/Both.hs +++ b/src/Data/Functor/Both.hs @@ -1,3 +1,4 @@ +{-# OPTIONS_GHC -fno-warn-orphans #-} module Data.Functor.Both (Both,both, runBothWith, fst, snd, module X) where import Data.Bifunctor.Join as X diff --git a/src/Data/OrderedMap.hs b/src/Data/OrderedMap.hs index 5c4eb5b26..fe789a38a 100644 --- a/src/Data/OrderedMap.hs +++ b/src/Data/OrderedMap.hs @@ -15,10 +15,11 @@ module Data.OrderedMap ( ) where import Prologue hiding (toList, empty) +import Test.QuickCheck -- | An ordered map of keys and values. newtype OrderedMap key value = OrderedMap { toList :: [(key, value)] } - deriving (Show, Eq, Functor, Foldable, Traversable) + deriving (Eq, Foldable, Functor, Generic, Ord, Show, Traversable) -- | Construct an ordered map from a list of pairs of keys and values. fromList :: [(key, value)] -> OrderedMap key value @@ -70,3 +71,7 @@ difference (OrderedMap a) (OrderedMap b) = OrderedMap $ filter ((`notElem` extan instance Eq key => Monoid (OrderedMap key value) where mempty = fromList [] mappend = union + +instance (Arbitrary key, Arbitrary value) => Arbitrary (OrderedMap key value) where + arbitrary = fromList <$> arbitrary + shrink = genericShrink diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs new file mode 100644 index 000000000..bf44d7939 --- /dev/null +++ b/src/Data/RandomWalkSimilarity.hs @@ -0,0 +1,112 @@ +module Data.RandomWalkSimilarity where + +import Control.Arrow ((&&&)) +import Control.Monad.Random +import Control.Monad.State +import Data.Bifunctor.Join +import qualified Data.DList as DList +import Data.Functor.Foldable as Foldable +import Data.Hashable +import qualified Data.KdTree.Static as KdTree +import qualified Data.List as List +import qualified Data.OrderedMap as Map +import qualified Data.Vector as Vector +import Diff +import Patch +import Prologue +import Syntax +import Term +import Test.QuickCheck hiding (Fixed) +import Test.QuickCheck.Random + +-- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an annotation, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf). +rws :: (Hashable label, Hashable leaf, Eq leaf, Ord annotation) => (Term leaf annotation -> Term leaf annotation -> Maybe (Diff leaf annotation)) -> (annotation -> label) -> [Term leaf annotation] -> [Term leaf annotation] -> [Diff leaf annotation] +rws compare getLabel as bs + | null as, null bs = [] + | null as = insert <$> bs + | null bs = delete <$> as + | otherwise = uncurry deleteRemaining . (`runState` fas) $ traverse findNearestNeighbourTo fbs + where insert = pure . Insert + delete = pure . Delete + replace = (pure .) . Replace + (p, q, d) = (2, 2, 15) + fas = featurize <$> as + fbs = featurize <$> bs + kdas = KdTree.build (Vector.toList . fst) fas + featurize = featureVector d . pqGrams p q getLabel &&& identity + findNearestNeighbourTo kv@(_, v) = do + unmapped <- get + let (k, _) = KdTree.nearest kdas kv + case k `List.lookup` unmapped of + Nothing -> pure $! insert v + Just found -> do + put (List.delete (k, found) unmapped) + pure $! fromMaybe (replace found v) (compare found v) + deleteRemaining diffs unmapped = foldl' (flip (List.insertBy (comparing firstAnnotation))) diffs (delete . snd <$> unmapped) + +-- | Extract the annotation for the before state of a diff node. This is returned in `Maybe` because e.g. an `Insert` patch does not have an annotation for the before state. +firstAnnotation :: Diff leaf annotation -> Maybe annotation +firstAnnotation diff = case runFree diff of + Free (annotations :< _) -> Just (fst (runJoin annotations)) + Pure patch -> maybeFst (unPatch $ extract <$> patch) + +-- | A `Gram` is a fixed-size view of some portion of a tree, consisting of a `stem` of _p_ labels for parent nodes, and a `base` of _q_ labels of sibling nodes. Collectively, the bag of `Gram`s for each node of a tree (e.g. as computed by `pqGrams`) form a summary of the tree. +data Gram label = Gram { stem :: [Maybe label], base :: [Maybe label] } + deriving (Eq, Show) + +-- | Compute the bag of grams with stems of length _p_ and bases of length _q_, with labels computed from annotations, which summarize the entire subtree of a term. +pqGrams :: Int -> Int -> (annotation -> label) -> Cofree (Syntax leaf) annotation -> DList.DList (Gram (label, Maybe leaf)) +pqGrams p q getLabel = cata merge . setRootBase . setRootStem . hylo go project + where go (annotation :< functor) = cofree (Gram [] [ Just (getLabel annotation, leafValue functor) ] :< (assignParent (Just (getLabel annotation, leafValue functor)) p <$> functor)) + leafValue (Leaf s) = Just s + leafValue _ = Nothing + merge (head :< tail) = DList.cons head (Prologue.fold tail) + assignParent parentLabel n tree + | n > 0 = let gram :< functor = runCofree tree in cofree $ prependParent parentLabel gram :< assignSiblings (assignParent parentLabel (pred n) <$> functor) + | otherwise = tree + prependParent parentLabel gram = gram { stem = parentLabel : stem gram } + assignSiblings functor = case functor of + Leaf a -> Leaf a + Indexed a -> Indexed $ windowed q setBases [] a + Fixed a -> Fixed $ windowed q setBases [] a + Keyed a -> Keyed . Map.fromList $ windowed q setBasesKV [] (Map.toList a) + setBases child siblings rest = let (gram :< further) = (runCofree child) in cofree (setBase gram (siblings >>= base . extract) :< further) : rest + setBasesKV (key, child) siblings rest = let (gram :< further) = (runCofree child) in (key, cofree (setBase gram (siblings >>= base . extract . snd) :< further)) : rest + setBase gram newBase = gram { base = take q (newBase <> repeat Nothing) } + setRootBase term = let (a :< f) = runCofree term in cofree (setBase a (base a) :< f) + setRootStem = foldr (\ p rest -> assignParent Nothing p . rest) identity [0..p] + +-- | A sliding-window fold over _n_ items of a list per iteration. +windowed :: Int -> (a -> [a] -> b -> b) -> b -> [a] -> b +windowed n f seed = para alg + where alg xs = case xs of + Cons a (as, b) -> f a (take n $ a : as) b + Nil -> seed + + +-- | Compute a vector with the specified number of dimensions, as an approximation of a bag of `Gram`s summarizing a tree. +featureVector :: Hashable label => Int -> DList.DList (Gram label) -> Vector.Vector Double +featureVector d bag = sumVectors $ unitDVector . hash <$> bag + where unitDVector hash = normalize . (`evalRand` mkQCGen hash) $ Prologue.sequence (Vector.replicate d getRandom) + normalize vec = fmap (/ vmagnitude vec) vec + sumVectors = DList.foldr (Vector.zipWith (+)) (Vector.replicate d 0) + +-- | The magnitude of a Euclidean vector, i.e. its distance from the origin. +vmagnitude :: Vector.Vector Double -> Double +vmagnitude = sqrtDouble . Vector.sum . fmap (** 2) + + +-- Instances + +instance Hashable label => Hashable (Gram label) where + hashWithSalt _ = hash + hash gram = hash (stem gram <> base gram) + +-- | Construct a generator for arbitrary `Gram`s of size `(p, q)`. +gramWithPQ :: Arbitrary label => Int -> Int -> Gen (Gram label) +gramWithPQ p q = Gram <$> vectorOf p arbitrary <*> vectorOf q arbitrary + +instance Arbitrary label => Arbitrary (Gram label) where + arbitrary = join $ gramWithPQ <$> arbitrary <*> arbitrary + + shrink (Gram a b) = Gram <$> shrink a <*> shrink b diff --git a/src/Data/Record.hs b/src/Data/Record.hs index 03127f39a..5d6043771 100644 --- a/src/Data/Record.hs +++ b/src/Data/Record.hs @@ -1,7 +1,8 @@ -{-# LANGUAGE DataKinds, FlexibleContexts, GADTs, KindSignatures, MultiParamTypeClasses, TypeOperators #-} +{-# LANGUAGE DataKinds, GADTs, KindSignatures, MultiParamTypeClasses, TypeOperators #-} module Data.Record where import Prologue +import Test.QuickCheck -- | A type-safe, extensible record structure. -- | @@ -50,3 +51,17 @@ instance (Eq h, Eq (Record t)) => Eq (Record (h ': t)) where instance Eq (Record '[]) where _ == _ = True + + +instance (Ord h, Ord (Record t)) => Ord (Record (h ': t)) where + RCons h1 t1 `compare` RCons h2 t2 = let h = h1 `compare` h2 in + if h == EQ then t1 `compare` t2 else h + +instance Ord (Record '[]) where + _ `compare` _ = EQ + + +instance Arbitrary fields => Arbitrary (Record '[fields]) where + arbitrary = RCons <$> arbitrary <*> pure RNil + + shrink (RCons h t) = RCons <$> shrink h <*> pure t diff --git a/src/Diff.hs b/src/Diff.hs index d95cd16ab..87c727d99 100644 --- a/src/Diff.hs +++ b/src/Diff.hs @@ -1,9 +1,11 @@ {-# LANGUAGE TypeFamilies, TypeSynonymInstances #-} +{-# OPTIONS_GHC -fno-warn-orphans #-} module Diff where import Prologue import Data.Functor.Foldable as Foldable -import Data.Functor.Both +import Data.Functor.Both as Both +import qualified Data.OrderedMap as Map import Patch import Syntax import Term @@ -13,8 +15,8 @@ type DiffF leaf annotation = FreeF (CofreeF (Syntax leaf) (Both annotation)) (Pa type Diff a annotation = Free (CofreeF (Syntax a) (Both annotation)) (Patch (Term a annotation)) type instance Base (Free f a) = FreeF f a -instance (Functor f) => Foldable.Foldable (Free f a) where project = runFree -instance (Functor f) => Foldable.Unfoldable (Free f a) where embed = free +instance Functor f => Foldable.Foldable (Free f a) where project = runFree +instance Functor f => Foldable.Unfoldable (Free f a) where embed = free diffSum :: (Patch (Term a annotation) -> Integer) -> Diff a annotation -> Integer diffSum patchCost diff = sum $ fmap patchCost diff @@ -22,3 +24,22 @@ diffSum patchCost diff = sum $ fmap patchCost diff -- | The sum of the node count of the diff’s patches. diffCost :: Diff a annotation -> Integer diffCost = diffSum $ patchSum termSize + +-- | Merge a diff using a function to provide the Term (in Maybe, to simplify recovery of the before/after state) for every Patch. +mergeMaybe :: (Patch (Term leaf annotation) -> Maybe (Term leaf annotation)) -> Diff leaf annotation -> Maybe (Term leaf annotation) +mergeMaybe transform = cata algebra . fmap transform + where algebra :: FreeF (CofreeF (Syntax leaf) (Both annotation)) (Maybe (Term leaf annotation)) (Maybe (Term leaf annotation)) -> Maybe (Term leaf annotation) + algebra (Pure p) = p + algebra (Free (annotations :< syntax)) = Just . cofree $ Both.fst annotations :< case syntax of + Leaf s -> Leaf s + Indexed i -> Indexed (catMaybes i) + Fixed i -> Fixed (catMaybes i) + Keyed i -> Keyed (Map.fromList (Map.toList i >>= (\ (k, v) -> maybe [] (pure . (,) k) v))) + +-- | Recover the before state of a diff. +beforeTerm :: Diff leaf annotation -> Maybe (Term leaf annotation) +beforeTerm = mergeMaybe before + +-- | Recover the after state of a diff. +afterTerm :: Diff leaf annotation -> Maybe (Term leaf annotation) +afterTerm = mergeMaybe after diff --git a/src/Diff/Arbitrary.hs b/src/Diff/Arbitrary.hs index f187a699e..d08397b7e 100644 --- a/src/Diff/Arbitrary.hs +++ b/src/Diff/Arbitrary.hs @@ -4,7 +4,6 @@ import Diff import Data.Bifunctor.Join import Data.Bifunctor.Join.Arbitrary () import Data.Functor.Foldable (unfold) -import qualified Data.List as List import qualified Data.OrderedMap as Map import Patch import Patch.Arbitrary () @@ -13,32 +12,25 @@ import Prologue import Term.Arbitrary import Test.QuickCheck hiding (Fixed) -newtype ArbitraryDiff leaf annotation = ArbitraryDiff { unArbitraryDiff :: FreeF (CofreeF (Syntax leaf) (Join (,) annotation)) (Patch (ArbitraryTerm leaf annotation)) (ArbitraryDiff leaf annotation) } +data ArbitraryDiff leaf annotation + = ArbitraryFree (Join (,) annotation) (Syntax leaf (ArbitraryDiff leaf annotation)) + | ArbitraryPure (Patch (ArbitraryTerm leaf annotation)) deriving (Show, Eq, Generic) +unArbitraryDiff :: ArbitraryDiff leaf annotation -> FreeF (CofreeF (Syntax leaf) (Join (,) annotation)) (Patch (ArbitraryTerm leaf annotation)) (ArbitraryDiff leaf annotation) +unArbitraryDiff (ArbitraryFree a s) = Free (a :< s) +unArbitraryDiff (ArbitraryPure p) = Pure p + toDiff :: ArbitraryDiff leaf annotation -> Diff leaf annotation toDiff = fmap (fmap toTerm) . unfold unArbitraryDiff diffOfSize :: (Arbitrary leaf, Arbitrary annotation) => Int -> Gen (ArbitraryDiff leaf annotation) diffOfSize n - | n <= 0 = (ArbitraryDiff .) . (Free .) . (:<) <$> arbitrary <*> syntaxOfSize n + | n <= 0 = ArbitraryFree <$> arbitrary <*> syntaxOfSize diffOfSize n | otherwise = oneof - [ (ArbitraryDiff .) . (Free .) . (:<) <$> arbitrary <*> syntaxOfSize n - , ArbitraryDiff . Pure <$> patchOfSize n ] - where syntaxOfSize n | n <= 1 = oneof $ (Leaf <$> arbitrary) : branchGeneratorsOfSize n - | otherwise = oneof $ branchGeneratorsOfSize n - branchGeneratorsOfSize n = - [ Indexed <$> childrenOfSize (pred n) - , Fixed <$> childrenOfSize (pred n) - , (Keyed .) . (Map.fromList .) . zip <$> infiniteListOf arbitrary <*> childrenOfSize (pred n) - ] - childrenOfSize n | n <= 0 = pure [] - childrenOfSize n = do - m <- choose (1, n) - first <- diffOfSize m - rest <- childrenOfSize (n - m) - pure $! first : rest - patchOfSize 1 = oneof [ Insert <$> termOfSize 1 + [ ArbitraryFree <$> arbitrary <*> syntaxOfSize diffOfSize n + , ArbitraryPure <$> patchOfSize n ] + where patchOfSize 1 = oneof [ Insert <$> termOfSize 1 , Delete <$> termOfSize 1 ] patchOfSize n = do m <- choose (1, n - 1) @@ -59,11 +51,4 @@ instance (Eq leaf, Eq annotation, Arbitrary leaf, Arbitrary annotation) => Arbit m <- choose (0, n) diffOfSize m - shrink diff@(ArbitraryDiff annotated) = case annotated of - Free (annotation :< syntax) -> (subterms diff ++) $ filter (/= diff) $ - (ArbitraryDiff .) . (Free .) . (:<) <$> shrink annotation <*> case syntax of - Leaf a -> Leaf <$> shrink a - Indexed i -> Indexed <$> (List.subsequences i >>= recursivelyShrink) - Fixed f -> Fixed <$> (List.subsequences f >>= recursivelyShrink) - Keyed k -> Keyed . Map.fromList <$> (List.subsequences (Map.toList k) >>= recursivelyShrink) - Pure patch -> ArbitraryDiff . Pure <$> shrink patch + shrink = genericShrink diff --git a/src/Diffing.hs b/src/Diffing.hs index 299dc76d6..415a6444e 100644 --- a/src/Diffing.hs +++ b/src/Diffing.hs @@ -1,4 +1,3 @@ -{-# LANGUAGE FlexibleContexts #-} module Diffing where import Prologue hiding (fst, snd) diff --git a/src/Info.hs b/src/Info.hs index 7c459f18a..cd2d3d849 100644 --- a/src/Info.hs +++ b/src/Info.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE DataKinds, FlexibleContexts, GeneralizedNewtypeDeriving #-} +{-# LANGUAGE DataKinds, GeneralizedNewtypeDeriving #-} module Info where import Data.Record @@ -7,9 +7,9 @@ import Category import Range newtype Size = Size { unSize :: Integer } - deriving (Eq, Num, Show) + deriving (Eq, Num, Ord, Show) newtype Cost = Cost { unCost :: Integer } - deriving (Eq, Num, Show) + deriving (Eq, Num, Ord, Show) type InfoFields = '[ Range, Category, Size, Cost ] diff --git a/src/Interpreter.hs b/src/Interpreter.hs index bc1aec57b..da3fc4423 100644 --- a/src/Interpreter.hs +++ b/src/Interpreter.hs @@ -1,13 +1,20 @@ module Interpreter (Comparable, DiffConstructor, diffTerms) where import Algorithm +import Category +import Data.Align import Data.Functor.Foldable import Data.Functor.Both -import qualified Data.OrderedMap as Map -import qualified Data.List as List +import Data.Hashable import Data.List ((\\)) +import qualified Data.List as List import Data.OrderedMap ((!)) +import qualified Data.OrderedMap as Map +import Data.RandomWalkSimilarity +import Data.Record +import Data.These import Diff +import Info import Operation import Patch import Prologue hiding (lookup) @@ -16,17 +23,17 @@ import Syntax import Term -- | Returns whether two terms are comparable -type Comparable a annotation = Term a annotation -> Term a annotation -> Bool +type Comparable leaf annotation = Term leaf annotation -> Term leaf annotation -> Bool -- | Constructs a diff from the CofreeF containing its annotation and syntax. This function has the opportunity to, for example, cache properties in the annotation. type DiffConstructor leaf annotation = CofreeF (Syntax leaf) (Both annotation) (Diff leaf annotation) -> Diff leaf annotation -- | Diff two terms, given a function that determines whether two terms can be compared and a cost function. -diffTerms :: (Eq a, Eq annotation) => DiffConstructor a annotation -> Comparable a annotation -> Cost (Diff a annotation) -> Term a annotation -> Term a annotation -> Diff a annotation +diffTerms :: (Eq leaf, Hashable leaf, Ord (Record fields), HasField fields Category) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Diff leaf (Record fields) diffTerms construct comparable cost a b = fromMaybe (pure $ Replace a b) $ constructAndRun construct comparable cost a b -- | Constructs an algorithm and runs it -constructAndRun :: (Eq a, Eq annotation) => DiffConstructor a annotation -> Comparable a annotation -> Cost (Diff a annotation) -> Term a annotation -> Term a annotation -> Maybe (Diff a annotation) +constructAndRun :: (Eq leaf, Hashable leaf, Ord (Record fields), HasField fields Category) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields)) constructAndRun _ comparable _ a b | not $ comparable a b = Nothing constructAndRun construct _ _ a b | (() <$ a) == (() <$ b) = hylo construct runCofree <$> zipTerms a b @@ -41,21 +48,23 @@ constructAndRun construct comparable cost t1 t2 = annotate = pure . construct . (both annotation1 annotation2 :<) -- | Runs the diff algorithm -run :: (Eq a, Eq annotation) => DiffConstructor a annotation -> Comparable a annotation -> Cost (Diff a annotation) -> Algorithm a annotation (Diff a annotation) -> Maybe (Diff a annotation) +run :: (Eq leaf, Hashable leaf, Ord (Record fields), HasField fields Category) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Algorithm leaf (Record fields) (Diff leaf (Record fields)) -> Maybe (Diff leaf (Record fields)) run construct comparable cost algorithm = case runFree algorithm of Pure diff -> Just diff Free (Recursive t1 t2 f) -> run construct comparable cost . f $ recur a b where (annotation1 :< a, annotation2 :< b) = (runCofree t1, runCofree t2) annotate = construct . (both annotation1 annotation2 :<) - recur (Indexed a') (Indexed b') | length a' == length b' = annotate . Indexed $ zipWith (diffTerms construct comparable cost) a' b' - recur (Fixed a') (Fixed b') | length a' == length b' = annotate . Fixed $ zipWith (diffTerms construct comparable cost) a' b' + recur (Indexed a') (Indexed b') = annotate . Indexed $ alignWith diffThese a' b' + recur (Fixed a') (Fixed b') = annotate . Fixed $ alignWith diffThese a' b' recur (Keyed a') (Keyed b') | Map.keys a' == bKeys = annotate . Keyed . Map.fromList . fmap repack $ bKeys where bKeys = Map.keys b' repack key = (key, interpretInBoth key a' b') interpretInBoth key x y = diffTerms construct comparable cost (x ! key) (y ! key) recur _ _ = pure $ Replace (cofree (annotation1 :< a)) (cofree (annotation2 :< b)) + diffThese = these (pure . Delete) (pure . Insert) (diffTerms construct comparable cost) + Free (ByKey a b f) -> run construct comparable cost $ f byKey where byKey = Map.fromList $ toKeyValue <$> List.union aKeys bKeys toKeyValue key | key `List.elem` deleted = (key, pure . Delete $ a ! key) @@ -67,3 +76,6 @@ run construct comparable cost algorithm = case runFree algorithm of inserted = bKeys \\ aKeys Free (ByIndex a b f) -> run construct comparable cost . f $ ses (constructAndRun construct comparable cost) cost a b + + Free (ByRandomWalkSimilarity a b f) -> run construct comparable cost . f $ rws (constructAndRun construct comparable cost) getLabel a b + where getLabel = category diff --git a/src/Operation.hs b/src/Operation.hs index 32e553842..b920748b6 100644 --- a/src/Operation.hs +++ b/src/Operation.hs @@ -10,11 +10,11 @@ data Operation a -- ^ The type of leaves in the syntax tree, typically String, but possibly some datatype representing different leaves more precisely. annotation -- ^ The type of annotations. f -- ^ The type representing another level of the diffing algorithm. Often Algorithm. - = -- | Recursively diff two terms and pass the result to the continuation. - Recursive (Term a annotation) (Term a annotation) (Diff a annotation -> f) + = Recursive (Term a annotation) (Term a annotation) (Diff a annotation -> f) -- | Diff two dictionaries and pass the result to the continuation. | ByKey (OrderedMap Text (Term a annotation)) (OrderedMap Text (Term a annotation)) (OrderedMap Text (Diff a annotation) -> f) -- | Diff two arrays and pass the result to the continuation. | ByIndex [Term a annotation] [Term a annotation] ([Diff a annotation] -> f) + | ByRandomWalkSimilarity [Term a annotation] [Term a annotation] ([Diff a annotation] -> f) deriving Functor diff --git a/src/Parser.hs b/src/Parser.hs index 44a9a9114..9e04e07e5 100644 --- a/src/Parser.hs +++ b/src/Parser.hs @@ -1,4 +1,3 @@ -{-# LANGUAGE FlexibleContexts #-} module Parser where import Prologue hiding (Constructor) diff --git a/src/Patch.hs b/src/Patch.hs index d5e38583c..d1500119c 100644 --- a/src/Patch.hs +++ b/src/Patch.hs @@ -12,19 +12,19 @@ import Data.These import Prologue -- | An operation to replace, insert, or delete an item. -data Patch a = - Replace a a +data Patch a + = Replace a a | Insert a | Delete a - deriving (Eq, Foldable, Functor, Show, Traversable) + deriving (Eq, Foldable, Functor, Ord, Show, Traversable) -- | Return the item from the after side of the patch. after :: Patch a -> Maybe a -after = maybeFst . unPatch +after = maybeSnd . unPatch -- | Return the item from the before side of the patch. before :: Patch a -> Maybe a -before = maybeSnd . unPatch +before = maybeFst . unPatch -- | Return both sides of a patch. unPatch :: Patch a -> These a a diff --git a/src/SplitDiff.hs b/src/SplitDiff.hs index 30011fde6..9dc6459f7 100644 --- a/src/SplitDiff.hs +++ b/src/SplitDiff.hs @@ -1,4 +1,3 @@ -{-# LANGUAGE FlexibleContexts #-} module SplitDiff where import Data.Record diff --git a/src/Syntax.hs b/src/Syntax.hs index 80657ea46..a7168cfd4 100644 --- a/src/Syntax.hs +++ b/src/Syntax.hs @@ -1,8 +1,10 @@ module Syntax where import Prologue -import Data.OrderedMap +import Data.OrderedMap as Map +import Data.Text.Arbitrary () import qualified Data.Text as T +import Test.QuickCheck hiding (Fixed) -- | A node in an abstract syntax tree. data Syntax @@ -17,4 +19,27 @@ data Syntax | Fixed [f] -- | A branch of child nodes indexed by some String identity. This is useful for identifying e.g. methods & properties in a class scope by their names. Note that comments can generally occur in these scopes as well; one strategy for dealing with this is to identify comments by their text in the source. | Keyed (OrderedMap T.Text f) - deriving (Functor, Show, Eq, Foldable, Traversable) + deriving (Eq, Foldable, Functor, Generic, Ord, Show, Traversable) + + +-- Instances + +syntaxOfSize :: Arbitrary leaf => (Int -> Gen f) -> Int -> Gen (Syntax leaf f) +syntaxOfSize recur n | n <= 1 = oneof $ (Leaf <$> arbitrary) : branchGeneratorsOfSize n + | otherwise = oneof $ branchGeneratorsOfSize n + where branchGeneratorsOfSize n = + [ Indexed <$> childrenOfSize (pred n) + , Fixed <$> childrenOfSize (pred n) + , (Keyed .) . (Map.fromList .) . zip <$> infiniteListOf arbitrary <*> childrenOfSize (pred n) + ] + childrenOfSize n | n <= 0 = pure [] + childrenOfSize n = do + m <- choose (1, n) + first <- recur m + rest <- childrenOfSize (n - m) + pure $! first : rest + +instance (Arbitrary leaf, Arbitrary f) => Arbitrary (Syntax leaf f) where + arbitrary = sized (syntaxOfSize (`resize` arbitrary) ) + + shrink = genericShrink diff --git a/src/Term.hs b/src/Term.hs index 1f4b915f8..c76207f10 100644 --- a/src/Term.hs +++ b/src/Term.hs @@ -1,4 +1,5 @@ {-# LANGUAGE TypeFamilies, TypeSynonymInstances #-} +{-# OPTIONS_GHC -fno-warn-orphans #-} module Term where import Prologue diff --git a/src/Term/Arbitrary.hs b/src/Term/Arbitrary.hs index 2ffc28fb1..2c0921da4 100644 --- a/src/Term/Arbitrary.hs +++ b/src/Term/Arbitrary.hs @@ -2,52 +2,36 @@ module Term.Arbitrary where import Data.Functor.Foldable (Base, cata, unfold, Unfoldable(embed)) -import qualified Data.List as List -import qualified Data.OrderedMap as Map import Data.Text.Arbitrary () import Prologue import Syntax import Term import Test.QuickCheck hiding (Fixed) -newtype ArbitraryTerm leaf annotation = ArbitraryTerm { unArbitraryTerm :: TermF leaf annotation (ArbitraryTerm leaf annotation) } +data ArbitraryTerm leaf annotation = ArbitraryTerm annotation (Syntax leaf (ArbitraryTerm leaf annotation)) deriving (Show, Eq, Generic) +unArbitraryTerm :: ArbitraryTerm leaf annotation -> TermF leaf annotation (ArbitraryTerm leaf annotation) +unArbitraryTerm (ArbitraryTerm a s) = a :< s + toTerm :: ArbitraryTerm leaf annotation -> Term leaf annotation toTerm = unfold unArbitraryTerm termOfSize :: (Arbitrary leaf, Arbitrary annotation) => Int -> Gen (ArbitraryTerm leaf annotation) -termOfSize n = (ArbitraryTerm .) . (:<) <$> arbitrary <*> syntaxOfSize n - where syntaxOfSize n | n <= 1 = oneof $ (Leaf <$> arbitrary) : branchGeneratorsOfSize n - | otherwise = oneof $ branchGeneratorsOfSize n - branchGeneratorsOfSize n = - [ Indexed <$> childrenOfSize (pred n) - , Fixed <$> childrenOfSize (pred n) - , (Keyed .) . (Map.fromList .) . zip <$> infiniteListOf arbitrary <*> childrenOfSize (pred n) - ] - childrenOfSize n | n <= 0 = pure [] - childrenOfSize n = do - m <- choose (1, n) - first <- termOfSize m - rest <- childrenOfSize (n - m) - pure $! first : rest +termOfSize n = ArbitraryTerm <$> arbitrary <*> syntaxOfSize termOfSize n arbitraryTermSize :: ArbitraryTerm leaf annotation -> Int arbitraryTermSize = cata (succ . sum) . toTerm + -- Instances -type instance Base (ArbitraryTerm leaf annotation) = CofreeF (Syntax leaf) annotation -instance Unfoldable (ArbitraryTerm leaf annotation) where embed = ArbitraryTerm +type instance Base (ArbitraryTerm leaf annotation) = TermF leaf annotation +instance Unfoldable (ArbitraryTerm leaf annotation) where embed (a :< s) = ArbitraryTerm a s instance (Eq leaf, Eq annotation, Arbitrary leaf, Arbitrary annotation) => Arbitrary (ArbitraryTerm leaf annotation) where arbitrary = sized $ \ n -> do m <- choose (0, n) termOfSize m - shrink term@(ArbitraryTerm (annotation :< syntax)) = (subterms term ++) $ filter (/= term) $ - (ArbitraryTerm .) . (:<) <$> shrink annotation <*> case syntax of - Leaf a -> Leaf <$> shrink a - Indexed i -> Indexed <$> (List.subsequences i >>= recursivelyShrink) - Fixed f -> Fixed <$> (List.subsequences f >>= recursivelyShrink) - Keyed k -> Keyed . Map.fromList <$> (List.subsequences (Map.toList k) >>= recursivelyShrink) + shrink = genericShrink diff --git a/test/AlignmentSpec.hs b/test/AlignmentSpec.hs index 1c8757f0e..13730d8a2 100644 --- a/test/AlignmentSpec.hs +++ b/test/AlignmentSpec.hs @@ -1,7 +1,6 @@ module AlignmentSpec where import Alignment -import ArbitraryTerm () import Control.Arrow ((&&&)) import Control.Monad.State import Data.Align hiding (align) @@ -49,7 +48,7 @@ spec = parallel $ do prop "covers every input line" $ \ elements -> let (_, children, ranges) = toAlignBranchInputs elements in - join <$> (traverse (modifyJoin (fromThese [] []) . fmap pure . fmap Prologue.fst) (alignBranch Prologue.snd children ranges)) `shouldBe` ranges + join <$> (traverse (modifyJoin (fromThese [] []) . fmap (pure . Prologue.fst)) (alignBranch Prologue.snd children ranges)) `shouldBe` ranges prop "covers every input child" $ \ elements -> let (_, children, ranges) = toAlignBranchInputs elements in @@ -194,7 +193,7 @@ spec = parallel $ do (info 8 9 `branch` [ info 8 9 `leaf` "c" ])) ] - describe "numberedRows" $ + describe "numberedRows" $ do prop "counts only non-empty values" $ \ xs -> counts (numberedRows (xs :: [Join These Char])) `shouldBe` length . catMaybes <$> Join (unalign (runJoin <$> xs)) @@ -237,9 +236,9 @@ keysOfAlignedChildren lines = lines >>= these identity identity (++) . runJoin . instance Arbitrary BranchElement where arbitrary = oneof [ key >>= \ key -> Child key <$> joinTheseOf (contents key) , Margin <$> joinTheseOf margin ] - where key = listOf1 (elements (['a'..'z'] ++ ['A'..'Z'] ++ ['0'..'9'])) + where key = listOf1 (elements (['a'..'z'] <> ['A'..'Z'] <> ['0'..'9'])) contents key = wrap key <$> listOf (padding '*') - wrap key contents = "(" ++ key ++ contents ++ ")" :: String + wrap key contents = "(" <> key <> contents <> ")" :: String margin = listOf (padding '-') padding char = frequency [ (10, pure char) , (1, pure '\n') ] diff --git a/test/ArbitraryTerm.hs b/test/ArbitraryTerm.hs deleted file mode 100644 index 5590f1749..000000000 --- a/test/ArbitraryTerm.hs +++ /dev/null @@ -1,16 +0,0 @@ -{-# OPTIONS_GHC -fno-warn-orphans #-} -module ArbitraryTerm where - -import Data.Text.Arbitrary () -import Data.These.Arbitrary () -import Prologue hiding (fst, snd) -import Source hiding ((++)) -import Test.QuickCheck hiding (Fixed) - -data CategorySet = A | B | C | D deriving (Eq, Show) - -instance Arbitrary CategorySet where - arbitrary = elements [ A, B, C, D ] - -instance Arbitrary a => Arbitrary (Source a) where - arbitrary = Source.fromList <$> arbitrary diff --git a/test/Data/RandomWalkSimilarity/Spec.hs b/test/Data/RandomWalkSimilarity/Spec.hs new file mode 100644 index 000000000..cdecd4a25 --- /dev/null +++ b/test/Data/RandomWalkSimilarity/Spec.hs @@ -0,0 +1,39 @@ +module Data.RandomWalkSimilarity.Spec where + +import Category +import Data.DList as DList hiding (toList) +import Data.RandomWalkSimilarity +import qualified Data.Set as Set +import Diff +import Patch +import Prologue +import Syntax +import Term +import Term.Arbitrary +import Test.Hspec +import Test.Hspec.QuickCheck +import Test.QuickCheck + +spec :: Spec +spec = parallel $ do + describe "pqGrams" $ do + prop "produces grams with stems of the specified length" . forAll (arbitrary `suchThat` (\ (_, p, q) -> p > 0 && q > 0)) $ + \ (term, p, q) -> pqGrams p q identity (toTerm term :: Term Text Text) `shouldSatisfy` all ((== p) . length . stem) + + prop "produces grams with bases of the specified length" . forAll (arbitrary `suchThat` (\ (_, p, q) -> p > 0 && q > 0)) $ + \ (term, p, q) -> pqGrams p q identity (toTerm term :: Term Text Text) `shouldSatisfy` all ((== q) . length . base) + + describe "featureVector" $ do + prop "produces a vector of the specified dimension" . forAll (arbitrary `suchThat` ((> 0) . Prologue.snd)) $ + \ (grams, d) -> length (featureVector d (fromList (grams :: [Gram Text]))) `shouldBe` d + + describe "rws" $ do + let compare a b = if extract a == extract b then Just (pure (Replace a b)) else Nothing + prop "produces correct diffs" . forAll (scale (`div` 4) arbitrary) $ + \ (as, bs) -> let tas = toTerm <$> as + tbs = toTerm <$> bs + diff = free (Free (pure Program :< Indexed (rws compare identity tas tbs :: [Diff Text Category]))) in + (childrenOf <$> beforeTerm diff, childrenOf <$> afterTerm diff) `shouldBe` (Just (Set.fromList tas), Just (Set.fromList tbs)) + +childrenOf :: (Ord leaf, Ord annotation) => Term leaf annotation -> Set.Set (Term leaf annotation) +childrenOf = Set.fromList . toList . unwrap diff --git a/test/Diff/Spec.hs b/test/Diff/Spec.hs new file mode 100644 index 000000000..38e2b536a --- /dev/null +++ b/test/Diff/Spec.hs @@ -0,0 +1,38 @@ +{-# LANGUAGE DataKinds #-} +module Diff.Spec where + +import Category +import Data.Record +import Data.Text.Arbitrary () +import Diff +import Diff.Arbitrary +import Interpreter +import Prologue +import Term.Arbitrary +import Test.Hspec +import Test.Hspec.QuickCheck +import Test.QuickCheck + +spec :: Spec +spec = parallel $ do + prop "equality is reflexive" $ + \ a b -> let diff = diffTerms (free . Free) (==) diffCost (toTerm a) (toTerm (b :: ArbitraryTerm Text (Record '[Category]))) in + diff `shouldBe` diff + + prop "equal terms produce identity diffs" $ + \ a -> let term = toTerm (a :: ArbitraryTerm Text (Record '[Category])) in + diffCost (diffTerms (free . Free) (==) diffCost term term) `shouldBe` 0 + + describe "beforeTerm" $ do + prop "recovers the before term" $ + \ a b -> let diff = diffTerms (free . Free) (==) diffCost (toTerm a) (toTerm (b :: ArbitraryTerm Text (Record '[Category]))) in + beforeTerm diff `shouldBe` Just (toTerm a) + + describe "afterTerm" $ do + prop "recovers the after term" $ + \ a b -> let diff = diffTerms (free . Free) (==) diffCost (toTerm a) (toTerm (b :: ArbitraryTerm Text (Record '[Category]))) in + afterTerm diff `shouldBe` Just (toTerm b) + + describe "ArbitraryDiff" $ do + prop "generates diffs of a specific size" . forAll ((arbitrary >>= \ n -> (,) n <$> diffOfSize n) `suchThat` ((> 0) . fst)) $ + \ (n, diff) -> arbitraryDiffSize (diff :: ArbitraryDiff Text ()) `shouldBe` n diff --git a/test/InterpreterSpec.hs b/test/InterpreterSpec.hs index fb8422cee..0b62d9790 100644 --- a/test/InterpreterSpec.hs +++ b/test/InterpreterSpec.hs @@ -1,23 +1,30 @@ +{-# LANGUAGE DataKinds #-} module InterpreterSpec where -import Prologue +import Category import Diff import Data.Record -import qualified Interpreter as I -import Range -import Syntax +import Interpreter import Patch -import Info -import Category +import Prologue +import Syntax +import Term.Arbitrary import Test.Hspec +import Test.Hspec.QuickCheck spec :: Spec -spec = parallel $ - describe "interpret" $ +spec = parallel $ do + describe "interpret" $ do it "returns a replacement when comparing two unicode equivalent terms" $ - I.diffTerms (free . Free) ((==) `on` extract) diffCost (cofree ((range .: StringLiteral .: 0 .: 0 .: RNil) :< Leaf "t\776")) (cofree ((range2 .: StringLiteral .: 0 .: 0 .: RNil) :< Leaf "\7831")) `shouldBe` - free (Pure (Replace (cofree ((range .: StringLiteral .: 0 .: 0 .: RNil) :< Leaf "t\776")) (cofree ((range2 .: StringLiteral .: 0 .: 0 .: RNil) :< Leaf "\7831")))) + let termA = cofree $ (StringLiteral .: RNil) :< Leaf ("t\776" :: Text) + termB = cofree $ (StringLiteral .: RNil) :< Leaf "\7831" in + diffTerms (free . Free) ((==) `on` extract) diffCost termA termB `shouldBe` free (Pure (Replace termA termB)) - where - range = Range 0 2 - range2 = Range 0 1 + prop "produces correct diffs" $ + \ a b -> let diff = diffTerms (free . Free) ((==) `on` extract) diffCost (toTerm a) (toTerm b) :: Diff Text (Record '[Category]) in + (beforeTerm diff, afterTerm diff) `shouldBe` (Just (toTerm a), Just (toTerm b)) + + prop "constructs zero-cost diffs of equal terms" $ + \ a -> let term = toTerm a + diff = diffTerms (free . Free) ((==) `on` extract) diffCost term term :: Diff Text (Record '[Category]) in + diffCost diff `shouldBe` 0 diff --git a/test/Spec.hs b/test/Spec.hs index 035688e32..e38ac01e2 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -3,19 +3,23 @@ module Main where import Prologue import qualified AlignmentSpec import qualified CorpusSpec +import qualified Data.RandomWalkSimilarity.Spec +import qualified Diff.Spec +import qualified DiffSummarySpec import qualified InterpreterSpec import qualified OrderedMapSpec import qualified PatchOutputSpec import qualified TermSpec -import qualified DiffSummarySpec import Test.Hspec main :: IO () -main = hspec $ parallel $ do +main = hspec . parallel $ do describe "Alignment" AlignmentSpec.spec describe "Corpus" CorpusSpec.spec + describe "Data.RandomWalkSimilarity" Data.RandomWalkSimilarity.Spec.spec + describe "Diff.Spec" Diff.Spec.spec + describe "DiffSummary" DiffSummarySpec.spec describe "Interpreter" InterpreterSpec.spec describe "OrderedMap" OrderedMapSpec.spec describe "PatchOutput" PatchOutputSpec.spec describe "Term" TermSpec.spec - describe "DiffSummary" DiffSummarySpec.spec diff --git a/test/TermSpec.hs b/test/TermSpec.hs index 42a1f9c28..f0a209557 100644 --- a/test/TermSpec.hs +++ b/test/TermSpec.hs @@ -1,11 +1,6 @@ module TermSpec where -import ArbitraryTerm -import Data.String import Data.Text.Arbitrary () -import Diff -import Diff.Arbitrary -import Interpreter import Prologue import Term.Arbitrary import Test.Hspec @@ -16,21 +11,8 @@ spec :: Spec spec = parallel $ do describe "Term" $ do prop "equality is reflexive" $ - \ a -> toTerm a == toTerm (a :: ArbitraryTerm String ()) + \ a -> toTerm a `shouldBe` toTerm (a :: ArbitraryTerm Text ()) - describe "ArbitraryTerm" $ - prop "generates terms of a specific size" $ forAll ((arbitrary >>= \ n -> (,) n <$> termOfSize n) `suchThat` ((> 0) . fst)) $ - \ (n, term) -> arbitraryTermSize (term :: ArbitraryTerm String ()) `shouldBe` n - - describe "ArbitraryDiff" $ - prop "generates diffs of a specific size" $ forAll ((arbitrary >>= \ n -> (,) n <$> diffOfSize n) `suchThat` ((> 0) . fst)) $ - \ (n, diff) -> arbitraryDiffSize (diff :: ArbitraryDiff String ()) `shouldBe` n - - describe "Diff" $ do - prop "equality is reflexive" $ - \ a b -> let diff = diffTerms (free . Free) (==) diffCost (toTerm a) (toTerm (b :: ArbitraryTerm String CategorySet)) in - diff == diff - - prop "equal terms produce identity diffs" $ - \ a -> let term = toTerm (a :: ArbitraryTerm String CategorySet) in - diffCost (diffTerms (free . Free) (==) diffCost term term) == 0 + describe "ArbitraryTerm" $ do + prop "generates terms of a specific size" . forAll ((arbitrary >>= \ n -> (,) n <$> termOfSize n) `suchThat` ((> 0) . fst)) $ + \ (n, term) -> arbitraryTermSize (term :: ArbitraryTerm Text ()) `shouldBe` n