1
1
mirror of https://github.com/github/semantic.git synced 2024-12-19 21:01:35 +03:00

Merge pull request #617 from github/random-walk-similarity

Random walk similarity
This commit is contained in:
Josh Vera 2016-07-06 14:15:32 -04:00 committed by GitHub
commit 6420010a6c
26 changed files with 387 additions and 144 deletions

View File

@ -18,6 +18,7 @@ library
, Category
, Data.Bifunctor.Join.Arbitrary
, Data.Functor.Both
, Data.RandomWalkSimilarity
, Data.OrderedMap
, Data.Record
, Data.These.Arbitrary
@ -55,8 +56,12 @@ library
, bytestring
, containers
, directory
, dlist
, filepath
, hashable
, kdt
, mtl
, MonadRandom
, pointed
, QuickCheck >= 2.8.1
, quickcheck-text
@ -71,7 +76,7 @@ library
, comonad
, protolude
default-language: Haskell2010
default-extensions: DeriveFunctor, DeriveFoldable, DeriveTraversable, DeriveGeneric, FlexibleInstances, OverloadedStrings, NoImplicitPrelude, RecordWildCards
default-extensions: DeriveFunctor, DeriveFoldable, DeriveTraversable, DeriveGeneric, FlexibleContexts, FlexibleInstances, OverloadedStrings, NoImplicitPrelude, RecordWildCards
ghc-options: -Wall -fno-warn-name-shadowing -O2 -threaded -fprof-auto "-with-rtsopts=-N -p -s -h -i0.1" -j
benchmark semantic-diff-bench
@ -95,18 +100,20 @@ test-suite semantic-diff-test
hs-source-dirs: test
main-is: Spec.hs
other-modules: AlignmentSpec
, ArbitraryTerm
, CorpusSpec
, Data.RandomWalkSimilarity.Spec
, Diff.Spec
, DiffSummarySpec
, InterpreterSpec
, OrderedMapSpec
, PatchOutputSpec
, TermSpec
, DiffSummarySpec
build-depends: base
, bifunctors
, bytestring
, containers
, deepseq
, dlist
, filepath
, Glob
, hspec >= 2.1.10

View File

@ -1,4 +1,4 @@
{-# LANGUAGE FlexibleContexts, RankNTypes, ScopedTypeVariables #-}
{-# LANGUAGE RankNTypes, ScopedTypeVariables #-}
module Alignment
( hasChanges
, numberedRows

View File

@ -1,7 +1,9 @@
module Category where
import Prologue
import Data.Hashable
import Data.String
import Test.QuickCheck
-- | A standardized category of AST node. Used to determine the semantics for
-- | semantic diffing and define comparability of nodes.
@ -28,4 +30,27 @@ data Category
| ArrayLiteral
-- | A non-standard category, which can be used for comparability.
| Other String
deriving (Eq, Show, Ord)
deriving (Eq, Generic, Ord, Show)
-- Instances
instance Hashable Category
instance Arbitrary Category where
arbitrary = oneof
[ pure Program
, pure Error
, pure BinaryOperator
, pure DictionaryLiteral
, pure Pair
, pure FunctionCall
, pure StringLiteral
, pure IntegerLiteral
, pure SymbolLiteral
, pure ArrayLiteral
, Other <$> arbitrary
]
shrink (Other s) = Other <$> shrink s
shrink _ = []

View File

@ -1,3 +1,4 @@
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Data.Functor.Both (Both,both, runBothWith, fst, snd, module X) where
import Data.Bifunctor.Join as X

View File

@ -15,10 +15,11 @@ module Data.OrderedMap (
) where
import Prologue hiding (toList, empty)
import Test.QuickCheck
-- | An ordered map of keys and values.
newtype OrderedMap key value = OrderedMap { toList :: [(key, value)] }
deriving (Show, Eq, Functor, Foldable, Traversable)
deriving (Eq, Foldable, Functor, Generic, Ord, Show, Traversable)
-- | Construct an ordered map from a list of pairs of keys and values.
fromList :: [(key, value)] -> OrderedMap key value
@ -70,3 +71,7 @@ difference (OrderedMap a) (OrderedMap b) = OrderedMap $ filter ((`notElem` extan
instance Eq key => Monoid (OrderedMap key value) where
mempty = fromList []
mappend = union
instance (Arbitrary key, Arbitrary value) => Arbitrary (OrderedMap key value) where
arbitrary = fromList <$> arbitrary
shrink = genericShrink

View File

@ -0,0 +1,112 @@
module Data.RandomWalkSimilarity where
import Control.Arrow ((&&&))
import Control.Monad.Random
import Control.Monad.State
import Data.Bifunctor.Join
import qualified Data.DList as DList
import Data.Functor.Foldable as Foldable
import Data.Hashable
import qualified Data.KdTree.Static as KdTree
import qualified Data.List as List
import qualified Data.OrderedMap as Map
import qualified Data.Vector as Vector
import Diff
import Patch
import Prologue
import Syntax
import Term
import Test.QuickCheck hiding (Fixed)
import Test.QuickCheck.Random
-- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an annotation, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf).
rws :: (Hashable label, Hashable leaf, Eq leaf, Ord annotation) => (Term leaf annotation -> Term leaf annotation -> Maybe (Diff leaf annotation)) -> (annotation -> label) -> [Term leaf annotation] -> [Term leaf annotation] -> [Diff leaf annotation]
rws compare getLabel as bs
| null as, null bs = []
| null as = insert <$> bs
| null bs = delete <$> as
| otherwise = uncurry deleteRemaining . (`runState` fas) $ traverse findNearestNeighbourTo fbs
where insert = pure . Insert
delete = pure . Delete
replace = (pure .) . Replace
(p, q, d) = (2, 2, 15)
fas = featurize <$> as
fbs = featurize <$> bs
kdas = KdTree.build (Vector.toList . fst) fas
featurize = featureVector d . pqGrams p q getLabel &&& identity
findNearestNeighbourTo kv@(_, v) = do
unmapped <- get
let (k, _) = KdTree.nearest kdas kv
case k `List.lookup` unmapped of
Nothing -> pure $! insert v
Just found -> do
put (List.delete (k, found) unmapped)
pure $! fromMaybe (replace found v) (compare found v)
deleteRemaining diffs unmapped = foldl' (flip (List.insertBy (comparing firstAnnotation))) diffs (delete . snd <$> unmapped)
-- | Extract the annotation for the before state of a diff node. This is returned in `Maybe` because e.g. an `Insert` patch does not have an annotation for the before state.
firstAnnotation :: Diff leaf annotation -> Maybe annotation
firstAnnotation diff = case runFree diff of
Free (annotations :< _) -> Just (fst (runJoin annotations))
Pure patch -> maybeFst (unPatch $ extract <$> patch)
-- | A `Gram` is a fixed-size view of some portion of a tree, consisting of a `stem` of _p_ labels for parent nodes, and a `base` of _q_ labels of sibling nodes. Collectively, the bag of `Gram`s for each node of a tree (e.g. as computed by `pqGrams`) form a summary of the tree.
data Gram label = Gram { stem :: [Maybe label], base :: [Maybe label] }
deriving (Eq, Show)
-- | Compute the bag of grams with stems of length _p_ and bases of length _q_, with labels computed from annotations, which summarize the entire subtree of a term.
pqGrams :: Int -> Int -> (annotation -> label) -> Cofree (Syntax leaf) annotation -> DList.DList (Gram (label, Maybe leaf))
pqGrams p q getLabel = cata merge . setRootBase . setRootStem . hylo go project
where go (annotation :< functor) = cofree (Gram [] [ Just (getLabel annotation, leafValue functor) ] :< (assignParent (Just (getLabel annotation, leafValue functor)) p <$> functor))
leafValue (Leaf s) = Just s
leafValue _ = Nothing
merge (head :< tail) = DList.cons head (Prologue.fold tail)
assignParent parentLabel n tree
| n > 0 = let gram :< functor = runCofree tree in cofree $ prependParent parentLabel gram :< assignSiblings (assignParent parentLabel (pred n) <$> functor)
| otherwise = tree
prependParent parentLabel gram = gram { stem = parentLabel : stem gram }
assignSiblings functor = case functor of
Leaf a -> Leaf a
Indexed a -> Indexed $ windowed q setBases [] a
Fixed a -> Fixed $ windowed q setBases [] a
Keyed a -> Keyed . Map.fromList $ windowed q setBasesKV [] (Map.toList a)
setBases child siblings rest = let (gram :< further) = (runCofree child) in cofree (setBase gram (siblings >>= base . extract) :< further) : rest
setBasesKV (key, child) siblings rest = let (gram :< further) = (runCofree child) in (key, cofree (setBase gram (siblings >>= base . extract . snd) :< further)) : rest
setBase gram newBase = gram { base = take q (newBase <> repeat Nothing) }
setRootBase term = let (a :< f) = runCofree term in cofree (setBase a (base a) :< f)
setRootStem = foldr (\ p rest -> assignParent Nothing p . rest) identity [0..p]
-- | A sliding-window fold over _n_ items of a list per iteration.
windowed :: Int -> (a -> [a] -> b -> b) -> b -> [a] -> b
windowed n f seed = para alg
where alg xs = case xs of
Cons a (as, b) -> f a (take n $ a : as) b
Nil -> seed
-- | Compute a vector with the specified number of dimensions, as an approximation of a bag of `Gram`s summarizing a tree.
featureVector :: Hashable label => Int -> DList.DList (Gram label) -> Vector.Vector Double
featureVector d bag = sumVectors $ unitDVector . hash <$> bag
where unitDVector hash = normalize . (`evalRand` mkQCGen hash) $ Prologue.sequence (Vector.replicate d getRandom)
normalize vec = fmap (/ vmagnitude vec) vec
sumVectors = DList.foldr (Vector.zipWith (+)) (Vector.replicate d 0)
-- | The magnitude of a Euclidean vector, i.e. its distance from the origin.
vmagnitude :: Vector.Vector Double -> Double
vmagnitude = sqrtDouble . Vector.sum . fmap (** 2)
-- Instances
instance Hashable label => Hashable (Gram label) where
hashWithSalt _ = hash
hash gram = hash (stem gram <> base gram)
-- | Construct a generator for arbitrary `Gram`s of size `(p, q)`.
gramWithPQ :: Arbitrary label => Int -> Int -> Gen (Gram label)
gramWithPQ p q = Gram <$> vectorOf p arbitrary <*> vectorOf q arbitrary
instance Arbitrary label => Arbitrary (Gram label) where
arbitrary = join $ gramWithPQ <$> arbitrary <*> arbitrary
shrink (Gram a b) = Gram <$> shrink a <*> shrink b

View File

@ -1,7 +1,8 @@
{-# LANGUAGE DataKinds, FlexibleContexts, GADTs, KindSignatures, MultiParamTypeClasses, TypeOperators #-}
{-# LANGUAGE DataKinds, GADTs, KindSignatures, MultiParamTypeClasses, TypeOperators #-}
module Data.Record where
import Prologue
import Test.QuickCheck
-- | A type-safe, extensible record structure.
-- |
@ -50,3 +51,17 @@ instance (Eq h, Eq (Record t)) => Eq (Record (h ': t)) where
instance Eq (Record '[]) where
_ == _ = True
instance (Ord h, Ord (Record t)) => Ord (Record (h ': t)) where
RCons h1 t1 `compare` RCons h2 t2 = let h = h1 `compare` h2 in
if h == EQ then t1 `compare` t2 else h
instance Ord (Record '[]) where
_ `compare` _ = EQ
instance Arbitrary fields => Arbitrary (Record '[fields]) where
arbitrary = RCons <$> arbitrary <*> pure RNil
shrink (RCons h t) = RCons <$> shrink h <*> pure t

View File

@ -1,9 +1,11 @@
{-# LANGUAGE TypeFamilies, TypeSynonymInstances #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Diff where
import Prologue
import Data.Functor.Foldable as Foldable
import Data.Functor.Both
import Data.Functor.Both as Both
import qualified Data.OrderedMap as Map
import Patch
import Syntax
import Term
@ -13,8 +15,8 @@ type DiffF leaf annotation = FreeF (CofreeF (Syntax leaf) (Both annotation)) (Pa
type Diff a annotation = Free (CofreeF (Syntax a) (Both annotation)) (Patch (Term a annotation))
type instance Base (Free f a) = FreeF f a
instance (Functor f) => Foldable.Foldable (Free f a) where project = runFree
instance (Functor f) => Foldable.Unfoldable (Free f a) where embed = free
instance Functor f => Foldable.Foldable (Free f a) where project = runFree
instance Functor f => Foldable.Unfoldable (Free f a) where embed = free
diffSum :: (Patch (Term a annotation) -> Integer) -> Diff a annotation -> Integer
diffSum patchCost diff = sum $ fmap patchCost diff
@ -22,3 +24,22 @@ diffSum patchCost diff = sum $ fmap patchCost diff
-- | The sum of the node count of the diffs patches.
diffCost :: Diff a annotation -> Integer
diffCost = diffSum $ patchSum termSize
-- | Merge a diff using a function to provide the Term (in Maybe, to simplify recovery of the before/after state) for every Patch.
mergeMaybe :: (Patch (Term leaf annotation) -> Maybe (Term leaf annotation)) -> Diff leaf annotation -> Maybe (Term leaf annotation)
mergeMaybe transform = cata algebra . fmap transform
where algebra :: FreeF (CofreeF (Syntax leaf) (Both annotation)) (Maybe (Term leaf annotation)) (Maybe (Term leaf annotation)) -> Maybe (Term leaf annotation)
algebra (Pure p) = p
algebra (Free (annotations :< syntax)) = Just . cofree $ Both.fst annotations :< case syntax of
Leaf s -> Leaf s
Indexed i -> Indexed (catMaybes i)
Fixed i -> Fixed (catMaybes i)
Keyed i -> Keyed (Map.fromList (Map.toList i >>= (\ (k, v) -> maybe [] (pure . (,) k) v)))
-- | Recover the before state of a diff.
beforeTerm :: Diff leaf annotation -> Maybe (Term leaf annotation)
beforeTerm = mergeMaybe before
-- | Recover the after state of a diff.
afterTerm :: Diff leaf annotation -> Maybe (Term leaf annotation)
afterTerm = mergeMaybe after

View File

@ -4,7 +4,6 @@ import Diff
import Data.Bifunctor.Join
import Data.Bifunctor.Join.Arbitrary ()
import Data.Functor.Foldable (unfold)
import qualified Data.List as List
import qualified Data.OrderedMap as Map
import Patch
import Patch.Arbitrary ()
@ -13,32 +12,25 @@ import Prologue
import Term.Arbitrary
import Test.QuickCheck hiding (Fixed)
newtype ArbitraryDiff leaf annotation = ArbitraryDiff { unArbitraryDiff :: FreeF (CofreeF (Syntax leaf) (Join (,) annotation)) (Patch (ArbitraryTerm leaf annotation)) (ArbitraryDiff leaf annotation) }
data ArbitraryDiff leaf annotation
= ArbitraryFree (Join (,) annotation) (Syntax leaf (ArbitraryDiff leaf annotation))
| ArbitraryPure (Patch (ArbitraryTerm leaf annotation))
deriving (Show, Eq, Generic)
unArbitraryDiff :: ArbitraryDiff leaf annotation -> FreeF (CofreeF (Syntax leaf) (Join (,) annotation)) (Patch (ArbitraryTerm leaf annotation)) (ArbitraryDiff leaf annotation)
unArbitraryDiff (ArbitraryFree a s) = Free (a :< s)
unArbitraryDiff (ArbitraryPure p) = Pure p
toDiff :: ArbitraryDiff leaf annotation -> Diff leaf annotation
toDiff = fmap (fmap toTerm) . unfold unArbitraryDiff
diffOfSize :: (Arbitrary leaf, Arbitrary annotation) => Int -> Gen (ArbitraryDiff leaf annotation)
diffOfSize n
| n <= 0 = (ArbitraryDiff .) . (Free .) . (:<) <$> arbitrary <*> syntaxOfSize n
| n <= 0 = ArbitraryFree <$> arbitrary <*> syntaxOfSize diffOfSize n
| otherwise = oneof
[ (ArbitraryDiff .) . (Free .) . (:<) <$> arbitrary <*> syntaxOfSize n
, ArbitraryDiff . Pure <$> patchOfSize n ]
where syntaxOfSize n | n <= 1 = oneof $ (Leaf <$> arbitrary) : branchGeneratorsOfSize n
| otherwise = oneof $ branchGeneratorsOfSize n
branchGeneratorsOfSize n =
[ Indexed <$> childrenOfSize (pred n)
, Fixed <$> childrenOfSize (pred n)
, (Keyed .) . (Map.fromList .) . zip <$> infiniteListOf arbitrary <*> childrenOfSize (pred n)
]
childrenOfSize n | n <= 0 = pure []
childrenOfSize n = do
m <- choose (1, n)
first <- diffOfSize m
rest <- childrenOfSize (n - m)
pure $! first : rest
patchOfSize 1 = oneof [ Insert <$> termOfSize 1
[ ArbitraryFree <$> arbitrary <*> syntaxOfSize diffOfSize n
, ArbitraryPure <$> patchOfSize n ]
where patchOfSize 1 = oneof [ Insert <$> termOfSize 1
, Delete <$> termOfSize 1 ]
patchOfSize n = do
m <- choose (1, n - 1)
@ -59,11 +51,4 @@ instance (Eq leaf, Eq annotation, Arbitrary leaf, Arbitrary annotation) => Arbit
m <- choose (0, n)
diffOfSize m
shrink diff@(ArbitraryDiff annotated) = case annotated of
Free (annotation :< syntax) -> (subterms diff ++) $ filter (/= diff) $
(ArbitraryDiff .) . (Free .) . (:<) <$> shrink annotation <*> case syntax of
Leaf a -> Leaf <$> shrink a
Indexed i -> Indexed <$> (List.subsequences i >>= recursivelyShrink)
Fixed f -> Fixed <$> (List.subsequences f >>= recursivelyShrink)
Keyed k -> Keyed . Map.fromList <$> (List.subsequences (Map.toList k) >>= recursivelyShrink)
Pure patch -> ArbitraryDiff . Pure <$> shrink patch
shrink = genericShrink

View File

@ -1,4 +1,3 @@
{-# LANGUAGE FlexibleContexts #-}
module Diffing where
import Prologue hiding (fst, snd)

View File

@ -1,4 +1,4 @@
{-# LANGUAGE DataKinds, FlexibleContexts, GeneralizedNewtypeDeriving #-}
{-# LANGUAGE DataKinds, GeneralizedNewtypeDeriving #-}
module Info where
import Data.Record
@ -7,9 +7,9 @@ import Category
import Range
newtype Size = Size { unSize :: Integer }
deriving (Eq, Num, Show)
deriving (Eq, Num, Ord, Show)
newtype Cost = Cost { unCost :: Integer }
deriving (Eq, Num, Show)
deriving (Eq, Num, Ord, Show)
type InfoFields = '[ Range, Category, Size, Cost ]

View File

@ -1,13 +1,20 @@
module Interpreter (Comparable, DiffConstructor, diffTerms) where
import Algorithm
import Category
import Data.Align
import Data.Functor.Foldable
import Data.Functor.Both
import qualified Data.OrderedMap as Map
import qualified Data.List as List
import Data.Hashable
import Data.List ((\\))
import qualified Data.List as List
import Data.OrderedMap ((!))
import qualified Data.OrderedMap as Map
import Data.RandomWalkSimilarity
import Data.Record
import Data.These
import Diff
import Info
import Operation
import Patch
import Prologue hiding (lookup)
@ -16,17 +23,17 @@ import Syntax
import Term
-- | Returns whether two terms are comparable
type Comparable a annotation = Term a annotation -> Term a annotation -> Bool
type Comparable leaf annotation = Term leaf annotation -> Term leaf annotation -> Bool
-- | Constructs a diff from the CofreeF containing its annotation and syntax. This function has the opportunity to, for example, cache properties in the annotation.
type DiffConstructor leaf annotation = CofreeF (Syntax leaf) (Both annotation) (Diff leaf annotation) -> Diff leaf annotation
-- | Diff two terms, given a function that determines whether two terms can be compared and a cost function.
diffTerms :: (Eq a, Eq annotation) => DiffConstructor a annotation -> Comparable a annotation -> Cost (Diff a annotation) -> Term a annotation -> Term a annotation -> Diff a annotation
diffTerms :: (Eq leaf, Hashable leaf, Ord (Record fields), HasField fields Category) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Diff leaf (Record fields)
diffTerms construct comparable cost a b = fromMaybe (pure $ Replace a b) $ constructAndRun construct comparable cost a b
-- | Constructs an algorithm and runs it
constructAndRun :: (Eq a, Eq annotation) => DiffConstructor a annotation -> Comparable a annotation -> Cost (Diff a annotation) -> Term a annotation -> Term a annotation -> Maybe (Diff a annotation)
constructAndRun :: (Eq leaf, Hashable leaf, Ord (Record fields), HasField fields Category) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields))
constructAndRun _ comparable _ a b | not $ comparable a b = Nothing
constructAndRun construct _ _ a b | (() <$ a) == (() <$ b) = hylo construct runCofree <$> zipTerms a b
@ -41,21 +48,23 @@ constructAndRun construct comparable cost t1 t2 =
annotate = pure . construct . (both annotation1 annotation2 :<)
-- | Runs the diff algorithm
run :: (Eq a, Eq annotation) => DiffConstructor a annotation -> Comparable a annotation -> Cost (Diff a annotation) -> Algorithm a annotation (Diff a annotation) -> Maybe (Diff a annotation)
run :: (Eq leaf, Hashable leaf, Ord (Record fields), HasField fields Category) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Algorithm leaf (Record fields) (Diff leaf (Record fields)) -> Maybe (Diff leaf (Record fields))
run construct comparable cost algorithm = case runFree algorithm of
Pure diff -> Just diff
Free (Recursive t1 t2 f) -> run construct comparable cost . f $ recur a b where
(annotation1 :< a, annotation2 :< b) = (runCofree t1, runCofree t2)
annotate = construct . (both annotation1 annotation2 :<)
recur (Indexed a') (Indexed b') | length a' == length b' = annotate . Indexed $ zipWith (diffTerms construct comparable cost) a' b'
recur (Fixed a') (Fixed b') | length a' == length b' = annotate . Fixed $ zipWith (diffTerms construct comparable cost) a' b'
recur (Indexed a') (Indexed b') = annotate . Indexed $ alignWith diffThese a' b'
recur (Fixed a') (Fixed b') = annotate . Fixed $ alignWith diffThese a' b'
recur (Keyed a') (Keyed b') | Map.keys a' == bKeys = annotate . Keyed . Map.fromList . fmap repack $ bKeys where
bKeys = Map.keys b'
repack key = (key, interpretInBoth key a' b')
interpretInBoth key x y = diffTerms construct comparable cost (x ! key) (y ! key)
recur _ _ = pure $ Replace (cofree (annotation1 :< a)) (cofree (annotation2 :< b))
diffThese = these (pure . Delete) (pure . Insert) (diffTerms construct comparable cost)
Free (ByKey a b f) -> run construct comparable cost $ f byKey where
byKey = Map.fromList $ toKeyValue <$> List.union aKeys bKeys
toKeyValue key | key `List.elem` deleted = (key, pure . Delete $ a ! key)
@ -67,3 +76,6 @@ run construct comparable cost algorithm = case runFree algorithm of
inserted = bKeys \\ aKeys
Free (ByIndex a b f) -> run construct comparable cost . f $ ses (constructAndRun construct comparable cost) cost a b
Free (ByRandomWalkSimilarity a b f) -> run construct comparable cost . f $ rws (constructAndRun construct comparable cost) getLabel a b
where getLabel = category

View File

@ -10,11 +10,11 @@ data Operation
a -- ^ The type of leaves in the syntax tree, typically String, but possibly some datatype representing different leaves more precisely.
annotation -- ^ The type of annotations.
f -- ^ The type representing another level of the diffing algorithm. Often Algorithm.
=
-- | Recursively diff two terms and pass the result to the continuation.
Recursive (Term a annotation) (Term a annotation) (Diff a annotation -> f)
= Recursive (Term a annotation) (Term a annotation) (Diff a annotation -> f)
-- | Diff two dictionaries and pass the result to the continuation.
| ByKey (OrderedMap Text (Term a annotation)) (OrderedMap Text (Term a annotation)) (OrderedMap Text (Diff a annotation) -> f)
-- | Diff two arrays and pass the result to the continuation.
| ByIndex [Term a annotation] [Term a annotation] ([Diff a annotation] -> f)
| ByRandomWalkSimilarity [Term a annotation] [Term a annotation] ([Diff a annotation] -> f)
deriving Functor

View File

@ -1,4 +1,3 @@
{-# LANGUAGE FlexibleContexts #-}
module Parser where
import Prologue hiding (Constructor)

View File

@ -12,19 +12,19 @@ import Data.These
import Prologue
-- | An operation to replace, insert, or delete an item.
data Patch a =
Replace a a
data Patch a
= Replace a a
| Insert a
| Delete a
deriving (Eq, Foldable, Functor, Show, Traversable)
deriving (Eq, Foldable, Functor, Ord, Show, Traversable)
-- | Return the item from the after side of the patch.
after :: Patch a -> Maybe a
after = maybeFst . unPatch
after = maybeSnd . unPatch
-- | Return the item from the before side of the patch.
before :: Patch a -> Maybe a
before = maybeSnd . unPatch
before = maybeFst . unPatch
-- | Return both sides of a patch.
unPatch :: Patch a -> These a a

View File

@ -1,4 +1,3 @@
{-# LANGUAGE FlexibleContexts #-}
module SplitDiff where
import Data.Record

View File

@ -1,8 +1,10 @@
module Syntax where
import Prologue
import Data.OrderedMap
import Data.OrderedMap as Map
import Data.Text.Arbitrary ()
import qualified Data.Text as T
import Test.QuickCheck hiding (Fixed)
-- | A node in an abstract syntax tree.
data Syntax
@ -17,4 +19,27 @@ data Syntax
| Fixed [f]
-- | A branch of child nodes indexed by some String identity. This is useful for identifying e.g. methods & properties in a class scope by their names. Note that comments can generally occur in these scopes as well; one strategy for dealing with this is to identify comments by their text in the source.
| Keyed (OrderedMap T.Text f)
deriving (Functor, Show, Eq, Foldable, Traversable)
deriving (Eq, Foldable, Functor, Generic, Ord, Show, Traversable)
-- Instances
syntaxOfSize :: Arbitrary leaf => (Int -> Gen f) -> Int -> Gen (Syntax leaf f)
syntaxOfSize recur n | n <= 1 = oneof $ (Leaf <$> arbitrary) : branchGeneratorsOfSize n
| otherwise = oneof $ branchGeneratorsOfSize n
where branchGeneratorsOfSize n =
[ Indexed <$> childrenOfSize (pred n)
, Fixed <$> childrenOfSize (pred n)
, (Keyed .) . (Map.fromList .) . zip <$> infiniteListOf arbitrary <*> childrenOfSize (pred n)
]
childrenOfSize n | n <= 0 = pure []
childrenOfSize n = do
m <- choose (1, n)
first <- recur m
rest <- childrenOfSize (n - m)
pure $! first : rest
instance (Arbitrary leaf, Arbitrary f) => Arbitrary (Syntax leaf f) where
arbitrary = sized (syntaxOfSize (`resize` arbitrary) )
shrink = genericShrink

View File

@ -1,4 +1,5 @@
{-# LANGUAGE TypeFamilies, TypeSynonymInstances #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Term where
import Prologue

View File

@ -2,52 +2,36 @@
module Term.Arbitrary where
import Data.Functor.Foldable (Base, cata, unfold, Unfoldable(embed))
import qualified Data.List as List
import qualified Data.OrderedMap as Map
import Data.Text.Arbitrary ()
import Prologue
import Syntax
import Term
import Test.QuickCheck hiding (Fixed)
newtype ArbitraryTerm leaf annotation = ArbitraryTerm { unArbitraryTerm :: TermF leaf annotation (ArbitraryTerm leaf annotation) }
data ArbitraryTerm leaf annotation = ArbitraryTerm annotation (Syntax leaf (ArbitraryTerm leaf annotation))
deriving (Show, Eq, Generic)
unArbitraryTerm :: ArbitraryTerm leaf annotation -> TermF leaf annotation (ArbitraryTerm leaf annotation)
unArbitraryTerm (ArbitraryTerm a s) = a :< s
toTerm :: ArbitraryTerm leaf annotation -> Term leaf annotation
toTerm = unfold unArbitraryTerm
termOfSize :: (Arbitrary leaf, Arbitrary annotation) => Int -> Gen (ArbitraryTerm leaf annotation)
termOfSize n = (ArbitraryTerm .) . (:<) <$> arbitrary <*> syntaxOfSize n
where syntaxOfSize n | n <= 1 = oneof $ (Leaf <$> arbitrary) : branchGeneratorsOfSize n
| otherwise = oneof $ branchGeneratorsOfSize n
branchGeneratorsOfSize n =
[ Indexed <$> childrenOfSize (pred n)
, Fixed <$> childrenOfSize (pred n)
, (Keyed .) . (Map.fromList .) . zip <$> infiniteListOf arbitrary <*> childrenOfSize (pred n)
]
childrenOfSize n | n <= 0 = pure []
childrenOfSize n = do
m <- choose (1, n)
first <- termOfSize m
rest <- childrenOfSize (n - m)
pure $! first : rest
termOfSize n = ArbitraryTerm <$> arbitrary <*> syntaxOfSize termOfSize n
arbitraryTermSize :: ArbitraryTerm leaf annotation -> Int
arbitraryTermSize = cata (succ . sum) . toTerm
-- Instances
type instance Base (ArbitraryTerm leaf annotation) = CofreeF (Syntax leaf) annotation
instance Unfoldable (ArbitraryTerm leaf annotation) where embed = ArbitraryTerm
type instance Base (ArbitraryTerm leaf annotation) = TermF leaf annotation
instance Unfoldable (ArbitraryTerm leaf annotation) where embed (a :< s) = ArbitraryTerm a s
instance (Eq leaf, Eq annotation, Arbitrary leaf, Arbitrary annotation) => Arbitrary (ArbitraryTerm leaf annotation) where
arbitrary = sized $ \ n -> do
m <- choose (0, n)
termOfSize m
shrink term@(ArbitraryTerm (annotation :< syntax)) = (subterms term ++) $ filter (/= term) $
(ArbitraryTerm .) . (:<) <$> shrink annotation <*> case syntax of
Leaf a -> Leaf <$> shrink a
Indexed i -> Indexed <$> (List.subsequences i >>= recursivelyShrink)
Fixed f -> Fixed <$> (List.subsequences f >>= recursivelyShrink)
Keyed k -> Keyed . Map.fromList <$> (List.subsequences (Map.toList k) >>= recursivelyShrink)
shrink = genericShrink

View File

@ -1,7 +1,6 @@
module AlignmentSpec where
import Alignment
import ArbitraryTerm ()
import Control.Arrow ((&&&))
import Control.Monad.State
import Data.Align hiding (align)
@ -49,7 +48,7 @@ spec = parallel $ do
prop "covers every input line" $
\ elements -> let (_, children, ranges) = toAlignBranchInputs elements in
join <$> (traverse (modifyJoin (fromThese [] []) . fmap pure . fmap Prologue.fst) (alignBranch Prologue.snd children ranges)) `shouldBe` ranges
join <$> (traverse (modifyJoin (fromThese [] []) . fmap (pure . Prologue.fst)) (alignBranch Prologue.snd children ranges)) `shouldBe` ranges
prop "covers every input child" $
\ elements -> let (_, children, ranges) = toAlignBranchInputs elements in
@ -194,7 +193,7 @@ spec = parallel $ do
(info 8 9 `branch` [ info 8 9 `leaf` "c" ]))
]
describe "numberedRows" $
describe "numberedRows" $ do
prop "counts only non-empty values" $
\ xs -> counts (numberedRows (xs :: [Join These Char])) `shouldBe` length . catMaybes <$> Join (unalign (runJoin <$> xs))
@ -237,9 +236,9 @@ keysOfAlignedChildren lines = lines >>= these identity identity (++) . runJoin .
instance Arbitrary BranchElement where
arbitrary = oneof [ key >>= \ key -> Child key <$> joinTheseOf (contents key)
, Margin <$> joinTheseOf margin ]
where key = listOf1 (elements (['a'..'z'] ++ ['A'..'Z'] ++ ['0'..'9']))
where key = listOf1 (elements (['a'..'z'] <> ['A'..'Z'] <> ['0'..'9']))
contents key = wrap key <$> listOf (padding '*')
wrap key contents = "(" ++ key ++ contents ++ ")" :: String
wrap key contents = "(" <> key <> contents <> ")" :: String
margin = listOf (padding '-')
padding char = frequency [ (10, pure char)
, (1, pure '\n') ]

View File

@ -1,16 +0,0 @@
{-# OPTIONS_GHC -fno-warn-orphans #-}
module ArbitraryTerm where
import Data.Text.Arbitrary ()
import Data.These.Arbitrary ()
import Prologue hiding (fst, snd)
import Source hiding ((++))
import Test.QuickCheck hiding (Fixed)
data CategorySet = A | B | C | D deriving (Eq, Show)
instance Arbitrary CategorySet where
arbitrary = elements [ A, B, C, D ]
instance Arbitrary a => Arbitrary (Source a) where
arbitrary = Source.fromList <$> arbitrary

View File

@ -0,0 +1,39 @@
module Data.RandomWalkSimilarity.Spec where
import Category
import Data.DList as DList hiding (toList)
import Data.RandomWalkSimilarity
import qualified Data.Set as Set
import Diff
import Patch
import Prologue
import Syntax
import Term
import Term.Arbitrary
import Test.Hspec
import Test.Hspec.QuickCheck
import Test.QuickCheck
spec :: Spec
spec = parallel $ do
describe "pqGrams" $ do
prop "produces grams with stems of the specified length" . forAll (arbitrary `suchThat` (\ (_, p, q) -> p > 0 && q > 0)) $
\ (term, p, q) -> pqGrams p q identity (toTerm term :: Term Text Text) `shouldSatisfy` all ((== p) . length . stem)
prop "produces grams with bases of the specified length" . forAll (arbitrary `suchThat` (\ (_, p, q) -> p > 0 && q > 0)) $
\ (term, p, q) -> pqGrams p q identity (toTerm term :: Term Text Text) `shouldSatisfy` all ((== q) . length . base)
describe "featureVector" $ do
prop "produces a vector of the specified dimension" . forAll (arbitrary `suchThat` ((> 0) . Prologue.snd)) $
\ (grams, d) -> length (featureVector d (fromList (grams :: [Gram Text]))) `shouldBe` d
describe "rws" $ do
let compare a b = if extract a == extract b then Just (pure (Replace a b)) else Nothing
prop "produces correct diffs" . forAll (scale (`div` 4) arbitrary) $
\ (as, bs) -> let tas = toTerm <$> as
tbs = toTerm <$> bs
diff = free (Free (pure Program :< Indexed (rws compare identity tas tbs :: [Diff Text Category]))) in
(childrenOf <$> beforeTerm diff, childrenOf <$> afterTerm diff) `shouldBe` (Just (Set.fromList tas), Just (Set.fromList tbs))
childrenOf :: (Ord leaf, Ord annotation) => Term leaf annotation -> Set.Set (Term leaf annotation)
childrenOf = Set.fromList . toList . unwrap

38
test/Diff/Spec.hs Normal file
View File

@ -0,0 +1,38 @@
{-# LANGUAGE DataKinds #-}
module Diff.Spec where
import Category
import Data.Record
import Data.Text.Arbitrary ()
import Diff
import Diff.Arbitrary
import Interpreter
import Prologue
import Term.Arbitrary
import Test.Hspec
import Test.Hspec.QuickCheck
import Test.QuickCheck
spec :: Spec
spec = parallel $ do
prop "equality is reflexive" $
\ a b -> let diff = diffTerms (free . Free) (==) diffCost (toTerm a) (toTerm (b :: ArbitraryTerm Text (Record '[Category]))) in
diff `shouldBe` diff
prop "equal terms produce identity diffs" $
\ a -> let term = toTerm (a :: ArbitraryTerm Text (Record '[Category])) in
diffCost (diffTerms (free . Free) (==) diffCost term term) `shouldBe` 0
describe "beforeTerm" $ do
prop "recovers the before term" $
\ a b -> let diff = diffTerms (free . Free) (==) diffCost (toTerm a) (toTerm (b :: ArbitraryTerm Text (Record '[Category]))) in
beforeTerm diff `shouldBe` Just (toTerm a)
describe "afterTerm" $ do
prop "recovers the after term" $
\ a b -> let diff = diffTerms (free . Free) (==) diffCost (toTerm a) (toTerm (b :: ArbitraryTerm Text (Record '[Category]))) in
afterTerm diff `shouldBe` Just (toTerm b)
describe "ArbitraryDiff" $ do
prop "generates diffs of a specific size" . forAll ((arbitrary >>= \ n -> (,) n <$> diffOfSize n) `suchThat` ((> 0) . fst)) $
\ (n, diff) -> arbitraryDiffSize (diff :: ArbitraryDiff Text ()) `shouldBe` n

View File

@ -1,23 +1,30 @@
{-# LANGUAGE DataKinds #-}
module InterpreterSpec where
import Prologue
import Category
import Diff
import Data.Record
import qualified Interpreter as I
import Range
import Syntax
import Interpreter
import Patch
import Info
import Category
import Prologue
import Syntax
import Term.Arbitrary
import Test.Hspec
import Test.Hspec.QuickCheck
spec :: Spec
spec = parallel $
describe "interpret" $
spec = parallel $ do
describe "interpret" $ do
it "returns a replacement when comparing two unicode equivalent terms" $
I.diffTerms (free . Free) ((==) `on` extract) diffCost (cofree ((range .: StringLiteral .: 0 .: 0 .: RNil) :< Leaf "t\776")) (cofree ((range2 .: StringLiteral .: 0 .: 0 .: RNil) :< Leaf "\7831")) `shouldBe`
free (Pure (Replace (cofree ((range .: StringLiteral .: 0 .: 0 .: RNil) :< Leaf "t\776")) (cofree ((range2 .: StringLiteral .: 0 .: 0 .: RNil) :< Leaf "\7831"))))
let termA = cofree $ (StringLiteral .: RNil) :< Leaf ("t\776" :: Text)
termB = cofree $ (StringLiteral .: RNil) :< Leaf "\7831" in
diffTerms (free . Free) ((==) `on` extract) diffCost termA termB `shouldBe` free (Pure (Replace termA termB))
where
range = Range 0 2
range2 = Range 0 1
prop "produces correct diffs" $
\ a b -> let diff = diffTerms (free . Free) ((==) `on` extract) diffCost (toTerm a) (toTerm b) :: Diff Text (Record '[Category]) in
(beforeTerm diff, afterTerm diff) `shouldBe` (Just (toTerm a), Just (toTerm b))
prop "constructs zero-cost diffs of equal terms" $
\ a -> let term = toTerm a
diff = diffTerms (free . Free) ((==) `on` extract) diffCost term term :: Diff Text (Record '[Category]) in
diffCost diff `shouldBe` 0

View File

@ -3,19 +3,23 @@ module Main where
import Prologue
import qualified AlignmentSpec
import qualified CorpusSpec
import qualified Data.RandomWalkSimilarity.Spec
import qualified Diff.Spec
import qualified DiffSummarySpec
import qualified InterpreterSpec
import qualified OrderedMapSpec
import qualified PatchOutputSpec
import qualified TermSpec
import qualified DiffSummarySpec
import Test.Hspec
main :: IO ()
main = hspec $ parallel $ do
main = hspec . parallel $ do
describe "Alignment" AlignmentSpec.spec
describe "Corpus" CorpusSpec.spec
describe "Data.RandomWalkSimilarity" Data.RandomWalkSimilarity.Spec.spec
describe "Diff.Spec" Diff.Spec.spec
describe "DiffSummary" DiffSummarySpec.spec
describe "Interpreter" InterpreterSpec.spec
describe "OrderedMap" OrderedMapSpec.spec
describe "PatchOutput" PatchOutputSpec.spec
describe "Term" TermSpec.spec
describe "DiffSummary" DiffSummarySpec.spec

View File

@ -1,11 +1,6 @@
module TermSpec where
import ArbitraryTerm
import Data.String
import Data.Text.Arbitrary ()
import Diff
import Diff.Arbitrary
import Interpreter
import Prologue
import Term.Arbitrary
import Test.Hspec
@ -16,21 +11,8 @@ spec :: Spec
spec = parallel $ do
describe "Term" $ do
prop "equality is reflexive" $
\ a -> toTerm a == toTerm (a :: ArbitraryTerm String ())
\ a -> toTerm a `shouldBe` toTerm (a :: ArbitraryTerm Text ())
describe "ArbitraryTerm" $
prop "generates terms of a specific size" $ forAll ((arbitrary >>= \ n -> (,) n <$> termOfSize n) `suchThat` ((> 0) . fst)) $
\ (n, term) -> arbitraryTermSize (term :: ArbitraryTerm String ()) `shouldBe` n
describe "ArbitraryDiff" $
prop "generates diffs of a specific size" $ forAll ((arbitrary >>= \ n -> (,) n <$> diffOfSize n) `suchThat` ((> 0) . fst)) $
\ (n, diff) -> arbitraryDiffSize (diff :: ArbitraryDiff String ()) `shouldBe` n
describe "Diff" $ do
prop "equality is reflexive" $
\ a b -> let diff = diffTerms (free . Free) (==) diffCost (toTerm a) (toTerm (b :: ArbitraryTerm String CategorySet)) in
diff == diff
prop "equal terms produce identity diffs" $
\ a -> let term = toTerm (a :: ArbitraryTerm String CategorySet) in
diffCost (diffTerms (free . Free) (==) diffCost term term) == 0
describe "ArbitraryTerm" $ do
prop "generates terms of a specific size" . forAll ((arbitrary >>= \ n -> (,) n <$> termOfSize n) `suchThat` ((> 0) . fst)) $
\ (n, term) -> arbitraryTermSize (term :: ArbitraryTerm Text ()) `shouldBe` n