From 21cb7f71d7a9ec4336790f4d49b0d0e99ad146e8 Mon Sep 17 00:00:00 2001 From: joshvera Date: Wed, 10 Aug 2016 17:33:11 -0400 Subject: [PATCH] Merge 'origin/master' into one-json-to-rule-them-all --- semantic-diff.cabal | 5 +- src/Data/RandomWalkSimilarity.hs | 94 ++++++++++++++------------ src/Data/Record.hs | 4 ++ src/Diffing.hs | 65 +++++++++++------- src/Interpreter.hs | 26 +++---- src/Parser.hs | 6 +- src/TreeSitter.hs | 10 +-- test/CorpusSpec.hs | 25 +++---- test/Data/RandomWalkSimilarity/Spec.hs | 30 ++++---- test/Diff/Spec.hs | 15 ++-- test/DiffSummarySpec.hs | 31 ++++----- test/InterpreterSpec.hs | 19 +++--- test/PatchOutputSpec.hs | 7 +- test/diffs/dictionary.patch.js | 10 +++ 14 files changed, 193 insertions(+), 154 deletions(-) create mode 100644 test/diffs/dictionary.patch.js diff --git a/semantic-diff.cabal b/semantic-diff.cabal index f05e47239..3994797ea 100644 --- a/semantic-diff.cabal +++ b/semantic-diff.cabal @@ -118,16 +118,17 @@ test-suite semantic-diff-test , deepseq , dlist , filepath + , free , Glob , hspec >= 2.1.10 , mtl , QuickCheck >= 2.8.1 , quickcheck-text + , recursion-schemes >= 4.1 , semantic-diff , text >= 1.2.1.3 , these - , free - , recursion-schemes >= 4.1 + , vector , wl-pprint-text , protolude if os(darwin) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 6b10ffe36..1b8b613bd 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -1,15 +1,21 @@ -{-# LANGUAGE RankNTypes #-} -module Data.RandomWalkSimilarity where +{-# LANGUAGE DataKinds, GADTs, RankNTypes, TypeOperators #-} +module Data.RandomWalkSimilarity +( rws +, pqGramDecorator +, featureVectorDecorator +, Gram(..) +) where +import Control.Applicative import Control.Arrow ((&&&)) import Control.Monad.Random import Control.Monad.State -import qualified Data.DList as DList import Data.Functor.Both hiding (fst, snd) import Data.Functor.Foldable as Foldable import Data.Hashable import qualified Data.KdTree.Static as KdTree import qualified Data.List as List +import Data.Record import qualified Data.Vector as Vector import Patch import Prologue @@ -18,25 +24,23 @@ import Test.QuickCheck hiding (Fixed) import Test.QuickCheck.Random -- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf). -rws :: (Hashable label, Eq annotation, Prologue.Foldable f, Functor f, Eq (f (Cofree f annotation))) - => (Cofree f annotation -> Cofree f annotation -> Maybe (Free (CofreeF f (Both annotation)) (Patch (Cofree f annotation)))) -- ^ A function which comapres a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared. - -> (forall b. CofreeF f annotation b -> label) -- ^ A function to compute a label for an unpacked term. - -> [Cofree f annotation] -- ^ The list of old terms. - -> [Cofree f annotation] -- ^ The list of new terms. - -> [Free (CofreeF f (Both annotation)) (Patch (Cofree f annotation))] -rws compare getLabel as bs +rws :: (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double)) + => (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function which compares a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared. + -> [Cofree f (Record fields)] -- ^ The list of old terms. + -> [Cofree f (Record fields)] -- ^ The list of new terms. + -> [Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))] -- ^ The resulting list of similarity-matched diffs. +rws compare as bs | null as, null bs = [] | null as = inserting <$> bs | null bs = deleting <$> as | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, fas)) $ traverse findNearestNeighbourTo fbs - where (p, q, d) = (2, 2, 15) - fas = zipWith featurize [0..] as + where fas = zipWith featurize [0..] as fbs = zipWith featurize [0..] bs kdas = KdTree.build (Vector.toList . feature) fas - featurize index term = UnmappedTerm index (featureVector d (pqGrams p q getLabel term)) term + featurize index term = UnmappedTerm index (getField (extract term)) term findNearestNeighbourTo kv@(UnmappedTerm _ _ v) = do (previous, unmapped) <- get - let (UnmappedTerm i _ _) = KdTree.nearest kdas kv + let UnmappedTerm i _ _ = KdTree.nearest kdas kv fromMaybe (pure (negate 1, inserting v)) $ do found <- find ((== i) . termIndex) unmapped guard (i >= previous) @@ -55,38 +59,40 @@ data UnmappedTerm a = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature : data Gram label = Gram { stem :: [Maybe label], base :: [Maybe label] } deriving (Eq, Show) --- | Compute the bag of grams with stems of length _p_ and bases of length _q_, with labels computed from annotations, which summarize the entire subtree of a term. -pqGrams :: (Prologue.Foldable f, Functor f) => Int -> Int -> (forall b. CofreeF f annotation b -> label) -> Cofree f annotation -> DList.DList (Gram label) -pqGrams p q getLabel = uncurry DList.cons . cata merge . setRootBase . setRootStem . cata go - where go c = cofree (Gram [] [ Just (getLabel c) ] :< (assignParent (Just (getLabel c)) p <$> tailF c)) - merge (head :< tail) = let tail' = toList tail in (head, DList.fromList (windowed q setBases [] (fst <$> tail')) <> foldMap snd tail') - assignParent parentLabel n tree - | n > 0 = let gram :< functor = runCofree tree in cofree $ prependParent parentLabel gram :< (assignParent parentLabel (pred n) <$> functor) - | otherwise = tree - prependParent parentLabel gram = gram { stem = parentLabel : stem gram } - setBases gram siblings rest = setBase gram (siblings >>= base) : rest - setBase gram newBase = gram { base = take q (newBase <> repeat Nothing) } - setRootBase term = let (a :< f) = runCofree term in cofree (setBase a (base a) :< f) - setRootStem = foldr (\ p rest -> assignParent Nothing p . rest) identity [0..p] +-- | Annotates a term with the corresponding p,q-gram at each node. +pqGramDecorator :: Traversable f + => (forall b. CofreeF f (Record fields) b -> label) -- ^ A function computing the label from an arbitrary unpacked term. This function can use the annotation and functor’s constructor, but not any recursive values inside the functor (since they’re held parametric in 'b'). + -> Int -- ^ 'p'; the desired stem length for the grams. + -> Int -- ^ 'q'; the desired base length for the grams. + -> Cofree f (Record fields) -- ^ The term to decorate. + -> Cofree f (Record (Gram label ': fields)) -- ^ The decorated term. +pqGramDecorator getLabel p q = cata algebra + where algebra term = let label = getLabel term in + cofree ((gram label .: headF term) :< assignParentAndSiblingLabels (tailF term) label) + gram label = Gram (padToSize p []) (padToSize q (pure (Just label))) + assignParentAndSiblingLabels functor label = (`evalState` (siblingLabels functor)) (for functor (assignLabels label)) + assignLabels :: label -> Cofree f (Record (Gram label ': fields)) -> State [Maybe label] (Cofree f (Record (Gram label ': fields))) + assignLabels label a = case runCofree a of + RCons gram rest :< functor -> do + labels <- get + put (drop 1 labels) + pure $! cofree ((gram { stem = padToSize p (Just label : stem gram), base = padToSize q labels } .: rest) :< functor) + siblingLabels :: Traversable f => f (Cofree f (Record (Gram label ': fields))) -> [Maybe label] + siblingLabels = foldMap (base . rhead . extract) + padToSize n list = take n (list <> repeat empty) --- | A sliding-window fold over _n_ items of a list per iteration. -windowed :: Int -> (a -> [a] -> b -> b) -> b -> [a] -> b -windowed n f seed = para alg - where alg xs = case xs of - Cons a (as, b) -> f a (take n $ a : as) b - Nil -> seed +-- | Computes a unit vector of the specified dimension from a hash. +unitVector :: Int -> Int -> Vector.Vector Double +unitVector d hash = normalize ((`evalRand` mkQCGen hash) (sequenceA (Vector.replicate d getRandom))) + where normalize vec = fmap (/ vmagnitude vec) vec + vmagnitude = sqrtDouble . Vector.sum . fmap (** 2) - --- | Compute a vector with the specified number of dimensions, as an approximation of a bag of `Gram`s summarizing a tree. -featureVector :: Hashable label => Int -> DList.DList (Gram label) -> Vector.Vector Double -featureVector d bag = sumVectors $ unitDVector . hash <$> bag - where unitDVector hash = normalize . (`evalRand` mkQCGen hash) $ Prologue.sequence (Vector.replicate d getRandom) - normalize vec = fmap (/ vmagnitude vec) vec - sumVectors = DList.foldr (Vector.zipWith (+)) (Vector.replicate d 0) - --- | The magnitude of a Euclidean vector, i.e. its distance from the origin. -vmagnitude :: Vector.Vector Double -> Double -vmagnitude = sqrtDouble . Vector.sum . fmap (** 2) +-- | Annotates a term with a feature vector at each node. +featureVectorDecorator :: (Hashable label, Traversable f) => (forall b. CofreeF f (Record fields) b -> label) -> Int -> Int -> Int -> Cofree f (Record fields) -> Cofree f (Record (Vector.Vector Double ': fields)) +featureVectorDecorator getLabel p q d + = cata (\ (RCons gram rest :< functor) -> + cofree ((foldr (Vector.zipWith (+) . getField . extract) (unitVector d (hash gram)) functor .: rest) :< functor)) + . pqGramDecorator getLabel p q -- Instances diff --git a/src/Data/Record.hs b/src/Data/Record.hs index 70646facc..a2c740b4c 100644 --- a/src/Data/Record.hs +++ b/src/Data/Record.hs @@ -18,6 +18,10 @@ infixr 0 .: (.:) :: h -> Record t -> Record (h ': t) (.:) = RCons +-- | Get the first element of a non-empty record. +rhead :: Record (head ': tail) -> head +rhead (RCons head _) = head + -- Classes diff --git a/src/Diffing.hs b/src/Diffing.hs index e98669896..522f7b293 100644 --- a/src/Diffing.hs +++ b/src/Diffing.hs @@ -1,15 +1,18 @@ -{-# LANGUAGE DataKinds #-} +{-# LANGUAGE DataKinds, RankNTypes, TypeOperators #-} module Diffing where +import qualified Prologue import Prologue hiding (fst, snd) import qualified Data.ByteString.Char8 as B1 import Data.Functor.Both import Data.Functor.Foldable +import Data.RandomWalkSimilarity import Data.Record import qualified Data.Text.IO as TextIO import qualified Data.Text.ICU.Detect as Detect import qualified Data.Text.ICU.Convert as Convert import Data.These +import qualified Data.Vector as Vector import Diff import Info import Interpreter @@ -38,18 +41,16 @@ import Data.Aeson.Encoding (encodingToLazyByteString) -- | result. -- | Returns the rendered result strictly, so it's always fully evaluated -- | with respect to other IO actions. -diffFiles :: (HasField fields Category, HasField fields Cost, HasField fields Range, Eq (Record fields)) => Parser fields -> Renderer (Record fields) -> Both SourceBlob -> IO Output +diffFiles :: (HasField fields Category, HasField fields Cost, HasField fields Range, Eq (Record fields)) => Parser (Syntax Text) (Record fields) -> Renderer (Record (Vector.Vector Double ': fields)) -> Both SourceBlob -> IO Output diffFiles parser renderer sourceBlobs = do - let sources = source <$> sourceBlobs - terms <- sequence $ parser <$> sourceBlobs + terms <- traverse (fmap (featureVectorDecorator getLabel p q d) . parser) sourceBlobs - let replaceLeaves = breakDownLeavesByWord <$> sources let areNullOids = runBothWith (\a b -> (oid a == nullOid || null (source a), oid b == nullOid || null (source b))) sourceBlobs let textDiff = case areNullOids of (True, False) -> pure $ Insert (snd terms) (False, True) -> pure $ Delete (fst terms) (_, _) -> - runBothWith (diffTerms construct compareCategoryEq diffCostWithCachedTermCosts) (replaceLeaves <*> terms) + runBothWith (diffTerms construct compareCategoryEq diffCostWithCachedTermCosts) terms pure $! renderer sourceBlobs textDiff @@ -58,10 +59,13 @@ diffFiles parser renderer sourceBlobs = do getCost diff = case runFree diff of Free (info :< _) -> cost <$> info Pure patch -> uncurry both (fromThese 0 0 (unPatch (cost . extract <$> patch))) - + getLabel (h :< t) = (category h, case t of + Leaf s -> Just s + _ -> Nothing) + (p, q, d) = (2, 2, 15) -- | Return a parser based on the file extension (including the "."). -parserForType :: Text -> Parser '[Range, Category, Cost] +parserForType :: Text -> Parser (Syntax Text) (Record '[Range, Category]) parserForType mediaType = case languageForType mediaType of Just C -> treeSitterParser C ts_language_c Just JavaScript -> treeSitterParser JavaScript ts_language_javascript @@ -69,28 +73,29 @@ parserForType mediaType = case languageForType mediaType of _ -> lineByLineParser -- | A fallback parser that treats a file simply as rows of strings. -lineByLineParser :: Parser '[Range, Category, Cost] +lineByLineParser :: Parser (Syntax Text) (Record '[Range, Category]) lineByLineParser blob = pure . cofree . root $ case foldl' annotateLeaves ([], 0) lines of (leaves, _) -> cofree <$> leaves where input = source blob lines = actualLines input - root children = let cost = 1 + fromIntegral (length children) in - (Range 0 (length input) .: Other "program" .: cost .: RNil) :< Indexed children - leaf charIndex line = (Range charIndex (charIndex + T.length line) .: Other "program" .: 1 .: RNil) :< Leaf line + root children = ((Range 0 $ length input) .: Program .: RNil) :< Indexed children + leaf charIndex line = ((Range charIndex $ charIndex + T.length line) .: Program .: RNil) :< Leaf line annotateLeaves (accum, charIndex) line = (accum <> [ leaf charIndex (toText line) ] , charIndex + length line) toText = T.pack . Source.toString -- | Return the parser that should be used for a given path. -parserForFilepath :: FilePath -> Parser '[Range, Category, Cost] -parserForFilepath = parserForType . toS . takeExtension +parserForFilepath :: FilePath -> Parser (Syntax Text) (Record '[Cost, Range, Category]) +parserForFilepath path blob = decorateTerm termCostDecorator <$> do + parsed <- parserForType (toS (takeExtension path)) blob + pure $! breakDownLeavesByWord (source blob) parsed -- | Replace every string leaf with leaves of the words in the string. -breakDownLeavesByWord :: (HasField fields Category, HasField fields Cost, HasField fields Range) => Source Char -> Term Text (Record fields) -> Term Text (Record fields) +breakDownLeavesByWord :: (HasField fields Category, HasField fields Range) => Source Char -> Term Text (Record fields) -> Term Text (Record fields) breakDownLeavesByWord source = cata replaceIn where - replaceIn (info :< syntax) = let cost' = 1 + sum (cost . extract <$> syntax') in cofree $ setCost info cost' :< syntax' + replaceIn (info :< syntax) = cofree $ info :< syntax' where syntax' = case (ranges, syntax) of (_:_:_, Leaf _) | category info /= Regex -> Indexed (makeLeaf info <$> ranges) _ -> syntax @@ -111,6 +116,18 @@ readAndTranscodeFile path = do text <- B1.readFile path transcode text +-- | A function computing a value to decorate terms with. This can be used to cache synthesized attributes on terms. +type TermDecorator f fields field = CofreeF f (Record fields) (Record (field ': fields)) -> field + +-- | Decorate a 'Term' using a function to compute the annotation values at every node. +decorateTerm :: Functor f => TermDecorator f fields field -> Cofree f (Record fields) -> Cofree f (Record (field ': fields)) +decorateTerm decorator = cata $ \ c -> cofree ((decorator (extract <$> c) .: headF c) :< tailF c) + +-- | Term decorator computing the cost of an unpacked term. +termCostDecorator :: (Prologue.Foldable f, Functor f) => TermDecorator f a Cost +termCostDecorator c = 1 + sum (cost <$> tailF c) + +-- | Determine whether two terms are comparable based on the equality of their categories. compareCategoryEq :: HasField fields Category => Term leaf (Record fields) -> Term leaf (Record fields) -> Bool compareCategoryEq = (==) `on` category . extract @@ -121,14 +138,12 @@ diffCostWithCachedTermCosts diff = unCost $ case runFree diff of Pure patch -> sum (cost . extract <$> patch) -- | Returns a rendered diff given a parser, diff arguments and two source blobs. -textDiff :: (Eq (Record fields), HasField fields Category, HasField fields Cost, HasField fields Range) => Parser fields -> DiffArguments -> Both SourceBlob -> IO Output -textDiff parser arguments = diffFiles parser renderer - where - renderer = case format arguments of - Split -> split - Patch -> patch - JSON -> json - Summary -> summary +textDiff :: (Eq (Record fields), HasField fields Category, HasField fields Cost, HasField fields Range) => Parser (Syntax Text) (Record fields) -> DiffArguments -> Both SourceBlob -> IO Output +textDiff parser arguments = diffFiles parser $ case format arguments of + Split -> split + Patch -> patch + JSON -> json + Summary -> summary -- | Returns a truncated diff given diff arguments and two source blobs. truncatedDiff :: DiffArguments -> Both SourceBlob -> IO Output @@ -139,7 +154,7 @@ truncatedDiff arguments sources = pure $ case format arguments of Summary -> SummaryOutput mempty -- | Prints a rendered diff to stdio or a filepath given a parser, diff arguments and two source blobs. -printDiff :: (Eq (Record fields), HasField fields Category, HasField fields Cost, HasField fields Range) => Parser fields -> DiffArguments -> Both SourceBlob -> IO () +printDiff :: (Eq (Record fields), HasField fields Category, HasField fields Cost, HasField fields Range) => Parser (Syntax Text) (Record fields) -> DiffArguments -> Both SourceBlob -> IO () printDiff parser arguments sources = do rendered <- textDiff parser arguments sources let renderedText = case rendered of diff --git a/src/Interpreter.hs b/src/Interpreter.hs index 3c15317c3..e2ff84091 100644 --- a/src/Interpreter.hs +++ b/src/Interpreter.hs @@ -5,10 +5,10 @@ import Algorithm import Data.Align.Generic import Data.Functor.Foldable import Data.Functor.Both -import Data.Hashable import Data.RandomWalkSimilarity import Data.Record import Data.These +import qualified Data.Vector as Vector import Diff import qualified Control.Monad.Free.Church as F import Info @@ -25,7 +25,7 @@ type Comparable leaf annotation = Term leaf annotation -> Term leaf annotation - type DiffConstructor leaf annotation = CofreeF (Syntax leaf) (Both annotation) (Diff leaf annotation) -> Diff leaf annotation -- | Diff two terms recursively, given functions characterizing the diffing. -diffTerms :: (Eq leaf, Hashable leaf, Eq (Record fields), HasField fields Category) +diffTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double)) => DiffConstructor leaf (Record fields) -- ^ A function to wrap up & possibly annotate every produced diff. -> Comparable leaf (Record fields) -- ^ A function to determine whether or not two terms should even be compared. -> SES.Cost (Diff leaf (Record fields)) -- ^ A function to compute the cost of a given diff node. @@ -35,15 +35,12 @@ diffTerms :: (Eq leaf, Hashable leaf, Eq (Record fields), HasField fields Catego diffTerms construct comparable cost a b = fromMaybe (replacing a b) $ diffComparableTerms construct comparable cost a b -- | Diff two terms recursively, given functions characterizing the diffing. If the terms are incomparable, returns 'Nothing'. -diffComparableTerms :: (Eq leaf, Hashable leaf, Eq (Record fields), HasField fields Category) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields)) +diffComparableTerms :: (Eq leaf, Eq (Record fields), HasField fields Category, HasField fields (Vector.Vector Double)) => DiffConstructor leaf (Record fields) -> Comparable leaf (Record fields) -> SES.Cost (Diff leaf (Record fields)) -> Term leaf (Record fields) -> Term leaf (Record fields) -> Maybe (Diff leaf (Record fields)) diffComparableTerms construct comparable cost = recur where recur a b | (category <$> a) == (category <$> b) = hylo construct runCofree <$> zipTerms a b - | comparable a b = runAlgorithm construct recur cost getLabel (Just <$> algorithmWithTerms construct a b) + | comparable a b = runAlgorithm construct recur cost (Just <$> algorithmWithTerms construct a b) | otherwise = Nothing - getLabel (h :< t) = (category h, case t of - Leaf s -> Just s - _ -> Nothing) -- | Construct an algorithm to diff a pair of terms. algorithmWithTerms :: (TermF leaf (Both a) diff -> diff) -> Term leaf a -> Term leaf a -> Algorithm (Term leaf a) diff diff @@ -74,16 +71,15 @@ algorithmWithTerms construct t1 t2 = case (unwrap t1, unwrap t2) of byIndex constructor a b = Algorithm.byIndex a b >>= annotate . constructor -- | Run an algorithm, given functions characterizing the evaluation. -runAlgorithm :: (Functor f, GAlign f, Eq a, Eq annotation, Eq (f (Cofree f annotation)), Prologue.Foldable f, Traversable f, Hashable label) - => (CofreeF f (Both annotation) (Free (CofreeF f (Both annotation)) (Patch (Cofree f annotation))) -> Free (CofreeF f (Both annotation)) (Patch (Cofree f annotation))) -- ^ A function to wrap up & possibly annotate every produced diff. - -> (Cofree f annotation -> Cofree f annotation -> Maybe (Free (CofreeF f (Both annotation)) (Patch (Cofree f annotation)))) -- ^ A function to diff two subterms recursively, if they are comparable, or else return 'Nothing'. - -> SES.Cost (Free (CofreeF f (Both annotation)) (Patch (Cofree f annotation))) -- ^ A function to compute the cost of a given diff node. - -> (forall b. CofreeF f annotation b -> label) -- ^ A function to compute a label for a given term. - -> Algorithm (Cofree f annotation) (Free (CofreeF f (Both annotation)) (Patch (Cofree f annotation))) a -- ^ The algorithm to run. +runAlgorithm :: (Functor f, GAlign f, Eq a, Eq (Record fields), Eq (f (Cofree f (Record fields))), Prologue.Foldable f, Traversable f, HasField fields (Vector.Vector Double)) + => (CofreeF f (Both (Record fields)) (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -> Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to wrap up & possibly annotate every produced diff. + -> (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function to diff two subterms recursively, if they are comparable, or else return 'Nothing'. + -> SES.Cost (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) -- ^ A function to compute the cost of a given diff node. + -> Algorithm (Cofree f (Record fields)) (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) a -- ^ The algorithm to run. -> a -runAlgorithm construct recur cost getLabel = F.iter $ \case +runAlgorithm construct recur cost = F.iter $ \case Recursive a b f -> f (maybe (replacing a b) (construct . (both (extract a) (extract b) :<)) $ do aligned <- galign (unwrap a) (unwrap b) traverse (these (Just . deleting) (Just . inserting) recur) aligned) ByIndex as bs f -> f (ses recur cost as bs) - BySimilarity as bs f -> f (rws recur getLabel as bs) + BySimilarity as bs f -> f (rws recur as bs) diff --git a/src/Parser.hs b/src/Parser.hs index 875a9cc30..3f5014193 100644 --- a/src/Parser.hs +++ b/src/Parser.hs @@ -12,10 +12,10 @@ import qualified Data.Set as Set import Source import SourceSpan --- | A function that takes a source file and returns an annotated AST. +-- | A function that takes a source blob and returns an annotated AST. -- | The return is in the IO monad because some of the parsers are written in C -- | and aren't pure. -type Parser fields = SourceBlob -> IO (Term Text (Record fields)) +type Parser f a = SourceBlob -> IO (Cofree f a) -- | Whether a category is an Operator Category isOperator :: Category -> Bool @@ -141,4 +141,4 @@ methodDefinitions :: HasField fields Category => Term Text (Record fields) -> [T methodDefinitions definitions | Other "class_body" == category (extract definitions), S.Indexed definitions' <- unwrap definitions = definitions' -methodDefinitions _ = mempty \ No newline at end of file +methodDefinitions _ = mempty diff --git a/src/TreeSitter.hs b/src/TreeSitter.hs index 91cb4540c..752be44e1 100644 --- a/src/TreeSitter.hs +++ b/src/TreeSitter.hs @@ -4,10 +4,11 @@ module TreeSitter where import Prologue hiding (Constructor) import Data.Record import Category -import Info import Language import Parser +import Range import Source +import qualified Syntax import Foreign import Foreign.C.String import Text.Parser.TreeSitter hiding (Language(..)) @@ -15,7 +16,7 @@ import qualified Text.Parser.TreeSitter as TS import SourceSpan -- | Returns a TreeSitter parser for the given language and TreeSitter grammar. -treeSitterParser :: Language -> Ptr TS.Language -> Parser '[Range, Category, Cost] +treeSitterParser :: Language -> Ptr TS.Language -> Parser (Syntax.Syntax Text) (Record '[Range, Category]) treeSitterParser language grammar blob = do document <- ts_document_make ts_document_set_language document grammar @@ -94,7 +95,7 @@ defaultCategoryForNodeName name = case name of _ -> Other name -- | Return a parser for a tree sitter language & document. -documentToTerm :: Language -> Ptr Document -> Parser '[Range, Category, Cost] +documentToTerm :: Language -> Ptr Document -> Parser (Syntax.Syntax Text) (Record '[Range, Category]) documentToTerm language document blob = alloca $ \ root -> do ts_document_root_node_p document root toTerm root @@ -110,8 +111,7 @@ documentToTerm language document blob = alloca $ \ root -> do , spanStart = SourcePos (fromIntegral $ ts_node_p_start_point_row node) (fromIntegral $ ts_node_p_start_point_column node) , spanEnd = SourcePos (fromIntegral $ ts_node_p_end_point_row node) (fromIntegral $ ts_node_p_end_point_column node) } - let cost' = 1 + sum (cost . extract <$> children) - let info = range .: (categoriesForLanguage language (toS name)) .: cost' .: RNil + let info = range .: (categoriesForLanguage language (toS name)) .: RNil pure $! termConstructor (source blob) sourceSpan info children getChild node n out = do _ <- ts_node_p_named_child node n out diff --git a/test/CorpusSpec.hs b/test/CorpusSpec.hs index 0cc042a5f..9275b5696 100644 --- a/test/CorpusSpec.hs +++ b/test/CorpusSpec.hs @@ -1,31 +1,31 @@ {-# LANGUAGE DataKinds, FlexibleContexts, GeneralizedNewtypeDeriving #-} module CorpusSpec where -import Unsafe (unsafeFromJust) -import Diffing -import Renderer -import qualified Renderer.JSON as J -import qualified Renderer.Patch as P -import qualified Renderer.Split as Split - import Category import Control.DeepSeq import Data.Functor.Both -import Data.Record import Data.List (union) +import Data.Record import qualified Data.Text as T +import qualified Data.Vector as Vector +import Diffing +import GHC.Show (Show(..)) import Info import Prologue hiding (fst, snd, lookup) +import Renderer +import qualified Renderer.JSON as J +import qualified Renderer.Patch as P +import qualified Renderer.Split as Split import qualified Source as S import System.FilePath import System.FilePath.Glob import Test.Hspec -import GHC.Show (Show(..)) +import Unsafe (unsafeFromJust) spec :: Spec spec = parallel $ do - describe "crashers crash" $ runTestsIn "test/crashers-todo/" (\ a b -> - a `deepseq` pure (a == b) `shouldThrow` anyException) + describe "crashers crash" . runTestsIn "test/crashers-todo/" $ \ a b -> + a `deepseq` pure (a == b) `shouldThrow` anyException describe "crashers should not crash" $ runTestsIn "test/crashers/" shouldBe describe "todos are incorrect" $ runTestsIn "test/diffs-todo/" shouldNotBe describe "should produce the correct diff" $ runTestsIn "test/diffs/" shouldBe @@ -46,6 +46,7 @@ spec = parallel $ do correctTests paths = filter (\(_, _, _, output) -> isJust output) $ testsForPaths paths testsForPaths (aPath, bPath, json, patch, split) = [ ("json", J.json, paths, json), ("patch", P.patch, paths, patch), ("split", Split.split, paths, split) ] where paths = both aPath bPath + -- | Return all the examples from the given directory. Examples are expected to -- | have the form "foo.A.js", "foo.B.js", "foo.patch.js". Diffs are not -- | required as the test may be verifying that the inputs don't crash. @@ -73,7 +74,7 @@ normalizeName path = addExtension (dropExtension $ dropExtension path) (takeExte -- | Given file paths for A, B, and, optionally, a diff, return whether diffing -- | the files will produce the diff. If no diff is provided, then the result -- | is true, but the diff will still be calculated. -testDiff :: Renderer (Record '[Range, Category, Cost]) -> Both (Maybe FilePath) -> Maybe FilePath -> (Maybe Verbatim -> Maybe Verbatim -> Expectation) -> Expectation +testDiff :: Renderer (Record '[Vector.Vector Double, Cost, Range, Category]) -> Both (Maybe FilePath) -> Maybe FilePath -> (Maybe Verbatim -> Maybe Verbatim -> Expectation) -> Expectation testDiff renderer paths diff matcher = do sources <- traverse (traverse readAndTranscodeFile) paths actual <- fmap Verbatim <$> traverse ((pure . concatOutputs . pure) <=< diffFiles' sources) parser diff --git a/test/Data/RandomWalkSimilarity/Spec.hs b/test/Data/RandomWalkSimilarity/Spec.hs index 97074107b..034e04c5b 100644 --- a/test/Data/RandomWalkSimilarity/Spec.hs +++ b/test/Data/RandomWalkSimilarity/Spec.hs @@ -1,9 +1,10 @@ +{-# LANGUAGE DataKinds #-} module Data.RandomWalkSimilarity.Spec where -import Category -import Data.DList as DList hiding (toList) import Data.RandomWalkSimilarity +import Data.Record import Diff +import Info import Patch import Prologue import Syntax @@ -15,21 +16,22 @@ import Test.QuickCheck spec :: Spec spec = parallel $ do - describe "pqGrams" $ do - prop "produces grams with stems of the specified length" . forAll (arbitrary `suchThat` (\ (_, p, q) -> p > 0 && q > 0)) $ - \ (term, p, q) -> pqGrams p q headF (toTerm term :: Term Text Text) `shouldSatisfy` all ((== p) . length . stem) + let positively = succ . abs + describe "pqGramDecorator" $ do + prop "produces grams with stems of the specified length" $ + \ (term, p, q) -> pqGramDecorator (rhead . headF) (positively p) (positively q) (toTerm term :: Term Text (Record '[Text])) `shouldSatisfy` all ((== (positively p)) . length . stem . rhead) - prop "produces grams with bases of the specified length" . forAll (arbitrary `suchThat` (\ (_, p, q) -> p > 0 && q > 0)) $ - \ (term, p, q) -> pqGrams p q headF (toTerm term :: Term Text Text) `shouldSatisfy` all ((== q) . length . base) + prop "produces grams with bases of the specified width" $ + \ (term, p, q) -> pqGramDecorator (rhead . headF) (positively p) (positively q) (toTerm term :: Term Text (Record '[Text])) `shouldSatisfy` all ((== (positively q)) . length . base . rhead) - describe "featureVector" $ do - prop "produces a vector of the specified dimension" . forAll (arbitrary `suchThat` ((> 0) . Prologue.snd)) $ - \ (grams, d) -> length (featureVector d (fromList (grams :: [Gram Text]))) `shouldBe` d + describe "featureVectorDecorator" $ do + prop "produces a vector of the specified dimension" $ + \ (term, p, q, d) -> featureVectorDecorator (rhead . headF) (positively p) (positively q) (positively d) (toTerm term :: Term Text (Record '[Text])) `shouldSatisfy` all ((== (positively d)) . length . rhead) describe "rws" $ do let compare a b = if extract a == extract b then Just (pure (Replace a b)) else Nothing prop "produces correct diffs" . forAll (scale (`div` 4) arbitrary) $ - \ (as, bs) -> let tas = toTerm <$> as - tbs = toTerm <$> bs - diff = free (Free (pure Program :< Indexed (rws compare headF tas tbs :: [Diff Text Category]))) in - (beforeTerm diff, afterTerm diff) `shouldBe` (Just (cofree (Program :< Indexed tas)), Just (cofree (Program :< Indexed tbs))) + \ (as, bs) -> let tas = featureVectorDecorator (category . headF) 2 2 15 . toTerm <$> (as :: [ArbitraryTerm Text (Record '[Category])]) + tbs = featureVectorDecorator (category . headF) 2 2 15 . toTerm <$> (bs :: [ArbitraryTerm Text (Record '[Category])]) + diff = free (Free (pure (pure 0 .: Program .: RNil) :< Indexed (rws compare tas tbs))) in + (beforeTerm diff, afterTerm diff) `shouldBe` (Just (cofree ((pure 0 .: Program .: RNil) :< Indexed tas)), Just (cofree ((pure 0 .: Program .: RNil) :< Indexed tbs))) diff --git a/test/Diff/Spec.hs b/test/Diff/Spec.hs index 6673b7ba5..91857b665 100644 --- a/test/Diff/Spec.hs +++ b/test/Diff/Spec.hs @@ -2,10 +2,12 @@ module Diff.Spec where import Category +import Data.RandomWalkSimilarity import Data.Record import Data.Text.Arbitrary () import Diff import Diff.Arbitrary +import Info import Interpreter import Prologue import Term.Arbitrary @@ -15,23 +17,24 @@ import Test.QuickCheck spec :: Spec spec = parallel $ do + let toTerm' = featureVectorDecorator (category . headF) 2 2 15 . toTerm prop "equality is reflexive" $ - \ a b -> let diff = diffTerms wrap (==) diffCost (toTerm a) (toTerm (b :: ArbitraryTerm Text (Record '[Category]))) in + \ a b -> let diff = diffTerms wrap (==) diffCost (toTerm' a) (toTerm' (b :: ArbitraryTerm Text (Record '[Category]))) in diff `shouldBe` diff prop "equal terms produce identity diffs" $ - \ a -> let term = toTerm (a :: ArbitraryTerm Text (Record '[Category])) in + \ a -> let term = toTerm' (a :: ArbitraryTerm Text (Record '[Category])) in diffCost (diffTerms wrap (==) diffCost term term) `shouldBe` 0 describe "beforeTerm" $ do prop "recovers the before term" $ - \ a b -> let diff = diffTerms wrap (==) diffCost (toTerm a) (toTerm (b :: ArbitraryTerm Text (Record '[Category]))) in - beforeTerm diff `shouldBe` Just (toTerm a) + \ a b -> let diff = diffTerms wrap (==) diffCost (toTerm' a) (toTerm' (b :: ArbitraryTerm Text (Record '[Category]))) in + beforeTerm diff `shouldBe` Just (toTerm' a) describe "afterTerm" $ do prop "recovers the after term" $ - \ a b -> let diff = diffTerms wrap (==) diffCost (toTerm a) (toTerm (b :: ArbitraryTerm Text (Record '[Category]))) in - afterTerm diff `shouldBe` Just (toTerm b) + \ a b -> let diff = diffTerms wrap (==) diffCost (toTerm' a) (toTerm' (b :: ArbitraryTerm Text (Record '[Category]))) in + afterTerm diff `shouldBe` Just (toTerm' b) describe "ArbitraryDiff" $ do prop "generates diffs of a specific size" . forAll ((arbitrary >>= \ n -> (,) n <$> diffOfSize n) `suchThat` ((> 0) . fst)) $ diff --git a/test/DiffSummarySpec.hs b/test/DiffSummarySpec.hs index e3f0710e3..93d69491c 100644 --- a/test/DiffSummarySpec.hs +++ b/test/DiffSummarySpec.hs @@ -1,25 +1,24 @@ {-# LANGUAGE DataKinds #-} module DiffSummarySpec where -import Prologue +import Category +import Data.Functor.Both +import Data.List (partition) +import Data.RandomWalkSimilarity import Data.Record -import Test.Hspec -import Test.Hspec.QuickCheck import Diff +import Diff.Arbitrary +import DiffSummary +import Info +import Interpreter +import Patch +import Prologue +import Source import Syntax import Term -import Patch -import Category -import DiffSummary -import Text.PrettyPrint.Leijen.Text (pretty) -import Test.Hspec.QuickCheck -import Diff.Arbitrary -import Data.List (partition) import Term.Arbitrary -import Interpreter -import Info -import Source -import Data.Functor.Both +import Test.Hspec +import Test.Hspec.QuickCheck arrayInfo :: Record '[Category, Range] arrayInfo = ArrayLiteral .: Range 0 3 .: RNil @@ -46,7 +45,7 @@ spec = parallel $ do diffSummaries sources testDiff `shouldBe` [ DiffSummary { patch = Insert (LeafInfo "string" "a"), parentAnnotation = Nothing } ] prop "equal terms produce identity diffs" $ - \ a -> let term = toTerm (a :: ArbitraryTerm Text (Record '[Category, Range])) in + \ a -> let term = featureVectorDecorator (category . headF) 2 2 15 (toTerm (a :: ArbitraryTerm Text (Record '[Category, Range]))) in diffSummaries sources (diffTerms wrap (==) diffCost term term) `shouldBe` [] describe "annotatedSummaries" $ do @@ -99,7 +98,7 @@ isIndexedOrFixed' syntax = case syntax of isBranchInfo :: DiffInfo -> Bool isBranchInfo info = case info of (BranchInfo _ _ _) -> True - (LeafInfo _ _) -> False + _ -> False isBranchNode :: Patch DiffInfo -> Bool isBranchNode = any isBranchInfo diff --git a/test/InterpreterSpec.hs b/test/InterpreterSpec.hs index 0b62d9790..ca8785585 100644 --- a/test/InterpreterSpec.hs +++ b/test/InterpreterSpec.hs @@ -2,8 +2,10 @@ module InterpreterSpec where import Category -import Diff +import Data.RandomWalkSimilarity import Data.Record +import Diff +import Info import Interpreter import Patch import Prologue @@ -15,16 +17,17 @@ import Test.Hspec.QuickCheck spec :: Spec spec = parallel $ do describe "interpret" $ do + let decorate = featureVectorDecorator (category . headF) 2 2 15 it "returns a replacement when comparing two unicode equivalent terms" $ - let termA = cofree $ (StringLiteral .: RNil) :< Leaf ("t\776" :: Text) - termB = cofree $ (StringLiteral .: RNil) :< Leaf "\7831" in - diffTerms (free . Free) ((==) `on` extract) diffCost termA termB `shouldBe` free (Pure (Replace termA termB)) + let termA = decorate . cofree $ (StringLiteral .: RNil) :< Leaf ("t\776" :: Text) + termB = decorate . cofree $ (StringLiteral .: RNil) :< Leaf "\7831" in + diffTerms wrap ((==) `on` extract) diffCost termA termB `shouldBe` free (Pure (Replace termA termB)) prop "produces correct diffs" $ - \ a b -> let diff = diffTerms (free . Free) ((==) `on` extract) diffCost (toTerm a) (toTerm b) :: Diff Text (Record '[Category]) in - (beforeTerm diff, afterTerm diff) `shouldBe` (Just (toTerm a), Just (toTerm b)) + \ a b -> let diff = diffTerms wrap ((==) `on` extract) diffCost (decorate (toTerm a)) (decorate (toTerm (b :: ArbitraryTerm Text (Record '[Category])))) in + (beforeTerm diff, afterTerm diff) `shouldBe` (Just (decorate (toTerm a)), Just (decorate (toTerm b))) prop "constructs zero-cost diffs of equal terms" $ - \ a -> let term = toTerm a - diff = diffTerms (free . Free) ((==) `on` extract) diffCost term term :: Diff Text (Record '[Category]) in + \ a -> let term = decorate (toTerm (a :: ArbitraryTerm Text (Record '[Category]))) + diff = diffTerms wrap ((==) `on` extract) diffCost term term in diffCost diff `shouldBe` 0 diff --git a/test/PatchOutputSpec.hs b/test/PatchOutputSpec.hs index 429d18e70..b03eddf92 100644 --- a/test/PatchOutputSpec.hs +++ b/test/PatchOutputSpec.hs @@ -7,11 +7,10 @@ import Range import Renderer.Patch import Source import Syntax -import Category import Test.Hspec spec :: Spec -spec = parallel $ - describe "hunks" $ +spec = parallel $ do + describe "hunks" $ do it "empty diffs have empty hunks" $ - hunks (free . Free $ pure (Range 0 0 .: StringLiteral .: 1 .: 0 .: RNil) :< Leaf "") (both (SourceBlob (fromList "") "abcde" "path2.txt" (Just defaultPlainBlob)) (SourceBlob (fromList "") "xyz" "path2.txt" (Just defaultPlainBlob))) `shouldBe` [Hunk {offset = pure 0, changes = [], trailingContext = []}] + hunks (wrap $ pure (Range 0 0 .: RNil) :< Leaf ("" :: Text)) (both (SourceBlob (fromList "") "abcde" "path2.txt" (Just defaultPlainBlob)) (SourceBlob (fromList "") "xyz" "path2.txt" (Just defaultPlainBlob))) `shouldBe` [Hunk {offset = pure 0, changes = [], trailingContext = []}] diff --git a/test/diffs/dictionary.patch.js b/test/diffs/dictionary.patch.js new file mode 100644 index 000000000..49351e5ae --- /dev/null +++ b/test/diffs/dictionary.patch.js @@ -0,0 +1,10 @@ +diff --git a/test/diffs/dictionary.A.js b/test/diffs/dictionary.B.js +index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 100644 +--- a/test/diffs/dictionary.A.js ++++ b/test/diffs/dictionary.B.js +@@ -1,5 +1,5 @@ + { +- "b": 4, ++ "b": 5, + "a": 5 + }