From 7fd0b52a9179955357b1d0b33a1cc6932b34940c Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Tue, 16 Aug 2016 13:08:23 -0500 Subject: [PATCH 01/79] Add Comment constructor to Category Previously all comments were being caught by the Other constructor. Now that we specifically handle Category Syntax, it seems appropriate to promote Comment as a first class Category, too. --- src/Category.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Category.hs b/src/Category.hs index b82d99d42..c20175fb3 100644 --- a/src/Category.hs +++ b/src/Category.hs @@ -92,6 +92,8 @@ data Category | Class -- | A class method declaration. | Method + -- | A comment. + | Comment -- | A non-standard category, which can be used for comparability. | Other Text deriving (Eq, Generic, Ord, Show) From 60d10c483e82651f6e47099e9ea0d52bc8f31f2c Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Tue, 16 Aug 2016 13:08:55 -0500 Subject: [PATCH 02/79] Disambiguate between Syntax Comment and Category Comment --- src/Alignment.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Alignment.hs b/src/Alignment.hs index 00866ea3a..1513ec0d2 100644 --- a/src/Alignment.hs +++ b/src/Alignment.hs @@ -63,7 +63,7 @@ alignPatch sources patch = case patch of alignSyntax :: (Applicative f, HasField fields Range) => (forall a. f a -> Join These a) -> (CofreeF (Syntax leaf) (Record fields) term -> term) -> (term -> Range) -> f (Source Char) -> CofreeF (Syntax leaf) (f (Record fields)) [Join These term] -> [Join These term] alignSyntax toJoinThese toNode getRange sources (infos :< syntax) = catMaybes $ case syntax of Leaf s -> wrapInBranch (const (Leaf s)) <$> alignBranch getRange [] bothRanges - Comment a -> wrapInBranch (const (Comment a)) <$> alignBranch getRange [] bothRanges + Syntax.Comment a -> wrapInBranch (const (Syntax.Comment a)) <$> alignBranch getRange [] bothRanges Fixed children -> wrapInBranch Fixed <$> alignBranch getRange (join children) bothRanges _ -> wrapInBranch Indexed <$> alignBranch getRange (join (toList syntax)) bothRanges where bothRanges = modifyJoin (fromThese [] []) lineRanges From e9240b5b6e22421c156c97b6296b66a7c97d7f76 Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Tue, 16 Aug 2016 13:09:12 -0500 Subject: [PATCH 03/79] Disambiguate between Syntax Comment and Category Comment --- src/DiffSummary.hs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/DiffSummary.hs b/src/DiffSummary.hs index 52a2bf24e..957d408f0 100644 --- a/src/DiffSummary.hs +++ b/src/DiffSummary.hs @@ -108,7 +108,7 @@ toTermName source term = case unwrap term of S.Array _ -> termNameFromSource term S.Class identifier _ _ -> toTermName' identifier S.Method identifier _ _ -> toTermName' identifier - Comment a -> toCategoryName a + S.Comment a -> toCategoryName a S.Commented _ _ -> termNameFromChildren term where toTermName' = toTermName source termNameFromChildren term = termNameFromRange (unionRangesFrom (range term) (range <$> toList (unwrap term))) @@ -174,6 +174,7 @@ instance HasCategory Category where BinaryOperator -> "binary operator" Boolean -> "boolean" DictionaryLiteral -> "dictionary" + C.Comment -> "comment" C.Error -> "error" ExpressionStatements -> "expression statements" C.Assignment -> "assignment" From e29651ce073c86e619552dff0b33ac4b882e408c Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Tue, 16 Aug 2016 13:09:43 -0500 Subject: [PATCH 04/79] Create Comment Category instances from TreeSitter productions --- src/TreeSitter.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/TreeSitter.hs b/src/TreeSitter.hs index f0c66fed4..2e83994f2 100644 --- a/src/TreeSitter.hs +++ b/src/TreeSitter.hs @@ -93,6 +93,7 @@ defaultCategoryForNodeName name = case name of "throw_statement" -> Throw "try_statement" -> Try "method_definition" -> Method + "comment" -> Comment _ -> Other name -- | Return a parser for a tree sitter language & document. From 1f1b0a9b3fba02b04be79918716be18e1f549823 Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Tue, 16 Aug 2016 13:10:26 -0500 Subject: [PATCH 05/79] Specifically handle Comment nodes instead of breaking them down by words (the default behavior for Leaf syntaxes that are not Regex) --- src/Diffing.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Diffing.hs b/src/Diffing.hs index 522f7b293..5ee1ca1e4 100644 --- a/src/Diffing.hs +++ b/src/Diffing.hs @@ -34,6 +34,7 @@ import Term import TreeSitter import Text.Parser.TreeSitter.Language import qualified Data.Text as T +import Category import Data.Aeson (pairs) import Data.Aeson.Encoding (encodingToLazyByteString) @@ -97,6 +98,7 @@ breakDownLeavesByWord source = cata replaceIn where replaceIn (info :< syntax) = cofree $ info :< syntax' where syntax' = case (ranges, syntax) of + (_:_:_, Leaf _) | category info == Category.Comment -> syntax (_:_:_, Leaf _) | category info /= Regex -> Indexed (makeLeaf info <$> ranges) _ -> syntax ranges = rangesAndWordsInSource (characterRange info) From 30b94f822c7f70f4a0a974866f6a54dc1dbeb7dd Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Tue, 16 Aug 2016 13:22:41 -0500 Subject: [PATCH 06/79] Add Set for Category constructors that should not be made into Indexed leaves --- src/Diffing.hs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Diffing.hs b/src/Diffing.hs index 5ee1ca1e4..4bdd9ff10 100644 --- a/src/Diffing.hs +++ b/src/Diffing.hs @@ -8,6 +8,7 @@ import Data.Functor.Both import Data.Functor.Foldable import Data.RandomWalkSimilarity import Data.Record +import qualified Data.Set as Set import qualified Data.Text.IO as TextIO import qualified Data.Text.ICU.Detect as Detect import qualified Data.Text.ICU.Convert as Convert @@ -98,12 +99,12 @@ breakDownLeavesByWord source = cata replaceIn where replaceIn (info :< syntax) = cofree $ info :< syntax' where syntax' = case (ranges, syntax) of - (_:_:_, Leaf _) | category info == Category.Comment -> syntax - (_:_:_, Leaf _) | category info /= Regex -> Indexed (makeLeaf info <$> ranges) + (_:_:_, Leaf _) | Set.notMember (category info) preserveSyntax -> Indexed (makeLeaf info <$> ranges) _ -> syntax ranges = rangesAndWordsInSource (characterRange info) rangesAndWordsInSource range = rangesAndWordsFrom (start range) (toString $ slice range source) makeLeaf info (range, substring) = cofree $ setCharacterRange info range :< Leaf (toS substring) + preserveSyntax = Set.fromList [Regex, Category.Comment] -- | Transcode a file to a unicode source. transcode :: B1.ByteString -> IO (Source Char) From 2b196ade35576421052529d47ba3ca14542d9d21 Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Tue, 16 Aug 2016 13:24:36 -0500 Subject: [PATCH 07/79] Add explanation about the intent of the `preserveSyntax` Set --- src/Diffing.hs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Diffing.hs b/src/Diffing.hs index 4bdd9ff10..dc55ca971 100644 --- a/src/Diffing.hs +++ b/src/Diffing.hs @@ -104,6 +104,9 @@ breakDownLeavesByWord source = cata replaceIn ranges = rangesAndWordsInSource (characterRange info) rangesAndWordsInSource range = rangesAndWordsFrom (start range) (toString $ slice range source) makeLeaf info (range, substring) = cofree $ setCharacterRange info range :< Leaf (toS substring) + -- Some Category constructors should retain their original structure, and not be sliced + -- into words. This Set represents those Category constructors for which we want to + -- preserve the original Syntax. preserveSyntax = Set.fromList [Regex, Category.Comment] -- | Transcode a file to a unicode source. From a9c3bfa671dd7909f15e1c77f73f4074f012c6c7 Mon Sep 17 00:00:00 2001 From: joshvera Date: Tue, 16 Aug 2016 15:44:02 -0400 Subject: [PATCH 08/79] Add CommaOperator --- src/Category.hs | 2 ++ src/TreeSitter.hs | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Category.hs b/src/Category.hs index b82d99d42..3116788b8 100644 --- a/src/Category.hs +++ b/src/Category.hs @@ -76,6 +76,8 @@ data Category | Case -- | An expression with an operator. | Operator + -- | An comma operator expression + | CommaOperator -- | An object/dictionary/hash literal. | Object -- | A throw statement. diff --git a/src/TreeSitter.hs b/src/TreeSitter.hs index f0c66fed4..052aa3e96 100644 --- a/src/TreeSitter.hs +++ b/src/TreeSitter.hs @@ -41,7 +41,7 @@ categoriesForLanguage language name = case (language, name) of (JavaScript, "bool_op") -> BinaryOperator -- boolean operator, e.g. ||, &&. (JavaScript, "bitwise_op") -> BinaryOperator -- bitwise operator, e.g. ^, &, etc. (JavaScript, "rel_op") -> BinaryOperator -- relational operator, e.g. >, <, <=, >=, ==, !=. - (JavaScript, "comma_op") -> Operator -- comma operator, e.g. expr1, expr2. + (JavaScript, "comma_op") -> CommaOperator -- comma operator, e.g. expr1, expr2. (JavaScript, "delete_op") -> Operator -- delete operator, e.g. delete x[2]. (JavaScript, "type_op") -> Operator -- type operator, e.g. typeof Object. (JavaScript, "void_op") -> Operator -- void operator, e.g. void 2. From 7785cbf4985ba0291db45e7652c98f5a78b69224 Mon Sep 17 00:00:00 2001 From: joshvera Date: Tue, 16 Aug 2016 16:18:02 -0400 Subject: [PATCH 09/79] Add comma operator mapping --- src/DiffSummary.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/DiffSummary.hs b/src/DiffSummary.hs index 52a2bf24e..513517bb8 100644 --- a/src/DiffSummary.hs +++ b/src/DiffSummary.hs @@ -213,6 +213,7 @@ instance HasCategory Category where C.Class -> "class" C.Method -> "method" C.If -> "if statement" + C.CommaOperator -> "comma operator" instance (HasCategory leaf, HasField fields Category) => HasCategory (Term leaf (Record fields)) where toCategoryName = toCategoryName . category . extract From cb3a7095408643323b0cb2e5c31ac53aa54bdffe Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Tue, 16 Aug 2016 15:36:36 -0500 Subject: [PATCH 10/79] Add missing Comment Category to Split styles --- src/Renderer/Split.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Renderer/Split.hs b/src/Renderer/Split.hs index d2fac0102..c8559d17b 100644 --- a/src/Renderer/Split.hs +++ b/src/Renderer/Split.hs @@ -40,6 +40,7 @@ styleName category = "category-" <> case category of StringLiteral -> "string" SymbolLiteral -> "symbol" IntegerLiteral -> "integer" + C.Comment -> "comment" C.FunctionCall -> "function_call" C.Function -> "function" C.MethodCall -> "method_call" From b4133bdcac0a9dabb3ad1cba7e5f249fc790df7d Mon Sep 17 00:00:00 2001 From: joshvera Date: Tue, 16 Aug 2016 17:24:06 -0400 Subject: [PATCH 11/79] Map comma operators to flattened indexed nodes --- src/Parser.hs | 3 +++ test/repos/js-test | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Parser.hs b/src/Parser.hs index 297516fcf..8e43468fb 100644 --- a/src/Parser.hs +++ b/src/Parser.hs @@ -55,6 +55,9 @@ termConstructor source sourceSpan info = fmap cofree . construct (base:element:[]) -> withDefaultInfo $ S.SubscriptAccess base element _ -> errorWith children construct children | isOperator (category info) = withDefaultInfo $ S.Operator children + construct children | CommaOperator == category info = withDefaultInfo $ case children of + [child, rest] | S.Indexed _ <- unwrap rest -> S.Indexed $ child : toList (unwrap rest) + _ -> S.Indexed children construct children | Function == category info = case children of (body:[]) -> withDefaultInfo $ S.Function Nothing Nothing body (params:body:[]) | (info :< _) <- runCofree params, Params == category info -> diff --git a/test/repos/js-test b/test/repos/js-test index 3af143b53..45875c177 160000 --- a/test/repos/js-test +++ b/test/repos/js-test @@ -1 +1 @@ -Subproject commit 3af143b53c8f9a2a6761d8e9ea91d47982cc3bc0 +Subproject commit 45875c177feba7e9c25f780669e6719c59441a42 From b14234f0aa003a5e4bea938e8ef96fd9006ec0e1 Mon Sep 17 00:00:00 2001 From: joshvera Date: Tue, 16 Aug 2016 17:24:22 -0400 Subject: [PATCH 12/79] bind toLeafInfos --- src/DiffSummary.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DiffSummary.hs b/src/DiffSummary.hs index 513517bb8..0c5dbc11f 100644 --- a/src/DiffSummary.hs +++ b/src/DiffSummary.hs @@ -57,7 +57,7 @@ summaries (Replace i1 i2) = zipWith (\a b -> "Replaced" <+> "the" <+> a <+> "wit toLeafInfos :: DiffInfo -> [Doc] toLeafInfos LeafInfo{..} = pure $ squotes (toDoc termName) <+> (toDoc categoryName) -toLeafInfos BranchInfo{..} = pretty <$> branches +toLeafInfos BranchInfo{..} = toLeafInfos =<< branches toLeafInfos err@ErrorInfo{} = pure $ pretty err toTermName :: (HasCategory leaf, HasField fields Category, HasField fields Range) => Source Char -> Term leaf (Record fields) -> Text From 59997566bbf783a14bf35aba80c580a835df0473 Mon Sep 17 00:00:00 2001 From: joshvera Date: Tue, 16 Aug 2016 22:03:34 -0400 Subject: [PATCH 13/79] No need to unwrap rest --- src/Parser.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Parser.hs b/src/Parser.hs index 8e43468fb..67535fb48 100644 --- a/src/Parser.hs +++ b/src/Parser.hs @@ -56,7 +56,7 @@ termConstructor source sourceSpan info = fmap cofree . construct _ -> errorWith children construct children | isOperator (category info) = withDefaultInfo $ S.Operator children construct children | CommaOperator == category info = withDefaultInfo $ case children of - [child, rest] | S.Indexed _ <- unwrap rest -> S.Indexed $ child : toList (unwrap rest) + [child, rest] | S.Indexed f <- unwrap rest -> S.Indexed $ child : toList f _ -> S.Indexed children construct children | Function == category info = case children of (body:[]) -> withDefaultInfo $ S.Function Nothing Nothing body From b24d12d869822925778918ff7dba7363a456baf7 Mon Sep 17 00:00:00 2001 From: joshvera Date: Tue, 16 Aug 2016 22:03:48 -0400 Subject: [PATCH 14/79] Add missing cases to toSeries and toText --- src/Renderer.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Renderer.hs b/src/Renderer.hs index fbbfca8c3..0c537d657 100644 --- a/src/Renderer.hs +++ b/src/Renderer.hs @@ -24,10 +24,12 @@ concatOutputs l = T.intercalate "\n" (toText <$> l) toSeries :: Output -> Series toSeries (JSONOutput series) = series toSeries (SummaryOutput series) = series +toSeries _ = mempty toText :: Output -> Text toText (SplitOutput text) = text toText (PatchOutput text) = text +toText _ = mempty -- | The available types of diff rendering. From 3a4a19b9ab72c7e5191f79baf7ac834ace8255ef Mon Sep 17 00:00:00 2001 From: joshvera Date: Tue, 16 Aug 2016 22:04:36 -0400 Subject: [PATCH 15/79] s/f/cs --- src/Parser.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Parser.hs b/src/Parser.hs index 67535fb48..f59b65e09 100644 --- a/src/Parser.hs +++ b/src/Parser.hs @@ -56,7 +56,7 @@ termConstructor source sourceSpan info = fmap cofree . construct _ -> errorWith children construct children | isOperator (category info) = withDefaultInfo $ S.Operator children construct children | CommaOperator == category info = withDefaultInfo $ case children of - [child, rest] | S.Indexed f <- unwrap rest -> S.Indexed $ child : toList f + [child, rest] | S.Indexed cs <- unwrap rest -> S.Indexed $ child : toList cs _ -> S.Indexed children construct children | Function == category info = case children of (body:[]) -> withDefaultInfo $ S.Function Nothing Nothing body From 5615da1bc08f79c431ae17c3d70038b55e785208 Mon Sep 17 00:00:00 2001 From: joshvera Date: Wed, 17 Aug 2016 10:42:05 -0400 Subject: [PATCH 16/79] Change methodParams to be a list of args --- src/Parser.hs | 6 ++++-- src/Syntax.hs | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Parser.hs b/src/Parser.hs index f59b65e09..e247e883d 100644 --- a/src/Parser.hs +++ b/src/Parser.hs @@ -69,8 +69,10 @@ termConstructor source sourceSpan info = fmap cofree . construct _ -> errorWith children construct children | FunctionCall == category info = case runCofree <$> children of - [ (_ :< S.MemberAccess{..}), params@(_ :< S.Args{}) ] -> - pure $! setCategory info MethodCall :< S.MethodCall memberId property (cofree params) + [ (_ :< S.MemberAccess{..}), params@(_ :< S.Args args) ] -> + pure $! setCategory info MethodCall :< S.MethodCall memberId property args + [ (_ :< S.MemberAccess{..}) ] -> + pure $! setCategory info MethodCall :< S.MethodCall memberId property [] (x:xs) -> withDefaultInfo $ S.FunctionCall (cofree x) (cofree <$> xs) _ -> errorWith children diff --git a/src/Syntax.hs b/src/Syntax.hs index 9ef6465ab..85763201b 100644 --- a/src/Syntax.hs +++ b/src/Syntax.hs @@ -33,7 +33,7 @@ data Syntax a f | MemberAccess { memberId :: f, property :: f } -- | A method call consisting of its target, the method name, and the parameters passed to the method. -- | e.g. in Javascript console.log('hello') represents a method call. - | MethodCall { targetId :: f, methodId :: f, methodParams :: f } + | MethodCall { targetId :: f, methodId :: f, methodParams :: [f] } -- | The list of arguments to a method call. -- | TODO: It might be worth removing this and using Fixed instead. | Args [f] From c16effb699fd910c64c090dd4a90ef573e1624a4 Mon Sep 17 00:00:00 2001 From: joshvera Date: Wed, 17 Aug 2016 11:00:17 -0400 Subject: [PATCH 17/79] Add comma operator to Split --- src/Renderer/Split.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Renderer/Split.hs b/src/Renderer/Split.hs index c8559d17b..087a6c0f9 100644 --- a/src/Renderer/Split.hs +++ b/src/Renderer/Split.hs @@ -74,6 +74,7 @@ styleName category = "category-" <> case category of C.Class -> "class_statement" C.Method -> "method" C.If -> "if_statement" + C.CommaOperator -> "comma_operator" Other string -> string -- | Pick the class name for a split patch. From d90653aaffa7551aa364a2630ec815242e0313af Mon Sep 17 00:00:00 2001 From: joshvera Date: Wed, 17 Aug 2016 11:23:53 -0400 Subject: [PATCH 18/79] remove unused params --- src/Parser.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Parser.hs b/src/Parser.hs index e247e883d..594eb3ddd 100644 --- a/src/Parser.hs +++ b/src/Parser.hs @@ -69,7 +69,7 @@ termConstructor source sourceSpan info = fmap cofree . construct _ -> errorWith children construct children | FunctionCall == category info = case runCofree <$> children of - [ (_ :< S.MemberAccess{..}), params@(_ :< S.Args args) ] -> + [ (_ :< S.MemberAccess{..}), (_ :< S.Args args) ] -> pure $! setCategory info MethodCall :< S.MethodCall memberId property args [ (_ :< S.MemberAccess{..}) ] -> pure $! setCategory info MethodCall :< S.MethodCall memberId property [] From c09887b8df6bfa1a5fa2b06f8c0170264424f61b Mon Sep 17 00:00:00 2001 From: joshvera Date: Wed, 17 Aug 2016 11:24:12 -0400 Subject: [PATCH 19/79] update semantic-diff tests --- test/diffs/asymmetrical-context.split.js | 8 ++++---- test/diffs/insert.split.js | 6 +++--- test/diffs/multiline-insert.split.js | 14 +++++++------- test/diffs/nested-insert.split.js | 6 +++--- test/diffs/newline-at-eof.json.js | 2 +- test/diffs/newline-at-eof.split.js | 6 +++--- test/diffs/no-newline-at-eof.json.js | 2 +- test/diffs/no-newline-at-eof.split.js | 6 +++--- 8 files changed, 25 insertions(+), 25 deletions(-) diff --git a/test/diffs/asymmetrical-context.split.js b/test/diffs/asymmetrical-context.split.js index 31bc72255..10de89c0c 100644 --- a/test/diffs/asymmetrical-context.split.js +++ b/test/diffs/asymmetrical-context.split.js @@ -1,7 +1,7 @@ -
1
      • console
      • .
      • log
      • (
          • '
          • hello
          • '
      • )
    • ;
  • + - @@ -20,9 +20,9 @@ - - diff --git a/test/diffs/insert.split.js b/test/diffs/insert.split.js index 04fe95e10..1bdaa6bef 100644 --- a/test/diffs/insert.split.js +++ b/test/diffs/insert.split.js @@ -1,11 +1,11 @@ -
    1
        • console
        • .
        • log
        • (
          • '
          • hello
          • '
        • )
      • ;
    1
        • console
        • .
        • log
        • (
            • '
            • hello
            • '
        • )
      • ;
    • +
    1
        • console
        • .
        • log
        • (
          • '
          • hello
          • '
        • )
      • ;
    5
    2
        • console
        • .
        • log
        • (
            • '
            • world
            • '
        • )
      • ;
    • +
    2
        • console
        • .
        • log
        • (
          • '
          • world
          • '
        • )
      • ;
    6
        • console
        • .
        • log
        • (
            • '
            • world
            • '
        • )
      • ;
    • +
    6
        • console
        • .
        • log
        • (
          • '
          • world
          • '
        • )
      • ;
    3
      1
          • console
          • .
          • log
          • (
              • '
              • hello
              • '
          • )
        • ;
      • + - - diff --git a/test/diffs/multiline-insert.split.js b/test/diffs/multiline-insert.split.js index 6d4caafc2..5fcbdab80 100644 --- a/test/diffs/multiline-insert.split.js +++ b/test/diffs/multiline-insert.split.js @@ -1,24 +1,24 @@ -
        1
            • console
            • .
            • log
            • (
              • '
              • hello
              • '
            • )
          • ;
        1
            • console
            • .
            • log
            • (
                • '
                • hello
                • '
            • )
          • ;
        • +
        1
            • console
            • .
            • log
            • (
              • '
              • hello
              • '
            • )
          • ;
        2
            • console
            • .
            • log
            • (
                • '
                • world
                • '
            • )
          • ;
        • +
        2
            • console
            • .
            • log
            • (
              • '
              • world
              • '
            • )
          • ;
        2
          1
              • console
              • .
              • log
              • (
                  • '
                  • hello
                  • '
              • )
            • ;
          • + - - - - - - diff --git a/test/diffs/nested-insert.split.js b/test/diffs/nested-insert.split.js index f43d38f8c..a41bcebe5 100644 --- a/test/diffs/nested-insert.split.js +++ b/test/diffs/nested-insert.split.js @@ -4,13 +4,13 @@ - - -
            1
                • console
                • .
                • log
                • (
                  • '
                  • hello
                  • '
                • )
              • ;
            1
                • console
                • .
                • log
                • (
                    • '
                    • hello
                    • '
                • )
              • ;
            • +
            1
                • console
                • .
                • log
                • (
                  • '
                  • hello
                  • '
                • )
              • ;
            2
              • if (
              • true
              • )
                • { +
            2
              • if (
              • true
              • )
                • {
            3
                    • console
                    • .
                    • log
                    • (
                        • '
                        • cruel
                        • '
                    • )
                  • ;
                • +
            3
                    • console
                    • .
                    • log
                    • (
                      • '
                      • cruel
                      • '
                    • )
                  • ;
            4
                • }
            • +
            4
                • }
            2
                • console
                • .
                • log
                • (
                    • '
                    • world
                    • '
                • )
              • ;
            • +
            2
                • console
                • .
                • log
                • (
                  • '
                  • world
                  • '
                • )
              • ;
            5
                • console
                • .
                • log
                • (
                    • '
                    • world
                    • '
                • )
              • ;
            • +
            5
                • console
                • .
                • log
                • (
                  • '
                  • world
                  • '
                • )
              • ;
            3
              1
                • if (
                • true
                • )
                  • {
              2
                      • console
                      • .
                      • log
                      • (
                          • '
                          • hello
                          • '
                      • )
                    • ;
                  • +
              2
                      • console
                      • .
                      • log
                      • (
                        • '
                        • hello
                        • '
                      • )
                    • ;
              2
                      • console
                      • .
                      • log
                      • (
                          • '
                          • hello
                          • '
                      • )
                    • ;
                  • +
              2
                      • console
                      • .
                      • log
                      • (
                        • '
                        • hello
                        • '
                      • )
                    • ;
              3
                      • console
                      • .
                      • log
                      • (
                          • '
                          • world
                          • '
                      • )
                    • ;
                  • +
              3
                      • console
                      • .
                      • log
                      • (
                        • '
                        • world
                        • '
                      • )
                    • ;
              3
                  • }
              • diff --git a/test/diffs/newline-at-eof.json.js b/test/diffs/newline-at-eof.json.js index 3d58ee6bc..9f0602ca9 100644 --- a/test/diffs/newline-at-eof.json.js +++ b/test/diffs/newline-at-eof.json.js @@ -1 +1 @@ -{"rows":[[{"number":1,"terms":[{"range":[0,29],"category":"Program","children":[{"range":[0,28],"category":"ExpressionStatements","children":[{"range":[0,27],"category":"MethodCall","children":[{"range":[0,7],"category":"Identifier"},{"range":[8,11],"category":"Identifier"},{"range":[12,26],"category":"Args","children":[{"range":[12,26],"category":"StringLiteral","children":[{"range":[12,13],"category":"StringLiteral"},{"range":[13,18],"category":"StringLiteral"},{"range":[18,19],"category":"StringLiteral"},{"range":[20,25],"category":"StringLiteral"},{"range":[25,26],"category":"StringLiteral"}]}]}]}]}]}],"range":[0,29],"hasChanges":false},{"number":1,"terms":[{"range":[0,29],"category":"Program","children":[{"range":[0,28],"category":"ExpressionStatements","children":[{"range":[0,27],"category":"MethodCall","children":[{"range":[0,7],"category":"Identifier"},{"range":[8,11],"category":"Identifier"},{"range":[12,26],"category":"Args","children":[{"range":[12,26],"category":"StringLiteral","children":[{"range":[12,13],"category":"StringLiteral"},{"range":[13,18],"category":"StringLiteral"},{"range":[18,19],"category":"StringLiteral"},{"range":[20,25],"category":"StringLiteral"},{"range":[25,26],"category":"StringLiteral"}]}]}]}]}]}],"range":[0,29],"hasChanges":false}],[{"number":2,"terms":[{"range":[29,29],"category":"Program","children":[]}],"range":[29,29],"hasChanges":false},{"number":2,"terms":[{"range":[29,30],"category":"Program","children":[]}],"range":[29,30],"hasChanges":false}],[{"number":3,"terms":[{"range":[30,56],"category":"Program","children":[{"patch":"insert","range":[30,55],"category":"ExpressionStatements","children":[{"range":[30,54],"category":"MethodCall","children":[{"range":[30,37],"category":"Identifier"},{"range":[38,41],"category":"Identifier"},{"range":[42,53],"category":"Args","children":[{"range":[42,53],"category":"StringLiteral","children":[{"range":[42,43],"category":"StringLiteral"},{"range":[43,52],"category":"StringLiteral"},{"range":[52,53],"category":"StringLiteral"}]}]}]}]}]}],"range":[30,56],"hasChanges":true}],[{"number":4,"terms":[{"range":[56,56],"category":"Program","children":[]}],"range":[56,56],"hasChanges":false}]],"oids":["0000000000000000000000000000000000000000","0000000000000000000000000000000000000000"],"paths":["test/diffs/newline-at-eof.A.js","test/diffs/newline-at-eof.B.js"]} \ No newline at end of file +{"rows":[[{"number":1,"terms":[{"range":[0,29],"category":"Program","children":[{"range":[0,28],"category":"ExpressionStatements","children":[{"range":[0,27],"category":"MethodCall","children":[{"range":[0,7],"category":"Identifier"},{"range":[8,11],"category":"Identifier"},{"range":[12,26],"category":"StringLiteral","children":[{"range":[12,13],"category":"StringLiteral"},{"range":[13,18],"category":"StringLiteral"},{"range":[18,19],"category":"StringLiteral"},{"range":[20,25],"category":"StringLiteral"},{"range":[25,26],"category":"StringLiteral"}]}]}]}]}],"range":[0,29],"hasChanges":false},{"number":1,"terms":[{"range":[0,29],"category":"Program","children":[{"range":[0,28],"category":"ExpressionStatements","children":[{"range":[0,27],"category":"MethodCall","children":[{"range":[0,7],"category":"Identifier"},{"range":[8,11],"category":"Identifier"},{"range":[12,26],"category":"StringLiteral","children":[{"range":[12,13],"category":"StringLiteral"},{"range":[13,18],"category":"StringLiteral"},{"range":[18,19],"category":"StringLiteral"},{"range":[20,25],"category":"StringLiteral"},{"range":[25,26],"category":"StringLiteral"}]}]}]}]}],"range":[0,29],"hasChanges":false}],[{"number":2,"terms":[{"range":[29,29],"category":"Program","children":[]}],"range":[29,29],"hasChanges":false},{"number":2,"terms":[{"range":[29,30],"category":"Program","children":[]}],"range":[29,30],"hasChanges":false}],[{"number":3,"terms":[{"range":[30,56],"category":"Program","children":[{"patch":"insert","range":[30,55],"category":"ExpressionStatements","children":[{"range":[30,54],"category":"MethodCall","children":[{"range":[30,37],"category":"Identifier"},{"range":[38,41],"category":"Identifier"},{"range":[42,53],"category":"StringLiteral","children":[{"range":[42,43],"category":"StringLiteral"},{"range":[43,52],"category":"StringLiteral"},{"range":[52,53],"category":"StringLiteral"}]}]}]}]}],"range":[30,56],"hasChanges":true}],[{"number":4,"terms":[{"range":[56,56],"category":"Program","children":[]}],"range":[56,56],"hasChanges":false}]],"oids":["0000000000000000000000000000000000000000","0000000000000000000000000000000000000000"],"paths":["test/diffs/newline-at-eof.A.js","test/diffs/newline-at-eof.B.js"]} \ No newline at end of file diff --git a/test/diffs/newline-at-eof.split.js b/test/diffs/newline-at-eof.split.js index 220e3f673..e3f5295b4 100644 --- a/test/diffs/newline-at-eof.split.js +++ b/test/diffs/newline-at-eof.split.js @@ -1,7 +1,7 @@ -
                1
                    • console
                    • .
                    • log
                    • (
                        • "
                        • hello
                        • ,
                        • world
                        • "
                    • )
                  • ;
                • + - @@ -9,7 +9,7 @@ - diff --git a/test/diffs/no-newline-at-eof.json.js b/test/diffs/no-newline-at-eof.json.js index ea5e696d5..3d1b24107 100644 --- a/test/diffs/no-newline-at-eof.json.js +++ b/test/diffs/no-newline-at-eof.json.js @@ -1 +1 @@ -{"rows":[[{"number":1,"terms":[{"range":[0,28],"category":"Program","children":[{"range":[0,28],"category":"ExpressionStatements","children":[{"range":[0,27],"category":"MethodCall","children":[{"range":[0,7],"category":"Identifier"},{"range":[8,11],"category":"Identifier"},{"range":[12,26],"category":"Args","children":[{"range":[12,26],"category":"StringLiteral","children":[{"range":[12,13],"category":"StringLiteral"},{"range":[13,18],"category":"StringLiteral"},{"range":[18,19],"category":"StringLiteral"},{"range":[20,25],"category":"StringLiteral"},{"range":[25,26],"category":"StringLiteral"}]}]}]}]}]}],"range":[0,28],"hasChanges":false},{"number":1,"terms":[{"range":[0,29],"category":"Program","children":[{"range":[0,28],"category":"ExpressionStatements","children":[{"range":[0,27],"category":"MethodCall","children":[{"range":[0,7],"category":"Identifier"},{"range":[8,11],"category":"Identifier"},{"range":[12,26],"category":"Args","children":[{"range":[12,26],"category":"StringLiteral","children":[{"range":[12,13],"category":"StringLiteral"},{"range":[13,18],"category":"StringLiteral"},{"range":[18,19],"category":"StringLiteral"},{"range":[20,25],"category":"StringLiteral"},{"range":[25,26],"category":"StringLiteral"}]}]}]}]}]}],"range":[0,29],"hasChanges":false}],[{"number":2,"terms":[{"range":[29,30],"category":"Program","children":[]}],"range":[29,30],"hasChanges":false}],[{"number":3,"terms":[{"range":[30,55],"category":"Program","children":[{"patch":"insert","range":[30,55],"category":"ExpressionStatements","children":[{"range":[30,54],"category":"MethodCall","children":[{"range":[30,37],"category":"Identifier"},{"range":[38,41],"category":"Identifier"},{"range":[42,53],"category":"Args","children":[{"range":[42,53],"category":"StringLiteral","children":[{"range":[42,43],"category":"StringLiteral"},{"range":[43,52],"category":"StringLiteral"},{"range":[52,53],"category":"StringLiteral"}]}]}]}]}]}],"range":[30,55],"hasChanges":true}]],"oids":["0000000000000000000000000000000000000000","0000000000000000000000000000000000000000"],"paths":["test/diffs/no-newline-at-eof.A.js","test/diffs/no-newline-at-eof.B.js"]} \ No newline at end of file +{"rows":[[{"number":1,"terms":[{"range":[0,28],"category":"Program","children":[{"range":[0,28],"category":"ExpressionStatements","children":[{"range":[0,27],"category":"MethodCall","children":[{"range":[0,7],"category":"Identifier"},{"range":[8,11],"category":"Identifier"},{"range":[12,26],"category":"StringLiteral","children":[{"range":[12,13],"category":"StringLiteral"},{"range":[13,18],"category":"StringLiteral"},{"range":[18,19],"category":"StringLiteral"},{"range":[20,25],"category":"StringLiteral"},{"range":[25,26],"category":"StringLiteral"}]}]}]}]}],"range":[0,28],"hasChanges":false},{"number":1,"terms":[{"range":[0,29],"category":"Program","children":[{"range":[0,28],"category":"ExpressionStatements","children":[{"range":[0,27],"category":"MethodCall","children":[{"range":[0,7],"category":"Identifier"},{"range":[8,11],"category":"Identifier"},{"range":[12,26],"category":"StringLiteral","children":[{"range":[12,13],"category":"StringLiteral"},{"range":[13,18],"category":"StringLiteral"},{"range":[18,19],"category":"StringLiteral"},{"range":[20,25],"category":"StringLiteral"},{"range":[25,26],"category":"StringLiteral"}]}]}]}]}],"range":[0,29],"hasChanges":false}],[{"number":2,"terms":[{"range":[29,30],"category":"Program","children":[]}],"range":[29,30],"hasChanges":false}],[{"number":3,"terms":[{"range":[30,55],"category":"Program","children":[{"patch":"insert","range":[30,55],"category":"ExpressionStatements","children":[{"range":[30,54],"category":"MethodCall","children":[{"range":[30,37],"category":"Identifier"},{"range":[38,41],"category":"Identifier"},{"range":[42,53],"category":"StringLiteral","children":[{"range":[42,43],"category":"StringLiteral"},{"range":[43,52],"category":"StringLiteral"},{"range":[52,53],"category":"StringLiteral"}]}]}]}]}],"range":[30,55],"hasChanges":true}]],"oids":["0000000000000000000000000000000000000000","0000000000000000000000000000000000000000"],"paths":["test/diffs/no-newline-at-eof.A.js","test/diffs/no-newline-at-eof.B.js"]} \ No newline at end of file diff --git a/test/diffs/no-newline-at-eof.split.js b/test/diffs/no-newline-at-eof.split.js index 9643db481..c6d2dd59b 100644 --- a/test/diffs/no-newline-at-eof.split.js +++ b/test/diffs/no-newline-at-eof.split.js @@ -1,6 +1,6 @@ -
                  1
                      • console
                      • .
                      • log
                      • (
                        • "
                        • hello
                        • ,
                        • world
                        • "
                      • )
                    • ;
                  1
                      • console
                      • .
                      • log
                      • (
                          • "
                          • hello
                          • ,
                          • world
                          • "
                      • )
                    • ;
                  • +
                  1
                      • console
                      • .
                      • log
                      • (
                        • "
                        • hello
                        • ,
                        • world
                        • "
                      • )
                    • ;
                  2
                    3
                        • console
                        • .
                        • log
                        • (
                            • "
                            • insertion
                            • "
                        • )
                      • ;
                    • +
                    3
                        • console
                        • .
                        • log
                        • (
                          • "
                          • insertion
                          • "
                        • )
                      • ;
                    -
                    1
                        • console
                        • .
                        • log
                        • (
                            • "
                            • hello
                            • ,
                            • world
                            • "
                        • )
                      • ;
                    1
                        • console
                        • .
                        • log
                        • (
                            • "
                            • hello
                            • ,
                            • world
                            • "
                        • )
                      • ;
                    • + + @@ -8,6 +8,6 @@ - +
                      1
                          • console
                          • .
                          • log
                          • (
                            • "
                            • hello
                            • ,
                            • world
                            • "
                          • )
                        • ;
                      1
                          • console
                          • .
                          • log
                          • (
                            • "
                            • hello
                            • ,
                            • world
                            • "
                          • )
                        • ;
                      3
                          • console
                          • .
                          • log
                          • (
                              • "
                              • insertion
                              • "
                          • )
                        • ;
                      3
                          • console
                          • .
                          • log
                          • (
                            • "
                            • insertion
                            • "
                          • )
                        • ;
                      \ No newline at end of file From ebe077c7c8fceddf86d5877580a4278f99772c79 Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Wed, 17 Aug 2016 10:59:24 -0500 Subject: [PATCH 20/79] Add BitwiseOperator Category constructor --- src/Category.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Category.hs b/src/Category.hs index 980ef002a..2f8f6baa2 100644 --- a/src/Category.hs +++ b/src/Category.hs @@ -14,6 +14,8 @@ data Category | Error -- | A boolean expression. | Boolean + -- | A bitwise operator. + | BitwiseOperator -- | An operator with 2 operands. | BinaryOperator -- | A literal key-value data structure. From b93bffe8310fd4e81f66ae5930f446e3464751dd Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Wed, 17 Aug 2016 10:59:39 -0500 Subject: [PATCH 21/79] Add BitwiseOperator constructor to list of operators --- src/Parser.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Parser.hs b/src/Parser.hs index f59b65e09..8f98aedd9 100644 --- a/src/Parser.hs +++ b/src/Parser.hs @@ -19,7 +19,7 @@ type Parser f a = SourceBlob -> IO (Cofree f a) -- | Whether a category is an Operator Category isOperator :: Category -> Bool -isOperator = flip Set.member (Set.fromList [ Operator, BinaryOperator ]) +isOperator = flip Set.member (Set.fromList [ Operator, BinaryOperator, BitwiseOperator ]) -- | Construct a term given source, the span covered, the annotation for the term, and its children. -- From c91ec6c9a1861276735bd4d045e9cf4bd6f685c6 Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Wed, 17 Aug 2016 11:00:01 -0500 Subject: [PATCH 22/79] Allow BitwiseOperator matching in diff summaries --- src/DiffSummary.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/DiffSummary.hs b/src/DiffSummary.hs index d9804a821..af38900aa 100644 --- a/src/DiffSummary.hs +++ b/src/DiffSummary.hs @@ -172,6 +172,7 @@ instance HasCategory Category where toCategoryName = \case ArrayLiteral -> "array" BinaryOperator -> "binary operator" + BitwiseOperator -> "bitwise operator" Boolean -> "boolean" DictionaryLiteral -> "dictionary" C.Comment -> "comment" From 9907711099f2d26d9c5489d94c74bd023c31ea32 Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Wed, 17 Aug 2016 11:00:25 -0500 Subject: [PATCH 23/79] Make "bitwise_op" a generic language production --- src/TreeSitter.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TreeSitter.hs b/src/TreeSitter.hs index d76c460eb..e2ea4b514 100644 --- a/src/TreeSitter.hs +++ b/src/TreeSitter.hs @@ -40,7 +40,6 @@ categoriesForLanguage language name = case (language, name) of (JavaScript, "generator_function") -> Function (JavaScript, "math_op") -> BinaryOperator -- bitwise operator, e.g. +, -, *, /. (JavaScript, "bool_op") -> BinaryOperator -- boolean operator, e.g. ||, &&. - (JavaScript, "bitwise_op") -> BinaryOperator -- bitwise operator, e.g. ^, &, etc. (JavaScript, "rel_op") -> BinaryOperator -- relational operator, e.g. >, <, <=, >=, ==, !=. (JavaScript, "comma_op") -> CommaOperator -- comma operator, e.g. expr1, expr2. (JavaScript, "delete_op") -> Operator -- delete operator, e.g. delete x[2]. @@ -96,6 +95,7 @@ defaultCategoryForNodeName name = case name of "try_statement" -> Try "method_definition" -> Method "comment" -> Comment + "bitwise_op" -> BitwiseOperator _ -> Other name {-# INLINE defaultCategoryForNodeName #-} From 3dae5cb2f86068564a2969f4f81881818b3fda1c Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Wed, 17 Aug 2016 11:00:47 -0500 Subject: [PATCH 24/79] Update Split to account for newly added CommaOperator and BitwiseOperator constructors --- src/Renderer/Split.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Renderer/Split.hs b/src/Renderer/Split.hs index c8559d17b..b12418bbe 100644 --- a/src/Renderer/Split.hs +++ b/src/Renderer/Split.hs @@ -34,6 +34,8 @@ styleName category = "category-" <> case category of Program -> "program" C.Error -> "error" BinaryOperator -> "binary-operator" + BitwiseOperator -> "bitwise-operator" + C.CommaOperator -> "comma-operator" Boolean -> "boolean" DictionaryLiteral -> "dictionary" C.Pair -> "pair" From 2fa8f1487e4e23c0fa16ca6df7a2fe73e500fd0c Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Wed, 17 Aug 2016 11:17:09 -0500 Subject: [PATCH 25/79] Submodule update --- vendor/tree-sitter-parsers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/tree-sitter-parsers b/vendor/tree-sitter-parsers index 07bc6380b..61b004baf 160000 --- a/vendor/tree-sitter-parsers +++ b/vendor/tree-sitter-parsers @@ -1 +1 @@ -Subproject commit 07bc6380b1e924395b5d40f33209783b96dd45fe +Subproject commit 61b004baf78814b0335c7a1a44f6d2ba0e848e18 From 349b2688848138f9215a265f1699fb8c44464bf5 Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Wed, 17 Aug 2016 11:54:16 -0500 Subject: [PATCH 26/79] Add RelationalOperator constructor for Category --- src/Category.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Category.hs b/src/Category.hs index 2f8f6baa2..3244fe3d7 100644 --- a/src/Category.hs +++ b/src/Category.hs @@ -18,6 +18,8 @@ data Category | BitwiseOperator -- | An operator with 2 operands. | BinaryOperator + -- | A relational operator (e.g. < or >=) + | RelationalOperator -- | A literal key-value data structure. | DictionaryLiteral -- | A pair, e.g. of a key & value From e0f28282e838ae33fa37e394dbe35a9bd24ec4b9 Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Wed, 17 Aug 2016 11:54:30 -0500 Subject: [PATCH 27/79] Add RelationalOperator constructor to list of operators --- src/Parser.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Parser.hs b/src/Parser.hs index 8f98aedd9..64e1a1a03 100644 --- a/src/Parser.hs +++ b/src/Parser.hs @@ -19,7 +19,7 @@ type Parser f a = SourceBlob -> IO (Cofree f a) -- | Whether a category is an Operator Category isOperator :: Category -> Bool -isOperator = flip Set.member (Set.fromList [ Operator, BinaryOperator, BitwiseOperator ]) +isOperator = flip Set.member (Set.fromList [ Operator, BinaryOperator, BitwiseOperator, RelationalOperator ]) -- | Construct a term given source, the span covered, the annotation for the term, and its children. -- From ed3fab42c378df9d4960abd1804abe9b0ad31062 Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Wed, 17 Aug 2016 11:54:40 -0500 Subject: [PATCH 28/79] Render RelationalOperator --- src/Renderer/Split.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Renderer/Split.hs b/src/Renderer/Split.hs index b12418bbe..431a49b13 100644 --- a/src/Renderer/Split.hs +++ b/src/Renderer/Split.hs @@ -35,6 +35,7 @@ styleName category = "category-" <> case category of C.Error -> "error" BinaryOperator -> "binary-operator" BitwiseOperator -> "bitwise-operator" + RelationalOperator -> "relational-operator" C.CommaOperator -> "comma-operator" Boolean -> "boolean" DictionaryLiteral -> "dictionary" From 95582bb7982eca8ef331c85e71b5c4a5225f3f30 Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Wed, 17 Aug 2016 11:54:53 -0500 Subject: [PATCH 29/79] Account for RelationalOperator in diff summaries --- src/DiffSummary.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/DiffSummary.hs b/src/DiffSummary.hs index af38900aa..c38da5ccb 100644 --- a/src/DiffSummary.hs +++ b/src/DiffSummary.hs @@ -173,6 +173,7 @@ instance HasCategory Category where ArrayLiteral -> "array" BinaryOperator -> "binary operator" BitwiseOperator -> "bitwise operator" + RelationalOperator -> "relational operator" Boolean -> "boolean" DictionaryLiteral -> "dictionary" C.Comment -> "comment" From 1b542a89ba2ec066465a7d77381c609564b6e490 Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Wed, 17 Aug 2016 11:55:20 -0500 Subject: [PATCH 30/79] Promote "rel_op" to general language production --- src/TreeSitter.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TreeSitter.hs b/src/TreeSitter.hs index e2ea4b514..659599a0e 100644 --- a/src/TreeSitter.hs +++ b/src/TreeSitter.hs @@ -40,7 +40,6 @@ categoriesForLanguage language name = case (language, name) of (JavaScript, "generator_function") -> Function (JavaScript, "math_op") -> BinaryOperator -- bitwise operator, e.g. +, -, *, /. (JavaScript, "bool_op") -> BinaryOperator -- boolean operator, e.g. ||, &&. - (JavaScript, "rel_op") -> BinaryOperator -- relational operator, e.g. >, <, <=, >=, ==, !=. (JavaScript, "comma_op") -> CommaOperator -- comma operator, e.g. expr1, expr2. (JavaScript, "delete_op") -> Operator -- delete operator, e.g. delete x[2]. (JavaScript, "type_op") -> Operator -- type operator, e.g. typeof Object. @@ -96,6 +95,7 @@ defaultCategoryForNodeName name = case name of "method_definition" -> Method "comment" -> Comment "bitwise_op" -> BitwiseOperator + "rel_op" -> RelationalOperator _ -> Other name {-# INLINE defaultCategoryForNodeName #-} From 2df90eada71db3cfe974a7e810c4808be7bde6e7 Mon Sep 17 00:00:00 2001 From: joshvera Date: Wed, 17 Aug 2016 16:34:58 -0400 Subject: [PATCH 31/79] Account for false positive close terms Improve nearestUnmapped by looking up the nearest unmapped term which is not a false positive. We look up the nearest unmapped terms closest to the query term, then sort by the constant time edit distance between the query term and the intersecting terms. Finally, we select the first term that has the lowest edit distance. --- src/Data/RandomWalkSimilarity.hs | 16 +++++++++++++++- src/Diff.hs | 4 ++-- src/Patch.hs | 2 +- src/Term.hs | 2 +- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index dc164b72a..dd036e3d1 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -24,6 +24,8 @@ import Prologue import Term () import Test.QuickCheck hiding (Fixed) import Test.QuickCheck.Random +import Data.List (intersectBy) +import Term (termSize) -- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf). rws :: (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double)) @@ -52,12 +54,24 @@ rws compare as bs pure $! do put (i, List.delete foundA unmappedA, List.delete foundB unmappedB) pure (i, compared) - nearestUnmapped unmapped tree key = find ((== termIndex (KdTree.nearest tree key)) . termIndex) unmapped + -- we get k different things + -- we need the nearest unmapped one which is not a false positive + -- so filter the k things to the list of unmapped things, and then take the smallest of those by constant-time edit distance approximation + -- later on maybe find a better k, filter during k-nearest lookup, etc. + nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (constantTimeEditDistance key) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree 2 key))) + + constantTimeEditDistance key a = fromMaybe (maxBound :: Int) $ diffCostOfMaybes . cutoff 10 <$> compare (term key) (term a) + + -- ((== termIndex ) . termIndex) insertion previous unmappedA unmappedB kv@(UnmappedTerm _ _ b) = do put (previous, unmappedA, List.delete kv unmappedB) pure (negate 1, inserting b) deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmappedA) +diffCostOfMaybes :: (Prologue.Foldable f, Functor f) => Free (CofreeF f (Both annotation)) (Maybe (Patch (Cofree f annotation))) -> Int +diffCostOfMaybes = diffSum $ patchSum termSize + where diffSum patchCost diff = sum $ fmap (maybe 0 patchCost) diff + -- | A term which has not yet been mapped by `rws`, along with its feature vector summary & index. data UnmappedTerm a = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature :: !(Vector.Vector Double), term :: !a } deriving Eq diff --git a/src/Diff.hs b/src/Diff.hs index ae917010f..ec56caac7 100644 --- a/src/Diff.hs +++ b/src/Diff.hs @@ -18,11 +18,11 @@ type instance Base (Free f a) = FreeF f a instance Functor f => Foldable.Foldable (Free f a) where project = runFree instance Functor f => Foldable.Unfoldable (Free f a) where embed = free -diffSum :: (Patch (Term a annotation) -> Integer) -> Diff a annotation -> Integer +diffSum :: (Patch (Term a annotation) -> Int) -> Diff a annotation -> Int diffSum patchCost diff = sum $ fmap patchCost diff -- | The sum of the node count of the diff’s patches. -diffCost :: Diff a annotation -> Integer +diffCost :: Diff a annotation -> Int diffCost = diffSum $ patchSum termSize -- | Merge a diff using a function to provide the Term (in Maybe, to simplify recovery of the before/after state) for every Patch. diff --git a/src/Patch.hs b/src/Patch.hs index a156f01f2..65a163838 100644 --- a/src/Patch.hs +++ b/src/Patch.hs @@ -58,7 +58,7 @@ mapPatch _ g (Insert b) = Insert (g b) mapPatch f g (Replace a b) = Replace (f a) (g b) -- | Calculate the cost of the patch given a function to compute the cost of a item. -patchSum :: (a -> Integer) -> Patch a -> Integer +patchSum :: (a -> Int) -> Patch a -> Int patchSum termCost patch = maybe 0 termCost (before patch) + maybe 0 termCost (after patch) -- | Return Just the value in This, or the first value in These, if any. diff --git a/src/Term.hs b/src/Term.hs index 0c71a52fb..ad7d49e5f 100644 --- a/src/Term.hs +++ b/src/Term.hs @@ -25,7 +25,7 @@ zipTerms t1 t2 = iter go (alignCofreeWith galign (const Nothing) both (These t1 where go (a :< s) = cofree . (a :<) <$> sequenceA s -- | Return the node count of a term. -termSize :: Term a annotation -> Integer +termSize :: (Prologue.Foldable f, Functor f) => Cofree f annotation -> Int termSize = cata size where size (_ :< syntax) = 1 + sum syntax From 2c04be022df5ba9bf288c1e33bebb36fe243eb06 Mon Sep 17 00:00:00 2001 From: joshvera Date: Wed, 17 Aug 2016 16:35:48 -0400 Subject: [PATCH 32/79] Remove unused comment --- src/Data/RandomWalkSimilarity.hs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index dd036e3d1..97345fcf1 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -62,7 +62,6 @@ rws compare as bs constantTimeEditDistance key a = fromMaybe (maxBound :: Int) $ diffCostOfMaybes . cutoff 10 <$> compare (term key) (term a) - -- ((== termIndex ) . termIndex) insertion previous unmappedA unmappedB kv@(UnmappedTerm _ _ b) = do put (previous, unmappedA, List.delete kv unmappedB) pure (negate 1, inserting b) From 11d6362997064283e7b83eca3cb7c48fa333f22a Mon Sep 17 00:00:00 2001 From: Rick Winfrey Date: Wed, 17 Aug 2016 16:12:40 -0500 Subject: [PATCH 33/79] Preserve original hashed value for Category --- src/Category.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Category.hs b/src/Category.hs index 3244fe3d7..1f471e488 100644 --- a/src/Category.hs +++ b/src/Category.hs @@ -18,8 +18,6 @@ data Category | BitwiseOperator -- | An operator with 2 operands. | BinaryOperator - -- | A relational operator (e.g. < or >=) - | RelationalOperator -- | A literal key-value data structure. | DictionaryLiteral -- | A pair, e.g. of a key & value @@ -102,6 +100,8 @@ data Category | Comment -- | A non-standard category, which can be used for comparability. | Other Text + -- | A relational operator (e.g. < or >=) + | RelationalOperator deriving (Eq, Generic, Ord, Show) -- Instances From 4cc090e92bd4ed4e065d7fa030037e5ded9b45d7 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 10:52:49 -0400 Subject: [PATCH 34/79] Use `Int` instead of `Integer`. --- src/Diffing.hs | 2 +- src/Info.hs | 2 +- src/SES.hs | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Diffing.hs b/src/Diffing.hs index dc55ca971..05d6e86a8 100644 --- a/src/Diffing.hs +++ b/src/Diffing.hs @@ -138,7 +138,7 @@ compareCategoryEq :: HasField fields Category => Term leaf (Record fields) -> Te compareCategoryEq = (==) `on` category . extract -- | The sum of the node count of the diff’s patches. -diffCostWithCachedTermCosts :: HasField fields Cost => Diff leaf (Record fields) -> Integer +diffCostWithCachedTermCosts :: HasField fields Cost => Diff leaf (Record fields) -> Int diffCostWithCachedTermCosts diff = unCost $ case runFree diff of Free (info :< _) -> sum (cost <$> info) Pure patch -> sum (cost . extract <$> patch) diff --git a/src/Info.hs b/src/Info.hs index 44ba5c7e9..30410252a 100644 --- a/src/Info.hs +++ b/src/Info.hs @@ -7,7 +7,7 @@ import Category import Range import Test.QuickCheck -newtype Cost = Cost { unCost :: Integer } +newtype Cost = Cost { unCost :: Int } deriving (Eq, Num, Ord, Show) characterRange :: HasField fields Range => Record fields -> Range diff --git a/src/SES.hs b/src/SES.hs index 97ad55d1a..cb6ae2349 100644 --- a/src/SES.hs +++ b/src/SES.hs @@ -9,7 +9,7 @@ import Prologue type Compare term edit = term -> term -> Maybe edit -- | A function that computes the cost of an edit. -type Cost edit = edit -> Integer +type Cost edit = edit -> Int -- | Find the shortest edit script (diff) between two terms given a function to compute the cost. ses :: Applicative edit => Compare term (edit (Patch term)) -> Cost (edit (Patch term)) -> [term] -> [term] -> [edit (Patch term)] @@ -17,7 +17,7 @@ ses diffTerms cost as bs = fst <$> evalState diffState Map.empty where diffState = diffAt diffTerms cost (0, 0) as bs -- | Find the shortest edit script between two terms at a given vertex in the edit graph. -diffAt :: Applicative edit => Compare term (edit (Patch term)) -> Cost (edit (Patch term)) -> (Integer, Integer) -> [term] -> [term] -> State (Map.Map (Integer, Integer) [(edit (Patch term), Integer)]) [(edit (Patch term), Integer)] +diffAt :: Applicative edit => Compare term (edit (Patch term)) -> Cost (edit (Patch term)) -> (Int, Int) -> [term] -> [term] -> State (Map.Map (Int, Int) [(edit (Patch term), Int)]) [(edit (Patch term), Int)] diffAt diffTerms cost (i, j) as bs | (a : as) <- as, (b : bs) <- bs = do cachedDiffs <- get @@ -47,5 +47,5 @@ diffAt diffTerms cost (i, j) as bs -- | Prepend an edit script and the cumulative cost onto the edit script. -consWithCost :: Cost edit -> edit -> [(edit, Integer)] -> [(edit, Integer)] +consWithCost :: Cost edit -> edit -> [(edit, Int)] -> [(edit, Int)] consWithCost cost edit rest = (edit, cost edit + maybe 0 snd (fst <$> uncons rest)) : rest From c3e0e9e7d4a7930f22374db10c02b490dc1f9897 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 11:01:27 -0400 Subject: [PATCH 35/79] Bump _q_ to 3. --- src/Diffing.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Diffing.hs b/src/Diffing.hs index 05d6e86a8..54f8f161e 100644 --- a/src/Diffing.hs +++ b/src/Diffing.hs @@ -64,7 +64,7 @@ diffFiles parser renderer sourceBlobs = do getLabel (h :< t) = (category h, case t of Leaf s -> Just s _ -> Nothing) - (p, q, d) = (2, 2, 15) + (p, q, d) = (2, 3, 15) -- | Return a parser based on the file extension (including the "."). parserForType :: Text -> Parser (Syntax Text) (Record '[Range, Category]) From d17f76bc6067fbcbfa75ab0cb811aade8bc941ab Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 11:14:37 -0400 Subject: [PATCH 36/79] Take surrounding context into account when computing bases. ``` 1 `div` 2 = 0 2 `div` 2 = 1 3 `div` 2 = 1 ``` etc. This means that given _q_ = 3, the base for b in [a, b, c, d] will be [a, b, c] instead of [b, c, d]. This means that grams encode limited sibling ordering information both forwards and backwards, making them slightly more stable. --- src/Data/RandomWalkSimilarity.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 97345fcf1..a914794be 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -91,7 +91,7 @@ pqGramDecorator getLabel p q = cata algebra where algebra term = let label = getLabel term in cofree ((gram label .: headF term) :< assignParentAndSiblingLabels (tailF term) label) gram label = Gram (padToSize p []) (padToSize q (pure (Just label))) - assignParentAndSiblingLabels functor label = (`evalState` (siblingLabels functor)) (for functor (assignLabels label)) + assignParentAndSiblingLabels functor label = (`evalState` (replicate (q `div` 2) Nothing <> siblingLabels functor)) (for functor (assignLabels label)) assignLabels :: label -> Cofree f (Record (Gram label ': fields)) -> State [Maybe label] (Cofree f (Record (Gram label ': fields))) assignLabels label a = case runCofree a of RCons gram rest :< functor -> do From 8aa0f082a6b4e85ec0c99888fc1c8b3389e231ed Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 11:23:21 -0400 Subject: [PATCH 37/79] Bump _q_ in all of the specs. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Figure it’s best if they use the same value for the parameter. --- test/Data/RandomWalkSimilarity/Spec.hs | 2 +- test/Diff/Spec.hs | 2 +- test/DiffSummarySpec.hs | 2 +- test/InterpreterSpec.hs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/Data/RandomWalkSimilarity/Spec.hs b/test/Data/RandomWalkSimilarity/Spec.hs index fdb9da3d0..8198ef019 100644 --- a/test/Data/RandomWalkSimilarity/Spec.hs +++ b/test/Data/RandomWalkSimilarity/Spec.hs @@ -30,7 +30,7 @@ spec = parallel $ do describe "rws" $ do let compare a b = if ((==) `on` category . extract) a b then Just (replacing a b) else Nothing - let decorate = featureVectorDecorator (category . headF) 2 2 15 + let decorate = featureVectorDecorator (category . headF) 2 3 15 let toTerm' = decorate . toTerm prop "produces correct diffs" . forAll (scale (`div` 4) arbitrary) $ \ (as, bs) -> let tas = toTerm' <$> (as :: [ArbitraryTerm Text (Record '[Category])]) diff --git a/test/Diff/Spec.hs b/test/Diff/Spec.hs index 91857b665..43ff6bb15 100644 --- a/test/Diff/Spec.hs +++ b/test/Diff/Spec.hs @@ -17,7 +17,7 @@ import Test.QuickCheck spec :: Spec spec = parallel $ do - let toTerm' = featureVectorDecorator (category . headF) 2 2 15 . toTerm + let toTerm' = featureVectorDecorator (category . headF) 2 3 15 . toTerm prop "equality is reflexive" $ \ a b -> let diff = diffTerms wrap (==) diffCost (toTerm' a) (toTerm' (b :: ArbitraryTerm Text (Record '[Category]))) in diff `shouldBe` diff diff --git a/test/DiffSummarySpec.hs b/test/DiffSummarySpec.hs index 93d69491c..28de15a2b 100644 --- a/test/DiffSummarySpec.hs +++ b/test/DiffSummarySpec.hs @@ -45,7 +45,7 @@ spec = parallel $ do diffSummaries sources testDiff `shouldBe` [ DiffSummary { patch = Insert (LeafInfo "string" "a"), parentAnnotation = Nothing } ] prop "equal terms produce identity diffs" $ - \ a -> let term = featureVectorDecorator (category . headF) 2 2 15 (toTerm (a :: ArbitraryTerm Text (Record '[Category, Range]))) in + \ a -> let term = featureVectorDecorator (category . headF) 2 3 15 (toTerm (a :: ArbitraryTerm Text (Record '[Category, Range]))) in diffSummaries sources (diffTerms wrap (==) diffCost term term) `shouldBe` [] describe "annotatedSummaries" $ do diff --git a/test/InterpreterSpec.hs b/test/InterpreterSpec.hs index 755b6001c..4c79aaf79 100644 --- a/test/InterpreterSpec.hs +++ b/test/InterpreterSpec.hs @@ -18,7 +18,7 @@ import Test.Hspec.QuickCheck spec :: Spec spec = parallel $ do describe "interpret" $ do - let decorate = featureVectorDecorator (category . headF) 2 2 15 + let decorate = featureVectorDecorator (category . headF) 2 3 15 let compare = ((==) `on` category . extract) it "returns a replacement when comparing two unicode equivalent terms" $ let termA = cofree $ (StringLiteral .: RNil) :< Leaf ("t\776" :: Text) From 02bb72ee0b0d7e88d7bae100b04b9f2c08227f10 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 11:25:21 -0400 Subject: [PATCH 38/79] Move compare to the where clause. --- test/Data/RandomWalkSimilarity/Spec.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Data/RandomWalkSimilarity/Spec.hs b/test/Data/RandomWalkSimilarity/Spec.hs index 8198ef019..1545aebf7 100644 --- a/test/Data/RandomWalkSimilarity/Spec.hs +++ b/test/Data/RandomWalkSimilarity/Spec.hs @@ -29,7 +29,6 @@ spec = parallel $ do \ (term, p, q, d) -> featureVectorDecorator (rhead . headF) (positively p) (positively q) (positively d) (toTerm term :: Term Text (Record '[Text])) `shouldSatisfy` all ((== (positively d)) . length . rhead) describe "rws" $ do - let compare a b = if ((==) `on` category . extract) a b then Just (replacing a b) else Nothing let decorate = featureVectorDecorator (category . headF) 2 3 15 let toTerm' = decorate . toTerm prop "produces correct diffs" . forAll (scale (`div` 4) arbitrary) $ @@ -42,3 +41,4 @@ spec = parallel $ do it "produces unbiased insertions within branches" $ let (a, b) = (decorate (cofree ((StringLiteral .: RNil) :< Indexed [ cofree ((StringLiteral .: RNil) :< Leaf "a") ])), decorate (cofree ((StringLiteral .: RNil) :< Indexed [ cofree ((StringLiteral .: RNil) :< Leaf "b") ]))) in fmap stripDiff (rws compare [ b ] [ a, b ]) `shouldBe` fmap stripDiff [ inserting a, replacing b b ] + where compare a b = if ((==) `on` category . extract) a b then Just (replacing a b) else Nothing From cabf1a887e279d183c4ba5f647447a791cab4346 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 11:37:55 -0400 Subject: [PATCH 39/79] Compute recursive as copies instead of replacements. This should make us more accurate to the tests of Interpreter. --- test/Data/RandomWalkSimilarity/Spec.hs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/Data/RandomWalkSimilarity/Spec.hs b/test/Data/RandomWalkSimilarity/Spec.hs index 1545aebf7..fcc3d8f8b 100644 --- a/test/Data/RandomWalkSimilarity/Spec.hs +++ b/test/Data/RandomWalkSimilarity/Spec.hs @@ -1,6 +1,8 @@ {-# LANGUAGE DataKinds #-} module Data.RandomWalkSimilarity.Spec where +import Data.Functor.Both +import Data.Functor.Foldable (cata) import Data.RandomWalkSimilarity import Data.Record import Diff @@ -41,4 +43,6 @@ spec = parallel $ do it "produces unbiased insertions within branches" $ let (a, b) = (decorate (cofree ((StringLiteral .: RNil) :< Indexed [ cofree ((StringLiteral .: RNil) :< Leaf "a") ])), decorate (cofree ((StringLiteral .: RNil) :< Indexed [ cofree ((StringLiteral .: RNil) :< Leaf "b") ]))) in fmap stripDiff (rws compare [ b ] [ a, b ]) `shouldBe` fmap stripDiff [ inserting a, replacing b b ] - where compare a b = if ((==) `on` category . extract) a b then Just (replacing a b) else Nothing + where compare :: (HasField fields Category, Functor f, Eq (Cofree f Category)) => Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) + compare a b | (category <$> a) == (category <$> b) = Just (cata wrap (fmap pure b)) + | otherwise = if ((==) `on` category . extract) a b then Just (replacing a b) else Nothing From ecaff655c1bda3c739db23d1f50f9b9403db902f Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 11:40:33 -0400 Subject: [PATCH 40/79] Extract a function to copy terms in. --- test/Data/RandomWalkSimilarity/Spec.hs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/Data/RandomWalkSimilarity/Spec.hs b/test/Data/RandomWalkSimilarity/Spec.hs index fcc3d8f8b..4f5d43651 100644 --- a/test/Data/RandomWalkSimilarity/Spec.hs +++ b/test/Data/RandomWalkSimilarity/Spec.hs @@ -44,5 +44,7 @@ spec = parallel $ do let (a, b) = (decorate (cofree ((StringLiteral .: RNil) :< Indexed [ cofree ((StringLiteral .: RNil) :< Leaf "a") ])), decorate (cofree ((StringLiteral .: RNil) :< Indexed [ cofree ((StringLiteral .: RNil) :< Leaf "b") ]))) in fmap stripDiff (rws compare [ b ] [ a, b ]) `shouldBe` fmap stripDiff [ inserting a, replacing b b ] where compare :: (HasField fields Category, Functor f, Eq (Cofree f Category)) => Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) - compare a b | (category <$> a) == (category <$> b) = Just (cata wrap (fmap pure b)) + compare a b | (category <$> a) == (category <$> b) = Just (copying b) | otherwise = if ((==) `on` category . extract) a b then Just (replacing a b) else Nothing + copying :: Functor f => Cofree f (Record fields) -> Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))) + copying = cata wrap . fmap pure From b14a7765e61426c696442565893d438a89996af1 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 11:40:39 -0400 Subject: [PATCH 41/79] Copy in b. --- test/Data/RandomWalkSimilarity/Spec.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Data/RandomWalkSimilarity/Spec.hs b/test/Data/RandomWalkSimilarity/Spec.hs index 4f5d43651..f93818998 100644 --- a/test/Data/RandomWalkSimilarity/Spec.hs +++ b/test/Data/RandomWalkSimilarity/Spec.hs @@ -42,7 +42,7 @@ spec = parallel $ do it "produces unbiased insertions within branches" $ let (a, b) = (decorate (cofree ((StringLiteral .: RNil) :< Indexed [ cofree ((StringLiteral .: RNil) :< Leaf "a") ])), decorate (cofree ((StringLiteral .: RNil) :< Indexed [ cofree ((StringLiteral .: RNil) :< Leaf "b") ]))) in - fmap stripDiff (rws compare [ b ] [ a, b ]) `shouldBe` fmap stripDiff [ inserting a, replacing b b ] + fmap stripDiff (rws compare [ b ] [ a, b ]) `shouldBe` fmap stripDiff [ inserting a, copying b ] where compare :: (HasField fields Category, Functor f, Eq (Cofree f Category)) => Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) compare a b | (category <$> a) == (category <$> b) = Just (copying b) | otherwise = if ((==) `on` category . extract) a b then Just (replacing a b) else Nothing From 768703dc7ee3e1c34aa884c00623f3c289a07f0a Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 12:27:26 -0400 Subject: [PATCH 42/79] :memo: nearestUnmapped. --- src/Data/RandomWalkSimilarity.hs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index a914794be..f64db9077 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -54,10 +54,12 @@ rws compare as bs pure $! do put (i, List.delete foundA unmappedA, List.delete foundB unmappedB) pure (i, compared) - -- we get k different things - -- we need the nearest unmapped one which is not a false positive - -- so filter the k things to the list of unmapped things, and then take the smallest of those by constant-time edit distance approximation - -- later on maybe find a better k, filter during k-nearest lookup, etc. + + -- | Finds the most-similar unmapped term to the passed-in term, if any. + -- + -- RWS can produce false positives in the case of e.g. hash collisions. Therefore, we find the _l_ nearest candidates, filter out any which have already been mapped, and select the minimum of the remaining by (a constant-time approximation of) edit distance. + -- + -- cf §4.2 of RWS-Diff nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (constantTimeEditDistance key) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree 2 key))) constantTimeEditDistance key a = fromMaybe (maxBound :: Int) $ diffCostOfMaybes . cutoff 10 <$> compare (term key) (term a) From ffad7170f086ffa365d3801aaec1c13e702acae8 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 12:31:46 -0400 Subject: [PATCH 43/79] Spacing. --- src/Data/RandomWalkSimilarity.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index f64db9077..8dba6ebe6 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -67,6 +67,7 @@ rws compare as bs insertion previous unmappedA unmappedB kv@(UnmappedTerm _ _ b) = do put (previous, unmappedA, List.delete kv unmappedB) pure (negate 1, inserting b) + deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmappedA) diffCostOfMaybes :: (Prologue.Foldable f, Functor f) => Free (CofreeF f (Both annotation)) (Maybe (Patch (Cofree f annotation))) -> Int From 3832a7b90fdb99328c4862c3b2740be5a3839efc Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 12:35:24 -0400 Subject: [PATCH 44/79] :memo: constantTimeEditDistance. --- src/Data/RandomWalkSimilarity.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 8dba6ebe6..524515ab2 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -62,6 +62,7 @@ rws compare as bs -- cf §4.2 of RWS-Diff nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (constantTimeEditDistance key) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree 2 key))) + -- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost. constantTimeEditDistance key a = fromMaybe (maxBound :: Int) $ diffCostOfMaybes . cutoff 10 <$> compare (term key) (term a) insertion previous unmappedA unmappedB kv@(UnmappedTerm _ _ b) = do From 0c4921b58cba05f339507eccc5858dc14d9b5fea Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 12:38:28 -0400 Subject: [PATCH 45/79] Bind a variable for l. --- src/Data/RandomWalkSimilarity.hs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 524515ab2..93c752684 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -60,7 +60,7 @@ rws compare as bs -- RWS can produce false positives in the case of e.g. hash collisions. Therefore, we find the _l_ nearest candidates, filter out any which have already been mapped, and select the minimum of the remaining by (a constant-time approximation of) edit distance. -- -- cf §4.2 of RWS-Diff - nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (constantTimeEditDistance key) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree 2 key))) + nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (constantTimeEditDistance key) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree l key))) -- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost. constantTimeEditDistance key a = fromMaybe (maxBound :: Int) $ diffCostOfMaybes . cutoff 10 <$> compare (term key) (term a) @@ -71,6 +71,8 @@ rws compare as bs deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmappedA) + l = 2 + diffCostOfMaybes :: (Prologue.Foldable f, Functor f) => Free (CofreeF f (Both annotation)) (Maybe (Patch (Cofree f annotation))) -> Int diffCostOfMaybes = diffSum $ patchSum termSize where diffSum patchCost diff = sum $ fmap (maybe 0 patchCost) diff From ab2dfacab61379d9823820a5b986570d4907cf4a Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 12:38:39 -0400 Subject: [PATCH 46/79] Bind a variable for m. --- src/Data/RandomWalkSimilarity.hs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 93c752684..2834e6c8b 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -63,7 +63,7 @@ rws compare as bs nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (constantTimeEditDistance key) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree l key))) -- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost. - constantTimeEditDistance key a = fromMaybe (maxBound :: Int) $ diffCostOfMaybes . cutoff 10 <$> compare (term key) (term a) + constantTimeEditDistance key a = fromMaybe (maxBound :: Int) $ diffCostOfMaybes . cutoff m <$> compare (term key) (term a) insertion previous unmappedA unmappedB kv@(UnmappedTerm _ _ b) = do put (previous, unmappedA, List.delete kv unmappedB) @@ -72,6 +72,7 @@ rws compare as bs deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmappedA) l = 2 + m = 10 diffCostOfMaybes :: (Prologue.Foldable f, Functor f) => Free (CofreeF f (Both annotation)) (Maybe (Patch (Cofree f annotation))) -> Int diffCostOfMaybes = diffSum $ patchSum termSize From 29a0db26755591fedc9c055b83028bfcbb99c41d Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 12:42:57 -0400 Subject: [PATCH 47/79] :memo: l & m. --- src/Data/RandomWalkSimilarity.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 2834e6c8b..486d2f624 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -71,7 +71,9 @@ rws compare as bs deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmappedA) + -- | How many of the most similar terms to consider, to rule out false positives. l = 2 + -- | How many nodes to consider for our constant-time approximation to tree edit distance. m = 10 diffCostOfMaybes :: (Prologue.Foldable f, Functor f) => Free (CofreeF f (Both annotation)) (Maybe (Patch (Cofree f annotation))) -> Int From f2c5e888012eeb5e76df8b96d9a32c4c4c3213f4 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 12:48:01 -0400 Subject: [PATCH 48/79] Extract a Comparator type synonym. --- src/Data/RandomWalkSimilarity.hs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 486d2f624..3c3adeb2f 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -27,9 +27,11 @@ import Test.QuickCheck.Random import Data.List (intersectBy) import Term (termSize) +type Comparator f a = Cofree f a -> Cofree f a -> Maybe (Free (CofreeF f (Both a)) (Patch (Cofree f a))) + -- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf). rws :: (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double)) - => (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function which compares a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared. + => Comparator f (Record fields) -- ^ A function which compares a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared. -> [Cofree f (Record fields)] -- ^ The list of old terms. -> [Cofree f (Record fields)] -- ^ The list of new terms. -> [Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))] -- ^ The resulting list of similarity-matched diffs. From a184d5ad4b544d98d834122d0bebf08ddf135bcb Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 12:48:13 -0400 Subject: [PATCH 49/79] Extract constantTimeEditDistance to the top level. --- src/Data/RandomWalkSimilarity.hs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 3c3adeb2f..18c6b03a8 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -62,10 +62,7 @@ rws compare as bs -- RWS can produce false positives in the case of e.g. hash collisions. Therefore, we find the _l_ nearest candidates, filter out any which have already been mapped, and select the minimum of the remaining by (a constant-time approximation of) edit distance. -- -- cf §4.2 of RWS-Diff - nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (constantTimeEditDistance key) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree l key))) - - -- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost. - constantTimeEditDistance key a = fromMaybe (maxBound :: Int) $ diffCostOfMaybes . cutoff m <$> compare (term key) (term a) + nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (constantTimeEditDistance compare m key) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree l key))) insertion previous unmappedA unmappedB kv@(UnmappedTerm _ _ b) = do put (previous, unmappedA, List.delete kv unmappedB) @@ -78,6 +75,10 @@ rws compare as bs -- | How many nodes to consider for our constant-time approximation to tree edit distance. m = 10 +-- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost. +constantTimeEditDistance :: (Prologue.Foldable f, Functor f) => Comparator f a -> Integer -> UnmappedTerm (Cofree f a) -> UnmappedTerm (Cofree f a) -> Int +constantTimeEditDistance compare m key a = fromMaybe maxBound $ diffCostOfMaybes . cutoff m <$> compare (term key) (term a) + diffCostOfMaybes :: (Prologue.Foldable f, Functor f) => Free (CofreeF f (Both annotation)) (Maybe (Patch (Cofree f annotation))) -> Int diffCostOfMaybes = diffSum $ patchSum termSize where diffSum patchCost diff = sum $ fmap (maybe 0 patchCost) diff From e063d7e0cdfc5425cf8211072830b4c1f2028d3d Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 12:49:17 -0400 Subject: [PATCH 50/79] constantTimeEditDistance is computed over terms. --- src/Data/RandomWalkSimilarity.hs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 18c6b03a8..2d1edf59d 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -62,7 +62,7 @@ rws compare as bs -- RWS can produce false positives in the case of e.g. hash collisions. Therefore, we find the _l_ nearest candidates, filter out any which have already been mapped, and select the minimum of the remaining by (a constant-time approximation of) edit distance. -- -- cf §4.2 of RWS-Diff - nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (constantTimeEditDistance compare m key) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree l key))) + nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (constantTimeEditDistance compare m (term key) . term) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree l key))) insertion previous unmappedA unmappedB kv@(UnmappedTerm _ _ b) = do put (previous, unmappedA, List.delete kv unmappedB) @@ -76,8 +76,8 @@ rws compare as bs m = 10 -- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost. -constantTimeEditDistance :: (Prologue.Foldable f, Functor f) => Comparator f a -> Integer -> UnmappedTerm (Cofree f a) -> UnmappedTerm (Cofree f a) -> Int -constantTimeEditDistance compare m key a = fromMaybe maxBound $ diffCostOfMaybes . cutoff m <$> compare (term key) (term a) +constantTimeEditDistance :: (Prologue.Foldable f, Functor f) => Comparator f a -> Integer -> Cofree f a -> Cofree f a -> Int +constantTimeEditDistance compare m a b = fromMaybe maxBound $ diffCostOfMaybes . cutoff m <$> compare a b diffCostOfMaybes :: (Prologue.Foldable f, Functor f) => Free (CofreeF f (Both annotation)) (Maybe (Patch (Cofree f annotation))) -> Int diffCostOfMaybes = diffSum $ patchSum termSize From ddbcb0cd18b400b3c8c8e1be928ea57dc2f1e078 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 12:50:03 -0400 Subject: [PATCH 51/79] Roll diffCostOfMaybes into constantTimeEditDistance. --- src/Data/RandomWalkSimilarity.hs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 2d1edf59d..ba7d8c0cb 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -77,10 +77,7 @@ rws compare as bs -- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost. constantTimeEditDistance :: (Prologue.Foldable f, Functor f) => Comparator f a -> Integer -> Cofree f a -> Cofree f a -> Int -constantTimeEditDistance compare m a b = fromMaybe maxBound $ diffCostOfMaybes . cutoff m <$> compare a b - -diffCostOfMaybes :: (Prologue.Foldable f, Functor f) => Free (CofreeF f (Both annotation)) (Maybe (Patch (Cofree f annotation))) -> Int -diffCostOfMaybes = diffSum $ patchSum termSize +constantTimeEditDistance compare m a b = fromMaybe maxBound $ diffSum (patchSum termSize) . cutoff m <$> compare a b where diffSum patchCost diff = sum $ fmap (maybe 0 patchCost) diff -- | A term which has not yet been mapped by `rws`, along with its feature vector summary & index. From f49527a268c523bdb4894fe27da4fbe4746f1d9b Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 12:50:15 -0400 Subject: [PATCH 52/79] Export constantTimeEditDistance. --- src/Data/RandomWalkSimilarity.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index ba7d8c0cb..38493c3b5 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -3,6 +3,7 @@ module Data.RandomWalkSimilarity ( rws , pqGramDecorator , featureVectorDecorator +, constantTimeEditDistance , stripDiff , stripTerm , Gram(..) From d169e6fa66ea957cc86abfa339a16a3d61ef9a04 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 12:51:07 -0400 Subject: [PATCH 53/79] Spacing. --- test/Data/RandomWalkSimilarity/Spec.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/test/Data/RandomWalkSimilarity/Spec.hs b/test/Data/RandomWalkSimilarity/Spec.hs index f93818998..9f390465e 100644 --- a/test/Data/RandomWalkSimilarity/Spec.hs +++ b/test/Data/RandomWalkSimilarity/Spec.hs @@ -43,6 +43,7 @@ spec = parallel $ do it "produces unbiased insertions within branches" $ let (a, b) = (decorate (cofree ((StringLiteral .: RNil) :< Indexed [ cofree ((StringLiteral .: RNil) :< Leaf "a") ])), decorate (cofree ((StringLiteral .: RNil) :< Indexed [ cofree ((StringLiteral .: RNil) :< Leaf "b") ]))) in fmap stripDiff (rws compare [ b ] [ a, b ]) `shouldBe` fmap stripDiff [ inserting a, copying b ] + where compare :: (HasField fields Category, Functor f, Eq (Cofree f Category)) => Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) compare a b | (category <$> a) == (category <$> b) = Just (copying b) | otherwise = if ((==) `on` category . extract) a b then Just (replacing a b) else Nothing From 3fb0ba5ddf33fbd28ca984b188f5ec8acf302539 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 12:54:41 -0400 Subject: [PATCH 54/79] constantTimeEditDistance takes a diff. --- src/Data/RandomWalkSimilarity.hs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 38493c3b5..ece55fec4 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -63,7 +63,7 @@ rws compare as bs -- RWS can produce false positives in the case of e.g. hash collisions. Therefore, we find the _l_ nearest candidates, filter out any which have already been mapped, and select the minimum of the remaining by (a constant-time approximation of) edit distance. -- -- cf §4.2 of RWS-Diff - nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (constantTimeEditDistance compare m (term key) . term) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree l key))) + nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (constantTimeEditDistance m) . compare (term key) . term) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree l key))) insertion previous unmappedA unmappedB kv@(UnmappedTerm _ _ b) = do put (previous, unmappedA, List.delete kv unmappedB) @@ -77,8 +77,8 @@ rws compare as bs m = 10 -- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost. -constantTimeEditDistance :: (Prologue.Foldable f, Functor f) => Comparator f a -> Integer -> Cofree f a -> Cofree f a -> Int -constantTimeEditDistance compare m a b = fromMaybe maxBound $ diffSum (patchSum termSize) . cutoff m <$> compare a b +constantTimeEditDistance :: (Prologue.Foldable f, Functor f) => Integer -> Free (CofreeF f (Both a)) (Patch (Cofree f a)) -> Int +constantTimeEditDistance m = diffSum (patchSum termSize) . cutoff m where diffSum patchCost diff = sum $ fmap (maybe 0 patchCost) diff -- | A term which has not yet been mapped by `rws`, along with its feature vector summary & index. From 627ae0552c78fd641421f59883d2d497a8dc9fee Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 12:54:52 -0400 Subject: [PATCH 55/79] Revert "Extract a Comparator type synonym." This reverts commit 1aa2b093f0f45815b73c8ad172cf507d2d7ed6ec. --- src/Data/RandomWalkSimilarity.hs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index ece55fec4..4d85d0bcd 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -28,11 +28,9 @@ import Test.QuickCheck.Random import Data.List (intersectBy) import Term (termSize) -type Comparator f a = Cofree f a -> Cofree f a -> Maybe (Free (CofreeF f (Both a)) (Patch (Cofree f a))) - -- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf). rws :: (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double)) - => Comparator f (Record fields) -- ^ A function which compares a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared. + => (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function which compares a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared. -> [Cofree f (Record fields)] -- ^ The list of old terms. -> [Cofree f (Record fields)] -- ^ The list of new terms. -> [Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))] -- ^ The resulting list of similarity-matched diffs. From 2706ed262e66966aba6f9ca5af916ca9671b5531 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 13:44:27 -0400 Subject: [PATCH 56/79] Rename constantTimeEditDistance to editDistanceUpTo. --- src/Data/RandomWalkSimilarity.hs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 4d85d0bcd..018c44885 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -3,7 +3,7 @@ module Data.RandomWalkSimilarity ( rws , pqGramDecorator , featureVectorDecorator -, constantTimeEditDistance +, editDistanceUpTo , stripDiff , stripTerm , Gram(..) @@ -61,7 +61,7 @@ rws compare as bs -- RWS can produce false positives in the case of e.g. hash collisions. Therefore, we find the _l_ nearest candidates, filter out any which have already been mapped, and select the minimum of the remaining by (a constant-time approximation of) edit distance. -- -- cf §4.2 of RWS-Diff - nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (constantTimeEditDistance m) . compare (term key) . term) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree l key))) + nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo m) . compare (term key) . term) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree l key))) insertion previous unmappedA unmappedB kv@(UnmappedTerm _ _ b) = do put (previous, unmappedA, List.delete kv unmappedB) @@ -75,8 +75,8 @@ rws compare as bs m = 10 -- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost. -constantTimeEditDistance :: (Prologue.Foldable f, Functor f) => Integer -> Free (CofreeF f (Both a)) (Patch (Cofree f a)) -> Int -constantTimeEditDistance m = diffSum (patchSum termSize) . cutoff m +editDistanceUpTo :: (Prologue.Foldable f, Functor f) => Integer -> Free (CofreeF f (Both a)) (Patch (Cofree f a)) -> Int +editDistanceUpTo m = diffSum (patchSum termSize) . cutoff m where diffSum patchCost diff = sum $ fmap (maybe 0 patchCost) diff -- | A term which has not yet been mapped by `rws`, along with its feature vector summary & index. From 3e01b1cd130476547833d5334fa93140d62d2257 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 13:59:34 -0400 Subject: [PATCH 57/79] Add bindings for the default values of the variables. --- src/Data/RandomWalkSimilarity.hs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 018c44885..3225a8b70 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -4,6 +4,9 @@ module Data.RandomWalkSimilarity , pqGramDecorator , featureVectorDecorator , editDistanceUpTo +, defaultD +, defaultP +, defaultQ , stripDiff , stripTerm , Gram(..) @@ -79,6 +82,13 @@ editDistanceUpTo :: (Prologue.Foldable f, Functor f) => Integer -> Free (CofreeF editDistanceUpTo m = diffSum (patchSum termSize) . cutoff m where diffSum patchCost diff = sum $ fmap (maybe 0 patchCost) diff +defaultD, defaultL, defaultM, defaultP, defaultQ :: Int +defaultD = 15 +defaultL = 2 +defaultM = 10 +defaultP = 2 +defaultQ = 3 + -- | A term which has not yet been mapped by `rws`, along with its feature vector summary & index. data UnmappedTerm a = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature :: !(Vector.Vector Double), term :: !a } deriving Eq From 900a0b7a1f0ed158d3f98b31c0a2c26fd6f1e5a3 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 14:00:40 -0400 Subject: [PATCH 58/79] Use the global l/m defaults. --- src/Data/RandomWalkSimilarity.hs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 3225a8b70..9487da71f 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -64,7 +64,7 @@ rws compare as bs -- RWS can produce false positives in the case of e.g. hash collisions. Therefore, we find the _l_ nearest candidates, filter out any which have already been mapped, and select the minimum of the remaining by (a constant-time approximation of) edit distance. -- -- cf §4.2 of RWS-Diff - nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo m) . compare (term key) . term) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree l key))) + nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo defaultM) . compare (term key) . term) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree defaultL key))) insertion previous unmappedA unmappedB kv@(UnmappedTerm _ _ b) = do put (previous, unmappedA, List.delete kv unmappedB) @@ -72,23 +72,22 @@ rws compare as bs deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmappedA) - -- | How many of the most similar terms to consider, to rule out false positives. - l = 2 - -- | How many nodes to consider for our constant-time approximation to tree edit distance. - m = 10 - -- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost. editDistanceUpTo :: (Prologue.Foldable f, Functor f) => Integer -> Free (CofreeF f (Both a)) (Patch (Cofree f a)) -> Int editDistanceUpTo m = diffSum (patchSum termSize) . cutoff m where diffSum patchCost diff = sum $ fmap (maybe 0 patchCost) diff -defaultD, defaultL, defaultM, defaultP, defaultQ :: Int +defaultD, defaultL, defaultP, defaultQ :: Int defaultD = 15 +-- | How many of the most similar terms to consider, to rule out false positives. defaultL = 2 -defaultM = 10 defaultP = 2 defaultQ = 3 +-- | How many nodes to consider for our constant-time approximation to tree edit distance. +defaultM :: Integer +defaultM = 10 + -- | A term which has not yet been mapped by `rws`, along with its feature vector summary & index. data UnmappedTerm a = UnmappedTerm { termIndex :: {-# UNPACK #-} !Int, feature :: !(Vector.Vector Double), term :: !a } deriving Eq From 1473c10659ae44cc10a85aed14e0d3e237b1655b Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 14:02:48 -0400 Subject: [PATCH 59/79] Add a defaultFeatureVectorDecorator function. --- src/Data/RandomWalkSimilarity.hs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 9487da71f..bcaa59bfd 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -2,6 +2,7 @@ module Data.RandomWalkSimilarity ( rws , pqGramDecorator +, defaultFeatureVectorDecorator , featureVectorDecorator , editDistanceUpTo , defaultD @@ -125,13 +126,17 @@ unitVector d hash = normalize ((`evalRand` mkQCGen hash) (sequenceA (Vector.repl where normalize vec = fmap (/ vmagnitude vec) vec vmagnitude = sqrtDouble . Vector.sum . fmap (** 2) --- | Annotates a term with a feature vector at each node. +-- | Annotates a term with a feature vector at each node, parameterized by stem length, base width, and feature vector dimensions. featureVectorDecorator :: (Hashable label, Traversable f) => (forall b. CofreeF f (Record fields) b -> label) -> Int -> Int -> Int -> Cofree f (Record fields) -> Cofree f (Record (Vector.Vector Double ': fields)) featureVectorDecorator getLabel p q d = cata (\ (RCons gram rest :< functor) -> cofree ((foldr (Vector.zipWith (+) . getField . extract) (unitVector d (hash gram)) functor .: rest) :< functor)) . pqGramDecorator getLabel p q +-- | Annotates a term with a feature vector at each node, using the default values for the p, q, and d parameters. +defaultFeatureVectorDecorator :: (Hashable label, Traversable f) => (forall b. CofreeF f (Record fields) b -> label) -> Cofree f (Record fields) -> Cofree f (Record (Vector.Vector Double ': fields)) +defaultFeatureVectorDecorator getLabel = featureVectorDecorator getLabel defaultP defaultQ defaultD + -- | Strips the head annotation off a term annotated with non-empty records. stripTerm :: Functor f => Cofree f (Record (h ': t)) -> Cofree f (Record t) stripTerm = fmap rtail From 00689650dd320792b1a6c5da0307c8756de6e0c3 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 14:03:36 -0400 Subject: [PATCH 60/79] Use the defaultFeatureVectorDecorator in the tests. --- test/Data/RandomWalkSimilarity/Spec.hs | 2 +- test/Diff/Spec.hs | 2 +- test/DiffSummarySpec.hs | 2 +- test/InterpreterSpec.hs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/Data/RandomWalkSimilarity/Spec.hs b/test/Data/RandomWalkSimilarity/Spec.hs index 9f390465e..118e72898 100644 --- a/test/Data/RandomWalkSimilarity/Spec.hs +++ b/test/Data/RandomWalkSimilarity/Spec.hs @@ -31,7 +31,7 @@ spec = parallel $ do \ (term, p, q, d) -> featureVectorDecorator (rhead . headF) (positively p) (positively q) (positively d) (toTerm term :: Term Text (Record '[Text])) `shouldSatisfy` all ((== (positively d)) . length . rhead) describe "rws" $ do - let decorate = featureVectorDecorator (category . headF) 2 3 15 + let decorate = defaultFeatureVectorDecorator (category . headF) let toTerm' = decorate . toTerm prop "produces correct diffs" . forAll (scale (`div` 4) arbitrary) $ \ (as, bs) -> let tas = toTerm' <$> (as :: [ArbitraryTerm Text (Record '[Category])]) diff --git a/test/Diff/Spec.hs b/test/Diff/Spec.hs index 43ff6bb15..351444ce9 100644 --- a/test/Diff/Spec.hs +++ b/test/Diff/Spec.hs @@ -17,7 +17,7 @@ import Test.QuickCheck spec :: Spec spec = parallel $ do - let toTerm' = featureVectorDecorator (category . headF) 2 3 15 . toTerm + let toTerm' = defaultFeatureVectorDecorator (category . headF) . toTerm prop "equality is reflexive" $ \ a b -> let diff = diffTerms wrap (==) diffCost (toTerm' a) (toTerm' (b :: ArbitraryTerm Text (Record '[Category]))) in diff `shouldBe` diff diff --git a/test/DiffSummarySpec.hs b/test/DiffSummarySpec.hs index 28de15a2b..94f21fae7 100644 --- a/test/DiffSummarySpec.hs +++ b/test/DiffSummarySpec.hs @@ -45,7 +45,7 @@ spec = parallel $ do diffSummaries sources testDiff `shouldBe` [ DiffSummary { patch = Insert (LeafInfo "string" "a"), parentAnnotation = Nothing } ] prop "equal terms produce identity diffs" $ - \ a -> let term = featureVectorDecorator (category . headF) 2 3 15 (toTerm (a :: ArbitraryTerm Text (Record '[Category, Range]))) in + \ a -> let term = defaultFeatureVectorDecorator (category . headF) (toTerm (a :: ArbitraryTerm Text (Record '[Category, Range]))) in diffSummaries sources (diffTerms wrap (==) diffCost term term) `shouldBe` [] describe "annotatedSummaries" $ do diff --git a/test/InterpreterSpec.hs b/test/InterpreterSpec.hs index 4c79aaf79..29ea5e7aa 100644 --- a/test/InterpreterSpec.hs +++ b/test/InterpreterSpec.hs @@ -18,7 +18,7 @@ import Test.Hspec.QuickCheck spec :: Spec spec = parallel $ do describe "interpret" $ do - let decorate = featureVectorDecorator (category . headF) 2 3 15 + let decorate = defaultFeatureVectorDecorator (category . headF) let compare = ((==) `on` category . extract) it "returns a replacement when comparing two unicode equivalent terms" $ let termA = cofree $ (StringLiteral .: RNil) :< Leaf ("t\776" :: Text) From 6765a9e764411c9654dcc3f7977bf4060c7e1545 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 14:04:25 -0400 Subject: [PATCH 61/79] Use the default parameters when diffing. --- src/Diffing.hs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Diffing.hs b/src/Diffing.hs index 54f8f161e..a406549d0 100644 --- a/src/Diffing.hs +++ b/src/Diffing.hs @@ -45,7 +45,7 @@ import Data.Aeson.Encoding (encodingToLazyByteString) -- | with respect to other IO actions. diffFiles :: (HasField fields Category, HasField fields Cost, HasField fields Range, Eq (Record fields)) => Parser (Syntax Text) (Record fields) -> Renderer (Record (Vector.Vector Double ': fields)) -> Both SourceBlob -> IO Output diffFiles parser renderer sourceBlobs = do - terms <- traverse (fmap (featureVectorDecorator getLabel p q d) . parser) sourceBlobs + terms <- traverse (fmap (defaultFeatureVectorDecorator getLabel) . parser) sourceBlobs let areNullOids = runBothWith (\a b -> (oid a == nullOid || null (source a), oid b == nullOid || null (source b))) sourceBlobs let textDiff = case areNullOids of @@ -64,7 +64,6 @@ diffFiles parser renderer sourceBlobs = do getLabel (h :< t) = (category h, case t of Leaf s -> Just s _ -> Nothing) - (p, q, d) = (2, 3, 15) -- | Return a parser based on the file extension (including the "."). parserForType :: Text -> Parser (Syntax Text) (Record '[Range, Category]) From 7009d9ac2b3c4db110f908e5558ec7402660476e Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 15:33:14 -0400 Subject: [PATCH 62/79] Move the multiple hunks fixture back to the main diffs list. --- test/{diffs-todo => diffs}/multiple-hunks.A.js | 0 test/{diffs-todo => diffs}/multiple-hunks.B.js | 0 test/{diffs-todo => diffs}/multiple-hunks.patch.js | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename test/{diffs-todo => diffs}/multiple-hunks.A.js (100%) rename test/{diffs-todo => diffs}/multiple-hunks.B.js (100%) rename test/{diffs-todo => diffs}/multiple-hunks.patch.js (100%) diff --git a/test/diffs-todo/multiple-hunks.A.js b/test/diffs/multiple-hunks.A.js similarity index 100% rename from test/diffs-todo/multiple-hunks.A.js rename to test/diffs/multiple-hunks.A.js diff --git a/test/diffs-todo/multiple-hunks.B.js b/test/diffs/multiple-hunks.B.js similarity index 100% rename from test/diffs-todo/multiple-hunks.B.js rename to test/diffs/multiple-hunks.B.js diff --git a/test/diffs-todo/multiple-hunks.patch.js b/test/diffs/multiple-hunks.patch.js similarity index 100% rename from test/diffs-todo/multiple-hunks.patch.js rename to test/diffs/multiple-hunks.patch.js From d4cbba043d795cfa93e4dd4a8e2dec94407a0c01 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 16:59:57 -0400 Subject: [PATCH 63/79] Disallow moves beyond a given bound. :tophat: @rewinfrey --- src/Data/RandomWalkSimilarity.hs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index bcaa59bfd..b501e5cb6 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -54,7 +54,7 @@ rws compare as bs foundA@(UnmappedTerm i _ a) <- nearestUnmapped unmappedA kdas kv foundB@(UnmappedTerm j' _ _) <- nearestUnmapped unmappedB kdbs foundA guard (j == j') - guard (i >= previous) + guard (previous <= i && i <= previous + defaultMoveBound) compared <- compare a b pure $! do put (i, List.delete foundA unmappedA, List.delete foundB unmappedB) @@ -78,12 +78,13 @@ editDistanceUpTo :: (Prologue.Foldable f, Functor f) => Integer -> Free (CofreeF editDistanceUpTo m = diffSum (patchSum termSize) . cutoff m where diffSum patchCost diff = sum $ fmap (maybe 0 patchCost) diff -defaultD, defaultL, defaultP, defaultQ :: Int +defaultD, defaultL, defaultP, defaultQ, defaultMoveBound :: Int defaultD = 15 -- | How many of the most similar terms to consider, to rule out false positives. defaultL = 2 defaultP = 2 defaultQ = 3 +defaultMoveBound = 2 -- | How many nodes to consider for our constant-time approximation to tree edit distance. defaultM :: Integer From a83faecdfd286b45d610dea7a8b898c84bb5fa9d Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 17:04:10 -0400 Subject: [PATCH 64/79] So, apparently 3 is the magic number today. --- src/Data/RandomWalkSimilarity.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index b501e5cb6..19e86afaa 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -84,7 +84,7 @@ defaultD = 15 defaultL = 2 defaultP = 2 defaultQ = 3 -defaultMoveBound = 2 +defaultMoveBound = 3 -- | How many nodes to consider for our constant-time approximation to tree edit distance. defaultM :: Integer From 3e94a6a9dda7bbd5a41875d1d112ab4cc753a5c1 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 17:28:35 -0400 Subject: [PATCH 65/79] Construct IntMaps of the lists of inputs. --- src/Data/RandomWalkSimilarity.hs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 19e86afaa..d1d8d5510 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -20,6 +20,7 @@ import Control.Monad.State import Data.Functor.Both hiding (fst, snd) import Data.Functor.Foldable as Foldable import Data.Hashable +import qualified Data.IntMap as IntMap import qualified Data.KdTree.Static as KdTree import qualified Data.List as List import Data.Record @@ -42,11 +43,11 @@ rws compare as bs | null as, null bs = [] | null as = inserting <$> bs | null bs = deleting <$> as - | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, fas, fbs)) $ traverse findNearestNeighbourTo fbs - where fas = zipWith featurize [0..] as - fbs = zipWith featurize [0..] bs - kdas = KdTree.build (Vector.toList . feature) fas - kdbs = KdTree.build (Vector.toList . feature) fbs + | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, (toList fas), (toList fbs))) $ traverse findNearestNeighbourTo (toList fbs) + where fas = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] as + fbs = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] bs + kdas = KdTree.build (Vector.toList . feature) (toList fas) + kdbs = KdTree.build (Vector.toList . feature) (toList fbs) featurize index term = UnmappedTerm index (getField (extract term)) term findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do (previous, unmappedA, unmappedB) <- get From bff81035d8ae3a39236d61e0ef8868e3ecdb5cdf Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 17:41:27 -0400 Subject: [PATCH 66/79] Traverse & delete remaining unmapped terms in IntMap. --- src/Data/RandomWalkSimilarity.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index d1d8d5510..335946fbb 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -43,7 +43,7 @@ rws compare as bs | null as, null bs = [] | null as = inserting <$> bs | null bs = deleting <$> as - | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, (toList fas), (toList fbs))) $ traverse findNearestNeighbourTo (toList fbs) + | otherwise = toList . uncurry deleteRemaining . first IntMap.fromList . (`runState` (negate 1, (toList fas), (toList fbs))) . fmap toList $ traverse findNearestNeighbourTo fbs where fas = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] as fbs = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] bs kdas = KdTree.build (Vector.toList . feature) (toList fas) @@ -72,7 +72,7 @@ rws compare as bs put (previous, unmappedA, List.delete kv unmappedB) pure (negate 1, inserting b) - deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmappedA) + deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (uncurry IntMap.insert)) diffs ((termIndex &&& deleting . term) <$> unmappedA) -- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost. editDistanceUpTo :: (Prologue.Foldable f, Functor f) => Integer -> Free (CofreeF f (Both a)) (Patch (Cofree f a)) -> Int From 124e7b51da8719cecc666ab5cd269804014d259a Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 17:49:33 -0400 Subject: [PATCH 67/79] Revert "Traverse & delete remaining unmapped terms in IntMap." This reverts commit 7f3f842657b8cd9b1e5c60ae81e5fcaa44f1b1e4. --- src/Data/RandomWalkSimilarity.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 335946fbb..d1d8d5510 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -43,7 +43,7 @@ rws compare as bs | null as, null bs = [] | null as = inserting <$> bs | null bs = deleting <$> as - | otherwise = toList . uncurry deleteRemaining . first IntMap.fromList . (`runState` (negate 1, (toList fas), (toList fbs))) . fmap toList $ traverse findNearestNeighbourTo fbs + | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, (toList fas), (toList fbs))) $ traverse findNearestNeighbourTo (toList fbs) where fas = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] as fbs = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] bs kdas = KdTree.build (Vector.toList . feature) (toList fas) @@ -72,7 +72,7 @@ rws compare as bs put (previous, unmappedA, List.delete kv unmappedB) pure (negate 1, inserting b) - deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (uncurry IntMap.insert)) diffs ((termIndex &&& deleting . term) <$> unmappedA) + deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmappedA) -- | Computes a constant-time approximation to the edit distance of a diff. This is done by comparing at most _m_ nodes, & assuming the rest are zero-cost. editDistanceUpTo :: (Prologue.Foldable f, Functor f) => Integer -> Free (CofreeF f (Both a)) (Patch (Cofree f a)) -> Int From 10ca6f60b679b9865510c233e4a663f62bb0a744 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 17:49:40 -0400 Subject: [PATCH 68/79] Revert "Construct IntMaps of the lists of inputs." This reverts commit 8d62477ee805d45603d6c84fdeea59b341153c00. --- src/Data/RandomWalkSimilarity.hs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index d1d8d5510..19e86afaa 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -20,7 +20,6 @@ import Control.Monad.State import Data.Functor.Both hiding (fst, snd) import Data.Functor.Foldable as Foldable import Data.Hashable -import qualified Data.IntMap as IntMap import qualified Data.KdTree.Static as KdTree import qualified Data.List as List import Data.Record @@ -43,11 +42,11 @@ rws compare as bs | null as, null bs = [] | null as = inserting <$> bs | null bs = deleting <$> as - | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, (toList fas), (toList fbs))) $ traverse findNearestNeighbourTo (toList fbs) - where fas = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] as - fbs = IntMap.fromList $ (\ t -> (termIndex t, t)) <$> zipWith featurize [0..] bs - kdas = KdTree.build (Vector.toList . feature) (toList fas) - kdbs = KdTree.build (Vector.toList . feature) (toList fbs) + | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, fas, fbs)) $ traverse findNearestNeighbourTo fbs + where fas = zipWith featurize [0..] as + fbs = zipWith featurize [0..] bs + kdas = KdTree.build (Vector.toList . feature) fas + kdbs = KdTree.build (Vector.toList . feature) fbs featurize index term = UnmappedTerm index (getField (extract term)) term findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do (previous, unmappedA, unmappedB) <- get From 1b2271e47957aed832a1382cc2f94b34a7b5a4ec Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 17:53:17 -0400 Subject: [PATCH 69/79] Maintain IntMaps of unmapped elements. --- src/Data/RandomWalkSimilarity.hs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 19e86afaa..33f433d64 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -20,6 +20,7 @@ import Control.Monad.State import Data.Functor.Both hiding (fst, snd) import Data.Functor.Foldable as Foldable import Data.Hashable +import qualified Data.IntMap as IntMap import qualified Data.KdTree.Static as KdTree import qualified Data.List as List import Data.Record @@ -42,22 +43,23 @@ rws compare as bs | null as, null bs = [] | null as = inserting <$> bs | null bs = deleting <$> as - | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, fas, fbs)) $ traverse findNearestNeighbourTo fbs + | otherwise = fmap snd . uncurry deleteRemaining . (`runState` (negate 1, toMap fas, toMap fbs)) $ traverse findNearestNeighbourTo fbs where fas = zipWith featurize [0..] as fbs = zipWith featurize [0..] bs kdas = KdTree.build (Vector.toList . feature) fas kdbs = KdTree.build (Vector.toList . feature) fbs featurize index term = UnmappedTerm index (getField (extract term)) term + toMap = IntMap.fromList . fmap (termIndex &&& identity) findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do (previous, unmappedA, unmappedB) <- get fromMaybe (insertion previous unmappedA unmappedB kv) $ do - foundA@(UnmappedTerm i _ a) <- nearestUnmapped unmappedA kdas kv - foundB@(UnmappedTerm j' _ _) <- nearestUnmapped unmappedB kdbs foundA + foundA@(UnmappedTerm i _ a) <- nearestUnmapped (toList unmappedA) kdas kv + foundB@(UnmappedTerm j' _ _) <- nearestUnmapped (toList unmappedB) kdbs foundA guard (j == j') guard (previous <= i && i <= previous + defaultMoveBound) compared <- compare a b pure $! do - put (i, List.delete foundA unmappedA, List.delete foundB unmappedB) + put (i, IntMap.delete i unmappedA, IntMap.delete j unmappedB) pure (i, compared) -- | Finds the most-similar unmapped term to the passed-in term, if any. @@ -67,8 +69,8 @@ rws compare as bs -- cf §4.2 of RWS-Diff nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo defaultM) . compare (term key) . term) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree defaultL key))) - insertion previous unmappedA unmappedB kv@(UnmappedTerm _ _ b) = do - put (previous, unmappedA, List.delete kv unmappedB) + insertion previous unmappedA unmappedB (UnmappedTerm j _ b) = do + put (previous, unmappedA, IntMap.delete j unmappedB) pure (negate 1, inserting b) deleteRemaining diffs (_, unmappedA, _) = foldl' (flip (List.insertBy (comparing fst))) diffs ((termIndex &&& deleting . term) <$> unmappedA) From 1627ab0e268d761869a285afe44f4f2f60657908 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Thu, 18 Aug 2016 17:53:29 -0400 Subject: [PATCH 70/79] :fire: a redundant binding. --- src/Data/RandomWalkSimilarity.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 33f433d64..5a40fa4ef 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -54,7 +54,7 @@ rws compare as bs (previous, unmappedA, unmappedB) <- get fromMaybe (insertion previous unmappedA unmappedB kv) $ do foundA@(UnmappedTerm i _ a) <- nearestUnmapped (toList unmappedA) kdas kv - foundB@(UnmappedTerm j' _ _) <- nearestUnmapped (toList unmappedB) kdbs foundA + UnmappedTerm j' _ _ <- nearestUnmapped (toList unmappedB) kdbs foundA guard (j == j') guard (previous <= i && i <= previous + defaultMoveBound) compared <- compare a b From 1385e9193fe1f94b45f50fa000574d06f63790be Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Fri, 19 Aug 2016 11:35:32 -0400 Subject: [PATCH 71/79] :fire: a redundant import. --- src/Data/RandomWalkSimilarity.hs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 5a40fa4ef..0c4f8fedf 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -27,7 +27,6 @@ import Data.Record import qualified Data.Vector as Vector import Patch import Prologue -import Term () import Test.QuickCheck hiding (Fixed) import Test.QuickCheck.Random import Data.List (intersectBy) From db6679a9125dea169852e76005ec1399e89fa8cd Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Fri, 19 Aug 2016 11:36:46 -0400 Subject: [PATCH 72/79] Use IntMap intersections to define nearestUnmapped. --- src/Data/RandomWalkSimilarity.hs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 0c4f8fedf..4bdcf2763 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -27,10 +27,9 @@ import Data.Record import qualified Data.Vector as Vector import Patch import Prologue +import Term (termSize) import Test.QuickCheck hiding (Fixed) import Test.QuickCheck.Random -import Data.List (intersectBy) -import Term (termSize) -- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf). rws :: (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double)) @@ -52,8 +51,8 @@ rws compare as bs findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do (previous, unmappedA, unmappedB) <- get fromMaybe (insertion previous unmappedA unmappedB kv) $ do - foundA@(UnmappedTerm i _ a) <- nearestUnmapped (toList unmappedA) kdas kv - UnmappedTerm j' _ _ <- nearestUnmapped (toList unmappedB) kdbs foundA + foundA@(UnmappedTerm i _ a) <- nearestUnmapped unmappedA kdas kv + UnmappedTerm j' _ _ <- nearestUnmapped unmappedB kdbs foundA guard (j == j') guard (previous <= i && i <= previous + defaultMoveBound) compared <- compare a b @@ -66,7 +65,7 @@ rws compare as bs -- RWS can produce false positives in the case of e.g. hash collisions. Therefore, we find the _l_ nearest candidates, filter out any which have already been mapped, and select the minimum of the remaining by (a constant-time approximation of) edit distance. -- -- cf §4.2 of RWS-Diff - nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo defaultM) . compare (term key) . term) (intersectBy ((==) `on` termIndex) unmapped (KdTree.kNearest tree defaultL key))) + nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo defaultM) . compare (term key) . term) (toList (IntMap.intersection unmapped (toMap (KdTree.kNearest tree defaultL key))))) insertion previous unmappedA unmappedB (UnmappedTerm j _ b) = do put (previous, unmappedA, IntMap.delete j unmappedB) From 2068bdd22a651ae54a53f7914cdc5bd905e51703 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Fri, 19 Aug 2016 11:53:52 -0400 Subject: [PATCH 73/79] Extract the guard clause into a predicate. --- src/Data/RandomWalkSimilarity.hs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 4bdcf2763..123b7abd3 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -54,12 +54,15 @@ rws compare as bs foundA@(UnmappedTerm i _ a) <- nearestUnmapped unmappedA kdas kv UnmappedTerm j' _ _ <- nearestUnmapped unmappedB kdbs foundA guard (j == j') - guard (previous <= i && i <= previous + defaultMoveBound) + guard (isInMoveBounds previous i) compared <- compare a b pure $! do put (i, IntMap.delete i unmappedA, IntMap.delete j unmappedB) pure (i, compared) + -- | Determines whether an index is in-bounds for a move given the most recently matched index. + isInMoveBounds previous i = previous <= i && i <= previous + defaultMoveBound + -- | Finds the most-similar unmapped term to the passed-in term, if any. -- -- RWS can produce false positives in the case of e.g. hash collisions. Therefore, we find the _l_ nearest candidates, filter out any which have already been mapped, and select the minimum of the remaining by (a constant-time approximation of) edit distance. From 14aef5d3cd9bef6cf1db7d196568619dd884590e Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Fri, 19 Aug 2016 12:03:32 -0400 Subject: [PATCH 74/79] Filter out-of-bounds candidates out of the unmapped elements of A. --- src/Data/RandomWalkSimilarity.hs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 123b7abd3..e36a75790 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -51,10 +51,9 @@ rws compare as bs findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do (previous, unmappedA, unmappedB) <- get fromMaybe (insertion previous unmappedA unmappedB kv) $ do - foundA@(UnmappedTerm i _ a) <- nearestUnmapped unmappedA kdas kv + foundA@(UnmappedTerm i _ a) <- nearestUnmapped (IntMap.filterWithKey (\ k _ -> isInMoveBounds previous k) unmappedA) kdas kv UnmappedTerm j' _ _ <- nearestUnmapped unmappedB kdbs foundA guard (j == j') - guard (isInMoveBounds previous i) compared <- compare a b pure $! do put (i, IntMap.delete i unmappedA, IntMap.delete j unmappedB) From 716b0b44d682ff772e5f9bdf6d0c0153b3854425 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Fri, 19 Aug 2016 12:51:59 -0400 Subject: [PATCH 75/79] Lower the default move bound to 2. 2 is a slightly less magical number than 3. --- src/Data/RandomWalkSimilarity.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index e36a75790..7308501d3 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -86,7 +86,7 @@ defaultD = 15 defaultL = 2 defaultP = 2 defaultQ = 3 -defaultMoveBound = 3 +defaultMoveBound = 2 -- | How many nodes to consider for our constant-time approximation to tree edit distance. defaultM :: Integer From b5c67da74b19a740c26ecaa425c342330924d8c4 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Fri, 19 Aug 2016 13:10:35 -0400 Subject: [PATCH 76/79] Add a type signature for nearestUnmapped. --- src/Data/RandomWalkSimilarity.hs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 4bdcf2763..2dbf778c1 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE DataKinds, GADTs, RankNTypes, TypeOperators #-} +{-# LANGUAGE DataKinds, GADTs, RankNTypes, ScopedTypeVariables, TypeOperators #-} module Data.RandomWalkSimilarity ( rws , pqGramDecorator @@ -32,7 +32,7 @@ import Test.QuickCheck hiding (Fixed) import Test.QuickCheck.Random -- | Given a function comparing two terms recursively, and a function to compute a Hashable label from an unpacked term, compute the diff of a pair of lists of terms using a random walk similarity metric, which completes in log-linear time. This implementation is based on the paper [_RWS-Diff—Flexible and Efficient Change Detection in Hierarchical Data_](https://github.com/github/semantic-diff/files/325837/RWS-Diff.Flexible.and.Efficient.Change.Detection.in.Hierarchical.Data.pdf). -rws :: (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double)) +rws :: forall f fields. (Eq (Record fields), Prologue.Foldable f, Functor f, Eq (f (Cofree f (Record fields))), HasField fields (Vector.Vector Double)) => (Cofree f (Record fields) -> Cofree f (Record fields) -> Maybe (Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields))))) -- ^ A function which compares a pair of terms recursively, returning 'Just' their diffed value if appropriate, or 'Nothing' if they should not be compared. -> [Cofree f (Record fields)] -- ^ The list of old terms. -> [Cofree f (Record fields)] -- ^ The list of new terms. @@ -65,6 +65,7 @@ rws compare as bs -- RWS can produce false positives in the case of e.g. hash collisions. Therefore, we find the _l_ nearest candidates, filter out any which have already been mapped, and select the minimum of the remaining by (a constant-time approximation of) edit distance. -- -- cf §4.2 of RWS-Diff + nearestUnmapped :: IntMap (UnmappedTerm (Cofree f (Record fields))) -> KdTree.KdTree Double (UnmappedTerm (Cofree f (Record fields))) -> UnmappedTerm (Cofree f (Record fields)) -> Maybe (UnmappedTerm (Cofree f (Record fields))) nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo defaultM) . compare (term key) . term) (toList (IntMap.intersection unmapped (toMap (KdTree.kNearest tree defaultL key))))) insertion previous unmappedA unmappedB (UnmappedTerm j _ b) = do From b0c8f93d700087095567ae308ee0715a11534e3e Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Fri, 19 Aug 2016 13:12:59 -0400 Subject: [PATCH 77/79] Add a type signature for findNearestNeighbourTo. --- src/Data/RandomWalkSimilarity.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 2dbf778c1..aa382df86 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -48,6 +48,7 @@ rws compare as bs kdbs = KdTree.build (Vector.toList . feature) fbs featurize index term = UnmappedTerm index (getField (extract term)) term toMap = IntMap.fromList . fmap (termIndex &&& identity) + findNearestNeighbourTo :: UnmappedTerm (Cofree f (Record fields)) -> State (Int, IntMap (UnmappedTerm (Cofree f (Record fields))), IntMap (UnmappedTerm (Cofree f (Record fields)))) (Int, Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do (previous, unmappedA, unmappedB) <- get fromMaybe (insertion previous unmappedA unmappedB kv) $ do From 5c9e05c78113b0c0ee1353d112df96aabcb310f4 Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Fri, 19 Aug 2016 13:16:00 -0400 Subject: [PATCH 78/79] :memo: the parameters/return of nearestUnmapped. --- src/Data/RandomWalkSimilarity.hs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index aa382df86..8c0e13369 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -66,7 +66,11 @@ rws compare as bs -- RWS can produce false positives in the case of e.g. hash collisions. Therefore, we find the _l_ nearest candidates, filter out any which have already been mapped, and select the minimum of the remaining by (a constant-time approximation of) edit distance. -- -- cf §4.2 of RWS-Diff - nearestUnmapped :: IntMap (UnmappedTerm (Cofree f (Record fields))) -> KdTree.KdTree Double (UnmappedTerm (Cofree f (Record fields))) -> UnmappedTerm (Cofree f (Record fields)) -> Maybe (UnmappedTerm (Cofree f (Record fields))) + nearestUnmapped + :: IntMap (UnmappedTerm (Cofree f (Record fields))) -- ^ A set of terms eligible for matching against. + -> KdTree.KdTree Double (UnmappedTerm (Cofree f (Record fields))) -- ^ The k-d tree to look up nearest neighbours within. + -> UnmappedTerm (Cofree f (Record fields)) -- ^ The term to find the nearest neighbour to. + -> Maybe (UnmappedTerm (Cofree f (Record fields))) -- ^ The most similar unmapped term, if any. nearestUnmapped unmapped tree key = getFirst $ foldMap (First . Just) (sortOn (maybe maxBound (editDistanceUpTo defaultM) . compare (term key) . term) (toList (IntMap.intersection unmapped (toMap (KdTree.kNearest tree defaultL key))))) insertion previous unmappedA unmappedB (UnmappedTerm j _ b) = do From 78f5281264d7149f8433f6943ff03d0c9dcb07cc Mon Sep 17 00:00:00 2001 From: Rob Rix Date: Fri, 19 Aug 2016 13:17:39 -0400 Subject: [PATCH 79/79] :memo: findNearestNeighbourTo. --- src/Data/RandomWalkSimilarity.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Data/RandomWalkSimilarity.hs b/src/Data/RandomWalkSimilarity.hs index 8c0e13369..063164e9b 100644 --- a/src/Data/RandomWalkSimilarity.hs +++ b/src/Data/RandomWalkSimilarity.hs @@ -48,6 +48,7 @@ rws compare as bs kdbs = KdTree.build (Vector.toList . feature) fbs featurize index term = UnmappedTerm index (getField (extract term)) term toMap = IntMap.fromList . fmap (termIndex &&& identity) + -- | Construct a diff for a term in B by matching it against the most similar eligible term in A (if any), marking both as ineligible for future matches. findNearestNeighbourTo :: UnmappedTerm (Cofree f (Record fields)) -> State (Int, IntMap (UnmappedTerm (Cofree f (Record fields))), IntMap (UnmappedTerm (Cofree f (Record fields)))) (Int, Free (CofreeF f (Both (Record fields))) (Patch (Cofree f (Record fields)))) findNearestNeighbourTo kv@(UnmappedTerm j _ b) = do (previous, unmappedA, unmappedB) <- get