2019-12-10 02:24:31 +03:00
|
|
|
#chg-compatible
|
|
|
|
|
2010-09-26 22:41:32 +04:00
|
|
|
Setup
|
|
|
|
|
2020-05-19 16:11:56 +03:00
|
|
|
$ setconfig ui.color=always ui.paginate=never
|
2020-01-20 13:42:49 +03:00
|
|
|
$ setconfig color.mode=ansi
|
2010-09-26 22:41:32 +04:00
|
|
|
$ hg init repo
|
|
|
|
$ cd repo
|
|
|
|
$ cat > a <<EOF
|
|
|
|
> c
|
|
|
|
> c
|
|
|
|
> a
|
|
|
|
> a
|
|
|
|
> b
|
|
|
|
> a
|
|
|
|
> a
|
|
|
|
> c
|
|
|
|
> c
|
|
|
|
> EOF
|
|
|
|
$ hg ci -Am adda
|
|
|
|
adding a
|
|
|
|
$ cat > a <<EOF
|
|
|
|
> c
|
|
|
|
> c
|
|
|
|
> a
|
|
|
|
> a
|
|
|
|
> dd
|
|
|
|
> a
|
|
|
|
> a
|
|
|
|
> c
|
|
|
|
> c
|
|
|
|
> EOF
|
|
|
|
|
|
|
|
default context
|
|
|
|
|
2017-02-25 21:44:23 +03:00
|
|
|
$ hg diff --nodates
|
2010-11-08 03:41:41 +03:00
|
|
|
\x1b[0;1mdiff -r cf9f4ba66af2 a\x1b[0m (esc)
|
|
|
|
\x1b[0;31;1m--- a/a\x1b[0m (esc)
|
|
|
|
\x1b[0;32;1m+++ b/a\x1b[0m (esc)
|
|
|
|
\x1b[0;35m@@ -2,7 +2,7 @@\x1b[0m (esc)
|
2010-09-26 22:41:32 +04:00
|
|
|
c
|
|
|
|
a
|
|
|
|
a
|
2018-03-24 12:15:55 +03:00
|
|
|
\x1b[0;91m-b\x1b[0m (esc)
|
|
|
|
\x1b[0;92m+dd\x1b[0m (esc)
|
2010-09-26 22:41:32 +04:00
|
|
|
a
|
|
|
|
a
|
|
|
|
c
|
|
|
|
|
2017-05-02 21:01:54 +03:00
|
|
|
(check that 'ui.color=yes' matches '--color=auto')
|
|
|
|
|
2020-05-19 16:11:56 +03:00
|
|
|
$ hg diff --nodates --config ui.color=yes
|
2017-05-02 21:01:54 +03:00
|
|
|
diff -r cf9f4ba66af2 a
|
|
|
|
--- a/a
|
|
|
|
+++ b/a
|
|
|
|
@@ -2,7 +2,7 @@
|
|
|
|
c
|
|
|
|
a
|
|
|
|
a
|
|
|
|
-b
|
|
|
|
+dd
|
|
|
|
a
|
|
|
|
a
|
|
|
|
c
|
|
|
|
|
|
|
|
(check that 'ui.color=no' disables color)
|
|
|
|
|
2020-05-19 16:11:56 +03:00
|
|
|
$ hg diff --nodates --config ui.color=no
|
2017-05-02 21:01:54 +03:00
|
|
|
diff -r cf9f4ba66af2 a
|
|
|
|
--- a/a
|
|
|
|
+++ b/a
|
|
|
|
@@ -2,7 +2,7 @@
|
|
|
|
c
|
|
|
|
a
|
|
|
|
a
|
|
|
|
-b
|
|
|
|
+dd
|
|
|
|
a
|
|
|
|
a
|
|
|
|
c
|
|
|
|
|
2017-05-02 21:19:09 +03:00
|
|
|
(check that 'ui.color=always' forces color)
|
|
|
|
|
2020-05-19 16:11:56 +03:00
|
|
|
$ hg diff --nodates --config ui.color=always
|
2017-05-02 21:19:09 +03:00
|
|
|
\x1b[0;1mdiff -r cf9f4ba66af2 a\x1b[0m (esc)
|
|
|
|
\x1b[0;31;1m--- a/a\x1b[0m (esc)
|
|
|
|
\x1b[0;32;1m+++ b/a\x1b[0m (esc)
|
|
|
|
\x1b[0;35m@@ -2,7 +2,7 @@\x1b[0m (esc)
|
|
|
|
c
|
|
|
|
a
|
|
|
|
a
|
2018-03-24 12:15:55 +03:00
|
|
|
\x1b[0;91m-b\x1b[0m (esc)
|
|
|
|
\x1b[0;92m+dd\x1b[0m (esc)
|
2017-05-02 21:19:09 +03:00
|
|
|
a
|
|
|
|
a
|
|
|
|
c
|
|
|
|
|
2010-09-26 22:41:32 +04:00
|
|
|
--unified=2
|
|
|
|
|
2017-02-25 21:44:23 +03:00
|
|
|
$ hg diff --nodates -U 2
|
2010-11-08 03:41:41 +03:00
|
|
|
\x1b[0;1mdiff -r cf9f4ba66af2 a\x1b[0m (esc)
|
|
|
|
\x1b[0;31;1m--- a/a\x1b[0m (esc)
|
|
|
|
\x1b[0;32;1m+++ b/a\x1b[0m (esc)
|
|
|
|
\x1b[0;35m@@ -3,5 +3,5 @@\x1b[0m (esc)
|
2010-09-26 22:41:32 +04:00
|
|
|
a
|
|
|
|
a
|
2018-03-24 12:15:55 +03:00
|
|
|
\x1b[0;91m-b\x1b[0m (esc)
|
|
|
|
\x1b[0;92m+dd\x1b[0m (esc)
|
2010-09-26 22:41:32 +04:00
|
|
|
a
|
|
|
|
a
|
|
|
|
|
|
|
|
diffstat
|
|
|
|
|
2017-02-25 21:44:23 +03:00
|
|
|
$ hg diff --stat
|
2010-11-08 03:41:41 +03:00
|
|
|
a | 2 \x1b[0;32m+\x1b[0m\x1b[0;31m-\x1b[0m (esc)
|
2010-09-26 22:41:32 +04:00
|
|
|
1 files changed, 1 insertions(+), 1 deletions(-)
|
2014-11-04 17:41:46 +03:00
|
|
|
$ cat <<EOF >> $HGRCPATH
|
|
|
|
> [ui]
|
|
|
|
> interactive = true
|
|
|
|
> [diff]
|
|
|
|
> git = True
|
|
|
|
> EOF
|
2010-09-26 22:41:32 +04:00
|
|
|
|
2012-06-10 16:14:05 +04:00
|
|
|
#if execbit
|
|
|
|
|
2010-09-26 22:41:32 +04:00
|
|
|
record
|
|
|
|
|
2011-11-07 06:14:54 +04:00
|
|
|
$ chmod +x a
|
2017-02-25 21:44:23 +03:00
|
|
|
$ hg record -m moda a <<EOF
|
2010-09-26 22:41:32 +04:00
|
|
|
> y
|
|
|
|
> y
|
|
|
|
> EOF
|
2010-11-08 03:41:41 +03:00
|
|
|
\x1b[0;1mdiff --git a/a b/a\x1b[0m (esc)
|
|
|
|
\x1b[0;36;1mold mode 100644\x1b[0m (esc)
|
|
|
|
\x1b[0;36;1mnew mode 100755\x1b[0m (esc)
|
2010-09-26 22:41:32 +04:00
|
|
|
1 hunks, 1 lines changed
|
2014-10-01 03:04:18 +04:00
|
|
|
\x1b[0;33mexamine changes to 'a'? [Ynesfdaq?]\x1b[0m y (esc)
|
|
|
|
|
2015-12-17 17:38:22 +03:00
|
|
|
\x1b[0;35m@@ -2,7 +2,7 @@ c\x1b[0m (esc)
|
2010-09-26 22:41:32 +04:00
|
|
|
c
|
|
|
|
a
|
|
|
|
a
|
2018-03-24 12:15:55 +03:00
|
|
|
\x1b[0;91m-b\x1b[0m (esc)
|
|
|
|
\x1b[0;92m+dd\x1b[0m (esc)
|
2010-09-26 22:41:32 +04:00
|
|
|
a
|
|
|
|
a
|
|
|
|
c
|
2014-10-01 03:04:18 +04:00
|
|
|
\x1b[0;33mrecord this change to 'a'? [Ynesfdaq?]\x1b[0m y (esc)
|
|
|
|
|
2011-10-13 06:27:49 +04:00
|
|
|
|
2010-09-26 22:41:32 +04:00
|
|
|
$ echo "[extensions]" >> $HGRCPATH
|
|
|
|
$ echo "mq=" >> $HGRCPATH
|
|
|
|
$ hg rollback
|
2011-02-10 11:03:06 +03:00
|
|
|
repository tip rolled back to revision 0 (undo commit)
|
|
|
|
working directory now based on revision 0
|
2010-09-26 22:41:32 +04:00
|
|
|
|
2012-06-10 16:14:05 +04:00
|
|
|
#endif
|
2012-06-11 03:40:51 +04:00
|
|
|
|
2017-10-25 18:13:38 +03:00
|
|
|
test inline color diff
|
|
|
|
|
|
|
|
$ hg init inline
|
|
|
|
$ cd inline
|
|
|
|
$ cat > file1 << EOF
|
|
|
|
> this is the first line
|
|
|
|
> this is the second line
|
|
|
|
> third line starts with space
|
|
|
|
> + starts with a plus sign
|
2017-12-08 11:20:11 +03:00
|
|
|
> this one with one tab
|
|
|
|
> now with full two tabs
|
|
|
|
> now tabs everywhere, much fun
|
2017-10-25 18:13:38 +03:00
|
|
|
>
|
|
|
|
> this line won't change
|
|
|
|
>
|
|
|
|
> two lines are going to
|
|
|
|
> be changed into three!
|
|
|
|
>
|
|
|
|
> three of those lines will
|
|
|
|
> collapse onto one
|
|
|
|
> (to see if it works)
|
|
|
|
> EOF
|
|
|
|
$ hg add file1
|
|
|
|
$ hg ci -m 'commit'
|
2017-12-08 11:20:11 +03:00
|
|
|
|
2017-10-25 18:13:38 +03:00
|
|
|
$ cat > file1 << EOF
|
|
|
|
> that is the first paragraph
|
|
|
|
> this is the second line
|
|
|
|
> third line starts with space
|
|
|
|
> - starts with a minus sign
|
2017-12-08 11:20:11 +03:00
|
|
|
> this one with two tab
|
|
|
|
> now with full three tabs
|
|
|
|
> now there are tabs everywhere, much fun
|
2017-10-25 18:13:38 +03:00
|
|
|
>
|
|
|
|
> this line won't change
|
|
|
|
>
|
|
|
|
> two lines are going to
|
|
|
|
> (entirely magically,
|
|
|
|
> assuming this works)
|
|
|
|
> be changed into four!
|
|
|
|
>
|
|
|
|
> three of those lines have
|
|
|
|
> collapsed onto one
|
|
|
|
> EOF
|
|
|
|
$ hg diff --config experimental.worddiff=False --color=debug
|
|
|
|
[diff.diffline|diff --git a/file1 b/file1]
|
|
|
|
[diff.file_a|--- a/file1]
|
|
|
|
[diff.file_b|+++ b/file1]
|
2017-12-08 11:20:11 +03:00
|
|
|
[diff.hunk|@@ -1,16 +1,17 @@]
|
2017-10-25 18:13:38 +03:00
|
|
|
[diff.deleted|-this is the first line]
|
|
|
|
[diff.deleted|-this is the second line]
|
|
|
|
[diff.deleted|- third line starts with space]
|
|
|
|
[diff.deleted|-+ starts with a plus sign]
|
2017-12-08 11:20:11 +03:00
|
|
|
[diff.deleted|-][diff.tab| ][diff.deleted|this one with one tab]
|
|
|
|
[diff.deleted|-][diff.tab| ][diff.deleted|now with full two tabs]
|
|
|
|
[diff.deleted|-][diff.tab| ][diff.deleted|now tabs][diff.tab| ][diff.deleted|everywhere, much fun]
|
2017-10-25 18:13:38 +03:00
|
|
|
[diff.inserted|+that is the first paragraph]
|
|
|
|
[diff.inserted|+ this is the second line]
|
|
|
|
[diff.inserted|+third line starts with space]
|
|
|
|
[diff.inserted|+- starts with a minus sign]
|
2017-12-08 11:20:11 +03:00
|
|
|
[diff.inserted|+][diff.tab| ][diff.inserted|this one with two tab]
|
|
|
|
[diff.inserted|+][diff.tab| ][diff.inserted|now with full three tabs]
|
|
|
|
[diff.inserted|+][diff.tab| ][diff.inserted|now there are tabs][diff.tab| ][diff.inserted|everywhere, much fun]
|
2017-10-25 18:13:38 +03:00
|
|
|
|
|
|
|
this line won't change
|
|
|
|
|
|
|
|
two lines are going to
|
|
|
|
[diff.deleted|-be changed into three!]
|
|
|
|
[diff.inserted|+(entirely magically,]
|
|
|
|
[diff.inserted|+ assuming this works)]
|
|
|
|
[diff.inserted|+be changed into four!]
|
|
|
|
|
|
|
|
[diff.deleted|-three of those lines will]
|
|
|
|
[diff.deleted|-collapse onto one]
|
|
|
|
[diff.deleted|-(to see if it works)]
|
|
|
|
[diff.inserted|+three of those lines have]
|
|
|
|
[diff.inserted|+collapsed onto one]
|
|
|
|
$ hg diff --config experimental.worddiff=True --color=debug
|
|
|
|
[diff.diffline|diff --git a/file1 b/file1]
|
|
|
|
[diff.file_a|--- a/file1]
|
|
|
|
[diff.file_b|+++ b/file1]
|
2017-12-08 11:20:11 +03:00
|
|
|
[diff.hunk|@@ -1,16 +1,17 @@]
|
patch: rewrite the worddiff algorithm
Summary:
There were recent complaints about both quality [1] [2] and performance [3]
of the current word diff algorithm.
The current algorithm is actually bad in various ways:
- Lines could be matched across hunks, which is confusing (report [1]).
- For short lines, they can fail "similarity" check, which means they
won't be highlighted when they are expected to be (report [2]).
- Various performance issues:
- Using difflib implemented in pure Python, which is both slow and
suboptimal compared with xdiff.
- Searching for matched lines across hunks could be O(N^2) if
no match is found.
Thinking about it in a "highlight" way is actually tricky. Consider the following
change:
```
# before
foo = 10
# after
if True:
foo = 21 + 3
```
It's obvious that "10" and "21 + 3" need highlighting because they are
different. But what about "if True:"? In theory it's also "different" and
needs highlighting. How about purely inserted or deleted hunks then?
Highlighting all of them would be too noisy.
This diff rewrites the word diff algorithm. It differs in multiple ways:
1. Get rid of "matching lines by similarity" step.
2. Only diff words within the same hunk.
3. Dim unchanged words instead of highlighting changed words.
4. Treat pure insertion or deletion hunks differently - do not dim or
highlight words in them.
5. Use xdiff instead.
6. Use a better regexp to split words. This reduces the number of tokens sent
to the diff algorithm.
1, 2, 5, 6 help performance. 1, 2, 3, 4 make the result more predictable and
trustworthy. 3 avoids the nasty question about what to highlight. 3 and 4 make
it more flexible for people to tweak colors. 6 makes the result better since it
merges multiple space tokens into one, so xdiff is less likely to miss important
matches (in favor of meaningless matches like spaces).
"bold" and "underline" were removed so the changed words will have regular
red/green colors. The output won't be too "noisy" even in cases where code is
changed in a way that makes inline word matching meaningless. For people who want
more contrast, they can set:
[color]
diff.inserted.changed = green bold
diff.deleted.changed = red bold
Practically, when diffing D7319718, the old code spends 4 seconds on finding
matched lines preparing for worddiff:
```
| diffordiffstat cmdutil.py:1522
\ difflabel (17467 times) patch.py:2471
....
> 3927 \ _findmatches (22 times) patch.py:2537
348 \ __init__ (8158 times) difflib.py:154
340 | set_seqs (8158 times) difflib.py:223
328 | set_seq2 (8158 times) difflib.py:261
322 | __chain_b (8158 times) difflib.py:306
1818 \ ratio (8158 times) difflib.py:636
1777 | get_matching_blocks (8158 times) difflib.py:460
1605 \ find_longest_match (51966 times) difflib.py:350
38 | __new__ (51966 times) <string>:8
29 \ _make (36035 times) <string>:12
143 \ write (17466 times) ui.py:883
```
The new code takes 0.14 seconds:
```
| diffordiffstat cmdutil.py:1522
\ difflabel (23401 times) patch.py:2562
....
> 140 \ consumehunkbuffer (23346 times) patch.py:2585
130 | diffsinglehunkinline (23240 times) patch.py:2496
215 \ write (23400 times) ui.py:883
118 \ flush cmdutil.py:1606
118 | write ui.py:883
```
[1]: https://fburl.com/lkb9rc9m
[2]: https://fburl.com/0r9bqf0e
[3]: https://fburl.com/pxqznw31
Reviewed By: ryanmce
Differential Revision: D7314726
fbshipit-source-id: becd979cb9ac3fd3f4adae11cb10804d535f58df
2018-03-19 14:28:32 +03:00
|
|
|
[diff.deleted|-][diff.deleted.changed|this][diff.deleted.unchanged| is the first ][diff.deleted.changed|line]
|
|
|
|
[diff.deleted|-][diff.deleted.unchanged|this is the second line]
|
|
|
|
[diff.deleted|-][diff.deleted.changed| ][diff.deleted.unchanged|third line starts with space]
|
|
|
|
[diff.deleted|-][diff.deleted.changed|+][diff.deleted.unchanged| starts with a ][diff.deleted.changed|plus][diff.deleted.unchanged| sign]
|
|
|
|
[diff.deleted|-][diff.tab| ][diff.deleted.unchanged|this one with ][diff.deleted.changed|one][diff.deleted.unchanged| tab]
|
|
|
|
[diff.deleted|-][diff.tab| ][diff.deleted.unchanged|now with full ][diff.deleted.changed|two][diff.deleted.unchanged| tabs]
|
|
|
|
[diff.deleted|-][diff.tab| ][diff.deleted.unchanged|now ][diff.deleted.unchanged|tabs][diff.tab| ][diff.deleted.unchanged|everywhere, much fun]
|
|
|
|
[diff.inserted|+][diff.inserted.changed|that][diff.inserted.unchanged| is the first ][diff.inserted.changed|paragraph]
|
|
|
|
[diff.inserted|+][diff.inserted.changed| ][diff.inserted.unchanged|this is the second line]
|
|
|
|
[diff.inserted|+][diff.inserted.unchanged|third line starts with space]
|
|
|
|
[diff.inserted|+][diff.inserted.changed|-][diff.inserted.unchanged| starts with a ][diff.inserted.changed|minus][diff.inserted.unchanged| sign]
|
|
|
|
[diff.inserted|+][diff.tab| ][diff.inserted.unchanged|this one with ][diff.inserted.changed|two][diff.inserted.unchanged| tab]
|
|
|
|
[diff.inserted|+][diff.tab| ][diff.inserted.unchanged|now with full ][diff.inserted.changed|three][diff.inserted.unchanged| tabs]
|
|
|
|
[diff.inserted|+][diff.tab| ][diff.inserted.unchanged|now ][diff.inserted.changed|there are ][diff.inserted.unchanged|tabs][diff.tab| ][diff.inserted.unchanged|everywhere, much fun]
|
2017-10-25 18:13:38 +03:00
|
|
|
|
|
|
|
this line won't change
|
|
|
|
|
|
|
|
two lines are going to
|
patch: rewrite the worddiff algorithm
Summary:
There were recent complaints about both quality [1] [2] and performance [3]
of the current word diff algorithm.
The current algorithm is actually bad in various ways:
- Lines could be matched across hunks, which is confusing (report [1]).
- For short lines, they can fail "similarity" check, which means they
won't be highlighted when they are expected to be (report [2]).
- Various performance issues:
- Using difflib implemented in pure Python, which is both slow and
suboptimal compared with xdiff.
- Searching for matched lines across hunks could be O(N^2) if
no match is found.
Thinking about it in a "highlight" way is actually tricky. Consider the following
change:
```
# before
foo = 10
# after
if True:
foo = 21 + 3
```
It's obvious that "10" and "21 + 3" need highlighting because they are
different. But what about "if True:"? In theory it's also "different" and
needs highlighting. How about purely inserted or deleted hunks then?
Highlighting all of them would be too noisy.
This diff rewrites the word diff algorithm. It differs in multiple ways:
1. Get rid of "matching lines by similarity" step.
2. Only diff words within the same hunk.
3. Dim unchanged words instead of highlighting changed words.
4. Treat pure insertion or deletion hunks differently - do not dim or
highlight words in them.
5. Use xdiff instead.
6. Use a better regexp to split words. This reduces the number of tokens sent
to the diff algorithm.
1, 2, 5, 6 help performance. 1, 2, 3, 4 make the result more predictable and
trustworthy. 3 avoids the nasty question about what to highlight. 3 and 4 make
it more flexible for people to tweak colors. 6 makes the result better since it
merges multiple space tokens into one, so xdiff is less likely to miss important
matches (in favor of meaningless matches like spaces).
"bold" and "underline" were removed so the changed words will have regular
red/green colors. The output won't be too "noisy" even in cases where code is
changed in a way that makes inline word matching meaningless. For people who want
more contrast, they can set:
[color]
diff.inserted.changed = green bold
diff.deleted.changed = red bold
Practically, when diffing D7319718, the old code spends 4 seconds on finding
matched lines preparing for worddiff:
```
| diffordiffstat cmdutil.py:1522
\ difflabel (17467 times) patch.py:2471
....
> 3927 \ _findmatches (22 times) patch.py:2537
348 \ __init__ (8158 times) difflib.py:154
340 | set_seqs (8158 times) difflib.py:223
328 | set_seq2 (8158 times) difflib.py:261
322 | __chain_b (8158 times) difflib.py:306
1818 \ ratio (8158 times) difflib.py:636
1777 | get_matching_blocks (8158 times) difflib.py:460
1605 \ find_longest_match (51966 times) difflib.py:350
38 | __new__ (51966 times) <string>:8
29 \ _make (36035 times) <string>:12
143 \ write (17466 times) ui.py:883
```
The new code takes 0.14 seconds:
```
| diffordiffstat cmdutil.py:1522
\ difflabel (23401 times) patch.py:2562
....
> 140 \ consumehunkbuffer (23346 times) patch.py:2585
130 | diffsinglehunkinline (23240 times) patch.py:2496
215 \ write (23400 times) ui.py:883
118 \ flush cmdutil.py:1606
118 | write ui.py:883
```
[1]: https://fburl.com/lkb9rc9m
[2]: https://fburl.com/0r9bqf0e
[3]: https://fburl.com/pxqznw31
Reviewed By: ryanmce
Differential Revision: D7314726
fbshipit-source-id: becd979cb9ac3fd3f4adae11cb10804d535f58df
2018-03-19 14:28:32 +03:00
|
|
|
[diff.deleted|-][diff.deleted.unchanged|be changed into ][diff.deleted.changed|three][diff.deleted.unchanged|!]
|
|
|
|
[diff.inserted|+][diff.inserted.changed|(entirely magically,]
|
|
|
|
[diff.inserted|+][diff.inserted.changed| assuming this works)]
|
|
|
|
[diff.inserted|+][diff.inserted.unchanged|be changed into ][diff.inserted.changed|four][diff.inserted.unchanged|!]
|
2017-10-25 18:13:38 +03:00
|
|
|
|
patch: rewrite the worddiff algorithm
Summary:
There were recent complaints about both quality [1] [2] and performance [3]
of the current word diff algorithm.
The current algorithm is actually bad in various ways:
- Lines could be matched across hunks, which is confusing (report [1]).
- For short lines, they can fail "similarity" check, which means they
won't be highlighted when they are expected to be (report [2]).
- Various performance issues:
- Using difflib implemented in pure Python, which is both slow and
suboptimal compared with xdiff.
- Searching for matched lines across hunks could be O(N^2) if
no match is found.
Thinking about it in a "highlight" way is actually tricky. Consider the following
change:
```
# before
foo = 10
# after
if True:
foo = 21 + 3
```
It's obvious that "10" and "21 + 3" need highlighting because they are
different. But what about "if True:"? In theory it's also "different" and
needs highlighting. How about purely inserted or deleted hunks then?
Highlighting all of them would be too noisy.
This diff rewrites the word diff algorithm. It differs in multiple ways:
1. Get rid of "matching lines by similarity" step.
2. Only diff words within the same hunk.
3. Dim unchanged words instead of highlighting changed words.
4. Treat pure insertion or deletion hunks differently - do not dim or
highlight words in them.
5. Use xdiff instead.
6. Use a better regexp to split words. This reduces the number of tokens sent
to the diff algorithm.
1, 2, 5, 6 help performance. 1, 2, 3, 4 make the result more predictable and
trustworthy. 3 avoids the nasty question about what to highlight. 3 and 4 make
it more flexible for people to tweak colors. 6 makes the result better since it
merges multiple space tokens into one, so xdiff is less likely to miss important
matches (in favor of meaningless matches like spaces).
"bold" and "underline" were removed so the changed words will have regular
red/green colors. The output won't be too "noisy" even in cases where code is
changed in a way that makes inline word matching meaningless. For people who want
more contrast, they can set:
[color]
diff.inserted.changed = green bold
diff.deleted.changed = red bold
Practically, when diffing D7319718, the old code spends 4 seconds on finding
matched lines preparing for worddiff:
```
| diffordiffstat cmdutil.py:1522
\ difflabel (17467 times) patch.py:2471
....
> 3927 \ _findmatches (22 times) patch.py:2537
348 \ __init__ (8158 times) difflib.py:154
340 | set_seqs (8158 times) difflib.py:223
328 | set_seq2 (8158 times) difflib.py:261
322 | __chain_b (8158 times) difflib.py:306
1818 \ ratio (8158 times) difflib.py:636
1777 | get_matching_blocks (8158 times) difflib.py:460
1605 \ find_longest_match (51966 times) difflib.py:350
38 | __new__ (51966 times) <string>:8
29 \ _make (36035 times) <string>:12
143 \ write (17466 times) ui.py:883
```
The new code takes 0.14 seconds:
```
| diffordiffstat cmdutil.py:1522
\ difflabel (23401 times) patch.py:2562
....
> 140 \ consumehunkbuffer (23346 times) patch.py:2585
130 | diffsinglehunkinline (23240 times) patch.py:2496
215 \ write (23400 times) ui.py:883
118 \ flush cmdutil.py:1606
118 | write ui.py:883
```
[1]: https://fburl.com/lkb9rc9m
[2]: https://fburl.com/0r9bqf0e
[3]: https://fburl.com/pxqznw31
Reviewed By: ryanmce
Differential Revision: D7314726
fbshipit-source-id: becd979cb9ac3fd3f4adae11cb10804d535f58df
2018-03-19 14:28:32 +03:00
|
|
|
[diff.deleted|-][diff.deleted.unchanged|three of those lines ][diff.deleted.changed|will]
|
|
|
|
[diff.deleted|-][diff.deleted.changed|collapse][diff.deleted.unchanged| onto one]
|
|
|
|
[diff.deleted|-][diff.deleted.changed|(to see if it works)]
|
|
|
|
[diff.inserted|+][diff.inserted.unchanged|three of those lines ][diff.inserted.changed|have]
|
|
|
|
[diff.inserted|+][diff.inserted.changed|collapsed][diff.inserted.unchanged| onto one]
|
2017-12-11 16:38:31 +03:00
|
|
|
|
|
|
|
multibyte character shouldn't be broken up in word diff:
|
|
|
|
|
|
|
|
$ $PYTHON <<'EOF'
|
|
|
|
> with open("utf8", "wb") as f:
|
|
|
|
> f.write(b"blah \xe3\x82\xa2 blah\n")
|
|
|
|
> EOF
|
|
|
|
$ hg ci -Am 'add utf8 char' utf8
|
|
|
|
$ $PYTHON <<'EOF'
|
|
|
|
> with open("utf8", "wb") as f:
|
|
|
|
> f.write(b"blah \xe3\x82\xa4 blah\n")
|
|
|
|
> EOF
|
|
|
|
$ hg ci -m 'slightly change utf8 char' utf8
|
2018-03-19 14:28:30 +03:00
|
|
|
|
2017-12-11 16:38:31 +03:00
|
|
|
$ hg diff --config experimental.worddiff=True --color=debug -c.
|
|
|
|
[diff.diffline|diff --git a/utf8 b/utf8]
|
|
|
|
[diff.file_a|--- a/utf8]
|
|
|
|
[diff.file_b|+++ b/utf8]
|
|
|
|
[diff.hunk|@@ -1,1 +1,1 @@]
|
patch: rewrite the worddiff algorithm
Summary:
There were recent complaints about both quality [1] [2] and performance [3]
of the current word diff algorithm.
The current algorithm is actually bad in various ways:
- Lines could be matched across hunks, which is confusing (report [1]).
- For short lines, they can fail "similarity" check, which means they
won't be highlighted when they are expected to be (report [2]).
- Various performance issues:
- Using difflib implemented in pure Python, which is both slow and
suboptimal compared with xdiff.
- Searching for matched lines across hunks could be O(N^2) if
no match is found.
Thinking about it in a "highlight" way is actually tricky. Consider the following
change:
```
# before
foo = 10
# after
if True:
foo = 21 + 3
```
It's obvious that "10" and "21 + 3" need highlighting because they are
different. But what about "if True:"? In theory it's also "different" and
needs highlighting. How about purely inserted or deleted hunks then?
Highlighting all of them would be too noisy.
This diff rewrites the word diff algorithm. It differs in multiple ways:
1. Get rid of "matching lines by similarity" step.
2. Only diff words within the same hunk.
3. Dim unchanged words instead of highlighting changed words.
4. Treat pure insertion or deletion hunks differently - do not dim or
highlight words in them.
5. Use xdiff instead.
6. Use a better regexp to split words. This reduces the number of tokens sent
to the diff algorithm.
1, 2, 5, 6 help performance. 1, 2, 3, 4 make the result more predictable and
trustworthy. 3 avoids the nasty question about what to highlight. 3 and 4 make
it more flexible for people to tweak colors. 6 makes the result better since it
merges multiple space tokens into one, so xdiff is less likely to miss important
matches (in favor of meaningless matches like spaces).
"bold" and "underline" were removed so the changed words will have regular
red/green colors. The output won't be too "noisy" even in cases where code is
changed in a way that makes inline word matching meaningless. For people who want
more contrast, they can set:
[color]
diff.inserted.changed = green bold
diff.deleted.changed = red bold
Practically, when diffing D7319718, the old code spends 4 seconds on finding
matched lines preparing for worddiff:
```
| diffordiffstat cmdutil.py:1522
\ difflabel (17467 times) patch.py:2471
....
> 3927 \ _findmatches (22 times) patch.py:2537
348 \ __init__ (8158 times) difflib.py:154
340 | set_seqs (8158 times) difflib.py:223
328 | set_seq2 (8158 times) difflib.py:261
322 | __chain_b (8158 times) difflib.py:306
1818 \ ratio (8158 times) difflib.py:636
1777 | get_matching_blocks (8158 times) difflib.py:460
1605 \ find_longest_match (51966 times) difflib.py:350
38 | __new__ (51966 times) <string>:8
29 \ _make (36035 times) <string>:12
143 \ write (17466 times) ui.py:883
```
The new code takes 0.14 seconds:
```
| diffordiffstat cmdutil.py:1522
\ difflabel (23401 times) patch.py:2562
....
> 140 \ consumehunkbuffer (23346 times) patch.py:2585
130 | diffsinglehunkinline (23240 times) patch.py:2496
215 \ write (23400 times) ui.py:883
118 \ flush cmdutil.py:1606
118 | write ui.py:883
```
[1]: https://fburl.com/lkb9rc9m
[2]: https://fburl.com/0r9bqf0e
[3]: https://fburl.com/pxqznw31
Reviewed By: ryanmce
Differential Revision: D7314726
fbshipit-source-id: becd979cb9ac3fd3f4adae11cb10804d535f58df
2018-03-19 14:28:32 +03:00
|
|
|
[diff.deleted|-][diff.deleted.unchanged|blah ][diff.deleted.changed|\xe3\x82\xa2][diff.deleted.unchanged| blah] (esc)
|
|
|
|
[diff.inserted|+][diff.inserted.unchanged|blah ][diff.inserted.changed|\xe3\x82\xa4][diff.inserted.unchanged| blah] (esc)
|
2018-03-19 14:28:34 +03:00
|
|
|
|
|
|
|
word diff is disabled if HGPLAIN=1
|
|
|
|
|
|
|
|
$ HGPLAIN=1 hg diff --config experimental.worddiff=True --color=debug -c.
|
|
|
|
diff --git a/utf8 b/utf8
|
|
|
|
--- a/utf8
|
|
|
|
+++ b/utf8
|
|
|
|
@@ -1,1 +1,1 @@
|
|
|
|
-blah \xe3\x82\xa2 blah (esc)
|
|
|
|
+blah \xe3\x82\xa4 blah (esc)
|