mirror of
https://github.com/facebook/sapling.git
synced 2024-10-05 14:28:17 +03:00
Choose diff-sequences
for diff algorithm
Summary:
Before this change, there are 3 Myers diff algorithms used in the dependency tree:
- diff-match-patch (1.0.5)
- diff (4.0.1)
- diff-sequences (via jest -> jest-diff -> diff-sequences)
We'd like to simplify the dependency tree. The short answer is:
- Use `diff-sequences`, or `jest-diff` which uses `diff-sequences` internally.
For best performance, do:
- Strip common prefix and suffix.
- Make line comparison O(1), avoid `line1 === line2` which can be O(line
length).
- Consider skipping "cleanup" in `jest-diff` for long input.
----
Long answer of picking a diff library:
I wrote a benchmark script to get some idea about their performance:
const fs = require('fs')
const dmp = new (require('diff-match-patch').diff_match_patch)();
const diff = require('diff');
const ds = require('diff-sequences').default;
const jd = require('jest-diff');
dmp.Diff_Timeout = 120;
// Diff functions. Output format: Chunk[]
// Chunk is one of:
// [0, n]: n common lines (same on both side)
// [-1, n]: n left-side-only lines
// [1, n]: n right-side-only lines
function diff1(chars1, chars2) {
return dmp.diff_main(chars1, chars2).map(v => [v[0], v[1].length]);
}
function diff1a(chars1, chars2) {
return dmp.diff_main(chars1, chars2, false).map(v => [v[0], v[1].length]);
}
function diff2(chars1, chars2) {
return diff.diffChars(chars1, chars2).map(v => {
const d = v.added ? 1 : (v.removed ? -1 : 0);
return [d, v.count];
});
}
function diff3(chars1, chars2) {
function isCommon(ai, bi) {
return chars1[ai] == chars2[bi];
}
const r = [];
let lastA = 0, lastB = 0;
function foundSequence(n, na, nb) {
if (na > lastA) {
r.push([-1, na - lastA]);
lastA = na;
}
if (nb > lastB) {
r.push([1, nb - lastB]);
lastB = nb;
}
if (n > 0) {
r.push([0, n]);
lastA += n;
lastB += n;
}
}
ds(chars1.length, chars2.length, isCommon, foundSequence);
foundSequence(0, chars1.length, chars2.length);
return r;
}
function diff3a(chars1, chars2) {
return jd.diffStringsRaw(chars1, chars2, false).map((d) => [d[0], d[1].length]);
}
function diff3b(chars1, chars2) {
return jd.diffStringsRaw(chars1, chars2, true).map((d) => [d[0], d[1].length]);
}
function bench(a, b) {
const {chars1, chars2} = dmp.diff_linesToChars_(a, b);
function stringify(obj) {
if (obj.length > 20) {
return `${obj.length} items`;
} else {
return JSON.stringify(obj);
}
}
[
['diff-match-patch', diff1],
['diff-match-patch (checklines=false)', diff1a],
['diff-sequences', diff3],
['jest-diff (diff-sequences), no cleanup', diff3a],
['jest-diff (diff-sequences), with cleanup', diff3b],
['jsdiff', diff2],
].forEach(([name, diffFunc]) => {
// node --expose_gc
if (global.gc) {
gc();
}
const label = ` ${name}`;
console.time(label);
console.log(' ', stringify(diffFunc(chars1, chars2)));
console.timeEnd(label);
});
}
let a, b;
console.log('\nwith common prefix and suffix 1');
a = 'aaaaaaa\n'.repeat(50000) + 'bbbb\n' + 'dddd\n'.repeat(50000);
b = 'aaaaaaa\n'.repeat(50000) + 'cccc\n' + 'dddd\n'.repeat(50000);
bench(a, b);
console.log('\nwith common prefix and suffix 2');
a = 'aaaaaaa\n'.repeat(50000) + 'bbbbbbb\n' + 'dddd\n'.repeat(50000);
b = 'aaaaaaa\n'.repeat(50100) + 'cccc\n' + 'dddd\n'.repeat(49900);
bench(a, b);
console.log('\nwithout common prefix or suffix 1');
a = 'c\n' + 'aaaaaaa\n'.repeat(50000) + 'dddd\n'.repeat(50000);
b = 'aaaaaaa\n'.repeat(50000) + 'dddd\n'.repeat(50100) + 'z\n';
bench(a, b);
console.log('\nwithout common prefix or suffix 2');
a = 'cccc\n' + 'aaaaaaa\n'.repeat(50000) + 'bbbbbbb\n' + 'dddd\n'.repeat(50000) + 'z\n';
b = 'aaaaaaa\n'.repeat(50100) + 'cccc\n' + 'dddd\n'.repeat(49900) + 'z\ny\n';
bench(a, b);
// Hearthstone cards.json in different languages.
// This is somewhat challenging since many lines are changed.
// wget https://api.hearthstonejson.com/v1/168129/enUS/cards.json -O 1
// wget https://api.hearthstonejson.com/v1/168129/zhCN/cards.json -O 2
// python3 -m json.tool < 1 > 1.json
// python3 -m json.tool < 2 > 2.json
console.log('\ncards.json with different languages');
a = fs.readFileSync('1.json', {encoding: 'utf-8'});
b = fs.readFileSync('2.json', {encoding: 'utf-8'});
bench(a, b);
The output looks like:
with common prefix and suffix 1
[[0,50000],[-1,1],[1,1],[0,50000]]
diff-match-patch: 5.073ms
[[0,50000],[-1,1],[1,1],[0,50000]]
diff-match-patch (checklines=false): 0.481ms
[[0,50000],[-1,1],[1,1],[0,50000]]
diff-sequences: 7.589ms
[[0,50000],[-1,1],[1,1],[0,50000]]
jest-diff (diff-sequences), no cleanup: 10.915ms
[[0,50000],[-1,1],[1,1],[0,50000]]
jest-diff (diff-sequences), with cleanup: 10.588ms
[[0,50000],[-1,1],[1,1],[0,50000]]
jsdiff: 22.664ms
with common prefix and suffix 2
[[0,50000],[-1,101],[1,101],[0,49900]]
diff-match-patch: 10.688ms
[[0,50000],[-1,101],[1,101],[0,49900]]
diff-match-patch (checklines=false): 2.619ms
[[0,50000],[-1,101],[1,101],[0,49900]]
diff-sequences: 12.687ms
[[0,50000],[-1,101],[1,101],[0,49900]]
jest-diff (diff-sequences), no cleanup: 11.055ms
[[0,50000],[-1,101],[1,101],[0,49900]]
jest-diff (diff-sequences), with cleanup: 4.356ms
[[0,50000],[-1,1],[1,101],[0,49900],[-1,100]]
jsdiff: 59.359ms
without common prefix or suffix 1
[[-1,1],[0,100000],[1,101]]
diff-match-patch: 632.863ms
[[-1,1],[0,100000],[1,101]]
diff-match-patch (checklines=false): 607.796ms
[[-1,1],[0,50000],[1,51],[0,50000],[1,50]]
diff-sequences: 12.366ms
[[-1,1],[0,50000],[1,51],[0,50000],[1,50]]
jest-diff (diff-sequences), no cleanup: 11.096ms
[[-1,1],[0,100000],[1,51],[1,50]]
jest-diff (diff-sequences), with cleanup: 1.029s
[[-1,1],[0,100000],[1,101]]
jsdiff: 13.163ms
without common prefix or suffix 2
[[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]]
diff-match-patch: 2.773s
[[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]]
diff-match-patch (checklines=false): 1.402s
[[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]]
diff-sequences: 22.216ms
[[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]]
jest-diff (diff-sequences), no cleanup: 20.546ms
[[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]]
jest-diff (diff-sequences), with cleanup: 19.222ms
[[-1,1],[0,50000],[-1,1],[1,101],[0,49900],[-1,100],[0,1],[1,1]]
jsdiff: 33.82ms
cards.json with different languages
67781 items
diff-match-patch: 1:04.122 (m:ss.mmm)
57514 items
diff-match-patch (checklines=false): 2:00.283 (m:ss.mmm)
67781 items
diff-sequences: 1:09.486 (m:ss.mmm)
67781 items
jest-diff (diff-sequences), no cleanup: 1:06.452 (m:ss.mmm)
52937 items
jest-diff (diff-sequences), with cleanup: 1:09.118 (m:ss.mmm)
...
(jsdiff cannot complete this test case in 20+ minutes)
Observations:
- In the last test case, `jsdiff` does not implement O(D^2) -> O(D) space
optimization so it is practically unusable (reported as https://github.com/kpdecker/jsdiff/issues/396).
`diff-match-patch` and `jest-diff` both implement the linear space
optimization, and have similar performance.
- `diff-match-patch` strips common prefix and suffix, which makes it faster
than `jest-diff` in "common prefix and suffix" test cases.
- Both `diff-match-patch` and `jest-diff` can take a long time on "cleanup".
See the "without common prefix or suffix 1" test case. We probably want
to only enable cleanup for smaller input.
- `diff-match-patch` performs visibly worse on the "without common prefix
or suffix 2" test case. From the code it looks like `diff-match-patch` uses
some kind of heuristics that tries to speed up things but ends up slowing it
down.
- Without cleanup, `jest-diff` might output `[1,51],[1,50]` that can be
"obviously" merged to `[1,101]`. We might use a lightweight cleanup logic
for that.
- Reading the code, `diff-match-patch` turns lines into char codes. It cannot
handle 65536 unique lines.
(62f2e689f4/javascript/diff_match_patch_uncompressed.js (L503)
)
Conclusions:
- `jest-diff` (and `diff-sequences` under the hood) is overall the best choice.
It has expected time and space complexities, and provides flexibility to skip
the potentially slow "cleanup", and can support >65k unique lines.
- `jest-diff` misses the "skip common prefix / suffix" optimization that
`diff-match-patch` has, and seems practically important (editing a line in
the editor - all lines are common prefixes and suffixes except for the line
being edited). The optimization is not hard to implement. This diff
implements it.
- For certain use-cases (ex. linelog) where the diff content is not needed
(at least for the left / "a" side), it should use `diff-sequences` to avoid
overhead preparing the diff content.
- `jest-diff`'s `diffLines` outputs one line per `Diff` but we want
one chunk per `Diff`.
- `jest-diff`'s `diffStringsRaw` produces one `Diff` per chunk, and because
[`string.slice` is O(1) in V8](https://stackoverflow.com/a/72545403), it has
acceptable performance. But mapping lines to chars would introduce the
65535 unique line limit undesirably.
Reviewed By: evangrayk
Differential Revision: D43857949
fbshipit-source-id: 9a3d85ebf10c9b82da8ab5cba4e14e519bbf264d
This commit is contained in:
parent
41391d8927
commit
bce27b2980
@ -6,14 +6,12 @@
|
||||
"@testing-library/jest-dom": "^5.14.1",
|
||||
"@testing-library/react": "^13.0.0",
|
||||
"@testing-library/user-event": "^13.2.1",
|
||||
"@types/diff": "^5.0.2",
|
||||
"@types/jest": "^27.0.1",
|
||||
"@types/node": "^16.7.13",
|
||||
"@types/react": "^18.0.0",
|
||||
"@types/react-dom": "^18.0.0",
|
||||
"@vscode/webview-ui-toolkit": "^1.0.0",
|
||||
"diff": "^5.0.0",
|
||||
"diff-match-patch": "^1.0.5",
|
||||
"diff-sequences": "^29.4.3",
|
||||
"isl-server": "0.0.0",
|
||||
"react": "^18.1.0",
|
||||
"react-dom": "^18.1.0",
|
||||
@ -23,7 +21,6 @@
|
||||
"typescript": "^4.4.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/diff-match-patch": "^1.0.32",
|
||||
"rewire": "^6.0.0",
|
||||
"ts-loader": "^9.3.1"
|
||||
},
|
||||
|
@ -29,13 +29,8 @@ SOFTWARE.
|
||||
|
||||
*/
|
||||
|
||||
import {diff_match_patch} from 'diff-match-patch';
|
||||
|
||||
const dmp = new diff_match_patch();
|
||||
|
||||
// The timeout does not seem to affect dmp performance.
|
||||
// But bumping it produces better diff results.
|
||||
dmp.Diff_Timeout = 10000;
|
||||
// Read D43857949 about the choice of the diff library.
|
||||
import diffSequences from 'diff-sequences';
|
||||
|
||||
/** Operation code. */
|
||||
enum Op {
|
||||
@ -400,13 +395,13 @@ class LineLog {
|
||||
const [aRev, bRev] = rev ? [rev, rev] : [this.maxRev, this.maxRev + 1];
|
||||
const b = text;
|
||||
|
||||
const lines = splitLines(b);
|
||||
const bLines = splitLines(b);
|
||||
this.checkOut(aRev);
|
||||
const a = this.content;
|
||||
const blocks = diffLines(a, b);
|
||||
const aLines = splitLines(this.content);
|
||||
const blocks = diffLines(aLines, bLines);
|
||||
|
||||
blocks.reverse().forEach(([a1, a2, b1, b2]) => {
|
||||
this.editChunk(a1, a2, bRev, lines.slice(b1, b2));
|
||||
this.editChunk(a1, a2, bRev, bLines.slice(b1, b2));
|
||||
});
|
||||
this.content = b;
|
||||
this.lastCheckoutKey = `${bRev},null`;
|
||||
@ -429,41 +424,59 @@ class LineLog {
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate the differences.
|
||||
* Calculate the line differences. For performance, this function only
|
||||
* returns the line indexes for different chunks. The line contents
|
||||
* are not returned.
|
||||
*
|
||||
* @param a Content of the "a" side.
|
||||
* @param b Content of the "b" side.
|
||||
* @param aLines lines on the "a" side.
|
||||
* @param bLines lines on the "b" side.
|
||||
* @returns A list of `(a1, a2, b1, b2)` tuples for the line ranges that
|
||||
* are different between "a" and "b".
|
||||
*/
|
||||
function diffLines(a: string, b: string): [LineIdx, LineIdx, LineIdx, LineIdx][] {
|
||||
const {chars1, chars2} = dmp.diff_linesToChars_(a, b);
|
||||
function diffLines(aLines: string[], bLines: string[]): [LineIdx, LineIdx, LineIdx, LineIdx][] {
|
||||
// Avoid O(string length) comparison.
|
||||
const [aList, bList] = stringsToInts([aLines, bLines]);
|
||||
|
||||
// Skip common prefix and suffix.
|
||||
let aLen = aList.length;
|
||||
let bLen = bList.length;
|
||||
const minLen = Math.min(aLen, bLen);
|
||||
let commonPrefixLen = 0;
|
||||
while (commonPrefixLen < minLen && aList[commonPrefixLen] === bList[commonPrefixLen]) {
|
||||
commonPrefixLen += 1;
|
||||
}
|
||||
while (aLen > commonPrefixLen && bLen > commonPrefixLen && aList[aLen - 1] === bList[bLen - 1]) {
|
||||
aLen -= 1;
|
||||
bLen -= 1;
|
||||
}
|
||||
aLen -= commonPrefixLen;
|
||||
bLen -= commonPrefixLen;
|
||||
|
||||
// Run the diff algorithm.
|
||||
const blocks: [LineIdx, LineIdx, LineIdx, LineIdx][] = [];
|
||||
let a1 = 0,
|
||||
a2 = 0,
|
||||
b1 = 0,
|
||||
b2 = 0;
|
||||
const push = (len: number) => {
|
||||
let a1 = 0;
|
||||
let b1 = 0;
|
||||
|
||||
function isCommon(aIndex: number, bIndex: number) {
|
||||
return aList[aIndex + commonPrefixLen] === bList[bIndex + commonPrefixLen];
|
||||
}
|
||||
|
||||
function foundSequence(n: LineIdx, a2: LineIdx, b2: LineIdx) {
|
||||
if (a1 !== a2 || b1 !== b2) {
|
||||
blocks.push([a1, a2, b1, b2]);
|
||||
blocks.push([
|
||||
a1 + commonPrefixLen,
|
||||
a2 + commonPrefixLen,
|
||||
b1 + commonPrefixLen,
|
||||
b2 + commonPrefixLen,
|
||||
]);
|
||||
}
|
||||
a1 = a2 = a2 + len;
|
||||
b1 = b2 = b2 + len;
|
||||
};
|
||||
dmp.diff_main(chars1, chars2, false).forEach(x => {
|
||||
const [op, chars] = x;
|
||||
const len = chars.length;
|
||||
if (op === 0) {
|
||||
push(len);
|
||||
}
|
||||
if (op < 0) {
|
||||
a2 += len;
|
||||
}
|
||||
if (op > 0) {
|
||||
b2 += len;
|
||||
}
|
||||
});
|
||||
push(0);
|
||||
a1 = a2 + n;
|
||||
b1 = b2 + n;
|
||||
}
|
||||
|
||||
diffSequences(aLen, bLen, isCommon, foundSequence);
|
||||
foundSequence(0, aLen, bLen);
|
||||
|
||||
return blocks;
|
||||
}
|
||||
|
||||
@ -485,6 +498,28 @@ function splitLines(s: string): string[] {
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Make strings with the same content use the same integer
|
||||
* for fast comparasion.
|
||||
*/
|
||||
function stringsToInts(linesArray: string[][]): number[][] {
|
||||
// This is similar to diff-match-patch's diff_linesToChars_ but is not
|
||||
// limited to 65536 unique lines.
|
||||
const lineMap = new Map<string, number>();
|
||||
return linesArray.map(lines =>
|
||||
lines.map(line => {
|
||||
const existingId = lineMap.get(line);
|
||||
if (existingId != null) {
|
||||
return existingId;
|
||||
} else {
|
||||
const id = lineMap.size;
|
||||
lineMap.set(line, id);
|
||||
return id;
|
||||
}
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
/** If the assertion fails, throw an `Error` with the given `message`. */
|
||||
function assert(condition: boolean, message: string) {
|
||||
if (!condition) {
|
||||
|
@ -2517,11 +2517,6 @@
|
||||
dependencies:
|
||||
"@types/node" "*"
|
||||
|
||||
"@types/diff-match-patch@^1.0.32":
|
||||
version "1.0.32"
|
||||
resolved "https://registry.yarnpkg.com/@types/diff-match-patch/-/diff-match-patch-1.0.32.tgz#d9c3b8c914aa8229485351db4865328337a3d09f"
|
||||
integrity sha512-bPYT5ECFiblzsVzyURaNhljBH2Gh1t9LowgUwciMrNAhFewLkHT2H0Mto07Y4/3KCOGZHRQll3CTtQZ0X11D/A==
|
||||
|
||||
"@types/diff@^5.0.2":
|
||||
version "5.0.2"
|
||||
resolved "https://registry.yarnpkg.com/@types/diff/-/diff-5.0.2.tgz#dd565e0086ccf8bc6522c6ebafd8a3125c91c12b"
|
||||
@ -4813,11 +4808,6 @@ didyoumean@^1.2.2:
|
||||
resolved "https://registry.yarnpkg.com/didyoumean/-/didyoumean-1.2.2.tgz#989346ffe9e839b4555ecf5666edea0d3e8ad037"
|
||||
integrity sha512-gxtyfqMg7GKyhQmb056K7M3xszy/myH8w+B4RT+QXBQsvAOdc3XymqDDPHx1BgPgsdAA5SIifona89YtRATDzw==
|
||||
|
||||
diff-match-patch@^1.0.5:
|
||||
version "1.0.5"
|
||||
resolved "https://registry.yarnpkg.com/diff-match-patch/-/diff-match-patch-1.0.5.tgz#abb584d5f10cd1196dfc55aa03701592ae3f7b37"
|
||||
integrity sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw==
|
||||
|
||||
diff-sequences@^27.5.1:
|
||||
version "27.5.1"
|
||||
resolved "https://registry.yarnpkg.com/diff-sequences/-/diff-sequences-27.5.1.tgz#eaecc0d327fd68c8d9672a1e64ab8dccb2ef5327"
|
||||
@ -4828,6 +4818,11 @@ diff-sequences@^28.1.1:
|
||||
resolved "https://registry.yarnpkg.com/diff-sequences/-/diff-sequences-28.1.1.tgz#9989dc731266dc2903457a70e996f3a041913ac6"
|
||||
integrity sha512-FU0iFaH/E23a+a718l8Qa/19bF9p06kgE0KipMOMadwa3SjnaElKzPaUC0vnibs6/B/9ni97s61mcejk8W1fQw==
|
||||
|
||||
diff-sequences@^29.4.3:
|
||||
version "29.4.3"
|
||||
resolved "https://registry.yarnpkg.com/diff-sequences/-/diff-sequences-29.4.3.tgz#9314bc1fabe09267ffeca9cbafc457d8499a13f2"
|
||||
integrity sha512-ofrBgwpPhCD85kMKtE9RYFFq6OC1A89oW2vvgWZNCwxrUpRUILopY7lsYyMDSjc8g6U6aiO0Qubg6r4Wgt5ZnA==
|
||||
|
||||
diff@^4.0.1:
|
||||
version "4.0.2"
|
||||
resolved "https://registry.yarnpkg.com/diff/-/diff-4.0.2.tgz#60f3aecb89d5fae520c11aa19efc2bb982aade7d"
|
||||
|
Loading…
Reference in New Issue
Block a user