/*
bdiff.c - efficient binary diff extension for Mercurial
Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
This software may be used and distributed according to the terms of
the GNU General Public License, incorporated herein by reference.
Based roughly on Python difflib
*/
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include "compat.h"
#include "bitmanipulation.h"
#include "bdiff.h"
/* Hash implementation from diffutils */
#define ROL(v, n) ((v) << (n) | (v) >> (sizeof(v) * CHAR_BIT - (n)))
#define HASH(h, c) ((c) + ROL(h, 7))
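
/* ROL rotates the unsigned value v left by n bits; HASH folds byte c
   into the rolling hash h, applied once per character of a line */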
struct pos {
	int pos, len;
};
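
/* split buffer a of length len into lines, filling *lr with a newly
   allocated array of bdiff_line entries (terminated by a sentinel);
   each entry records the line text, its length and a rolling hash of
   its bytes; returns the number of lines or -1 on allocation failure */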
int bdiff_splitlines(const char *a, ssize_t len, struct bdiff_line **lr)
{
	unsigned hash;
	int i;
	const char *p, *b = a;
	const char * const plast = a + len - 1;
	struct bdiff_line *l;

	/* count the lines */
	i = 1; /* extra line for sentinel */
	for (p = a; p < plast; p++)
		if (*p == '\n')
			i++;

	if (p == plast)
		i++;

	*lr = l = (struct bdiff_line *)malloc(sizeof(struct bdiff_line) * i);
	if (!l)
		return -1;

	/* build the line array and calculate hashes */
	hash = 0;
	for (p = a; p < plast; p++) {
		hash = HASH(hash, *p);

		if (*p == '\n') {
			l->hash = hash;
			hash = 0;
			l->len = p - b + 1;
			l->l = b;
			l->n = INT_MAX;
			l++;
			b = p + 1;
		}
	}

	if (p == plast) {
		hash = HASH(hash, *p);
		l->hash = hash;
		l->len = p - b + 1;
		l->l = b;
		l->n = INT_MAX;
		l++;
	}

	/* set up a sentinel */
	l->hash = 0;
	l->len = 0;
	l->l = a + len;

	return i - 1;
}
static inline int cmp(struct bdiff_line *a, struct bdiff_line *b)
{
	return a->hash != b->hash || a->len != b->len ||
	       memcmp(a->l, b->l, a->len);
}
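
/* assign each line of b to an equivalence class via an open-addressed
   hash table, chain identical lines together, then point each line of
   a at the head of its matching chain; lines whose class is more
   popular than the threshold t are marked with n = -1 so that
   longest_match can skip them; returns 0 on allocation failure */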
static int equatelines(struct bdiff_line *a, int an, struct bdiff_line *b,
		       int bn)
{
	int i, j, buckets = 1, t, scale;
	struct pos *h = NULL;

	/* build a hash table of the next highest power of 2 */
	while (buckets < bn + 1)
		buckets *= 2;

	/* try to allocate a large hash table to avoid collisions */
	for (scale = 4; scale; scale /= 2) {
		h = (struct pos *)malloc(scale * buckets * sizeof(struct pos));
		if (h)
			break;
	}

	if (!h)
		return 0;

	buckets = buckets * scale - 1;

	/* clear the hash table */
	for (i = 0; i <= buckets; i++) {
		h[i].pos = -1;
		h[i].len = 0;
	}

	/* add lines to the hash table chains */
	for (i = 0; i < bn; i++) {
		/* find the equivalence class */
		for (j = b[i].hash & buckets; h[j].pos != -1;
		     j = (j + 1) & buckets)
			if (!cmp(b + i, b + h[j].pos))
				break;

		/* add to the head of the equivalence class */
		b[i].n = h[j].pos;
		b[i].e = j;
		h[j].pos = i;
		h[j].len++; /* keep track of popularity */
	}

	/* compute popularity threshold */
	t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1);

	/* match items in a to their equivalence class in b */
	for (i = 0; i < an; i++) {
		/* find the equivalence class */
		for (j = a[i].hash & buckets; h[j].pos != -1;
		     j = (j + 1) & buckets)
			if (!cmp(a + i, b + h[j].pos))
				break;

		a[i].e = j; /* use equivalence class for quick compare */
		if (h[j].len <= t)
			a[i].n = h[j].pos; /* point to head of match list */
		else
			a[i].n = -1; /* too popular */
	}

	/* discard hash tables */
	free(h);

	return 1;
}
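
/* find the longest run of lines from a[a1:a2] that also appears in
   b[b1:b2], walking the equivalence chains built by equatelines and
   reusing earlier results recorded in pos; ties are broken toward the
   middle of the range to keep the recursion in recurse() balanced */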
static int longest_match(struct bdiff_line *a, struct bdiff_line *b,
			 struct pos *pos,
			 int a1, int a2, int b1, int b2, int *omi, int *omj)
{
	int mi = a1, mj = b1, mk = 0, i, j, k, half, bhalf;

	/* window our search on large regions to better bound
	   worst-case performance. by choosing a window at the end, we
	   reduce skipping overhead on the b chains. */
	if (a2 - a1 > 30000)
		a1 = a2 - 30000;

	half = (a1 + a2 - 1) / 2;
	bhalf = (b1 + b2 - 1) / 2;

	for (i = a1; i < a2; i++) {
		/* skip all lines in b after the current block */
		for (j = a[i].n; j >= b2; j = b[j].n)
			;

		/* loop through all lines in b that match a[i] */
		for (; j >= b1; j = b[j].n) {
			/* does this extend an earlier match? */
			for (k = 1; j - k >= b1 && i - k >= a1; k++) {
				/* reached an earlier match? */
				if (pos[j - k].pos == i - k) {
					k += pos[j - k].len;
					break;
				}
				/* previous line mismatch? */
				if (a[i - k].e != b[j - k].e)
					break;
			}

			pos[j].pos = i;
			pos[j].len = k;

			/* best match so far? we prefer matches closer
			   to the middle to balance recursion */
			if (k > mk) {
				/* a longer match */
				mi = i;
				mj = j;
				mk = k;
			} else if (k == mk) {
				if (i > mi && i <= half && j > b1) {
					/* same match but closer to half */
					mi = i;
					mj = j;
				} else if (i == mi && (mj > bhalf || i == a1)) {
					/* same i but best earlier j */
					mj = j;
				}
			}
		}
	}

	if (mk) {
		mi = mi - mk + 1;
		mj = mj - mk + 1;
	}

	/* expand match to include subsequent popular lines */
	while (mi + mk < a2 && mj + mk < b2 &&
	       a[mi + mk].e == b[mj + mk].e)
		mk++;

	*omi = mi;
	*omj = mj;

	return mk;
}
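
/* recursively split the ranges around each longest match and append
   one hunk per match, in order, to the hunk list l; returns the new
   list tail, or NULL on allocation failure */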
static struct bdiff_hunk *recurse(struct bdiff_line *a, struct bdiff_line *b,
				  struct pos *pos,
				  int a1, int a2, int b1, int b2,
				  struct bdiff_hunk *l)
{
	int i, j, k;

	while (1) {
		/* find the longest match in this chunk */
		k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j);
		if (!k)
			return l;

		/* and recurse on the remaining chunks on either side */
		l = recurse(a, b, pos, a1, i, b1, j, l);
		if (!l)
			return NULL;

		l->next = (struct bdiff_hunk *)malloc(sizeof(struct bdiff_hunk));
		if (!l->next)
			return NULL;

		l = l->next;
		l->a1 = i;
		l->a2 = i + k;
		l->b1 = j;
		l->b2 = j + k;
		l->next = NULL;

		/* tail-recursion didn't happen, so do equivalent iteration */
		a1 = i + k;
		b1 = j + k;
	}
}
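
/* compute the ordered list of matching hunks between a (an lines) and
   b (bn lines), appended after base and terminated by a sentinel hunk;
   returns the hunk count or -1 on allocation failure */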
int bdiff_diff(struct bdiff_line *a, int an, struct bdiff_line *b,
	       int bn, struct bdiff_hunk *base)
{
	struct bdiff_hunk *curr;
	struct pos *pos;
	int t, count = 0;

	/* allocate and fill arrays */
	t = equatelines(a, an, b, bn);
	pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos));

	if (pos && t) {
		/* generate the matching block list */
		curr = recurse(a, b, pos, 0, an, 0, bn, base);
		if (!curr)
			return -1;

		/* sentinel end hunk */
		curr->next = (struct bdiff_hunk *)malloc(sizeof(struct bdiff_hunk));
		if (!curr->next)
			return -1;

		curr = curr->next;
		curr->a1 = curr->a2 = an;
		curr->b1 = curr->b2 = bn;
		curr->next = NULL;
	}

	free(pos);

	/* normalize the hunk list, try to push each hunk towards the end */
	for (curr = base->next; curr; curr = curr->next) {
		struct bdiff_hunk *next = curr->next;

		if (!next)
			break;

		if (curr->a2 == next->a1 || curr->b2 == next->b1)
			while (curr->a2 < an && curr->b2 < bn
			       && next->a1 < next->a2
			       && next->b1 < next->b2
			       && !cmp(a + curr->a2, b + curr->b2)) {
				curr->a2++;
				next->a1++;
				curr->b2++;
				next->b1++;
			}
	}

	for (curr = base->next; curr; curr = curr->next)
		count++;

	return count;
}
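
/* free every hunk in a list produced by bdiff_diff */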
void bdiff_freehunks(struct bdiff_hunk *l)
{
	struct bdiff_hunk *n;
	for (; l; l = n) {
		n = l->next;
		free(l);
	}
}