mirror of
https://github.com/facebook/sapling.git
synced 2024-10-10 16:57:49 +03:00
c23eb09a4f
Before this patch, the similarity detection logic (used by addremove and automv) depended entirely on SHA-1 digests. This causes incorrect rename detection if:

- file A is removed and file B is added in the same commit, and
- the SHA-1 hash values of files A and B are the same.

For example, this may prevent security experts from managing sample files for the SHAttered issue in a Mercurial repository.

https://security.googleblog.com/2017/02/announcing-first-sha1-collision.html
https://shattered.it/

A hash collision itself isn't so serious for the core repository functionality of Mercurial, though, as described by mpm here:

https://www.mercurial-scm.org/wiki/mpm/SHA1

This patch compares the actual file contents after the hash comparison to confirm exact identity.

Even after this patch, SHA-1 is still used, because it is good enough to quickly detect the existence of an "(almost) same" file:

- replacing SHA-1 would decrease performance, and
- it is not yet clear what should replace it.

Getting the content of the removed file (= rfctx.data()) at each exact comparison should be cheap enough, even though getting the content of the added one is costly.

======= ============== =====================
file    fctx           data() reads from
======= ============== =====================
removed filectx        in-memory revlog data
added   workingfilectx storage
======= ============== =====================
120 lines
3.9 KiB
Python
120 lines
3.9 KiB
Python
# similar.py - mechanisms for finding similar files
|
|
#
|
|
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
|
|
#
|
|
# This software may be used and distributed according to the terms of the
|
|
# GNU General Public License version 2 or any later version.
|
|
|
|
from __future__ import absolute_import
|
|
|
|
import hashlib
|
|
|
|
from .i18n import _
|
|
from . import (
|
|
bdiff,
|
|
mdiff,
|
|
)
|
|
|
|
def _findexactmatches(repo, added, removed):
    '''find renamed files that have no changes

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after) tuples of exact matches.

    SHA-1 is only used as a cheap pre-filter; a pair is reported only when
    the actual file contents are equal.  Every removed file is remembered
    under its digest (not just the last one seen), so that two removed
    files whose contents differ but whose SHA-1 digests collide cannot hide
    each other from the comparison.

    Each added file is yielded at most once.
    '''
    topic = _('searching for exact renames')
    unit = _('files')
    numfiles = len(added) + len(removed)

    # Map each SHA-1 digest to *all* removed files having that digest.
    hashes = {}
    for i, fctx in enumerate(removed):
        repo.ui.progress(topic, i, total=numfiles, unit=unit)
        h = hashlib.sha1(fctx.data()).digest()
        hashes.setdefault(h, []).append(fctx)

    # For each added file, see if it corresponds to a removed file.
    for i, fctx in enumerate(added):
        repo.ui.progress(topic, i + len(removed), total=numfiles, unit=unit)
        adata = fctx.data()
        h = hashlib.sha1(adata).digest()
        # Try the most recently registered candidate first: when several
        # removed files share the same content, this selects the same
        # source that a "last one wins" table would have selected.
        for rfctx in reversed(hashes.get(h, ())):
            # compare between actual file contents for exact identity
            if adata == rfctx.data():
                yield (rfctx, fctx)
                # one source per added file is enough
                break

    # Done
    repo.ui.progress(topic, None)
|
|
|
|
def _ctxdata(fctx):
    '''return a (text, lines) pair for the given filectx

    "text" is whatever fctx.data() returns and "lines" is the result of
    passing that text through mdiff.splitnewlines().  The pair is the
    "otherdata" argument expected by _score(); callers build it on demand
    so that the file is only read when a comparison is really needed.
    '''
    text = fctx.data()
    lines = mdiff.splitnewlines(text)
    return text, lines
|
|
|
|
def _score(fctx, otherdata):
    '''return the similarity ratio between fctx and pre-loaded file data

    "otherdata" is an (orig, lines) pair as built by _ctxdata().  The
    result is twice the number of bytes found in matching lines, divided
    by the combined length of both texts.
    '''
    orig, lines = otherdata
    text = fctx.data()

    # bdiff.blocks() returns blocks of matching lines; only the range on
    # the "orig" side (b1:b2) is needed to add up the matching bytes.
    matched = 0
    for a1, a2, b1, b2 in bdiff.blocks(text, orig):
        matched += sum(map(len, lines[b1:b2]))

    total = len(text) + len(orig)
    return matched * 2.0 / total
|
|
|
|
def score(fctx1, fctx2):
    '''return the similarity ratio between the contents of two filectxs

    The data of fctx2 is loaded via _ctxdata() and fctx1 is then scored
    against it with _score().
    '''
    reference = _ctxdata(fctx2)
    return _score(fctx1, reference)
|
|
|
|
def _findsimilarmatches(repo, added, removed, threshold):
    '''find potentially renamed files based on similar file content

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after, score) tuples of partial matches.

    For every added file, the removed file with the highest score that is
    at least "threshold" is kept; on equal scores the removed file seen
    later wins.  Added files without any candidate reaching the threshold
    are not reported.
    '''
    # Use a single topic for both reporting and closing the progress:
    # closing a different topic would leave this one unfinished.
    topic = _('searching for similar files')

    # {added filectx: (best removed filectx, best score)}
    copies = {}
    for i, r in enumerate(removed):
        repo.ui.progress(topic, i, total=len(removed), unit=_('files'))

        # Load the removed file lazily, and only once per removed file.
        data = None
        for a in added:
            bestscore = copies.get(a, (None, threshold))[1]
            if data is None:
                data = _ctxdata(r)
            myscore = _score(a, data)
            if myscore >= bestscore:
                copies[a] = (r, myscore)
    repo.ui.progress(topic, None)

    # items() is available on both Python 2 and Python 3 dicts
    for dest, v in copies.items():
        source, bscore = v
        yield source, dest, bscore
|
|
|
|
def findrenames(repo, added, removed, threshold):
    '''find renamed files -- yields (before, after, score) tuples

    "added" and "removed" are file names, looked up in the working
    context and in the parent context ('.') respectively.  "before" and
    "after" are paths.  Exact matches are always reported, with score 1.0;
    similarity matching runs only when "threshold" is below 1.0, and only
    on added files that had no exact match.
    '''
    parentctx = repo['.']
    workingctx = repo[None]

    # Empty files are frequently unrelated to each other, and recording
    # the removal/addition of such a file as a rename will probably cause
    # more harm than good, so they never become candidates.
    addedfiles = set()
    for fp in added:
        fctx = workingctx[fp]
        if fctx.size() > 0:
            addedfiles.add(fctx)

    removedfiles = set()
    for fp in removed:
        if fp not in parentctx:
            continue
        fctx = parentctx[fp]
        if fctx.size() > 0:
            removedfiles.add(fctx)

    # The set of removed candidates never changes below: sort it once.
    oldfiles = sorted(removedfiles)

    # First pass: exact matches.  A matched added file is dropped so that
    # the similarity pass does not look at it again.
    for old, new in _findexactmatches(repo, sorted(addedfiles), oldfiles):
        addedfiles.remove(new)
        yield (old.path(), new.path(), 1.0)

    # Second pass: similar files, only if the user asked for them.
    if threshold < 1.0:
        remaining = sorted(addedfiles)
        for old, new, ratio in _findsimilarmatches(repo, remaining,
                                                   oldfiles, threshold):
            yield (old.path(), new.path(), ratio)
|