sapling/mercurial/similar.py
Jun Wu 584656dff3 codemod: join the auto-formatter party
Summary:
Turned on the auto formatter. Ran `arc lint --apply-patches --take BLACK **/*.py`.
Then run `arc lint` again so some other autofixers like spellchecker etc. looked
at the code base. Manually accept the changes whenever they make sense, or use
a workaround (ex. changing "dict()" to "dict constructor") where autofix is false
positive. Disabled linters on files that are hard (i18n/polib.py) to fix, or less
interesting to fix (hgsubversion tests), or cannot be fixed without breaking
OSS build (FBPYTHON4).

Conflicted linters (test-check-module-imports.t, part of test-check-code.t,
test-check-pyflakes.t) are removed or disabled.

Duplicated linters (test-check-pyflakes.t, test-check-pylint.t) are removed.

An issue of the auto-formatter is lines are no longer guarnateed to be <= 80
chars. But that seems less important comparing with the benefit auto-formatter
provides.

As we're here, also remove test-check-py3-compat.t, as it is currently broken
if `PYTHON3=/bin/python3` is set.

Reviewed By: wez, phillco, simpkins, pkaush, singhsrb

Differential Revision: D8173629

fbshipit-source-id: 90e248ae0c5e6eaadbe25520a6ee42d32005621b
2018-05-25 22:17:29 -07:00

128 lines
4.0 KiB
Python

# similar.py - mechanisms for finding similar files
#
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
from __future__ import absolute_import
from . import mdiff, progress
from .i18n import _
def _findexactmatches(repo, added, removed):
"""find renamed files that have no changes
Takes a list of new filectxs and a list of removed filectxs, and yields
(before, after) tuples of exact matches.
"""
numfiles = len(added) + len(removed)
with progress.bar(
repo.ui, _("searching for exact renames"), _("files"), numfiles
) as prog:
# Build table of removed files: {hash(fctx.data()): [fctx, ...]}.
# We use hash() to discard fctx.data() from memory.
hashes = {}
for fctx in removed:
prog.value += 1
h = hash(fctx.data())
if h not in hashes:
hashes[h] = [fctx]
else:
hashes[h].append(fctx)
# For each added file, see if it corresponds to a removed file.
for fctx in added:
prog.value += 1
adata = fctx.data()
h = hash(adata)
for rfctx in hashes.get(h, []):
# compare between actual file contents for exact identity
if adata == rfctx.data():
yield (rfctx, fctx)
break
def _ctxdata(fctx):
# lazily load text
orig = fctx.data()
return orig, mdiff.splitnewlines(orig)
def _score(fctx, otherdata):
orig, lines = otherdata
text = fctx.data()
# mdiff.blocks() returns blocks of matching lines
# count the number of bytes in each
equal = 0
matches = mdiff.blocks(text, orig)
for x1, x2, y1, y2 in matches:
for line in lines[y1:y2]:
equal += len(line)
lengths = len(text) + len(orig)
return equal * 2.0 / lengths
def score(fctx1, fctx2):
return _score(fctx1, _ctxdata(fctx2))
def _findsimilarmatches(repo, added, removed, threshold):
"""find potentially renamed files based on similar file content
Takes a list of new filectxs and a list of removed filectxs, and yields
(before, after, score) tuples of partial matches.
"""
copies = {}
with progress.bar(
repo.ui, _("searching for similar files"), _("files"), len(removed)
) as prog:
for r in removed:
prog.value += 1
data = None
for a in added:
bestscore = copies.get(a, (None, threshold))[1]
if data is None:
data = _ctxdata(r)
myscore = _score(a, data)
if myscore > bestscore:
copies[a] = (r, myscore)
for dest, v in copies.iteritems():
source, bscore = v
yield source, dest, bscore
def _dropempty(fctxs):
return [x for x in fctxs if x.size() > 0]
def findrenames(repo, added, removed, threshold):
"""find renamed files -- yields (before, after, score) tuples"""
wctx = repo[None]
pctx = wctx.p1()
# Zero length files will be frequently unrelated to each other, and
# tracking the deletion/addition of such a file will probably cause more
# harm than good. We strip them out here to avoid matching them later on.
addedfiles = _dropempty(wctx[fp] for fp in sorted(added))
removedfiles = _dropempty(pctx[fp] for fp in sorted(removed) if fp in pctx)
# Find exact matches.
matchedfiles = set()
for (a, b) in _findexactmatches(repo, addedfiles, removedfiles):
matchedfiles.add(b)
yield (a.path(), b.path(), 1.0)
# If the user requested similar files to be matched, search for them also.
if threshold < 1.0:
addedfiles = [x for x in addedfiles if x not in matchedfiles]
for (a, b, score) in _findsimilarmatches(
repo, addedfiles, removedfiles, threshold
):
yield (a.path(), b.path(), score)