2006-08-08 01:27:09 +04:00
|
|
|
# verify.py - repository integrity checking for Mercurial
|
|
|
|
#
|
2007-06-19 10:51:34 +04:00
|
|
|
# Copyright 2006, 2007 Matt Mackall <mpm@selenic.com>
|
2006-08-08 01:27:09 +04:00
|
|
|
#
|
2009-04-26 03:08:54 +04:00
|
|
|
# This software may be used and distributed according to the terms of the
|
2010-01-20 07:20:08 +03:00
|
|
|
# GNU General Public License version 2 or any later version.
|
2006-08-08 01:27:09 +04:00
|
|
|
|
2015-08-09 04:48:10 +03:00
|
|
|
from __future__ import absolute_import
|
|
|
|
|
2012-10-24 20:27:47 +04:00
|
|
|
import os
|
2015-08-09 04:48:10 +03:00
|
|
|
|
|
|
|
from .i18n import _
|
|
|
|
from .node import (
|
|
|
|
nullid,
|
|
|
|
short,
|
|
|
|
)
|
|
|
|
|
|
|
|
from . import (
|
|
|
|
error,
|
|
|
|
revlog,
|
verify: replace _validpath() by matcher
The verifier calls out to _validpath() to check if it should verify
that path and the narrowhg extension overrides _validpath() to tell
the verifier to skip that path. In treemanifest repos, the verifier
calls the same method to check if it should visit a
directory. However, the decision to visit a directory is different
from the condition that it's a matching path, and narrowhg was working
around it by returning True from its _validpath() override if *either*
was true.
Similar to how one can do "hg files -I foo/bar/ -X foo/" (making the
include pointless), narrowhg can be configured to track the same
paths. In that case match("foo/bar/baz") would be false, but
match.visitdir("foo/bar/baz") turns out to be true, causing verify to
fail. This may seem like a bug in visitdir(), but it's explicitly
documented to be undefined for subdirectories of excluded
directories. When using treemanifests, the walk would not descend into
foo/, so verification would pass. However, when using flat manifests,
there is no recursive directory walk and the file path "foo/bar/baz"
would be passed to _validpath() without "foo/" (actually without the
slash) being passed first. As explained above, _validpath() would
return true for the file path and "hg verify" would fail.
Replacing the _validpath() method by a matcher seems like the obvious
fix. Narrowhg can then pass in its own matcher and not have to
conflate the two matching functions (for dirs and files). I think it
also makes the code clearer.
2017-01-23 21:48:55 +03:00
|
|
|
scmutil,
|
2015-08-09 04:48:10 +03:00
|
|
|
util,
|
|
|
|
)
|
2006-08-08 01:27:09 +04:00
|
|
|
|
2018-01-03 16:35:56 +03:00
|
|
|
def verify(repo):
    """Verify ``repo`` while holding its lock.

    Builds a ``verifier`` for the repository and returns the result of its
    ``verify()`` method.
    """
    with repo.lock():
        checker = verifier(repo)
        return checker.verify()
|
2007-07-22 01:02:10 +04:00
|
|
|
|
2012-10-24 20:27:47 +04:00
|
|
|
def _normpath(f):
|
|
|
|
# under hg < 2.4, convert didn't sanitize paths properly, so a
|
|
|
|
# converted repo may contain repeated slashes
|
|
|
|
while '//' in f:
|
|
|
|
f = f.replace('//', '/')
|
|
|
|
return f
|
|
|
|
|
2015-12-19 03:42:39 +03:00
|
|
|
class verifier(object):
|
verify: replace _validpath() by matcher
The verifier calls out to _validpath() to check if it should verify
that path and the narrowhg extension overrides _validpath() to tell
the verifier to skip that path. In treemanifest repos, the verifier
calls the same method to check if it should visit a
directory. However, the decision to visit a directory is different
from the condition that it's a matching path, and narrowhg was working
around it by returning True from its _validpath() override if *either*
was true.
Similar to how one can do "hg files -I foo/bar/ -X foo/" (making the
include pointless), narrowhg can be configured to track the same
paths. In that case match("foo/bar/baz") would be false, but
match.visitdir("foo/bar/baz") turns out to be true, causing verify to
fail. This may seem like a bug in visitdir(), but it's explicitly
documented to be undefined for subdirectories of excluded
directories. When using treemanifests, the walk would not descend into
foo/, so verification would pass. However, when using flat manifests,
there is no recursive directory walk and the file path "foo/bar/baz"
would be passed to _validpath() without "foo/" (actually without the
slash) being passed first. As explained above, _validpath() would
return true for the file path and "hg verify" would fail.
Replacing the _validpath() method by a matcher seems like the obvious
fix. Narrowhg can then pass in its own matcher and not have to
conflate the two matching functions (for dirs and files). I think it
also makes the code clearer.
2017-01-23 21:48:55 +03:00
|
|
|
# The match argument is always None in hg core, but e.g. the narrowhg
|
|
|
|
# extension will pass in a matcher here.
|
2018-01-03 16:35:56 +03:00
|
|
|
def __init__(self, repo, match=None):
    """Record the repository and reset all verification state.

    ``match`` limits which paths are verified; when it is falsy, the
    matcher returned by ``scmutil.matchall(repo)`` is used instead.
    """
    ui = repo.ui
    changelog = repo.changelog

    self.repo = repo.unfiltered()
    self.ui = ui
    self.match = match or scmutil.matchall(repo)

    # problems collected while verifying
    self.badrevs = set()
    self.errors = 0
    self.warnings = 0

    # what the repository contains and which revlog format it uses
    self.havecl = len(changelog) != 0
    self.havemf = len(repo.manifestlog._revlog) != 0
    self.revlogv1 = changelog.version != revlog.REVLOGV0

    self.lrugetctx = util.lrucachefunc(repo.changectx)
    self.refersmf = False
    self.fncachewarned = False

    # developer config: verify.skipflags
    self.skipflags = ui.configint('verify', 'skipflags')
|
2015-12-19 03:42:39 +03:00
|
|
|
|
2015-12-19 03:42:39 +03:00
|
|
|
def warn(self, msg):
    """Write ``msg`` plus a newline through ``ui.warn`` and count it."""
    line = msg + "\n"
    self.ui.warn(line)
    self.warnings += 1
|
2015-12-19 03:42:39 +03:00
|
|
|
|
2015-12-19 03:42:39 +03:00
|
|
|
def err(self, linkrev, msg, filename=None):
    """Report an integrity error and count it.

    A ``linkrev`` that is not None is remembered in ``self.badrevs``;
    otherwise ``?`` is printed in its place. When ``filename`` is truthy
    the message is prefixed with ``filename@``.
    """
    if linkrev is None:
        shown = '?'
    else:
        self.badrevs.add(linkrev)
        shown = linkrev

    text = "%s: %s" % (shown, msg)
    if filename:
        text = "%s@%s" % (filename, text)

    self.ui.warn(" " + text + "\n")
    self.errors += 1
|
2015-12-19 03:42:39 +03:00
|
|
|
|
2015-12-19 03:42:39 +03:00
|
|
|
def exc(self, linkrev, msg, inst, filename=None):
    """Report the exception ``inst`` as an error via ``self.err``.

    The exception's ``repr`` is shown when its string form is empty.
    """
    detail = inst if str(inst) else repr(inst)
    self.err(linkrev, "%s: %s" % (msg, detail), filename)
|
|
|
|
|
2016-01-06 04:08:14 +03:00
|
|
|
def checklog(self, obj, name, linkrev):
    """Run basic sanity checks on the revlog ``obj``.

    ``name`` labels the revlog in messages. ``linkrev`` is only used when
    reporting an empty or missing revlog.
    """
    # an empty revlog is only a problem when the repo has some history
    if len(obj) == 0 and (self.havecl or self.havemf):
        self.err(linkrev, _("empty or missing %s") % name)
        return

    sizes = obj.checksize()
    datadiff = sizes[0]
    if datadiff:
        self.err(None, _("data length off by %d bytes") % datadiff, name)
    indexdiff = sizes[1]
    if indexdiff:
        self.err(None, _("index contains %d extra bytes") % indexdiff, name)

    # warn when this revlog's format differs from the one recorded in
    # self.revlogv1
    isformat0 = obj.version == revlog.REVLOGV0
    if isformat0:
        if self.revlogv1:
            self.warn(_("warning: `%s' uses revlog format 0") % name)
    elif not self.revlogv1:
        self.warn(_("warning: `%s' uses revlog format 1") % name)
|
|
|
|
|
2016-01-06 04:08:14 +03:00
|
|
|
def checkentry(self, obj, i, node, seen, linkrevs, f):
    """Check linkrev, parents and uniqueness of one revlog entry.

    ``i`` is the revision number of ``node`` in ``obj``, ``seen`` maps the
    nodes already visited to their revision numbers, ``linkrevs`` lists the
    acceptable linkrevs and ``f`` names the revlog in messages.

    Records ``node`` in ``seen`` and returns its linkrev, or None when the
    stored linkrev cannot be trusted.
    """
    lr = obj.linkrev(obj.rev(node))
    suspicious = lr < 0 or (self.havecl and lr not in linkrevs)
    if suspicious:
        outofrange = lr < 0 or lr >= len(self.repo.changelog)
        if outofrange:
            msg = _("rev %d points to nonexistent changeset %d")
        else:
            msg = _("rev %d points to unexpected changeset %d")
        self.err(None, msg % (i, lr), f)

        if linkrevs:
            if f and len(linkrevs) > 1:
                try:
                    # attempt to filter down to real linkrevs
                    linkrevs = [l for l in linkrevs
                                if self.lrugetctx(l)[f].filenode() == node]
                except Exception:
                    # best effort only: keep the unfiltered candidates
                    pass
            expected = " ".join(map(str, linkrevs))
            self.warn(_(" (expected %s)") % expected)
        lr = None # can't be trusted

    try:
        p1, p2 = obj.parents(node)
        if p1 not in seen and p1 != nullid:
            complaint = (_("unknown parent 1 %s of %s") %
                         (short(p1), short(node)))
            self.err(lr, complaint, f)
        if p2 not in seen and p2 != nullid:
            complaint = (_("unknown parent 2 %s of %s") %
                         (short(p2), short(node)))
            self.err(lr, complaint, f)
    except Exception as inst:
        self.exc(lr, _("checking parents of %s") % short(node), inst, f)

    if node in seen:
        self.err(lr, _("duplicate revision %d (%d)") % (i, seen[node]), f)
    seen[node] = i
    return lr
|
|
|
|
|
2015-12-19 03:42:39 +03:00
|
|
|
def verify(self):
    """Run every verification phase and print a summary.

    Checks the changelog, the manifests, the changeset/manifest cross
    references and finally the files. Returns 1 when integrity errors were
    counted and None otherwise.

    Raises ``error.Abort`` when the repository URL does not start with
    ``file:``.
    """
    repo = self.repo
    ui = repo.ui

    if not repo.url().startswith('file:'):
        raise error.Abort(_("cannot verify bundle or remote repos"))

    if os.path.exists(repo.sjoin("journal")):
        ui.warn(_("abandoned transaction found - run hg recover\n"))

    if ui.verbose or not self.revlogv1:
        formatnumber = 1 if self.revlogv1 else 0
        ui.status(_("repository uses revlog format %d\n") % formatnumber)

    mflinkrevs, filelinkrevs = self._verifychangelog()

    filenodes = self._verifymanifest(mflinkrevs)
    # drop the reference as soon as the manifest phase is done with it
    del mflinkrevs

    self._crosscheckfiles(filelinkrevs, filenodes)

    totalfiles, filerevisions = self._verifyfiles(filenodes, filelinkrevs)

    summary = (totalfiles, len(repo.changelog), filerevisions)
    ui.status(_("%d files, %d changesets, %d total revisions\n") % summary)
    if self.warnings:
        ui.warn(_("%d warnings encountered!\n") % self.warnings)
    if self.fncachewarned:
        ui.warn(_('hint: run "hg debugrebuildfncache" to recover from '
                  'corrupt fncache\n'))
    if not self.errors:
        return
    ui.warn(_("%d integrity errors encountered!\n") % self.errors)
    if self.badrevs:
        ui.warn(_("(first damaged changeset appears to be %d)\n")
                % min(self.badrevs))
    return 1
|
|
|
|
|
2016-01-06 08:25:51 +03:00
|
|
|
def _verifychangelog(self):
    """Verify the changelog and collect what later phases need.

    Returns ``(mflinkrevs, filelinkrevs)``: the first maps each non-null
    manifest node to the changeset revisions that reference it, the second
    maps each matched, slash-normalized file path to the changeset
    revisions that list it.
    """
    ui = self.ui
    repo = self.repo
    match = self.match
    cl = repo.changelog

    ui.status(_("checking changesets\n"))
    mflinkrevs = {}
    filelinkrevs = {}
    seen = {}
    self.checklog(cl, "changelog", 0)

    total = len(repo)
    for rev in repo:
        ui.progress(_('checking'), rev, total=total, unit=_('changesets'))
        node = cl.node(rev)
        self.checkentry(cl, rev, node, seen, [rev], "changelog")

        try:
            changes = cl.read(node)
            manifestnode = changes[0]
            if manifestnode != nullid:
                mflinkrevs.setdefault(manifestnode, []).append(rev)
                self.refersmf = True
            for path in changes[3]:
                if match(path):
                    filelinkrevs.setdefault(_normpath(path), []).append(rev)
        except Exception as inst:
            # an unreadable changeset is treated as referring to a manifest
            self.refersmf = True
            self.exc(rev, _("unpacking changeset %s") % short(node), inst)

    ui.progress(_('checking'), None)
    return mflinkrevs, filelinkrevs
|
2016-01-06 04:08:14 +03:00
|
|
|
|
2016-02-12 02:38:56 +03:00
|
|
|
def _verifymanifest(self, mflinkrevs, dir="", storefiles=None,
                    progress=None):
    """Verify the manifest revlog for ``dir`` and its subdirectories.

    ``mflinkrevs`` maps manifest nodes to the revisions that refer to
    them; entries found in the revlog are removed from it. ``storefiles``
    and ``progress`` are supplied by the top-level call when it recurses
    into a directory manifest.

    Returns a dict mapping each matched file path to a ``{filenode:
    linkrev}`` dict, including the entries gathered from subdirectories.
    """
    repo = self.repo
    ui = self.ui
    match = self.match
    mfl = repo.manifestlog
    mf = mfl._revlog.dirlog(dir)

    if not dir:
        ui.status(_("checking manifests\n"))

    filenodes = {}
    subdirnodes = {}
    seen = {}
    label = dir or "manifest"
    if dir:
        # revlogs backing this directory are accounted for, not orphans
        storefiles.difference_update(mf.files())
        if progress: # should be true since we're in a subdirectory
            progress()

    if self.refersmf:
        # Do not check manifest if there are only changelog entries with
        # null manifests.
        self.checklog(mf, label, 0)

    total = len(mf)
    for i in mf:
        if not dir:
            ui.progress(_('checking'), i, total=total, unit=_('manifests'))
        n = mf.node(i)
        lr = self.checkentry(mf, i, n, seen, mflinkrevs.get(n, []), label)
        if n in mflinkrevs:
            del mflinkrevs[n]
        elif dir:
            self.err(lr, _("%s not in parent-directory manifest") %
                     short(n), label)
        else:
            self.err(lr, _("%s not in changesets") % short(n), label)

        try:
            mfdelta = mfl.get(dir, n).readdelta(shallow=True)
            for f, fn, fl in mfdelta.iterentries():
                if not f:
                    self.err(lr, _("entry without name in manifest"))
                elif f == "/dev/null": # ignore this in very old repos
                    continue
                fullpath = dir + _normpath(f)
                if fl == 't':
                    # directory entry: remember it for the recursive pass
                    if match.visitdir(fullpath):
                        bynode = subdirnodes.setdefault(fullpath + '/', {})
                        bynode.setdefault(fn, []).append(lr)
                elif match(fullpath):
                    filenodes.setdefault(fullpath, {}).setdefault(fn, lr)
        except Exception as inst:
            self.exc(lr, _("reading delta %s") % short(n), inst, label)
    if not dir:
        ui.progress(_('checking'), None)

    if self.havemf:
        # whatever is left in mflinkrevs was referenced but never found
        missing = sorted((c, m) for m in mflinkrevs for c in mflinkrevs[m])
        for c, m in missing:
            if dir:
                self.err(c, _("parent-directory manifest refers to unknown "
                              "revision %s") % short(m), label)
            else:
                self.err(c, _("changeset refers to unknown revision %s") %
                         short(m), label)

    toplevel = not dir
    if toplevel and subdirnodes:
        ui.status(_("checking directory manifests\n"))
        storefiles = set()
        subdirs = set()
        revlogv1 = self.revlogv1
        for f, f2, size in repo.store.datafiles():
            if not f:
                self.err(None, _("cannot decode filename '%s'") % f2)
            elif (size > 0 or not revlogv1) and f.startswith('meta/'):
                storefiles.add(_normpath(f))
                subdirs.add(os.path.dirname(f))
        subdircount = len(subdirs)
        currentsubdir = [0]

        def progress():
            # called once per directory manifest visited
            currentsubdir[0] += 1
            ui.progress(_('checking'), currentsubdir[0], total=subdircount,
                        unit=_('manifests'))

    for subdir, linkrevs in subdirnodes.iteritems():
        subdirfilenodes = self._verifymanifest(linkrevs, subdir, storefiles,
                                               progress)
        for f, onefilenodes in subdirfilenodes.iteritems():
            filenodes.setdefault(f, {}).update(onefilenodes)

    if toplevel and subdirnodes:
        ui.progress(_('checking'), None)
        for f in sorted(storefiles):
            self.warn(_("warning: orphan revlog '%s'") % f)

    return filenodes
|
2016-01-06 05:31:51 +03:00
|
|
|
|
2016-01-31 11:10:56 +03:00
|
|
|
def _crosscheckfiles(self, filelinkrevs, filenodes):
    """Report files known to only one of the changelog and the manifests.

    ``filelinkrevs`` maps paths listed in changesets to their revisions;
    ``filenodes`` maps paths found in manifests to their file nodes.
    """
    repo = self.repo
    ui = self.ui
    ui.status(_("crosschecking files in changesets and manifests\n"))

    total = len(filelinkrevs) + len(filenodes)
    done = 0
    if self.havemf:
        for done, f in enumerate(sorted(filelinkrevs), 1):
            ui.progress(_('crosschecking'), done, total=total)
            if f in filenodes:
                continue
            lr = filelinkrevs[f][0]
            self.err(lr, _("in changeset but not in manifest"), f)

    if self.havecl:
        # keep counting from where the first pass stopped
        for done, f in enumerate(sorted(filenodes), done + 1):
            ui.progress(_('crosschecking'), done, total=total)
            if f in filelinkrevs:
                continue
            try:
                fl = repo.file(f)
                lr = min([fl.linkrev(fl.rev(n)) for n in filenodes[f]])
            except Exception:
                # the filelog is unusable; report without a linkrev
                lr = None
            self.err(lr, _("in manifest but not in changeset"), f)

    ui.progress(_('crosschecking'), None)
|
|
|
|
|
2016-01-06 05:28:46 +03:00
|
|
|
def _verifyfiles(self, filenodes, filelinkrevs):
|
|
|
|
repo = self.repo
|
|
|
|
ui = self.ui
|
|
|
|
lrugetctx = self.lrugetctx
|
|
|
|
revlogv1 = self.revlogv1
|
|
|
|
havemf = self.havemf
|
2015-12-19 03:42:39 +03:00
|
|
|
ui.status(_("checking files\n"))
|
|
|
|
|
|
|
|
storefiles = set()
|
2018-01-03 16:35:56 +03:00
|
|
|
for f, f2, size in repo.store.datafiles():
|
|
|
|
if not f:
|
|
|
|
self.err(None, _("cannot decode filename '%s'") % f2)
|
|
|
|
elif (size > 0 or not revlogv1) and f.startswith('data/'):
|
|
|
|
storefiles.add(_normpath(f))
|
|
|
|
|
2015-12-19 03:42:39 +03:00
|
|
|
files = sorted(set(filenodes) | set(filelinkrevs))
|
|
|
|
total = len(files)
|
2016-01-06 05:28:46 +03:00
|
|
|
revisions = 0
|
2015-12-19 03:42:39 +03:00
|
|
|
for i, f in enumerate(files):
|
2016-03-11 15:18:41 +03:00
|
|
|
ui.progress(_('checking'), i, item=f, total=total, unit=_('files'))
|
2015-12-19 03:42:39 +03:00
|
|
|
try:
|
|
|
|
linkrevs = filelinkrevs[f]
|
|
|
|
except KeyError:
|
|
|
|
# in manifest but not in changelog
|
|
|
|
linkrevs = []
|
|
|
|
|
|
|
|
if linkrevs:
|
|
|
|
lr = linkrevs[0]
|
|
|
|
else:
|
|
|
|
lr = None
|
2006-08-08 01:27:09 +04:00
|
|
|
|
2006-12-01 11:35:46 +03:00
|
|
|
try:
|
2015-12-19 03:42:39 +03:00
|
|
|
fl = repo.file(f)
|
|
|
|
except error.RevlogError as e:
|
2015-12-19 03:42:39 +03:00
|
|
|
self.err(lr, _("broken revlog! (%s)") % e, f)
|
2015-12-19 03:42:39 +03:00
|
|
|
continue
|
|
|
|
|
2018-01-03 16:35:56 +03:00
|
|
|
for ff in fl.files():
|
|
|
|
try:
|
|
|
|
storefiles.remove(ff)
|
|
|
|
except KeyError:
|
|
|
|
self.warn(_(" warning: revlog '%s' not in fncache!") % ff)
|
|
|
|
self.fncachewarned = True
|
2015-12-19 03:42:39 +03:00
|
|
|
|
2016-01-06 04:08:14 +03:00
|
|
|
self.checklog(fl, f, lr)
|
2015-12-19 03:42:39 +03:00
|
|
|
seen = {}
|
|
|
|
rp = None
|
|
|
|
for i in fl:
|
|
|
|
revisions += 1
|
|
|
|
n = fl.node(i)
|
2016-01-06 04:08:14 +03:00
|
|
|
lr = self.checkentry(fl, i, n, seen, linkrevs, f)
|
2015-12-19 03:42:39 +03:00
|
|
|
if f in filenodes:
|
|
|
|
if havemf and n not in filenodes[f]:
|
2015-12-19 03:42:39 +03:00
|
|
|
self.err(lr, _("%s not in manifests") % (short(n)), f)
|
2008-04-15 00:31:33 +04:00
|
|
|
else:
|
2015-12-19 03:42:39 +03:00
|
|
|
del filenodes[f][n]
|
|
|
|
|
2017-03-30 00:45:01 +03:00
|
|
|
# Verify contents. 4 cases to care about:
|
|
|
|
#
|
|
|
|
# common: the most common case
|
|
|
|
# rename: with a rename
|
|
|
|
# meta: file content starts with b'\1\n', the metadata
|
|
|
|
# header defined in filelog.py, but without a rename
|
|
|
|
# ext: content stored externally
|
|
|
|
#
|
|
|
|
# More formally, their differences are shown below:
|
|
|
|
#
|
|
|
|
# | common | rename | meta | ext
|
|
|
|
# -------------------------------------------------------
|
|
|
|
# flags() | 0 | 0 | 0 | not 0
|
|
|
|
# renamed() | False | True | False | ?
|
|
|
|
# rawtext[0:2]=='\1\n'| False | True | True | ?
|
|
|
|
#
|
|
|
|
# "rawtext" means the raw text stored in revlog data, which
|
|
|
|
# could be retrieved by "revision(rev, raw=True)". "text"
|
|
|
|
# mentioned below is "revision(rev, raw=False)".
|
|
|
|
#
|
|
|
|
# There are 3 different lengths stored physically:
|
|
|
|
# 1. L1: rawsize, stored in revlog index
|
|
|
|
# 2. L2: len(rawtext), stored in revlog data
|
|
|
|
# 3. L3: len(text), stored in revlog data if flags==0, or
|
|
|
|
# possibly somewhere else if flags!=0
|
|
|
|
#
|
|
|
|
# L1 should be equal to L2. L3 could be different from them.
|
|
|
|
# "text" may or may not affect commit hash depending on flag
|
|
|
|
# processors (see revlog.addflagprocessor).
|
|
|
|
#
|
|
|
|
# | common | rename | meta | ext
|
|
|
|
# -------------------------------------------------
|
|
|
|
# rawsize() | L1 | L1 | L1 | L1
|
|
|
|
# size() | L1 | L2-LM | L1(*) | L1 (?)
|
|
|
|
# len(rawtext) | L2 | L2 | L2 | L2
|
|
|
|
# len(text) | L2 | L2 | L2 | L3
|
|
|
|
# len(read()) | L2 | L2-LM | L2-LM | L3 (?)
|
|
|
|
#
|
|
|
|
# LM: length of metadata, depending on rawtext
|
|
|
|
# (*): not ideal, see comment in filelog.size
|
|
|
|
# (?): could be "- len(meta)" if the resolved content has
|
|
|
|
# rename metadata
|
|
|
|
#
|
|
|
|
# Checks needed to be done:
|
|
|
|
# 1. length check: L1 == L2, in all cases.
|
|
|
|
# 2. hash check: depending on flag processor, we may need to
|
|
|
|
# use either "text" (external), or "rawtext" (in revlog).
|
2015-12-19 03:42:39 +03:00
|
|
|
try:
|
2017-05-14 19:38:06 +03:00
|
|
|
skipflags = self.skipflags
|
|
|
|
if skipflags:
|
|
|
|
skipflags &= fl.flags(i)
|
|
|
|
if not skipflags:
|
|
|
|
fl.read(n) # side effect: read content and do checkhash
|
|
|
|
rp = fl.renamed(n)
|
2017-05-12 00:52:02 +03:00
|
|
|
# the "L1 == L2" check
|
|
|
|
l1 = fl.rawsize(i)
|
|
|
|
l2 = len(fl.revision(n, raw=True))
|
|
|
|
if l1 != l2:
|
|
|
|
self.err(lr, _("unpacked size is %s, %s expected") %
|
|
|
|
(l2, l1), f)
|
2015-12-19 03:42:39 +03:00
|
|
|
except error.CensoredNodeError:
|
|
|
|
# experimental config: censor.policy
|
codemod: register core configitems using a script
This is done by a script [2] using RedBaron [1], a tool designed for doing
code refactoring. All "default" values are decided by the script and are
strongly consistent with the existing code.
There are 2 changes done manually to fix tests:
[warn] mercurial/exchange.py: experimental.bundle2-output-capture: default needs manual removal
[warn] mercurial/localrepo.py: experimental.hook-track-tags: default needs manual removal
Since RedBaron is not confident about how to indent things [2].
[1]: https://github.com/PyCQA/redbaron
[2]: https://github.com/PyCQA/redbaron/issues/100
[3]:
#!/usr/bin/env python
# codemod_configitems.py - codemod tool to fill configitems
#
# Copyright 2017 Facebook, Inc.
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
from __future__ import absolute_import, print_function
import os
import sys
import redbaron
def readpath(path):
    """return the whole content of the file at path (opened in text mode)"""
    with open(path) as f:
        return f.read()
def writepath(path, content):
    """replace the content of the file at path with content (text mode)"""
    with open(path, 'w') as f:
        f.write(content)
# method names that uiconfigitems() accepts as a "ui.config*" call
_configmethods = {
    'config',
    'configbool',
    'configint',
    'configbytes',
    'configlist',
    'configdate',
}
def extractstring(rnode):
    """get the string from a RedBaron string or call_argument node

    Follows the ".value" chain until a node whose type is 'string' is
    reached, then returns that node's source text without its first and
    last character (the quotes).
    """
    while rnode.type != 'string':
        rnode = rnode.value
    return rnode.value[1:-1] # unquote, "'str'" -> "str"
def uiconfigitems(red):
    """match *.ui.config* pattern, yield (node, method, args, section, name)

    Every 'atomtrailers' node of red is inspected. A node is reported when
    its last three elements look like "<ui|self>.<config method>(<section>,
    <name>, ...)" with both section and name being string literals; section
    and name are yielded unquoted.
    """
    for node in red.find_all('atomtrailers'):
        entry = None
        try:
            obj = node[-3].value
            method = node[-2].value
            args = node[-1]
            section = args[0].value
            name = args[1].value
            if (obj in ('ui', 'self') and method in _configmethods
                and section.type == 'string' and name.type == 'string'):
                entry = (node, method, args, extractstring(section),
                         extractstring(name))
        except Exception:
            # best effort: a node that does not have the expected shape
            # (too short, no ".value", ...) is simply not a match
            pass
        else:
            # yield outside of the "try" body so that errors raised by the
            # consumer are not swallowed by the handler above
            if entry:
                yield entry
def coreconfigitems(red):
    """match coreconfigitem(...) pattern, yield (node, args, section, name)

    Every 'atomtrailers' node of red is inspected. A node is reported when
    it reads "coreconfigitem(<section>, <name>, ...)" with both section and
    name being string literals; section and name are yielded unquoted.
    """
    for node in red.find_all('atomtrailers'):
        entry = None
        try:
            args = node[1]
            section = args[0].value
            name = args[1].value
            if (node[0].value == 'coreconfigitem' and section.type == 'string'
                and name.type == 'string'):
                entry = (node, args, extractstring(section),
                         extractstring(name))
        except Exception:
            # best effort: a node that does not have the expected shape
            # is simply not a match
            pass
        else:
            # yield outside of the "try" body so that errors raised by the
            # consumer are not swallowed by the handler above
            if entry:
                yield entry
def registercoreconfig(cfgred, section, name, defaultrepr):
    """insert coreconfigitem to cfgred AST

    section and name are plain string, defaultrepr is a string

    The new item is placed right after the last existing coreconfigitem
    entry whose (section, name) sorts before the new one. When every
    existing entry sorts after it, the new item is placed before the first
    entry instead of being silently dropped. Nothing is inserted when
    cfgred has no coreconfigitem entry at all, since there is no anchor.
    """
    # NOTE(review): the single-space indent in front of "default=" may be an
    # extraction artifact of this listing - confirm against the layout used
    # in configitems.py
    newitem = ('coreconfigitem(%r, %r,\n'
               ' default=%s,\n'
               ')' % (section, name, defaultrepr))
    # find a place to insert the "coreconfigitem" item
    entries = list(coreconfigitems(cfgred))
    for node, args, nodesection, nodename in reversed(entries):
        if (nodesection, nodename) < (section, name):
            # insert after this entry
            node.insert_after(newitem)
            return
    if entries:
        # the new item sorts before every existing entry
        entries[0][0].insert_before(newitem)
def main(argv):
    """register defaults of ui.config* calls found in the files of argv

    argv is a list of paths to Python files. A first pass scans all of them
    and works out, per (section, name), a default value representation. A
    second pass removes the explicit default from call sites of items used
    in core and adds a matching coreconfigitem(...) entry to
    mercurial/configitems.py, which is located relative to this script.

    Returns 1 after printing the usage message when argv is empty, and None
    otherwise.
    """
    if not argv:
        print('Usage: codemod_configitems.py FILES\n'
              'For example, FILES could be "{hgext,mercurial}/*/**.py"')
        # nothing to process: stop here instead of parsing configitems.py
        return 1

    dirname = os.path.dirname
    reporoot = dirname(dirname(dirname(os.path.abspath(__file__))))

    # register configitems to this destination
    cfgpath = os.path.join(reporoot, 'mercurial', 'configitems.py')
    cfgred = redbaron.RedBaron(readpath(cfgpath))

    # state about what to do
    registered = {(s, n) for _node, _args, s, n in coreconfigitems(cfgred)}
    toregister = {} # {(section, name): defaultrepr}
    coreconfigs = set() # {(section, name)}, whether it's used in core

    # first loop: scan all files before taking any action
    for i, path in enumerate(argv):
        print('(%d/%d) scanning %s' % (i + 1, len(argv), path))
        iscore = ('mercurial' in path) and ('hgext' not in path)
        red = redbaron.RedBaron(readpath(path))
        # find all repo.ui.config* and ui.config* calls, and collect their
        # section, name and default value information.
        for node, method, args, section, name in uiconfigitems(red):
            if section == 'web':
                # [web] section has some weirdness, ignore them for now
                continue
            defaultrepr = None
            key = (section, name)
            if len(args) == 2:
                if key in registered:
                    continue
                if method == 'configlist':
                    defaultrepr = 'list'
                elif method == 'configbool':
                    defaultrepr = 'False'
                else:
                    defaultrepr = 'None'
            elif len(args) >= 3 and (args[2].target is None or
                                     args[2].target.value == 'default'):
                # try to understand the "default" value
                dnode = args[2].value
                if dnode.type == 'name':
                    if dnode.value in {'None', 'True', 'False'}:
                        defaultrepr = dnode.value
                elif dnode.type == 'string':
                    defaultrepr = repr(dnode.value[1:-1])
                elif dnode.type in ('int', 'float'):
                    defaultrepr = dnode.value
            # inconsistent default
            if key in toregister and toregister[key] != defaultrepr:
                defaultrepr = None
            # interesting to rewrite
            if key not in registered:
                if defaultrepr is None:
                    print('[note] %s: %s.%s: unsupported default'
                          % (path, section, name))
                    registered.add(key) # skip checking it again
                    # forget a default recorded from an earlier call site,
                    # otherwise the second loop would still strip defaults
                    # for a key that never gets registered
                    toregister.pop(key, None)
                else:
                    toregister[key] = defaultrepr
                    if iscore:
                        coreconfigs.add(key)

    # second loop: rewrite files given "toregister" result
    for path in argv:
        # reconstruct redbaron - trade CPU for memory
        red = redbaron.RedBaron(readpath(path))
        changed = False
        for node, method, args, section, name in uiconfigitems(red):
            key = (section, name)
            defaultrepr = toregister.get(key)
            if defaultrepr is None or key not in coreconfigs:
                continue
            if len(args) >= 3 and (args[2].target is None or
                                   args[2].target.value == 'default'):
                try:
                    del args[2]
                    changed = True
                except Exception:
                    # redbaron fails to do the rewrite due to indentation
                    # see https://github.com/PyCQA/redbaron/issues/100
                    print('[warn] %s: %s.%s: default needs manual removal'
                          % (path, section, name))
            if key not in registered:
                print('registering %s.%s' % (section, name))
                registercoreconfig(cfgred, section, name, defaultrepr)
                registered.add(key)
        if changed:
            print('updating %s' % path)
            writepath(path, red.dumps())

    if toregister:
        print('updating configitems.py')
        writepath(cfgpath, cfgred.dumps())
# script entry point: the value returned by main() becomes the exit status
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
2017-07-15 00:22:40 +03:00
|
|
|
if ui.config("censor", "policy") == "abort":
|
2015-12-19 03:42:39 +03:00
|
|
|
self.err(lr, _("censored file data"), f)
|
2015-12-19 03:42:39 +03:00
|
|
|
except Exception as inst:
|
2015-12-19 03:42:39 +03:00
|
|
|
self.exc(lr, _("unpacking %s") % short(n), inst, f)
|
2015-12-19 03:42:39 +03:00
|
|
|
|
|
|
|
# check renames
|
|
|
|
try:
|
|
|
|
if rp:
|
|
|
|
if lr is not None and ui.verbose:
|
|
|
|
ctx = lrugetctx(lr)
|
|
|
|
found = False
|
|
|
|
for pctx in ctx.parents():
|
|
|
|
if rp[0] in pctx:
|
|
|
|
found = True
|
|
|
|
break
|
|
|
|
if not found:
|
2015-12-19 03:42:39 +03:00
|
|
|
self.warn(_("warning: copy source of '%s' not"
|
|
|
|
" in parents of %s") % (f, ctx))
|
2015-12-19 03:42:39 +03:00
|
|
|
fl2 = repo.file(rp[0])
|
|
|
|
if not len(fl2):
|
2015-12-19 03:42:39 +03:00
|
|
|
self.err(lr, _("empty or missing copy source "
|
|
|
|
"revlog %s:%s") % (rp[0], short(rp[1])), f)
|
2015-12-19 03:42:39 +03:00
|
|
|
elif rp[1] == nullid:
|
|
|
|
ui.note(_("warning: %s@%s: copy source"
|
|
|
|
" revision is nullid %s:%s\n")
|
|
|
|
% (f, lr, rp[0], short(rp[1])))
|
|
|
|
else:
|
|
|
|
fl2.rev(rp[1])
|
|
|
|
except Exception as inst:
|
2015-12-19 03:42:39 +03:00
|
|
|
self.exc(lr, _("checking rename of %s") % short(n), inst, f)
|
2015-12-19 03:42:39 +03:00
|
|
|
|
|
|
|
# cross-check
|
|
|
|
if f in filenodes:
|
2016-11-11 00:35:54 +03:00
|
|
|
fns = [(v, k) for k, v in filenodes[f].iteritems()]
|
2015-12-19 03:42:39 +03:00
|
|
|
for lr, node in sorted(fns):
|
2016-02-08 09:46:20 +03:00
|
|
|
self.err(lr, _("manifest refers to unknown revision %s") %
|
|
|
|
short(node), f)
|
2015-12-19 03:42:39 +03:00
|
|
|
ui.progress(_('checking'), None)
|
|
|
|
|
2018-01-03 16:35:56 +03:00
|
|
|
for f in sorted(storefiles):
|
|
|
|
self.warn(_("warning: orphan revlog '%s'") % f)
|
2015-12-19 03:42:39 +03:00
|
|
|
|
2016-01-06 05:28:46 +03:00
|
|
|
return len(files), revisions
|