sapling/hgext/p4fastimport/__init__.py
# (c) 2017-present Facebook Inc.
"""p4fastimport - A fast importer from Perforce to Mercurial
Config example:
[p4fastimport]
# whether to use workers or not
useworker = false
# trace copies?
copytrace = false
# Instead of uploading to LFS, store lfs metadata in this sqlite output
# file. Some other process will upload from there to the LFS server later.
lfsmetadata = PATH
# path to sqlite output file for metadata
metadata = PATH
# certain commits by certain users should be ignored so that
# p4fastimport imports the actual commits we want
ignore-user = None
# heuristic time difference between an ignored user's commit
# and a p4fastimport import
ignore-time-delta = None
# The P4 database can become corrupted when it tracks symlinks to
# directories. Keep this corruption out of the Mercurial repo.
checksymlinks = True
"""
from __future__ import absolute_import

import collections
import itertools
import json
import sqlite3

from . import (
    importer,
    p4,
    seqimporter,
)
from .util import decodefileflags, getcl, lastcl, runworker

from mercurial.i18n import _
from mercurial.node import short, hex
from mercurial import (
    error,
    extensions,
    progress,
    registrar,
    revlog,
    scmutil,
    verify,
)

def extsetup():
    # Writing multiple changelog entries in one transaction can lead to
    # revlog caching issues when the inlined .i data is separated into a .d
    # file. We work around this by not allowing inlined revlogs at all.
    revlog.REVLOG_DEFAULT_VERSION = revlog.REVLOG_DEFAULT_FORMAT

def reposetup(ui, repo):
    def nothing(orig, *args, **kwargs):
        pass

    def yoloverify(orig, *args, **kwargs):
        # We have to set this directly because the repo reads the lfs.bypass
        # config during its setup.
        repo.svfs.options['lfsbypass'] = True
        return orig(*args, **kwargs)
    extensions.wrapfunction(verify.verifier, 'verify', yoloverify)

    if ui.config('p4fastimport', 'lfsmetadata', None) is not None:
        try:
            lfs = extensions.find('lfs')
        except KeyError:
            pass
        else:
            extensions.wrapfunction(lfs.blobstore.local, 'write', nothing)
            extensions.wrapfunction(lfs.blobstore.local, 'read', nothing)

def writebookmark(tr, repo, revisions, name):
    if len(revisions) > 0:
        __, hexnode = revisions[-1]
        repo._bookmarks.applychanges(repo, tr, [(name, repo[hexnode].node())])

def writerevmetadata(revisions, outfile):
    """Write the mapping from Perforce changelist number to the imported
    Mercurial node into sqlite, so that later processes can look up which
    commit a given changelist was imported as.
    """
    with sqlite3.connect(outfile, isolation_level=None) as conn:
        cur = conn.cursor()
        cur.execute("BEGIN TRANSACTION")
        cur.execute("""
        CREATE TABLE IF NOT EXISTS revision_mapping (
            "id" INTEGER PRIMARY KEY AUTOINCREMENT,
            "cl" INTEGER NOT NULL,
            "node" BLOB
        )""")
        cur.executemany(
            "INSERT INTO revision_mapping(cl, node) VALUES (?,?)",
            revisions)
        cur.execute("COMMIT")

def writelfsmetadata(largefiles, revisions, outfile):
    """Write the LFS mappings from OID to a depot path and its changelist
    number into sqlite. This way the LFS server can import the correct file
    from Perforce and map it to the correct OID.
    """
    with sqlite3.connect(outfile, isolation_level=None) as conn:
        cur = conn.cursor()
        cur.execute("BEGIN TRANSACTION")
        cur.execute("""
        CREATE TABLE IF NOT EXISTS p4_lfs_map(
            "id" INTEGER PRIMARY KEY AUTOINCREMENT,
            "cl" INTEGER NOT NULL,
            "node" BLOB,
            "oid" TEXT,
            "path" BLOB
        )""")
        inserts = []
        revdict = dict(revisions)
        for cl, path, oid in largefiles:
            inserts.append((cl, path, oid, revdict[cl]))
        cur.executemany(
            "INSERT INTO p4_lfs_map(cl, path, oid, node) VALUES (?,?,?,?)",
            inserts)
        cur.execute("COMMIT")

def create(tr, ui, repo, importset, p1ctx, filelogs):
    for filelog in filelogs:
        # If the Perforce server is case insensitive, a filelog can map to
        # multiple filenames. For example, A.txt and a.txt would show up in
        # the same filelog. It would be more appropriate to update the
        # filelist after receiving the initial filelist, but this would not
        # be parallel.
        fi = importer.FileImporter(ui, repo, importset, filelog, p1ctx)
        fileflags, largefiles, baserevatcl, oldtiprev, newtiprev = fi.create(tr)
        yield 1, json.dumps({
            'newtiprev': newtiprev,
            'oldtiprev': oldtiprev,
            'fileflags': fileflags,
            'baserevatcl': baserevatcl,
            'largefiles': largefiles,
            'depotname': filelog.depotfile,
            'localname': fi.relpath,
        })

def enforce_p4_client_exists(client):
    # A client defines checkout behavior for a user. It contains a list of
    # views. A view defines a set of files and directories to check out from
    # a Perforce server and their mappings to local disk, e.g.:
    #   //depot/foo/... //client/x/...
    # would map the files that are stored on the server under foo/* locally
    # under x/*.
    if not p4.exists_client(client):
        raise error.Abort(_('p4 client %s does not exist.') % client)

def getchangelists(ui, client, startcl, limit=None):
    '''Return a sorted list of changelists affecting client, starting at
    startcl. If a limit N is provided, return only the first N changelists.
    '''
    ui.note(_('loading changelist numbers.\n'))
    ignore_user = ui.config('p4fastimport', 'ignore-user')
    ignore_time_delta = ui.config('p4fastimport', 'ignore-time-delta')
    if ignore_user is None or ignore_time_delta is None:
        changelists = sorted(p4.parse_changes(client, startcl=startcl))
    else:
        changelists = list(itertools.takewhile(
            lambda cl: not (cl._user == ignore_user
                            and cl._commit_time_diff < ignore_time_delta),
            sorted(p4.parse_changes(client, startcl=startcl))))
    ui.note(_('%d changelists to import.\n') % len(changelists))
    if limit:
        limit = int(limit)
        if limit < len(changelists):
            ui.debug('importing %d only because of --limit.\n' % limit)
            changelists = changelists[:limit]
    return changelists

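# For reference, each record consumed by getfilelist() below is a dict
# parsed from Perforce's file listing; the only fields relied upon here are
# 'depotFile' and 'action', e.g. (sketch):
#
#   {'depotFile': '//depot/foo/bar.c', 'action': 'edit'}
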
def getfilelist(ui, p4filelist):
    filelist = set()
    for fileinfo in p4filelist:
        if fileinfo['action'] in p4.ACTION_ARCHIVE:
            pass
        elif fileinfo['action'] in p4.SUPPORTED_ACTIONS:
            filelist.add(fileinfo['depotFile'])
        else:
            ui.warn(_('unknown action %s: %s\n') % (fileinfo['action'],
                                                    fileinfo['depotFile']))
    return filelist

def sanitizeopts(repo, opts):
    if opts.get('base') and not opts.get('bookmark'):
        raise error.Abort(_('must set --bookmark when using --base'))
    if opts.get('bookmark'):
        scmutil.checknewlabel(repo, opts['bookmark'], 'bookmark')
    limit = opts.get('limit')
    if limit:
        try:
            limit = int(limit)
        except ValueError:
            raise error.Abort(_('--limit should be an integer, got %s') %
                              limit)
        if limit <= 0:
            raise error.Abort(_('--limit should be > 0, got %d') % limit)

def startfrom(ui, repo, opts):
    base, dest = 'null', 'tip'
    if opts.get('bookmark'):
        dest = opts.get('bookmark')
    if opts.get('base'):
        base = opts['base']
        if opts.get('bookmark') not in repo:
            dest = base

    basectx = scmutil.revsingle(repo, base)
    destctx = scmutil.revsingle(repo, dest)
    # Find the most recent commit between base and dest that was produced by
    # a previous p4 import (it carries a p4changelist or
    # p4fullimportbasechangelist extra); that is where we resume from.
    ctx = list(repo.set("""
        last(
            %n::%n and (
                extra(p4changelist) or
                extra(p4fullimportbasechangelist)))""",
        basectx.node(), destctx.node()))
    if ctx:
        ctx = ctx[0]
        startcl = lastcl(ctx)
        ui.note(_('incremental import from changelist: %d, node: %s\n') %
                (startcl, short(ctx.node())))
        if ctx.node() == basectx.node():
            ui.note(_('creating branchpoint, base %s\n') %
                    short(basectx.node()))
            return ctx, startcl, True
        return ctx, startcl, False
    raise error.Abort(_('no valid p4 changelist number.'))

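# Note: lastcl() and getcl() come from .util and read the p4changelist
# extras written by the importers; presumably lastcl(ctx) returns the
# changelist to resume from after ctx, while getcl(ctx) returns the
# changelist recorded on ctx itself.
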
def updatemetadata(ui, revisions, largefiles):
    lfsmetadata = ui.config('p4fastimport', 'lfsmetadata', None)
    if len(largefiles) > 0 and lfsmetadata is not None:
        ui.note(_('writing lfs metadata to sqlite\n'))
        writelfsmetadata(largefiles, revisions, lfsmetadata)

    metadata = ui.config('p4fastimport', 'metadata', None)
    if len(revisions) > 0 and metadata is not None:
        ui.note(_('writing metadata to sqlite\n'))
        writerevmetadata(revisions, metadata)

cmdtable = {}
command = registrar.command(cmdtable)

@command(
    'p4fastimport',
    [('P', 'path', '.', _('path to the local depot store'), _('PATH')),
     ('B', 'bookmark', '', _('bookmark to set'), _('NAME')),
     ('', 'base', '', _('base changeset (must exist in the repository)')),
     ('', 'limit', '',
      _('number of changelists to import at a time'), _('N'))],
    _('[-P PATH] [-B NAME] [--limit N] [CLIENT]'),
    inferrepo=True)
def p4fastimport(ui, repo, client, **opts):
    '''Import changelists from a Perforce client in one large batch.'''
    if 'fncache' in repo.requirements:
        raise error.Abort(_('fncache must be disabled'))

    sanitizeopts(repo, opts)

    if len(repo) > 0:
        p1ctx, startcl, isbranchpoint = startfrom(ui, repo, opts)
    else:
        p1ctx, startcl, isbranchpoint = repo['tip'], None, False

    # 0. Fail if the specified client does not exist.
    enforce_p4_client_exists(client)

    # 1. Return all the changelists touching files in a given client view.
    changelists = getchangelists(ui, client, startcl,
                                 limit=opts.get('limit'))
    if len(changelists) == 0:
        return

    basepath = opts.get('path')
    startcl, endcl = changelists[0].cl, changelists[-1].cl

    # 2. Get a list of files that we will have to import from the depot,
    # with their full paths in the depot.
    ui.note(_('loading list of files.\n'))
    filelist = getfilelist(ui, p4.parse_filelist(client, startcl, endcl))
    ui.note(_('%d files to import.\n') % len(filelist))

    importset = importer.ImportSet(repo, client, changelists,
                                   filelist, basepath,
                                   isbranchpoint=isbranchpoint)

    p4filelogs = []
    with progress.bar(ui, _('reading filelog'), _('filelogs'),
                      len(filelist)) as prog:
        for i, f in enumerate(importset.filelogs()):
            ui.debug('reading filelog %s\n' % f.depotfile)
            prog.value = i
            p4filelogs.append(f)

    # runlist is used to topologically order files which were branched
    # (Perforce uses per-file branching, not per-repo branching). If we do
    # copytracing, a file A' which was branched off A will be considered a
    # copy of A, so we need to import A before A'. In this case A' will have
    # a dependency counter one higher than A's, and will therefore be
    # imported after A. If copytracing is disabled this is not needed and we
    # can import files in arbitrary order.
    runlist = collections.OrderedDict()
    if ui.configbool('p4fastimport', 'copytrace', False):
        raise error.Abort(_('copytracing is broken'))
    else:
        runlist[0] = p4filelogs

    ui.note(_('importing repository.\n'))
    with repo.wlock(), repo.lock():
        for a, b in importset.caseconflicts:
            ui.warn(_('case conflict: %s and %s\n') % (a, b))

        # 3. Import files.
        count = 0
        fileinfo = {}
        largefiles = []
        ftr = repo.transaction('importer')
        try:
            with progress.bar(ui, _('importing filelogs'), 'file',
                              len(p4filelogs)) as prog:
                for filelogs in map(sorted, runlist.values()):
                    wargs = (ftr, ui, repo, importset, p1ctx)
                    for i, serialized in runworker(ui, create, wargs,
                                                   filelogs):
                        data = json.loads(serialized)
                        prog.value = (count, data['depotname'])
                        # JSON converts to UTF-8 and int keys to strings,
                        # so we have to convert back.
                        # TODO: Find a better way to handle this.
                        fileinfo[data['depotname']] = {
                            'localname': data['localname'].encode('utf-8'),
                            'flags': decodefileflags(data['fileflags']),
                            'baserevatcl': data['baserevatcl'],
                        }
                        largefiles.extend(data['largefiles'])
                        count += i
            ftr.close()

            tr = repo.transaction('import')
            try:
                # 4. Generate manifest and changelog based on the filelogs
                # we imported.
                clog = importer.ChangeManifestImporter(ui, repo, importset,
                                                       p1ctx=p1ctx)
                revisions = []
                for cl, hgnode in clog.creategen(tr, fileinfo):
                    revisions.append((cl, hex(hgnode)))

                if opts.get('bookmark'):
                    ui.note(_('writing bookmark\n'))
                    writebookmark(tr, repo, revisions, opts['bookmark'])

                updatemetadata(ui, revisions, largefiles)

                tr.close()
                ui.note(_('%d revision(s), %d file(s) imported.\n') % (
                    len(changelists), count))
            finally:
                tr.release()
        finally:
            ftr.release()

@command(
    'p4seqimport',
    [('P', 'path', '.', _('path to the local depot store'), _('PATH')),
     ('B', 'bookmark', '', _('bookmark to set'), _('NAME')),
     ('', 'base', '', _('base changeset (must exist in the repository)')),
     ('', 'limit', '', _('max number of changelists to import'), _('N'))],
    _('[-P PATH] [-B NAME] client'),
)
def p4seqimport(ui, repo, client, **opts):
    '''Sequentially import changelists.'''
    enforce_p4_client_exists(client)
    sanitizeopts(repo, opts)

    startcl = None
    ctx = repo['tip']
    if len(repo) > 0:
        ctx, startcl = startfrom(ui, repo, opts)[:2]

    changelists = getchangelists(ui, client, startcl,
                                 limit=opts.get('limit'))
    if len(changelists) == 0:
        ui.note(_('no changes to import, exiting.\n'))
        return

    climporter = seqimporter.ChangelistImporter(
        ui,
        repo,
        ctx,
        client,
        opts.get('path'),
        opts.get('bookmark'),
    )
    with repo.wlock(), repo.lock(), repo.transaction('seqimport'):
        for p4cl in changelists:
            node, largefiles = climporter.importcl(p4cl)
            updatemetadata(ui, [(p4cl.cl, hex(node))], largefiles)

@command(
    'p4syncimport',
    [('P', 'path', '.', _('path to the local depot store'), _('PATH')),
     ('B', 'bookmark', '', _('bookmark to set'), _('NAME'))],
    _('[-P PATH] [-B NAME] oldclient newclient'),
)
def p4syncimport(ui, repo, oldclient, newclient, **opts):
    '''Import the difference between two client views as a single commit.'''
    sanitizeopts(repo, opts)

    if len(repo) == 0:
        raise error.Abort(_('p4 sync commit does not support empty repo '
                            'yet.'))

    p1ctx, startcl, __ = startfrom(ui, repo, opts)

    # Fail if either of the specified clients does not exist.
    enforce_p4_client_exists(oldclient)
    enforce_p4_client_exists(newclient)

    # Get a list of files that we will have to import.
    oldcl = p4.get_latest_cl(oldclient)
    latestcl = p4.get_latest_cl(newclient)
    lastimportedcl = getcl(p1ctx)
    if latestcl is None:
        raise error.Abort(_('cannot find latest p4 changelist number'))
    ui.debug('%r (current client) %r (requested client) '
             '%r (latest imported)\n' % (oldcl, latestcl, lastimportedcl))
    if oldcl != lastimportedcl:
        # Consider running p4fastimport from here.
        raise error.Abort(_('repository must contain most recent changes'))

    ui.note(_('latest change list number %s\n') % latestcl)
    # p4filelogs: the full list of filelogs included in newclient's view
    # newp4filelogs: files that need new entries in the hg filelog
    # reusep4filelogs: files that do not need new entries in the hg filelog
    p4filelogs = p4.get_filelogs_at_cl(newclient, latestcl)
    p4filelogs = sorted(p4filelogs)
    newp4filelogs, reusep4filelogs = importer.get_filelogs_to_sync(
        ui, newclient, repo, p1ctx, startcl - 1, p4filelogs)

    if not newp4filelogs and not reusep4filelogs:
        raise error.Abort(_('nothing to import.'))

    # Sync import.
    with repo.wlock(), repo.lock():
        ui.note(_('running a sync import.\n'))
        count = 0
        fileinfo = {}
        largefileslist = []
        tr = repo.transaction('syncimport')
        try:
            for p4fl, localname in newp4filelogs:
                bfi = importer.SyncFileImporter(
                    ui, repo, newclient, latestcl, p4fl,
                    localfile=localname)
                # Create the hg filelog entry.
                fileflags, largefiles, oldtiprev, newtiprev = bfi.create(tr)
                fileinfo[p4fl.depotfile] = {
                    'flags': fileflags,
                    'localname': bfi.relpath,
                    'baserev': oldtiprev,
                }
                largefileslist.extend(largefiles)
                count += 1

            # Generate manifest and changelog.
            clog = importer.SyncChangeManifestImporter(
                ui, repo, newclient, latestcl, p1ctx=p1ctx)
            revisions = []
            for cl, hgnode in clog.creategen(tr, fileinfo, reusep4filelogs):
                revisions.append((cl, hex(hgnode)))

            if opts.get('bookmark'):
                ui.note(_('writing bookmark\n'))
                writebookmark(tr, repo, revisions, opts['bookmark'])

            if ui.config('p4fastimport', 'lfsmetadata', None) is not None:
                ui.note(_('writing lfs metadata to sqlite\n'))
                writelfsmetadata(largefileslist, revisions,
                                 ui.config('p4fastimport', 'lfsmetadata',
                                           None))
            tr.close()
            ui.note(_('1 revision, %d file(s) imported.\n') % count)
        finally:
            tr.release()

@command('debugscanlfs',
         [('C', 'client', '', _('Perforce client to reverse lookup')),
          ('r', 'rev', '.', _('display LFS files in REV')),
          ('A', 'all', None, _('display LFS files in all revisions'))])
def debugscanlfs(ui, repo, **opts):
    '''List LFS pointer files found in the repository.'''
    lfs = extensions.find('lfs')

    def display(repo, filename, flog, rev):
        filenode = flog.node(rev)
        rawtext = flog.revision(filenode, raw=True)
        ptr = lfs.pointer.deserialize(rawtext)
        linkrev = flog.linkrev(rev)
        cl = int(repo[linkrev].extra()['p4changelist'])
        return _('%d %s %s %d %s\n') % (
            flog.linkrev(rev), hex(filenode), ptr.oid(), cl, filename)

    def batchfnmap(repo, client, infos):
        for filename, flog, rev in infos:
            whereinfo = p4.parse_where(client, filename)
            yield 1, display(repo, whereinfo['depotFile'], flog, rev)

    client = opts.get('client', None)
    todisplay = []
    if opts.get('all'):
        prefix, suffix = "data/", ".i"
        plen, slen = len(prefix), len(suffix)
        for fn, b, size in repo.store.datafiles():
            if size == 0 or fn[-slen:] != suffix or fn[:plen] != prefix:
                continue
            fn = fn[plen:-slen]
            flog = repo.file(fn)
            for rev in range(0, len(flog)):
                flags = flog.flags(rev)
                if bool(flags & revlog.REVIDX_EXTSTORED):
                    if client:
                        todisplay.append((fn, flog, rev))
                    else:
                        ui.write(display(repo, fn, flog, rev))
    else:
        revisions = repo.set(opts.get('rev', '.'))
        for ctx in revisions:
            for fn in ctx.manifest():
                fctx = ctx[fn]
                flog = fctx.filelog()
                flags = flog.flags(fctx.filerev())
                if bool(flags & revlog.REVIDX_EXTSTORED):
                    if client:
                        todisplay.append((fn, flog, fctx.filerev()))
                    else:
                        ui.write(display(repo, fn, flog, fctx.filerev()))

    if todisplay:
        args = (repo, client)
        for i, s in runworker(ui, batchfnmap, args, todisplay):
            ui.write(s)