sapling/edenscm/hgext/remotefilelog/remotefilelogserver.py
Aida Getoeva 6f4459dd42 don't skip metadata while creating data wirepacks
Summary:
Currently we skip metadata when constructing data wirepacks.
We need to include copy metadata in the data packfiles, because the client expects the base text to contain copy metadata, since the delta base on the server also contains it.

It is also needed for future filenode validation.

Differential Revision: D14851678

fbshipit-source-id: 1a3f79dc2565cdb864bee2400d331ae3a7c3751b
2019-04-09 10:37:01 -07:00

# remotefilelogserver.py - server logic for a remotefilelog server
#
# Copyright 2013 Facebook, Inc.
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from __future__ import absolute_import

import errno
import json
import os
import stat
import time

from edenscm.mercurial import (
changegroup,
changelog,
context,
error,
exchange,
extensions,
match,
progress,
sshserver,
store,
util,
wireproto,
)
from edenscm.mercurial.extensions import wrapfunction
from edenscm.mercurial.hgweb import protocol as httpprotocol
from edenscm.mercurial.i18n import _
from edenscm.mercurial.node import bin, hex, nullid, nullrev
from . import constants, lz4wrapper, shallowrepo, shallowutil, wirepack

try:
xrange(0)
except NameError:
xrange = range
try:
from edenscm.mercurial import streamclone
streamclone._walkstreamfiles
hasstreamclone = True
except Exception:
hasstreamclone = False

onetime = False


def onetimesetup(ui):
    """Configures the wireprotocol for both clients and servers."""
global onetime
if onetime:
return
onetime = True
# support file content requests
wireproto.commands["getflogheads"] = (getflogheads, "path")
wireproto.commands["getfiles"] = (getfiles, "")
wireproto.commands["getfile"] = (getfile, "file node")
wireproto.commands["getpackv1"] = (getpack, "*")
class streamstate(object):
match = None
shallowremote = False
noflatmf = False
state = streamstate()
def stream_out_shallow(repo, proto, other):
includepattern = None
excludepattern = None
raw = other.get("includepattern")
if raw:
includepattern = raw.split("\0")
raw = other.get("excludepattern")
if raw:
excludepattern = raw.split("\0")
oldshallow = state.shallowremote
oldmatch = state.match
oldnoflatmf = state.noflatmf
try:
state.shallowremote = True
state.match = match.always(repo.root, "")
state.noflatmf = other.get("noflatmanifest") == "True"
if includepattern or excludepattern:
state.match = match.match(
repo.root, "", None, includepattern, excludepattern
)
streamres = wireproto.stream(repo, proto)
# Force the first value to execute, so the file list is computed
# within the try/finally scope
first = next(streamres.gen)
second = next(streamres.gen)
def gen():
yield first
yield second
for value in streamres.gen:
yield value
return wireproto.streamres(gen())
finally:
state.shallowremote = oldshallow
state.match = oldmatch
state.noflatmf = oldnoflatmf
wireproto.commands["stream_out_shallow"] = (stream_out_shallow, "*")
# don't clone filelogs to shallow clients
def _walkstreamfiles(orig, repo):
if state.shallowremote:
# if we are shallow ourselves, stream our local commits
if shallowrepo.requirement in repo.requirements:
striplen = len(repo.store.path) + 1
readdir = repo.store.rawvfs.readdir
visit = [os.path.join(repo.store.path, "data")]
while visit:
p = visit.pop()
for f, kind, st in readdir(p, stat=True):
fp = p + "/" + f
if kind == stat.S_IFREG:
if not fp.endswith(".i") and not fp.endswith(".d"):
n = util.pconvert(fp[striplen:])
yield (store.decodedir(n), n, st.st_size)
if kind == stat.S_IFDIR:
visit.append(fp)
shallowtrees = repo.ui.configbool("remotefilelog", "shallowtrees", False)
if "treemanifest" in repo.requirements and not shallowtrees:
for (u, e, s) in repo.store.datafiles():
if u.startswith("meta/") and (u.endswith(".i") or u.endswith(".d")):
yield (u, e, s)
# Return .d and .i files that do not match the shallow pattern
match = state.match
if match and not match.always():
for (u, e, s) in repo.store.datafiles():
f = u[5:-2] # trim data/... and .i/.d
if not state.match(f):
yield (u, e, s)
for x in repo.store.topfiles():
if shallowtrees and x[0][:15] == "00manifesttree.":
continue
if state.noflatmf and x[0][:11] == "00manifest.":
continue
yield x
elif shallowrepo.requirement in repo.requirements:
# don't allow cloning from a shallow repo to a full repo
# since it would require fetching every version of every
# file in order to create the revlogs.
raise error.Abort(_("Cannot clone from a shallow repo " "to a full repo."))
else:
for x in orig(repo):
yield x
# This function moved in Mercurial 3.5 and 3.6
if hasstreamclone:
wrapfunction(streamclone, "_walkstreamfiles", _walkstreamfiles)
elif util.safehasattr(wireproto, "_walkstreamfiles"):
wrapfunction(wireproto, "_walkstreamfiles", _walkstreamfiles)
else:
wrapfunction(exchange, "_walkstreamfiles", _walkstreamfiles)
    # We no longer use the getbundle_shallow command, but we must still
    # support it for migration purposes
def getbundleshallow(repo, proto, others):
bundlecaps = others.get("bundlecaps", "")
bundlecaps = set(bundlecaps.split(","))
bundlecaps.add("remotefilelog")
others["bundlecaps"] = ",".join(bundlecaps)
return wireproto.commands["getbundle"][0](repo, proto, others)
wireproto.commands["getbundle_shallow"] = (getbundleshallow, "*")
# expose remotefilelog capabilities
def _capabilities(orig, repo, proto):
caps = orig(repo, proto)
if shallowrepo.requirement in repo.requirements or ui.configbool(
"remotefilelog", "server"
):
if isinstance(proto, sshserver.sshserver):
# legacy getfiles method which only works over ssh
caps.append(shallowrepo.requirement)
caps.append("getflogheads")
caps.append("getfile")
return caps
if util.safehasattr(wireproto, "_capabilities"):
wrapfunction(wireproto, "_capabilities", _capabilities)
else:
wrapfunction(wireproto, "capabilities", _capabilities)
def _adjustlinkrev(orig, self, *args, **kwargs):
# When generating file blobs, taking the real path is too slow on large
# repos, so force it to just return the linkrev directly.
repo = self._repo
if util.safehasattr(repo, "forcelinkrev") and repo.forcelinkrev:
return self._filelog.linkrev(self._filelog.rev(self._filenode))
return orig(self, *args, **kwargs)
wrapfunction(context.basefilectx, "_adjustlinkrev", _adjustlinkrev)
def _iscmd(orig, cmd):
if cmd == "getfiles":
return False
return orig(cmd)
wrapfunction(httpprotocol, "iscmd", _iscmd)


class trivialserializer(object):
    """Trivial serializer for the remotefilelog cache"""

    @staticmethod
def serialize(value):
return value

    @staticmethod
def deserialize(raw):
return raw


def readvalue(repo, path, node):
    filectx = repo.filectx(path, fileid=node)
    if filectx.node() == nullid:
        # The linkrev may point past the end of a stale in-memory changelog
        # (e.g. another process committed since it was loaded), so reload the
        # changelog and retry.
        repo.changelog = changelog.changelog(repo.svfs)
        filectx = repo.filectx(path, fileid=node)
    return lz4wrapper.lz4compresshc(createfileblob(filectx))


def _loadfileblob(repo, path, node):
    """Return the cached file blob for the given path and node, creating and
    caching it (on disk or via simplecache) if necessary."""
    usesimplecache = repo.ui.configbool("remotefilelog", "simplecacheserverstore")
    cachepath = repo.ui.config("remotefilelog", "servercachepath")
    if cachepath and usesimplecache:
        raise error.Abort(
            "remotefilelog.servercachepath and "
            "remotefilelog.simplecacheserverstore can't both be enabled"
        )
key = os.path.join(path, hex(node))
# simplecache store for remotefilelogcache
if usesimplecache:
try:
simplecache = extensions.find("simplecache")
except KeyError:
raise error.Abort(
"simplecache extension must be enabled with remotefilelog.simplecacheserverstore enabled"
)
        # this function doesn't raise exceptions
text = simplecache.cacheget(key, trivialserializer, repo.ui)
if text:
return text
else:
text = readvalue(repo, path, node)
            # this function doesn't raise exceptions
simplecache.cacheset(key, text, trivialserializer, repo.ui)
return text
# on disk store for remotefilelogcache
if not cachepath:
cachepath = os.path.join(repo.path, "remotefilelogcache")
filecachepath = os.path.join(cachepath, key)
if not os.path.exists(filecachepath) or os.path.getsize(filecachepath) == 0:
text = readvalue(repo, path, node)
# everything should be user & group read/writable
oldumask = os.umask(0o002)
try:
dirname = os.path.dirname(filecachepath)
if not os.path.exists(dirname):
try:
os.makedirs(dirname)
except OSError as ex:
if ex.errno != errno.EEXIST:
raise
f = None
try:
f = util.atomictempfile(filecachepath, "w")
f.write(text)
except (IOError, OSError):
# Don't abort if the user only has permission to read,
# and not write.
pass
finally:
if f:
f.close()
finally:
os.umask(oldumask)
else:
with util.posixfile(filecachepath, "r") as f:
text = f.read()
return text


def getflogheads(repo, proto, path):
    """A server api for requesting a filelog's heads"""
flog = repo.file(path)
heads = flog.heads()
return "\n".join((hex(head) for head in heads if head != nullid))


def getfile(repo, proto, file, node):
"""A server api for requesting a particular version of a file. Can be used
in batches to request many files at once. The return protocol is:
<errorcode>\0<data/errormsg> where <errorcode> is 0 for success or
non-zero for an error.
data is a compressed blob with revlog flag and ancestors information. See
createfileblob for its content.
"""
if shallowrepo.requirement in repo.requirements:
return "1\0" + _("cannot fetch remote files from shallow repo")
node = bin(node.strip())
if node == nullid:
return "0\0"
return "0\0" + _loadfileblob(repo, file, node)


def getfiles(repo, proto):
"""A server api for requesting particular versions of particular files.
"""
if shallowrepo.requirement in repo.requirements:
raise error.Abort(_("cannot fetch remote files from shallow repo"))
if not isinstance(proto, sshserver.sshserver):
raise error.Abort(_("cannot fetch remote files over non-ssh protocol"))
def streamer():
fin = proto.fin
args = []
responselen = 0
start_time = time.time()
while True:
request = fin.readline()[:-1]
if not request:
break
hexnode = request[:40]
node = bin(hexnode)
if node == nullid:
yield "0\n"
continue
path = request[40:]
args.append([hexnode, path])
text = _loadfileblob(repo, path, node)
response = "%d\n%s" % (len(text), text)
responselen += len(response)
yield response
# it would be better to only flush after processing a whole batch
# but currently we don't know if there are more requests coming
proto.fout.flush()
if repo.ui.configbool("wireproto", "loggetfiles"):
try:
serializedargs = json.dumps(args)
except Exception:
serializedargs = "Failed to serialize arguments"
kwargs = {}
try:
clienttelemetry = extensions.find("clienttelemetry")
kwargs = clienttelemetry.getclienttelemetry(repo)
except KeyError:
pass
reponame = repo.ui.config("common", "reponame", "unknown")
kwargs["reponame"] = reponame
repo.ui.log(
"wireproto_requests",
"",
command="getfiles",
args=serializedargs,
responselen=responselen,
duration=int((time.time() - start_time) * 1000),
**kwargs
)
return wireproto.streamres(streamer())
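

# A hedged illustration of the getfiles line protocol from the client side.
# This hypothetical helper is not used by the server; `pipe` stands for the
# ssh peer's stdio and is assumed to expose write/flush/readline/read.
def _requestfilessketch(pipe, fetches):
    for hexnode, path in fetches:
        pipe.write("%s%s\n" % (hexnode, path))
    pipe.write("\n")  # an empty request line ends the batch
    pipe.flush()
    blobs = []
    for _ in fetches:
        # each response is "<size>\n" followed by <size> bytes of blob
        size = int(pipe.readline()[:-1])
        blobs.append(pipe.read(size))
    return blobs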


def createfileblob(filectx):
"""
format:
v0:
str(len(rawtext)) + '\0' + rawtext + ancestortext
v1:
'v1' + '\n' + metalist + '\0' + rawtext + ancestortext
metalist := metalist + '\n' + meta | meta
meta := sizemeta | flagmeta
sizemeta := METAKEYSIZE + str(len(rawtext))
flagmeta := METAKEYFLAG + str(flag)
note: sizemeta must exist. METAKEYFLAG and METAKEYSIZE must have a
length of 1.
"""
flog = filectx.filelog()
frev = filectx.filerev()
revlogflags = flog.flags(frev)
if revlogflags == 0:
# normal files
text = filectx.data()
else:
# lfs, read raw revision data
text = flog.revision(frev, raw=True)
repo = filectx._repo
ancestors = [filectx]
try:
repo.forcelinkrev = True
        ancestors.extend(filectx.ancestors())
ancestortext = ""
for ancestorctx in ancestors:
parents = ancestorctx.parents()
p1 = nullid
p2 = nullid
if len(parents) > 0:
p1 = parents[0].filenode()
if len(parents) > 1:
p2 = parents[1].filenode()
copyname = ""
rename = ancestorctx.renamed()
if rename:
copyname = rename[0]
linknode = ancestorctx.node()
ancestortext += "%s%s%s%s%s\0" % (
ancestorctx.filenode(),
p1,
p2,
linknode,
copyname,
)
finally:
repo.forcelinkrev = False
header = shallowutil.buildfileblobheader(len(text), revlogflags)
return "%s\0%s%s" % (header, text, ancestortext)


def gcserver(ui, repo):
if not repo.ui.configbool("remotefilelog", "server"):
return
neededfiles = set()
heads = repo.revs("heads(tip~25000:) - null")
cachepath = repo.localvfs.join("remotefilelogcache")
for head in heads:
mf = repo[head].manifest()
for filename, filenode in mf.iteritems():
filecachepath = os.path.join(cachepath, filename, hex(filenode))
neededfiles.add(filecachepath)
# delete unneeded older files
days = repo.ui.configint("remotefilelog", "serverexpiration", 30)
expiration = time.time() - (days * 24 * 60 * 60)
with progress.bar(ui, _("removing old server cache"), "files") as prog:
for root, dirs, files in os.walk(cachepath):
for file in files:
filepath = os.path.join(root, file)
prog.value += 1
if filepath in neededfiles:
continue
                # use a name that doesn't shadow the stat module imported above
                st = os.stat(filepath)
                if st.st_mtime < expiration:
os.remove(filepath)


def getpack(repo, proto, args):
"""A server api for requesting a pack of file information.
"""
if shallowrepo.requirement in repo.requirements:
raise error.Abort(_("cannot fetch remote files from shallow repo"))
if not isinstance(proto, sshserver.sshserver):
raise error.Abort(_("cannot fetch remote files over non-ssh protocol"))
def streamer():
"""Request format:
[<filerequest>,...]\0\0
filerequest = <filename len: 2 byte><filename><count: 4 byte>
[<node: 20 byte>,...]
Response format:
[<fileresponse>,...]<10 null bytes>
fileresponse = <filename len: 2 byte><filename><history><deltas>
history = <count: 4 byte>[<history entry>,...]
historyentry = <node: 20 byte><p1: 20 byte><p2: 20 byte>
<linknode: 20 byte><copyfrom len: 2 byte><copyfrom>
deltas = <count: 4 byte>[<delta entry>,...]
deltaentry = <node: 20 byte><deltabase: 20 byte>
<delta len: 8 byte><delta>
"""
files = _receivepackrequest(proto.fin)
# Sort the files by name, so we provide deterministic results
for filename, nodes in sorted(files.iteritems()):
fl = repo.file(filename)
# Compute history
history = []
for rev in fl.ancestors(list(fl.rev(n) for n in nodes), inclusive=True):
                _, _, _, _, linkrev, p1, p2, node = fl.index[rev]
copyfrom = ""
p1node = fl.node(p1)
p2node = fl.node(p2)
linknode = repo.changelog.node(linkrev)
if p1node == nullid:
copydata = fl.renamed(node)
if copydata:
copyfrom, copynode = copydata
p1node = copynode
history.append((node, p1node, p2node, linknode, copyfrom))
# Scan and send deltas
chain = _getdeltachain(fl, nodes, -1)
for chunk in wirepack.sendpackpart(filename, history, chain):
yield chunk
yield wirepack.closepart()
proto.fout.flush()
return wireproto.streamres(streamer())


def _receivepackrequest(stream):
files = {}
while True:
filenamelen = shallowutil.readunpack(stream, constants.FILENAMESTRUCT)[0]
if filenamelen == 0:
break
filename = shallowutil.readexactly(stream, filenamelen)
nodecount = shallowutil.readunpack(stream, constants.PACKREQUESTCOUNTSTRUCT)[0]
# Read N nodes
nodes = shallowutil.readexactly(stream, constants.NODESIZE * nodecount)
nodes = set(
nodes[i : i + constants.NODESIZE]
for i in xrange(0, len(nodes), constants.NODESIZE)
)
files[filename] = nodes
return files
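

# A hedged client-side counterpart to _receivepackrequest (hypothetical
# helper, shown only to illustrate the request wire format): it serializes a
# {filename: nodes} mapping with the same struct formats read above.
def _buildpackrequestsketch(files):
    import struct

    chunks = []
    for filename, nodes in sorted(files.iteritems()):
        chunks.append(struct.pack(constants.FILENAMESTRUCT, len(filename)))
        chunks.append(filename)
        chunks.append(struct.pack(constants.PACKREQUESTCOUNTSTRUCT, len(nodes)))
        chunks.extend(nodes)
    # a zero-length filename terminates the request
    chunks.append(struct.pack(constants.FILENAMESTRUCT, 0))
    return "".join(chunks)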


def _getdeltachain(fl, nodes, stophint):
    """Produces a chain of deltas that includes each of the given nodes.

    `stophint` - The changeset rev number to stop at. If it's set to >= 0,
    we will return not only the deltas for the requested nodes, but also all
    necessary deltas in their delta chains, as long as the deltas have link
    revs >= the stophint. This allows us to return an approximately minimal
    delta chain when the user performs a pull. If `stophint` is set to -1,
    all nodes will return full texts.
    """
chain = []
seen = set()
for node in nodes:
startrev = fl.rev(node)
cur = startrev
while True:
if cur in seen:
break
            # note: this rebinds `node` from the requested node to the node of
            # `cur`, which is the entry recorded in the chain below
            start, length, size, base, linkrev, p1, p2, node = fl.index[cur]
if linkrev < stophint and cur != startrev:
break
# Return a full text if:
# - the caller requested it (via stophint == -1)
# - the revlog chain has ended (via base==null or base==node)
# - p1 is null, which can mean a copy in some situations
if stophint == -1 or base == nullrev or base == cur or p1 == nullrev:
delta = fl.revision(cur)
base = nullrev
else:
delta = fl._chunk(cur)
basenode = fl.node(base)
chain.append((node, basenode, delta))
seen.add(cur)
if base == nullrev:
break
cur = base
chain.reverse()
return chain
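

# A hedged sketch of consuming the chain produced above (hypothetical, client
# side): the chain is ordered base-first, and an entry whose delta base is
# nullid carries a full text, so subsequent deltas can be applied with mdiff.
def _applydeltachainsketch(chain):
    from edenscm.mercurial import mdiff

    text = ""
    for _node, basenode, delta in chain:
        if basenode == nullid:
            text = delta  # full text entry
        else:
            text = mdiff.patches(text, [delta])
    return text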