mirror of
https://github.com/facebook/sapling.git
synced 2024-10-11 17:27:53 +03:00
68030f1dd7
Summary:
During a repack we often want to access the ancestry for a bunch of nodes that may be ancestors of each other. Using getancestors for that results in a lot of duplicated work: getancestors(0) returns [0], getancestors(1) returns [0, 1], getancestors(2) returns [0, 1, 2], etc., which is O(n^2) overall.

This patch adds an optional `known` argument to getancestors that lets the caller tell getancestors which ancestors it is already aware of. getancestors can then short circuit when it reaches that level, which avoids the duplicate work during repack.

Test Plan: Ran treemanifest repack in our large repo and verified it made progress over the nodes much faster than before.

Reviewers: #mercurial, quark

Reviewed By: quark

Subscribers: mjpieters

Differential Revision: https://phabricator.intern.facebook.com/D4901308

Signature: t1:4901308:1492640896:27d4a90c2993cd1fefbd8dbc211f2ec181178bce
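
A minimal sketch of the caller pattern this change enables (illustrative only: `store` stands for any store exposing the getancestors API below, and `nodes_to_repack` is a hypothetical worklist):

    # Accumulate 'known' across calls so each getancestors walk stops as
    # soon as it reaches ancestors a previous call already returned,
    # avoiding the O(n^2) re-walking described above.
    known = set()
    ancestors = {}
    for filename, node in nodes_to_repack:
        new = store.getancestors(filename, node, known=known)
        ancestors.update(new)
        known.update(new)  # the returned dict is keyed by ancestor node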
386 lines
13 KiB
Python

import hashlib, struct

from mercurial import util
from mercurial.node import hex, nullid

import basepack, constants

# (filename hash, offset, size)
INDEXFORMAT = '!20sQQ'
INDEXENTRYLENGTH = 36
NODELENGTH = 20

# (node, p1, p2, linknode)
PACKFORMAT = "!20s20s20s20sH"
PACKENTRYLENGTH = 82

OFFSETSIZE = 4

INDEXSUFFIX = '.histidx'
PACKSUFFIX = '.histpack'

ANC_NODE = 0
ANC_P1NODE = 1
ANC_P2NODE = 2
ANC_LINKNODE = 3
ANC_COPYFROM = 4

class historypackstore(basepack.basepackstore):
    INDEXSUFFIX = INDEXSUFFIX
    PACKSUFFIX = PACKSUFFIX

    def getpack(self, path):
        return historypack(path)

    def getancestors(self, name, node, known=None):
        for pack in self.packs:
            try:
                return pack.getancestors(name, node, known=known)
            except KeyError:
                pass

        for pack in self.refresh():
            try:
                return pack.getancestors(name, node, known=known)
            except KeyError:
                pass

        raise KeyError((name, node))

    def add(self, filename, node, p1, p2, linknode, copyfrom):
        raise RuntimeError("cannot add to historypackstore (%s:%s)"
                           % (filename, hex(node)))

class historypack(basepack.basepack):
    INDEXSUFFIX = INDEXSUFFIX
    PACKSUFFIX = PACKSUFFIX

    def getmissing(self, keys):
        missing = []
        for name, node in keys:
            try:
                section = self._findsection(name)
                self._findnode(section, node)
            except KeyError:
                missing.append((name, node))

        return missing

    def getancestors(self, name, node, known=None):
        """Returns as many ancestors as we're aware of.

        return value: {
            node: (p1, p2, linknode, copyfrom),
            ...
        }
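
        If known is provided, the walk stops once it reaches nodes the
        caller is already aware of, so repeated calls over a chain of
        ancestors (e.g. during repack) avoid redoing each other's work.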
        """
        if known is None:
            known = set()
        if node in known:
            return []

        filename, offset, size = self._findsection(name)
        pending = set((node,))
        data = self._data[offset:offset + size]
        results = {}
        o = 0
        while o < len(data):
            if not pending:
                break
            entry = struct.unpack(PACKFORMAT, data[o:o + PACKENTRYLENGTH])
            o += PACKENTRYLENGTH
            copyfrom = None
            copyfromlen = entry[ANC_COPYFROM]
            if copyfromlen != 0:
                copyfrom = data[o:o + copyfromlen]
                o += copyfromlen

            ancnode = entry[ANC_NODE]
            if ancnode in pending:
                pending.remove(ancnode)
                p1node = entry[ANC_P1NODE]
                p2node = entry[ANC_P2NODE]
                if p1node != nullid and p1node not in known:
                    pending.add(p1node)
                if p2node != nullid and p2node not in known:
                    pending.add(p2node)

                result = (p1node,
                          p2node,
                          entry[ANC_LINKNODE],
                          copyfrom)
                results[ancnode] = result

            self._pagedin += PACKENTRYLENGTH

        # If we've read a lot of data from the mmap, free some memory.
        self.freememory()

        if not results:
            raise KeyError((name, node))
        return results

    def add(self, filename, node, p1, p2, linknode, copyfrom):
        raise RuntimeError("cannot add to historypack (%s:%s)" %
                           (filename, hex(node)))

    def _findnode(self, section, node):
        name, offset, size = section
        data = self._data
        o = offset
        while o < offset + size:
            entry = struct.unpack(PACKFORMAT,
                                  data[o:o + PACKENTRYLENGTH])
            o += PACKENTRYLENGTH

            if entry[0] == node:
                copyfrom = None
                copyfromlen = entry[ANC_COPYFROM]
                if copyfromlen != 0:
                    copyfrom = data[o:o + copyfromlen]

                # If we've read a lot of data from the mmap, free some memory.
                self._pagedin += o - offset
                self.freememory()
                return (entry[ANC_P1NODE],
                        entry[ANC_P2NODE],
                        entry[ANC_LINKNODE],
                        copyfrom)

            # Skip the variable-length copyfrom string before the next entry.
            o += entry[ANC_COPYFROM]

        raise KeyError("unable to find history for %s:%s" % (name, hex(node)))

    def _findsection(self, name):
        params = self.params
        namehash = hashlib.sha1(name).digest()
        fanoutkey = struct.unpack(params.fanoutstruct,
                                  namehash[:params.fanoutprefix])[0]
        fanout = self._fanouttable

        start = fanout[fanoutkey] + params.indexstart
        # The fanout table only says where this hash prefix starts; the next
        # non-empty fanout entry (or the end of the index) bounds it.
        for i in xrange(fanoutkey + 1, params.fanoutcount):
            end = fanout[i] + params.indexstart
            if end != start:
                break
        else:
            end = self.indexsize

        # Bisect between start and end to find node
        startnode = self._index[start:start + NODELENGTH]
        endnode = self._index[end:end + NODELENGTH]
        if startnode == namehash:
            entry = self._index[start:start + INDEXENTRYLENGTH]
        elif endnode == namehash:
            entry = self._index[end:end + INDEXENTRYLENGTH]
        else:
            while start < end - INDEXENTRYLENGTH:
                mid = start + (end - start) / 2
                # Align mid to an index entry boundary.
                mid = mid - ((mid - params.indexstart) % INDEXENTRYLENGTH)
                midnode = self._index[mid:mid + NODELENGTH]
                if midnode == namehash:
                    entry = self._index[mid:mid + INDEXENTRYLENGTH]
                    break
                if namehash > midnode:
                    start = mid
                    startnode = midnode
                elif namehash < midnode:
                    end = mid
                    endnode = midnode
            else:
                raise KeyError(name)

        filenamehash, offset, size = struct.unpack(INDEXFORMAT, entry)
        filenamelength = struct.unpack('!H', self._data[offset:offset +
                                                    constants.FILENAMESIZE])[0]
        offset += constants.FILENAMESIZE

        actualname = self._data[offset:offset + filenamelength]
        offset += filenamelength

        if name != actualname:
            raise KeyError("found file name %s when looking for %s" %
                           (actualname, name))

        # Skip past the revision count.
        struct.unpack('!I', self._data[offset:offset + OFFSETSIZE])[0]
        offset += OFFSETSIZE

        return (name, offset, size - constants.FILENAMESIZE - filenamelength
                - OFFSETSIZE)

    def markledger(self, ledger):
        for filename, node in self:
            ledger.markhistoryentry(self, filename, node)

    def cleanup(self, ledger):
        entries = ledger.sources.get(self, [])
        allkeys = set(self)
        repackedkeys = set((e.filename, e.node) for e in entries if
                           e.historyrepacked)

        if len(allkeys - repackedkeys) == 0:
            if self.path not in ledger.created:
                util.unlinkpath(self.indexpath, ignoremissing=True)
                util.unlinkpath(self.packpath, ignoremissing=True)

    def __iter__(self):
        for f, n, x, x, x, x in self.iterentries():
            yield f, n

    def iterentries(self):
        # Start at 1 to skip the header
        offset = 1
        while offset < self.datasize:
            data = self._data
            # <2 byte len> + <filename>
            filenamelen = struct.unpack('!H', data[offset:offset +
                                                   constants.FILENAMESIZE])[0]
            offset += constants.FILENAMESIZE
            filename = data[offset:offset + filenamelen]
            offset += filenamelen

            revcount = struct.unpack('!I', data[offset:offset +
                                                OFFSETSIZE])[0]
            offset += OFFSETSIZE

            for i in xrange(revcount):
                entry = struct.unpack(PACKFORMAT, data[offset:offset +
                                                       PACKENTRYLENGTH])
                offset += PACKENTRYLENGTH

                copyfrom = data[offset:offset + entry[ANC_COPYFROM]]
                offset += entry[ANC_COPYFROM]

                yield (filename, entry[ANC_NODE], entry[ANC_P1NODE],
                       entry[ANC_P2NODE], entry[ANC_LINKNODE], copyfrom)

                self._pagedin += PACKENTRYLENGTH

            # If we've read a lot of data from the mmap, free some memory.
            self.freememory()

class mutablehistorypack(basepack.mutablebasepack):
    """A class for constructing and serializing a histpack file and index.

    A history pack is a pair of files that contain the revision history for
    various file revisions in Mercurial. It contains only revision history
    (like parent pointers and linknodes), not any revision content
    information.

    It consists of two files, with the following format:

    .histpack
        The pack itself is a series of file revisions with some basic header
        information on each.

        histpack = <version: 1 byte>
                   [<filesection>,...]
        filesection = <filename len: 2 byte unsigned int>
                      <filename>
                      <revision count: 4 byte unsigned int>
                      [<revision>,...]
        revision = <node: 20 byte>
                   <p1node: 20 byte>
                   <p2node: 20 byte>
                   <linknode: 20 byte>
                   <copyfromlen: 2 byte>
                   <copyfrom>

        The revisions within each filesection are stored in topological order
        (newest first). If a given entry has a parent from another file (a
        copy), then p1node is the node from the other file, and copyfrom is
        the filepath of the other file.

    .histidx
        The index file provides a mapping from filename to the file section
        in the histpack. It consists of two parts, the fanout and the index.

        The index is a list of index entries, sorted by filename hash (one
        per file section in the pack). Each entry has:

        - node (The 20 byte hash of the filename)
        - pack entry offset (The location of this file section in the
          histpack)
        - pack content size (The on-disk length of this file section's pack
          data)

        The fanout is a quick lookup table to reduce the number of steps for
        bisecting the index. It is a series of 4 byte pointers to positions
        within the index. It has 2^16 entries, which corresponds to hash
        prefixes [00, 01, 02,..., FD, FE, FF]. Example: the pointer in slot
        4F points to the index position of the first revision whose node
        starts with 4F. This saves log(2^16) = 16 bisect steps.

        histidx = <fanouttable>
                  <index>
        fanouttable = [<index offset: 4 byte unsigned int>,...] (2^16 entries)
        index = [<index entry>,...]
        indexentry = <node: 20 byte>
                     <pack file section offset: 8 byte unsigned int>
                     <pack file section size: 8 byte unsigned int>
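
        Worked example of the entry sizes these formats imply: a revision
        entry is 20 * 4 + 2 = 82 bytes (PACKENTRYLENGTH) before its copyfrom
        text, and an index entry is 20 + 8 + 8 = 36 bytes (INDEXENTRYLENGTH).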
    """
    INDEXSUFFIX = INDEXSUFFIX
    PACKSUFFIX = PACKSUFFIX
    INDEXENTRYLENGTH = INDEXENTRYLENGTH

    def __init__(self, ui, packpath):
        super(mutablehistorypack, self).__init__(ui, packpath)
        self.pastfiles = {}
        self.currentfile = None
        self.currententries = []

    def add(self, filename, node, p1, p2, linknode, copyfrom):
        if filename != self.currentfile:
            if filename in self.pastfiles:
                raise RuntimeError("cannot add file node after another file's "
                                   "nodes have been added")
            if self.currentfile is not None:
                self._writependingsection()

            self.currentfile = filename
            self.currententries = []

        copyfrom = copyfrom or ''
        copyfromlen = struct.pack('!H', len(copyfrom))
        self.currententries.append((node, p1, p2, linknode, copyfromlen,
                                    copyfrom))

    def _writependingsection(self):
        filename = self.currentfile
        sectionstart = self.packfp.tell()

        # Write the file section header
        self.writeraw("%s%s%s" % (
            struct.pack('!H', len(filename)),
            filename,
            struct.pack('!I', len(self.currententries)),
        ))
        sectionlen = constants.FILENAMESIZE + len(filename) + 4

        # Write the file section content
        rawdata = ''.join('%s%s%s%s%s%s' % e for e in self.currententries)
        sectionlen += len(rawdata)

        self.writeraw(rawdata)

        self.pastfiles[filename] = (sectionstart, sectionlen)
        node = hashlib.sha1(filename).digest()
        self.entries[node] = node

    def close(self, ledger=None):
        if self._closed:
            return

        if self.currentfile is not None:
            self._writependingsection()

        return super(mutablehistorypack, self).close(ledger=ledger)

    def createindex(self, nodelocations):
        files = ((hashlib.sha1(filename).digest(), offset, size)
                 for filename, (offset, size) in self.pastfiles.iteritems())
        files = sorted(files)

        rawindex = ""
        for namehash, offset, size in files:
            rawindex += struct.pack(INDEXFORMAT, namehash, offset, size)

        return rawindex
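
# Illustrative usage sketch ('ui', 'packdir', and 'history_entries' are
# assumed to be supplied by the caller):
#
#     pack = mutablehistorypack(ui, packdir)
#     for filename, node, p1, p2, linknode, copyfrom in history_entries:
#         pack.add(filename, node, p1, p2, linknode, copyfrom)
#     pack.close()
#
# add() buffers entries for the current file and flushes a file section when
# the filename changes, so all of a file's nodes must be added consecutively.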