sapling/tests/test-fb-hgext-remotefilelog-datapack.py

#!/usr/bin/env python
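"""Tests for the remotefilelog datapack format.

Packs are written with mutabledatapack and read back through both the
pure-Python datapack reader and the C-backed fastdatapack reader; see the
TestCase subclasses at the bottom of this file.
"""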
from __future__ import absolute_import, print_function
import hashlib
import os
import random
import shutil
import stat
import struct
import tempfile
import time
import unittest
import mercurial.ui
import silenttestrunner
from hgext.remotefilelog import constants
from hgext.remotefilelog.basepack import (
LARGEFANOUTPREFIX,
SMALLFANOUTCUTOFF,
SMALLFANOUTPREFIX,
)
from hgext.remotefilelog.datapack import (
datapack,
datapackstore,
fastdatapack,
mutabledatapack,
)
from mercurial.node import nullid
try:
xrange(0)
except NameError:
xrange = range
class datapacktestsbase(object):
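    """Base class for datapack tests.

    Subclasses pick the pack reader to exercise (datapack or fastdatapack),
    whether the reader exposes pack params, and whether it is the C
    implementation.
    """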
def __init__(self, datapackreader, paramsavailable, iscdatapack):
self.datapackreader = datapackreader
self.iscdatapack = iscdatapack
self.paramsavailable = paramsavailable
def setUp(self):
self.tempdirs = []
def tearDown(self):
for d in self.tempdirs:
shutil.rmtree(d)
def makeTempDir(self):
tempdir = tempfile.mkdtemp()
self.tempdirs.append(tempdir)
return tempdir
def getHash(self, content):
return hashlib.sha1(content).digest()
def getFakeHash(self):
return "".join(chr(random.randint(0, 255)) for _ in range(20))
def createPack(self, revisions=None, packdir=None, version=0):
if revisions is None:
revisions = [("filename", self.getFakeHash(), nullid, "content")]
if packdir is None:
packdir = self.makeTempDir()
packer = mutabledatapack(mercurial.ui.ui(), packdir, version=version)
for args in revisions:
filename, node, base, content = args[0:4]
# meta is optional
meta = None
if len(args) > 4:
meta = args[4]
packer.add(filename, node, base, content, metadata=meta)
path = packer.close()
return self.datapackreader(path)
def _testAddSingle(self, content):
"""Test putting a simple blob into a pack and reading it out.
"""
filename = "foo"
node = self.getHash(content)
revisions = [(filename, node, nullid, content)]
pack = self.createPack(revisions)
if self.paramsavailable:
self.assertEquals(pack.params.fanoutprefix, SMALLFANOUTPREFIX)
chain = pack.getdeltachain(filename, node)
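        # Each chain entry is a (filename, node, deltabasename, deltabasenode,
        # delta) tuple; with a nullid base the delta at entry[4] is the full
        # content.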
self.assertEquals(content, chain[0][4])
    def testAddSingle(self):
        self._testAddSingle("abcdef")
    def testAddSingleEmpty(self):
        self._testAddSingle("")
def testAddMultiple(self):
"""Test putting multiple unrelated blobs into a pack and reading them
out.
"""
revisions = []
for i in range(10):
filename = "foo%s" % i
content = "abcdef%s" % i
node = self.getHash(content)
revisions.append((filename, node, self.getFakeHash(), content))
pack = self.createPack(revisions)
for filename, node, base, content in revisions:
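            # getdelta returns a (delta, deltabasename, deltabasenode,
            # metadata) tuple.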
entry = pack.getdelta(filename, node)
self.assertEquals((content, filename, base, {}), entry)
chain = pack.getdeltachain(filename, node)
self.assertEquals(content, chain[0][4])
def testAddDeltas(self):
"""Test putting multiple delta blobs into a pack and read the chain.
"""
revisions = []
filename = "foo"
lastnode = nullid
for i in range(10):
content = "abcdef%s" % i
node = self.getHash(content)
revisions.append((filename, node, lastnode, content))
lastnode = node
pack = self.createPack(revisions)
entry = pack.getdelta(filename, revisions[0][1])
realvalue = (revisions[0][3], filename, revisions[0][2], {})
self.assertEquals(entry, realvalue)
# Test that the chain for the final entry has all the others
chain = pack.getdeltachain(filename, node)
for i in range(10):
content = "abcdef%s" % i
self.assertEquals(content, chain[-i - 1][4])
def testPackMany(self):
"""Pack many related and unrelated objects.
"""
# Build a random pack file
revisions = []
blobs = {}
random.seed(0)
for i in range(100):
filename = "filename-%s" % i
filerevs = []
for j in range(random.randint(1, 100)):
content = "content-%s" % j
node = self.getHash(content)
lastnode = nullid
if len(filerevs) > 0:
lastnode = filerevs[random.randint(0, len(filerevs) - 1)]
filerevs.append(node)
blobs[(filename, node, lastnode)] = content
revisions.append((filename, node, lastnode, content))
pack = self.createPack(revisions)
# Verify the pack contents
for (filename, node, lastnode), content in sorted(blobs.iteritems()):
chain = pack.getdeltachain(filename, node)
for entry in chain:
expectedcontent = blobs[(entry[0], entry[1], entry[3])]
self.assertEquals(entry[4], expectedcontent)
def testPackMetadata(self):
revisions = []
for i in range(100):
filename = "%s.txt" % i
content = "put-something-here \n" * i
node = self.getHash(content)
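            # Include nonstandard keys ("Z", "_") to check that arbitrary
            # metadata round-trips through the pack.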
meta = {
constants.METAKEYFLAG: i ** 4,
constants.METAKEYSIZE: len(content),
"Z": "random_string",
"_": "\0" * i,
}
revisions.append((filename, node, nullid, content, meta))
pack = self.createPack(revisions, version=1)
for name, node, x, content, origmeta in revisions:
parsedmeta = pack.getmeta(name, node)
# flag == 0 should be optimized out
if origmeta[constants.METAKEYFLAG] == 0:
del origmeta[constants.METAKEYFLAG]
self.assertEquals(parsedmeta, origmeta)
def testPackMetadataThrows(self):
filename = "1"
content = "2"
node = self.getHash(content)
meta = {constants.METAKEYFLAG: 3}
revisions = [(filename, node, nullid, content, meta)]
try:
self.createPack(revisions, version=0)
self.assertTrue(False, "should throw if metadata is not supported")
except RuntimeError:
pass
def testGetMissing(self):
"""Test the getmissing() api.
"""
revisions = []
filename = "foo"
lastnode = nullid
for i in range(10):
content = "abcdef%s" % i
node = self.getHash(content)
revisions.append((filename, node, lastnode, content))
lastnode = node
pack = self.createPack(revisions)
missing = pack.getmissing([("foo", revisions[0][1])])
self.assertFalse(missing)
missing = pack.getmissing([("foo", revisions[0][1]), ("foo", revisions[1][1])])
self.assertFalse(missing)
fakenode = self.getFakeHash()
missing = pack.getmissing([("foo", revisions[0][1]), ("foo", fakenode)])
self.assertEquals(missing, [("foo", fakenode)])
def testAddThrows(self):
pack = self.createPack()
try:
pack.add("filename", nullid, "contents")
self.assertTrue(False, "datapack.add should throw")
except RuntimeError:
pass
def testBadVersionThrows(self):
pack = self.createPack()
path = pack.path + ".datapack"
        with open(path, "rb") as f:
raw = f.read()
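        # Overwrite the version byte (the first byte of the pack file) with an
        # unsupported value.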
raw = struct.pack("!B", 255) + raw[1:]
os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)
with open(path, "w+") as f:
f.write(raw)
try:
pack = self.datapackreader(pack.path)
self.assertTrue(False, "bad version number should have thrown")
except RuntimeError:
pass
def testMissingDeltabase(self):
fakenode = self.getFakeHash()
revisions = [("filename", fakenode, self.getFakeHash(), "content")]
pack = self.createPack(revisions)
chain = pack.getdeltachain("filename", fakenode)
self.assertEquals(len(chain), 1)
    def testLargePack(self):
        """Test creating and reading from a large pack with more than
        SMALLFANOUTCUTOFF entries, which causes it to use a 2^16 fanout table
        instead."""
revisions = []
blobs = {}
total = SMALLFANOUTCUTOFF + 1
for i in xrange(total):
filename = "filename-%s" % i
content = filename
node = self.getHash(content)
blobs[(filename, node)] = content
revisions.append((filename, node, nullid, content))
pack = self.createPack(revisions)
if self.paramsavailable:
self.assertEquals(pack.params.fanoutprefix, LARGEFANOUTPREFIX)
for (filename, node), content in blobs.iteritems():
actualcontent = pack.getdeltachain(filename, node)[0][4]
self.assertEquals(actualcontent, content)
def testPacksCache(self):
"""Test that we remember the most recent packs while fetching the delta
chain."""
packdir = self.makeTempDir()
deltachains = []
if self.iscdatapack:
numpacks = 200
else:
numpacks = 10
revisionsperpack = 100
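        # Each pack holds a single delta chain: every revision's base is the
        # node of the revision before it.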
for i in range(numpacks):
chain = []
revision = (str(i), self.getFakeHash(), nullid, "content")
for _ in range(revisionsperpack):
chain.append(revision)
revision = (str(i), self.getFakeHash(), revision[1], self.getFakeHash())
self.createPack(chain, packdir)
deltachains.append(chain)
class testdatapackstore(datapackstore):
# Ensures that we are not keeping everything in the cache.
            DEFAULTCACHESIZE = numpacks // 2
store = testdatapackstore(mercurial.ui.ui(), packdir, self.iscdatapack)
random.shuffle(deltachains)
for randomchain in deltachains:
revision = random.choice(randomchain)
chain = store.getdeltachain(revision[0], revision[1])
mostrecentpack = next(iter(store.packs), None)
self.assertEquals(
mostrecentpack.getdeltachain(revision[0], revision[1]), chain
)
self.assertEquals(randomchain.index(revision) + 1, len(chain))
def testCorruptPackHandling(self):
"""Test that the pack store deletes corrupt packs."""
# There's a bug in cdatapack right now that causes it to return bad data
# even if the pack is corrupt. Since we're not getting an exception, we
        # can't detect the corruption and remediate. Let's wait for the Rust
        # implementation to deprecate the C implementation; then this will be
        # easier to fix.
if self.iscdatapack:
return
packdir = self.makeTempDir()
deltachains = []
numpacks = 5
revisionsperpack = 100
firstpack = None
secondindex = None
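        # Remember the first pack's data file and the second pack's index file
        # so one of each kind can be corrupted below.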
for i in range(numpacks):
chain = []
revision = (str(i), self.getFakeHash(), nullid, "content")
for _ in range(revisionsperpack):
chain.append(revision)
revision = (str(i), self.getFakeHash(), revision[1], self.getFakeHash())
pack = self.createPack(chain, packdir)
if firstpack is None:
firstpack = pack.packpath
elif secondindex is None:
secondindex = pack.indexpath
deltachains.append(chain)
ui = mercurial.ui.ui()
store = datapackstore(ui, packdir, self.iscdatapack, deletecorruptpacks=True)
key = (deltachains[0][0][0], deltachains[0][0][1])
# Count packs
origpackcount = len(os.listdir(packdir))
# Read key
store.getdelta(*key)
# Corrupt the pack
os.chmod(firstpack, 0o644)
f = open(firstpack, "w")
f.truncate(1)
f.close()
# Look for key again
try:
ui.pushbuffer(error=True)
delta = store.getdelta(*key)
raise RuntimeError("getdelta on corrupt key should fail %s" % repr(delta))
except KeyError:
pass
ui.popbuffer()
# Count packs
newpackcount = len(os.listdir(packdir))
# Assert the corrupt pack was removed
self.assertEquals(origpackcount - 2, newpackcount)
# Corrupt the index
os.chmod(secondindex, 0o644)
f = open(secondindex, "w")
f.truncate(1)
f.close()
# Load the packs
origpackcount = len(os.listdir(packdir))
ui.pushbuffer(error=True)
store = datapackstore(ui, packdir, self.iscdatapack, deletecorruptpacks=True)
ui.popbuffer()
newpackcount = len(os.listdir(packdir))
# Assert the corrupt pack was removed
self.assertEquals(origpackcount - 2, newpackcount)
def testReadingMutablePack(self):
"""Tests that the data written into a mutabledatapack can be read out
before it has been finalized."""
packdir = self.makeTempDir()
packer = mutabledatapack(mercurial.ui.ui(), packdir, version=1)
# Add some unused first revision for noise
packer.add("qwert", self.getFakeHash(), self.getFakeHash(), "qwertcontent")
filename = "filename1"
node = self.getFakeHash()
base = self.getFakeHash()
content = "asdf"
meta = {
constants.METAKEYFLAG: 1,
constants.METAKEYSIZE: len(content),
"Z": "random_string",
"_": "\0" * 40,
}
packer.add(filename, node, base, content, metadata=meta)
# Add some unused third revision for noise
packer.add("zxcv", self.getFakeHash(), self.getFakeHash(), "zcxvcontent")
# Test getmissing
missing = ("", self.getFakeHash())
value = packer.getmissing([missing, (filename, node)])
self.assertEquals(value, [missing])
# Test getmeta
value = packer.getmeta(filename, node)
self.assertEquals(value, meta)
# Test getdelta
value = packer.getdelta(filename, node)
self.assertEquals(value, (content, filename, base, meta))
# Test getdeltachain
value = packer.getdeltachain(filename, node)
self.assertEquals(value, [(filename, node, filename, base, content)])
# perf test off by default since it's slow
def _testIndexPerf(self):
random.seed(0)
print("Multi-get perf test")
packsizes = [100, 10000, 100000, 500000, 1000000, 3000000]
lookupsizes = [10, 100, 1000, 10000, 100000, 1000000]
for packsize in packsizes:
revisions = []
for i in xrange(packsize):
filename = "filename-%s" % i
content = "content-%s" % i
node = self.getHash(content)
revisions.append((filename, node, nullid, content))
path = self.createPack(revisions).path
# Perf of large multi-get
import gc
gc.disable()
pack = self.datapackreader(path)
for lookupsize in lookupsizes:
if lookupsize > packsize:
continue
random.shuffle(revisions)
findnodes = [(rev[0], rev[1]) for rev in revisions]
start = time.time()
pack.getmissing(findnodes[:lookupsize])
elapsed = time.time() - start
print(
"%s pack %s lookups = %0.04f"
% (
("%s" % packsize).rjust(7),
("%s" % lookupsize).rjust(7),
elapsed,
)
)
print("")
gc.enable()
# The perf test is meant to produce output, so we always fail the test
# so the user sees the output.
raise RuntimeError("perf test always fails")
class datapacktests(datapacktestsbase, unittest.TestCase):
def __init__(self, *args, **kwargs):
datapacktestsbase.__init__(self, datapack, True, False)
unittest.TestCase.__init__(self, *args, **kwargs)
class fastdatapacktests(datapacktestsbase, unittest.TestCase):
def __init__(self, *args, **kwargs):
datapacktestsbase.__init__(self, fastdatapack, False, True)
unittest.TestCase.__init__(self, *args, **kwargs)
# TODO:
# datapack store:
# - getmissing
# - GC two packs into one
if __name__ == "__main__":
silenttestrunner.main(__name__)