sapling/remotefilelog/repack.py
Durham Goode 2a938a761c store: add copyfrom information to history index
Summary:
Previously we were throwing away copy information when we repacked things into
pack files. The hope was that we could store copy information somewhere else,
and keep the history pack using fixed-length entries. Since storing copy
information elsewhere is a long way off, let's just go ahead and put copy info
in the pack file.

This makes the entries non-fixed-length, which means any iteration over them
has to read the length of each entry (see the sketch below). This also affects
the historypack filename hashes, since they are content-based, so the tests
had to change.

This matches the old remotefilelog behavior more closely (which is why no code
had to change outside the pack logic).
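
For illustration, iterating entries now has to look roughly like this
(a minimal sketch, assuming an entry layout of four 20-byte nodes followed
by a 2-byte copyfrom length; the actual format lives in the pack code):

    import struct

    ENTRYSTRUCT = '!20s20s20s20sH'
    ENTRYLENGTH = struct.calcsize(ENTRYSTRUCT)  # 82 bytes

    def readentry(data, offset):
        # Fixed-size portion: node, p1, p2, linknode, copyfrom length
        node, p1, p2, linknode, copylen = struct.unpack_from(
            ENTRYSTRUCT, data, offset)
        offset += ENTRYLENGTH
        # Variable-size portion: the copyfrom path itself
        copyfrom = data[offset:offset + copylen]
        return (node, p1, p2, linknode, copyfrom), offset + copylen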

Test Plan: Added a test

Reviewers: #mercurial, mitrandir, ttung

Reviewed By: mitrandir

Subscribers: mitrandir

Differential Revision: https://phabricator.intern.facebook.com/D3262185

Signature: t1:3262185:1462562602:935683692276c7fa569d381b18aa3b18656793b1
2016-05-16 10:59:09 -07:00

import os
from collections import defaultdict
from mercurial import mdiff, util
from mercurial.node import nullid, bin, hex
from mercurial.i18n import _
import shallowutil

class repacker(object):
"""Class for orchestrating the repack of data and history information into a
new format.
"""

    def __init__(self, repo, data, history):
self.repo = repo
self.data = data
self.history = history

    def run(self, targetdata, targethistory):
ledger = repackledger()
with self.repo._lock(self.repo.svfs, "repacklock", False, None,
None, _('repacking %s') % self.repo.origroot):
self.repo.hook('prerepack')
# Populate ledger from source
self.data.markledger(ledger)
self.history.markledger(ledger)
# Run repack
self.repackdata(ledger, targetdata)
self.repackhistory(ledger, targethistory)
# Call cleanup on each source
for source in ledger.sources:
source.cleanup(ledger)

    def repackdata(self, ledger, target):
ui = self.repo.ui
byfile = {}
for entry in ledger.entries.itervalues():
if entry.datasource:
byfile.setdefault(entry.filename, {})[entry.node] = entry
count = 0
for filename, entries in sorted(byfile.iteritems()):
ancestors = {}
nodes = list(node for node in entries.iterkeys())
for node in nodes:
if node in ancestors:
continue
ancestors.update(self.history.getancestors(filename, node))
            # Order the nodes children-first, so we can produce reverse deltas
orderednodes = reversed(self._toposort(ancestors))
            # getancestors() will return the ancestry of a commit, even across
            # renames. We currently don't support producing deltas across
            # renames, so we use dontprocess to mark nodes whose ancestry
            # crosses a rename, and avoid processing those.
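            # For example, if "bar" was copied from "foo", a "bar" node's
            # ancestry walks into "foo"'s history; the copy revision's p1
            # points at the "foo" side, so it is added to dontprocess below.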
dontprocess = set()
# Compute deltas and write to the pack
deltabases = defaultdict(lambda: nullid)
nodes = set(nodes)
for node in orderednodes:
                # orderednodes is all ancestors, but we only want to serialize
                # the nodes we actually have.
if node not in nodes:
continue
# Find delta base
# TODO: allow delta'ing against most recent descendant instead
# of immediate child
deltabase = deltabases[node]
                # Record this child as the delta base for its parents.
                # This may be non-optimal, since the parents may have many
                # children, and this will only choose the last one.
                # TODO: record all children and try all deltas to find best
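                # (Combined with the children-first ordering above, this
                # stores each parent as a delta against one of its children,
                # i.e. a reverse delta; nodes with no recorded base keep the
                # nullid default and are stored as fulltexts.)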
p1, p2, linknode, copyfrom = ancestors[node]
if node in dontprocess:
if p1 != nullid:
dontprocess.add(p1)
if p2 != nullid:
dontprocess.add(p2)
continue
if copyfrom:
dontprocess.add(p1)
p1 = nullid
if p1 != nullid:
deltabases[p1] = node
if p2 != nullid:
deltabases[p2] = node
# Compute delta
                # TODO: reuse an existing delta if it matches our deltabase
if deltabase != nullid:
deltabasetext = self.data.get(filename, deltabase)
original = self.data.get(filename, node)
delta = mdiff.textdiff(deltabasetext, original)
else:
delta = self.data.get(filename, node)
# TODO: don't use the delta if it's larger than the fulltext
target.add(filename, node, deltabase, delta)
entries[node].datarepacked = True
count += 1
ui.progress(_("repacking data"), count, unit="files",
total=len(byfile))
ui.progress(_("repacking data"), None)
target.close(ledger=ledger)

    def repackhistory(self, ledger, target):
ui = self.repo.ui
byfile = {}
for entry in ledger.entries.itervalues():
if entry.historysource:
byfile.setdefault(entry.filename, {})[entry.node] = entry
count = 0
for filename, entries in sorted(byfile.iteritems()):
ancestors = {}
nodes = list(node for node in entries.iterkeys())
for node in nodes:
if node in ancestors:
continue
ancestors.update(self.history.getancestors(filename, node))
            # Order the nodes children-first
orderednodes = reversed(self._toposort(ancestors))
# Write to the pack
dontprocess = set()
for node in orderednodes:
p1, p2, linknode, copyfrom = ancestors[node]
if node in dontprocess:
if p1 != nullid:
dontprocess.add(p1)
if p2 != nullid:
dontprocess.add(p2)
continue
if copyfrom:
dontprocess.add(p1)
target.add(filename, node, p1, p2, linknode, copyfrom)
if node in entries:
entries[node].historyrepacked = True
count += 1
ui.progress(_("repacking history"), count, unit="files",
total=len(byfile))
ui.progress(_("repacking history"), None)
target.close(ledger=ledger)

    def _toposort(self, ancestors):
def parentfunc(node):
p1, p2, linknode, copyfrom = ancestors[node]
parents = []
if p1 != nullid:
parents.append(p1)
if p2 != nullid:
parents.append(p2)
return parents
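        # sortnodes() returns the nodes in topological order, parents before
        # children; e.g. for a chain A -> B -> C (A oldest) it yields
        # [A, B, C], which callers reverse to process children first.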
sortednodes = shallowutil.sortnodes(ancestors.keys(), parentfunc)
return sortednodes

class repackledger(object):
"""Storage for all the bookkeeping that happens during a repack. It contains
the list of revisions being repacked, what happened to each revision, and
which source store contained which revision originally (for later cleanup).
"""

    def __init__(self):
self.entries = {}
self.sources = {}
self.created = set()

    def markdataentry(self, source, filename, node):
"""Mark the given filename+node revision as having a data rev in the
given source.
"""
entry = self._getorcreateentry(filename, node)
entry.datasource = True
entries = self.sources.get(source)
if not entries:
entries = set()
self.sources[source] = entries
entries.add(entry)

    def markhistoryentry(self, source, filename, node):
"""Mark the given filename+node revision as having a history rev in the
given source.
"""
entry = self._getorcreateentry(filename, node)
entry.historysource = True
entries = self.sources.get(source)
if not entries:
entries = set()
self.sources[source] = entries
entries.add(entry)

    def _getorcreateentry(self, filename, node):
key = (filename, node)
value = self.entries.get(key)
if not value:
value = repackentry(filename, node)
self.entries[key] = value
return value

    def addcreated(self, value):
self.created.add(value)

class repackentry(object):
"""Simple class representing a single revision entry in the repackledger.
"""
__slots__ = ['filename', 'node', 'datasource', 'historysource',
'datarepacked', 'historyrepacked']

    def __init__(self, filename, node):
self.filename = filename
self.node = node
# If the revision has a data entry in the source
self.datasource = False
# If the revision has a history entry in the source
self.historysource = False
# If the revision's data entry was repacked into the repack target
self.datarepacked = False
# If the revision's history entry was repacked into the repack target
self.historyrepacked = False