# sapling/remotefilelog/repack.py

import os
from collections import defaultdict
from mercurial import error, mdiff, osutil, scmutil, util
from mercurial.node import nullid, bin, hex
from mercurial.i18n import _
import datapack, historypack, contentstore, metadatastore, shallowutil

def backgroundrepack(repo, incremental=True):
    """Launch the ``repack`` command for ``repo`` as a shell command.

    The command line is ``util.hgcmd()`` followed by ``-R <repo.origroot>
    repack``; ``--incremental`` is appended when ``incremental`` is true. A
    notice is emitted through ``repo.ui.warn`` and the quoted command string
    is then handed to ``shallowutil.runshellcommand`` together with the
    current process environment.
    """
    cmd = util.hgcmd() + ['-R', repo.origroot, 'repack']
    # Keep two complete, translatable messages instead of splicing an
    # untranslated fragment into the middle of a sentence.
    msg = _("(running background repack)\n")
    if incremental:
        cmd.append('--incremental')
        msg = _("(running background incremental repack)\n")
    # The command is passed on as one string, so quote every argument.
    cmd = ' '.join(map(util.shellquote, cmd))

    repo.ui.warn(msg)
    shallowutil.runshellcommand(cmd, os.environ)

def fullrepack(repo):
    """Repack everything exposed by the repo's shared stores.

    Every store in ``repo.shareddatastores`` is combined into one content
    source and every store in ``repo.sharedhistorystores`` into one history
    source (created with ``allowincomplete=True``); both are then passed to
    ``_runrepack``.
    """
    alldata = contentstore.unioncontentstore(*repo.shareddatastores)
    allhistory = metadatastore.unionmetadatastore(
        *repo.sharedhistorystores, allowincomplete=True)
    _runrepack(repo, alldata, allhistory)

def incrementalrepack(repo):
    """Repack only a subset of the repo's pack files.

    The pack directory is listed once, and the ``_computeincremental*pack``
    helpers decide which data packs and which history packs are worth
    combining. Shared stores that are not pack stores are always included as
    additional sources.
    """
    packdir = shallowutil.getpackpath(repo)
    shallowutil.mkstickygroupdir(repo.ui, packdir)

    listing = osutil.listdir(packdir, stat=True)

    # Data side: open the chosen packs, then add every non-pack data store.
    datasources = [datapack.datapack(os.path.join(packdir, prefix))
                   for prefix in _computeincrementaldatapack(repo.ui, listing)]
    for store in repo.shareddatastores:
        if not isinstance(store, datapack.datapackstore):
            datasources.append(store)

    # History side: same procedure with the history pack types.
    historysources = [
        historypack.historypack(os.path.join(packdir, prefix))
        for prefix in _computeincrementalhistorypack(repo.ui, listing)]
    for store in repo.sharedhistorystores:
        if not isinstance(store, historypack.historypackstore):
            historysources.append(store)

    uniondata = contentstore.unioncontentstore(*datasources)
    unionhistory = metadatastore.unionmetadatastore(*historysources,
                                                    allowincomplete=True)

    _runrepack(repo, uniondata, unionhistory)

def _computeincrementaldatapack(ui, files):
    """Pick the data packs that an incremental repack should combine.

    ``files`` is the stat listing of the pack directory. The generation size
    limits are read from ``remotefilelog.data.generations`` (default 1GB,
    100MB, 1MB), ordered largest first and terminated with a 0 entry. The
    per-generation pack count comes from ``remotefilelog.data.gencountlimit``
    (default 2) and the size budget from
    ``remotefilelog.data.repacksizelimit`` (default 100MB).

    The selection itself is delegated to ``_computeincrementalpack``, which
    tries to keep incremental repacks cheap while still rolling small packs
    up into bigger ones over time.
    """
    configured = ui.configlist("remotefilelog", "data.generations",
                               ['1GB', '100MB', '1MB'])
    limits = [util.sizetoint(entry) for entry in configured]
    limits.sort(reverse=True)
    # Trailing 0 so there is a last generation below the smallest limit.
    limits.append(0)

    maxpergeneration = ui.configint('remotefilelog', 'data.gencountlimit', 2)
    sizebudget = ui.configbytes('remotefilelog', 'data.repacksizelimit',
                                '100MB')

    return _computeincrementalpack(ui, files, limits,
                                   datapack.PACKSUFFIX, datapack.INDEXSUFFIX,
                                   maxpergeneration, sizebudget)

def _computeincrementalhistorypack(ui, files):
    """Pick the history packs that an incremental repack should combine.

    Works like the data pack variant but reads the
    ``remotefilelog.history.*`` settings: ``history.generations`` (default
    100MB), ``history.gencountlimit`` (default 2) and
    ``history.repacksizelimit`` (default 100MB).
    """
    configured = ui.configlist("remotefilelog", "history.generations",
                               ['100MB'])
    limits = [util.sizetoint(entry) for entry in configured]
    limits.sort(reverse=True)
    # Trailing 0 so there is a last generation below the smallest limit.
    limits.append(0)

    maxpergeneration = ui.configint('remotefilelog', 'history.gencountlimit',
                                    2)
    sizebudget = ui.configbytes('remotefilelog', 'history.repacksizelimit',
                                '100MB')

    return _computeincrementalpack(ui, files, limits,
                                   historypack.PACKSUFFIX,
                                   historypack.INDEXSUFFIX,
                                   maxpergeneration, sizebudget)

def _computeincrementalpack(ui, files, limits, packsuffix, indexsuffix,
                            gencountlimit, repacksizelimit):
    # Group the packs by generation (i.e. by size)
    generations = []
    for i in xrange(len(limits)):
        generations.append([])
    sizes = {}
    fileset = set(fn for fn, mode, stat in files)
    for filename, mode, stat in files:
        if not filename.endswith(packsuffix):
            continue

        prefix = filename[:-len(packsuffix)]

        # Don't process a pack if it doesn't have an index.
        if (prefix + indexsuffix) not in fileset:
            continue

        size = stat.st_size
        sizes[prefix] = size
        for i, limit in enumerate(limits):
            if size > limit:
                generations[i].append(prefix)
                break

    # Find the largest generation with more than 2 packs and repack it.
    for i, limit in enumerate(limits):
        if len(generations[i]) > gencountlimit:
            # Generally we only want to repack 2 things at once, but if the
            # whole generation is small, let's just do it all!
            count = 2
            if sum(sizes[n] for n in generations[i]) < repacksizelimit:
                count = len(generations[i])
            return sorted(generations[i], key=lambda x: sizes[x])[:count]

    # If no generation has more than 2 packs, repack as many as fit into the
    # limit
    small = set().union(*generations[1:])
    if len(small) > 1:
        total = 0
        packs = []
        for pack in sorted(small, key=lambda x: sizes[x]):
            size = sizes[pack]
            if total + size < repacksizelimit:
                packs.append(pack)
                total += size
            else:
                break

        if len(packs) > 1:
            return packs

    # If there aren't small ones to repack, repack the two largest ones.
    if len(generations[0]) > 1:
        return generations[0]

    return []

def _runrepack(repo, data, history):
    """Repack the ``data`` and ``history`` sources into new pack files.

    The new packs are written into the repo's pack directory through a vfs
    whose ``createmode`` is read-only. Raises ``error.Abort`` when the
    repacker reports ``error.LockHeld``, i.e. another repack is in progress.
    """
    targetdir = shallowutil.getpackpath(repo)
    shallowutil.mkstickygroupdir(repo.ui, targetdir)

    worker = repacker(repo, data, history)

    packvfs = scmutil.vfs(targetdir)
    # Packs should be write-once files, so set them to read-only.
    packvfs.createmode = 0o444

    with datapack.mutabledatapack(packvfs) as newdata:
        with historypack.mutablehistorypack(packvfs) as newhistory:
            try:
                worker.run(newdata, newhistory)
            except error.LockHeld:
                raise error.Abort(_("skipping repack - another repack is "
                                    "already running"))

class repacker(object):
    """Class for orchestrating the repack of data and history information into a
    new format.

    ``data`` must provide ``markledger(ledger)`` and ``get(filename, node)``;
    ``history`` must provide ``markledger(ledger)`` and
    ``getancestors(filename, node)``. Those are the only calls made on them
    here.
    """
    def __init__(self, repo, data, history):
        """Remember the repo and the data/history sources to read from."""
        self.repo = repo
        self.data = data
        self.history = history

    def run(self, targetdata, targethistory):
        """Repack the sources into ``targetdata`` and ``targethistory``.

        Everything happens while holding the repo's "repacklock": the
        ``prerepack`` hook fires, both sources record their contents in a
        fresh ledger, data and then history are written to the targets, and
        finally every source recorded in the ledger gets ``cleanup(ledger)``
        called on it.
        """
        ledger = repackledger()

        # NOTE(review): the positional False/None/None arguments are
        # presumably wait/releasefn/acquirefn of repo._lock - confirm.
        with self.repo._lock(self.repo.svfs, "repacklock", False, None,
                             None, _('repacking %s') % self.repo.origroot):
            self.repo.hook('prerepack')
            # Populate ledger from source
            self.data.markledger(ledger)
            self.history.markledger(ledger)

            # Run repack
            self.repackdata(ledger, targetdata)
            self.repackhistory(ledger, targethistory)

            # Call cleanup on each source
            for source in ledger.sources:
                source.cleanup(ledger)

    def repackdata(self, ledger, target):
        """Write every data entry in ``ledger`` to ``target``, then close it.

        Files are handled in sorted filename order. Within a file, revisions
        are stored as deltas against a child revision when history makes one
        known, and as full texts otherwise. Each written entry is flagged
        ``datarepacked``.
        """
        ui = self.repo.ui

        # filename -> {node: ledger entry} for entries that have data
        byfile = {}
        for entry in ledger.entries.itervalues():
            if entry.datasource:
                byfile.setdefault(entry.filename, {})[entry.node] = entry

        count = 0
        for filename, entries in sorted(byfile.iteritems()):
            ancestors = {}
            nodes = list(node for node in entries.iterkeys())
            nohistory = []
            for node in nodes:
                # Already covered by the ancestry of an earlier node
                if node in ancestors:
                    continue
                try:
                    ancestors.update(self.history.getancestors(filename, node))
                except KeyError:
                    # Since we're packing data entries, we may not have the
                    # corresponding history entries for them. It's not a big
                    # deal, but the entries won't be delta'd perfectly.
                    nohistory.append(node)

            # Order the nodes children first, so we can produce reverse deltas
            orderednodes = list(reversed(self._toposort(ancestors)))
            # Nodes without history go last, in a stable (sorted) order
            orderednodes.extend(sorted(nohistory))

            # Compute deltas and write to the pack
            # Unknown nodes default to nullid, i.e. "store the full text"
            deltabases = defaultdict(lambda: nullid)
            nodes = set(nodes)
            for node in orderednodes:
                # orderednodes is all ancestors, but we only want to serialize
                # the files we have.
                if node not in nodes:
                    continue

                # Find delta base
                # TODO: allow delta'ing against most recent descendant instead
                # of immediate child
                deltabase = deltabases[node]

                # Use available ancestor information to inform our delta choices
                ancestorinfo = ancestors.get(node)
                if ancestorinfo:
                    p1, p2, linknode, copyfrom = ancestorinfo

                    # The presence of copyfrom means we're at a point where the
                    # file was copied from elsewhere. So don't attempt to do any
                    # deltas with the other file.
                    if copyfrom:
                        p1 = nullid

                    # Record this child as the delta base for its parents.
                    # This may be non optimal, since the parents may have many
                    # children, and this will only choose the last one.
                    # TODO: record all children and try all deltas to find best
                    if p1 != nullid:
                        deltabases[p1] = node
                    if p2 != nullid:
                        deltabases[p2] = node

                # Compute delta
                # TODO: reuse existing deltas if it matches our deltabase
                if deltabase != nullid:
                    deltabasetext = self.data.get(filename, deltabase)
                    original = self.data.get(filename, node)
                    delta = mdiff.textdiff(deltabasetext, original)
                else:
                    # No base: the "delta" is the full text itself
                    delta = self.data.get(filename, node)

                # TODO: don't use the delta if it's larger than the fulltext
                target.add(filename, node, deltabase, delta)

                entries[node].datarepacked = True

            count += 1
            ui.progress(_("repacking data"), count, unit="files",
                        total=len(byfile))

        # Passing None as the position ends the progress output
        ui.progress(_("repacking data"), None)
        target.close(ledger=ledger)

    def repackhistory(self, ledger, target):
        """Write the history of every history entry in ``ledger`` to
        ``target``, then close it.

        Files are handled in sorted filename order. For each file, the
        ancestry of all its ledger nodes is written children first, leaving
        out ancestors that are only reachable through a copy from another
        file. Each ledger entry whose node is written is flagged
        ``historyrepacked``.
        """
        ui = self.repo.ui

        # filename -> {node: ledger entry} for entries that have history
        byfile = {}
        for entry in ledger.entries.itervalues():
            if entry.historysource:
                byfile.setdefault(entry.filename, {})[entry.node] = entry

        count = 0
        for filename, entries in sorted(byfile.iteritems()):
            ancestors = {}
            nodes = list(node for node in entries.iterkeys())

            for node in nodes:
                # Already covered by the ancestry of an earlier node
                if node in ancestors:
                    continue
                ancestors.update(self.history.getancestors(filename, node))

            # Order the nodes children first
            orderednodes = reversed(self._toposort(ancestors))

            # Write to the pack
            # Nodes reached only through a copy, i.e. history of another file
            dontprocess = set()
            for node in orderednodes:
                p1, p2, linknode, copyfrom = ancestors[node]

                # If the node is marked dontprocess, but it's also in the
                # explicit entries set, that means the node exists both in this
                # file and in another file that was copied to this file.
                # Usually this happens if the file was copied to another file,
                # then the copy was deleted, then reintroduced without copy
                # metadata. The original add and the new add have the same hash
                # since the content is identical and the parents are null.
                if node in dontprocess and node not in entries:
                    # If copyfrom == filename, it means the copy history
                    # went to some other file, then came back to this one, so we
                    # should continue processing it.
                    if p1 != nullid and copyfrom != filename:
                        dontprocess.add(p1)
                    if p2 != nullid:
                        dontprocess.add(p2)
                    continue

                # p1 of a copy belongs to the copy source, so skip it later
                if copyfrom:
                    dontprocess.add(p1)

                target.add(filename, node, p1, p2, linknode, copyfrom)

                if node in entries:
                    entries[node].historyrepacked = True

            count += 1
            ui.progress(_("repacking history"), count, unit="files",
                        total=len(byfile))

        # Passing None as the position ends the progress output
        ui.progress(_("repacking history"), None)
        target.close(ledger=ledger)

    def _toposort(self, ancestors):
        """Return the keys of ``ancestors`` as ordered by
        ``shallowutil.sortnodes``.

        ``ancestors`` maps node -> (p1, p2, linknode, copyfrom). Null parents
        are not reported to the sorter. The callers reverse the result to get
        a children-first order, so the result is presumably parents first -
        confirm against shallowutil.sortnodes.
        """
        def parentfunc(node):
            p1, p2, linknode, copyfrom = ancestors[node]
            parents = []
            if p1 != nullid:
                parents.append(p1)
            if p2 != nullid:
                parents.append(p2)
            return parents

        sortednodes = shallowutil.sortnodes(ancestors.keys(), parentfunc)
        return sortednodes

class repackledger(object):
    """Bookkeeping shared by everything taking part in a repack.

    ``entries`` maps ``(filename, node)`` to its ``repackentry``, ``sources``
    maps each source store to the set of entries it contained (used for later
    cleanup), and ``created`` collects the values passed to ``addcreated``.
    """
    def __init__(self):
        self.entries = {}
        self.sources = {}
        self.created = set()

    def markdataentry(self, source, filename, node):
        """Record that ``source`` holds a data rev for the filename+node
        revision.
        """
        entry = self._getorcreateentry(filename, node)
        entry.datasource = True
        self._tracksource(source, entry)

    def markhistoryentry(self, source, filename, node):
        """Record that ``source`` holds a history rev for the filename+node
        revision.
        """
        entry = self._getorcreateentry(filename, node)
        entry.historysource = True
        self._tracksource(source, entry)

    def _tracksource(self, source, entry):
        """Add ``entry`` to the set of entries known to live in ``source``."""
        tracked = self.sources.get(source)
        if not tracked:
            tracked = self.sources[source] = set()
        tracked.add(entry)

    def _getorcreateentry(self, filename, node):
        """Return the ledger entry for filename+node, creating it on first
        use.
        """
        key = (filename, node)
        existing = self.entries.get(key)
        if existing:
            return existing

        fresh = repackentry(filename, node)
        self.entries[key] = fresh
        return fresh

    def addcreated(self, value):
        """Add ``value`` to the ``created`` set."""
        self.created.add(value)

class repackentry(object):
    """One filename+node revision tracked by the repackledger.

    Besides the identifying ``filename`` and ``node`` it carries four flags,
    all initially False.
    """
    __slots__ = ['filename', 'node', 'datasource', 'historysource',
                 'datarepacked', 'historyrepacked']

    def __init__(self, filename, node):
        self.filename, self.node = filename, node
        # Whether the revision has a data / history entry in the source
        self.datasource = self.historysource = False
        # Whether the revision's data / history entry was repacked into the
        # repack target
        self.datarepacked = self.historyrepacked = False