2019-11-01 23:24:22 +03:00
|
|
|
# Copyright (c) Facebook, Inc. and its affiliates.
|
2018-10-27 01:07:18 +03:00
|
|
|
#
|
|
|
|
# This software may be used and distributed according to the terms of the
|
2019-11-01 23:24:22 +03:00
|
|
|
# GNU General Public License version 2.
|
2018-10-27 01:07:18 +03:00
|
|
|
|
2017-05-31 20:46:16 +03:00
|
|
|
from __future__ import absolute_import
|
|
|
|
|
2018-01-30 08:19:59 +03:00
|
|
|
import collections
|
|
|
|
import errno
|
|
|
|
import hashlib
|
|
|
|
import os
|
2019-03-29 20:39:40 +03:00
|
|
|
import stat as statmod
|
2018-01-30 08:19:59 +03:00
|
|
|
import struct
|
|
|
|
import time
|
2017-11-07 05:13:55 +03:00
|
|
|
|
2019-01-30 03:25:33 +03:00
|
|
|
from edenscm.mercurial import error, policy, pycompat, util, vfs as vfsmod
|
|
|
|
from edenscm.mercurial.i18n import _
|
2019-09-27 01:28:09 +03:00
|
|
|
from edenscm.mercurial.pycompat import range
|
2018-01-30 08:19:59 +03:00
|
|
|
|
2018-05-30 12:16:33 +03:00
|
|
|
from . import constants, shallowutil
|
2018-06-08 03:49:13 +03:00
|
|
|
|
2017-05-31 20:46:16 +03:00
|
|
|
|
2016-07-08 01:59:06 +03:00
|
|
|
# The amount of time to wait between checking for new packs. This prevents an
# exception when data is moved to a new pack after the process has already
# loaded the pack list.
REFRESHRATE = 0.1

# File mode used when opening pack/index files for reading.
if pycompat.isposix:
    # With glibc 2.7+ the 'e' flag uses O_CLOEXEC when opening.
    # The 'e' flag will be ignored on older versions of glibc.
    PACKOPENMODE = "rbe"
else:
    PACKOPENMODE = "rb"
|
|
|
|
|
2017-08-15 20:49:20 +03:00
|
|
|
|
2017-11-08 21:25:12 +03:00
|
|
|
class _cachebackedpacks(object):
    """Tracks the set of packs in a store, keeping the most recently used
    packs in an LRU cache so iteration visits the hottest packs first.

    ``_lastpack`` remembers the pack that satisfied the previous lookup; it is
    promoted in the cache lazily, on the next operation.
    """

    def __init__(self, packs, cachesize):
        self._packs = set(packs)
        self._lrucache = util.lrucachedict(cachesize)
        self._lastpack = None

        # Avoid cold start of the cache by populating the most recent packs
        # in the cache.
        for pack in reversed(packs[:cachesize]):
            self._movetofront(pack)

    def __len__(self):
        return len(self._lrucache)

    def _movetofront(self, pack):
        # Assigning into the LRU dict makes ``pack`` its most recent entry.
        self._lrucache[pack] = True

    def _registerlastpackusage(self):
        # Promote the pack that served the previous request, if any.
        if self._lastpack is None:
            return
        self._movetofront(self._lastpack)
        self._lastpack = None

    def add(self, pack):
        self._registerlastpackusage()

        # This method will mostly be called when packs are not in cache.
        # Therefore, adding pack to the cache.
        self._movetofront(pack)
        self._packs.add(pack)

    def remove(self, pack):
        self._packs.remove(pack)
        del self._lrucache[pack]

    def __iter__(self):
        self._registerlastpackusage()

        # Cache iteration is based on LRU.
        for cached in self._lrucache:
            self._lastpack = cached
            yield cached

        if len(self._packs) != len(self._lrucache):
            # Yield the packs that never made it into the cache.
            for uncached in self._packs - set(self._lrucache):
                self._lastpack = uncached
                yield uncached

        # Data not found in any pack.
        self._lastpack = None

    def clear(self):
        # Clearing (instead of rebinding) releases references to the pack
        # objects so their file handles can actually be closed.
        self._packs.clear()
        self._lrucache.clear()
        self._lastpack = None
|
|
|
|
|
2018-05-30 12:16:33 +03:00
|
|
|
|
2016-05-24 12:15:58 +03:00
|
|
|
class basepackstore(object):
    # Default cache size limit for the pack files.
    DEFAULTCACHESIZE = 100

    def __init__(self, ui, path, deletecorruptpacks=False):
        """A store backed by a directory of pack files.

        ui - used for emitting warnings about corrupt packs
        path - directory containing the pack/index file pairs
        deletecorruptpacks - when True, corrupt pack files are unlinked from
            disk instead of merely being skipped
        """
        self.ui = ui
        self.path = path
        self.deletecorruptpacks = deletecorruptpacks

        # lastrefesh is 0 so we'll immediately check for new packs on the first
        # failure.
        self.lastrefresh = 0

        # Loaded pack objects, LRU-cached; packspath mirrors the set of pack
        # paths already loaded so refresh() can skip them.
        self.packs = _cachebackedpacks([], self.DEFAULTCACHESIZE)
        self.packspath = set()
|
2016-07-08 01:59:06 +03:00
|
|
|
|
2019-03-29 20:39:40 +03:00
|
|
|
def _getavailablepackfiles(self, currentpacks=None):
|
2017-11-07 05:13:55 +03:00
|
|
|
"""For each pack file (a index/data file combo), yields:
|
|
|
|
(full path without extension, mtime, size)
|
|
|
|
|
|
|
|
mtime will be the mtime of the index/data file (whichever is newer)
|
|
|
|
size is the combined size of index/data file
|
|
|
|
"""
|
2019-03-29 20:39:40 +03:00
|
|
|
if currentpacks is None:
|
|
|
|
currentpacks = set()
|
2017-11-07 05:13:55 +03:00
|
|
|
|
|
|
|
ids = set()
|
2018-01-30 08:19:59 +03:00
|
|
|
sizes = collections.defaultdict(lambda: 0)
|
|
|
|
mtimes = collections.defaultdict(lambda: [])
|
2016-05-24 12:15:58 +03:00
|
|
|
try:
|
2019-03-29 20:39:40 +03:00
|
|
|
for filename in os.listdir(self.path):
|
|
|
|
filename = os.path.join(self.path, filename)
|
|
|
|
id, ext = os.path.splitext(filename)
|
|
|
|
|
|
|
|
if id not in currentpacks:
|
|
|
|
# Since we expect to have two files corresponding to each ID
|
|
|
|
# (the index file and the pack file), we can yield once we see
|
|
|
|
# it twice.
|
|
|
|
if ext == self.INDEXSUFFIX or ext == self.PACKSUFFIX:
|
2019-12-10 00:09:25 +03:00
|
|
|
st = util.lstat(filename)
|
2019-03-29 20:39:40 +03:00
|
|
|
if statmod.S_ISDIR(st.st_mode):
|
|
|
|
continue
|
|
|
|
|
|
|
|
sizes[id] += st.st_size # Sum both files' sizes together
|
|
|
|
mtimes[id].append(st.st_mtime)
|
|
|
|
if id in ids:
|
|
|
|
yield (
|
|
|
|
os.path.join(self.path, id),
|
|
|
|
max(mtimes[id]),
|
|
|
|
sizes[id],
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
ids.add(id)
|
2016-05-24 12:15:58 +03:00
|
|
|
except OSError as ex:
|
|
|
|
if ex.errno != errno.ENOENT:
|
|
|
|
raise
|
|
|
|
|
2019-03-29 20:39:40 +03:00
|
|
|
def _getavailablepackfilessorted(self, currentpacks):
|
2017-11-07 05:13:55 +03:00
|
|
|
"""Like `_getavailablepackfiles`, but also sorts the files by mtime,
|
|
|
|
yielding newest files first.
|
|
|
|
|
|
|
|
This is desirable, since it is more likely newer packfiles have more
|
|
|
|
desirable data.
|
|
|
|
"""
|
|
|
|
files = []
|
2019-03-29 20:39:40 +03:00
|
|
|
for path, mtime, size in self._getavailablepackfiles(currentpacks):
|
2017-11-07 05:13:55 +03:00
|
|
|
files.append((mtime, size, path))
|
2016-05-24 12:15:58 +03:00
|
|
|
files = sorted(files, reverse=True)
|
2019-03-29 20:39:40 +03:00
|
|
|
for __, __, path in files:
|
|
|
|
yield path
|
2017-11-07 05:13:55 +03:00
|
|
|
|
|
|
|
def gettotalsizeandcount(self):
|
|
|
|
"""Returns the total disk size (in bytes) of all the pack files in
|
|
|
|
this store, and the count of pack files.
|
|
|
|
|
|
|
|
(This might be smaller than the total size of the ``self.path``
|
|
|
|
directory, since this only considers fuly-writen pack files, and not
|
|
|
|
temporary files or other detritus on the directory.)
|
|
|
|
"""
|
|
|
|
totalsize = 0
|
|
|
|
count = 0
|
|
|
|
for __, __, size in self._getavailablepackfiles():
|
|
|
|
totalsize += size
|
|
|
|
count += 1
|
|
|
|
return totalsize, count
|
|
|
|
|
|
|
|
def getmetrics(self):
|
|
|
|
"""Returns metrics on the state of this store."""
|
|
|
|
size, count = self.gettotalsizeandcount()
|
2018-05-30 12:16:33 +03:00
|
|
|
return {"numpacks": count, "totalpacksize": size}
|
2016-05-24 12:15:58 +03:00
|
|
|
|
|
|
|
def getpack(self, path):
|
|
|
|
raise NotImplemented()
|
|
|
|
|
|
|
|
def getmissing(self, keys):
|
|
|
|
missing = keys
|
2018-05-30 12:16:33 +03:00
|
|
|
|
2018-05-25 20:51:49 +03:00
|
|
|
def func(pack):
|
|
|
|
return pack.getmissing(missing)
|
2018-05-30 12:16:33 +03:00
|
|
|
|
2018-05-25 20:51:49 +03:00
|
|
|
for newmissing in self.runonpacks(func):
|
|
|
|
missing = newmissing
|
2017-11-08 21:25:12 +03:00
|
|
|
if not missing:
|
2018-05-25 20:51:49 +03:00
|
|
|
break
|
2016-07-08 01:59:06 +03:00
|
|
|
|
2016-05-24 12:15:58 +03:00
|
|
|
return missing
|
|
|
|
|
2016-09-12 21:44:53 +03:00
|
|
|
def markforrefresh(self):
|
|
|
|
"""Tells the store that there may be new pack files, so the next time it
|
|
|
|
has a lookup miss it should check for new files."""
|
|
|
|
self.lastrefresh = 0
|
|
|
|
|
2016-07-08 01:59:06 +03:00
|
|
|
    def refresh(self):
        """Checks for any new packs on disk, adds them to the main pack list,
        and returns a list of just the new packs."""
        now = time.time()

        # When remotefilelog.fetchpacks is enabled, some commands will trigger
        # many packfiles to be written to disk. This has the negative effect to
        # really slow down the refresh function, to the point where 90+% of the
        # time would be spent in it. A simple (but effective) solution is to
        # run repack when we detect that the number of packfiles is too big. A
        # better solution is to use a file format that isn't immutable, like
        # IndexedLog. Running repack is the short-time solution until
        # IndexedLog is more widely deployed.
        if len(self.packs) == self.DEFAULTCACHESIZE:
            # Clearing (rather than rebinding) self.packs drops the references
            # to the open pack objects so repack can remove the old files.
            self.packs.clear()
            self.packspath.clear()
            try:
                # NOTE(review): repackstore is not defined in this class in
                # this file — presumably supplied by a subclass or mixin;
                # confirm before relying on it.
                self.repackstore()
            except Exception:
                # Failures can happen due to concurrent repacks, which should
                # be rare. Let's just ignore these, the next refresh will
                # re-issue the repack and succeed.
                pass

        # If we experience a lot of misses (like in the case of getmissing() on
        # new objects), let's only actually check disk for new stuff every once
        # in a while. Generally this code path should only ever matter when a
        # repack is going on in the background, and that should be pretty rare
        # to have that happen twice in quick succession.
        newpacks = []
        if now > self.lastrefresh + REFRESHRATE:
            # packspath holds the paths already loaded; skip those on disk.
            previous = self.packspath
            for filepath in self._getavailablepackfilessorted(previous):
                try:
                    newpack = self.getpack(filepath)
                    newpacks.append(newpack)
                except Exception as ex:
                    # An exception may be thrown if the pack file is corrupted
                    # somehow. Log a warning but keep going in this case, just
                    # skipping this pack file.
                    #
                    # If this is an ENOENT error then don't even bother logging.
                    # Someone could have removed the file since we retrieved the
                    # list of paths.
                    if getattr(ex, "errno", None) != errno.ENOENT:
                        if self.deletecorruptpacks:
                            self.ui.warn(_("deleting corrupt pack '%s'\n") % filepath)
                            util.tryunlink(filepath + self.PACKSUFFIX)
                            util.tryunlink(filepath + self.INDEXSUFFIX)
                        else:
                            self.ui.warn(
                                _("detected corrupt pack '%s' - ignoring it\n")
                                % filepath
                            )

            self.lastrefresh = time.time()

            # Packs were yielded newest-first; add in reverse so the newest
            # pack ends up most-recently-used in the LRU-backed pack list.
            for pack in reversed(newpacks):
                self.packs.add(pack)
                self.packspath.add(pack.path())

        return newpacks
|
|
|
|
|
2018-05-25 20:51:49 +03:00
|
|
|
    def runonpacks(self, func):
        """Run ``func(pack)`` over every pack, yielding each result.

        A KeyError from ``func`` means "not in this pack" and is skipped; any
        other exception marks the pack as bad. After the first pass, refresh()
        is called and ``func`` is additionally run on any packs that appeared.
        Bad packs are finally removed from the store (and deleted from disk
        when deletecorruptpacks is set).
        """
        badpacks = []

        for pack in self.packs:
            try:
                yield func(pack)
            except KeyError:
                # KeyError simply means the data isn't in this pack; move on.
                pass
            except Exception as ex:
                # Other exceptions indicate an issue with the pack file, so
                # remove it.
                badpacks.append((pack, getattr(ex, "errno", None)))

        newpacks = self.refresh()
        if newpacks != []:
            newpacks = set(newpacks)
            # Only run func on the packs that were just discovered; the ones
            # iterated above have already been handled.
            for pack in self.packs:
                if pack in newpacks:
                    try:
                        yield func(pack)
                    except KeyError:
                        pass
                    except Exception as ex:
                        # Other exceptions indicate an issue with the pack
                        # file, so remove it.
                        badpacks.append((pack, getattr(ex, "errno", None)))

        if badpacks:
            if self.deletecorruptpacks:
                for pack, err in badpacks:
                    self.packs.remove(pack)
                    self.packspath.remove(pack.path())

                    # ENOENT means the pack vanished (e.g. concurrent repack);
                    # nothing to delete and not worth warning about.
                    if err != errno.ENOENT:
                        self.ui.warn(_("deleting corrupt pack '%s'\n") % pack.path())
                        util.tryunlink(pack.packpath())
                        util.tryunlink(pack.indexpath())
            else:
                for pack, err in badpacks:
                    if err != errno.ENOENT:
                        self.ui.warn(
                            _("detected corrupt pack '%s' - ignoring it\n")
                            % pack.path()
                        )
|