flush only the updated parts of the height-to-hash cache file (#15996)

This commit is contained in:
Arvid Norberg 2023-09-08 03:46:52 +02:00 committed by GitHub
parent f4f1528058
commit 2682be1282
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 40 additions and 11 deletions

View File

@ -42,7 +42,12 @@ class BlockHeightMap:
# count how many blocks have been added since the cache was last written to
# disk
__dirty: int
__counter: int
# this is the lowest height whose hash has been updated since the last flush
# to disk. When it's time to write to disk, we can start flushing from this
# offset
__first_dirty: int
# the file we're saving the height-to-hash cache to
__height_to_hash_filename: Path
@ -57,7 +62,8 @@ class BlockHeightMap:
self = BlockHeightMap()
self.db = db
self.__dirty = 0
self.__counter = 0
self.__first_dirty = 0
self.__height_to_hash = bytearray()
self.__sub_epoch_summaries = {}
self.__height_to_hash_filename = blockchain_dir / "height-to-hash"
@ -104,6 +110,8 @@ class BlockHeightMap:
else:
self.__height_to_hash += bytearray([0] * (new_size - size))
self.__first_dirty = height + 1
# if the peak hash is already in the height-to-hash map, we don't need
# to load anything more from the DB
if self.get_hash(height) != peak:
@ -128,17 +136,28 @@ class BlockHeightMap:
self.__sub_epoch_summaries[height] = bytes(ses)
async def maybe_flush(self) -> None:
if self.__dirty < 1000:
if self.__counter < 1000:
return
assert (len(self.__height_to_hash) % 32) == 0
map_buf = self.__height_to_hash.copy()
offset = self.__first_dirty * 32
ses_buf = bytes(SesCache([(k, v) for (k, v) in self.__sub_epoch_summaries.items()]))
self.__dirty = 0
self.__counter = 0
await write_file_async(self.__height_to_hash_filename, map_buf)
try:
async with aiofiles.open(self.__height_to_hash_filename, "r+b") as f:
map_buf = self.__height_to_hash[offset:].copy()
await f.seek(offset)
await f.write(map_buf)
except Exception:
# if the file doesn't exist, write the whole buffer
async with aiofiles.open(self.__height_to_hash_filename, "wb") as f:
map_buf = self.__height_to_hash.copy()
await f.write(map_buf)
self.__first_dirty = len(self.__height_to_hash) // 32
await write_file_async(self.__ses_filename, ses_buf)
# load height-to-hash map entries from the DB starting at height back in
@ -189,7 +208,8 @@ class BlockHeightMap:
def __set_hash(self, height: int, block_hash: bytes32) -> None:
idx = height * 32
self.__height_to_hash[idx : idx + 32] = block_hash
self.__dirty += 1
self.__counter += 1
self.__first_dirty = min(self.__first_dirty, height)
def get_hash(self, height: uint32) -> bytes32:
idx = height * 32
@ -209,6 +229,7 @@ class BlockHeightMap:
for height in heights_to_delete:
del self.__sub_epoch_summaries[height]
del self.__height_to_hash[(fork_height + 1) * 32 :]
self.__first_dirty = min(self.__first_dirty, fork_height + 1)
def get_ses(self, height: uint32) -> SubEpochSummary:
return SubEpochSummary.from_bytes(self.__sub_epoch_summaries[height])

View File

@ -394,7 +394,12 @@ class TestBlockHeightMap:
await BlockHeightMap.create(tmp_dir, db_wrapper)
# Make sure we didn't alter the cache (nothing new to write)
with open(tmp_dir / "height-to-hash", "rb") as f:
assert bytearray(f.read()) == heights
# pytest doesn't behave very well comparing large buffers
# (when the test fails). Compare small portions at a time instead
new_heights = f.read()
assert len(new_heights) == len(heights)
for idx in range(0, len(heights), 32):
assert new_heights[idx : idx + 32] == heights[idx : idx + 32]
@pytest.mark.asyncio
async def test_cache_file_replace_everything(self, tmp_dir: Path, db_version: int) -> None:
@ -433,14 +438,17 @@ class TestBlockHeightMap:
await setup_chain(db_wrapper, 4000, ses_every=20)
# At this point we should have a proper cache with the old chain data
with open(tmp_dir / "height-to-hash", "rb") as f:
heights = bytearray(f.read())
heights = f.read()
assert len(heights) == (2000 + 1) * 32
await BlockHeightMap.create(tmp_dir, db_wrapper)
# Make sure we properly wrote the additional data to the cache
with open(tmp_dir / "height-to-hash", "rb") as f:
new_heights = bytearray(f.read())
new_heights = f.read()
assert len(new_heights) == (4000 + 1) * 32
assert new_heights[: len(heights)] == heights
# pytest doesn't behave very well comparing large buffers
# (when the test fails). Compare small portions at a time instead
for idx in range(0, len(heights), 32):
assert new_heights[idx : idx + 32] == heights[idx : idx + 32]
@pytest.mark.asyncio