pma: fix sync bug

pma: _bt_split_datcopy use memcpy to copy dirty bitset
barter-simsum 2024-06-27 11:29:39 -04:00 committed by Chris Allen
parent 74fad40bdc
commit dcfef17199
4 changed files with 390 additions and 57 deletions


@@ -8,11 +8,11 @@
*/
#undef BT_DAT_MAXKEYS
#define BT_DAT_MAXKEYS 10
#define BT_DAT_MAXKEYS 16
/* maxdepth expanded because when BT_DAT_MAXKEYS is shrunk, the depth of the
btree can grow higher than usual (rough arithmetic below) */
#undef BT_MAXDEPTH
#define BT_MAXDEPTH 10
#endif
#endif /* __BTEST_OVERRIDES_H__ */
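/* rough arithmetic for the depth cap (assuming, as is typical for btrees, an
effective fanout of about BT_DAT_MAXKEYS/2 per node): a depth-d tree addresses
on the order of (BT_DAT_MAXKEYS/2)^d ranges, so cutting BT_DAT_MAXKEYS to 16
drops the fanout to ~8, while a BT_MAXDEPTH of 10 still covers ~8^10 ≈ 1e9
entries */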


@@ -157,6 +157,21 @@ int main(int argc, char *argv[])
BT_findpath path = {0};
int rc = 0;
/* ;;: testing dirtying logic */
#if 0
/* start: 0x2aaa80 */
bp(0);
bt_state_new(&state1);
if (mkdir("./pmatest1", 0774) == -1)
return errno;
assert(SUCC(bt_state_open(state1, "./pmatest1", 0, 0644)));
_bt_insert(state1, 0x2aaa80, 0x2aaa81, 1);
_bt_insert(state1, 0x2aaa83, 0x2aaa84, 2);
_bt_insert(state1, 0x2aaa82, 0x2aaa83, 3);
/* ;;: so basically the dirtying logic is incorrect. We need to mark the index in the bitmap where the actual fo is inserted. Further, the dirty bitmap needs to be shifted whenever the data segment is shifted (sketch below). This error may apply not just to leaf nodes, but also to branch nodes when they are split... */
#endif
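/* a minimal sketch of the failure described above, with hypothetical values:
inserting at slot i must dirty bit i itself, and a _bt_datshift must move the
dirty bits along with the data:
  before:                         datk = {A, B, C}, dirty bits = 0b010 (B dirty)
  after _bt_datshift(node, 1, 1): datk = {A, _, B, C}
without a matching shift of the bitmap, bit 1 now marks the new entry in slot
1, while B, which moved to slot 2, is wrongly considered clean */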
DPUTS("== test 2: malloc");
BT_state *state2;
@@ -214,7 +229,7 @@ int main(int argc, char *argv[])
};
#define ITERATIONS 1000
#define MAXALLOCPG 0xFF
#define MAXALLOCPG 0xFE
lohi_pair allocs[ITERATIONS] = {0};
size_t alloc_sizp = 0;
size_t flist_sizp = _flist_sizep(state3->flist);


@@ -229,6 +229,7 @@ struct BT_kv {
#endif
#define BT_DAT_MAXVALS BT_DAT_MAXKEYS
static_assert(BT_DAT_MAXENTRIES % 2 == 0);
/* _bt_split_datcopy moves the dirty bitmap at byte granularity, so
BT_DAT_MAXKEYS/2 must itself be a multiple of 8 */
static_assert(BT_DAT_MAXKEYS % 16 == 0);
/* we assume off_t is 64 bit */
static_assert(sizeof(off_t) == sizeof(uint64_t));
@@ -692,7 +693,7 @@ _bt_find2(BT_state *state,
}
static void
_bt_root_new(BT_meta *meta, BT_page *root)
_bt_root_new(BT_page *root)
{
/* The first usable address in the PMA is just beyond the btree segment */
root->datk[0].va = B2PAGES(BLK_BASE_LEN_TOTAL);
@@ -748,22 +749,6 @@ _bt_datshift(BT_page *node, size_t i, size_t n)
return BT_SUCC;
}
/* _bt_split_datcopy: copy right half of left node to right node */
static int
_bt_split_datcopy(BT_page *left, BT_page *right)
{
size_t mid = BT_DAT_MAXKEYS / 2;
size_t bytelen = mid * sizeof(left->datk[0]);
/* copy rhs of left to right */
memcpy(right->datk, &left->datk[mid], bytelen);
/* zero rhs of left */
ZERO(&left->datk[mid], bytelen); /* ;;: note, this would be unnecessary if we stored node.N */
/* the last entry in left should be the first entry in right */
left->datk[mid].va = right->datk[0].va;
return BT_SUCC;
}
static int
_bt_ischilddirty(BT_page *parent, size_t child_idx)
{
@@ -775,12 +760,15 @@ _bt_ischilddirty(BT_page *parent, size_t child_idx)
/* ;;: todo: name the 0x8 and 4 literals and/or generalize */
static int
_bt_dirtychild(BT_page *parent, size_t child_idx)
{
{ /* ;;: should we assert the corresponding FO is nonzero? */
/* ;;: todo: remove _bt_dirtydata */
assert(child_idx < 2048);
/* although there's nothing theoretically wrong with dirtying a dirty node,
there's probably a bug if we do it since we only dirty a node when it's
alloced after a split or CoWed */
#if 0
assert(!_bt_ischilddirty(parent, child_idx));
#endif
uint8_t *flag = &parent->head.dirty[child_idx >> 3];
*flag |= 1 << (child_idx & 0x7);
return BT_SUCC;
@@ -808,6 +796,55 @@ _bt_cleanchild(BT_page *parent, size_t child_idx)
return BT_SUCC;
}
/* _bt_split_datcopy: copy right half of left node to right node */
static int
_bt_split_datcopy(BT_page *left, BT_page *right)
{
size_t mid = BT_DAT_MAXKEYS / 2;
size_t len_b = mid * sizeof(left->datk[0]);
/* copy rhs of left to right */
memcpy(right->datk, &left->datk[mid], len_b);
/* zero rhs of left */
ZERO(&left->datk[mid], len_b); /* ;;: note, this would be unnecessary if we stored node.N */
/* the last entry in left should be the first entry in right */
left->datk[mid].va = right->datk[0].va;
/* copy rhs of left's dirty bitmap to lhs of right's. The bitmap stores one
bit per child, so the rhs begins at byte mid/8 and spans mid/8 bytes (hence
the BT_DAT_MAXKEYS % 16 == 0 assert) */
uint8_t *l = &left->head.dirty[mid / 8];
uint8_t *r = &right->head.dirty[0];
memcpy(r, l, mid / 8);
ZERO(l, mid / 8);
return BT_SUCC;
}
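/* as a concreteness check, a sketch assuming the test override
BT_DAT_MAXKEYS=16 (so mid == 8 and the rhs of the bitmap is exactly byte 1): */
#if 0
BT_page l = {0}, r = {0};
l.head.dirty[1] = 0xA5;            /* children 8,10,13,15 of left are dirty */
_bt_split_datcopy(&l, &r);
assert(r.head.dirty[0] == 0xA5);   /* now children 0,2,5,7 of right */
assert(l.head.dirty[1] == 0x00);   /* rhs of left's bitmap cleared */
#endif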
static int
_bt_dirtyshift(BT_page *node, size_t idx, size_t n)
/* shift dirty bitset at idx over by n bits */
{
assert(idx + n < 2048);
uint8_t copy[256] = {0};
/* copy the bits left of idx unchanged */
for (size_t i = 0; i < idx; i++) {
if (_bt_ischilddirty(node, i))
copy[i >> 3] |= 1 << (i & 0x7);
}
/* copy the bits at and right of idx shifted up by n; the destination bit is
recomputed from i+n so that set bits can carry across byte boundaries */
for (size_t i = idx; (i + n) < 2048; i++) {
if (_bt_ischilddirty(node, i))
copy[(i + n) >> 3] |= 1 << ((i + n) & 0x7);
}
/* memcpy the shifted array into node */
memcpy(&node->head.dirty, &copy, 256);
return BT_SUCC;
}
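/* a usage sketch (hypothetical values) showing why the destination bit is
recomputed from i+n: a shift by one across a byte boundary must carry the bit
from dirty[0] into dirty[1] */
#if 0
BT_page node = {0};
_bt_dirtychild(&node, 7);             /* sets bit 7 of dirty[0] */
_bt_dirtyshift(&node, 0, 1);          /* shift the whole bitset up one child */
assert(_bt_ischilddirty(&node, 8));   /* the bit now lives in dirty[1] */
assert(!_bt_ischilddirty(&node, 7));
#endif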
/* ;;: assert that the node is dirty when splitting */
static int
_bt_split_child(BT_state *state, BT_page *parent, size_t i, pgno_t *newchild)
@@ -834,11 +871,6 @@ _bt_split_child(BT_state *state, BT_page *parent, size_t i, pgno_t *newchild)
_bt_insertdat(lo, hi, _fo_get(state, right), parent, i);
/* dirty right child */
size_t ridx = _bt_childidx(parent, lo, hi);
assert(ridx == i+1); /* ;;: tmp? */
_bt_dirtychild(parent, ridx);
/* ;;: fix this */
*newchild = _fo_get(state, right);
@@ -875,25 +907,31 @@ _bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo,
/* duplicate */
if (llo == lo && hhi == hi) {
parent->datk[childidx].fo = fo;
_bt_dirtychild(parent, childidx);
return BT_SUCC;
}
if (llo == lo) {
_bt_datshift(parent, childidx + 1, 1);
_bt_dirtyshift(parent, childidx + 1, 1);
vaof_t oldfo = parent->datk[childidx].fo;
parent->datk[childidx].fo = fo;
parent->datk[childidx+1].va = hi;
parent->datk[childidx+1].fo = (oldfo == 0)
? 0
: oldfo + (hi - llo);
_bt_dirtychild(parent, childidx);
}
else if (hhi == hi) {
_bt_datshift(parent, childidx + 1, 1);
_bt_dirtyshift(parent, childidx + 1, 1);
parent->datk[childidx+1].va = lo;
parent->datk[childidx+1].fo = fo;
_bt_dirtychild(parent, childidx+1);
}
else {
_bt_datshift(parent, childidx + 1, 2);
_bt_dirtyshift(parent, childidx + 1, 2);
parent->datk[childidx+1].va = lo;
parent->datk[childidx+1].fo = fo;
parent->datk[childidx+2].va = hi;
@@ -902,6 +940,8 @@ _bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo,
parent->datk[childidx+2].fo = (lfo == 0)
? 0
: lfo + (hi - lva);
_bt_dirtychild(parent, childidx+1);
_bt_dirtychild(parent, childidx+2);
}
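/* a worked trace of the middle case with hypothetical values: inserting
[lo 0x30, hi 0x40) -> fo 0x200 into an existing range [0x10, 0x80) -> fo 0x100
shifts by 2 and yields three entries:
  {va 0x10, fo 0x100}   untouched lhs of the old range
  {va 0x30, fo 0x200}   the new mapping                     (dirtied)
  {va 0x40, fo 0x130}   rhs rebased: 0x100 + (0x40 - 0x10)  (dirtied)
with the old terminator {va 0x80, ...} following unchanged */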
#if DEBUG_PRINTNODE
@@ -1516,7 +1556,7 @@ _bt_insert2(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo,
/* nullcond: node is a leaf */
if (meta->depth == depth) {
/* dirty the data range */
_bt_dirtydata(node, childidx);
_bt_dirtydata(node, childidx); /* ;;: I believe this is incorrect. We should just directly modify the dirty bitset in _bt_insertdat */
/* guaranteed non-full and dirty by n-1 recursive call, so just insert */
return _bt_insertdat(lo, hi, fo, node, childidx);
}
@@ -1556,6 +1596,9 @@ _bt_insert(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo)
BT_meta *meta = state->meta_pages[state->which];
BT_page *root = _node_get(state, meta->root);
if (fo == 0x8)
bp(0);
/* the root MUST be dirty (zero checksum in metapage) */
assert(meta->chk == 0);
@@ -1585,8 +1628,6 @@ _bt_insert(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo)
/* before calling into recursive insert, handle root splitting since it's
special cased (2 allocs) */
if (N >= BT_DAT_MAXKEYS - 2) { /* ;;: remind, fix all these conditions to be - 2 */
pgno_t pg = 0;
/* the old root is now the left child of the new root */
BT_page *left = root;
BT_page *right = _bt_nalloc(state);
@@ -1594,21 +1635,20 @@ _bt_insert(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo)
/* split root's data across left and right nodes */
_bt_split_datcopy(left, right);
/* save left and right in new root's .data */
pg = _fo_get(state, left);
rootnew->datk[0].fo = pg;
rootnew->datk[0].va = 0;
pg = _fo_get(state, right);
rootnew->datk[1].fo = pg;
/* point to left and right data nodes in the new root */
rootnew->datk[0].va = B2PAGES(BLK_BASE_LEN_TOTAL);
rootnew->datk[0].fo = _fo_get(state, left);
rootnew->datk[1].va = right->datk[0].va;
rootnew->datk[1].fo = _fo_get(state, right);
rootnew->datk[2].va = UINT32_MAX;
rootnew->datk[2].fo = 0;
/* dirty new root's children */
_bt_dirtychild(rootnew, 0);
_bt_dirtychild(rootnew, 1);
/* update meta page information. (root and depth) */
pg = _fo_get(state, rootnew);
meta->root = pg;
meta->root = _fo_get(state, rootnew);
meta->depth += 1;
assert(meta->depth <= BT_MAXDEPTH);
root = rootnew;
}
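/* a sketch of the resulting shape (fo values hypothetical): the new root has
exactly two children, the old root as left and the fresh node as right:
  rootnew->datk[0] = {va B2PAGES(BLK_BASE_LEN_TOTAL), fo left}
  rootnew->datk[1] = {va right->datk[0].va,           fo right}
  rootnew->datk[2] = {va UINT32_MAX,                  fo 0}
both children dirtied, and meta->depth grown by one */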
@@ -2149,6 +2189,7 @@ _bt_state_restore_maps2(BT_state *state, BT_page *node,
/* leaf */
if (depth == maxdepth) {
for (size_t i = 0; i < N-1; i++) {
assert(_bt_ischilddirty(node, i) == 0);
vaof_t lo = node->datk[i].va;
vaof_t hi = node->datk[i+1].va;
pgno_t pg = node->datk[i].fo;
@@ -2211,7 +2252,7 @@ _bt_state_restore_maps(BT_state *state)
static int
_bt_state_meta_which(BT_state *state)
{
{ /* ;;: TODO you need to mprotect writable the current metapage */
BT_meta *m1 = state->meta_pages[0];
BT_meta *m2 = state->meta_pages[1];
int which = -1;
@@ -2361,7 +2402,7 @@ _bt_state_meta_inject_root(BT_state *state)
assert(state->nlist);
BT_meta *meta = state->meta_pages[state->which];
BT_page *root = _bt_nalloc(state);
_bt_root_new(meta, root);
_bt_root_new(root);
meta->root = _fo_get(state, root);
assert(meta->root == INITIAL_ROOTPG);
return BT_SUCC;
@@ -2637,7 +2678,7 @@ _bt_sync_leaf(BT_state *state, BT_page *node)
size_t bytelen = P2BYTES(hi - lo);
void *addr = off2addr(lo);
/* sync the page */
/* sync the data */
if (msync(addr, bytelen, MS_SYNC) != 0) {
DPRINTF("msync of leaf: %p failed with %s", addr, strerror(errno));
abort();
@@ -2648,14 +2689,7 @@ _bt_sync_leaf(BT_state *state, BT_page *node)
DPRINTF("mprotect of leaf data failed with %s", strerror(errno));
abort();
}
/* and clean the dirty bit */
_bt_cleanchild(node, i);
}
/* ;;: all data pages synced. should we now sync the node as well? No, I think
that should be the caller's responsibility */
/* ;;: it is probably faster to scan the dirty bit set and derive the datk idx
rather than iterate over the full datk array and check if it is dirty. This
was simpler to implement for now though. */
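/* one possible shape for that faster scan (a sketch, not part of this commit;
assumes GCC/Clang __builtin_ctz): */
#if 0
for (size_t b = 0; b < sizeof node->head.dirty; b++) {
uint8_t byte = node->head.dirty[b];
while (byte) {
size_t i = (b << 3) + __builtin_ctz(byte);  /* datk idx of a dirty child */
/* ... msync the data range at datk[i] ... */
byte &= byte - 1;                           /* clear the lowest set bit */
}
}
#endif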
@@ -2743,6 +2777,7 @@ _bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth)
/* recursively syncs the subtree under node. The caller is expected to sync node
itself and mark it clean. */
{
DPRINTF("== syncing node: %p", node);
int rc = 0;
size_t N = _bt_numkeys(node);
@@ -2762,21 +2797,21 @@ _bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth)
/* recursively sync the child's data */
if ((rc = _bt_sync(state, child, depth+1, maxdepth)))
return rc;
/* sync the child node */
if (msync(child, sizeof(BT_page), MS_SYNC) != 0) {
DPRINTF("msync of child node: %p failed with %s", child, strerror(errno));
abort();
}
/* unset child dirty bit */
_bt_cleanchild(node, i);
}
e:
/* zero out the dirty bitmap */
ZERO(&node->head.dirty[0], sizeof node->head.dirty);
/* all modifications done in node, mark it read-only */
if (mprotect(node, sizeof(BT_page), BT_PROT_CLEAN) != 0) {
DPRINTF("mprotect of node failed with %s", strerror(errno));
DPRINTF("mprotect of node: %p failed with %s", node, strerror(errno));
abort();
}
/* sync self */
if (msync(node, sizeof(BT_page), MS_SYNC) != 0) {
DPRINTF("msync of node: %p failed with %s", node, strerror(errno));
abort();
}


@@ -0,0 +1,283 @@
#include "btest-overrides.h"
#include "btree.h"
#include "btree.c"
#include <stdlib.h>
#include <stdio.h>
static void
_test_nodeinteg(BT_state *state, BT_findpath *path,
vaof_t lo, vaof_t hi, pgno_t pg)
{
size_t childidx = 0;
BT_page *parent = 0;
assert(SUCC(_bt_find(state, path, lo, hi)));
parent = path->path[path->depth];
childidx = path->idx[path->depth];
assert(parent->datk[childidx].fo == pg);
assert(parent->datk[childidx].va == lo);
assert(parent->datk[childidx+1].va == hi);
}
static size_t
_mlist_sizep(BT_mlistnode *head)
/* calculate the size of the mlist in pages */
{
size_t sz = 0;
while (head) {
size_t sz_p = addr2off(head->hi) - addr2off(head->lo);
sz += sz_p;
head = head->next;
}
return sz;
}
static size_t
_flist_sizep(BT_flistnode *head)
/* calculate the size of the flist in pages */
{
size_t sz = 0;
while (head) {
size_t sz_p = head->hi - head->lo;
sz += sz_p;
head = head->next;
}
return sz;
}
static BT_mlistnode *
_mlist_copy(BT_state *state)
{
BT_mlistnode *head = state->mlist;
BT_mlistnode *ret, *prev;
ret = prev = calloc(1, sizeof *ret);
memcpy(ret, head, sizeof *head);
ret->next = 0;
head = head->next;
while (head) {
BT_mlistnode *copy = calloc(1, sizeof *copy);
memcpy(copy, head, sizeof *head);
prev->next = copy;
prev = copy;
head = head->next;
}
return ret;
}
static BT_nlistnode *
_nlist_copy(BT_state *state)
{
BT_nlistnode *head = state->nlist;
BT_nlistnode *ret, *prev;
ret = prev = calloc(1, sizeof *ret);
memcpy(ret, head, sizeof *head);
ret->next = 0;
head = head->next;
while (head) {
BT_nlistnode *copy = calloc(1, sizeof *copy);
memcpy(copy, head, sizeof *head);
prev->next = copy;
prev = copy;
head = head->next;
}
return ret;
}
static BT_flistnode *
_flist_copy(BT_state *state)
{
BT_flistnode *head = state->flist;
BT_flistnode *ret, *prev;
ret = prev = calloc(1, sizeof *ret);
memcpy(ret, head, sizeof *head);
ret->next = 0;
head = head->next;
while (head) {
BT_flistnode *copy = calloc(1, sizeof *copy);
memcpy(copy, head, sizeof *head);
prev->next = copy;
prev = copy;
head = head->next;
}
return ret;
}
static int
_mlist_eq(BT_mlistnode *l, BT_mlistnode *r)
{
while (l && r) {
if (l->lo != r->lo)
bp(0);
if (l->hi != r->hi)
bp(0);
l = l->next; r = r->next;
}
if (l == 0 && r == 0)
return 1;
bp(0);
return 0;
}
static int
_nlist_eq(BT_nlistnode *l, BT_nlistnode *r)
{
while (l && r) {
if (l->lo != r->lo)
bp(0);
if (l->hi != r->hi)
bp(0);
l = l->next; r = r->next;
}
if (l == 0 && r == 0)
return 1;
bp(0);
return 0;
}
static int
_flist_eq(BT_flistnode *l, BT_flistnode *r)
{
while (l && r) {
if (l->lo != r->lo)
bp(0);
if (l->hi != r->hi)
bp(0);
l = l->next; r = r->next;
}
if (l == 0 && r == 0)
return 1;
bp(0);
return 0;
}
int main(int argc, char *argv[])
{
DPRINTF("PMA Max Storage: %lld", ((uint64_t)UINT32_MAX * BT_PAGESIZE) - BLK_BASE_LEN_TOTAL);
DPUTS("PMA Tests");
BT_findpath path = {0};
int rc = 0;
/*
test 3 pairs poorly with an overridden BT_DAT_MAXKEYS=10, leading to huge
persistent file growth. Disabling for now. Is there some way we can easily
override these values at a per-test level without altering the code?
*/
BT_page *testnode = calloc(1, sizeof *testnode);
for (size_t i = 0; i <= 10; i++) {
assert(!_bt_ischilddirty(testnode, i));
}
for (size_t i = 0; i <= 10; i++) {
_bt_dirtychild(testnode, i);
assert(_bt_ischilddirty(testnode, i));
}
for (size_t i = 0; i <= 10; i++) {
assert(_bt_ischilddirty(testnode, i));
}
assert(!_bt_ischilddirty(testnode, 11));
DPUTS("== test 3: ephemeral structure restoration");
BT_state *state3;
bt_state_new(&state3);
if (mkdir("./pmatest3", 0774) == -1)
return errno;
assert(SUCC(bt_state_open(state3, "./pmatest3", 0, 0644)));
typedef struct lohi_pair lohi_pair;
struct lohi_pair
{
BT_page *lo;
BT_page *hi;
};
/* #define ITERATIONS 1000 */
#define ITERATIONS 40
#define MAXALLOCPG 0xF
lohi_pair allocs[ITERATIONS] = {0};
int should_free[ITERATIONS] = {0};
size_t alloc_sizp = 0;
size_t mlist_sizp = _mlist_sizep(state3->mlist);
BT_meta *meta = state3->meta_pages[state3->which];
BT_page *root = _node_get(state3, meta->root);
size_t N;
/* ;;: testing huge alloc for ted */
/* bp(0); */
/* void *foo = bt_malloc(state3, 0x40000); */
bp(0);
for (size_t i = 0; i < ITERATIONS; i++) {
/* malloc a random odd number of pages <= MAXALLOCPG and store it in the allocs array */
int pages = random();
pages &= MAXALLOCPG;
pages |= 1;
allocs[i].lo = bt_malloc(state3, pages);
allocs[i].hi = allocs[i].lo + pages;
should_free[i] = random() % 2;
alloc_sizp += pages;
}
/* close/reopen */
#if 1
/* a sync is not sufficient to repro the sigsegv, so the issue is most likely
in the pma restore path */
bp(0);
bt_state_close(state3);
bt_state_new(&state3);
assert(SUCC(bt_state_open(state3, "./pmatest3", 0, 0644)));
#else
bt_sync(state3);
#endif
/* free allocations according to the values of should_free[i] */
bp(0);
for (size_t i = 0; i < ITERATIONS; i++) {
if (should_free[i]) {
bt_free(state3, allocs[i].lo, allocs[i].hi);
}
}
/* sync before cloning ephemeral data structures and closing the pma in order
to ensure the pending freelists are already merged (otherwise the clone
will differ from the reconstructed) */
bp(0);
bt_sync(state3);
/* copy ephemeral data structures */
BT_mlistnode *mlist_copy = _mlist_copy(state3);
BT_nlistnode *nlist_copy = _nlist_copy(state3);
BT_flistnode *flist_copy = _flist_copy(state3);
/* close/reopen */
bt_state_close(state3);
bt_state_new(&state3);
assert(SUCC(bt_state_open(state3, "./pmatest3", 0, 0644)));
/* assert cloned structures are equivalent to those reconstructed */
bp(0);
assert(_mlist_eq(mlist_copy, state3->mlist));
assert(_nlist_eq(nlist_copy, state3->nlist));
assert(_flist_eq(flist_copy, state3->flist));
meta = state3->meta_pages[state3->which];
BT_meta metacopy = {0};
memcpy(&metacopy, meta, sizeof metacopy);
bt_state_close(state3);
bt_state_new(&state3);
assert(SUCC(bt_state_open(state3, "./pmatest3", 0, 0644)));
/* compare copies of the ephemeral structures with the restored ephemeral
structures for equality */
meta = state3->meta_pages[state3->which];
/* ;;: fixme */
/* assert(meta->root == metacopy.root); */
/* assert(_mlist_eq(mlist_copy, state3->mlist)); */
/* assert(_nlist_eq(nlist_copy, state3->nlist)); */
/* assert(_flist_eq(flist_copy, state3->flist)); */
bt_state_close(state3);
return 0;
}