pma: fix regression on restore

There was a regression introduced with partition striping that caused problems
on restore due to node partitions incidentally getting mapped anonymous rather
than to the backing file.
This commit is contained in:
barter-simsum 2024-03-27 19:08:40 -04:00
parent a245328266
commit 93a788ec76
2 changed files with 38 additions and 45 deletions

View File

@@ -157,30 +157,6 @@ int main(int argc, char *argv[])
BT_findpath path = {0}; BT_findpath path = {0};
int rc = 0; int rc = 0;
/* broken with recent changes. Maybe because we aren't mmapping the data
ranges (pure _bt_insert) */
#if 0
DPUTS("== test 1: insert");
bt_state_new(&state1);
if (mkdir("./pmatest1", 0774) == -1)
return errno;
assert(SUCC(bt_state_open(state1, "./pmatest1", 0, 0644)));
#define LOWEST_ADDR 0x2aaa80;
vaof_t lo = LOWEST_ADDR;
vaof_t hi = 0xDEADBEEF;
pgno_t pg = 1; /* dummy value */
for (size_t i = 0; i < BT_DAT_MAXKEYS * 4; ++i) {
_bt_insert(state1, lo, hi, pg);
_test_nodeinteg(state1, &path, lo, hi, pg);
lo++; pg++;
}
bt_state_close(state1);
#endif
DPUTS("== test 2: malloc"); DPUTS("== test 2: malloc");
BT_state *state2; BT_state *state2;
@@ -340,7 +316,7 @@ int main(int argc, char *argv[])
assert(SUCC(bt_state_open(state4, "./pmatest4", 0, 0644))); assert(SUCC(bt_state_open(state4, "./pmatest4", 0, 0644)));
assert(state4->file_size_p == PMA_INITIAL_SIZE_p + PMA_GROW_SIZE_p * 2); assert(state4->file_size_p == PMA_INITIAL_SIZE_p + PMA_GROW_SIZE_p * 2);
assert(state4->flist->hi == state4->file_size_p); assert(state4->flist->next->hi == state4->file_size_p);
DPUTS("== test 5: partition striping"); DPUTS("== test 5: partition striping");

View File

@@ -699,6 +699,9 @@ _bt_root_new(BT_meta *meta, BT_page *root)
root->datk[0].fo = 0; root->datk[0].fo = 0;
root->datk[1].va = UINT32_MAX; root->datk[1].va = UINT32_MAX;
root->datk[1].fo = 0; root->datk[1].fo = 0;
/* though we've modified the data segment, we shouldn't mark these default
values dirty because when we attempt to sync them, we'll obviously run into
problems since they aren't mapped */
} }
static int static int
@@ -1513,7 +1516,7 @@ _bt_insert2(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo,
/* nullcond: node is a leaf */ /* nullcond: node is a leaf */
if (meta->depth == depth) { if (meta->depth == depth) {
/* dirty the data range */ /* dirty the data range */
_bt_dirtydata(node, childidx); _bt_dirtydata(node, childidx); /* ;;: I believe this is incorrect. We should just directly modify the dirty bitset in _bt_insertdat */
/* guaranteed non-full and dirty by n-1 recursive call, so just insert */ /* guaranteed non-full and dirty by n-1 recursive call, so just insert */
return _bt_insertdat(lo, hi, fo, node, childidx); return _bt_insertdat(lo, hi, fo, node, childidx);
} }
@@ -2209,7 +2212,7 @@ _bt_state_restore_maps(BT_state *state)
static int static int
_bt_state_meta_which(BT_state *state) _bt_state_meta_which(BT_state *state)
{ { /* ;;: TODO you need to mprotect writable the current metapage */
BT_meta *m1 = state->meta_pages[0]; BT_meta *m1 = state->meta_pages[0];
BT_meta *m2 = state->meta_pages[1]; BT_meta *m2 = state->meta_pages[1];
int which = -1; int which = -1;
@@ -2307,9 +2310,8 @@ _bt_state_read_header(BT_state *state)
static int static int
_bt_state_meta_new(BT_state *state) _bt_state_meta_new(BT_state *state)
#define INITIAL_ROOTPG 2
{ {
BT_page *p1, *p2, *root; BT_page *p1, *p2;
BT_meta meta = {0}; BT_meta meta = {0};
TRACE(); TRACE();
@@ -2324,9 +2326,6 @@ _bt_state_meta_new(BT_state *state)
/* initialize the block base array */ /* initialize the block base array */
meta.blk_base[0] = BT_NUMMETAS; meta.blk_base[0] = BT_NUMMETAS;
root = _bt_nalloc(state);
_bt_root_new(&meta, root);
/* initialize meta struct */ /* initialize meta struct */
meta.magic = BT_MAGIC; meta.magic = BT_MAGIC;
meta.version = BT_VERSION; meta.version = BT_VERSION;
@@ -2335,8 +2334,6 @@ _bt_state_meta_new(BT_state *state)
meta.fix_addr = BT_MAPADDR; meta.fix_addr = BT_MAPADDR;
meta.depth = 1; meta.depth = 1;
meta.flags = BP_META; meta.flags = BP_META;
meta.root = _fo_get(state, root);
assert(meta.root == INITIAL_ROOTPG); /* ;;: remove?? */
/* initialize the metapages */ /* initialize the metapages */
p1 = &((BT_page *)state->map)[0]; p1 = &((BT_page *)state->map)[0];
@@ -2344,9 +2341,8 @@ _bt_state_meta_new(BT_state *state)
/* copy the metadata into the metapages */ /* copy the metadata into the metapages */
memcpy(METADATA(p1), &meta, sizeof meta); memcpy(METADATA(p1), &meta, sizeof meta);
/* ;;: todo, should the second metapage actually share a .root with the /* ;;: writing to the second metapage really isn't necessary and it's probably better to leave it zeroed */
first?? */ /* memcpy(METADATA(p2), &meta, sizeof meta); */
memcpy(METADATA(p2), &meta, sizeof meta);
/* only the active metapage should be writable (first page) */ /* only the active metapage should be writable (first page) */
if (mprotect(BT_MAPADDR, BT_META_SECTION_WIDTH, BT_PROT_CLEAN) != 0) { if (mprotect(BT_MAPADDR, BT_META_SECTION_WIDTH, BT_PROT_CLEAN) != 0) {
@@ -2361,6 +2357,19 @@ _bt_state_meta_new(BT_state *state)
return BT_SUCC; return BT_SUCC;
} }
static int
_bt_state_meta_inject_root(BT_state *state)
#define INITIAL_ROOTPG 2
{
assert(state->nlist);
BT_meta *meta = state->meta_pages[state->which];
BT_page *root = _bt_nalloc(state);
_bt_root_new(meta, root);
meta->root = _fo_get(state, root);
assert(meta->root == INITIAL_ROOTPG);
return BT_SUCC;
}
#undef INITIAL_ROOTPG #undef INITIAL_ROOTPG
static void static void
@@ -2422,6 +2431,8 @@ _bt_state_map_node_segment(BT_state *state)
BYTE *targ = BT_MAPADDR + BT_META_SECTION_WIDTH; BYTE *targ = BT_MAPADDR + BT_META_SECTION_WIDTH;
size_t i; size_t i;
assert(meta->blk_base[0] == BT_NUMMETAS);
/* map all allocated node stripes as clean */ /* map all allocated node stripes as clean */
for (i = 0 for (i = 0
; i < BT_NUMPARTS && meta->blk_base[i] != 0 ; i < BT_NUMPARTS && meta->blk_base[i] != 0
@@ -2500,19 +2511,18 @@ _bt_state_load(BT_state *state)
} }
} }
if (new) {
assert(SUCC(_bt_state_meta_new(state)));
}
/* map the node segment */ /* map the node segment */
_bt_state_map_node_segment(state); _bt_state_map_node_segment(state); /* ;;: this should follow a call to _bt_state_meta_new. hmm... but that leads to a bad dependency graph. We may need to separately initialize the first partition and only call map_node_segment on restore. */
/* new db, so populate metadata */ /* new db, so populate metadata */
if (new) { if (new) {
assert(SUCC(_flist_new(state, PMA_GROW_SIZE_p))); assert(SUCC(_flist_new(state, PMA_GROW_SIZE_p)));
assert(SUCC(_nlist_new(state))); assert(SUCC(_nlist_new(state)));
assert(SUCC(_bt_state_meta_inject_root(state)));
if (!SUCC(rc = _bt_state_meta_new(state))) {
munmap(state->map, BT_ADDRSIZE);
return rc;
}
assert(SUCC(_mlist_new(state))); assert(SUCC(_mlist_new(state)));
} }
else { else {
@@ -2729,7 +2739,7 @@ _bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth)
/* do dfs */ /* do dfs */
for (size_t i = 0; i < N-1; i++) { for (size_t i = 0; i < N-1; i++) {
if (!_bt_ischilddirty(node, i)) if (!_bt_ischilddirty(node, i)) /* ;;: consider removing case until dirty logic is foolproof */
continue; /* not dirty. nothing to do */ continue; /* not dirty. nothing to do */
BT_page *child = _node_get(state, node->datk[i].fo); BT_page *child = _node_get(state, node->datk[i].fo);
@@ -2931,9 +2941,16 @@ bt_sync(BT_state *state)
BT_page *root = _node_get(state, meta->root); BT_page *root = _node_get(state, meta->root);
int rc = 0; int rc = 0;
/* sync root subtrees */
if ((rc = _bt_sync(state, root, 1, meta->depth))) if ((rc = _bt_sync(state, root, 1, meta->depth)))
return rc; return rc;
/* sync root page itself */
if (msync(root, sizeof(BT_page), MS_SYNC) != 0) {
DPRINTF("msync of root node: %p failed with %s", root, strerror(errno));
abort();
}
/* merge the pending freelists */ /* merge the pending freelists */
_pending_nlist_merge(state); _pending_nlist_merge(state);
_pending_flist_merge(state); _pending_flist_merge(state);