mirror of
https://github.com/urbit/ares.git
synced 2024-11-26 09:57:56 +03:00
pma: partition striping wip
This commit is contained in:
parent
38e17971ae
commit
05cb9d8dba
@ -307,25 +307,7 @@ static const size_t BLK_BASE_LENS_b[BT_NUMPARTS] = {
|
|||||||
BLK_BASE_LEN7,
|
BLK_BASE_LEN7,
|
||||||
};
|
};
|
||||||
|
|
||||||
#define BLK_BASE_OFF0 ((size_t)BT_META_SECTION_WIDTH)
|
static_assert(PMA_GROW_SIZE_b >= (BLK_BASE_LEN0 + BT_META_LEN));
|
||||||
#define BLK_BASE_OFF1 (BLK_BASE_OFF0 + BLK_BASE_LEN0)
|
|
||||||
#define BLK_BASE_OFF2 (BLK_BASE_OFF1 + BLK_BASE_LEN1)
|
|
||||||
#define BLK_BASE_OFF3 (BLK_BASE_OFF2 + BLK_BASE_LEN2)
|
|
||||||
#define BLK_BASE_OFF4 (BLK_BASE_OFF3 + BLK_BASE_LEN3)
|
|
||||||
#define BLK_BASE_OFF5 (BLK_BASE_OFF4 + BLK_BASE_LEN4)
|
|
||||||
#define BLK_BASE_OFF6 (BLK_BASE_OFF5 + BLK_BASE_LEN5)
|
|
||||||
#define BLK_BASE_OFF7 (BLK_BASE_OFF6 + BLK_BASE_LEN6)
|
|
||||||
|
|
||||||
static const size_t BLK_BASE_OFFS_b[BT_NUMPARTS] = {
|
|
||||||
BLK_BASE_OFF0,
|
|
||||||
BLK_BASE_OFF1,
|
|
||||||
BLK_BASE_OFF2,
|
|
||||||
BLK_BASE_OFF3,
|
|
||||||
BLK_BASE_OFF4,
|
|
||||||
BLK_BASE_OFF5,
|
|
||||||
BLK_BASE_OFF6,
|
|
||||||
BLK_BASE_OFF7,
|
|
||||||
};
|
|
||||||
|
|
||||||
typedef struct BT_mlistnode BT_mlistnode;
|
typedef struct BT_mlistnode BT_mlistnode;
|
||||||
struct BT_mlistnode {
|
struct BT_mlistnode {
|
||||||
@ -358,6 +340,7 @@ struct BT_state {
|
|||||||
int data_fd;
|
int data_fd;
|
||||||
char *path;
|
char *path;
|
||||||
void *fixaddr;
|
void *fixaddr;
|
||||||
|
/* ;;: TODO: refactor ->map to be a (BT_page *) */
|
||||||
BYTE *map;
|
BYTE *map;
|
||||||
BT_meta *meta_pages[2]; /* double buffered */
|
BT_meta *meta_pages[2]; /* double buffered */
|
||||||
pgno_t file_size_p; /* the size of the pma file in pages */
|
pgno_t file_size_p; /* the size of the pma file in pages */
|
||||||
@ -428,7 +411,7 @@ _node_get(BT_state *state, pgno_t pgno)
|
|||||||
/* ;;: hmm. is there something wrong here? No, I don't think so.
|
/* ;;: hmm. is there something wrong here? No, I don't think so.
|
||||||
|
|
||||||
On resume (reading a persistent file):
|
On resume (reading a persistent file):
|
||||||
|
|
||||||
1) mmap the node partitions.
|
1) mmap the node partitions.
|
||||||
- (read the offset stored in meta->blk_base)
|
- (read the offset stored in meta->blk_base)
|
||||||
- mmap the offset + corresponding length of the pma file next to the end
|
- mmap the offset + corresponding length of the pma file next to the end
|
||||||
@ -447,9 +430,9 @@ _node_get(BT_state *state, pgno_t pgno)
|
|||||||
*** page offset into the memory arena and calling _node_get on it. That
|
*** page offset into the memory arena and calling _node_get on it. That
|
||||||
*** would technically work for the first partition. It will NOT work for any
|
*** would technically work for the first partition. It will NOT work for any
|
||||||
*** other partition. Not sure if we are doing that anywhere currently.
|
*** other partition. Not sure if we are doing that anywhere currently.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
return FO2PA(state->map, pgno);
|
return FO2PA(state->map, pgno);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -457,6 +440,7 @@ _node_get(BT_state *state, pgno_t pgno)
|
|||||||
static pgno_t
|
static pgno_t
|
||||||
_fo_get(BT_state *state, BT_page *node)
|
_fo_get(BT_state *state, BT_page *node)
|
||||||
{
|
{
|
||||||
|
/* ;;: This may need to be fixed to accommodate partition striping */
|
||||||
uintptr_t vaddr = (uintptr_t)node;
|
uintptr_t vaddr = (uintptr_t)node;
|
||||||
uintptr_t start = (uintptr_t)state->map;
|
uintptr_t start = (uintptr_t)state->map;
|
||||||
return BY2FO(vaddr - start);
|
return BY2FO(vaddr - start);
|
||||||
@ -2422,6 +2406,54 @@ _freelist_restore(BT_state *state)
|
|||||||
_freelist_restore2(state, root, 1, meta->depth);
|
_freelist_restore2(state, root, 1, meta->depth);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
_bt_state_map_node_segment(BT_state *state)
|
||||||
|
{
|
||||||
|
BT_meta *meta = state->meta_pages[state->which];
|
||||||
|
BYTE *targ = BT_MAPADDR + BT_META_SECTION_WIDTH;
|
||||||
|
size_t i;
|
||||||
|
|
||||||
|
/* map all allocated node stripes as clean */
|
||||||
|
for (i = 0
|
||||||
|
; i < BT_NUMPARTS && meta->blk_base[i] != 0
|
||||||
|
; i++) {
|
||||||
|
pgno_t partoff_p = meta->blk_base[i];
|
||||||
|
size_t partoff_b = P2BYTES(partoff_p);
|
||||||
|
size_t partlen_b = BLK_BASE_LENS_b[i];
|
||||||
|
|
||||||
|
if (targ != mmap(targ,
|
||||||
|
partlen_b,
|
||||||
|
BT_PROT_CLEAN,
|
||||||
|
BT_FLAG_CLEAN,
|
||||||
|
state->data_fd,
|
||||||
|
partoff_b)) {
|
||||||
|
DPRINTF("mmap: failed to map node stripe %zu, addr: 0x%p, file offset (bytes): 0x%zX, errno: %s",
|
||||||
|
i, targ, partoff_b, strerror(errno));
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* move the target address ahead of the mapped partition */
|
||||||
|
targ += partlen_b;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* map the rest of the node segment as free */
|
||||||
|
for (; i < BT_NUMPARTS; i++) {
|
||||||
|
assert(meta->blk_base[i] == 0);
|
||||||
|
size_t partlen_b = BLK_BASE_LENS_b[i];
|
||||||
|
if (targ != mmap (targ,
|
||||||
|
partlen_b,
|
||||||
|
BT_PROT_FREE,
|
||||||
|
BT_FLAG_FREE,
|
||||||
|
0, 0)) {
|
||||||
|
DPRINTF("mmap: failed to map unallocated node segment, addr: 0x%p, errno: %s",
|
||||||
|
targ, strerror(errno));
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
|
||||||
|
targ += partlen_b;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
_bt_state_load(BT_state *state)
|
_bt_state_load(BT_state *state)
|
||||||
{
|
{
|
||||||
@ -2432,16 +2464,19 @@ _bt_state_load(BT_state *state)
|
|||||||
|
|
||||||
TRACE();
|
TRACE();
|
||||||
|
|
||||||
/* map first node stripe (along with metapages) as read only */
|
/* map the metapages */
|
||||||
/* ;;: todo: after handling the first node stripe which always exists, read
|
|
||||||
the current metapage's blk_base and appropriately mmap each partition */
|
|
||||||
state->map = mmap(BT_MAPADDR,
|
state->map = mmap(BT_MAPADDR,
|
||||||
BT_META_SECTION_WIDTH + BLK_BASE_LEN0,
|
BT_META_SECTION_WIDTH,
|
||||||
BT_PROT_CLEAN,
|
BT_PROT_CLEAN,
|
||||||
BT_FLAG_CLEAN,
|
BT_FLAG_CLEAN,
|
||||||
state->data_fd,
|
state->data_fd,
|
||||||
0);
|
0);
|
||||||
|
|
||||||
|
if (state->map != BT_MAPADDR) {
|
||||||
|
DPRINTF("mmap: failed to map at addr %p, errno: %s", BT_MAPADDR, strerror(errno));
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
|
||||||
p = (BT_page *)state->map;
|
p = (BT_page *)state->map;
|
||||||
state->meta_pages[0] = METADATA(p);
|
state->meta_pages[0] = METADATA(p);
|
||||||
state->meta_pages[1] = METADATA(p + 1);
|
state->meta_pages[1] = METADATA(p + 1);
|
||||||
@ -2456,21 +2491,8 @@ _bt_state_load(BT_state *state)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (state->map != BT_MAPADDR) {
|
/* map the node segment */
|
||||||
DPRINTF("mmap: failed to map at addr %p, errno: %s", BT_MAPADDR, strerror(errno));
|
_bt_state_map_node_segment(state);
|
||||||
abort();
|
|
||||||
}
|
|
||||||
|
|
||||||
BYTE *nullspace_addr = BT_MAPADDR + (BT_META_SECTION_WIDTH + BLK_BASE_LEN0);
|
|
||||||
size_t nullspace_len = BLK_BASE_LEN_TOTAL - (BT_META_SECTION_WIDTH + BLK_BASE_LEN0);
|
|
||||||
if (nullspace_addr != mmap(nullspace_addr,
|
|
||||||
nullspace_len,
|
|
||||||
BT_PROT_FREE,
|
|
||||||
BT_FLAG_FREE,
|
|
||||||
0, 0)) {
|
|
||||||
DPRINTF("mmap: failed to map at addr %p, errno: %s", nullspace_addr, strerror(errno));
|
|
||||||
abort();
|
|
||||||
}
|
|
||||||
|
|
||||||
/* new db, so populate metadata */
|
/* new db, so populate metadata */
|
||||||
if (new) {
|
if (new) {
|
||||||
@ -3250,3 +3272,37 @@ _bt_printnode(BT_page *node)
|
|||||||
fprintf(stderr, "[%5zu] %10x %10x\n", i, node->datk[i].va, node->datk[i].fo);
|
fprintf(stderr, "[%5zu] %10x %10x\n", i, node->datk[i].va, node->datk[i].fo);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
|
||||||
|
re: partition striping, I find the following somewhat confusing.
|
||||||
|
|
||||||
|
A pgno for a node could be either:
|
||||||
|
|
||||||
|
1) Its actual pgno in the persistent file
|
||||||
|
|
||||||
|
2) merely a page offset into the memory arena
|
||||||
|
|
||||||
|
The pgno that a parent node stores for a child need not be (1) if the memory
|
||||||
|
maps are properly restored from the blk_base array in the metapage. Right?
|
||||||
|
|
||||||
|
So, on startup:
|
||||||
|
|
||||||
|
before traversing nodes, restore the memory map by mapping each partition
|
||||||
|
successively after the other. Do this for all non-zero page offsets in
|
||||||
|
blk_base.
|
||||||
|
|
||||||
|
If this is done, then _node_get and _fo_get can remain largely unchanged.
|
||||||
|
|
||||||
|
Is there any reason this won't work?
|
||||||
|
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
If we have to do (2) for some reason, then _node_get and _fo_get /will/ have
|
||||||
|
to read the blk_base array and appropriately translate using the offsets and
|
||||||
|
partition sizes.
|
||||||
|
|
||||||
|
is _bt_data_cow a problem?
|
||||||
|
|
||||||
|
*/
|
||||||
|
Loading…
Reference in New Issue
Block a user