/*
 * Mirrored copy, synced 2024-12-02 12:16:44 +03:00 (2894 lines, 77 KiB).
 *
 * Most recent change:
 * - initialize root to the page offset beyond the first node stripe (not
 *   zero). Is this improper? We initialize the mlist at the first va in the
 *   root. Maybe that part should handle the node stripes instead? Don't know
 *   if it actually matters.
 * - fix insert tests
 * - fix state->meta_pages initialization
 * - other minor fixes
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <stdint.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <assert.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <inttypes.h>
#include "btree.h"
#include "lib/checksum.h"
/* fixed-width integer aliases used throughout the btree */
typedef uint32_t pgno_t; /* a page number */
typedef uint32_t vaof_t; /* a virtual address offset */
typedef uint32_t flag_t; /* NOTE(review): no use of flag_t is visible in this part of the file */
typedef unsigned char BYTE; /* a raw byte; BYTE pointers are used for byte-granular address arithmetic */
//// ===========================================================================
//// tmp tmp tmp tmp tmp
/* ;;: remove -- for debugging */
/*
  bp(X) where X is false will raise a SIGTRAP. If the process is being run
  inside a debugger, this can be caught and ignored. It's equivalent to a
  breakpoint. If run without a debugger, it will dump core, like an assert.

  When DEBUG is not defined, bp(X) expands to a no-op and X is not evaluated.
*/
#ifdef DEBUG
#if defined(__i386__) || defined(__x86_64__)
#define bp(x) do { if(!(x)) __asm__ volatile("int $3"); } while (0)
#elif defined(__thumb__)
#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xde01"); } while (0)
#elif defined(__aarch64__)
#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xd4200000"); } while (0)
#elif defined(__arm__)
#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xe7f001f0"); } while (0)
#else
/* no break instruction known for this architecture: refuse to build */
#error "debugger break instruction unimplemented"
#endif
#else
#define bp(x) ((void)(0))
#endif
/* coalescing of memory freelist currently prohibited since we haven't
implemented coalescing of btree nodes (necessary) */
#define CAN_COALESCE 0
/* ;;: remove once confident in logic and delete all code dependencies on
state->node_freelist */
/* ZERO: set the n bytes starting at s to zero */
#define ZERO(s, n) memset((s), 0, (n))
/* S7: paste exactly seven tokens into one.
   S6..S2, S: each forwards to the next-wider macro, so S(A, ...) pastes between
   two and seven tokens; slots that receive no argument paste as nothing.
   NOTE(review): invoking the chain with fewer than seven tokens leaves the
   variadic part empty, which presumably relies on a compiler that accepts
   that -- confirm for the toolchains in use. */
#define S7(A, B, C, D, E, F, G) A##B##C##D##E##F##G
#define S6(A, B, C, D, E, F, ...) S7(A, B, C, D, E, F, __VA_ARGS__)
#define S5(A, B, C, D, E, ...) S6(A, B, C, D, E, __VA_ARGS__)
#define S4(A, B, C, D, ...) S5(A, B, C, D, __VA_ARGS__)
#define S3(A, B, C, ...) S4(A, B, C, __VA_ARGS__)
#define S2(A, B, ...) S3(A, B, __VA_ARGS__)
#define S(A, ...) S2(A, __VA_ARGS__)
/* size helpers: convert a count of KiB/MiB/GiB/TiB/PiB to bytes (as size_t) */
#define KBYTES(x) ((size_t)(x) << 10)
#define MBYTES(x) ((size_t)(x) << 20)
#define GBYTES(x) ((size_t)(x) << 30)
#define TBYTES(x) ((size_t)(x) << 40)
#define PBYTES(x) ((size_t)(x) << 50)
/* 4K page in bytes */
#define P2BYTES(x) ((size_t)(x) << BT_PAGEBITS)
/* the opposite of P2BYTES */
#define B2PAGES(x) ((size_t)(x) >> BT_PAGEBITS)
/* shorthand for the compiler's packed-layout attribute */
#define __packed __attribute__((__packed__))
/* UNUSED: evaluate x and discard the result (silences unused warnings) */
#define UNUSED(x) ((void)(x))
/* DPRINTF: printf-style debug trace to stderr, prefixed with the calling
   function and line number. Requires at least one argument after fmt.
   Compiles to a no-op (arguments are not evaluated) unless DEBUG is defined. */
#ifdef DEBUG
# define DPRINTF(fmt, ...)                                              \
  fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, __VA_ARGS__)
#else
# define DPRINTF(fmt, ...) ((void) 0)
#endif
/* DPUTS: debug-print a single string */
#define DPUTS(arg) DPRINTF("%s", arg)
/* TRACE: emit a bare function:line marker; any arguments are ignored */
#define TRACE(...) DPUTS("")
/* BT_SUCC: the return code that denotes success */
#define BT_SUCC 0
/* SUCC: true when return code x equals BT_SUCC */
#define SUCC(x) ((x) == BT_SUCC)
/* BT_MAPADDR: base address of the mapping; S() pastes the three digit groups
   into the single literal 0x100000000000 */
#define BT_MAPADDR ((void *) S(0x1000,0000,0000))
static inline vaof_t
addr2off(void *p)
/* convert a pointer into a 32-bit page offset */
uintptr_t pu = (uintptr_t)p;
assert((pu & ((1 << BT_PAGEBITS) - 1)) == 0); /* p must be page-aligned */
uintptr_t off = pu - (uintptr_t)BT_MAPADDR;
return (vaof_t)(pu >> BT_PAGEBITS);
/* off2addr: convert a 32-bit page offset into a pointer

   Scales off by the page size and rebases it onto BT_MAPADDR. */
static inline void *
off2addr(vaof_t off)
{
  const uintptr_t base = (uintptr_t)BT_MAPADDR;
  const uintptr_t bytes = (uintptr_t)off << BT_PAGEBITS;
  return (void *)(bytes + base);
}
#define BT_NUMMETAS 2                     /* 2 metapages */
#define PMA_GROW_SIZE (BT_PAGESIZE * 1024)
#define BT_NOPAGE 0

/* FO2BY: file offset to byte
   get byte INDEX into pma map from file offset (a page number) */
#define FO2BY(fo)                               \
  ((uint64_t)(fo) << BT_PAGEBITS)

/* BY2FO: byte to file offset
   get pgno from byte INDEX into pma map */
#define BY2FO(p)                                \
  ((pgno_t)((p) >> BT_PAGEBITS))

/* FO2PA: file offset to page
   get a reference to a BT_page from a file offset. map is indexed in bytes.

   ;;: can simplify: presumably by indexing the map as an array of BT_page
   rather than going through a byte index */
#define FO2PA(map, fo)                          \
  ((BT_page *)&(map)[FO2BY(fo)])
/* NMEMB: number of members in array, a
   a must be a real array (not a pointer). The operands of the division were
   previously swapped, which yields 0 for any array of more than one element. */
#define NMEMB(a)                                \
  (sizeof(a) / sizeof((a)[0]))
/* offsetof: byte offset of member m within struct type st.
   Only defined here when no header has provided it already, so that including
   <stddef.h> does not trigger a macro redefinition. */
#ifndef offsetof
#define offsetof(st, m)                         \
  __builtin_offsetof(st, m)
#endif
//// ===========================================================================
//// btree types
/*
  btree page header. All pages share this header, though for metapages you can
  expect it to be zeroed out.
*/
typedef struct BT_pageheader BT_pageheader;
struct BT_pageheader {
/* one bit per child index: 256 bytes * 8 = 2048 bits. Bit (i & 7) of byte
   (i >> 3) is the dirty flag of child i. */
uint8_t dirty[256]; /* dirty bit map */
} __packed;
/*
  btree key/value data format

  BT_dat is used to provide a view of the data section in a BT_page where data
  is stored like:

        va  fo  va  fo
  bytes 0   4   8   12

  The convenience macros given an index into the data array do the following:
  BT_dat_lo(i) returns ith   va (low addr)
  BT_dat_hi(i) returns i+1th va (high addr)
  BT_dat_fo(i) returns ith   file offset
*/
typedef union BT_dat BT_dat;
union BT_dat {
  vaof_t va;                    /* virtual address offset */
  pgno_t fo;                    /* file offset */
};
/* like BT_dat but when a struct is more useful than a union: one va/fo pair,
   i.e. two consecutive BT_dat slots */
typedef struct BT_kv BT_kv;
struct BT_kv {
  vaof_t va;                    /* virtual address offset (key) */
  pgno_t fo;                    /* file offset (value) */
};
/* ;;: todo, perhaps rather than an index, return the data directly and typecast?? */
/* the three macros below map a key/value index i to an index into the flat
   BT_dat array: entry i occupies slots 2i (va) and 2i+1 (fo); its high bound is
   the va of entry i+1 at slot 2i+2 */
#define BT_dat_lo(i) ((i) * 2)
#define BT_dat_fo(i) ((i) * 2 + 1)
#define BT_dat_hi(i) ((i) * 2 + 2)
/* placeholders: these currently expand to nothing */
#define BT_dat_lo2(I, dat)
#define BT_dat_fo2(I, dat)
#define BT_dat_hi2(I, dat)
/* BT_dat_maxva: pointer to highest va in page data section */
/* NOTE(review): this addresses datd[2 * BT_DAT_MAXKEYS]; whether that slot is
   inside the array depends on BT_DAT_MAXKEYS, which is not defined in this part
   of the file -- confirm it is not one past the end */
#define BT_dat_maxva(p) \
((void *)&(p)->datd[BT_dat_lo(BT_DAT_MAXKEYS)])
/* BT_dat_maxfo: pointer to highest fo in page data section */
#define BT_dat_maxfo(p) \
((void *)&(p)->datd[BT_dat_fo(BT_DAT_MAXVALS)])
/* BT_DAT_MAXBYTES: bytes left in a page for data once the header is removed */
#define BT_DAT_MAXBYTES (BT_PAGESIZE - sizeof(BT_pageheader))
/* BT_DAT_MAXENTRIES: number of BT_dat slots that fit in the data section */
#define BT_DAT_MAXENTRIES (BT_DAT_MAXBYTES / sizeof(BT_dat))
/* #define BT_DAT_MAXKEYS 10 */
/* slots are consumed in va/fo pairs, so the slot count must be even */
static_assert(BT_DAT_MAXENTRIES % 2 == 0);
/* we assume off_t is 64 bit */
static_assert(sizeof(off_t) == sizeof(uint64_t));
/*
  all pages in the memory arena consist of a header and data section

  The data section is exposed through three overlapping views of the same
  bytes; datk and datc are zero-length arrays that alias datd.
*/
typedef struct BT_page BT_page;
struct BT_page {
  BT_pageheader head;                    /* header */
  union {                                /* data section */
    BT_dat      datd[BT_DAT_MAXENTRIES]; /* union view */
    BT_kv       datk[0];                 /* struct view */
    BYTE        datc[0];                 /* byte-level view */
  };
};
/* header + data must fill a page exactly, with no partial trailing slot */
static_assert(sizeof(BT_page) == BT_PAGESIZE);
static_assert(BT_DAT_MAXBYTES % sizeof(BT_dat) == 0);
#define BT_VERSION 1
/*
  a meta page is like any other page, but the data section is used to store
  additional information
*/
/* lengths of the striped node partitions: each stripe is four times the size
   of the previous one, starting from BLK_BASE_LEN0 */
#define BLK_BASE_LEN1 (BLK_BASE_LEN0 * 4)
#define BLK_BASE_LEN2 (BLK_BASE_LEN1 * 4)
#define BLK_BASE_LEN3 (BLK_BASE_LEN2 * 4)
#define BLK_BASE_LEN4 (BLK_BASE_LEN3 * 4)
#define BLK_BASE_LEN5 (BLK_BASE_LEN4 * 4)
#define BLK_BASE_LEN6 (BLK_BASE_LEN5 * 4)
#define BLK_BASE_LEN7 (BLK_BASE_LEN6 * 4)
/* BT_meta: the metadata record kept in a metapage's data section (see the
   static_assert below: it must fit in one data section) */
typedef struct BT_meta BT_meta;
struct BT_meta {
#define BT_NUMROOTS 32
uint32_t magic;
uint32_t version;
pgno_t last_pg; /* last page used in file */
uint32_t _pad0;
uint64_t txnid;
void *fix_addr; /* fixed addr of btree */
pgno_t blk_base[8]; /* block base array for striped node partition */
/* ;;: for the blk_base array, code may be simpler if this were an array of
BT_page *. */
uint8_t blk_cnt; /* currently highest valid block base */
uint8_t depth; /* tree depth */
#define BP_META ((uint8_t)0x02)
uint8_t flags;
uint8_t _pad1;
pgno_t root;
/* 64bit alignment manually checked - 72 bytes total above */
uint64_t roots[BT_NUMROOTS]; /* for usage by ares */
uint32_t chk; /* checksum */
} __packed;
static_assert(sizeof(BT_meta) <= BT_DAT_MAXBYTES);
/* the length of the metapage up to but excluding the checksum */
#define BT_META_LEN (offsetof(BT_meta, chk))
/* NOTE(review): this spans from roots to the end of the struct, so it also
   counts the trailing chk field -- confirm that is intended */
#define BT_roots_bytelen (sizeof(BT_meta) - offsetof(BT_meta, roots))
/* BT_mlistnode: one free chunk in the memory freelist (singly linked) */
typedef struct BT_mlistnode BT_mlistnode;
struct BT_mlistnode {
  void *va;                     /* virtual address */
  size_t sz;                    /* size in pages */
  BT_mlistnode *next;           /* next freelist node */
};
/* BT_nlistnode: one free run of btree node pages in the node freelist */
typedef struct BT_nlistnode BT_nlistnode;
struct BT_nlistnode {
  BT_page *va;                  /* virtual address */
  size_t sz;                    /* size in pages */
  BT_nlistnode *next;           /* next freelist node */
};
/* BT_flistnode: one free run of pages in the pma file freelist */
typedef struct BT_flistnode BT_flistnode;
struct BT_flistnode {
  pgno_t pg;                    /* pgno - an offset in the persistent file */
  size_t sz;                    /* size in pages */
  BT_flistnode *next;           /* next freelist node */
};
/* macro to access the metadata stored in a page's data section */
/* p is a BT_page pointer; its byte view (datc) is reinterpreted as a BT_meta */
#define METADATA(p) ((BT_meta *)(void *)(p)->datc)
/* BT_state: the in-memory state of one open btree / pma file */
typedef struct BT_state BT_state;
struct BT_state {
  int           data_fd;
  char         *path;
  void         *fixaddr;
  BYTE         *map;            /* byte-addressed view of the mapping */
  BT_meta      *meta_pages[2];  /* double buffered */
  /* ;;: note, while meta_pages[which]->root stores a pgno, we may want to just
       store a pointer to root in state in addition to avoid a _node_find on it
       every time it's referenced */
  /* BT_page      *root; */
  off_t         file_size;      /* the size of the pma file in bytes */
  pgno_t        frontier;       /* last non-free page in use by pma (exclusive) */
  unsigned int  which;          /* which double-buffered db are we using? */
  BT_nlistnode *nlist;          /* node freelist */
  BT_mlistnode *mlist;          /* memory freelist */
  BT_flistnode *flist;          /* pma file freelist */
  BT_flistnode *pending_flist;  /* file pages freed but not yet merged into flist */
  BT_nlistnode *pending_nlist;  /* node pages freed but not yet merged into nlist */
};

/*
  ;;: wrt to frontier: if you need to allocate space for data, push the frontier
     out by that amount allocated. If you're allocating a new stripe, push it to
     the end of that stripe.
*/
//// ===========================================================================
//// btree internal routines
/* forward declarations for routines that are used before they are defined */
static void _bt_printnode(BT_page *node); /* ;;: tmp */
static int
_bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo,
BT_page *parent, size_t childidx); /* ;;: tmp */
#define BT_MAXDEPTH 4           /* ;;: todo derive it */
/* BT_findpath: the trail recorded by a lookup. For each depth visited, path[]
   holds the node and idx[] the index chosen within it. */
typedef struct BT_findpath BT_findpath;
struct BT_findpath {
  BT_page *path[BT_MAXDEPTH];
  size_t   idx[BT_MAXDEPTH];
  uint8_t  depth;
};
/* _node_get: get a pointer to a node stored at file offset pgno

   Returns 0 for pgno 0 and 1, which hold the metapages rather than nodes.
   Asserts that the node lies within the first 2M of the file. */
static BT_page *
_node_get(BT_state *state, pgno_t pgno)
{
  /* TODO: eventually, once we can store more than 2M of nodes, this will need
     to reference the meta page's blk_base array to determine where a node is
     mapped. i.e:

     - receive pgno
     - find first pgno in blk_base that exceeds pgno : i
     - sector that contains node is i-1
     - appropriately offset into i-1th fixed size partition: 2M, 8M, 16M, ...
  */

  /* for now, this works because the 2M sector is at the beginning of both the
     memory arena and pma file
  */
  if (pgno <= 1) return 0;      /* no nodes stored at 0 and 1 (metapages) */

  /* TODO: when partition striping is implemented, a call beyond the furthest
     block base should result in the allocation of a new block base */
  assert((pgno * BT_PAGESIZE) < MBYTES(2));
  return FO2PA(state->map, pgno);
}
/* ;;: I don't think we should need this if _bt_nalloc also returns a disc offset */
/* _fo_get: the inverse of _node_get. Converts a node pointer inside the map
   back into its page number, from its byte distance to the start of the map. */
static pgno_t
_fo_get(BT_state *state, BT_page *node)
{
  const uintptr_t map_base = (uintptr_t)state->map;
  const uintptr_t node_addr = (uintptr_t)node;
  return BY2FO(node_addr - map_base);
}
/* _bt_nalloc: allocate one node page from the node freelist

   Takes the first page of the first non-empty freelist entry. Returns a
   pointer to the page, or 0 when the freelist has no page left (callers such
   as _bt_split_child test for 0). */
static BT_page *
_bt_nalloc(BT_state *state)
{
  /* TODO: maybe change _bt_nalloc to return both a file and a node offset as
     params to the function and make actual return value an error code. This is
     to avoid forcing some callers to immediately use _fo_get */
  for (BT_nlistnode **n = &state->nlist; *n; n = &(*n)->next) {
    BT_nlistnode *cur = *n;

    /* ;;: this assert is temporary. When partition striping is
         implemented. Rather than assert, conditionally check if we're at the
         end of the current stripe. If so, allocate a new region and append that
         to the freelist. */
    /* checked against the entry we are about to allocate from (previously the
       list head was checked on every iteration) */
    size_t width = (BYTE *)cur->va - state->map;
    /* ;;: asserting 2M for now since partition striping is unimplemented */
    assert(width < MBYTES(2));

    if (cur->sz == 0)
      continue;                 /* nothing to take from this entry */

    BT_page *ret = cur->va;
    if (cur->sz == 1) {
      /* perfect fit: unlink the entry */
      /* NOTE(review): the unlinked entry is not freed here because how nlist
         entries are allocated is not visible in this part of the file */
      *n = cur->next;
    }
    else {
      /* larger than necessary: shrink the node */
      cur->sz -= 1;
      cur->va = cur->va + 1;
    }
    return ret;
  }

  /* freelist exhausted */
  return 0;
}
/* _node_cow: copy-on-write a node

   Allocates a fresh node, copies node's data section into it and stores the
   new node's page number in *pgno. The page header (dirty bitmap) is not
   copied.

   Returns BT_SUCC, or ENOMEM when no node could be allocated (in which case
   *pgno is left untouched). */
static int
_node_cow(BT_state *state, BT_page *node, pgno_t *pgno)
{
  BT_page *ret = _bt_nalloc(state);
  if (ret == 0)
    return ENOMEM;

  /* copy exactly one data section. The previous length, sizeof(BT_kv) *
     BT_DAT_MAXENTRIES, was twice that and ran past the end of both pages */
  memcpy(ret->datd, node->datd, sizeof ret->datd);
  *pgno = _fo_get(state, ret);
  return BT_SUCC;
}
/* binary search a page's data section for a va. Returns a pointer to the found BT_dat */
/* Currently a linear scan over the BT_DAT_MAXKEYS key slots. Returns the
   BT_kv whose va equals the requested va, or 0 when there is none.

   The scan is bounded by index, like the other routines in this file; the
   previous pointer comparison was inclusive and examined one slot beyond the
   last key. */
static void *
_bt_bsearch(BT_page *page, vaof_t va)
{
  /* ;;: todo: actually bsearch rather than linear */
  for (size_t i = 0; i < BT_DAT_MAXKEYS; i++) {
    BT_kv *kv = &page->datk[i];
    if (kv->va == va)
      return kv;
  }

  return 0;
}
/* _bt_childidx: looks up the child index in a parent node

   Returns the first index i such that [datk[i].va, datk[i+1].va] encloses
   [lo, hi]. If not found, return is BT_DAT_MAXKEYS (callers compare against
   that value). */
static size_t
_bt_childidx(BT_page *node, vaof_t lo, vaof_t hi)
{
  for (size_t i = 0; i < BT_DAT_MAXKEYS - 1; i++) {
    vaof_t llo = node->datk[i].va;
    vaof_t hhi = node->datk[i+1].va;
    if (llo <= lo && hhi >= hi)
      return i;
  }

  /* no enclosing range */
  return BT_DAT_MAXKEYS;
}
/* ;;: find returns a path to nodes that things should be in if they are there. */
/* a leaf has a meta page depth eq to findpath depth */
/* _bt_find2: recursive worker for _bt_find

   Starting at node (which sits at path->depth), records in path the node and
   the child index enclosing [lo, hi] at every level down to maxdepth.

   Returns BT_SUCC when the leaf level was reached, ENOENT when some level has
   no enclosing range or path->depth already exceeds maxdepth. */
static int
_bt_find2(BT_state *state,
          BT_page *node,
          BT_findpath *path,
          uint8_t maxdepth,
          vaof_t lo,
          vaof_t hi)
{
  /* ;;: meta node stores depth (node or leaf?)
     look at root node and binsearch BT_dats where low is <= lo and high is >= hi
     If at depth of metapage (a leaf), then done
     otherwise grab node, increment depth, save node in path
  */
  if (path->depth > maxdepth)
    return ENOENT;

  assert(node != 0);
  /* path->path and path->idx only have BT_MAXDEPTH slots */
  assert(path->depth < BT_MAXDEPTH);

  size_t i = _bt_childidx(node, lo, hi);
  if (i == BT_DAT_MAXKEYS)
    return ENOENT;

  /* save this level, whether it is a leaf or a branch */
  path->idx[path->depth] = i;
  path->path[path->depth] = node;

  /* leaf: done */
  if (path->depth == maxdepth)
    return BT_SUCC;

  /* then branch: descend one level. Without the increment the recursion
     would never reach maxdepth. */
  BT_page *child = _node_get(state, node->datk[i].fo);
  path->depth++;
  return _bt_find2(state, child, path, maxdepth, lo, hi);
}
/* _bt_root_new: initialise a fresh root node with one free range (fo 0)
   running from just past the first node stripe up to UINT32_MAX */
static void
_bt_root_new(BT_meta *meta, BT_page *root)
{
  BT_kv *first = &root->datk[0];
  BT_kv *last = &root->datk[1];

  /* The first usable address in the PMA is just beyond the first node stripe */
  first->va = meta->blk_base[0] + BLK_BASE_LEN0;
  first->fo = 0;

  last->va = UINT32_MAX;
  last->fo = 0;
}
/* _bt_find: look up the range [lo, hi] starting from the current root

   Resets path->depth to 1 and delegates to _bt_find2, using the tree depth
   stored in the active metapage as the leaf depth. */
static int
_bt_find(BT_state *state, BT_findpath *path, vaof_t lo, vaof_t hi)
{
  BT_meta *active = state->meta_pages[state->which];

  path->depth = 1;
  return _bt_find2(state,
                   _node_get(state, active->root),
                   path,
                   active->depth,
                   lo,
                   hi);
}
/* _bt_findpath_is_root: non-zero when the recorded path depth is 0 */
static int
_bt_findpath_is_root(BT_findpath *path)
{
  assert(path != 0);
  return (path->depth == 0) ? 1 : 0;
}
/* _bt_numkeys: find next empty space in node's data section. Returned as
   index into node->datk. If the node is full, return is BT_DAT_MAXKEYS */
/* The scan starts at index 1, so the va at index 0 is never treated as an
   empty slot and the result is at least 1. */
static size_t
_bt_numkeys(BT_page *node)
{
  size_t n = 1;
  while (n < BT_DAT_MAXKEYS && node->datk[n].va != 0)
    n++;
  return n;
}
/* _bt_datshift: shift data segment at i over by n KVs

   Moves entries [i, BT_DAT_MAXKEYS - n) up to start at i+n and zeroes the n
   vacated entries at i. Always returns BT_SUCC. */
static int
_bt_datshift(BT_page *node, size_t i, size_t n)
{
  assert(i+n < BT_DAT_MAXKEYS); /* check buffer overflow */

  const size_t kvsz = sizeof node->datk[0];
  BT_kv *from = &node->datk[i];
  BT_kv *to = &node->datk[i+n];

  memmove(to, from, (BT_DAT_MAXKEYS - i - n) * kvsz);
  ZERO(from, n * kvsz);         /* NB: not completely necessary */
  return BT_SUCC;
}
/* _bt_split_datcopy: copy right half of left node to right node */
/* Afterwards the upper half of left is zeroed, except that its first zeroed
   slot receives the va of right's first entry as left's closing bound. */
static int
_bt_split_datcopy(BT_page *left, BT_page *right)
{
  const size_t half = BT_DAT_MAXKEYS / 2;
  const size_t nbytes = half * sizeof(left->datk[0]);
  BT_kv *upper = &left->datk[half];

  /* copy rhs of left to right */
  memcpy(right->datk, upper, nbytes);
  /* zero rhs of left */
  ZERO(upper, nbytes); /* ;;: note, this would be unnecessary if we stored node.N */
  /* the last entry in left should be the first entry in right */
  upper->va = right->datk[0].va;

  return BT_SUCC;
}
/* _bt_ischilddirty: test the dirty bit of child child_idx in parent's header

   Returns non-zero (the masked bit, not necessarily 1) when the child is
   dirty, 0 otherwise. child_idx must be below 2048. */
static int
_bt_ischilddirty(BT_page *parent, size_t child_idx)
{
  assert(child_idx < 2048);

  const size_t byte = child_idx >> 3;
  const size_t bit = child_idx & 0x7;
  uint8_t flag = parent->head.dirty[byte];
  return flag & (1 << bit);
}
/* ;;: todo: name the 0x8 and 4 literals and/or generalize */
/* _bt_dirtychild: set the dirty bit of child child_idx in parent's header.
   The child must currently be clean. Always returns BT_SUCC. */
static int
_bt_dirtychild(BT_page *parent, size_t child_idx)
{
  assert(child_idx < 2048);
  /* although there's nothing theoretically wrong with dirtying a dirty node,
     there's probably a bug if we do it since a we only dirty a node when it's
     alloced after a split or CoWed */
  assert(!_bt_ischilddirty(parent, child_idx));

  const size_t byte = child_idx >> 3;
  const size_t bit = child_idx & 0x7;
  parent->head.dirty[byte] |= 1 << bit;
  return BT_SUCC;
}
/* _bt_cleanchild: clear the dirty bit of child child_idx in parent's header

   The child is expected to be dirty. The bit is cleared with a mask rather
   than toggled, so the result is a clean bit even in builds where the assert
   is compiled out and the bit was already clear. Always returns BT_SUCC. */
static int
_bt_cleanchild(BT_page *parent, size_t child_idx)
{
  assert(child_idx < 2048);     /* same bound as _bt_ischilddirty/_bt_dirtychild */
  assert(_bt_ischilddirty(parent, child_idx));

  uint8_t *flag = &parent->head.dirty[child_idx >> 3];
  *flag &= (uint8_t)~(1u << (child_idx & 0x7));
  return BT_SUCC;
}
/* ;:: assert that the node is dirty when splitting */
/* _bt_split_child: split the (dirty) child at index i of parent

   Allocates a right sibling, moves the upper half of the child's entries into
   it, inserts a reference to the sibling into parent and marks it dirty. The
   sibling's page number is stored in *newchild.

   Returns BT_SUCC, ENOMEM when no node could be allocated, or the error of
   _bt_split_datcopy. */
static int
_bt_split_child(BT_state *state, BT_page *parent, size_t i, pgno_t *newchild)
{
  /* ;;: todo: better error handling */
  assert(_bt_ischilddirty(parent, i));

  int rc = BT_SUCC;
  size_t N;
  BT_page *left = _node_get(state, parent->datk[i].fo);
  BT_page *right = _bt_nalloc(state);
  if (right == 0) {
    return ENOMEM;
  }
  rc = _bt_split_datcopy(left, right);
  if (!SUCC(rc)) {
    return rc;
  }

  /* adjust high address of left node in parent */
  N = _bt_numkeys(left);        /* value not used yet */

  /* insert reference to right child into parent node */
  N = _bt_numkeys(right);
  vaof_t lo = right->datk[0].va;
  vaof_t hi = right->datk[N-1].va;
  _bt_insertdat(lo, hi, _fo_get(state, right), parent, i);

  /* dirty right child */
  size_t ridx = _bt_childidx(parent, lo, hi);
  assert(ridx == i+1);          /* 0x100000020100;;: tmp? */
  _bt_dirtychild(parent, ridx);

  /* ;;: fix this */
  *newchild = _fo_get(state, right);

  return BT_SUCC;
}
/* _bt_rebalance: not implemented. Ignores its arguments and returns 255. */
static int
_bt_rebalance(BT_state *state, BT_page *node)
{
  (void)state;
  (void)node;
  return 255;
}
/* insert lo, hi, and fo in parent's data section for childidx */
/* The existing entry at childidx covers [llo, hhi). The new range [lo, hi)
   must lie inside it and is carved out in one of four ways:

   - exact match:       only the fo is replaced
   - shares low bound:  [lo, hi) -> fo, remainder [hi, hhi) keeps the old mapping
   - shares high bound: [llo, lo) keeps the old mapping, [lo, hi) -> fo
   - strictly inside:   old | new | old, i.e. two entries are added

   A remainder that follows the new range maps to the old fo advanced by the
   number of pages skipped, unless the old range was free (fo == 0), in which
   case it stays free. Always returns BT_SUCC. */
static int
_bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo,
              BT_page *parent, size_t childidx)
{
  DPRINTF("BEFORE INSERT lo %" PRIu32 " hi %" PRIu32 " fo %" PRIu32, lo, hi, fo);

  /* ;;: TODO confirm this logic is appropriate for branch nodes. (It /should/
       be correct for leaf nodes) */
  vaof_t llo = parent->datk[childidx].va;
  vaof_t hhi = parent->datk[childidx+1].va;

  /* NB: it can be assumed that llo <= lo and hi <= hhi because this routine is
     called using an index found with _bt_childidx */

  /* duplicate */
  if (llo == lo && hhi == hi) {
    parent->datk[childidx].fo = fo;
    return BT_SUCC;
  }

  if (llo == lo) {
    _bt_datshift(parent, childidx + 1, 1);
    pgno_t oldfo = parent->datk[childidx].fo;
    parent->datk[childidx].fo = fo;
    parent->datk[childidx+1].va = hi;
    /* a free range stays free; previously this became hi - llo */
    parent->datk[childidx+1].fo = (oldfo == 0)
      ? 0
      : oldfo + (hi - llo);
  }
  else if (hhi == hi) {
    _bt_datshift(parent, childidx + 1, 1);
    parent->datk[childidx+1].va = lo;
    parent->datk[childidx+1].fo = fo;
  }
  else {
    _bt_datshift(parent, childidx + 1, 2);
    parent->datk[childidx+1].va = lo;
    parent->datk[childidx+1].fo = fo;
    parent->datk[childidx+2].va = hi;
    pgno_t lfo = parent->datk[childidx].fo;
    vaof_t lva = parent->datk[childidx].va;
    parent->datk[childidx+2].fo = (lfo == 0)
      ? 0
      : lfo + (hi - lva);
  }

  return BT_SUCC;
}
//// ===========================================================================
//// wip - deletion coalescing
/* ;;: todo: rename routines */
/* _bt_delco_1pass_0: recursive worker for _bt_delco_1pass

   Performs a dfs over all ranges in node that fall within lo and hi. Returns
   BT_SUCC when every leaf entry visited is free (pgno of 0), and 1 as soon as
   a leaf entry that is in use is found.

   Fixes relative to the previous revision: the function now has an explicit
   return type; hiidx is taken from the loop index rather than from the
   address hi; the page number is read from .fo rather than .va; and both
   searches stop at the first match. */
static int
_bt_delco_1pass_0(BT_state *state, vaof_t lo, vaof_t hi,
                  BT_page *node, uint8_t depth, uint8_t maxdepth)
{
  /* Perform a dfs search on all ranges that fall within lo and hi */

  size_t N = _bt_numkeys(node);
  size_t loidx = 0;
  size_t hiidx = 0;

  /* first find the entry that matches lo */
  size_t i;
  for (i = 0; i < N-1; i++) {
    vaof_t hhi = node->datk[i+1].va;
    if (hhi > lo) {
      loidx = i;
      break;
    }
  }

  /* and then the entry that matches hi */
  for (; i < N-1; i++) {
    vaof_t hhi = node->datk[i].va;
    if (hhi >= hi) {
      hiidx = i;
      break;
    }
  }

  /* node->datk[loidx] - node->datk[hiidx] are the bounds on which to perform
     the dfs */
  for (i = loidx; i < hiidx; i++) {
    pgno_t pg = node->datk[i].fo;

    /* if at the leaf level, terminate with failure if pg is not free */
    if (depth == maxdepth) {
      if (pg != 0) return 1;
      continue;
    }

    /* otherwise, dfs the child node */
    BT_page *child = _node_get(state, pg);
    if (!SUCC(_bt_delco_1pass_0(state, lo, hi, child, depth+1, maxdepth)))
      return 1;
  }

  /* whether we're at a leaf or a branch, by now all pages corresponding to the
     hi-lo range must be free */
  return BT_SUCC;
}
/* ;;: since this is called by another recursive function _bt_delco that first
     finds if a split exists, this /could/ take a pgno to avoid unnecessarily
     rewalking the tree. not a big deal though as is. */
/* _bt_delco_1pass: check that the leaves in the given range are all free
   (pgno of 0), walking from the root. Returns BT_SUCC when they are and a
   non-zero value otherwise. This must be the case for an insert into an
   overlapping range to succeed. */
static int
_bt_delco_1pass(BT_state *state, vaof_t lo, vaof_t hi)
{
  BT_meta *active = state->meta_pages[state->which];
  BT_page *root = _node_get(state, active->root);
  const uint8_t leafdepth = active->depth;

  return _bt_delco_1pass_0(state, lo, hi, root, 1, leafdepth);
}
/* _mlist_insert: return the chunk [lo, hi) to the memory freelist

   The freelist is kept ordered by address. The chunk is merged into a
   neighbouring entry when it directly follows or directly precedes one;
   otherwise a new entry is linked in at its sorted position.

   Unlike the previous revision this also handles an empty freelist, a chunk
   that belongs in front of the first entry, and contiguity with the last
   entry.

   NOTE(review): sizes are accumulated in bytes here (hi - lo), as before,
   whereas BT_mlistnode documents sz as a size in pages -- confirm which unit
   the rest of the file expects. */
static void
_mlist_insert(BT_state *state, void *lo, void *hi)
{
  BYTE *lob = lo;
  BYTE *hib = hi;
  size_t len = (size_t)(hib - lob);
  BT_mlistnode **link = &state->mlist;
  BT_mlistnode *prev = 0;

  /* find the first entry that starts at or after lo */
  while (*link && (BYTE *)(*link)->va < lob) {
    prev = *link;
    link = &(*link)->next;
  }
  BT_mlistnode *next = *link;

  /* freed chunk immediately follows termination of prev */
  if (prev && (BYTE *)prev->va + prev->sz == lob) {
    prev->sz += len;
    return;
  }

  /* freed chunk immediately precedes next */
  if (next && hib == (BYTE *)next->va) {
    next->va = lob;
    next->sz += len;
    return;
  }

  /* not contiguous with either neighbour: link in a new entry */
  BT_mlistnode *new = calloc(1, sizeof *new);
  assert(new != 0);
  new->sz = len;
  new->va = lob;
  new->next = next;
  *link = new;
}
/* _pending_nlist_insert: record node page nodepg as freed

   Links a new one-page entry into the pending node freelist, keeping the
   list ordered by address. Entries are not coalesced here.

   The insertion point is found through a pointer to the link, so an empty
   list and a page that sorts before the current head are both handled
   (previously the former crashed and the latter was linked after the head). */
static void
_pending_nlist_insert(BT_state *state, pgno_t nodepg)
{
  BT_page *va = _node_get(state, nodepg);
  BT_nlistnode **link = &state->pending_nlist;

  /* we don't need to account for a freelist node's size because we aren't
     coalescing the pending freelists */
  while (*link && (*link)->va < va)
    link = &(*link)->next;

  /* *link is either null or has an address not lower than va */
  BT_nlistnode *new = calloc(1, sizeof *new);
  assert(new != 0);
  new->sz = 1;
  new->va = va;
  new->next = *link;
  *link = new;
}
/* NOTE(review): only the signature and a one-line description are visible
   here; no function body follows in this view, so the lookup criterion (exact
   start address vs. containing range) and the not-found result cannot be
   determined -- confirm against the complete file. */
static BT_nlistnode *
_nlist_find(BT_nlistnode *head, BT_page *va)
/* find a node */
/* _pending_nlist_merge: merge the pending node freelist into the node freelist

   Every pending entry is either absorbed into an adjacent nlist entry (and
   freed) or relinked into nlist at its sorted position. On return
   state->pending_nlist is empty.

   This replaces a version that dereferenced the successor of the last nlist
   entry, lost the remainder of the pending list after linking an entry in,
   and compared against the wrong entry when a pending run directly preceded
   an nlist entry. */
static void
_pending_nlist_merge(BT_state *state)
{
  BT_nlistnode *src = state->pending_nlist;
  BT_nlistnode **dst = &state->nlist;
  BT_page *last_va = 0;

  /* ownership of every pending entry moves to nlist (or it is freed) below */
  state->pending_nlist = 0;

  while (src) {
    BT_nlistnode *src_next = src->next;
    BT_page *src_va = src->va;
    /* NB: while we don't currently coalesce the pending nlist, it's not that
       hard to account for if we did, so might as well generalize the merge
       algorithm */
    size_t src_sz = src->sz;

    /* the pending list is expected to be sorted; rescan from the head of
       nlist if an entry is out of order */
    if (src_va < last_va)
      dst = &state->nlist;
    last_va = src_va;

    /* skip dst entries that terminate before src begins */
    while (*dst && (*dst)->va + (*dst)->sz < src_va)
      dst = &(*dst)->next;

    if (*dst && (*dst)->va + (*dst)->sz == src_va) {
      /* source node immediately follows dst node's termination */
      (*dst)->sz += src_sz;     /* widen dst node */
      free(src);
    }
    else if (*dst && src_va + src_sz == (*dst)->va) {
      /* source node's termination immediately precedes dst node */
      (*dst)->va = src_va;      /* pull va back */
      (*dst)->sz += src_sz;     /* widen node */
      free(src);
    }
    else {
      /* src is not contiguous with dst (or dst is exhausted): link it in */
      src->next = *dst;
      *dst = src;
    }

    src = src_next;
  }
}
/* _pending_flist_insert: record the file pages [pg, pg + sz) as freed

   The pending file freelist is kept ordered by page number. The run is merged
   into the preceding entry when it directly follows it, or into the following
   entry when it directly precedes it; otherwise a new entry is linked in.

   The insertion point is found through a pointer to the link, so an empty
   list, a run that sorts before the head, and a run at the tail are all
   handled (previously the tail case dereferenced a null next pointer). */
static void
_pending_flist_insert(BT_state *state, pgno_t pg, size_t sz)
{
  BT_flistnode **link = &state->pending_flist;
  BT_flistnode *prev = 0;

  /* find the first entry that starts at or after pg */
  while (*link && (*link)->pg < pg) {
    prev = *link;
    link = &(*link)->next;
  }
  BT_flistnode *next = *link;

  /* if freed chunk follows prev, expand prev */
  if (prev && prev->pg + prev->sz == pg) {
    prev->sz += sz;
    return;
  }

  /* if the freed chunk precedes next, expand next and pull pg back */
  if (next && next->pg == pg + sz) {
    next->pg = pg;
    next->sz += sz;
    return;
  }

  /* otherwise, insert a new node between prev and next */
  BT_flistnode *new = calloc(1, sizeof *new);
  assert(new != 0);
  new->pg = pg;
  new->sz = sz;
  new->next = next;
  *link = new;
}
/* _pending_flist_merge: merge the pending file freelist into the file freelist

   Every pending entry is either absorbed into an adjacent flist entry (and
   freed) or relinked into flist at its sorted position. On return
   state->pending_flist is empty.

   This replaces a version that dereferenced the successor of the last flist
   entry and lost the remainder of the pending list after linking an entry
   in. */
static void
_pending_flist_merge(BT_state *state)
{
  BT_flistnode *src = state->pending_flist;
  BT_flistnode **dst = &state->flist;
  pgno_t last_pg = 0;

  /* ownership of every pending entry moves to flist (or it is freed) below */
  state->pending_flist = 0;

  while (src) {
    BT_flistnode *src_next = src->next;
    pgno_t src_pg = src->pg;
    size_t src_sz = src->sz;

    /* the pending list is expected to be sorted; rescan from the head of
       flist if an entry is out of order */
    if (src_pg < last_pg)
      dst = &state->flist;
    last_pg = src_pg;

    /* skip dst entries that terminate before src begins */
    while (*dst && (*dst)->pg + (*dst)->sz < src_pg)
      dst = &(*dst)->next;

    if (*dst && (*dst)->pg + (*dst)->sz == src_pg) {
      /* source node immediately follows dst node's termination */
      (*dst)->sz += src_sz;     /* widen dst node */
      free(src);
    }
    else if (*dst && src_pg + src_sz == (*dst)->pg) {
      /* source node's termination immediately precedes dst node */
      (*dst)->pg = src_pg;      /* pull page back */
      (*dst)->sz += src_sz;     /* widen node */
      free(src);
    }
    else {
      /* src is not contiguous with dst (or dst is exhausted): link it in */
      src->next = *dst;
      *dst = src;
    }

    src = src_next;
  }
}
/* ;;: todo move shit around */
/* _bt_delco_droptree2: recursive worker for _bt_delco_droptree

   For a branch (depth != maxdepth) first drops every child subtree, stopping
   at the first entry whose fo is 0. The node itself is then queued on the
   pending node freelist. */
static void
_bt_delco_droptree2(BT_state *state, pgno_t nodepg, uint8_t depth, uint8_t maxdepth)
{
  /* branch */
  if (depth != maxdepth) {
    BT_page *node = _node_get(state, nodepg);
    for (size_t i = 0; i < BT_DAT_MAXKEYS; i++) {
      pgno_t childpg = node->datk[i].fo;
      if (childpg == 0) {
        break;                  /* done */
      }
      _bt_delco_droptree2(state, childpg, depth+1, maxdepth);
    }
  }

  _pending_nlist_insert(state, nodepg);
}
/* _bt_delco_droptree: completely drop a tree. Assume that all leaves under the
   tree are free (pgno = 0)

   nodepg is the root of the subtree and must be a node page (>= 2); depth is
   the depth at which that node sits. The leaf depth is taken from the active
   metapage. */
static void
_bt_delco_droptree(BT_state *state, pgno_t nodepg, uint8_t depth)
{
  assert(nodepg >= 2);
  BT_meta *meta = state->meta_pages[state->which];
  /* plain call: a void function may not return an expression */
  _bt_delco_droptree2(state, nodepg, depth, meta->depth);
}
/* _bt_delco_trim_rsubtree_lhs2: trim the left-hand side of a right subtree
   (work in progress, part of deletion coalescing).

   As far as the visible statements show: locates an index whose va is >= hi,
   sets the va of the entry before it to hi, visits the children to its left
   when node is a branch, moves the entries from hiidx-1 onward to the front
   of the node, zeroes the vacated tail, and then continues in the child
   stored at datk[hiidx].fo.

   NOTE(review): several statements that only make sense with a control
   transfer after them (the index search, the childpg == 0 test, the leaf
   check) have none in this view; lines appear to be missing, so the exact
   control flow must be confirmed against the complete file. */
static void
_bt_delco_trim_rsubtree_lhs2(BT_state *state, vaof_t lo, vaof_t hi,
pgno_t nodepg, uint8_t depth, uint8_t maxdepth)
BT_page *node = _node_get(state, nodepg);
size_t hiidx = 0;
size_t N = _bt_numkeys(node);
/* find hi idx of range */
size_t i;
for (i = 0; i < N-1; i++) {
vaof_t hhi = node->datk[i].va;
if (hhi >= hi) {
hiidx = i;
/* set the lo address of datk[hiidx] to hi */
/* NOTE(review): hiidx - 1 wraps around when the search leaves hiidx at 0 */
node->datk[hiidx-1].va = hi;
/* drop the subtrees left of the range */
if (depth != maxdepth) {
for (i = 0; i < hiidx-1; i++) {
pgno_t childpg = node->datk[i].fo;
/* NOTE(review): _bt_delco_droptree asserts nodepg >= 2, so calling it only
   when childpg is 0 cannot be intended; presumably a statement between these
   two lines is missing */
if (childpg == 0)
_bt_delco_droptree(state, childpg, depth+1);
/* memmove the buffer so the found range is the first in the node */
BYTE *dst = (BYTE *)&node->datk[0].va;
BYTE *src = (BYTE *)&node->datk[hiidx-1].va;
/* end addresses the fo of the last entry, so that fo itself is excluded from
   the move and from the zeroing below */
BYTE *end = (BYTE *)&node->datk[BT_DAT_MAXKEYS-1].fo;
size_t len = end - src;
memmove(dst, src, len);
/* ;;: TODO add temporary asserts for testing? */
/* and now zero the moved range */
ZERO(dst+len, end-(dst+len));
/* done if this is a leaf */
if (depth == maxdepth)
/* otherwise, recur on subtree */
/* NOTE(review): datk[hiidx] is read after the entries were moved, so it no
   longer refers to the entry found by the search -- confirm */
pgno_t rsubtree = node->datk[hiidx].fo;
return _bt_delco_trim_rsubtree_lhs2(state, lo, hi, rsubtree, depth+1, maxdepth);
/* _bt_delco_trim_rsubtree_lhs: entry point for trimming the left-hand side of
   the right subtree rooted at nodepg (which sits at depth). Supplies the leaf
   depth from the active metapage to _bt_delco_trim_rsubtree_lhs2. */
static void
_bt_delco_trim_rsubtree_lhs(BT_state *state, vaof_t lo, vaof_t hi,
                            pgno_t nodepg, uint8_t depth)
{
  BT_meta *meta = state->meta_pages[state->which];
  /* plain call: a void function may not return an expression */
  _bt_delco_trim_rsubtree_lhs2(state, lo, hi, nodepg, depth, meta->depth);
}
/* _bt_delco_trim_lsubtree_rhs2: trim the right-hand side of a left subtree
   (work in progress, part of deletion coalescing).

   As far as the visible statements show: locates an index whose upper bound
   exceeds lo, sets that upper bound to hi, visits the children to its right
   when node is a branch, zeroes everything from datk[loidx+1].fo up to (but
   excluding) the last fo, and then continues in the child stored at
   datk[loidx].fo.

   NOTE(review): several statements that only make sense with a control
   transfer after them (the index search, the childpg == 0 test, the leaf
   check) have none in this view; lines appear to be missing, so the exact
   control flow must be confirmed against the complete file. */
static void
_bt_delco_trim_lsubtree_rhs2(BT_state *state, vaof_t lo, vaof_t hi,
pgno_t nodepg, uint8_t depth, uint8_t maxdepth)
BT_page *node = _node_get(state, nodepg);
size_t N = _bt_numkeys(node);
size_t loidx = 0;
/* find low idx of range */
size_t i;
for (i = 0; i < N-1; i++) {
vaof_t hhi = node->datk[i+1].va;
if (hhi > lo) {
loidx = i;
/* set the hi address of datk[loidx] to hi */
node->datk[loidx+1].va = hi;
/* drop the subtrees right of the range */
if (depth != maxdepth) {
/* recur and droptree for branches */
for (i = loidx+1; i < N-1; i++) {
pgno_t childpg = node->datk[i].fo;
/* NOTE(review): _bt_delco_droptree asserts nodepg >= 2, so calling it only
   when childpg is 0 cannot be intended; presumably a statement between these
   two lines is missing */
if (childpg == 0)
_bt_delco_droptree(state, childpg, depth+1);
/* always zero rhs whether node is a leaf or a branch */
BYTE *beg = (BYTE *)&node->datk[loidx+1].fo;
BYTE *end = (BYTE *)&node->datk[BT_DAT_MAXKEYS-1].fo;
size_t len = end - beg;
ZERO(beg, len);
/* ;;: this won't zero the last fo, but that should be fine. remove the assert
when you're confident it /is/ fine */
assert(node->datk[BT_DAT_MAXKEYS-1].fo == 0);
/* done if this is a leaf */
if (depth == maxdepth)
/* otherwise, recur on the left subtree */
pgno_t lsubtree = node->datk[loidx].fo;
return _bt_delco_trim_lsubtree_rhs2(state, lo, hi, lsubtree, depth+1, maxdepth);
/* _bt_delco_trim_lsubtree_rhs: entry point for trimming the right-hand side of
   the left subtree rooted at nodepg (which sits at depth). Supplies the leaf
   depth from the active metapage to _bt_delco_trim_lsubtree_rhs2. */
static void
_bt_delco_trim_lsubtree_rhs(BT_state *state, vaof_t lo, vaof_t hi,
                            pgno_t nodepg, uint8_t depth)
{
  BT_meta *meta = state->meta_pages[state->which];
  /* plain call: a void function may not return an expression */
  _bt_delco_trim_lsubtree_rhs2(state, lo, hi, nodepg, depth, meta->depth);
}
/* _bt_delco: deletion coalescing for the range [lo, hi) below node nodepg
   (work in progress).

   As far as the visible statements show: finds the indices in this node that
   enclose lo (loidx) and hi (hiidx). When both fall in the same child the
   routine descends into that child. When they differ ("split range"), it
   checks with _bt_delco_1pass that the whole range is free, rewrites the two
   boundary vas to hi, drops every subtree strictly between the two
   boundaries, closes the gap in the data section, and finally trims the
   right-hand side of the left subtree and the left-hand side of the right
   subtree. For branches, the two boundary children are CoWed first when they
   are not already dirty.

   NOTE(review): the searches, the "done" cases and the failed-1pass case show
   no control transfer in this view; lines appear to be missing, so the exact
   control flow must be confirmed against the complete file. */
static void
_bt_delco(BT_state *state, vaof_t lo, vaof_t hi,
pgno_t nodepg, uint8_t depth, uint8_t maxdepth)
/* ;;: "find_internal_splits" in the original algorithm */
BT_page *node = _node_get(state, nodepg);
size_t N = _bt_numkeys(node);
size_t loidx = 0;
size_t hiidx = 0;
pgno_t lsubtree = 0;
pgno_t rsubtree = 0;
/* find low idx of range */
for (size_t i = 0; i < N-1; i++) {
vaof_t hhi = node->datk[i+1].va;
if (hhi > lo) {
loidx = i;
/* find high idx of range */
for (size_t i = loidx; i < N-1; i++) {
vaof_t hhi = node->datk[i].va;
if (hhi >= hi) {
assert(i > 0);
hiidx = i - 1;
/* non-split range and at leaf. done */
if (depth == maxdepth
&& hiidx == loidx) {
lsubtree = node->datk[loidx].fo;
rsubtree = node->datk[hiidx].fo;
if (depth < maxdepth) {
/* guarantee path is dirty by CoWing node if not */
/* ;;: refactor? code duplication?? */
/* NOTE(review): the result of _node_cow is not checked in either branch */
if (!_bt_ischilddirty(node, loidx)) {
BT_page *child = _node_get(state, lsubtree);
pgno_t newpg;
_node_cow(state, child, &newpg);
lsubtree = node->datk[loidx].fo = newpg;
_bt_dirtychild(node, loidx);
if (!_bt_ischilddirty(node, hiidx)) {
BT_page *child = _node_get(state, rsubtree);
pgno_t newpg;
_node_cow(state, child, &newpg);
rsubtree = node->datk[hiidx].fo = newpg;
_bt_dirtychild(node, hiidx);
/* non-split range, recurse to child tree */
if (hiidx == loidx) {
pgno_t childpg = node->datk[loidx].fo;
_bt_delco(state, lo, hi, childpg, depth+1, maxdepth);
/* split range discovered */
if (hiidx > loidx) {
/* run first pass to guarantee range is completely free */
if (!SUCC(_bt_delco_1pass(state, lo, hi))) {
/* attempted insert on split range that cannot be coalesced */
/* set leftmost boundary va to hi */
node->datk[loidx+1].va = hi;
/* set the lo side of the right boundary to hi */
node->datk[hiidx].va = hi;
/* drop all trees between the two subtrees */
for (size_t i = loidx+1; i < hiidx; i++) {
pgno_t childpg = node->datk[i].fo;
/* NOTE(review): children are dropped with this node's depth here, whereas the
   trim routines pass depth+1 for children -- confirm which is intended */
_bt_delco_droptree(state, childpg, depth);
/* move buffer */
BYTE *dst = (BYTE *)&node->datk[loidx+1].va;
BYTE *src = (BYTE *)&node->datk[hiidx].va;
/* end addresses the fo of the last entry, so that fo is excluded from the move */
BYTE *end = (BYTE *)&node->datk[BT_DAT_MAXKEYS-1].fo;
size_t len = end - src;
memmove(dst, src, len);
ZERO(dst+len, end-(dst+len));
/* trim left subtree then trim right subtree */
_bt_delco_trim_lsubtree_rhs(state, lo, hi, lsubtree, depth+1);
_bt_delco_trim_rsubtree_lhs(state, lo, hi, rsubtree, depth+1);
/* done */
/* ;;: todo, update meta->depth when we add a row. Should this be done in
_bt_rebalance? */
static int
_bt_insert2(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo,
            BT_page *node, size_t depth)
/* Recursive insert of the mapping [lo, hi) -> fo below `node`, which sits at
   level `depth` of the tree. At leaf level (depth == meta->depth) the entry is
   written with _bt_insertdat. At branch level the child on the insert path is
   CoWed if it is not dirty, split if it is nearly full, and then recursed
   into. Returns the result of the leaf insert, or the first non-success code
   from _node_cow / _bt_split_child. */
{
  /* ;;: to be written in such a way that node is guaranteed both dirty and
       non-full */

  /* ;;: remember:
     - You need to CoW+dirty a node when you insert a non-dirty node.
     - You need to insert into a node when:
       - It's a leaf
       - It's a branch and you CoWed the child
     - Hence, all nodes in a path to a leaf being inserted into need to already
       be dirty or explicitly Cowed. Splitting doesn't actually factor into this
       decision afaict.
  */
  int rc = 255;
  size_t childidx = _bt_childidx(node, lo, hi);
  assert(childidx != BT_DAT_MAXKEYS);

  BT_meta *meta = state->meta_pages[state->which];

  /* nullcond: node is a leaf */
  if (meta->depth == depth) {
    /* guaranteed non-full and dirty by n-1 recursive call, so just insert */
    return _bt_insertdat(lo, hi, fo, node, childidx);
  }

  BT_page *child = _node_get(state, node->datk[childidx].fo);

  /* do we need to CoW the child node? It is the child (not `node` itself)
     that must be copied: the new page number replaces the child's slot */
  if (!_bt_ischilddirty(node, childidx)) {
    pgno_t pgno;
    if (!SUCC(rc = _node_cow(state, child, &pgno)))
      return rc;
    node->datk[childidx].fo = pgno;
    _bt_dirtychild(node, childidx);
    child = _node_get(state, pgno);
  }

  /* do we need to split the child node? */
  if (_bt_numkeys(child) >= BT_DAT_MAXKEYS - 2) {
    pgno_t rchild_pgno;
    if (!SUCC(rc = _bt_split_child(state, node, childidx, &rchild_pgno)))
      return rc;

    /* since we split the child's data, recalculate the child idx */
    /* ;;: note, this can be simplified into a conditional i++ */
    childidx = _bt_childidx(node, lo, hi);
  }

  /* the child is now guaranteed non-full (split) and dirty. Recurse */
  child = _node_get(state, node->datk[childidx].fo);
  return _bt_insert2(state, lo, hi, fo, child, depth+1);
}
static int
_bt_insert(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo)
/* Top-level insert of [lo, hi) -> fo. Takes care of the root-specific work
   (deletion coalescing for non-free inserts, CoWing the root's child on the
   insert path, splitting a nearly full root under a freshly allocated root)
   and then hands over to _bt_insert2 starting at depth 1. */
{
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);

  /* the root MUST be dirty (zero checksum in metapage) */
  assert(meta->chk == 0);

  /* the key count is sampled before deletion coalescing runs */
  size_t nkeys = _bt_numkeys(root);

  /* perform deletion coalescing (and preemptively guarantee path is dirty) if
     inserting a non-zero (non-free) page */
  if (fo != 0) {
    _bt_delco(state, lo, hi, meta->root, 1, meta->depth);
  }

  /* CoW root's child if it isn't already dirty */
  size_t idx = _bt_childidx(root, lo, hi);
  assert(idx != BT_DAT_MAXKEYS); /* ;;: this should catch the case of
                                    improperly inserting into a split
                                    range. Should we do it earlier or
                                    differently? */
  if (meta->depth > 1 && !_bt_ischilddirty(root, idx)) {
    pgno_t cowpg;
    BT_page *kid = _node_get(state, root->datk[idx].fo);
    _node_cow(state, kid, &cowpg);
    root->datk[idx].fo = cowpg;
    _bt_dirtychild(root, idx);
  }

  /* before calling into recursive insert, handle root splitting since it's
     special cased (2 allocs) */
  if (nkeys >= BT_DAT_MAXKEYS - 2) { /* ;;: remind, fix all these conditions to be - 2 */
    pgno_t pg = 0;

    /* the old root becomes the left child of the new root */
    BT_page *lhs = root;
    BT_page *rhs = _bt_nalloc(state);
    BT_page *top = _bt_nalloc(state);

    /* split root's data across left and right nodes */
    _bt_split_datcopy(lhs, rhs);

    /* record both halves in the new root */
    pg = _fo_get(state, lhs);
    top->datk[0].fo = pg;
    top->datk[0].va = 0;
    pg = _fo_get(state, rhs);
    top->datk[1].fo = pg;
    top->datk[1].va = rhs->datk[0].va;
    top->datk[2].va = UINT32_MAX;

    /* dirty new root's children */
    _bt_dirtychild(top, 0);
    _bt_dirtychild(top, 1);

    /* update meta page information. (root and depth) */
    pg = _fo_get(state, top);
    meta->root = pg;
    meta->depth += 1;
    root = top;
  }

  /*
    meta is dirty
    root is dirty and split if necessary
    root's child in insert path is dirty and split if necessary
    finally, recurse on child
  */
  return _bt_insert2(state, lo, hi, fo, root, 1);
  /* return _bt_insert2(state, lo, hi, fo, child, 1); */
}
/* ;;: wip */
/* ;;: inspired by lmdb's MDB_pageparent. While seemingly unnecessary for
_bt_insert, this may be useful for _bt_delete when we implement deletion
coalescing */
/* a btree node paired with its parent node */
typedef struct BT_ppage {
  BT_page *node;                /* the node itself */
  BT_page *parent;              /* the node's parent */
} BT_ppage;
static int
_bt_delete(BT_state *state, vaof_t lo, vaof_t hi)
/* Marks [lo, hi) as free by inserting it with a zero file offset. Returns
   whatever _bt_insert returns. */
{
  /* ;;: tmp, implement coalescing of zero ranges and merging/rebalancing of
       nodes */
  const pgno_t freepg = 0;
  return _bt_insert(state, lo, hi, freepg);
}
static int
_mlist_new(BT_state *state)
/* Creates the memory freelist for a fresh btree: a single node covering the
   range between the first two keys of the root. Returns BT_SUCC, or ENOMEM if
   the list node cannot be allocated. */
{
  /* implemented separate from _mlist_read since _mlist_read uses lo va == 0 to
     stop parsing node's data. This, however, is a valid starting condition when
     freshly creating the btree */
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);
  assert(root->datk[0].fo == 0);

  vaof_t lo = root->datk[0].va;
  vaof_t hi = root->datk[1].va;
  size_t len = hi - lo;

  BT_mlistnode *head = calloc(1, sizeof *head);
  if (head == 0)
    return ENOMEM;

  head->next = 0;
  head->sz = len;
  head->va = off2addr(lo);

  state->mlist = head;
  return BT_SUCC;
}
static int
_flist_grow(BT_state *state, BT_flistnode *space)
/* growing the flist consists of expanding the backing persistent file, pushing
   that space onto the disk freelist, and updating the dimension members in
   BT_state.

   Returns BT_SUCC, errno if extending the file fails, or ENOMEM if a new
   freelist node cannot be allocated. `space` is currently unused. */
{
  (void)space;

  /* ;;: I don't see any reason to grow the backing file non-linearly, but we
       may want to adjust the size of the amount grown based on performance
       testing. */
  if (-1 == lseek(state->data_fd, state->file_size + PMA_GROW_SIZE, SEEK_SET))
    return errno;
  if (-1 == write(state->data_fd, "", 1))
    return errno;

  /* find the last node in the disk freelist (there may be none) */
  BT_flistnode *tail = state->flist;
  if (tail != 0) {
    while (tail->next)
      tail = tail->next;
  }

  /* ;;: TODO, make sure you are certain of this logic. Further, add assertions
       regarding relative positions of state->file_size, state->frontier, and
       the last free page.

       we MAY call into this routine even if there is freespace on the end
       because it's possible that freespace isn't large enough. We may also call
       into this routine when the frontier exceeds the last free pg because
       that's just how freelists work. ofc, frontier should never exceed
       file_size. what other assertions??
  */

  int coalesce = 0;
  if (tail != 0) {
    pgno_t lastpgfree = tail->pg + tail->sz;
    /* if the frontier (last pg in use) is less than the last page free, we
       should coalesce the new node with the tail. */
    coalesce = state->frontier <= lastpgfree;
  }

  if (coalesce) {
    tail->sz += PMA_GROW_SIZE;
  }
  /* otherwise, a new node needs to be allocated */
  else {
    BT_flistnode *new = calloc(1, sizeof *new);
    if (new == 0)
      return ENOMEM;
    /* since the frontier exceeds the last pg free, new freespace should
       naturally be allocated at the frontier */
    new->pg = state->frontier;
    new->sz = PMA_GROW_SIZE;
    if (tail != 0)
      tail->next = new;
    else
      state->flist = new;       /* the list was empty: new node is the head */
  }

  /* finally, update the file size */
  state->file_size += PMA_GROW_SIZE;

  return BT_SUCC;
}
static int
_flist_new(BT_state *state)
/* Creates the disk freelist for a fresh btree: a single node whose size is the
   span between the first two keys of the root and whose first page is
   PMA_GROW_SIZE. Returns BT_SUCC, or ENOMEM if the node cannot be
   allocated. */
{
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);
  assert(root->datk[0].fo == 0);

  vaof_t lo = root->datk[0].va;
  vaof_t hi = root->datk[1].va;
  size_t len = hi - lo;

  BT_flistnode *head = calloc(1, sizeof *head);
  if (head == 0)
    return ENOMEM;

  head->next = 0;
  head->sz = len;
  head->pg = PMA_GROW_SIZE; /* ;;: should we invoke logic to expand the backing file
                               here? probably. implement it */
  state->flist = head;

  return BT_SUCC;
}
static int
_nlist_new(BT_state *state)
/* Creates the node freelist for a fresh btree: a single node of BLK_BASE_LEN0
   pages starting at the page right after the metapages in the map. Returns
   BT_SUCC, or ENOMEM if the node cannot be allocated. */
{
#define NLIST_PG_START 2        /* the third page */
  BT_nlistnode *head = calloc(1, sizeof *head);
  if (head == 0)
    return ENOMEM;

  /* the size of a new node freelist is just the first stripe length */
  head->sz = BLK_BASE_LEN0;
  head->va = &((BT_page *)state->map)[BT_NUMMETAS];
  head->next = 0;

  state->nlist = head;

  return BT_SUCC;
}
static int
_nlist_delete(BT_state *state)
/* Releases every node of the node freelist (an empty list is fine) and clears
   state->nlist. Always returns BT_SUCC. */
{
  BT_nlistnode *curr = state->nlist;

  while (curr) {
    BT_nlistnode *next = curr->next;
    free(curr);
    curr = next;
  }

  state->nlist = 0;
  return BT_SUCC;
}
static BT_nlistnode *
_nlist_read_prev(BT_nlistnode *head, BT_nlistnode *curr)
/* Returns the list node whose ->next is curr, or 0 when no node precedes curr
   (curr is the head itself or is not on the list). */
{
  for (BT_nlistnode *it = head; it->next; it = it->next) {
    if (it->next == curr)
      return it;
  }
  return 0;
}
/* TODO this is a pretty bad algorithm in terms of time complexity. It should be
fixed, but isn't necessary now as our nlist is quite small. You may want to
consider making nlist doubly linked or incorporate a sort and merge step. */
static int
_nlist_read2(BT_state *state, BT_page *node, uint8_t maxdepth,
BT_nlistnode *head, uint8_t depth)
/* recursively walk all nodes in the btree. Allocating new nlist nodes when a
node is found to be in a stripe unaccounted for. For each node found,
split/shrink the appropriate node to account for the allocated page */
/* Parameters: node is the btree node being accounted for, head is the first
nlist node, depth is node's level and maxdepth the leaf level. Returns BT_SUCC,
or the first non-success code from a recursive call. */
BT_nlistnode *p, *n;
p = head;
n = head->next;
/* find the nlist node that fits the current btree node */
for (; n; p = n, n = n->next) {
if (p->va <= node && p->va + p->sz > node)
/* if the nlist node is only one page wide, it needs to be freed */
if (p->sz == 1) {
/* NOTE(review): _nlist_read_prev returns 0 when p is the list head, in which
case prev->next dereferences a null pointer -- confirm the head can never be a
single page wide here */
BT_nlistnode *prev = _nlist_read_prev(head, p);
prev->next = p->next;
goto e;
/* if the btree node resides at the end of the nlist node, just shrink it */
BT_page *last = p->va + p->sz - 1;
if (last == node) {
p->sz -= 1;
goto e;
/* if the btree node resides at the start of the nlist node, likewise shrink
it and update the va */
if (p->va == node) {
p->sz -= 1;
p->va += 1;
goto e;
/* otherwise, need to split the current nlist node */
/* NOTE(review): the calloc result is used without a null check */
BT_nlistnode *right = calloc(1, sizeof *right);
size_t lsz = node - p->va;
size_t rsz = (p->va + p->sz) - node;
/* remove 1 page from the right nlist node's size to account for the allocated
btree node */
rsz -= 1;
assert(lsz > 0 && rsz > 0);
/* update the size of the left node. And set the size and va of the right
node. Finally, insert the new nlist node into the nlist. */
p->sz = lsz;
right->sz = rsz;
right->va = node + 1;
right->next = p->next;
p->next = right;
/* if at a leaf, we're finished */
if (depth == maxdepth) {
return BT_SUCC;
/* otherwise iterate over all child nodes, recursively constructing the
list */
int rc = BT_SUCC;
for (size_t i = 0; i < BT_DAT_MAXKEYS; i++) {
/* kv is copied but the child lookup below reads node->datk[i] directly */
BT_kv kv = node->datk[i];
BT_page *child = _node_get(state, node->datk[i].fo);
/* slots for which _node_get yields a null pointer are skipped */
if (!child) continue;
if (!SUCC(rc = _nlist_read2(state,
return rc;
/* all children traversed */
return BT_SUCC;
static int
_nlist_read(BT_state *state)
/* Rebuilds the node freelist by starting from one node covering the first
   stripe and letting _nlist_read2 carve out every page occupied by a btree
   node. On success the list is stored in state->nlist. Returns BT_SUCC,
   ENOMEM if the initial node cannot be allocated, or the error reported by
   _nlist_read2 (in which case the partial list is released). */
{
  /* ;;: this should theoretically be simpler than _mlist_read. right? We can
     derive the stripes that contain nodes from the block base array stored in
     the metapage. What else do we need to know? -- the parts of each stripe
     that are free or in use. How can we discover that?

     1) Without storing any per-page metadata, we could walk the entire tree
     from the root. Check the page number of the node. And modify the freelist
     accordingly.

     2) If we stored per-page metadata, this would be simpler. Linearly traverse
     each stripe and check if the page is BT_NODE or BT_FREE.

     -- are there downsides to (2)? The only advantage to this would be quicker
        startup. So for now, going to traverse all nodes and for each node,
        traverse the nlist and split it appropriately.
  */
  int rc = BT_SUCC;
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);

  /* ;;: since partition striping isn't implemented yet, simplifying code by
     assuming all nodes reside in the 2M region */
  BT_nlistnode *head = calloc(1, sizeof *head);
  if (head == 0)
    return ENOMEM;

  head->sz = BLK_BASE_LEN0;
  head->va = &((BT_page *)state->map)[BT_NUMMETAS];
  head->next = 0;

  if (!SUCC(rc = _nlist_read2(state, root, meta->depth, head, 1))) {
    /* don't leak the partially built list */
    while (head) {
      BT_nlistnode *next = head->next;
      free(head);
      head = next;
    }
    return rc;
  }

  state->nlist = head;

  return rc;
}
static BT_mlistnode *
_mlist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth)
/* Builds the list of free memory ranges found in the subtree under `node`.
   `depth` is node's level, `maxdepth` the leaf level. Returns the head of a
   newly allocated list, or 0 when the subtree holds no free range. If an
   allocation fails while scanning a leaf, the ranges collected so far are
   returned. */
{
  BT_mlistnode *head = 0;
  BT_mlistnode *tail = 0;

  /* leaf */
  if (depth == maxdepth) {
    /* N keys delimit N-1 ranges; slots past that are unused */
    size_t N = _bt_numkeys(node);

    for (size_t i = 0; i + 1 < N; i++) {
      BT_kv *kv = &node->datk[i];
      if (kv->fo != 0)
        continue;               /* range is in use */

      vaof_t lo = kv->va;
      vaof_t hi = node->datk[i+1].va;
      size_t len = hi - lo;

      /* free and contiguous with previous mlist node: merge */
      if (tail && addr2off(tail->va) + tail->sz == lo) {
        tail->sz += len;
        continue;
      }

      /* free but not contiguous with previous mlist node: append new node */
      BT_mlistnode *new = calloc(1, sizeof *new);
      if (new == 0)
        return head;
      new->sz = len;
      new->va = off2addr(lo);
      if (tail)
        tail->next = new;
      else
        head = new;
      tail = new;
    }
    return head;
  }

  /* branch */
  for (size_t i = 0; i < BT_DAT_MAXKEYS; ++i) {
    BT_kv kv = node->datk[i];
    if (kv.fo == BT_NOPAGE)
      continue;
    BT_page *child = _node_get(state, kv.fo);
    BT_mlistnode *sub = _mlist_read2(state, child, maxdepth, depth+1);
    if (sub == 0)
      continue;

    /* just blindly append and unify the ends afterward */
    if (head == 0)
      head = sub;
    else
      tail->next = sub;

    /* move the tail to the end of what was just appended so that the next
       child's list does not overwrite it */
    for (tail = sub; tail->next; tail = tail->next)
      ;
  }

  return head;
}
static int
_mlist_read(BT_state *state)
/* Rebuilds the memory freelist from the btree and stores it in state->mlist.
   Returns BT_SUCC. When the tree yields no list, state->mlist is left
   untouched. */
{
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);
  uint8_t maxdepth = meta->depth;
  BT_mlistnode *head = _mlist_read2(state, root, maxdepth, 1);

  if (head == 0)
    return BT_SUCC;

  /*
    trace the full freelist and unify nodes one last time
    NB: linking the leaf nodes would make this unnecessary
  */
  BT_mlistnode *p = head;
  BT_mlistnode *n = head->next;
  while (n) {
    size_t llen = P2BYTES(p->sz);
    uintptr_t laddr = (uintptr_t)p->va;
    uintptr_t raddr = (uintptr_t)n->va;

    /* contiguous: unify, then look at p's new successor */
    if (laddr + llen == raddr) {
      p->sz += n->sz;
      p->next = n->next;
      free(n);
      n = p->next;
    }
    /* not contiguous: step both pointers forward */
    else {
      p = n;
      n = n->next;
    }
  }

  state->mlist = head;
  return BT_SUCC;
}
static int
_mlist_delete(BT_state *state)
/* Releases every node of the memory freelist (an empty list is fine) and
   clears state->mlist. Always returns BT_SUCC. */
{
  BT_mlistnode *curr = state->mlist;

  while (curr) {
    BT_mlistnode *next = curr->next;
    free(curr);
    curr = next;
  }

  state->mlist = 0;
  return BT_SUCC;
}
static void
_flist_split(BT_flistnode *head, BT_flistnode **left, BT_flistnode **right)
/* Cuts the list starting at head in two at its midpoint. *left receives head,
   *right receives the node following the midpoint, and the midpoint's ->next
   is cleared so the halves are separate lists. head must not be null. */
{
  assert(head != 0);

  BT_flistnode *mid = head;
  BT_flistnode *probe = head->next;

  /* probe advances two nodes for every one that mid advances */
  while (probe && probe->next) {
    mid = mid->next;
    probe = probe->next->next;
  }

  *left = head;
  *right = mid->next;
  mid->next = 0;
}
static BT_flistnode *
_flist_merge2(BT_flistnode *l, BT_flistnode *r)
/* returns the furthest node in l that has a pg less than the first node in r,
   or 0 if even the first node of l does not. l is scanned from its head and
   the scan stops at the first node whose pg is not less than r->pg. */
{
  BT_flistnode *prev = l;
  BT_flistnode *curr = l->next;

  while (curr && curr->pg < r->pg) {
    prev = curr;
    curr = curr->next;
  }

  if (prev->pg < r->pg)
    return prev;

  return 0;
}
static BT_flistnode *
_flist_merge(BT_flistnode *l, BT_flistnode *r)
/* merge two flists sorted by ascending pg, l and r, and return the head of the
   sorted result. Nodes are relinked in place; nothing is allocated or freed.
   Either argument may be null. On equal pg the node from l comes first. */
{
  BT_flistnode *head = 0;
  BT_flistnode **link = &head;  /* where the next node gets attached */

  while (l && r) {
    if (r->pg < l->pg) {
      *link = r;
      r = r->next;
    }
    else {
      *link = l;
      l = l->next;
    }
    link = &(*link)->next;
  }

  /* one side is exhausted: attach whatever remains of the other */
  *link = l ? l : r;

  return head;
}
BT_flistnode *
_flist_mergesort(BT_flistnode *head)
/* Sorts the flist starting at head with a merge sort and returns the new head.
   Lists of zero or one node are returned unchanged. */
{
  if (head == 0 || head->next == 0)
    return head;

  BT_flistnode *l, *r;
  _flist_split(head, &l, &r);

  /* each half must itself be sorted before the two can be merged */
  /* ;;: todo, make it non-recursive. Though, shouldn't matter as much here
       since O(log n). merge already non-recursive */
  l = _flist_mergesort(l);
  r = _flist_mergesort(r);

  return _flist_merge(l, r);
}
BT_flistnode *
_flist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth)
/* Recursive helper for _flist_read. node is the btree node to scan, depth its
level and maxdepth the leaf level. At a leaf it returns a newly allocated list
with one entry per datk slot, preceded by an empty head node. */
/* leaf */
if (depth == maxdepth) {
BT_flistnode *head, *prev;
/* NOTE(review): the calloc results in this branch are used without a null
check */
head = prev = calloc(1, sizeof(*head));
/* ;;: fixme the head won't get populated in this logic */
size_t i = 0;
BT_kv *kv = &node->datk[i];
/* NOTE(review): every slot up to BT_DAT_MAXKEYS - 1 is appended, whatever its
fo value and whether or not the slot is populated -- confirm this is the
intended content of the list */
while (i < BT_DAT_MAXKEYS - 1) {
/* Just blindly append nodes since they aren't guaranteed sorted */
BT_flistnode *new = calloc(1, sizeof *new);
vaof_t hi = node->datk[i+1].va;
vaof_t lo = kv->va;
size_t len = hi - lo;
pgno_t fo = kv->fo;
new->sz = len;
new->pg = fo;
prev->next = new;
prev = new;
kv = &node->datk[++i];
return head;
/* branch */
size_t i = 0;
BT_flistnode *head, *prev;
head = prev = 0;
for (; i < BT_DAT_MAXKEYS; ++i) {
BT_kv kv = node->datk[i];
if (kv.fo == BT_NOPAGE)
BT_page *child = _node_get(state, kv.fo);
BT_flistnode *new = _flist_read2(state, child, maxdepth, depth+1);
if (head == 0) {
head = prev = new;
else {
/* just blindly append and unify the ends afterward */
/* NOTE(review): prev is never advanced to the end of the appended list, so a
later child's list replaces everything after the first node -- confirm */
prev->next = new;
/* NOTE(review): the branch case returns 0 rather than head, so the lists
gathered from the children are never handed back to the caller -- confirm */
return 0;
static int
_flist_read(BT_state *state)
/* Rebuilds the disk freelist from the btree: collects nodes through
   _flist_read2, merges neighbours whose page ranges touch, and stores the
   result in state->flist. Returns BT_SUCC. When _flist_read2 yields no list,
   state->flist is left untouched. */
{
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);
  uint8_t maxdepth = meta->depth;
  BT_flistnode *head = _flist_read2(state, root, maxdepth, 0);
  /* ;;: the proper starting depth is 1. The merge loop below used to spin
       forever in that configuration; it now always terminates, but the
       starting depth is left alone until _flist_read2 has been reviewed */
  /* BT_flistnode *head = _flist_read2(state, root, maxdepth, 1); */
  if (head == 0)
    return BT_SUCC;

  /* sort the freelist */
  /* ;;: todo, the list is not actually sorted yet */

  /* merge contiguous regions after sorting */
  BT_flistnode *p = head;
  BT_flistnode *n = head->next;
  while (n) {
    size_t llen = p->sz;
    pgno_t lfo = p->pg;
    pgno_t rfo = n->pg;

    /* contiguous: unify, then look at p's new successor */
    if (lfo + llen == rfo) {
      p->sz += n->sz;
      p->next = n->next;
      free(n);
      n = p->next;
    }
    /* not contiguous: step both pointers forward */
    else {
      p = n;
      n = n->next;
    }
  }

  state->flist = head;
  return BT_SUCC;
}
static int
_flist_delete(BT_state *state)
/* Releases every node of the disk freelist (an empty list is fine) and clears
   state->flist. Always returns BT_SUCC. */
{
  BT_flistnode *curr = state->flist;

  while (curr) {
    BT_flistnode *next = curr->next;
    free(curr);
    curr = next;
  }

  state->flist = 0;
  return BT_SUCC;
}
/* closes the descriptor held in the lvalue fd and then sets it to -1. The
   result of close() is not checked. */
#define CLOSE_FD(fd)                            \
  do {                                          \
    close(fd);                                  \
    (fd) = -1;                                  \
  } while(0)
/* TODO: move to lib */
static uint32_t
nonzero_crc_32(void *dat, size_t len)
/* Computes a CRC-32 over len bytes at dat that is guaranteed to be non-zero.
   The CRC is always extended with a nonce byte (0, 1, 2, ...) at least once
   and again for as long as the result is zero; the process is aborted once the
   nonce exceeds 8. */
{
  uint32_t chk = crc_32(dat, len);

  for (unsigned char nonce = 0; ; nonce++) {
    if (nonce > 8)
      abort();
    chk = update_crc_32(chk, nonce);
    if (chk != 0)
      return chk;
  }
}
static void
_bt_state_restore_maps2(BT_state *state, BT_page *node,
uint8_t depth, uint8_t maxdepth)
/* Recursive helper for _bt_state_restore_maps. node is the btree node to
process, depth its level and maxdepth the leaf level. At a leaf, each range
between consecutive keys is associated with the file offset of its page. */
size_t N = _bt_numkeys(node);
/* leaf */
if (depth == maxdepth) {
for (size_t i = 0; i < N-1; i++) {
/* [lo, hi) is the virtual range of entry i and pg its backing page number */
vaof_t lo = node->datk[i].va;
vaof_t hi = node->datk[i+1].va;
pgno_t pg = node->datk[i].fo;
BYTE *loaddr = off2addr(lo);
BYTE *hiaddr = off2addr(hi);
size_t bytelen = hiaddr - loaddr;
off_t offset = P2BYTES(pg);
/* NOTE(review): the expression compared against loaddr is incomplete as
written; judging by the log message below it is an mmap call ending in
`offset` -- restore its remaining arguments */
if (loaddr !=
offset)) {
DPRINTF("mmap: failed to map at addr %p", loaddr);
/* branch - bfs all subtrees */
for (size_t i = 0; i < N-1; i++) {
/* ;;: assuming node stripes when partition striping is implemented will be
1:1 mapped to disk for simplicity. If that is not the case, they should
be handled here. */
pgno_t pg = node->datk[i].fo;
BT_page *child = _node_get(state, pg);
/* NOTE(review): this return is reached on the first iteration, so only the
first child would be visited -- confirm whether all children should be
restored */
return _bt_state_restore_maps2(state, child, depth+1, maxdepth);
static void
_bt_state_restore_maps(BT_state *state)
/* restores the memory map of the btree since data can be arbitrarily located.
   Starts the recursive walk at the current metapage's root, at depth 1. */
{
  /* TODO: add checks to ensure data isn't mapped into an invalid location
     (e.g. a node stripe) */
  BT_meta *current = state->meta_pages[state->which];
  BT_page *top = _node_get(state, current->root);

  _bt_state_restore_maps2(state, top, 1, current->depth);
}
static int
_bt_state_meta_which(BT_state *state, int *which)
/* Decides which of the two metapages is the valid one and stores its index in
   *which. A metapage with a zero checksum is dirty, so the other one is
   chosen; otherwise the one with the larger txnid wins. Returns EINVAL (with
   *which set to -1) when both checksums are non-zero and the txnids are
   equal. The checksum of the chosen metapage is verified and the process is
   aborted on a mismatch. Returns BT_SUCC otherwise. */
{
  BT_meta *first = state->meta_pages[0];
  BT_meta *second = state->meta_pages[1];
  int pick;

  *which = -1;

  if (first->chk == 0)
    pick = 1;                   /* first is dirty */
  else if (second->chk == 0)
    pick = 0;                   /* second is dirty */
  else if (first->txnid > second->txnid)
    pick = 0;                   /* first is most recent */
  else if (first->txnid < second->txnid)
    pick = 1;                   /* second is most recent */
  else
    return EINVAL;              /* invalid state */

  *which = pick;

  /* checksum the metapage found and abort if checksum doesn't match */
  BT_meta *chosen = state->meta_pages[pick];
  uint32_t expect = nonzero_crc_32(chosen, BT_META_LEN);
  if (expect != chosen->chk) {
    abort();
  }

  return BT_SUCC;
}
static int
_bt_state_read_header(BT_state *state)
/* Validates the two metapages of an existing PMA and records which one is
   current in state->which. Returns ENOENT when the data file is too short to
   hold both metapages (a new PMA), EINVAL when magic, flags or version do not
   match, the error from _bt_state_meta_which, or BT_SUCC. */
{
  /* TODO: actually read the header and copy the data to meta when we implement
     persistence */
  BT_page metas[2];
  int rc, len, which;
  BT_meta *m1, *m2;

  /* pma already exists, parse metadata file */
  /* NOTE(review): validation reads through state->meta_pages, not through the
     `metas` buffer filled by pread below -- confirm state->meta_pages is
     already valid when this runs */
  m1 = state->meta_pages[0];
  m2 = state->meta_pages[1];

  /* ;;: TODO, need to store last page in use by pma in both metadata pages. choose the frontier after _bt_state_meta_which and store it in state */
  if ((len = pread(state->data_fd, metas, BT_PAGESIZE*2, 0))
      != BT_PAGESIZE*2) {
    /* new pma */
    return ENOENT;
  }

  /* validate magic */
  if (m1->magic != BT_MAGIC) {
    DPRINTF("metapage 0x%pX inconsistent magic: 0x%" PRIX32, m1, m1->magic);
    return EINVAL;
  }
  if (m2->magic != BT_MAGIC) {
    DPRINTF("metapage 0x%pX inconsistent magic: 0x%" PRIX32, m2, m2->magic);
    return EINVAL;
  }

  /* validate flags. The parentheses matter: != binds tighter than &, so
     without them the test is always false */
  if ((m1->flags & BP_META) != BP_META) {
    DPRINTF("metapage 0x%pX missing meta page flag", m1);
    return EINVAL;
  }
  if ((m2->flags & BP_META) != BP_META) {
    DPRINTF("metapage 0x%pX missing meta page flag", m2);
    return EINVAL;
  }

  /* validate binary version */
  if (m1->version != BT_VERSION) {
    DPRINTF("version mismatch on metapage: 0x%pX, metapage version: %" PRIu32 ", binary version %u",
            m1, m1->version, BT_VERSION);
    return EINVAL;
  }

  /* validate binary version */
  if (m2->version != BT_VERSION) {
    DPRINTF("version mismatch on metapage: 0x%pX, metapage version: %" PRIu32 ", binary version %u",
            m2, m2->version, BT_VERSION);
    return EINVAL;
  }

  if (!SUCC(rc = _bt_state_meta_which(state, &which)))
    return rc;

  state->which = which;

  return BT_SUCC;
}
static int
_bt_state_meta_new(BT_state *state)
/* Initialises the metadata of a brand new PMA: allocates and sets up the root
   node, fills in a BT_meta and copies it into both metapages at the start of
   the map. Always returns BT_SUCC. */
{
  BT_meta fresh = {0};
  BT_page *pages = (BT_page *)state->map;

  /* initialize the block base array */
  fresh.blk_base[0] = BT_PAGESIZE * BT_NUMMETAS;

  /* the root is set up before the remaining meta fields are assigned */
  BT_page *root = _bt_nalloc(state);
  _bt_root_new(&fresh, root);

  /* initialize meta struct */
  fresh.magic = BT_MAGIC;
  fresh.version = BT_VERSION;
  fresh.last_pg = 1;
  fresh.txnid = 0;
  fresh.fix_addr = BT_MAPADDR;
  fresh.blk_cnt = 1;
  fresh.depth = 1;
  fresh.flags = BP_META;
  fresh.root = _fo_get(state, root);
  assert(fresh.root == INITIAL_ROOTPG); /* ;;: remove?? */

  /* the metapages are the first two pages of the map */
  BT_page *first = &pages[0];
  BT_page *second = &pages[1];

  /* copy the metadata into the metapages */
  memcpy(METADATA(first), &fresh, sizeof fresh);
  /* ;;: todo, should the second metapage actually share a .root with the
       first?? */
  memcpy(METADATA(second), &fresh, sizeof fresh);

  return BT_SUCC;
}
static int
_bt_state_load(BT_state *state)
/* Loads the PMA behind state->data_fd: reads the header, maps the file at
BT_MAPADDR, points state->meta_pages at the first two pages and then either
initialises a new database or picks up the size of an existing one. Returns
BT_SUCC or an errno-style code. */
int rc;
int new = 0;
BT_page *p;
struct stat stat;
/* ENOENT from the header read means there is no database yet; any other
failure is returned to the caller */
if (!SUCC(rc = _bt_state_read_header(state))) {
if (rc != ENOENT) return rc;
DPUTS("creating new db");
state->file_size = PMA_GROW_SIZE;
new = 1;
/* NOTE(review): the remaining mmap arguments are not present here -- restore
them */
state->map = mmap(BT_MAPADDR,
if (state->map != BT_MAPADDR) {
DPRINTF("mmap: failed to map at addr %p", BT_MAPADDR);
/* the two metapages are the first two pages of the map */
p = (BT_page *)state->map;
state->meta_pages[0] = METADATA(p);
state->meta_pages[1] = METADATA(p + 1);
/* new db, so populate metadata */
if (new) {
/* ;;: move this logic to _flist_new */
/* extend the file by seeking to file_size and writing a single byte */
if (-1 == lseek(state->data_fd, state->file_size, SEEK_SET))
return errno;
if (-1 == write(state->data_fd, "", 1))
return errno;
/* file_size already holds PMA_GROW_SIZE from the branch above */
state->file_size = PMA_GROW_SIZE;
if (!SUCC(rc = _bt_state_meta_new(state))) {
munmap(state->map, BT_ADDRSIZE);
return rc;
else {
/* restore ephemeral freelists */
if (fstat(state->data_fd, &stat) != 0)
return errno;
state->file_size = stat.st_size;
/* restore data memory maps */
/* NOTE(review): no call follows the two "restore" comments above -- confirm
the freelist and map restoration routines are invoked */
return BT_SUCC;
/* ;;: TODO, when persistence has been implemented, _bt_falloc will probably
need to handle extension of the file with appropriate striping. i.e. if no
space is found on the freelist, save the last entry, expand the file size,
and set last_entry->next to a new node representing the newly added file
space */
static pgno_t
_bt_falloc(BT_state *state, size_t pages)
/* walk the persistent file freelist and return a pgno with sufficient
   contiguous space for pages. The first node that is large enough is used: an
   exact fit is unlinked and released, a larger node is shrunk from its front.
   Returns 0 when no node is large enough. */
{
  BT_flistnode **n = &state->flist;

  /* first fit */
  /* ;;: is there any reason to use a different allocation strategy for disk? */
  for (; *n; n = &(*n)->next) {
    BT_flistnode *cur = *n;

    /* perfect fit */
    if (cur->sz == pages) {
      pgno_t ret = cur->pg;
      *n = cur->next;
      free(cur);                /* the node is no longer on the list */
      return ret;
    }
    /* larger than necessary: shrink the node */
    if (cur->sz > pages) {
      pgno_t ret = cur->pg;
      cur->sz -= pages;
      cur->pg = cur->pg + pages;
      return ret;
    }
  }

  return 0;
}
static int
_bt_sync_hasdirtypage(BT_state *state, BT_page *node)
/* Returns 1 if any element of node->head.dirty is non-zero, 0 otherwise. */
{
  /* ;;: could be more efficiently replaced by a gcc vectorized builtin */
  size_t idx = 0;

  while (idx < NMEMB(node->head.dirty)) {
    if (node->head.dirty[idx])
      return 1;
    idx++;
  }

  return 0;
}
static int
_bt_sync_leaf(BT_state *state, BT_page *node)
/* msync all of a leaf's data that is dirty. The caller is expected to sync
   the node itself and mark it as clean in the parent. Returns BT_SUCC, or
   errno from the first msync that fails. */
{
  size_t nkeys = _bt_numkeys(node);

  for (size_t k = 0; k < nkeys-1; k++) {
    if (_bt_ischilddirty(node, k)) {
      /* ;;: we don't actually need the page, do we? */
      /* pgno_t pg = node->datk[k].fo; */
      vaof_t start = node->datk[k].va;
      vaof_t stop = node->datk[k+1].va;
      size_t nbytes = P2BYTES(stop - start);
      void *base = off2addr(start);

      /* sync the page */
      if (msync(base, nbytes, MS_SYNC))
        return errno;

      /* and clean the dirty bit */
      _bt_cleanchild(node, k);
    }
  }

  /* ;;: all data pages synced. should we now sync the node as well? No, I think
       that should be the caller's responsibility */

  /* ;;: it is probably faster to scan the dirty bit set and derive the datk idx
     rather than iterate over the full datk array and check if it is dirty. This
     was simpler to implement for now though. */
  /* while (_bt_sync_hasdirtypage(state, node)) { */
  /*   ... */
  /* } */

  return BT_SUCC;
}
static int
_bt_sync_meta(BT_state *state)
/* syncs the metapage and performs necessary checksumming. Additionally, flips
the which */
/* Returns BT_SUCC, or errno if msync of the metapage fails. */
BT_meta *meta = state->meta_pages[state->which];
BT_meta *newmeta;
uint32_t chk;
int newwhich;
/* increment the txnid */
meta->txnid += 1;
/* checksum the metapage */
chk = nonzero_crc_32(meta, BT_META_LEN);
/* ;;: todo: guarantee the chk cannot be zero */
meta->chk = chk;
/* sync the metapage */
if (msync(meta, sizeof(BT_page), MS_SYNC))
return errno;
/* zero the new metapage's checksum */
newwhich = state->which ? 0 : 1;
newmeta = state->meta_pages[newwhich];
newmeta->chk = 0;
/* copy over metapage to new metapage excluding the checksum */
/* only the first BT_META_LEN bytes are copied */
memcpy(newmeta, meta, BT_META_LEN);
/* CoW a new root since the root referred to by the metapage should always be
dirty */
BT_page *root;
pgno_t newrootpg;
root = _node_get(state, newmeta->root);
/* NOTE(review): as the lines stand, the assignment below is the body of the
failure branch, i.e. the new root would only be recorded when _node_cow fails
-- confirm the intended failure handling */
if (!SUCC(_node_cow(state, root, &newrootpg)))
newmeta->root = newrootpg;
/* finally, switch the metapage we're referring to */
state->which = newwhich;
return BT_SUCC;
static int
_bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth)
/* recursively syncs the subtree under node. The caller is expected to sync node
   itself and mark it clean. Returns BT_SUCC, or the first error reported while
   syncing a leaf's data, a subtree, or a child node. */
{
  int rc = 0;
  size_t N = _bt_numkeys(node);

  /* leaf: report msync failures instead of dropping them */
  if (depth == maxdepth) {
    return _bt_sync_leaf(state, node);
  }

  /* do dfs */
  for (size_t i = 0; i < N-1; i++) {
    if (!_bt_ischilddirty(node, i))
      continue;                 /* not dirty. nothing to do */

    BT_page *child = _node_get(state, node->datk[i].fo);

    /* recursively sync the child's data */
    if ((rc = _bt_sync(state, child, depth+1, maxdepth)))
      return rc;

    /* sync the child node */
    if (msync(child, sizeof(BT_page), MS_SYNC))
      return errno;

    /* clean the child */
    _bt_cleanchild(node, i);
  }

  return BT_SUCC;
}
//// ===========================================================================
//// btree external routines
int
bt_state_new(BT_state **state)
/* Allocates a zeroed BT_state with no open data file (data_fd == -1) and the
   fixed map address set to BT_MAPADDR, and stores it in *state. Returns
   BT_SUCC, or ENOMEM if the allocation fails (*state is then untouched). */
{
  BT_state *s = calloc(1, sizeof *s);
  if (s == 0)
    return ENOMEM;

  s->data_fd = -1;
  s->fixaddr = BT_MAPADDR;
  *state = s;
  return BT_SUCC;
}
#define DATANAME "/data.pma"
int
bt_state_open(BT_state *state, const char *path, ULONG flags, mode_t mode)
/* Opens (creating it if necessary) the PMA stored in directory `path`: makes
   sure the directory exists, opens <path>/data.pma with `mode` and loads the
   btree state from it. `flags` is currently unused. On success state->path
   holds a copy of the data file path. Returns BT_SUCC or an errno-style code;
   on failure the data file descriptor is closed again. */
{
  int oflags, rc;
  char *dpath;
  size_t dpathlen;

  (void)flags;

  oflags = O_RDWR | O_CREAT;
  dpathlen = strlen(path) + sizeof(DATANAME);
  dpath = malloc(dpathlen);
  if (!dpath) return ENOMEM;
  snprintf(dpath, dpathlen, "%s" DATANAME, path);

  /* an already existing directory is what we expect when reopening a PMA */
  if (mkdir(path, 0774) == -1 && errno != EEXIST) {
    rc = errno;
    goto e;
  }

  if ((state->data_fd = open(dpath, oflags, mode)) == -1) {
    rc = errno;
    goto e;
  }

  if (!SUCC(rc = _bt_state_load(state)))
    goto e;

  state->path = strdup(dpath);
  if (state->path == 0)
    rc = ENOMEM;

 e:
  /* cleanup FDs stored in state if anything failed */
  if (!SUCC(rc)) {
    if (state->data_fd != -1) CLOSE_FD(state->data_fd);
  }

  free(dpath);
  return rc;
}
int
bt_state_close(BT_state *state)
/* Unmaps the PMA, closes the data file, releases the stored path and zeroes
   *state. The BT_state object itself is not freed. Returns BT_SUCC, or errno
   if munmap fails (in which case nothing else is torn down). */
{
  int rc;

  if ((rc = munmap(state->map, BT_ADDRSIZE)) != 0) {
    rc = errno;
    return rc;
  }
  if (state->data_fd != -1) CLOSE_FD(state->data_fd);

  /* the path was duplicated in bt_state_open; release it before it is lost */
  free(state->path);

  ZERO(state, sizeof *state);

  return BT_SUCC;
}
void *
bt_malloc(BT_state *state, size_t pages)
/* Allocates `pages` contiguous pages: takes the first sufficiently large range
   from the memory freelist, obtains backing file pages from _bt_falloc and
   records the mapping in the btree. Returns the start address of the range, or
   0 when the memory freelist has no range that is large enough. */
{
  BT_mlistnode **n = &state->mlist;
  void *ret = 0;

  /* first fit */
  for (; *n; n = &(*n)->next) {
    BT_mlistnode *cur = *n;

    /* perfect fit */
    if (cur->sz == pages) {
      ret = cur->va;
      *n = cur->next;
      free(cur);                /* the node is no longer on the list */
      break;
    }
    /* larger than necessary: shrink the node */
    if (cur->sz > pages) {
      ret = cur->va;
      cur->sz -= pages;
      cur->va = (BT_page *)cur->va + pages;
      break;
    }
  }

  /* nothing fits: don't consume file pages or insert a bogus range */
  bp(ret != 0);
  if (ret == 0)
    return 0;

  pgno_t pgno = _bt_falloc(state, pages);
  bp(pgno != 0);
  _bt_insert(state,
             addr2off(ret),
             addr2off(ret) + pages,
             pgno);

  return ret;
}
void
bt_free(BT_state *state, void *lo, void *hi)
/* Frees the address range [lo, hi): records it in the btree with a zero file
   offset and hands it back to the memory freelist. */
{
  _bt_insert(state, addr2off(lo), addr2off(hi), 0);
  _mlist_insert(state, lo, hi);
}
int
bt_sync(BT_state *state)
/* Syncs the whole tree: everything below the root through _bt_sync, then the
   root page, then the metapage through _bt_sync_meta. Returns BT_SUCC or the
   first error encountered. */
{
  /* as is often the case, handling the metapage/root is a special case, which
     is done here. Syncing any other page of the tree is done in _bt_sync */
  BT_meta *current = state->meta_pages[state->which];
  BT_page *top = _node_get(state, current->root);
  int rc;

  rc = _bt_sync(state, top, 1, current->depth);
  if (rc != 0)
    return rc;

  /* merge the pending freelists */

  /* sync the root page */
  if (msync(top, sizeof(BT_page), MS_SYNC) != 0)
    return errno;

  /* then sync the metapage */
  rc = _bt_sync_meta(state);
  if (rc != 0)
    return rc;

  return BT_SUCC;
}
uint64_t
bt_meta_get(BT_state *state, size_t idx)
/* Returns slot idx of the roots array in the current metapage. The assertion
   checks that the slot lies entirely inside the BT_meta structure. */
{
  BT_meta *meta = state->meta_pages[state->which];
  /* offsets are measured from the structure itself, not from the local
     pointer variable */
  assert((uintptr_t)&meta->roots[idx + 1] - (uintptr_t)meta <= sizeof *meta);
  return meta->roots[idx];
}
void
bt_meta_set(BT_state *state, size_t idx, uint64_t val)
/* Stores val in slot idx of the roots array in the current metapage. The
   assertion checks that the slot lies entirely inside the BT_meta
   structure. */
{
  BT_meta *meta = state->meta_pages[state->which];
  /* offsets are measured from the structure itself, not from the local
     pointer variable */
  assert((uintptr_t)&meta->roots[idx + 1] - (uintptr_t)meta <= sizeof *meta);
  meta->roots[idx] = val;
}
int
_bt_range_of(BT_state *state, vaof_t p, vaof_t **lo, vaof_t **hi,
             pgno_t nodepg, uint8_t depth, uint8_t maxdepth)
/* Searches the subtree rooted at page nodepg for the leaf range [lo, hi) that
   contains p. Returns BT_SUCC when found and 1 otherwise.

   On success the bounds are delivered through lo and hi: if the caller
   supplied storage (*lo / *hi non-null) the values are written into it;
   otherwise *lo / *hi are pointed at the bounding keys inside the leaf
   node. */
{
  BT_page *node = _node_get(state, nodepg);
  size_t N = _bt_numkeys(node);
  size_t i;

  /* N keys delimit N-1 ranges; written so that N == 0 cannot underflow */
  for (i = 0; i + 1 < N; i++) {
    if (node->datk[i].va <= p && node->datk[i+1].va > p)
      break;
  }
  /* not found */
  if (i + 1 >= N)
    return 1;

  if (depth == maxdepth) {
    if (*lo)
      **lo = node->datk[i].va;
    else
      *lo = &node->datk[i].va;
    if (*hi)
      **hi = node->datk[i+1].va;
    else
      *hi = &node->datk[i+1].va;
    return BT_SUCC;
  }

  return _bt_range_of(state, p, lo, hi, node->datk[i].fo, depth+1, maxdepth);
}
int
bt_range_of(BT_state *state, void *p, void **lo, void **hi)
/* traverse tree looking for lo <= p and hi > p. return that range as a pair
   of pointers NOT as two vaof_t

   RETURNS:
     0: succ (found)
     1: otherwise
*/
{
  BT_meta *current = state->meta_pages[state->which];
  vaof_t *lo_off = 0;
  vaof_t *hi_off = 0;

  int rc = _bt_range_of(state, addr2off(p), &lo_off, &hi_off,
                        current->root, 1, current->depth);
  if (!SUCC(rc))
    return rc;

  /* convert the offsets found back into addresses */
  *lo = off2addr(*lo_off);
  *hi = off2addr(*hi_off);

  return BT_SUCC;
}
/*
  pseudocode from ed:

  bt_dirty(btree, lo, hi):
    loop:
      (range_lo, range_hi) = find_range_for_pointer(btree, lo);
      dirty_hi = min(hi, range_hi);
      new_start_fo = data_cow(btree, lo, dirty_hi);
      lo := range_hi;
      if dirty_hi == hi then break;

  // precondition: given range does not cross a tree boundary
  data_cow(btree, lo, hi):
    (range_lo, range_hi, fo) = bt_find(btree, lo, hi);
    size = hi - lo;
    new_fo = data_alloc(btree.data_free, size);

    // puts data in the unified buffer cache without having to map virtual memory
    write(fd, new_fo, size * BT_PAGESIZE, to_ptr(lo));

    // maps new file offset with same data back into same memory
    mmap(fd, new_fo, size, to_ptr(lo));

    bt_insert(btree, lo, hi, new_fo);

    offset = lo - range_lo;
    freelist_insert(btree.pending_data_flist, fo + offset, fo + offset + size);
    return new_fo
*/
static pgno_t
_bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi, pgno_t pg)
/* Copies the data of the range [lo, hi) to newly allocated file pages, records
the new mapping in the btree and queues the old pages (starting at pg) on the
pending disk freelist. Returns the first page number of the new location. */
size_t len = hi - lo;
size_t bytelen = P2BYTES(len);
/* NOTE(review): _bt_falloc returns 0 when no space is available; that result
is not checked here */
pgno_t newpg = _bt_falloc(state, len);
BYTE *loaddr = off2addr(lo);
off_t offset = P2BYTES(newpg);
/* write call puts data in the unified buffer cache without having to map
virtual memory */
/* NOTE(review): no handling of a short or failed pwrite is present after this
condition -- restore it */
if (pwrite(state->data_fd, loaddr, bytelen, offset) != bytelen)
/* maps new file offset with same data back into memory */
/* NOTE(review): the expression compared against loaddr is incomplete as
written; judging by the log message below it is an mmap call ending in
`offset` -- restore its remaining arguments */
if (loaddr !=
offset)) {
DPRINTF("mmap: failed to map at addr %p", loaddr);
_bt_insert(state, lo, hi, newpg);
_pending_flist_insert(state, pg, len);
return newpg;
/* smaller of x and y (y when they compare equal); arguments may be evaluated
   more than once */
#define MIN(x, y) ((y) < (x) ? (y) : (x))
static int
_bt_dirty(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg,
          uint8_t depth, uint8_t maxdepth)
{
  /* Copy-on-write every data range below node page nodepg that overlaps the
     va range [lo, hi).

     state:    btree state
     lo, hi:   page offsets delimiting the range to dirty
     nodepg:   page number of the node to search
     depth:    level of nodepg (bt_dirty passes 1 for the root)
     maxdepth: level at which nodes are leaves

     At a leaf every overlapping entry is handed to _bt_data_cow; at a branch
     the function recurses into the child page of every overlapping entry.

     returns BT_SUCC, or the first non-success code from a recursive call.

     NOTE(review): the asserts below treat index 0 as "not found", so a range
     that begins in the first entry of a node trips them -- confirm that is
     intended.
     NOTE(review): _bt_data_cow already calls _bt_insert for the new pages;
     the second _bt_insert in the leaf case looks redundant -- confirm.
  */
  BT_page *node = _node_get(state, nodepg);
  size_t N = _bt_numkeys(node);
  size_t loidx = 0;
  size_t hiidx = 0;

  /* find loidx of range: first entry whose upper bound lies above lo.
     (i + 1 < N rather than i < N-1 so an empty node cannot underflow) */
  for (size_t i = 0; i + 1 < N; i++) {
    vaof_t hhi = node->datk[i+1].va;
    if (hhi > lo) {
      loidx = i;
      break;
    }
  }
  assert(loidx != 0);

  /* find hiidx of range: first entry whose upper bound reaches hi, i.e. the
     entry holding the last page of the range */
  for (size_t i = loidx; i + 1 < N; i++) {
    vaof_t hhi = node->datk[i+1].va;
    if (hhi >= hi) {
      hiidx = i;
      break;
    }
  }
  assert(hiidx != 0);

  /* found a range in node that contains (lo-hi). May span multiple entries.
     hiidx is itself part of the range, so the bound is inclusive */
  for (size_t i = loidx; i <= hiidx; i++) {
    /* leaf: base case. cow the data */
    if (depth == maxdepth) {
      /* the copy starts at the entry's own lower bound and is clipped to hi */
      vaof_t llo = node->datk[i].va;
      vaof_t hhi = MIN(node->datk[i+1].va, hi);
      pgno_t pg = node->datk[i].fo;
      pgno_t newpg = _bt_data_cow(state, llo, hhi, pg);
      _bt_insert(state, llo, hhi, newpg);
      /* a leaf entry's fo is a data page, not a child node: do not recurse */
      continue;
    }

    /* branch: recursive case */
    pgno_t childpg = node->datk[i].fo;
    /* iteratively recurse on all entries */
    int rc = _bt_dirty(state, lo, hi, childpg, depth+1, maxdepth);
    if (!SUCC(rc)) {
      return rc;
    }
  }

  return BT_SUCC;
}
int
bt_dirty(BT_state *state, void *lo, void *hi)
{
  /* Takes the address range lo-hi and ensures that the entire range is CoWed.

     The addresses are converted to va offsets and the work is delegated to
     _bt_dirty, starting at the root of the current meta page (depth 1) with
     the tree depth recorded there. The result of _bt_dirty is returned as is.

     Intended contract per the original note: if part of the range is free
     then return 1.
  */
  BT_meta *meta = state->meta_pages[state->which];
  vaof_t lo_off = addr2off(lo);
  vaof_t hi_off = addr2off(hi);

  return _bt_dirty(state, lo_off, hi_off, meta->root, 1, meta->depth);
}
int
bt_next_alloc(BT_state *state, void *p, void **lo, void **hi)
{
  /* If p is free, sets lo and hi to the bounds of the next adjacent allocated
     space. If p is allocated, sets lo and hi to the bounds of the allocated
     space it falls in.

     The search walks state->mlist, whose nodes describe free blocks (start
     va, extent sz); allocated space is whatever lies between the end of one
     free block and the start of the next, or between the last free block and
     the end of the arena.

     returns BT_SUCC when lo and hi were set, 1 when no allocated region was
     found (lo and hi untouched).
  */
  /* declared up front: a declaration may not directly follow a label, which
     is where this used to live */
  void *pma_end = (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE);

  for (BT_mlistnode *head = state->mlist; head; head = head->next) {
    /* at last free block, different logic applies */
    if (head->next == 0) {
      assert(head->va + head->sz <= pma_end);
      /* no alloced region between tail of freelist and end of pma memory
         space */
      if (head->va + head->sz == pma_end) {
        return 1;
      }
      /* otherwise, return the alloced region between the tail of the
         freelist and the end of the memory arena */
      *lo = head->va + head->sz;
      *hi = pma_end;
      return BT_SUCC;
    }

    /* either p is in this free range (return the allocated hole after it),
       or p is alloced in the hole between this free block and the next
       (return that hole) */
    if ((head->va <= p && head->va + head->sz > p)
        || (head->next->va > p && head->va + head->sz <= p)) {
      /* the alloced space begins at the end of the free block */
      *lo = head->va + head->sz;
      /* ... and ends at the start of the next free block */
      *hi = head->next->va;
      return BT_SUCC;
    }
  }

  /* not found */
  return 1;
}
void
bt_bounds(BT_state *state, void **lo, void **hi)
{
  /* Reports the bounds of the PMA address space: *lo receives its first
     address (BT_MAPADDR) and *hi the address one past its end
     (BT_MAPADDR + BT_ADDRSIZE). These are the same bounds bt_inbounds tests
     against. state is not consulted.

     NOTE(review): the return type is not visible in this view; void is
     assumed because the function returns no value -- confirm against the
     declaration in btree.h. */
  (void)state;
  *lo = (void *)BT_MAPADDR;
  *hi = (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE);
}
int
bt_inbounds(BT_state *state, void *p)
{
  /* 1: if p is in the bounds of the PMA, i.e. at or above BT_MAPADDR and
     below BT_MAPADDR + BT_ADDRSIZE; 0 otherwise. state is not consulted. */
  if (p < BT_MAPADDR) {
    return 0;
  }
  return p < (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE);
}
//// ===========================================================================
//// tests
/* ;;: obv this should be moved to a separate file */
static void
_sham_sync_clean(BT_page *node)
{
  /* Zero the whole of node->head.dirty.

     The previous pointer loop stopped at &dirty[sizeof(dirty) - 1] with a
     strict '<', so the final byte was never cleared; memset over the full
     array covers every byte. */
  memset(node->head.dirty, 0, sizeof(node->head.dirty));
}
static void
_sham_sync2(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth)
{
  /* Test helper: clear the dirty bytes of node and, depth first, of the
     nodes below it. Nodes at maxdepth are returned from untouched.

     state:    btree state, used to resolve child page numbers to nodes
     node:     node to clean
     depth:    level of node (_sham_sync passes 1 for the root)
     maxdepth: level at which the walk stops
  */
  if (depth == maxdepth) return;

  /* clean node */
  _sham_sync_clean(node);

  /* then recurse and clean all children with DFS.
     NOTE(review): iteration starts at index 1, whereas _bt_dirty reads child
     pages starting from index 0 -- confirm that skipping entry 0 is
     intended. */
  size_t N = _bt_numkeys(node);
  for (size_t i = 1; i < N; ++i) {
    BT_kv kv = node->datk[i];
    pgno_t childpg = kv.fo;
    BT_page *child = _node_get(state, childpg);
    _sham_sync2(state, child, depth+1, maxdepth);
  }
}
static void
_sham_sync(BT_state *state)
{
  /* Test helper standing in for a real sync: recompute the checksum of the
     current meta page, then walk the tree from the root and unset the dirty
     bit from all pages via _sham_sync2. */
  BT_meta *cur_meta = state->meta_pages[state->which];
  BT_page *root_node = _node_get(state, cur_meta->root);

  /* checksum covers the first BT_META_LEN bytes of the meta page */
  cur_meta->chk = nonzero_crc_32(cur_meta, BT_META_LEN);

  /* the root is level 1 */
  _sham_sync2(state, root_node, 1, cur_meta->depth);
}
static void
_bt_printnode(BT_page *node)
printf("node: %p\n", node);
printf("data: \n");
for (size_t i = 0; i < BT_DAT_MAXKEYS; ++i) {
if (i && node->datk[i].va == 0)
printf("[%5zu] %10x %10x\n", i, node->datk[i].va, node->datk[i].fo);