#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <stdint.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <assert.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <inttypes.h>

#include "btree.h"
#include "lib/checksum.h"
typedef uint32_t pgno_t;        /* a page number */
typedef uint32_t vaof_t;        /* a virtual address offset */
typedef uint32_t flag_t;
typedef unsigned char BYTE;
//// ===========================================================================
//// tmp tmp tmp tmp tmp

/* ;;: remove -- for debugging */
/*
  bp(X) where X is false will raise a SIGTRAP. If the process is being run
  inside a debugger, this can be caught and ignored. It's equivalent to a
  breakpoint. If run without a debugger, it will dump core, like an assert.
*/
#ifdef DEBUG
#if defined(__i386__) || defined(__x86_64__)
#define bp(x) do { if(!(x)) __asm__ volatile("int $3"); } while (0)
#elif defined(__thumb__)
#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xde01"); } while (0)
#elif defined(__aarch64__)
#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xd4200000"); } while (0)
#elif defined(__arm__)
#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xe7f001f0"); } while (0)
#else
STATIC_ASSERT(0, "debugger break instruction unimplemented");
#endif
#else
#define bp(x) ((void)(0))
#endif
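
/* Usage sketch (added commentary, not original): in a DEBUG build,
   `bp(rc == 0);` acts like a conditional breakpoint -- it raises SIGTRAP when
   the condition is false -- while in a non-DEBUG build the macro expands to a
   no-op. */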
/* coalescing of memory freelist currently prohibited since we haven't
   implemented coalescing of btree nodes (necessary) */
#define CAN_COALESCE 0
/* ;;: remove once confident in logic and delete all code dependencies on
   state->node_freelist */

/* prints a node before and after a call to _bt_insertdat */
#define DEBUG_PRINTNODE 0
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(x, y) ((x) > (y) ? (y) : (x))

#define ZERO(s, n) memset((s), 0, (n))
#define S7(A, B, C, D, E, F, G) A##B##C##D##E##F##G
#define S6(A, B, C, D, E, F, ...) S7(A, B, C, D, E, F, __VA_ARGS__)
#define S5(A, B, C, D, E, ...) S6(A, B, C, D, E, __VA_ARGS__)
#define S4(A, B, C, D, ...) S5(A, B, C, D, __VA_ARGS__)
#define S3(A, B, C, ...) S4(A, B, C, __VA_ARGS__)
#define S2(A, B, ...) S3(A, B, __VA_ARGS__)
#define S(A, ...) S2(A, __VA_ARGS__)

#define KBYTES(x) ((size_t)(x) << 10)
#define MBYTES(x) ((size_t)(x) << 20)
#define GBYTES(x) ((size_t)(x) << 30)
#define TBYTES(x) ((size_t)(x) << 40)
#define PBYTES(x) ((size_t)(x) << 50)
/* 4K page in bytes */
#define P2BYTES(x) ((size_t)(x) << BT_PAGEBITS)
/* the opposite of P2BYTES */
#define B2PAGES(x) ((size_t)(x) >> BT_PAGEBITS)

#define __packed __attribute__((__packed__))
#define UNUSED(x) ((void)(x))
#ifdef DEBUG
#define DPRINTF(fmt, ...)                                               \
  fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, __VA_ARGS__)
#else
#define DPRINTF(fmt, ...) ((void) 0)
#endif
#define DPUTS(arg) DPRINTF("%s", arg)
#define TRACE(...) DPUTS("")

#define BT_SUCC 0
#define SUCC(x) ((x) == BT_SUCC)
/* given a pointer p returns the low page-aligned addr */
#define LO_ALIGN_PAGE(p) ((BT_page *)(((uintptr_t)p) & ~(BT_PAGESIZE - 1)))

#define BT_MAPADDR ((BYTE *) S(0x1000,0000,0000))
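
/* Note (added commentary, assumption about intent): the S() token-pasting
   macros above let large literals be written with digit grouping;
   S(0x1000,0000,0000) is intended to paste into the single literal
   0x100000000000, i.e. the arena is pinned at a fixed virtual address of
   16 TiB. */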
static inline vaof_t
addr2off(void *p)
/* convert a pointer into a 32-bit page offset */
{
  uintptr_t pu = (uintptr_t)p;
  assert(pu >= (uintptr_t)BT_MAPADDR);
  pu -= (uintptr_t)BT_MAPADDR;
  assert((pu & ((1 << BT_PAGEBITS) - 1)) == 0); /* p must be page-aligned */
  return (vaof_t)(pu >> BT_PAGEBITS);
}

static inline void *
off2addr(vaof_t off)
/* convert a 32-bit page offset into a pointer */
{
  uintptr_t pu = (uintptr_t)off << BT_PAGEBITS;
  pu += (uintptr_t)BT_MAPADDR;
  return (void *)pu;
}
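
/* Worked example (added commentary; assumes BT_PAGEBITS == 12, i.e. 4K pages):
   a page-aligned pointer at BT_MAPADDR + 0x3000 lies three pages past the
   arena base, so addr2off() returns 3, and off2addr(3) maps back to
   BT_MAPADDR + 0x3000. */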

#define BT_PAGEWORD 32ULL
#define BT_NUMMETAS 2                     /* 2 metapages */
#define BT_META_SECTION_WIDTH (BT_NUMMETAS * BT_PAGESIZE)
#define BT_ADDRSIZE (BT_PAGESIZE << BT_PAGEWORD)
#define PMA_GROW_SIZE_p (10000000)
#define PMA_GROW_SIZE_b (BT_PAGESIZE * PMA_GROW_SIZE_p)

#define BT_NOPAGE 0

#define BT_PROT_CLEAN (PROT_READ)
#define BT_FLAG_CLEAN (MAP_FIXED | MAP_SHARED)
#define BT_PROT_FREE (PROT_NONE)
#define BT_FLAG_FREE (MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED | MAP_NORESERVE)
#define BT_PROT_DIRTY (PROT_READ | PROT_WRITE)
#define BT_FLAG_DIRTY (MAP_FIXED | MAP_SHARED)
/*
   FO2BY: file offset to byte
   get byte INDEX into pma map from file offset
*/
#define FO2BY(fo)                               \
  ((uint64_t)(fo) << BT_PAGEBITS)

/*
   BY2FO: byte to file offset
   get pgno from byte INDEX into pma map
*/
#define BY2FO(p)                                \
  ((pgno_t)((p) >> BT_PAGEBITS))

/*
   FO2PA: file offset to page
   get a reference to a BT_page from a file offset

   ;;: can simplify:
       ((BT_page *)state->map)[fo]
*/
#define FO2PA(map, fo)                          \
  ((BT_page *)&(map)[FO2BY(fo)])

/* NMEMB: number of members in array, a */
#define NMEMB(a)                                \
  (sizeof(a) / sizeof(a[0]))

#define offsetof(st, m)                         \
  __builtin_offsetof(st, m)
//// ===========================================================================
//// btree types

/*
  btree page header. all pages share this header. Though for metapages, you can
  expect it to be zeroed out.
*/
typedef struct BT_pageheader BT_pageheader;
struct BT_pageheader {
  uint8_t dirty[256];           /* dirty bit map */
} __packed;

/*
   btree key/value data format

   BT_dat is used to provide a view of the data section in a BT_page where data
   is stored like:

         va  fo  va  fo
   bytes 0   4   8   12

   The convenience macros, given an index into the data array, do the following:
   BT_dat_lo(i) returns ith va (low addr)
   BT_dat_hi(i) returns i+1th va (high addr)
   BT_dat_fo(i) returns ith file offset
*/
typedef union BT_dat BT_dat;
union BT_dat {
  vaof_t va;                    /* virtual address offset */
  pgno_t fo;                    /* file offset */
};

/* like BT_dat but when a struct is more useful than a union */
typedef struct BT_kv BT_kv;
struct BT_kv {
  vaof_t va;
  pgno_t fo;
};

/* ;;: todo, perhaps rather than an index, return the data directly and typecast?? */
#define BT_dat_lo(i) ((i) * 2)
#define BT_dat_fo(i) ((i) * 2 + 1)
#define BT_dat_hi(i) ((i) * 2 + 2)

#define BT_dat_lo2(I, dat)
#define BT_dat_fo2(I, dat)
#define BT_dat_hi2(I, dat)

/* BT_dat_maxva: pointer to highest va in page data section */
#define BT_dat_maxva(p)                         \
  ((void *)&(p)->datd[BT_dat_lo(BT_DAT_MAXKEYS)])

/* BT_dat_maxfo: pointer to highest fo in page data section */
#define BT_dat_maxfo(p)                         \
  ((void *)&(p)->datd[BT_dat_fo(BT_DAT_MAXVALS)])

#define BT_DAT_MAXBYTES (BT_PAGESIZE - sizeof(BT_pageheader))
#define BT_DAT_MAXENTRIES (BT_DAT_MAXBYTES / sizeof(BT_dat))

#ifndef BT_DAT_MAXKEYS
#define BT_DAT_MAXKEYS (BT_DAT_MAXENTRIES / 2)
#endif
#define BT_DAT_MAXVALS BT_DAT_MAXKEYS
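
/* Worked example (added commentary): for entry i == 1, BT_dat_lo(1) == 2 and
   BT_dat_fo(1) == 3, so datd[2] is that entry's low va and datd[3] its file
   offset; BT_dat_hi(1) == 4 is the next entry's va, which doubles as entry 1's
   high bound. */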
static_assert(BT_DAT_MAXENTRIES % 2 == 0);

/* we assume off_t is 64 bit */
static_assert(sizeof(off_t) == sizeof(uint64_t));
/*
  all pages in the memory arena consist of a header and data section
*/
typedef struct BT_page BT_page;
struct BT_page {
  BT_pageheader head;                       /* header */
  union {                                   /* data section */
    BT_dat datd[BT_DAT_MAXENTRIES];         /* union view */
    BT_kv  datk[BT_DAT_MAXKEYS];            /* struct view */
    BYTE   datc[BT_DAT_MAXBYTES];           /* byte-level view */
  };
};
static_assert(sizeof(BT_page) == BT_PAGESIZE);
static_assert(BT_DAT_MAXBYTES % sizeof(BT_dat) == 0);
#define BT_MAGIC   0xBADDBABE
#define BT_VERSION 1
/*
  a meta page is like any other page, but the data section is used to store
  additional information
*/
typedef struct BT_meta BT_meta;
struct BT_meta {
#define BT_NUMROOTS 32
#define BT_NUMPARTS 8
  uint32_t  magic;
  uint32_t  version;
  pgno_t    last_pg;                        /* last page used in file */
  uint32_t  _pad0;
  uint64_t  txnid;
  void     *fix_addr;                       /* fixed addr of btree */
  pgno_t    blk_base[BT_NUMPARTS];          /* stores pg offsets of node partitions */

  uint8_t   depth;                          /* tree depth */
#define BP_META ((uint8_t)0x02)
  uint8_t   flags;
  uint16_t  _pad1;
  pgno_t    root;
  /* 64bit alignment manually checked - 72 bytes total above */
  uint64_t  roots[BT_NUMROOTS];             /* for usage by ares */
  uint32_t  chk;                            /* checksum */
} __packed;
static_assert(sizeof(BT_meta) <= BT_DAT_MAXBYTES);
/* the length of the metapage up to but excluding the checksum */
#define BT_META_LEN_b (offsetof(BT_meta, chk))

#define BLK_BASE_LEN0_b ((size_t)MBYTES(2) - BT_META_SECTION_WIDTH)
#define BLK_BASE_LEN1_b ((size_t)MBYTES(8))
#define BLK_BASE_LEN2_b ((size_t)BLK_BASE_LEN1_b * 4)
#define BLK_BASE_LEN3_b ((size_t)BLK_BASE_LEN2_b * 4)
#define BLK_BASE_LEN4_b ((size_t)BLK_BASE_LEN3_b * 4)
#define BLK_BASE_LEN5_b ((size_t)BLK_BASE_LEN4_b * 4)
#define BLK_BASE_LEN6_b ((size_t)BLK_BASE_LEN5_b * 4)
#define BLK_BASE_LEN7_b ((size_t)BLK_BASE_LEN6_b * 4)
#define BLK_BASE_LEN_TOTAL (                    \
    BT_META_SECTION_WIDTH +                     \
    BLK_BASE_LEN0_b +                           \
    BLK_BASE_LEN1_b +                           \
    BLK_BASE_LEN2_b +                           \
    BLK_BASE_LEN3_b +                           \
    BLK_BASE_LEN4_b +                           \
    BLK_BASE_LEN5_b +                           \
    BLK_BASE_LEN6_b +                           \
    BLK_BASE_LEN7_b)

static const size_t BLK_BASE_LENS_b[BT_NUMPARTS] = {
  BLK_BASE_LEN0_b,
  BLK_BASE_LEN1_b,
  BLK_BASE_LEN2_b,
  BLK_BASE_LEN3_b,
  BLK_BASE_LEN4_b,
  BLK_BASE_LEN5_b,
  BLK_BASE_LEN6_b,
  BLK_BASE_LEN7_b,
};

static_assert(PMA_GROW_SIZE_b >= (BLK_BASE_LEN0_b + BT_META_LEN_b));
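
/* Size sketch (added commentary; assumes 4K pages): the node stripes grow
   geometrically -- 2M minus the meta section, then 8M, 32M, 128M, 512M, 2G,
   8G, 32G -- so BLK_BASE_LEN_TOTAL (meta section included) reserves roughly
   42.7 GiB of address space for btree nodes. */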
typedef struct BT_mlistnode BT_mlistnode;
struct BT_mlistnode {
  /* ;;: lo and hi might as well be (BT_page *) because we don't have any reason
     to have finer granularity */
  BYTE *lo;                     /* low virtual address */
  BYTE *hi;                     /* high virtual address */
  BT_mlistnode *next;           /* next freelist node */
};

typedef struct BT_nlistnode BT_nlistnode;
struct BT_nlistnode {
  BT_page *lo;                  /* low virtual address */
  BT_page *hi;                  /* high virtual address */
  BT_nlistnode *next;           /* next freelist node */
};

typedef struct BT_flistnode BT_flistnode;
struct BT_flistnode {
  pgno_t lo;                    /* low pgno in persistent file */
  pgno_t hi;                    /* high pgno in persistent file */
  BT_flistnode *next;           /* next freelist node */
};
/* macro to access the metadata stored in a page's data section */
#define METADATA(p) ((BT_meta *)(void *)(p)->datc)

typedef struct BT_state BT_state;
struct BT_state {
  int           data_fd;
  char         *path;
  void         *fixaddr;
  /* ;;: TODO: refactor ->map to be a (BT_page *) */
  BYTE         *map;
  BT_meta      *meta_pages[2];  /* double buffered */
  pgno_t        file_size_p;    /* the size of the pma file in pages */
  unsigned int  which;          /* which double-buffered db are we using? */
  BT_nlistnode *nlist;          /* node freelist */
  BT_mlistnode *mlist;          /* memory freelist */
  BT_flistnode *flist;          /* pma file freelist */
  BT_flistnode *pending_flist;
  BT_nlistnode *pending_nlist;
};

//// ===========================================================================
//// btree internal routines

static void _bt_printnode(BT_page *node) __attribute__((unused)); /* ;;: tmp */

static int
_bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo,
              BT_page *parent, size_t childidx); /* ;;: tmp */

static int _bt_flip_meta(BT_state *);
#define BT_MAXDEPTH 4           /* ;;: todo derive it */
typedef struct BT_findpath BT_findpath;
struct BT_findpath {
  BT_page *path[BT_MAXDEPTH];
  size_t idx[BT_MAXDEPTH];
  uint8_t depth;
};
/* _node_get: get a pointer to a node stored at file offset pgno */
static BT_page *
_node_get(BT_state *state, pgno_t pgno)
{
  /* TODO: eventually, once we can store more than 2M of nodes, this will need
     to reference the meta page's blk_base array to determine where a node is
     mapped. i.e:

     - receive pgno
     - find first pgno in blk_base that exceeds pgno: i
     - sector that contains node is i-1
     - appropriately offset into i-1th fixed size partition: 2M, 8M, 16M, ...
  */

  /* for now, this works because the 2M sector is at the beginning of both the
     memory arena and pma file
  */
  assert(pgno >= BT_NUMMETAS);

#if 0
  BT_meta *meta = state->meta_pages[state->which];
  /* find the partition that contains pgno */
  size_t partition_idx = 0;
  for (; ; partition_idx++) {
    assert(partition_idx < BT_NUMPARTS);
    pgno_t partition_beg = meta->blk_base[partition_idx];
    pgno_t partition_end = partition_beg + B2PAGES(BLK_BASE_LENS_b[partition_idx]);
    if (partition_end > pgno) {
      assert(partition_beg <= pgno);
      break;
    }
  }

  /* ;;: hmm. is there something wrong here? No, I don't think so.

     On resume (reading a persistent file):

     1) mmap the node partitions.
        - (read the offset stored in meta->blk_base)
        - mmap the offset + corresponding length of the pma file next to the end
          of the last partition in the memory arena. (in memory, nodes are all
          stored at the lowest addresses)

     calls to _node_get are given a pgno in the persistent file:

     1) find the partition that contains this pgno
     2) Do math on the pgno + found partition to find the memory address it's
        mapped to and return that as a BT_page *

     *** We do, however, need to be sure we aren't cheating anywhere and using a
     *** page offset into the memory arena and calling _node_get on it. That
     *** would technically work for the first partition. It will NOT work for
     *** any other partition. Not sure if we are doing that anywhere currently.
  */
#endif

  return FO2PA(state->map, pgno);
}

/* ;;: I don't think we should need this if _bt_nalloc also returns a disc offset */
static pgno_t
_fo_get(BT_state *state, BT_page *node)
{
  /* ;;: This may need to be fixed to accommodate partition striping */
  uintptr_t vaddr = (uintptr_t)node;
  uintptr_t start = (uintptr_t)state->map;
  return BY2FO(vaddr - start);
}

static void
_mlist_record_alloc(BT_state *state, void *lo, void *hi)
{
  BT_mlistnode **head = &state->mlist;
  BYTE *lob = lo;
  BYTE *hib = hi;
  while (*head) {
    /* found chunk */
    if ((*head)->lo <= lob && (*head)->hi >= hib)
      break;
    assert((*head)->next);
    head = &(*head)->next;
  }

  if (hib < (*head)->hi) {
    if (lob > (*head)->lo) {
      BT_mlistnode *left = *head;
      BT_mlistnode *right = calloc(1, sizeof *right);
      right->hi = left->hi;
      right->lo = hib;
      right->next = left->next;
      left->hi = lob;
      left->next = right;
    }
    else {
      /* lob equal */
      (*head)->lo = hib;
    }
  }
  else if (lob > (*head)->lo) {
    /* hib equal */
    (*head)->hi = lob;
  }
  else {
    /* equals */
    BT_mlistnode *next = (*head)->next;
    free(*head);
    *head = next;
  }
}

/* ;;: tmp. forward declared. move shit around */
static pgno_t
_bt_falloc(BT_state *state, size_t pages);
static void
_nlist_insertn(BT_state *state, BT_nlistnode **dst, pgno_t lo, pgno_t hi);
static void
_nlist_grow(BT_state *state)
/* grows the nlist by allocating the next sized stripe from the block base
   array. Handles storing the offset of this stripe in state->blk_base */
{
  BT_meta *meta = state->meta_pages[state->which];

  /* find the next block (zero pgno) */
  size_t next_block = 0;
  for (; meta->blk_base[next_block] != 0; next_block++)
    assert(next_block < BT_NUMPARTS);

  /* falloc the node partition and store its offset in the metapage */
  size_t block_len_b = BLK_BASE_LENS_b[next_block];
  size_t block_len_p = B2PAGES(block_len_b);
  DPRINTF("Adding a new node stripe of size (pages): 0x%zX", block_len_p);
  pgno_t partition_pg = _bt_falloc(state, block_len_p);
  size_t partoff_b = P2BYTES(partition_pg);
  meta->blk_base[next_block] = partition_pg;

  /* calculate the target memory address of the mmap call (the length of all
     partitions preceding it) */
  BYTE *targ = BT_MAPADDR + BT_META_SECTION_WIDTH;
  for (size_t i = 0; i < next_block; i++) {
    targ += BLK_BASE_LENS_b[i];
  }

  /* map the newly alloced node partition */
  if (targ != mmap(targ,
                   block_len_b,
                   BT_PROT_CLEAN,
                   BT_FLAG_CLEAN,
                   state->data_fd,
                   partoff_b)) {
    DPRINTF("mmap: failed to map node stripe %zu, addr: 0x%p, file offset (bytes): 0x%zX, errno: %s",
            next_block, targ, partoff_b, strerror(errno));
    abort();
  }

  pgno_t memoff_p = B2PAGES(targ - BT_MAPADDR);
  /* ;;: tmp assert. debugging. */
  assert(&((BT_page *)state->map)[memoff_p] == (BT_page *)targ);

  /* add the partition to the nlist */
  _nlist_insertn(state,
                 &state->nlist,
                 memoff_p,
                 memoff_p + block_len_p);
}
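
/* Layout sketch (added commentary; assumes 4K pages): stripes are mmapped back
   to back in the arena -- stripe 0 at BT_MAPADDR + BT_META_SECTION_WIDTH,
   stripe 1 at BT_MAPADDR + 2M, stripe 2 at BT_MAPADDR + 10M, and so on --
   while the pages backing each stripe live wherever _bt_falloc found room in
   the file. */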

static void
_nlist_record_alloc(BT_state *state, BT_page *lo)
{
  BT_nlistnode **head = &state->nlist;
  BT_page *hi = lo + 1;
  while (*head) {
    /* found chunk */
    if ((*head)->lo <= lo && (*head)->hi >= hi)
      break;
    assert((*head)->next);
    head = &(*head)->next;
  }

  if (hi < (*head)->hi) {
    if (lo > (*head)->lo) {
      BT_nlistnode *left = *head;
      BT_nlistnode *right = calloc(1, sizeof *right);
      right->hi = left->hi;
      right->lo = hi;
      right->next = left->next;
      left->hi = lo;
      left->next = right;
    }
    else {
      /* lo equal */
      (*head)->lo = hi;
    }
  }
  else if (lo > (*head)->lo) {
    /* hi equal */
    (*head)->hi = lo;
  }
  else {
    /* equals */
    BT_nlistnode *next = (*head)->next;
    free(*head);
    *head = next;
  }
}

static void
_flist_record_alloc(BT_state *state, pgno_t lo, pgno_t hi)
{
  BT_flistnode **head = &state->flist;
  while (*head) {
    /* found chunk */
    if ((*head)->lo <= lo && (*head)->hi >= hi)
      break;
    assert((*head)->next);
    head = &(*head)->next;
  }

  if (hi < (*head)->hi) {
    if (lo > (*head)->lo) {
      BT_flistnode *left = *head;
      BT_flistnode *right = calloc(1, sizeof *right);
      right->hi = left->hi;
      right->lo = hi;
      right->next = left->next;
      left->hi = lo;
      left->next = right;
    }
    else {
      /* lo equal */
      (*head)->lo = hi;
    }
  }
  else if (lo > (*head)->lo) {
    /* hi equal */
    (*head)->hi = lo;
  }
  else {
    /* equals */
    BT_flistnode *next = (*head)->next;
    free(*head);
    *head = next;
  }
}
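
/* Note on the three *_record_alloc routines above (added commentary): each
   walks its freelist for the chunk containing [lo, hi) and then handles four
   cases -- carve from the middle (split the chunk in two), trim the low end,
   trim the high end, or consume the chunk exactly (unlink and free it). */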

static BT_page *
_bt_nalloc(BT_state *state)
/* allocate a node in the node freelist */
{
  /* TODO: maybe change _bt_nalloc to return both a file and a node offset as
     params to the function and make actual return value an error code. This is
     to avoid forcing some callers to immediately use _fo_get */
  BT_nlistnode **n;
  BT_page *ret;

 start:
  n = &state->nlist;
  ret = 0;

  for (; *n; n = &(*n)->next) {
    size_t sz_p = (*n)->hi - (*n)->lo;

    /* ;;: refactor? this is ridiculous */
    if (sz_p >= 1) {
      ret = (*n)->lo;
      _nlist_record_alloc(state, ret);
      break;
    }
  }

  if (ret == 0) {
    DPUTS("nlist out of mem. allocating a new block.");
    _nlist_grow(state);
    /* restart the find procedure */
    goto start;
  }

  /* make node writable */
  if (mprotect(ret, sizeof(BT_page), BT_PROT_DIRTY) != 0) {
    DPRINTF("mprotect of node: %p failed with %s", ret, strerror(errno));
    abort(); /* ;;: https://stackoverflow.com/questions/6862825/getting-cannot-allocate-memory-error */
  }

  return ret;
}

static int
_node_cow(BT_state *state, BT_page *node, pgno_t *pgno)
{
  BT_page *ret = _bt_nalloc(state); /* ;;: todo: assert node has no dirty entries */
  memcpy(ret->datk, node->datk, sizeof node->datk[0] * BT_DAT_MAXKEYS);
  *pgno = _fo_get(state, ret);
  return BT_SUCC;
}

static void *
_bt_bsearch(BT_page *page, vaof_t va) __attribute((unused));

/* binary search a page's data section for a va. Returns a pointer to the found BT_dat */
static void *
_bt_bsearch(BT_page *page, vaof_t va)
{
  /* ;;: todo: actually bsearch rather than linear */
  for (BT_kv *kv = &page->datk[0]; kv <= (BT_kv *)BT_dat_maxva(page); kv++) {
    if (kv->va == va)
      return kv;
  }

  return 0;
}

static size_t
_bt_childidx(BT_page *node, vaof_t lo, vaof_t hi)
/* looks up the child index in a parent node. If not found, return is
   BT_DAT_MAXKEYS */
{
  size_t i = 0;
  for (; i < BT_DAT_MAXKEYS - 1; i++) {
    vaof_t llo = node->datk[i].va;
    vaof_t hhi = node->datk[i+1].va;
    if (llo <= lo && hhi >= hi)
      return i;
  }
  return BT_DAT_MAXKEYS;
}
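
/* Worked example (added commentary): if a node's vas are [0, 10, 50, ...], a
   query with lo=12, hi=40 fits inside [10, 50) and _bt_childidx returns 1; a
   query straddling a boundary, e.g. lo=8, hi=20, fits no single child range
   and returns BT_DAT_MAXKEYS. */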

/* ;;: find returns a path to nodes that things should be in if they are there. */
/* a leaf has a meta page depth eq to findpath depth */
static int
_bt_find2(BT_state *state,
          BT_page *node,
          BT_findpath *path,
          uint8_t maxdepth,
          vaof_t lo,
          vaof_t hi)
{
  /* ;;: meta node stores depth (node or leaf?)
     look at root node and binsearch BT_dats where low is <= lo and high is >= hi
     If at depth of metapage (a leaf), then done
     otherwise grab node, increment depth, save node in path
  */
  if (path->depth > maxdepth)
    return ENOENT;

  assert(node != 0);

  size_t i;
  if ((i = _bt_childidx(node, lo, hi)) == BT_DAT_MAXKEYS)
    return ENOENT;

  if (path->depth == maxdepth) {
    path->idx[path->depth] = i;
    path->path[path->depth] = node;
    return BT_SUCC;
  }
  /* then branch */
  else {
    pgno_t fo = node->datk[i].fo;
    BT_page *child = _node_get(state, fo);
    path->idx[path->depth] = i;
    path->path[path->depth] = node;
    path->depth++;
    return _bt_find2(state, child, path, maxdepth, lo, hi);
  }
}

static void
_bt_root_new(BT_meta *meta, BT_page *root)
{
  /* The first usable address in the PMA is just beyond the btree segment */
  root->datk[0].va = B2PAGES(BLK_BASE_LEN_TOTAL);
  root->datk[0].fo = 0;
  root->datk[1].va = UINT32_MAX;
  root->datk[1].fo = 0;
}

static int
_bt_find(BT_state *state, BT_findpath *path, vaof_t lo, vaof_t hi)
{
  path->depth = 1;
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);
  uint8_t maxdepth = meta->depth;
  return _bt_find2(state, root, path, maxdepth, lo, hi);
}

static int
_bt_findpath_is_root(BT_findpath *path) __attribute((unused));

static int
_bt_findpath_is_root(BT_findpath *path)
{
  assert(path != 0);
  return path->depth == 0;
}

/* _bt_numkeys: find next empty space in node's data section. Returned as
   index into node->datk. If the node is full, return is BT_DAT_MAXKEYS */
static size_t
_bt_numkeys(BT_page *node)
{
  size_t i = 1;
  for (; i < BT_DAT_MAXKEYS; i++) {
    if (node->datk[i].va == 0) break;
  }
  return i;
}

static int
_bt_datshift(BT_page *node, size_t i, size_t n)
/* shift data segment at i over by n KVs */
{
  assert(i + n < BT_DAT_MAXKEYS); /* check buffer overflow */
  size_t siz = sizeof node->datk[0];
  size_t bytelen = (BT_DAT_MAXKEYS - i - n) * siz;
  memmove(&node->datk[i+n], &node->datk[i], bytelen);
  ZERO(&node->datk[i], n * siz); /* NB: not completely necessary */
  return BT_SUCC;
}

/* _bt_split_datcopy: copy right half of left node to right node */
static int
_bt_split_datcopy(BT_page *left, BT_page *right)
{
  size_t mid = BT_DAT_MAXKEYS / 2;
  size_t bytelen = mid * sizeof(left->datk[0]);
  /* copy rhs of left to right */
  memcpy(right->datk, &left->datk[mid], bytelen);
  /* zero rhs of left */
  ZERO(&left->datk[mid], bytelen); /* ;;: note, this would be unnecessary if we stored node.N */
  /* the last entry in left should be the first entry in right */
  left->datk[mid].va = right->datk[0].va;

  return BT_SUCC;
}

static int
_bt_ischilddirty(BT_page *parent, size_t child_idx)
{
  assert(child_idx < 2048);
  uint8_t flag = parent->head.dirty[child_idx >> 3];
  return flag & (1 << (child_idx & 0x7));
}

/* ;;: todo: name the 0x8 and 4 literals and/or generalize */
static int
_bt_dirtychild(BT_page *parent, size_t child_idx)
{
  assert(child_idx < 2048);
  /* although there's nothing theoretically wrong with dirtying a dirty node,
     there's probably a bug if we do it since we only dirty a node when it's
     alloced after a split or CoWed */
  assert(!_bt_ischilddirty(parent, child_idx));
  uint8_t *flag = &parent->head.dirty[child_idx >> 3];
  *flag |= 1 << (child_idx & 0x7);
  return BT_SUCC;
}
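
/* Bitmap math (added commentary): child_idx 11 selects dirty[11 >> 3] ==
   dirty[1] and bit (11 & 0x7) == 3, so _bt_dirtychild sets dirty[1] |= 0x08
   and _bt_ischilddirty tests the same bit; the 256-byte map thus covers
   256 * 8 == 2048 children, matching the asserts. */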

static int
_bt_dirtydata(BT_page *leaf, size_t child_idx)
/* effectively the same as _bt_dirtychild (setting the dirty bit at child_idx in
   the given node), with the exception that we don't assert the dirty bit isn't
   set. (Data may be written to the same fileoffset multiple times (a
   malloc-free-malloc cycle)) */
{
  assert(child_idx < 2048);
  uint8_t *flag = &leaf->head.dirty[child_idx >> 3];
  *flag |= 1 << (child_idx & 0x7);
  return BT_SUCC;
}

static int
_bt_cleanchild(BT_page *parent, size_t child_idx)
{
  assert(_bt_ischilddirty(parent, child_idx));
  uint8_t *flag = &parent->head.dirty[child_idx >> 3];
  *flag ^= 1 << (child_idx & 0x7);
  return BT_SUCC;
}

/* ;;: assert that the node is dirty when splitting */
static int
_bt_split_child(BT_state *state, BT_page *parent, size_t i, pgno_t *newchild)
{
  /* ;;: todo: better error handling */
  assert(_bt_ischilddirty(parent, i));

  int rc = BT_SUCC;
  size_t N;
  BT_page *left = _node_get(state, parent->datk[i].fo);
  BT_page *right = _bt_nalloc(state);
  if (right == 0)
    return ENOMEM;
  if (!SUCC(rc = _bt_split_datcopy(left, right)))
    return rc;

  /* adjust high address of left node in parent */
  N = _bt_numkeys(left);

  /* insert reference to right child into parent node */
  N = _bt_numkeys(right);
  vaof_t lo = right->datk[0].va;
  vaof_t hi = right->datk[N-1].va;

  _bt_insertdat(lo, hi, _fo_get(state, right), parent, i);

  /* dirty right child */
  size_t ridx = _bt_childidx(parent, lo, hi);
  assert(ridx == i + 1);        /* 0x100000020100 ;;: tmp? */
  _bt_dirtychild(parent, ridx);

  /* ;;: fix this */
  *newchild = _fo_get(state, right);

  return BT_SUCC;
}

static int
_bt_rebalance(BT_state *state, BT_page *node) __attribute((unused));

static int
_bt_rebalance(BT_state *state, BT_page *node)
{
  return 255;
}

/* insert lo, hi, and fo in parent's data section for childidx */
static int
_bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo,
              BT_page *parent, size_t childidx)
{
#if DEBUG_PRINTNODE
  DPRINTF("BEFORE INSERT lo %" PRIu32 " hi %" PRIu32 " fo %" PRIu32, lo, hi, fo);
  _bt_printnode(parent);
#endif

  /* ;;: TODO confirm this logic is appropriate for branch nodes. (It /should/
     be correct for leaf nodes) */
  vaof_t llo = parent->datk[childidx].va;
  vaof_t hhi = parent->datk[childidx+1].va;

  /* NB: it can be assumed that llo <= lo and hi <= hhi because this routine is
     called using an index found with _bt_childidx */

  /* duplicate range: just replace the file offset */
  if (llo == lo && hhi == hi) {
    parent->datk[childidx].fo = fo;
    return BT_SUCC;
  }

  if (llo == lo) {
    /* new range shares the low bound: shift right by one and split off the
       remainder of the old range */
    _bt_datshift(parent, childidx + 1, 1);
    vaof_t oldfo = parent->datk[childidx].fo;
    parent->datk[childidx].fo = fo;
    parent->datk[childidx+1].va = hi;
    parent->datk[childidx+1].fo = (oldfo == 0)
      ? 0
      : oldfo + (hi - llo);
  }
  else if (hhi == hi) {
    /* new range shares the high bound */
    _bt_datshift(parent, childidx + 1, 1);
    parent->datk[childidx+1].va = lo;
    parent->datk[childidx+1].fo = fo;
  }
  else {
    /* new range falls strictly inside the old one: shift by two and stitch the
       surrounding pieces back together */
    _bt_datshift(parent, childidx + 1, 2);
    parent->datk[childidx+1].va = lo;
    parent->datk[childidx+1].fo = fo;
    parent->datk[childidx+2].va = hi;
    pgno_t lfo = parent->datk[childidx].fo;
    vaof_t lva = parent->datk[childidx].va;
    parent->datk[childidx+2].fo = (lfo == 0)
      ? 0
      : lfo + (hi - lva);
  }

#if DEBUG_PRINTNODE
  DPUTS("AFTER INSERT");
  _bt_printnode(parent);
#endif

  return BT_SUCC;
}
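
/* Worked example (added commentary): inserting (lo=20, hi=30, fo=7) into an
   entry spanning va [10, 50) with fo 0 takes the final branch: after the
   two-slot shift the entries read [10,0] [20,7] [30,0] [50,...], i.e. the free
   range is split around the newly mapped pages. */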

//// ===========================================================================
//// wip - deletion coalescing

/* ;;: todo: rename routines */
int
_bt_delco_1pass_0(BT_state *state, vaof_t lo, vaof_t hi,
                  BT_page *node, uint8_t depth, uint8_t maxdepth)
{
  /* Perform a dfs search on all ranges that fall within lo and hi */
  size_t N = _bt_numkeys(node);
  size_t loidx = 0;
  size_t hiidx = 0;

  /* first find the entry that matches lo */
  size_t i;
  for (i = 0; i < N-1; i++) {
    vaof_t hhi = node->datk[i+1].va;
    if (hhi > lo) {
      loidx = i;
      break;
    }
  }

  /* and then the entry that matches hi */
  for (; i < N; i++) {
    vaof_t hhi = node->datk[i].va;
    if (hhi >= hi) {
      hiidx = i;
      break;
    }
  }

  /* node->datk[loidx] - node->datk[hiidx] are the bounds on which to perform
     the dfs */
  for (i = loidx; i < hiidx; i++) {
    pgno_t pg = node->datk[i].fo;

    /* if at the leaf level, terminate with failure if pg is not free */
    if (depth == maxdepth) {
      if (pg != 0) return 1;
      else continue;
    }

    /* otherwise, dfs the child node */
    BT_page *child = _node_get(state, pg);
    if (!SUCC(_bt_delco_1pass_0(state, lo, hi, child, depth+1, maxdepth)))
      return 1;
  }

  /* whether we're at a leaf or a branch, by now all pages corresponding to the
     hi-lo range must be free */
  return BT_SUCC;
}

/* ;;: since this is called by another recursive function _bt_delco that first
   finds if a split exists, this /could/ take a pgno to avoid unnecessarily
   rewalking the tree. not a big deal though as is. */
static int
_bt_delco_1pass(BT_state *state, vaof_t lo, vaof_t hi)
/* succeeds (BT_SUCC) if the leaves in the given range are all free (pgno of 0),
   fails otherwise. This must be the case for an insert into an overlapping
   range to succeed */
{
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);
  return _bt_delco_1pass_0(state, lo, hi, root, 1, meta->depth);
}

static void
_mlist_insert(BT_state *state, void *lo, void *hi)
{
  BT_mlistnode **dst = &state->mlist;
  BT_mlistnode **prev_dst = 0;
  BYTE *lob = lo;
  BYTE *hib = hi;

  while (*dst) {
    if (hib == (*dst)->lo) {
      (*dst)->lo = lob;
      /* check if we can coalesce with left neighbor */
      if (prev_dst != 0) {
        bp(0);                  /* ;;: note, this case should not hit. keeping for debugging. */
        /* dst equals &(*prev_dst)->next */
        assert(*prev_dst != 0);
        if ((*prev_dst)->hi == lob) {
          (*prev_dst)->hi = (*dst)->hi;
          (*prev_dst)->next = (*dst)->next;
          free(*dst);
        }
      }
      return;
    }
    if (lob == (*dst)->hi) {
      (*dst)->hi = hib;
      /* check if we can coalesce with right neighbor */
      if ((*dst)->next != 0) {
        if (hib == (*dst)->next->lo) {
          (*dst)->hi = (*dst)->next->hi;
          BT_mlistnode *dst_next = (*dst)->next;
          (*dst)->next = (*dst)->next->next;
          free(dst_next);
        }
      }
      return;
    }
    if (hib > (*dst)->lo) {
      assert(lob > (*dst)->hi);
      assert(hib > (*dst)->hi);
      prev_dst = dst;
      dst = &(*dst)->next;
      continue;
    }

    /* ;;: i think this chunk can replace the below "found end of list" chunk */
    /* otherwise, insert discontinuous node */
    BT_mlistnode *new = calloc(1, sizeof *new);
    new->lo = lob;
    new->hi = hib;
    new->next = *dst;
    *dst = new;
    return;
  }

  /* found end of list */
  BT_mlistnode *new = calloc(1, sizeof *new);
  new->lo = lob;
  new->hi = hib;
  new->next = 0;
  (*dst) = new;
}

static void
_nlist_insert2(BT_state *state, BT_nlistnode **dst, BT_page *lo, BT_page *hi)
{
  BT_nlistnode **prev_dst = 0;

  while (*dst) {                /* ;;: this is a problem */
    if (hi == (*dst)->lo) {
      (*dst)->lo = lo;
      /* check if we can coalesce with left neighbor */
      if (prev_dst != 0) {
        bp(0);                  /* ;;: note, this case should not hit. keeping for debugging. */
        /* dst equals &(*prev_dst)->next */
        assert(*prev_dst != 0);
        if ((*prev_dst)->hi == lo) {
          (*prev_dst)->hi = (*dst)->hi;
          (*prev_dst)->next = (*dst)->next;
          free(*dst);
        }
      }
      return;
    }
    if (lo == (*dst)->hi) {
      (*dst)->hi = hi;
      /* check if we can coalesce with right neighbor */
      if ((*dst)->next != 0) {
        if (hi == (*dst)->next->lo) {
          (*dst)->hi = (*dst)->next->hi;
          BT_nlistnode *dst_next = (*dst)->next;
          (*dst)->next = (*dst)->next->next;
          free(dst_next);
        }
      }
      return;
    }
    if (hi > (*dst)->lo) {
      assert(lo > (*dst)->hi);
      assert(hi > (*dst)->hi);
      prev_dst = dst;
      dst = &(*dst)->next;
      continue;
    }
  }

  /* otherwise, insert discontinuous node */
  BT_nlistnode *new = calloc(1, sizeof *new);
  new->lo = lo;
  new->hi = hi;
  new->next = *dst;
  *dst = new;
}

static void
_nlist_insert(BT_state *state, BT_nlistnode **dst, pgno_t nodepg)
{
  BT_page *lo = _node_get(state, nodepg);
  BT_page *hi = _node_get(state, nodepg + 1);
  _nlist_insert2(state, dst, lo, hi);
}

static void
_nlist_insertn(BT_state *state, BT_nlistnode **dst, pgno_t lo, pgno_t hi)
{
  _nlist_insert2(state,
                 dst,
                 _node_get(state, lo),
                 _node_get(state, hi));
}

static void
_pending_nlist_merge(BT_state *state)
{
  BT_nlistnode *src_head = state->pending_nlist;
  BT_nlistnode *prev = 0;
  while (src_head) {
    _nlist_insert2(state, &state->nlist, src_head->lo, src_head->hi);
    prev = src_head;
    src_head = src_head->next;
    free(prev);
  }
  state->pending_nlist = 0;
}

static void
_flist_insert(BT_flistnode **dst, pgno_t lo, pgno_t hi)
{
  BT_flistnode **prev_dst = 0;

  while (*dst) {
    if (hi == (*dst)->lo) {
      (*dst)->lo = lo;
      /* check if we can coalesce with left neighbor */
      if (prev_dst != 0) {
        bp(0);                  /* ;;: note, this case should not hit. keeping for debugging. */
        /* dst equals &(*prev_dst)->next */
        assert(*prev_dst != 0);
        if ((*prev_dst)->hi == lo) {
          (*prev_dst)->hi = (*dst)->hi;
          (*prev_dst)->next = (*dst)->next;
          free(*dst);
        }
      }
      return;
    }
    if (lo == (*dst)->hi) {
      (*dst)->hi = hi;
      /* check if we can coalesce with right neighbor */
      if ((*dst)->next != 0) {
        if (hi == (*dst)->next->lo) {
          (*dst)->hi = (*dst)->next->hi;
          BT_flistnode *dst_next = (*dst)->next;
          (*dst)->next = (*dst)->next->next;
          free(dst_next);
        }
      }
      return;
    }
    if (hi > (*dst)->lo) {
      /* advance to next freeblock and retry */
      assert(lo > (*dst)->hi);
      assert(hi > (*dst)->hi);
      prev_dst = dst;
      dst = &(*dst)->next;
      continue;
    }

    /* otherwise, insert discontinuous node */
    BT_flistnode *new = calloc(1, sizeof *new);
    new->lo = lo;
    new->hi = hi;
    new->next = *dst;
    *dst = new;
    return;
  }

  /* found end of list: append a discontinuous node */
  BT_flistnode *new = calloc(1, sizeof *new);
  new->lo = lo;
  new->hi = hi;
  new->next = *dst;
  *dst = new;
  return;
}

static void
_pending_flist_merge(BT_state *state)
{
  BT_flistnode *src_head = state->pending_flist;
  BT_flistnode *prev = 0;
  while (src_head) {
    _flist_insert(&state->flist, src_head->lo, src_head->hi);
    prev = src_head;
    src_head = src_head->next;
    free(prev);
  }
  state->pending_flist = 0;
}

/* ;;: todo move shit around */
static void
_bt_delco_droptree2(BT_state *state, pgno_t nodepg,
                    uint8_t depth, uint8_t maxdepth, int isdirty)
{
  int ischilddirty = 0;

  /* branch */
  if (depth != maxdepth) {
    BT_page *node = _node_get(state, nodepg);
    for (size_t i = 0; i < BT_DAT_MAXKEYS; i++) {
      BT_kv entry = node->datk[i];
      if (entry.fo == 0)
        break;                  /* done */
      ischilddirty = _bt_ischilddirty(node, i);
      _bt_delco_droptree2(state, entry.fo, depth+1, maxdepth, ischilddirty);
    }
  }

  /* branch and leaf */
  if (isdirty) {
    _nlist_insert(state, &state->nlist, nodepg);
  }
  else {
    _nlist_insert(state, &state->pending_nlist, nodepg);
  }
}

static void
_bt_delco_droptree(BT_state *state, pgno_t nodepg, uint8_t depth, int isdirty)
{
  /* completely drop a tree. Assume that all leaves under the tree are free
     (pgno of 0) */
  assert(nodepg >= 2);
  BT_meta *meta = state->meta_pages[state->which];
  _bt_delco_droptree2(state, nodepg, depth, meta->depth, isdirty);
}

static void
_bt_delco_trim_rsubtree_lhs2(BT_state *state, vaof_t lo, vaof_t hi,
                             pgno_t nodepg, uint8_t depth, uint8_t maxdepth)
{
  BT_page *node = _node_get(state, nodepg);
  size_t hiidx = 0;
  size_t N = _bt_numkeys(node);

  /* find hi idx of range */
  size_t i;
  for (i = 0; i < N; i++) {
    vaof_t hhi = node->datk[i].va;
    if (hhi >= hi) {
      hiidx = i;
      break;
    }
  }

  /* set the lo address of datk[hiidx] to hi */
  node->datk[hiidx-1].va = hi;

  /* drop the subtrees left of the range */
  if (depth != maxdepth) {
    for (i = 0; i < hiidx-1; i++) {
      pgno_t childpg = node->datk[i].fo;
      if (childpg == 0)
        break;
      int ischilddirty = _bt_ischilddirty(node, i);
      _bt_delco_droptree(state, childpg, depth+1, ischilddirty);
    }
  }

  /* memmove the buffer so the found range is the first in the node */
  BYTE *dst = (BYTE *)&node->datk[0].va;
  BYTE *src = (BYTE *)&node->datk[hiidx-1].va;
  BYTE *end = (BYTE *)&node->datk[BT_DAT_MAXKEYS-1].fo;
  size_t len = end - src;
  memmove(dst, src, len);
  /* ;;: TODO add temporary asserts for testing? */

  /* and now zero the moved range */
  ZERO(dst+len, end - (dst+len));

  /* done if this is a leaf */
  if (depth == maxdepth)
    return;

  /* otherwise, recur on subtree */
  pgno_t rsubtree = node->datk[hiidx].fo;
  _bt_delco_trim_rsubtree_lhs2(state, lo, hi, rsubtree, depth+1, maxdepth);
}

static void
_bt_delco_trim_rsubtree_lhs(BT_state *state, vaof_t lo, vaof_t hi,
                            pgno_t nodepg, uint8_t depth)
{
  BT_meta *meta = state->meta_pages[state->which];
  _bt_delco_trim_rsubtree_lhs2(state, lo, hi, nodepg, depth, meta->depth);
}

static void
_bt_delco_trim_lsubtree_rhs2(BT_state *state, vaof_t lo, vaof_t hi,
                             pgno_t nodepg, uint8_t depth, uint8_t maxdepth)
{
  BT_page *node = _node_get(state, nodepg);
  size_t N = _bt_numkeys(node);
  size_t loidx = 0;

  /* find low idx of range */
  size_t i;
  for (i = 0; i < N-1; i++) {
    vaof_t hhi = node->datk[i+1].va;
    if (hhi > lo) {
      loidx = i;
      break;
    }
  }

  /* set the hi address of datk[loidx] to hi */
  node->datk[loidx+1].va = hi;

  /* drop the subtrees right of the range */
  if (depth != maxdepth) {
    /* recur and droptree for branches */
    for (i = loidx+1; i < N-1; i++) {
      pgno_t childpg = node->datk[i].fo;
      if (childpg == 0)
        break;
      int ischilddirty = _bt_ischilddirty(node, i);
      _bt_delco_droptree(state, childpg, depth+1, ischilddirty);
    }
  }

  /* always zero rhs whether node is a leaf or a branch */
  BYTE *beg = (BYTE *)&node->datk[loidx+1].fo;
  BYTE *end = (BYTE *)&node->datk[BT_DAT_MAXKEYS-1].fo;
  size_t len = end - beg;
  ZERO(beg, len);
  /* ;;: this won't zero the last fo, but that should be fine. remove the assert
     when you're confident it /is/ fine */
  assert(node->datk[BT_DAT_MAXKEYS-1].fo == 0);

  /* done if this is a leaf */
  if (depth == maxdepth)
    return;

  /* otherwise, recur on the left subtree */
  pgno_t lsubtree = node->datk[loidx].fo;
  _bt_delco_trim_lsubtree_rhs2(state, lo, hi, lsubtree, depth+1, maxdepth);
}

static void
_bt_delco_trim_lsubtree_rhs(BT_state *state, vaof_t lo, vaof_t hi,
                            pgno_t nodepg, uint8_t depth)
{
  BT_meta *meta = state->meta_pages[state->which];
  _bt_delco_trim_lsubtree_rhs2(state, lo, hi, nodepg, depth, meta->depth);
}

static void
_bt_delco(BT_state *state, vaof_t lo, vaof_t hi,
          pgno_t nodepg, uint8_t depth, uint8_t maxdepth)
{
  /* ;;: "find_internal_splits" in the original algorithm */
  BT_page *node = _node_get(state, nodepg);
  size_t N = _bt_numkeys(node);
  size_t loidx = 0;
  size_t hiidx = 0;
  pgno_t lsubtree = 0;
  pgno_t rsubtree = 0;

  /* find low idx of range */
  for (size_t i = 0; i < N-1; i++) {
    vaof_t hhi = node->datk[i+1].va;
    if (hhi > lo) {
      loidx = i;
      break;
    }
  }

  /* find high idx of range */
  for (size_t i = loidx; i < N; i++) {
    vaof_t hhi = node->datk[i].va;
    if (hhi >= hi) {
      assert(i > 0);
      hiidx = i - 1;
      break;
    }
  }

  /* non-split range and at leaf. done */
  if (depth == maxdepth
      && hiidx == loidx) {
    return;
  }

  lsubtree = node->datk[loidx].fo;
  rsubtree = node->datk[hiidx].fo;

  if (depth < maxdepth) {
    /* guarantee path is dirty by CoWing node if not */
    /* ;;: refactor? code duplication?? */
    if (!_bt_ischilddirty(node, loidx)) {
      BT_page *child = _node_get(state, lsubtree);
      pgno_t newpg;
      _node_cow(state, child, &newpg);
      lsubtree = node->datk[loidx].fo = newpg;
      _bt_dirtychild(node, loidx);
    }

    if (!_bt_ischilddirty(node, hiidx)) {
      BT_page *child = _node_get(state, rsubtree);
      pgno_t newpg;
      _node_cow(state, child, &newpg);
      rsubtree = node->datk[hiidx].fo = newpg;
      _bt_dirtychild(node, hiidx);
    }
  }

  /* non-split range, recurse to child tree */
  if (hiidx == loidx) {
    pgno_t childpg = node->datk[loidx].fo;
    _bt_delco(state, lo, hi, childpg, depth+1, maxdepth);
  }

  /* split range discovered */
  if (hiidx > loidx) {
    /* run first pass to guarantee range is completely free */
    if (!SUCC(_bt_delco_1pass(state, lo, hi))) {
      /* attempted insert on split range that cannot be coalesced */
      assert(0);
    }

    /* set leftmost boundary va to hi */
    node->datk[loidx+1].va = hi;

    /* set the lo side of the right boundary to hi */
    node->datk[hiidx].va = hi;

    /* drop all trees between the two subtrees */
    for (size_t i = loidx+1; i < hiidx; i++) {
      pgno_t childpg = node->datk[i].fo;
      int ischilddirty = _bt_ischilddirty(node, i);
      _bt_delco_droptree(state, childpg, depth+1, ischilddirty);
    }

    /* move buffer */
    BYTE *dst = (BYTE *)&node->datk[loidx+1].va;
    BYTE *src = (BYTE *)&node->datk[hiidx].va;
    BYTE *end = (BYTE *)&node->datk[BT_DAT_MAXKEYS-1].fo;
    size_t len = end - src;
    memmove(dst, src, len);
    ZERO(dst+len, end - (dst+len));

    /* unless at leaf trim left subtree then trim right subtree */
    if (depth < maxdepth) {
      _bt_delco_trim_lsubtree_rhs(state, lo, hi, lsubtree, depth+1);
      _bt_delco_trim_rsubtree_lhs(state, lo, hi, rsubtree, depth+1);
    }

    /* done */
    return;
  }
}

/* ;;: todo, update meta->depth when we add a row. Should this be done in
   _bt_rebalance? */
static int
_bt_insert2(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo,
            BT_page *node, size_t depth)
{
  /* ;;: to be written in such a way that node is guaranteed both dirty and
     non-full */

  /* ;;: remember:
     - You need to CoW+dirty a node when you insert a non-dirty node.
     - You need to insert into a node when:
       - It's a leaf
       - It's a branch and you CoWed the child
     - Hence, all nodes in a path to a leaf being inserted into need to already
       be dirty or explicitly CoWed. Splitting doesn't actually factor into this
       decision afaict.
  */

  assert(node);

  int rc = 255;
  size_t N = 0;
  size_t childidx = _bt_childidx(node, lo, hi);
  assert(childidx != BT_DAT_MAXKEYS);
  BT_meta *meta = state->meta_pages[state->which];

  if (depth < meta->depth) {
    pgno_t childpgno = node->datk[childidx].fo;
    BT_page *child = _node_get(state, childpgno);
    N = _bt_numkeys(child);
  }

  /* nullcond: node is a leaf */
  if (meta->depth == depth) {
    /* dirty the data range */
    _bt_dirtydata(node, childidx);
    /* guaranteed non-full and dirty by n-1 recursive call, so just insert */
    return _bt_insertdat(lo, hi, fo, node, childidx);
  }

  /* do we need to CoW the child node? */
  if (!_bt_ischilddirty(node, childidx)) {
    pgno_t pgno;
    BT_page *child = _node_get(state, node->datk[childidx].fo);
    _node_cow(state, child, &pgno);
    node->datk[childidx].fo = pgno;
    _bt_dirtychild(node, childidx);
  }

  /* do we need to split the child node? */
  if (N >= BT_DAT_MAXKEYS - 2) {
    pgno_t rchild_pgno;
    if (!SUCC(rc = _bt_split_child(state, node, childidx, &rchild_pgno)))
      return rc;

    /* since we split the child's data, recalculate the child idx */
    /* ;;: note, this can be simplified into a conditional i++ */
    childidx = _bt_childidx(node, lo, hi);
  }

  /* the child is now guaranteed non-full (split) and dirty. Recurse */
  BT_page *child = _node_get(state, node->datk[childidx].fo);
  return _bt_insert2(state, lo, hi, fo, child, depth+1);
}

static int
_bt_insert(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo)
/* handles CoWing/splitting of the root page since it's special cased. Then
   passes the child matching hi/lo to _bt_insert2 */
{
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);

  /* the root MUST be dirty (zero checksum in metapage) */
  assert(meta->chk == 0);

  size_t N = _bt_numkeys(root);

  /* perform deletion coalescing (and preemptively guarantee path is dirty) if
     inserting a non-zero (non-free) page */
  if (fo != 0) {
    _bt_delco(state, lo, hi, meta->root, 1, meta->depth);
  }

  /* CoW root's child if it isn't already dirty */
  size_t childidx = _bt_childidx(root, lo, hi);
  assert(childidx != BT_DAT_MAXKEYS); /* ;;: this should catch the case of
                                         improperly inserting into a split
                                         range. Should we do it earlier or
                                         differently? */
  if (meta->depth > 1
      && !_bt_ischilddirty(root, childidx)) {
    BT_page *child = _node_get(state, root->datk[childidx].fo);
    pgno_t newchildpg;
    _node_cow(state, child, &newchildpg);
    root->datk[childidx].fo = newchildpg;
    _bt_dirtychild(root, childidx);
  }

  /* before calling into recursive insert, handle root splitting since it's
     special cased (2 allocs) */
  if (N >= BT_DAT_MAXKEYS - 2) { /* ;;: remind, fix all these conditions to be - 2 */
    pgno_t pg = 0;

    /* the old root is now the left child of the new root */
    BT_page *left = root;
    BT_page *right = _bt_nalloc(state);
    BT_page *rootnew = _bt_nalloc(state);

    /* split root's data across left and right nodes */
    _bt_split_datcopy(left, right);
    /* save left and right in new root's .data */
    pg = _fo_get(state, left);
    rootnew->datk[0].fo = pg;
    rootnew->datk[0].va = 0;
    pg = _fo_get(state, right);
    rootnew->datk[1].fo = pg;
    rootnew->datk[1].va = right->datk[0].va;
    rootnew->datk[2].va = UINT32_MAX;
    /* dirty new root's children */
    _bt_dirtychild(rootnew, 0);
    _bt_dirtychild(rootnew, 1);
    /* update meta page information. (root and depth) */
    pg = _fo_get(state, rootnew);
    meta->root = pg;
    meta->depth += 1;
    root = rootnew;
  }

  /*
    meta is dirty
    root is dirty and split if necessary
    root's child in insert path is dirty and split if necessary
    finally, recurse on child
  */
  return _bt_insert2(state, lo, hi, fo, root, 1);
  /* return _bt_insert2(state, lo, hi, fo, child, 1); */
}

/* ;;: wip */
/* ;;: inspired by lmdb's MDB_pageparent. While seemingly unnecessary for
   _bt_insert, this may be useful for _bt_delete when we implement deletion
   coalescing */
typedef struct BT_ppage BT_ppage;
struct BT_ppage {
  BT_page *node;
  BT_page *parent;
};

static int
_bt_delete(BT_state *state, vaof_t lo, vaof_t hi) __attribute((unused));

static int
_bt_delete(BT_state *state, vaof_t lo, vaof_t hi)
{
  /* ;;: tmp, implement coalescing of zero ranges and merging/rebalancing of
     nodes */
  return _bt_insert(state, lo, hi, 0);
}
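
/* Note (added commentary): deletion is currently expressed as an insert of
   file offset 0 over the range, i.e. the virtual range is remapped to "no
   page"; proper node merging/rebalancing remains TODO. */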

static int
_mlist_new(BT_state *state)
{
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);
  /* assert(root->datk[0].fo == 0); */
  size_t N = _bt_numkeys(root);

  vaof_t lo = root->datk[0].va;
  vaof_t hi = root->datk[N-1].va;

  BT_mlistnode *head = calloc(1, sizeof *head);

  head->next = 0;
  head->lo = off2addr(lo);
  head->hi = off2addr(hi);

  state->mlist = head;

  return BT_SUCC;
}

/* ;;: todo: we could remove the hifreepg param if we can derive the highest
   page (alloced or not) in the persistent file. */
static void
_flist_grow(BT_state *state, size_t pages)
/* grows the backing file by at least `pages' (and never less than
   PMA_GROW_SIZE_p) and appends this freespace to the flist */
{
  /* grow the backing file by at least PMA_GROW_SIZE_p */
  pages = MAX(pages, PMA_GROW_SIZE_p);
  off_t bytes = P2BYTES(pages);
  off_t size  = state->file_size_p * BT_PAGESIZE;
  if (ftruncate(state->data_fd, size + bytes) != 0) {
    DPUTS("resize of backing file failed. aborting");
    abort();
  }

  /* and add this space to the flist */
  _flist_insert(&state->flist,
                state->file_size_p,
                state->file_size_p + pages);

  state->file_size_p += pages;
}

static int
_flist_new(BT_state *state, size_t size_p)
#define FLIST_PG_START (BT_META_SECTION_WIDTH / BT_PAGESIZE)
/* #define FLIST_PG_START ((BT_META_SECTION_WIDTH + BLK_BASE_LEN0_b) / BT_PAGESIZE) */
{
  BT_flistnode *head = calloc(1, sizeof *head);
  head->next = 0;
  head->lo = FLIST_PG_START;
  head->hi = size_p;

  state->flist = head;

  return BT_SUCC;
}
#undef FLIST_PG_START

static int
_nlist_creat(BT_state *state, BT_page *start, size_t len_p)
/* create a new nlist in `state' */
{
  BT_nlistnode *head = calloc(1, sizeof *head);
  head->lo = start;
  head->hi = head->lo + len_p;
  head->next = 0;
  state->nlist = head;

  return BT_SUCC;
}

static int
_nlist_new(BT_state *state)
/* create a new nlist */
{
  pgno_t partition_0_pg = _bt_falloc(state, BLK_BASE_LEN0_b / BT_PAGESIZE);
  BT_page *partition_0 = _node_get(state, partition_0_pg);
  /* ;;: tmp. assert. for debugging changes */
  assert(partition_0 == &((BT_page *)state->map)[BT_NUMMETAS]);

  /* the size of a new node freelist is just the first stripe length */
  return _nlist_creat(state, partition_0, B2PAGES(BLK_BASE_LEN0_b));
}

static int
_nlist_load(BT_state *state)
/* create a new nlist from persistent state. Doesn't call _bt_falloc */
{
  BT_meta *meta = state->meta_pages[state->which];
  size_t len_p = 0;
  BT_page *partition_0 = _node_get(state, meta->blk_base[0]);
  /* ;;: tmp. assert. for debugging changes */
  assert(partition_0 == &((BT_page *)state->map)[BT_NUMMETAS]);

  /* calculate total size */
  for (size_t i = 0
         ; i < BT_NUMPARTS && meta->blk_base[i] != 0
         ; i++) {
    len_p += B2PAGES(BLK_BASE_LENS_b[i]);
  }

  return _nlist_creat(state, partition_0, len_p);
}

static int
_nlist_delete(BT_state *state)
{
  BT_nlistnode *head, *prev;
  head = prev = state->nlist;
  while (head->next) {
    prev = head;
    head = head->next;
    free(prev);
  }
  /* free the final node as well */
  free(head);
  state->nlist = 0;
  return BT_SUCC;
}

#if 0
static BT_nlistnode *
_nlist_read_prev(BT_nlistnode *head, BT_nlistnode *curr)
{
  /* find nlist node preceding curr and return it */
  BT_nlistnode *p, *n;
  p = head;
  n = head->next;
  for (; n; p = n, n = n->next) {
    if (n == curr)
      return p;
  }
  return 0;
}

/* TODO this is a pretty bad algorithm in terms of time complexity. It should be
   fixed, but isn't necessary now as our nlist is quite small. You may want to
   consider making nlist doubly linked or incorporate a sort and merge step. */
static int
_nlist_read2(BT_state *state, BT_page *node, uint8_t maxdepth,
             BT_nlistnode *head, uint8_t depth)
/* recursively walk all nodes in the btree. Allocating new nlist nodes when a
   node is found to be in a stripe unaccounted for. For each node found,
   split/shrink the appropriate node to account for the allocated page */
{
  BT_nlistnode *p, *n;
  p = head;
  n = head->next;

  /* find the nlist node that fits the current btree node */
  for (; n; p = n, n = n->next) {
    if (p->va <= node && p->va + p->sz > node)
      break;
  }

  /* if the nlist node is only one page wide, it needs to be freed */
  if (p->sz == 1) {
    BT_nlistnode *prev = _nlist_read_prev(head, p);
    prev->next = p->next;
    free(p);
    goto e;
  }

  /* if the btree node resides at the end of the nlist node, just shrink it */
  BT_page *last = p->va + p->sz - 1;
  if (last == node) {
    p->sz -= 1;
    goto e;
  }

  /* if the btree node resides at the start of the nlist node, likewise shrink
     it and update the va */
  if (p->va == node) {
    p->sz -= 1;
    p->va += 1;
    goto e;
  }

  /* otherwise, need to split the current nlist node */
  BT_nlistnode *right = calloc(1, sizeof *right);
  size_t lsz = node - p->va;
  size_t rsz = (p->va + p->sz) - node;
  /* remove 1 page from the right nlist node's size to account for the allocated
     btree node */
  rsz -= 1;
  assert(lsz > 0 && rsz > 0);

  /* update the size of the left node. And set the size and va of the right
     node. Finally, insert the new nlist node into the nlist. */
  p->sz = lsz;
  right->sz = rsz;
  right->va = node + 1;
  right->next = p->next;
  p->next = right;

 e:
  /* if at a leaf, we're finished */
  if (depth == maxdepth) {
    return BT_SUCC;
  }

  /* otherwise iterate over all child nodes, recursively constructing the
     list */
  int rc = BT_SUCC;
  for (size_t i = 0; i < BT_DAT_MAXKEYS; i++) {
    BT_kv kv = node->datk[i];
    BT_page *child = _node_get(state, node->datk[i].fo);
    if (!child) continue;
    if (!SUCC(rc = _nlist_read2(state,
                                child,
                                maxdepth,
                                head,
                                depth + 1)))
      return rc;
  }

  /* all children traversed */
  return BT_SUCC;
}

static int
_nlist_read(BT_state *state)
{
  /* ;;: this should theoretically be simpler than _mlist_read. right? We can
     derive the stripes that contain nodes from the block base array stored in
     the metapage. What else do we need to know? -- the parts of each stripe
     that are free or in use. How can we discover that?

     1) Without storing any per-page metadata, we could walk the entire tree
     from the root. Check the page number of the node. And modify the freelist
     accordingly.

     2) If we stored per-page metadata, this would be simpler. Linearly traverse
     each stripe and check if the page is BT_NODE or BT_FREE.

     -- are there downsides to (2)? The only advantage to this would be quicker
     startup. So for now, going to traverse all nodes and for each node,
     traverse the nlist and split it appropriately.
  */

  int rc = BT_SUCC;
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);

  /* ;;: since partition striping isn't implemented yet, simplifying code by
     assuming all nodes reside in the 2M region */
  BT_nlistnode *head = calloc(1, sizeof *head);
  head->sz = BLK_BASE_LEN0_b;
  head->va = &((BT_page *)state->map)[BT_NUMMETAS];
  head->next = 0;

  if (!SUCC(rc = _nlist_read2(state, root, meta->depth, head, 1)))
    return rc;

  state->nlist = head;

  return rc;
}

static BT_mlistnode *
_mlist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth)
{
  /* leaf */
  if (depth == maxdepth) {
    BT_mlistnode *head, *prev;
    head = prev = calloc(1, sizeof *head);

    size_t i = 0;
    BT_kv *kv = &node->datk[i];
    while (i < BT_DAT_MAXKEYS - 1) {
#if CAN_COALESCE
      /* free and contiguous with previous mlist node: merge */
      if (kv->fo == 0
          && addr2off(prev->va) + prev->sz == kv->va) {
        vaof_t hi = node->datk[i + 1].va;
        vaof_t lo = kv->va;
        size_t len = hi - lo;
        prev->sz += len;
      }
      /* free but not contiguous with previous mlist node: append new node */
      else if (kv->fo == 0) {
#endif
        BT_mlistnode *new = calloc(1, sizeof *new);

        vaof_t hi = node->datk[i + 1].va;
        vaof_t lo = kv->va;
        size_t len = hi - lo;

        new->sz = len;
        new->va = off2addr(lo);

        prev->next = new;
        prev = new;
#if CAN_COALESCE
      }
#endif

      kv = &node->datk[++i];
    }
    return head;
  }

  /* branch */
  size_t i = 0;
  BT_mlistnode *head, *prev;
  head = prev = 0;
  for (; i < BT_DAT_MAXKEYS; ++i) {
    BT_kv kv = node->datk[i];
    if (kv.fo == BT_NOPAGE)
      continue;
    BT_page *child = _node_get(state, kv.fo);
    BT_mlistnode *new = _mlist_read2(state, child, maxdepth, depth + 1);
    if (head == 0) {
      head = prev = new;
    }
    else {
      /* just blindly append and unify the ends afterward */
      prev->next = new;
    }
  }
  return 0;
}

static int
_mlist_read(BT_state *state)
{
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);
  uint8_t maxdepth = meta->depth;
  BT_mlistnode *head = _mlist_read2(state, root, maxdepth, 1);

  /*
    trace the full freelist and unify nodes one last time
    NB: linking the leaf nodes would make this unnecessary
  */
#if CAN_COALESCE
  BT_mlistnode *p = head;
  BT_mlistnode *n = head->next;
  while (n) {
    size_t llen = P2BYTES(p->sz);
    uintptr_t laddr = (uintptr_t)p->va;
    uintptr_t raddr = (uintptr_t)n->va;
    /* contiguous: unify */
    if (laddr + llen == raddr) {
      p->sz += n->sz;
      p->next = n->next;
      free(n);
    }
  }
#endif

  state->mlist = head;
  return BT_SUCC;
}
#endif

static int
_mlist_delete(BT_state *state)
{
  BT_mlistnode *head, *prev;
  head = prev = state->mlist;
  while (head->next) {
    prev = head;
    head = head->next;
    free(prev);
  }
  /* free the final node as well */
  free(head);
  state->mlist = 0;
  return BT_SUCC;
}

#if 0
BT_flistnode *
_flist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth)
{
  size_t N = _bt_numkeys(node);

  /* leaf */
  if (depth == maxdepth) {
    BT_flistnode *head, *prev;
    head = prev = calloc(1, sizeof(*head));
    /* ;;: fixme the head won't get populated in this logic */

    size_t i = 0;
    BT_kv *kv = &node->datk[i];
    while (i < N - 1) {
      /* Just blindly append nodes since they aren't guaranteed sorted */
      BT_flistnode *new = calloc(1, sizeof *new);

      vaof_t hi = node->datk[i + 1].va;
      vaof_t lo = kv->va;
      size_t len = hi - lo;
      pgno_t fo = kv->fo;

      new->sz = len;
      new->pg = fo;

      prev->next = new;
      prev = new;

      kv = &node->datk[++i];
    }

    for (size_t i = 0; i < N - 1; i++) {
      vaof_t hi = node->datk[i + 1].va;
      vaof_t lo = node->datk[i].va;
      size_t len = hi - lo;
      pgno_t fo = node->datk[i].fo;
      /* not free */
      if (fo != 0)
        continue;
    }

    return head;
  }

  /* branch */
  size_t i = 0;
  BT_flistnode *head, *prev;
  head = prev = 0;
  for (; i < N; ++i) {
    BT_kv kv = node->datk[i];
    if (kv.fo == BT_NOPAGE)
      continue;
    BT_page *child = _node_get(state, kv.fo);
    BT_flistnode *new = _flist_read2(state, child, maxdepth, depth + 1);
    if (head == 0) {
      head = prev = new;
    }
    else {
      /* just blindly append and unify the ends afterward */
      prev->next = new;
    }
  }
  return 0;
}

static int
_flist_read(BT_state *state)
{
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);
  uint8_t maxdepth = meta->depth;
  BT_flistnode *head = _flist_read2(state, root, maxdepth, 1);
  /* ;;: infinite loop with proper starting depth of 1. -- fix that! */
  /* BT_flistnode *head = _flist_read2(state, root, maxdepth, 1); */

  if (head == 0)
    return BT_SUCC;

  /* sort the freelist */
  _flist_mergesort(head);

  /* merge contiguous regions after sorting */
  BT_flistnode *p = head;
  BT_flistnode *n = head->next;
  while (n) {
    size_t llen = p->sz;
    pgno_t lfo = p->pg;
    pgno_t rfo = n->pg;
    /* contiguous: unify */
    if (lfo + llen == rfo) {
      p->sz += n->sz;
      p->next = n->next;
      free(n);
    }
  }

  state->flist = head;
  return BT_SUCC;
}
#endif

static int
_flist_delete(BT_state *state)
{
  BT_flistnode *head, *prev;
  head = prev = state->flist;
  while (head->next) {
    prev = head;
    head = head->next;
    free(prev);
  }
  /* free the final node as well */
  free(head);
  state->flist = 0;
  return BT_SUCC;
}

#define CLOSE_FD(fd)                            \
  do {                                          \
    close(fd);                                  \
    fd = -1;                                    \
  } while (0)

/* TODO: move to lib */
static uint32_t
nonzero_crc_32(void *dat, size_t len)
{
  unsigned char nonce = 0;
  uint32_t chk = crc_32(dat, len);

  do {
    if (nonce > 8)
      abort();
    chk = update_crc_32(chk, nonce++);
  } while (chk == 0);

  return chk;
}

static void
_bt_state_restore_maps2(BT_state *state, BT_page *node,
                        uint8_t depth, uint8_t maxdepth)
{
  size_t N = _bt_numkeys(node);

  /* leaf */
  if (depth == maxdepth) {
    for (size_t i = 0; i < N - 1; i++) {
      vaof_t lo = node->datk[i].va;
      vaof_t hi = node->datk[i + 1].va;
      pgno_t pg = node->datk[i].fo;
      BYTE *loaddr = off2addr(lo);
      BYTE *hiaddr = off2addr(hi);
      size_t bytelen = hiaddr - loaddr;
      off_t offset = P2BYTES(pg);

      if (pg != 0) {
        /* not freespace, map readonly data on disk */
        if (loaddr !=
            mmap(loaddr,
                 bytelen,
                 BT_PROT_CLEAN,
                 BT_FLAG_CLEAN,
                 state->data_fd,
                 offset)) {
          DPRINTF("mmap: failed to map at addr %p, errno: %s", loaddr, strerror(errno));
          abort();
        }
      }
      else {
        /* freespace, map no access */
        if (loaddr !=
            mmap(loaddr,
                 bytelen,
                 BT_PROT_FREE,
                 BT_FLAG_FREE,
                 0, 0)) {
          DPRINTF("mmap: failed to map at addr %p, errno: %s", loaddr, strerror(errno));
          abort();
        }
      }
    }
    return;
  }

  /* branch - dfs all subtrees */
  for (size_t i = 0; i < N - 1; i++) {
    /* ;;: assuming node stripes when partition striping is implemented will be
       1:1 mapped to disk for simplicity. If that is not the case, they should
       be handled here. */
    pgno_t pg = node->datk[i].fo;
    BT_page *child = _node_get(state, pg);
    _bt_state_restore_maps2(state, child, depth + 1, maxdepth);
  }
}

static void
_bt_state_restore_maps(BT_state *state)
/* restores the memory map of the btree since data can be arbitrarily located */
{
  /* TODO: add checks to ensure data isn't mapped into an invalid location
     (e.g. a node stripe) */
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);
  _bt_state_restore_maps2(state, root, 1, meta->depth);
}

static int
_bt_state_meta_which(BT_state *state)
{
  BT_meta *m1 = state->meta_pages[0];
  BT_meta *m2 = state->meta_pages[1];
  int which = -1;

  if (m1->chk == 0) {
    /* first is dirty */
    which = 1;
  }
  else if (m2->chk == 0) {
    /* second is dirty */
    which = 0;
  }
  else if (m1->txnid > m2->txnid) {
    /* first is most recent */
    which = 0;
  }
  else if (m1->txnid < m2->txnid) {
    /* second is most recent */
    which = 1;
  }
  else {
    /* invalid state */
    return EINVAL;
  }

  /* checksum the metapage found and abort if checksum doesn't match */
  BT_meta *meta = state->meta_pages[which];
  uint32_t chk = nonzero_crc_32(meta, BT_META_LEN_b);
  if (chk != meta->chk) {
    abort();
  }

  /* set which in state */
  state->which = which;

  return BT_SUCC;
}

static int
_bt_state_read_header(BT_state *state)
{
  BT_meta *m1, *m2;
  int rc = 1;
  BYTE metas[BT_PAGESIZE * 2] = {0};

  m1 = state->meta_pages[0];
  m2 = state->meta_pages[1];

  TRACE();

  if (pread(state->data_fd, metas, BT_PAGESIZE * 2, 0)
      != BT_PAGESIZE * 2) {
    /* new pma */
    return ENOENT;
  }

  /* validate magic */
  if (m1->magic != BT_MAGIC) {
    DPRINTF("metapage 0x%pX inconsistent magic: 0x%" PRIX32, m1, m1->magic);
    return EINVAL;
  }
  if (m2->magic != BT_MAGIC) {
    DPRINTF("metapage 0x%pX inconsistent magic: 0x%" PRIX32, m2, m2->magic);
    return EINVAL;
  }

  /* validate flags */
  if ((m1->flags & BP_META) != BP_META) {
    DPRINTF("metapage 0x%pX missing meta page flag", m1);
    return EINVAL;
  }
  if ((m2->flags & BP_META) != BP_META) {
    DPRINTF("metapage 0x%pX missing meta page flag", m2);
    return EINVAL;
  }

  /* validate binary version */
  if (m1->version != BT_VERSION) {
    DPRINTF("version mismatch on metapage: 0x%pX, metapage version: %" PRIu32 ", binary version %u",
            m1, m1->version, BT_VERSION);
    return EINVAL;
  }
  if (m2->version != BT_VERSION) {
    DPRINTF("version mismatch on metapage: 0x%pX, metapage version: %" PRIu32 ", binary version %u",
            m2, m2->version, BT_VERSION);
    return EINVAL;
  }

  if (!SUCC(rc = _bt_state_meta_which(state)))
    return rc;

  return BT_SUCC;
}

static int
_bt_state_meta_new(BT_state *state)
#define INITIAL_ROOTPG 2
{
  BT_page *p1, *p2, *root;
  BT_meta meta = {0};

  TRACE();

  /* open the metapage region for writing */
  if (mprotect(BT_MAPADDR, BT_META_SECTION_WIDTH,
               BT_PROT_DIRTY) != 0) {
    DPRINTF("mprotect of metapage section failed with %s", strerror(errno));
    abort();
  }

  /* initialize the block base array */
  meta.blk_base[0] = BT_NUMMETAS;

  root = _bt_nalloc(state);
  _bt_root_new(&meta, root);

  /* initialize meta struct */
  meta.magic = BT_MAGIC;
  meta.version = BT_VERSION;
  meta.last_pg = 1;
  meta.txnid = 0;
  meta.fix_addr = BT_MAPADDR;
  meta.depth = 1;
  meta.flags = BP_META;
  meta.root = _fo_get(state, root);
  assert(meta.root == INITIAL_ROOTPG); /* ;;: remove?? */

  /* initialize the metapages */
  p1 = &((BT_page *)state->map)[0];
  p2 = &((BT_page *)state->map)[1];

  /* copy the metadata into the metapages */
  memcpy(METADATA(p1), &meta, sizeof meta);
  /* ;;: todo, should the second metapage actually share a .root with the
     first?? */
  memcpy(METADATA(p2), &meta, sizeof meta);

  /* only the active metapage should be writable (first page) */
  if (mprotect(BT_MAPADDR, BT_META_SECTION_WIDTH, BT_PROT_CLEAN) != 0) {
    DPRINTF("mprotect of metapage section failed with %s", strerror(errno));
    abort();
  }
  if (mprotect(BT_MAPADDR, BT_PAGESIZE,
               BT_PROT_DIRTY) != 0) {
    DPRINTF("mprotect of current metapage failed with %s", strerror(errno));
    abort();
  }

  return BT_SUCC;
}
#undef INITIAL_ROOTPG

static void
_freelist_restore2(BT_state *state, BT_page *node,
                   uint8_t depth, uint8_t maxdepth)
{
  size_t N = _bt_numkeys(node);

  /* leaf */
  if (depth == maxdepth) {
    for (size_t i = 0; i < N - 1; i++) {
      /* if allocated */
      if (node->datk[i].fo != 0) {
        /* record allocated memory range */
        BT_page *lo = off2addr(node->datk[i].va);
        BT_page *hi = off2addr(node->datk[i + 1].va);
        _mlist_record_alloc(state, lo, hi);

        /* record allocated file range */
        ssize_t siz_p = hi - lo;
        assert(siz_p > 0);
        assert(siz_p < UINT32_MAX);
        pgno_t lofo = node->datk[i].fo;
        pgno_t hifo = lofo + (pgno_t)siz_p;
        _flist_record_alloc(state, lofo, hifo);
      }
    }
    return;
  }

  /* branch */
  for (size_t i = 0; i < N - 1; i++) {
    pgno_t fo = node->datk[i].fo;
    if (fo != 0) {
      /* record allocated node */
      BT_page *child = _node_get(state, fo);
      _nlist_record_alloc(state, child);
      _freelist_restore2(state, child, depth + 1, maxdepth);
    }
  }
}

static void
_freelist_restore(BT_state *state)
/* restores the mlist, nlist, and flist */
{
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);
  assert(SUCC(_flist_new(state, state->file_size_p)));
  assert(SUCC(_nlist_load(state)));
  assert(SUCC(_mlist_new(state)));

  /* first record root's allocation */
  _nlist_record_alloc(state, root);
  _freelist_restore2(state, root, 1, meta->depth);
}

static void
_bt_state_map_node_segment(BT_state *state)
{
  BT_meta *meta = state->meta_pages[state->which];
  BYTE *targ = BT_MAPADDR + BT_META_SECTION_WIDTH;
  size_t i;

  /* map all allocated node stripes as clean */
  for (i = 0
         ; i < BT_NUMPARTS && meta->blk_base[i] != 0
         ; i++) {
    pgno_t partoff_p = meta->blk_base[i];
    size_t partoff_b = P2BYTES(partoff_p);
    size_t partlen_b = BLK_BASE_LENS_b[i];
    if (targ != mmap(targ,
                     partlen_b,
                     BT_PROT_CLEAN,
                     BT_FLAG_CLEAN,
                     state->data_fd,
                     partoff_b)) {
      DPRINTF("mmap: failed to map node stripe %zu, addr: 0x%p, file offset (bytes): 0x%zX, errno: %s",
              i, targ, partoff_b, strerror(errno));
      abort();
    }
    /* move the target address ahead of the mapped partition */
    targ += partlen_b;
  }

  /* map the rest of the node segment as free */
  for (; i < BT_NUMPARTS; i++) {
    assert(meta->blk_base[i] == 0);
    size_t partlen_b = BLK_BASE_LENS_b[i];
    if (targ != mmap(targ,
                     partlen_b,
                     BT_PROT_FREE,
                     BT_FLAG_FREE,
                     0, 0)) {
      DPRINTF("mmap: failed to map unallocated node segment, addr: 0x%p, errno: %s",
              targ, strerror(errno));
      abort();
    }
    targ += partlen_b;
  }
}

static int
_bt_state_load(BT_state *state)
{
  int rc;
  int new = 0;
  BT_page *p;
  struct stat stat;

  TRACE();

  /* map the metapages */
  state->map = mmap(BT_MAPADDR,
                    BT_META_SECTION_WIDTH,
                    BT_PROT_CLEAN,
                    BT_FLAG_CLEAN,
                    state->data_fd,
                    0);

  if (state->map != BT_MAPADDR) {
    DPRINTF("mmap: failed to map at addr %p, errno: %s", BT_MAPADDR, strerror(errno));
    abort();
  }

  p = (BT_page *)state->map;
  state->meta_pages[0] = METADATA(p);
  state->meta_pages[1] = METADATA(p + 1);

  if (!SUCC(rc = _bt_state_read_header(state))) {
    if (rc != ENOENT) return rc;
    DPUTS("creating new db");
    state->file_size_p = PMA_GROW_SIZE_p;
    new = 1;
    if (ftruncate(state->data_fd, PMA_GROW_SIZE_b)) {
      return errno;
    }
  }

  /* map the node segment */
  _bt_state_map_node_segment(state); /* ;;: this should follow a call to
                                         _bt_state_meta_new. hmm... but that
                                         leads to a bad dependency graph. We may
                                         need to separately initialize the first
                                         partition and only call
                                         map_node_segment on restore. */

  /* new db, so populate metadata */
  if (new) {
    assert(SUCC(_flist_new(state, PMA_GROW_SIZE_p)));
    assert(SUCC(_nlist_new(state)));

    if (!SUCC(rc = _bt_state_meta_new(state))) {
      munmap(state->map, BT_ADDRSIZE);
      return rc;
    }

    assert(SUCC(_mlist_new(state)));
  }
  else {
    /* Set the file length */
    if (fstat(state->data_fd, &stat) != 0)
      return errno;

    /* the file size should be a multiple of our pagesize */
    assert((stat.st_size % BT_PAGESIZE) == 0);
    state->file_size_p = stat.st_size / BT_PAGESIZE;

    /* restore data memory maps */
    _bt_state_restore_maps(state);

    /* restore ephemeral freelists */
    _freelist_restore(state);

    /* Dirty the metapage and root page */
    assert(SUCC(_bt_flip_meta(state)));
  }

  return BT_SUCC;
}

/* ;;: TODO, when persistence has been implemented, _bt_falloc will probably
   need to handle extension of the file with appropriate striping. i.e. if no
   space is found on the freelist, save the last entry, expand the file size,
   and set last_entry->next to a new node representing the newly added file
   space */
static pgno_t
_bt_falloc(BT_state *state, size_t pages)
{
  /* walk the persistent file freelist and return a pgno with sufficient
     contiguous space for pages */
  BT_flistnode **n;
  pgno_t ret;

 start:
  n = &state->flist;
  ret = 0;

  /* first fit */
  for (; *n; n = &(*n)->next) {
    size_t sz_p = (*n)->hi - (*n)->lo;

    if (sz_p >= pages) {
      ret = (*n)->lo;
      pgno_t hi = ret + pages;
      _flist_record_alloc(state, ret, hi);
      break;
    }
  }

  if (ret == 0) {
    /* flist out of mem, grow it */
    DPRINTF("flist out of mem, growing current size (pages): 0x%" PRIX32 " to: 0x%" PRIX32,
            state->file_size_p, state->file_size_p + PMA_GROW_SIZE_p);
    _flist_grow(state, pages);
    /* restart the find procedure */
    /* TODO: obv a minor optimization can be made here */
    goto start;
  }

  return ret;
}

static int
_bt_sync_hasdirtypage(BT_state *state, BT_page *node) __attribute((unused));

static int
_bt_sync_hasdirtypage(BT_state *state, BT_page *node)
/* ;;: could be more efficiently replaced by a gcc vectorized builtin */
{
  for (size_t i = 0; i < NMEMB(node->head.dirty); i++) {
    if (node->head.dirty[i] != 0)
      return 1;
  }

  return 0;
}
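
/* ;;: a minimal sketch of the word-wise check alluded to above. It assumes
   head.dirty is a byte array whose size is a multiple of sizeof(uint64_t);
   memcpy keeps the loads alignment-safe and lets the compiler vectorize.
   Disabled: kept only to illustrate the idea, not wired into the build. */
#if 0
static int
_bt_sync_hasdirtypage_wordwise(BT_state *state, BT_page *node)
{
  UNUSED(state);
  uint64_t acc = 0;
  /* OR all dirty-bitset bytes together one machine word at a time */
  for (size_t i = 0; i < sizeof(node->head.dirty); i += sizeof(uint64_t)) {
    uint64_t w;
    memcpy(&w, &node->head.dirty[i], sizeof w);
    acc |= w;
  }
  return acc != 0;
}
#endif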

static int
_bt_sync_leaf(BT_state *state, BT_page *node)
{
  /* msync all of a leaf's data that is dirty. The caller is expected to sync
     the node itself and mark it as clean in the parent. */
  size_t i = 0;
  size_t N = _bt_numkeys(node);

  for (i = 0; i < N - 1; i++) {
    if (!_bt_ischilddirty(node, i))
      continue;                 /* not dirty. nothing to do */

    /* ;;: we don't actually need the page, do we? */
    /* pgno_t pg = node->datk[i].fo; */
    vaof_t lo = node->datk[i].va;
    vaof_t hi = node->datk[i + 1].va;
    size_t bytelen = P2BYTES(hi - lo);
    void *addr = off2addr(lo);

    /* sync the page */
    if (msync(addr, bytelen, MS_SYNC) != 0) {
      DPRINTF("msync of leaf: %p failed with %s", addr, strerror(errno));
      abort();
    }

    /* mprotect the data */
    if (mprotect(addr, bytelen, BT_PROT_CLEAN) != 0) {
      DPRINTF("mprotect of leaf data failed with %s", strerror(errno));
      abort();
    }

    /* and clean the dirty bit */
    _bt_cleanchild(node, i);
  }

  /* ;;: all data pages synced. should we now sync the node as well? No, I think
     that should be the caller's responsibility */

  /* ;;: it is probably faster to scan the dirty bit set and derive the datk idx
     rather than iterate over the full datk array and check if it is dirty. This
     was simpler to implement for now though. */
  /* while (_bt_sync_hasdirtypage(state, node)) { */
  /*   ... */
  /* } */

  return BT_SUCC;
}

static int
_bt_sync_meta(BT_state *state)
/* syncs the metapage and performs necessary checksumming. Additionally, flips
   the which */
{
  BT_meta *meta = state->meta_pages[state->which];
  uint32_t chk;
  int rc;

  /* increment the txnid */
  meta->txnid += 1;

  /* checksum the metapage */
  chk = nonzero_crc_32(meta, BT_META_LEN_b);
  /* ;;: todo: guarantee the chk cannot be zero */
  meta->chk = chk;

  /* sync the metapage */
  if (msync(LO_ALIGN_PAGE(meta), sizeof(BT_page), MS_SYNC) != 0) {
    DPRINTF("msync of metapage: %p failed with %s", meta, strerror(errno));
    abort();
  }

  // ensure we have a new dirty metapage and root node
  rc = _bt_flip_meta(state);

  /* finally, make old metapage clean */
  if (mprotect(LO_ALIGN_PAGE(meta), sizeof(BT_page), BT_PROT_CLEAN) != 0) {
    DPRINTF("mprotect of old metapage failed with %s", strerror(errno));
    abort();
  }

  return rc;
}

static int
_bt_flip_meta(BT_state *state)
{
  BT_meta *meta = state->meta_pages[state->which];
  BT_meta *newmeta;
  int newwhich;

  /* zero the new metapage's checksum */
  newwhich = state->which ? 0 : 1;
  newmeta = state->meta_pages[newwhich];

  /* mprotect dirty new metapage */
  if (mprotect(LO_ALIGN_PAGE(newmeta), sizeof(BT_page), BT_PROT_DIRTY) != 0) {
    DPRINTF("mprotect of new metapage failed with %s", strerror(errno));
    abort();
  }

  newmeta->chk = 0;

  /* copy over metapage to new metapage excluding the checksum */
  memcpy(newmeta, meta, BT_META_LEN_b);

  /* CoW a new root since the root referred to by the metapage should always be
     dirty */
  BT_page *root;
  pgno_t newrootpg;
  root = _node_get(state, newmeta->root);
  if (!SUCC(_node_cow(state, root, &newrootpg)))
    abort();

  newmeta->root = newrootpg;

  /* switch the metapage we're referring to */
  state->which = newwhich;

  return BT_SUCC;
}

static int
_bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth)
/* recursively syncs the subtree under node. The caller is expected to sync node
   itself and mark it clean. */
{
  int rc = 0;
  size_t N = _bt_numkeys(node);

  /* leaf */
  if (depth == maxdepth) {
    _bt_sync_leaf(state, node);
    goto e;
  }

  /* do dfs */
  for (size_t i = 0; i < N - 1; i++) {
    if (!_bt_ischilddirty(node, i))
      continue;                 /* not dirty. nothing to do */

    BT_page *child = _node_get(state, node->datk[i].fo);

    /* recursively sync the child's data */
    if ((rc = _bt_sync(state, child, depth + 1, maxdepth)))
      return rc;

    /* sync the child node */
    if (msync(child, sizeof(BT_page), MS_SYNC) != 0) {
      DPRINTF("msync of child node: %p failed with %s", child, strerror(errno));
      abort();
    }

    /* unset child dirty bit */
    _bt_cleanchild(node, i);
  }

 e:
  /* all modifications done in node, mark it read-only */
  if (mprotect(node, sizeof(BT_page), BT_PROT_CLEAN) != 0) {
    DPRINTF("mprotect of node failed with %s", strerror(errno));
    abort();
  }

  return BT_SUCC;
}

//// ===========================================================================
//// btree external routines

int
bt_state_new(BT_state **state)
{
  BT_state *s = calloc(1, sizeof *s);
  s->data_fd = -1;
  s->fixaddr = BT_MAPADDR;
  *state = s;
  return BT_SUCC;
}

int
bt_state_open(BT_state *state, const char *path, ULONG flags, mode_t mode)
#define DATANAME "/data.pma"
{
  int oflags, rc;
  char *dpath;

  TRACE();
  UNUSED(flags);

  oflags = O_RDWR | O_CREAT;
  dpath = malloc(strlen(path) + sizeof(DATANAME));
  if (!dpath) return ENOMEM;
  sprintf(dpath, "%s" DATANAME, path);

  if ((state->data_fd = open(dpath, oflags, mode)) == -1)
    return errno;

  if (!SUCC(rc = _bt_state_load(state)))
    goto e;

  state->path = strdup(dpath);

 e:
  /* cleanup FDs stored in state if anything failed */
  if (!SUCC(rc)) {
    if (state->data_fd != -1) CLOSE_FD(state->data_fd);
  }

  free(dpath);
  return rc;
}
#undef DATANAME

int
bt_state_close(BT_state *state)
{
  int rc;

  bt_sync(state);

  _mlist_delete(state);
  _flist_delete(state);
  _nlist_delete(state);

  if ((rc = munmap(state->map, BT_ADDRSIZE)) != 0) {
    rc = errno;
    return rc;
  }

  if (state->data_fd != -1) CLOSE_FD(state->data_fd);

  ZERO(state, sizeof *state);

  return BT_SUCC;
}

void *
bt_malloc(BT_state *state, size_t pages)
{
  BT_mlistnode **n = &state->mlist;
  void *ret = 0;

  /* first fit */
  for (; *n; n = &(*n)->next) {
    size_t sz_p = addr2off((*n)->hi) - addr2off((*n)->lo);

    if (sz_p >= pages) {
      ret = (*n)->lo;
      BT_page *hi = ((BT_page *)ret) + pages;
      _mlist_record_alloc(state, ret, hi);
      break;
    }
    // XX return early if nothing suitable found in freelist
  }

  if (ret == 0) {
    DPUTS("mlist out of mem!");
    return 0;
  }

  pgno_t pgno = _bt_falloc(state, pages);
  bp(pgno != 0);

  _bt_insert(state,
             addr2off(ret),
             addr2off(ret) + pages,
             pgno);

  DPRINTF("map %p to offset 0x%zx bytes (0x%zx pages)\n", ret, P2BYTES(pgno), pages);

  if (ret !=
      mmap(ret,
           P2BYTES(pages),
           BT_PROT_DIRTY,
           BT_FLAG_DIRTY,
           state->data_fd,
           P2BYTES(pgno))) {
    DPRINTF("mmap: failed to map at addr %p, errno: %s", ret, strerror(errno));
    abort();
  }

  bp(ret != 0);
  return ret;
}

// XX need to mmap fixed/anon/no_reserve and prot_none
void
bt_free(BT_state *state, void *lo, void *hi)
{
  vaof_t looff = addr2off(lo);
  vaof_t hioff = addr2off(hi);
  pgno_t lopg, hipg;
  BT_findpath path = {0};

  if (!SUCC(_bt_find(state, &path, looff, hioff))) {
    DPRINTF("Failed to find range: (%p, %p)", lo, hi);
    abort();
  }

  /* insert freed range into mlist */
  _mlist_insert(state, lo, hi);

  /* insert freed range into flist */
  BT_page *leaf = path.path[path.depth];
  size_t childidx = path.idx[path.depth];
  int isdirty = _bt_ischilddirty(leaf, childidx);
  BT_kv kv = leaf->datk[childidx];
  vaof_t offset = looff - kv.va;
  lopg = kv.fo + offset;
  hipg = lopg + (hioff - looff);

  /* insert null into btree */
  _bt_insert(state, looff, hioff, 0);

  if (isdirty) {
    _flist_insert(&state->flist, lopg, hipg);
  }
  else {
    _flist_insert(&state->pending_flist, lopg, hipg);
  }

  /* ;;: is this correct? Shouldn't this actually happen when we merge the
     pending_mlist on sync? */
  size_t bytelen = (BYTE *)hi - (BYTE *)lo;
  if (lo !=
      mmap(lo,
           bytelen,
           BT_PROT_FREE,
           BT_FLAG_FREE,
           0, 0)) {
    DPRINTF("mmap: failed to map at addr %p, errno: %s", lo, strerror(errno));
    abort();
  }
}

// XX need to mprotect PROT_READ all ranges synced including root/meta
int
bt_sync(BT_state *state)
{
  /* as is often the case, handling the metapage/root is a special case, which
     is done here. Syncing any other page of the tree is done in _bt_sync */
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);
  int rc = 0;

  if ((rc = _bt_sync(state, root, 1, meta->depth)))
    return rc;

  /* merge the pending freelists */
  _pending_nlist_merge(state);
  _pending_flist_merge(state);

  /* sync the root page */
  if (msync(root, sizeof(BT_page), MS_SYNC) != 0) {
    DPRINTF("msync of root: %p failed with %s", root, strerror(errno));
    abort();
  }

  /* make root read-only */
  if (mprotect(root, sizeof(BT_page), BT_PROT_CLEAN) != 0) {
    DPRINTF("mprotect of root failed with %s", strerror(errno));
    abort();
  }

  /* then sync the metapage */
  if ((rc = _bt_sync_meta(state)))
    return rc;

  return BT_SUCC;
}

uint64_t
bt_meta_get(BT_state *state, size_t idx)
{
  BT_meta *meta = state->meta_pages[state->which];
  assert((uintptr_t)&(meta->roots[idx]) - (uintptr_t)meta <= sizeof *meta);
  return meta->roots[idx];
}

void
bt_meta_set(BT_state *state, size_t idx, uint64_t val)
{
  BT_meta *meta = state->meta_pages[state->which];
  assert((uintptr_t)&(meta->roots[idx]) - (uintptr_t)meta <= sizeof *meta);
  meta->roots[idx] = val;
}

int
_bt_range_of(BT_state *state, vaof_t p, vaof_t **lo, vaof_t **hi,
             pgno_t nodepg, uint8_t depth, uint8_t maxdepth)
{
  BT_page *node = _node_get(state, nodepg);
  size_t N = _bt_numkeys(node);

  vaof_t llo = 0;
  vaof_t hhi = 0;
  pgno_t pg = 0;

  size_t i;
  for (i = 0; i < N - 1; i++) {
    llo = node->datk[i].va;
    hhi = node->datk[i + 1].va;
    pg = node->datk[i].fo;
    if (llo <= p && hhi > p) {
      break;
    }
  }

  /* not found */
  if (i == N - 1)
    return 1;

  if (depth == maxdepth) {
    **lo = llo;
    **hi = hhi;
    return BT_SUCC;
  }

  return _bt_range_of(state, p, lo, hi, pg, depth + 1, maxdepth);
}

int
bt_range_of(BT_state *state, void *p, void **lo, void **hi)
{
  /* traverse tree looking for lo <= p and hi > p. return that range as a pair
     of pointers NOT as two vaof_t

     0: succ (found)
     1: otherwise
  */
  BT_meta *meta = state->meta_pages[state->which];
  pgno_t root = meta->root;
  /* local storage for the returned offsets; _bt_range_of writes through these */
  vaof_t lo_off = 0;
  vaof_t hi_off = 0;
  vaof_t *loret = &lo_off;
  vaof_t *hiret = &hi_off;
  vaof_t poff = addr2off(p);
  int rc = 0;

  if (!SUCC(rc = _bt_range_of(state, poff, &loret, &hiret, root, 1, meta->depth))) {
    return rc;
  }

  *lo = off2addr(*loret);
  *hi = off2addr(*hiret);

  return BT_SUCC;
}

/**
   pseudocode from ed:

   bt_dirty(btree, lo, hi):
     loop:
       (range_lo, range_hi) = find_range_for_pointer(btree, lo);
       dirty_hi = min(hi, range_hi);
       new_start_fo = data_cow(btree, lo, dirty_hi);
       lo := range_hi;
       if dirty_hi == hi then break;

   // precondition: given range does not cross a tree boundary
   data_cow(btree, lo, hi):
     (range_lo, range_hi, fo) = bt_find(btree, lo, hi);
     size = lo - hi;
     new_fo = data_alloc(btree.data_free, size);
     // puts data in the unified buffer cache without having to map virtual memory
     write(fd, new_fo, size * BT_PAGESIZE, to_ptr(lo));
     // maps new file offset with same data back into same memory
     mmap(fd, new_fo, size, to_ptr(lo));
     bt_insert(btree, lo, hi, new_fo);
     offset = lo - range_lo;
     freelist_insert(btree.pending_data_flist, fo + offset, fo + offset + size);
     return new_fo
**/

static pgno_t
_bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi, pgno_t pg)
{
  size_t len = hi - lo;
  size_t bytelen = P2BYTES(len);
  pgno_t newpg = _bt_falloc(state, len);
  BYTE *loaddr = off2addr(lo);
  off_t offset = P2BYTES(newpg);

  /* write call puts data in the unified buffer cache without having to map
     virtual memory */
  if (pwrite(state->data_fd, loaddr, bytelen, offset) != (ssize_t)bytelen)
    abort();

  /* maps new file offset with same data back into memory */
  if (loaddr !=
      mmap(loaddr,
           bytelen,
           BT_PROT_DIRTY,
           BT_FLAG_DIRTY,
           state->data_fd,
           offset)) {
    DPRINTF("mmap: failed to map at addr %p, errno: %s", loaddr, strerror(errno));
    abort();
  }

  _bt_insert(state, lo, hi, newpg);

  _flist_insert(&state->pending_flist, pg, pg + len);

  return newpg;
}

static int
_bt_dirty(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg,
          uint8_t depth, uint8_t maxdepth)
{
  BT_page *node = _node_get(state, nodepg);
  size_t N = _bt_numkeys(node);
  size_t loidx = BT_DAT_MAXKEYS; // 0 is a valid loidx!
  size_t hiidx = 0;

  /* find loidx of range */
  for (size_t i = 0; i < N - 1; i++) {
    vaof_t hhi = node->datk[i + 1].va;
    if (hhi > lo) {
      loidx = i;
      break;
    }
  }
  assert(loidx < BT_DAT_MAXKEYS);

  /* find hiidx (exclusive) of range */
  for (size_t i = loidx + 1; i < N; i++) {
    vaof_t hhi = node->datk[i].va;
    if (hhi >= hi) {
      hiidx = i;
      break;
    }
  }
  assert(hiidx != 0);

  /* found a range in node that contains (lo-hi). May span multiple entries */

  /* leaf: base case. cow the data */
  if (depth == maxdepth) {
    for (size_t i = loidx; i < hiidx; i++) {
      vaof_t llo = node->datk[i].va;
      vaof_t hhi = MIN(node->datk[i + 1].va, hi);
      pgno_t pg = node->datk[i].fo;
      pgno_t newpg = _bt_data_cow(state, llo, hhi, pg);
      _bt_insert(state, llo, hhi, newpg);
    }
  } else {
    for (size_t i = loidx; i < hiidx; i++) {
      /* branch: recursive case */
      pgno_t childpg = node->datk[i].fo;
      /* iteratively recurse on all entries */
      _bt_dirty(state, lo, hi, childpg, depth + 1, maxdepth);
    }
  }

  return BT_SUCC;
}

int
bt_dirty(BT_state *state, void *lo, void *hi)
{
  /* takes a range and ensures that entire range is CoWed */
  /* if part of the range is free then return 1 */
  BT_meta *meta = state->meta_pages[state->which];
  vaof_t looff = addr2off(lo);
  vaof_t hioff = addr2off(hi);
  return _bt_dirty(state, looff, hioff, meta->root, 1, meta->depth);
}

int
bt_next_alloc(BT_state *state, void *p, void **lo, void **hi)
/* if p is free, sets lo and hi to the bounds of the next adjacent allocated
   space. If p is allocated, sets lo and hi to the bounds of the allocated space
   it falls in. */
{
  BT_mlistnode *head = state->mlist;
  BYTE *pb = p;
  BYTE *pma_end;

  while (head) {
    /* at last free block, different logic applies */
    if (head->next == 0)
      goto end;

    /* p is in a free range, return the allocated hole after it */
    if (head->lo <= pb
        && head->hi > pb) {
      goto found;
    }

    /* p is alloced, return this hole */
    if (head->next->lo > pb
        && head->hi <= pb) {
      goto found;
    }

    head = head->next;
  }

  /* not found */
  return 1;

 found:
  /* the alloced space begins at the end of the free block */
  *lo = head->hi;
  /* ... and ends at the start of the next free block */
  *hi = head->next->lo;
  return BT_SUCC;

 end:
  pma_end = (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE);
  assert(head->hi <= pma_end);

  /* no alloced region between tail of freelist and end of pma memory space */
  if (head->hi == pma_end)
    return 1;

  /* otherwise, return the alloced region between the tail of the freelist and
     the end of the memory arena */
  *lo = head->hi;
  *hi = pma_end;
  return BT_SUCC;
}

void
bt_bounds(BT_state *state, void **lo, void **hi)
{
  *lo = BT_MAPADDR;
  *hi = (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE);
}

int
bt_inbounds(BT_state *state, void *p)
/* 1: if in the bounds of the PMA, 0 otherwise */
{
  return p >= (void *)BT_MAPADDR
    && p < (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE);
}

//// ===========================================================================
//// tests

/* ;;: obv this should be moved to a separate file */
static void
_sham_sync_clean(BT_page *node)
{
  for (uint8_t *dit = &node->head.dirty[0]
         ; dit < &node->head.dirty[sizeof(node->head.dirty) - 1]
         ; dit++) {
    *dit = 0;
  }
}

static void
_sham_sync2(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth)
{
  if (depth == maxdepth) return;

  /* clean node */
  _sham_sync_clean(node);

  /* then recurse and clean all children with DFS */
  size_t N = _bt_numkeys(node);
  for (size_t i = 1; i < N; ++i) {
    BT_kv kv = node->datk[i];
    pgno_t childpg = kv.fo;
    BT_page *child = _node_get(state, childpg);
    _sham_sync2(state, child, depth + 1, maxdepth);
  }
}

static void
_sham_sync(BT_state *state) __attribute((unused));

static void
_sham_sync(BT_state *state)
{
  /* walk the tree and unset the dirty bit from all pages */
  BT_meta *meta = state->meta_pages[state->which];
  BT_page *root = _node_get(state, meta->root);
  meta->chk = nonzero_crc_32(meta, BT_META_LEN_b);
  _sham_sync2(state, root, 1, meta->depth);
}

static void
_bt_printnode(BT_page *node)
{
  fprintf(stderr, "node: %p\n", (void *)node);
  fprintf(stderr, "data: \n");
  for (size_t i = 0; i < BT_DAT_MAXKEYS; ++i) {
    if (i && node->datk[i].va == 0)
      break;
    fprintf(stderr, "[%5zu] %10x %10x\n", i, node->datk[i].va, node->datk[i].fo);
  }
}

/*
  re: partition striping, I find the following somewhat confusing.

  A pgno for a node could be either:

  1) Its actual pgno in the persistent file
  2) merely a page offset into the memory arena

  The pgno that a parent node stores for a child need not be (1) if the memory
  maps are properly restored from the blk_base array in the metapage. Right?

  So, on startup:

  before traversing nodes, restore the memory map by mapping each partition
  successively after the other. Do this for all non-zero page offsets in
  blk_base.

  If this is done, then _node_get and _fo_get can remain largely unchanged

  Is there any reason this won't work?

  ---------------------------

  If we have to do (2) for some reason. Then _node_get and _fo_get /will/ have
  to read the blk_base array and appropriately translate using the offsets and
  partition sizes.

  is _bt_data_cow a problem?
*/
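
/* ;;: a minimal sketch of the translation that option (2) above would require,
   assuming blk_base[i] holds each partition's starting file pgno and
   BLK_BASE_LENS_b[i] its length in bytes (as _bt_state_map_node_segment uses
   them). `arena_pg' is a hypothetical page offset counted from the start of
   the node segment; nothing here is wired into _node_get/_fo_get. */
#if 0
static pgno_t
_arena_pg_to_file_pg(BT_state *state, pgno_t arena_pg)
{
  BT_meta *meta = state->meta_pages[state->which];
  size_t part_lo = 0;           /* arena page offset where partition i begins */
  for (size_t i = 0; i < BT_NUMPARTS && meta->blk_base[i] != 0; i++) {
    size_t part_len = B2PAGES(BLK_BASE_LENS_b[i]);
    if (arena_pg < part_lo + part_len)
      return meta->blk_base[i] + (pgno_t)(arena_pg - part_lo);
    part_lo += part_len;
  }
  return 0;                     /* not inside any allocated partition */
}
#endif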