diff --git a/.github/workflows/ares-shared.yml b/.github/workflows/ares-shared.yml index fe7aba4..244017e 100644 --- a/.github/workflows/ares-shared.yml +++ b/.github/workflows/ares-shared.yml @@ -75,7 +75,7 @@ jobs: # Build Ares - name: Build run: | - nix develop --command bash -c "cargo build --release --verbose --features check_all" + nix develop --command bash -c "cargo build --release --verbose" # Run tests - name: Test diff --git a/.gitignore b/.gitignore index 917314b..e90d778 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ ships/ *.backup urbit *.jam.out +*.o +*.a diff --git a/rust/ares/Cargo.lock b/rust/ares/Cargo.lock index 77636ad..23a871e 100644 --- a/rust/ares/Cargo.lock +++ b/rust/ares/Cargo.lock @@ -60,6 +60,7 @@ version = "0.1.0" dependencies = [ "ares_crypto", "ares_macros", + "ares_pma", "assert_no_alloc", "autotools", "bitvec", @@ -103,6 +104,14 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "ares_pma" +version = "0.1.0" +dependencies = [ + "bindgen 0.69.1", + "cc", +] + [[package]] name = "assert_no_alloc" version = "1.1.2" @@ -158,6 +167,29 @@ dependencies = [ "which", ] +[[package]] +name = "bindgen" +version = "0.69.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2" +dependencies = [ + "bitflags 2.4.1", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.39", + "which", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -1105,7 +1137,7 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced751f95a527a3458eb67c75e4ae7093d41585edaa7565f5769101502473019" dependencies = [ - "bindgen", + "bindgen 0.68.1", "pkg-config", ] diff --git a/rust/ares/Cargo.toml b/rust/ares/Cargo.toml index ef58fed..3d7ec25 100644 --- a/rust/ares/Cargo.toml +++ b/rust/ares/Cargo.toml @@ -13,9 +13,12 @@ edition = "2018" [dependencies] ares_crypto = { path = "../ares_crypto" } ares_macros = { path = "../ares_macros" } -# assert_no_alloc = "1.1.2" +# Use this when debugging requires the debug printfs in the PMA +# ares_pma = { path = "../ares_pma", features=["debug_prints"] } +ares_pma = { path = "../ares_pma" } +assert_no_alloc = "1.1.2" # use this when debugging requires allocation (e.g. eprintln) -assert_no_alloc = {version="1.1.2", features=["warn_debug"]} +# assert_no_alloc = {version="1.1.2", features=["warn_debug"]} bitvec = "1.0.0" criterion = "0.4" either = "1.9.0" @@ -47,6 +50,7 @@ opt-level = 3 # run with e.g. 
'cargo build --features check_forwarding,check_acyclic' [features] +# FOR DEBUGGING MEMORY ISSUES ONLY check_all = [ "check_acyclic", "check_forwarding", "check_junior" ] check_acyclic = [] check_forwarding = [] diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index 001f51a..d7f08a3 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -1,8 +1,11 @@ -use crate::mem::{unifying_equality, NockStack, Preserve}; +use crate::mem::{NockStack, Preserve}; use crate::mug::mug_u32; use crate::noun::Noun; +use crate::persist::{pma_contains, Persist}; +use crate::unifying_equality::unifying_equality; use either::Either::{self, *}; -use std::ptr::{copy_nonoverlapping, null}; +use std::mem::size_of; +use std::ptr::{copy_nonoverlapping, null_mut}; use std::slice; type MutStemEntry = Either<*mut MutStem, Leaf>; @@ -160,11 +163,23 @@ impl MutHamt { } } +/** + * This is the core memory structure of an immutable HAMT. + * + * The root Stem lives in its own memory allocation, addressed by the pointer wrapped by [Hamt]. + * All other Stems and Leaves live in memory blocks pointed to by [buffer]. The memory pointed to + * by this field may be zero to 32 entries, depending on the *number of bits set* in bitmap. + * + * Addressing a chunk of the key's hash is done by counting the number of set bits in the bitmap + * before the chunk'th bit. The typemap is a parallel bitmap in which bits are set if the + * corresponding entry is a stem, and cleared if it is a leaf. + */ #[repr(packed)] +#[repr(C)] struct Stem { bitmap: u32, typemap: u32, - buffer: *const Entry, + buffer: *mut Entry, } impl Copy for Stem {} @@ -218,6 +233,7 @@ impl Stem { } #[repr(packed)] +#[repr(C)] struct Leaf { len: usize, buffer: *mut (Noun, T), // mutable for unifying equality @@ -238,6 +254,8 @@ impl Leaf { } #[derive(Copy, Clone)] +#[repr(packed)] +#[repr(C)] union Entry { stem: Stem, leaf: Leaf, @@ -256,19 +274,23 @@ assert_eq_size!(&[(Noun, ())], Leaf<()>); assert_eq_size!(&[Entry<()>], Stem<()>); #[derive(Copy, Clone)] -pub struct Hamt(Stem); +pub struct Hamt(*mut Stem); impl Hamt { pub fn is_null(&self) -> bool { - self.0.bitmap == 0 + unsafe { (*self.0).bitmap == 0 } } // Make a new, empty HAMT - pub fn new() -> Self { - Hamt(Stem { - bitmap: 0, - typemap: 0, - buffer: null(), - }) + pub fn new(stack: &mut NockStack) -> Self { + unsafe { + let stem_ptr = stack.struct_alloc::>(1); + *stem_ptr = Stem { + bitmap: 0, + typemap: 0, + buffer: null_mut(), + }; + Hamt(stem_ptr) + } } /** @@ -278,7 +300,7 @@ impl Hamt { * in the HAMT */ pub fn lookup(&self, stack: &mut NockStack, n: &mut Noun) -> Option { - let mut stem = self.0; + let mut stem = unsafe { *self.0 }; let mut mug = mug_u32(stack, *n); 'lookup: loop { let chunk = mug & 0x1F; // 5 bits @@ -309,9 +331,9 @@ impl Hamt { pub fn insert(&self, stack: &mut NockStack, n: &mut Noun, t: T) -> Hamt { let mut mug = mug_u32(stack, *n); let mut depth = 0u8; - let mut stem = self.0; - let mut stem_ret = self.0; - let mut dest = &mut stem_ret as *mut Stem; + let mut stem = unsafe { *self.0 }; + let stem_ret = unsafe { stack.struct_alloc::>(1) }; + let mut dest = stem_ret; unsafe { 'insert: loop { let chunk = mug & 0x1F; // 5 bits @@ -439,17 +461,12 @@ impl Hamt { } } -impl Default for Hamt { - fn default() -> Self { - Self::new() - } -} - impl Preserve for Hamt { unsafe fn assert_in_stack(&self, stack: &NockStack) { - stack.assert_struct_is_in(self.0.buffer, self.0.size()); + stack.assert_struct_is_in(self.0, 1); + stack.assert_struct_is_in((*self.0).buffer, (*self.0).size()); 
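// A minimal, self-contained sketch of the addressing scheme described in the
// Stem doc comment above; `slot_index` is a hypothetical helper, not part of
// this change. A 5-bit chunk of the key's mug selects one of 32 logical
// positions; the entry's index in the compressed `buffer` is the number of
// bitmap bits set below that position.
fn slot_index(bitmap: u32, chunk: u32) -> Option<usize> {
    let bit = 1u32 << (chunk & 0x1F);
    if bitmap & bit == 0 {
        None // no entry stored for this chunk of the hash
    } else {
        // popcount of the bits strictly below `bit` gives the index in `buffer`
        Some((bitmap & (bit - 1)).count_ones() as usize)
    }
}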
let mut traversal_stack: [Option<(Stem, u32)>; 6] = [None; 6]; - traversal_stack[0] = Some((self.0, 0)); + traversal_stack[0] = Some(((*self.0), 0)); let mut traversal_depth = 1; 'check: loop { if traversal_depth == 0 { @@ -491,78 +508,85 @@ impl Preserve for Hamt { } unsafe fn preserve(&mut self, stack: &mut NockStack) { - if stack.is_in_frame(self.0.buffer) { - let dest_buffer = stack.struct_alloc_in_previous_frame(self.0.size()); - copy_nonoverlapping(self.0.buffer, dest_buffer, self.0.size()); - self.0.buffer = dest_buffer; - // Here we're using the Rust stack since the array is a fixed - // size. Thus it will be cleaned up if the Rust thread running - // this is killed, and is therefore not an issue vs. if it were allocated - // on the heap. - // - // In the past, this traversal stack was allocated in NockStack, but - // exactly the right way to do this is less clear with the split stack. - let mut traversal_stack: [Option<(Stem, u32)>; 6] = [None; 6]; - traversal_stack[0] = Some((self.0, 0)); - let mut traversal_depth = 1; - 'preserve: loop { - if traversal_depth == 0 { - break; - } - let (stem, mut position) = traversal_stack[traversal_depth - 1] - .expect("Attempted to access uninitialized array element"); - // can we loop over the size and count leading 0s remaining in the bitmap? - 'preserve_stem: loop { - if position >= 32 { - traversal_depth -= 1; - continue 'preserve; + if stack.is_in_frame(self.0) { + let dest_stem = stack.struct_alloc_in_previous_frame(1); + copy_nonoverlapping(self.0, dest_stem, 1); + self.0 = dest_stem; + if stack.is_in_frame((*dest_stem).buffer) { + let dest_buffer = stack.struct_alloc_in_previous_frame((*dest_stem).size()); + copy_nonoverlapping((*dest_stem).buffer, dest_buffer, (*dest_stem).size()); + (*dest_stem).buffer = dest_buffer; + // Here we're using the Rust stack since the array is a fixed + // size. Thus it will be cleaned up if the Rust thread running + // this is killed, and is therefore not an issue vs. if it were allocated + // on the heap. + // + // In the past, this traversal stack was allocated in NockStack, but + // exactly the right way to do this is less clear with the split stack. + let mut traversal_stack: [Option<(Stem, u32)>; 6] = [None; 6]; + traversal_stack[0] = Some(((*dest_stem), 0)); + let mut traversal_depth = 1; + 'preserve: loop { + if traversal_depth == 0 { + break; } - match stem.entry(position) { - None => { - position += 1; - continue 'preserve_stem; + let (stem, mut position) = traversal_stack[traversal_depth - 1] + .expect("Attempted to access uninitialized array element"); + // can we loop over the size and count leading 0s remaining in the bitmap? 
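// Sketch of one answer to the question in the comment above: visit only the
// occupied positions of a stem by repeatedly counting trailing zeros and
// clearing the lowest set bit, much as the new Persist impl for Hamt below
// walks its bitmaps with trailing_zeros(). `visit` is a hypothetical callback.
fn for_each_occupied(mut bitmap: u32, mut visit: impl FnMut(u32)) {
    while bitmap != 0 {
        let chunk = bitmap.trailing_zeros(); // position of the next occupied entry
        visit(chunk);
        bitmap &= bitmap - 1; // clear the lowest set bit and continue
    }
}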
+ 'preserve_stem: loop { + if position >= 32 { + traversal_depth -= 1; + continue 'preserve; } - Some((Left(next_stem), idx)) => { - if stack.is_in_frame(next_stem.buffer) { - let dest_buffer = - stack.struct_alloc_in_previous_frame(next_stem.size()); - copy_nonoverlapping( - next_stem.buffer, - dest_buffer, - next_stem.size(), - ); - let new_stem = Stem { - bitmap: next_stem.bitmap, - typemap: next_stem.typemap, - buffer: dest_buffer, - }; - *(stem.buffer.add(idx) as *mut Entry) = Entry { stem: new_stem }; - assert!(traversal_depth <= 5); // will increment - traversal_stack[traversal_depth - 1] = Some((stem, position + 1)); - traversal_stack[traversal_depth] = Some((new_stem, 0)); - traversal_depth += 1; - continue 'preserve; - } else { + match stem.entry(position) { + None => { position += 1; continue 'preserve_stem; } - } - Some((Right(leaf), idx)) => { - if stack.is_in_frame(leaf.buffer) { - let dest_buffer = stack.struct_alloc_in_previous_frame(leaf.len); - copy_nonoverlapping(leaf.buffer, dest_buffer, leaf.len); - let new_leaf = Leaf { - len: leaf.len, - buffer: dest_buffer, - }; - for pair in new_leaf.to_mut_slice().iter_mut() { - pair.0.preserve(stack); - pair.1.preserve(stack); + Some((Left(next_stem), idx)) => { + if stack.is_in_frame(next_stem.buffer) { + let dest_buffer = + stack.struct_alloc_in_previous_frame(next_stem.size()); + copy_nonoverlapping( + next_stem.buffer, + dest_buffer, + next_stem.size(), + ); + let new_stem = Stem { + bitmap: next_stem.bitmap, + typemap: next_stem.typemap, + buffer: dest_buffer, + }; + *stem.buffer.add(idx) = Entry { stem: new_stem }; + assert!(traversal_depth <= 5); // will increment + traversal_stack[traversal_depth - 1] = + Some((stem, position + 1)); + traversal_stack[traversal_depth] = Some((new_stem, 0)); + traversal_depth += 1; + continue 'preserve; + } else { + position += 1; + continue 'preserve_stem; } - *(stem.buffer.add(idx) as *mut Entry) = Entry { leaf: new_leaf }; } - position += 1; - continue 'preserve_stem; + Some((Right(leaf), idx)) => { + if stack.is_in_frame(leaf.buffer) { + let dest_buffer = + stack.struct_alloc_in_previous_frame(leaf.len); + copy_nonoverlapping(leaf.buffer, dest_buffer, leaf.len); + let new_leaf = Leaf { + len: leaf.len, + buffer: dest_buffer, + }; + for pair in new_leaf.to_mut_slice().iter_mut() { + pair.0.preserve(stack); + pair.1.preserve(stack); + } + *stem.buffer.add(idx) = Entry { leaf: new_leaf }; + } + position += 1; + continue 'preserve_stem; + } } } } @@ -570,3 +594,182 @@ impl Preserve for Hamt { } } } + +impl Persist for Hamt { + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { + if pma_contains(self.0, 1) { + return 0; + } + let mut bytes: usize = size_of::>(); + if pma_contains((*self.0).buffer, (*self.0).size()) { + return bytes; + }; + + bytes += (*self.0).size() * size_of::>(); + + let mut depth: usize = 0; + let mut traversal = [Stem { + bitmap: 0, + typemap: 0, + buffer: null_mut(), + }; 6]; + traversal[0] = *self.0; + + loop { + assert!(depth < 6); + if traversal[depth].bitmap == 0 { + if depth == 0 { + break bytes; + } + depth -= 1; + continue; + } + + let next_chunk = traversal[depth].bitmap.trailing_zeros(); + let next_type = traversal[depth].typemap & (1 << next_chunk) != 0; + let next_entry = *traversal[depth].buffer; + traversal[depth].bitmap >>= next_chunk + 1; + traversal[depth].typemap >>= next_chunk + 1; + traversal[depth].buffer = traversal[depth].buffer.add(1); + + if next_type { + // true->stem false->leaf + // found another stem + traversal[depth + 1] = 
next_entry.stem; + + if pma_contains(traversal[depth + 1].buffer, traversal[depth + 1].size()) { + continue; + } + + // count the buffer for the next stem + bytes += traversal[depth + 1].size() * size_of::>(); + depth += 1; + } else { + let mut leaf = next_entry.leaf; + + if leaf.len == 0 { + continue; + } + + if pma_contains(leaf.buffer, leaf.len) { + continue; + } + + bytes += size_of::<(Noun, T)>() * leaf.len; + + while leaf.len > 0 { + bytes += (*leaf.buffer).0.space_needed(stack); + bytes += (*leaf.buffer).1.space_needed(stack); + leaf.buffer = leaf.buffer.add(1); + leaf.len -= 1; + } + } + } + } + + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { + if pma_contains(self.0, 1) { + return; + } + let stem_ptr = *buffer as *mut Stem; + copy_nonoverlapping(self.0, stem_ptr, 1); + *buffer = stem_ptr.add(1) as *mut u8; + self.0 = stem_ptr; + + let stem_buffer_size = (*stem_ptr).size(); + if pma_contains((*stem_ptr).buffer, stem_buffer_size) { + return; + } + let stem_buffer_ptr = *buffer as *mut Entry; + copy_nonoverlapping((*stem_ptr).buffer, stem_buffer_ptr, stem_buffer_size); + *buffer = stem_buffer_ptr.add(stem_buffer_size) as *mut u8; + (*stem_ptr).buffer = stem_buffer_ptr; + + let mut depth: usize = 0; + let mut traversal = [Stem { + bitmap: 0, + typemap: 0, + buffer: null_mut(), + }; 6]; + + traversal[0] = *stem_ptr; + + loop { + if traversal[depth].bitmap == 0 { + if depth == 0 { + break; + } + depth -= 1; + continue; + } + + let next_chunk = traversal[depth].bitmap.trailing_zeros(); + let next_type = traversal[depth].typemap & (1 << next_chunk) != 0; + let next_entry_ptr = traversal[depth].buffer; + + traversal[depth].bitmap >>= next_chunk + 1; + traversal[depth].typemap >>= next_chunk + 1; + traversal[depth].buffer = traversal[depth].buffer.add(1); + + if next_type { + // Stem case + assert!(depth < 5); + + let stem_ptr: *mut Stem = &mut (*next_entry_ptr).stem; + let stem_size = (*stem_ptr).size(); + + if pma_contains((*stem_ptr).buffer, stem_size) { + continue; + } + + let stem_buffer_ptr = *buffer as *mut Entry; + + copy_nonoverlapping((*stem_ptr).buffer, stem_buffer_ptr, stem_size); + *buffer = stem_buffer_ptr.add(stem_size) as *mut u8; + + (*stem_ptr).buffer = stem_buffer_ptr; + traversal[depth + 1] = *stem_ptr; + depth += 1; + } else { + // Leaf case + let leaf_ptr: *mut Leaf = &mut (*next_entry_ptr).leaf; + + if (*leaf_ptr).len == 0 { + continue; + } + + if pma_contains((*leaf_ptr).buffer, (*leaf_ptr).len) { + continue; + } + + let leaf_buffer_ptr = *buffer as *mut (Noun, T); + + copy_nonoverlapping((*leaf_ptr).buffer, leaf_buffer_ptr, (*leaf_ptr).len); + *buffer = leaf_buffer_ptr.add((*leaf_ptr).len) as *mut u8; + + (*leaf_ptr).buffer = leaf_buffer_ptr; + + let mut leaf_idx = 0; + + while leaf_idx < (*leaf_ptr).len { + (*(*leaf_ptr).buffer.add(leaf_idx)) + .0 + .copy_to_buffer(stack, buffer); + (*(*leaf_ptr).buffer.add(leaf_idx)) + .1 + .copy_to_buffer(stack, buffer); + + leaf_idx += 1; + } + } + } + } + + unsafe fn handle_to_u64(&self) -> u64 { + self.0 as u64 + } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + Hamt(meta_handle as *mut Stem) + } +} diff --git a/rust/ares/src/interpreter.rs b/rust/ares/src/interpreter.rs index bf406af..358f396 100644 --- a/rust/ares/src/interpreter.rs +++ b/rust/ares/src/interpreter.rs @@ -7,7 +7,6 @@ use crate::jets::cold::Cold; use crate::jets::hot::Hot; use crate::jets::warm::Warm; use crate::jets::JetErr; -use crate::mem::unifying_equality; use crate::mem::NockStack; use 
crate::mem::Preserve; use crate::newt::Newt; @@ -15,6 +14,7 @@ use crate::noun; use crate::noun::{Atom, Cell, IndirectAtom, Noun, Slots, D, T}; use crate::serf::TERMINATOR; use crate::trace::{write_nock_trace, TraceInfo, TraceStack}; +use crate::unifying_equality::unifying_equality; use ares_macros::tas; use assert_no_alloc::assert_no_alloc; use bitvec::prelude::{BitSlice, Lsb0}; @@ -1304,9 +1304,9 @@ mod hint { use crate::jets; use crate::jets::cold; use crate::jets::nock::util::{mook, LEAF}; - use crate::mem::unifying_equality; use crate::noun::{tape, Atom, Cell, Noun, D, T}; use crate::serf::TERMINATOR; + use crate::unifying_equality::unifying_equality; use ares_macros::tas; use std::sync::atomic::Ordering; use std::sync::Arc; diff --git a/rust/ares/src/jets.rs b/rust/ares/src/jets.rs index 5bff6b1..00fc1a0 100644 --- a/rust/ares/src/jets.rs +++ b/rust/ares/src/jets.rs @@ -307,8 +307,9 @@ pub mod util { pub mod test { use super::*; use crate::hamt::Hamt; - use crate::mem::{unifying_equality, NockStack}; + use crate::mem::NockStack; use crate::noun::{Atom, Noun, D, T}; + use crate::unifying_equality::unifying_equality; use assert_no_alloc::assert_no_alloc; use ibig::UBig; @@ -316,9 +317,9 @@ pub mod util { let mut stack = NockStack::new(8 << 10 << 10, 0); let newt = Newt::new_mock(); let cold = Cold::new(&mut stack); - let warm = Warm::new(); + let warm = Warm::new(&mut stack); let hot = Hot::init(&mut stack, URBIT_HOT_STATE); - let cache = Hamt::::new(); + let cache = Hamt::::new(&mut stack); Context { stack, diff --git a/rust/ares/src/jets/bits.rs b/rust/ares/src/jets/bits.rs index 9b6b700..64bcf4d 100644 --- a/rust/ares/src/jets/bits.rs +++ b/rust/ares/src/jets/bits.rs @@ -3,7 +3,7 @@ use crate::interpreter::{Context, Error}; use crate::jets::util::*; use crate::jets::{JetErr, Result}; -use crate::noun::{DirectAtom, IndirectAtom, Noun, D}; +use crate::noun::{IndirectAtom, Noun, D}; use std::cmp; crate::gdb!(); @@ -207,17 +207,9 @@ pub fn jet_rev(context: &mut Context, subject: Noun) -> Result { let bits = len << boz; - /* 63 is the maximum number of bits for a direct atom */ - let mut output = if dat.is_direct() && bits < 64 { - unsafe { DirectAtom::new_unchecked(0).as_atom() } - } else { - unsafe { - IndirectAtom::new_raw(&mut context.stack, ((bits + 7) / 8) as usize, &0).as_atom() - } - }; - let src = dat.as_bitslice(); - let dest = output.as_bitslice_mut(); + let (mut output, dest) = + unsafe { IndirectAtom::new_raw_mut_bitslice(&mut context.stack, bits as usize) }; let len = len as usize; let total_len = len << boz; @@ -226,7 +218,7 @@ pub fn jet_rev(context: &mut Context, subject: Noun) -> Result { dest[start..end].copy_from_bitslice(&src[(total_len - end)..(total_len - start)]); } - Ok(unsafe { output.normalize() }.as_noun()) + Ok(unsafe { output.normalize_as_atom() }.as_noun()) } pub fn jet_rip(context: &mut Context, subject: Noun) -> Result { @@ -736,12 +728,15 @@ mod tests { fn test_rev() { let c = &mut init_context(); - let (_a0, a24, _a63, _a96, _a128) = atoms(&mut c.stack); + let (_a0, a24, _a63, a96, _a128) = atoms(&mut c.stack); let sam = T(&mut c.stack, &[D(0), D(60), a24]); assert_jet(c, jet_rev, sam, D(0xc2a6e1000000000)); let test = 0x1234567890123u64; let sam = T(&mut c.stack, &[D(3), D(7), D(test)]); assert_jet(c, jet_rev, sam, D(test.swap_bytes() >> 8)); + let sam = T(&mut c.stack, &[D(3), D(12), a96]); + let res = A(&mut c.stack, &ubig!(0x563412efbeadde150cb0cefa)); + assert_jet(c, jet_rev, sam, res); } #[test] diff --git a/rust/ares/src/jets/cold.rs 
b/rust/ares/src/jets/cold.rs index e4391bd..3be45ee 100644 --- a/rust/ares/src/jets/cold.rs +++ b/rust/ares/src/jets/cold.rs @@ -1,7 +1,10 @@ use crate::hamt::Hamt; -use crate::mem::{unifying_equality, NockStack, Preserve}; +use crate::mem::{NockStack, Preserve}; use crate::noun; use crate::noun::{Atom, DirectAtom, Noun, Slots, D, T}; +use crate::persist::{pma_contains, Persist}; +use crate::unifying_equality::unifying_equality; +use std::mem::size_of; use std::ptr::copy_nonoverlapping; use std::ptr::null_mut; @@ -31,6 +34,59 @@ struct BatteriesMem { parent_batteries: Batteries, } +impl Persist for Batteries { + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { + let mut bytes = 0; + let mut batteries = *self; + + loop { + if batteries.0.is_null() { + break; + } + if pma_contains(batteries.0, 1) { + break; + } + bytes += size_of::(); + bytes += (*batteries.0).battery.space_needed(stack); + bytes += (*batteries.0).parent_axis.space_needed(stack); + batteries = (*batteries.0).parent_batteries; + } + bytes + } + + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { + let mut dest = self; + loop { + if dest.0.is_null() { + break; + } + if pma_contains(dest.0, 1) { + break; + } + + let batteries_mem_ptr = *buffer as *mut BatteriesMem; + copy_nonoverlapping(dest.0, batteries_mem_ptr, 1); + *buffer = batteries_mem_ptr.add(1) as *mut u8; + + (*batteries_mem_ptr).battery.copy_to_buffer(stack, buffer); + (*batteries_mem_ptr) + .parent_axis + .copy_to_buffer(stack, buffer); + + dest.0 = batteries_mem_ptr; + dest = &mut (*dest.0).parent_batteries; + } + } + + unsafe fn handle_to_u64(&self) -> u64 { + self.0 as u64 + } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + Batteries(meta_handle as *mut BatteriesMem) + } +} + impl Preserve for Batteries { unsafe fn assert_in_stack(&self, stack: &NockStack) { if self.0.is_null() { @@ -143,6 +199,55 @@ struct BatteriesListMem { next: BatteriesList, } +impl Persist for BatteriesList { + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { + let mut bytes = 0; + let mut list = *self; + loop { + if list.0.is_null() { + break; + } + if pma_contains(list.0, 1) { + break; + } + bytes += size_of::(); + bytes += (*list.0).batteries.space_needed(stack); + + list = (*list.0).next; + } + bytes + } + + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { + let mut dest = self; + + loop { + if dest.0.is_null() { + break; + } + if pma_contains(dest.0, 1) { + break; + } + + let list_mem_ptr = *buffer as *mut BatteriesListMem; + copy_nonoverlapping(dest.0, list_mem_ptr, 1); + *buffer = list_mem_ptr.add(1) as *mut u8; + dest.0 = list_mem_ptr; + + (*dest.0).batteries.copy_to_buffer(stack, buffer); + dest = &mut (*dest.0).next; + } + } + + unsafe fn handle_to_u64(&self) -> u64 { + self.0 as u64 + } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + BatteriesList(meta_handle as *mut BatteriesListMem) + } +} + impl Preserve for BatteriesList { unsafe fn assert_in_stack(&self, stack: &NockStack) { if self.0.is_null() { @@ -215,6 +320,58 @@ struct NounListMem { next: NounList, } +impl Persist for NounList { + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { + let mut bytes: usize = 0; + let mut list = *self; + + loop { + if list.0.is_null() { + break; + } + if pma_contains(list.0, 1) { + break; + } + + bytes += size_of::(); + bytes += (*list.0).element.space_needed(stack); + + list = (*list.0).next; + } + bytes + } + + unsafe fn copy_to_buffer(&mut 
self, stack: &mut NockStack, buffer: &mut *mut u8) { + let mut dest = self; + + loop { + if dest.0.is_null() { + break; + } + if pma_contains(dest.0, 1) { + break; + } + + let noun_list_mem_ptr = *buffer as *mut NounListMem; + copy_nonoverlapping(dest.0, noun_list_mem_ptr, 1); + *buffer = noun_list_mem_ptr.add(1) as *mut u8; + + dest.0 = noun_list_mem_ptr; + (*dest.0).element.copy_to_buffer(stack, buffer); + + dest = &mut (*dest.0).next; + } + } + + unsafe fn handle_to_u64(&self) -> u64 { + self.0 as u64 + } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + NounList(meta_handle as *mut NounListMem) + } +} + impl Preserve for NounList { unsafe fn assert_in_stack(&self, stack: &NockStack) { if self.0.is_null() { @@ -292,6 +449,44 @@ struct ColdMem { path_to_batteries: Hamt, } +impl Persist for Cold { + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { + if pma_contains(self.0, 1) { + return 0; + } + + let mut bytes = size_of::(); + bytes += (*self.0).battery_to_paths.space_needed(stack); + bytes += (*self.0).root_to_paths.space_needed(stack); + bytes += (*self.0).path_to_batteries.space_needed(stack); + bytes + } + + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { + if pma_contains(self.0, 1) { + return; + } + + let cold_mem_ptr = *buffer as *mut ColdMem; + copy_nonoverlapping(self.0, cold_mem_ptr, 1); + *buffer = cold_mem_ptr.add(1) as *mut u8; + + self.0 = cold_mem_ptr; + + (*self.0).battery_to_paths.copy_to_buffer(stack, buffer); + (*self.0).root_to_paths.copy_to_buffer(stack, buffer); + (*self.0).path_to_batteries.copy_to_buffer(stack, buffer); + } + + unsafe fn handle_to_u64(&self) -> u64 { + self.0 as u64 + } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + Cold(meta_handle as *mut ColdMem) + } +} + impl Preserve for Cold { unsafe fn assert_in_stack(&self, stack: &NockStack) { stack.assert_struct_is_in(self.0, 1); @@ -319,9 +514,9 @@ impl Cold { } pub fn new(stack: &mut NockStack) -> Self { - let battery_to_paths = Hamt::new(); - let root_to_paths = Hamt::new(); - let path_to_batteries = Hamt::new(); + let battery_to_paths = Hamt::new(stack); + let root_to_paths = Hamt::new(stack); + let path_to_batteries = Hamt::new(stack); unsafe { let cold_mem_ptr: *mut ColdMem = stack.struct_alloc(1); *cold_mem_ptr = ColdMem { diff --git a/rust/ares/src/jets/nock.rs b/rust/ares/src/jets/nock.rs index 2dfd2d0..33c54f5 100644 --- a/rust/ares/src/jets/nock.rs +++ b/rust/ares/src/jets/nock.rs @@ -149,7 +149,7 @@ pub mod util { let cache_snapshot = context.cache; let scry_snapshot = context.scry_stack; - context.cache = Hamt::::new(); + context.cache = Hamt::::new(&mut context.stack); context.scry_stack = T(&mut context.stack, &[scry, context.scry_stack]); match interpret(context, subject, formula) { diff --git a/rust/ares/src/jets/warm.rs b/rust/ares/src/jets/warm.rs index aaeb056..b66db8b 100644 --- a/rust/ares/src/jets/warm.rs +++ b/rust/ares/src/jets/warm.rs @@ -86,8 +86,8 @@ impl Iterator for WarmEntry { impl Warm { #[allow(clippy::new_without_default)] - pub fn new() -> Self { - Warm(Hamt::new()) + pub fn new(stack: &mut NockStack) -> Self { + Warm(Hamt::new(stack)) } fn insert( @@ -112,7 +112,7 @@ impl Warm { } pub fn init(stack: &mut NockStack, cold: &mut Cold, hot: &Hot) -> Self { - let mut warm = Self::new(); + let mut warm = Self::new(stack); for (mut path, axis, jet) in *hot { let batteries_list = cold.find(stack, &mut path); for batteries in batteries_list { diff --git a/rust/ares/src/lib.rs b/rust/ares/src/lib.rs 
index 8393ff9..17b7223 100644 --- a/rust/ares/src/lib.rs +++ b/rust/ares/src/lib.rs @@ -12,8 +12,10 @@ pub mod newt; pub mod noun; pub mod serf; //pub mod bytecode; +pub mod persist; pub mod serialization; pub mod trace; +pub mod unifying_equality; /** Introduce useful functions for debugging * diff --git a/rust/ares/src/mem.rs b/rust/ares/src/mem.rs index 22909f5..31f81c5 100644 --- a/rust/ares/src/mem.rs +++ b/rust/ares/src/mem.rs @@ -5,7 +5,6 @@ use crate::noun::{Atom, Cell, CellMemory, IndirectAtom, Noun, NounAllocator}; use assert_no_alloc::permit_alloc; use either::Either::{self, Left, Right}; use ibig::Stack; -use libc::{c_void, memcmp}; use memmap::MmapMut; use std::alloc::Layout; use std::mem; @@ -50,6 +49,7 @@ pub struct NockStack { alloc_pointer: *mut u64, /** MMap which must be kept alive as long as this NockStack is */ memory: MmapMut, + /** PMA from which we will copy into the NockStack */ /** Whether or not pre_copy() has been called on the current stack frame. */ pc: bool, } @@ -142,6 +142,26 @@ impl NockStack { self.frame_pointer } + /** Current stack pointer of this NockStack */ + pub fn get_stack_pointer(&self) -> *const u64 { + self.stack_pointer + } + + /** Current alloc pointer of this NockStack */ + pub fn get_alloc_pointer(&self) -> *const u64 { + self.alloc_pointer + } + + /** Start of the memory range for this NockStack */ + pub fn get_start(&self) -> *const u64 { + self.start + } + + /** End of the memory range for this NockStack */ + pub fn get_size(&self) -> usize { + self.size + } + /** Checks if the current stack frame has West polarity */ #[inline] pub fn is_west(&self) -> bool { @@ -227,7 +247,7 @@ impl NockStack { } /** Pointer to where the previous stack pointer is saved in a frame */ - unsafe fn prev_stack_pointer_pointer(&self) -> *mut *mut u64 { + pub unsafe fn prev_stack_pointer_pointer(&self) -> *mut *mut u64 { if !self.pc { self.slot_pointer(STACK) as *mut *mut u64 } else { @@ -816,240 +836,6 @@ impl NockStack { } } -#[cfg(feature = "check_junior")] -#[macro_export] -macro_rules! assert_no_junior_pointers { - ( $x:expr, $y:expr ) => { - assert_no_alloc::permit_alloc(|| { - assert!($x.no_junior_pointers($y)); - }) - }; -} - -#[cfg(not(feature = "check_junior"))] -#[macro_export] -macro_rules! assert_no_junior_pointers { - ( $x:expr, $y:expr ) => {}; -} - -pub unsafe fn unifying_equality(stack: &mut NockStack, a: *mut Noun, b: *mut Noun) -> bool { - /* This version of unifying equality is not like that of vere. - * Vere does a tree comparison (accelerated by pointer equality and short-circuited by mug - * equality) and then unifies the nouns at the top level if they are equal. - * - * Here we recursively attempt to unify nouns. Pointer-equal nouns are already unified. - * Disequal mugs again short-circuit the unification and equality check. - * - * Since we expect atoms to be normalized, direct and indirect atoms do not unify with each - * other. For direct atoms, no unification is possible as there is no pointer involved in their - * representation. Equality is simply direct equality on the word representation. Indirect - * atoms require equality first of the size and then of the memory buffers' contents. - * - * Cell equality is tested (after mug and pointer equality) by attempting to unify the heads and tails, - * respectively, of cells, and then re-testing. If unification succeeds then the heads and - * tails will be pointer-wise equal and the cell itself can be unified. 
A failed unification of - * the head or the tail will already short-circuit the unification/equality test, so we will - * not return to re-test the pointer equality. - * - * When actually mutating references for unification, we must be careful to respect seniority. - * A reference to a more junior noun should always be replaced with a reference to a more - * senior noun, *never vice versa*, to avoid introducing references from more senior frames - * into more junior frames, which would result in incorrect operation of the copier. - */ - assert_acyclic!(*a); - assert_acyclic!(*b); - assert_no_forwarding_pointers!(*a); - assert_no_forwarding_pointers!(*b); - assert_no_junior_pointers!(stack, *a); - assert_no_junior_pointers!(stack, *b); - - // If the nouns are already word-equal we have nothing to do - if (*a).raw_equals(*b) { - return true; - }; - // If the nouns have cached mugs which are disequal we have nothing to do - if let (Ok(a_alloc), Ok(b_alloc)) = ((*a).as_allocated(), (*b).as_allocated()) { - if let (Some(a_mug), Some(b_mug)) = (a_alloc.get_cached_mug(), b_alloc.get_cached_mug()) { - if a_mug != b_mug { - return false; - }; - }; - }; - stack.frame_push(0); - *(stack.push::<(*mut Noun, *mut Noun)>()) = (a, b); - loop { - if stack.stack_is_empty() { - break; - }; - let (x, y): (*mut Noun, *mut Noun) = *(stack.top()); - if (*x).raw_equals(*y) { - stack.pop::<(*mut Noun, *mut Noun)>(); - continue; - }; - if let (Ok(x_alloc), Ok(y_alloc)) = ( - // equal direct atoms return true for raw_equals() - (*x).as_allocated(), - (*y).as_allocated(), - ) { - if let (Some(x_mug), Some(y_mug)) = (x_alloc.get_cached_mug(), y_alloc.get_cached_mug()) - { - if x_mug != y_mug { - break; // short-circuit, the mugs differ therefore the nouns must differ - } - }; - match (x_alloc.as_either(), y_alloc.as_either()) { - (Left(x_indirect), Left(y_indirect)) => { - let x_as_ptr = x_indirect.to_raw_pointer(); - let y_as_ptr = y_indirect.to_raw_pointer(); - if x_indirect.size() == y_indirect.size() - && memcmp( - x_indirect.data_pointer() as *const c_void, - y_indirect.data_pointer() as *const c_void, - x_indirect.size() << 3, - ) == 0 - { - let (_senior, junior) = senior_pointer_first(stack, x_as_ptr, y_as_ptr); - if x_as_ptr == junior { - *x = *y; - } else { - *y = *x; - } - stack.pop::<(*mut Noun, *mut Noun)>(); - continue; - } else { - break; - } - } - (Right(x_cell), Right(y_cell)) => { - let x_as_ptr = x_cell.to_raw_pointer() as *const u64; - let y_as_ptr = y_cell.to_raw_pointer() as *const u64; - if x_cell.head().raw_equals(y_cell.head()) - && x_cell.tail().raw_equals(y_cell.tail()) - { - let (_senior, junior) = senior_pointer_first(stack, x_as_ptr, y_as_ptr); - if x_as_ptr == junior { - *x = *y; - } else { - *y = *x; - } - stack.pop::<(*mut Noun, *mut Noun)>(); - continue; - } else { - /* THIS ISN'T AN INFINITE LOOP - * If we discover a disequality in either side, we will - * short-circuit the entire loop and reset the work stack. - * - * If both sides are equal, then we will discover pointer - * equality when we return and unify the cell. 
- */ - *(stack.push::<(*mut Noun, *mut Noun)>()) = - (x_cell.tail_as_mut(), y_cell.tail_as_mut()); - *(stack.push::<(*mut Noun, *mut Noun)>()) = - (x_cell.head_as_mut(), y_cell.head_as_mut()); - continue; - } - } - (_, _) => { - break; // cells don't unify with atoms - } - } - } else { - break; // direct atom not raw equal, so short circuit - } - } - stack.frame_pop(); - - assert_acyclic!(*a); - assert_acyclic!(*b); - assert_no_forwarding_pointers!(*a); - assert_no_forwarding_pointers!(*b); - assert_no_junior_pointers!(stack, *a); - assert_no_junior_pointers!(stack, *b); - - (*a).raw_equals(*b) -} - -unsafe fn senior_pointer_first( - stack: &NockStack, - a: *const u64, - b: *const u64, -) -> (*const u64, *const u64) { - let mut frame_pointer: *const u64 = stack.frame_pointer; - let mut stack_pointer: *const u64 = stack.stack_pointer; - let mut alloc_pointer: *const u64 = stack.alloc_pointer; - let prev_stack_pointer = *(stack.prev_stack_pointer_pointer()); - - let (mut high_pointer, mut low_pointer): (*const u64, *const u64) = if stack.is_west() { - (prev_stack_pointer, alloc_pointer) - } else { - (alloc_pointer, prev_stack_pointer) - }; - - loop { - if low_pointer.is_null() || high_pointer.is_null() { - // we found the bottom of the stack; check entirety of the stack - low_pointer = stack.start; - high_pointer = stack.start.add(stack.size); - } - - match ( - a < high_pointer && a >= low_pointer, - b < high_pointer && b >= low_pointer, - ) { - (true, true) => { - // both pointers are in the same frame, pick arbitrarily (lower in mem) - break lower_pointer_first(a, b); - } - (true, false) => break (b, a), // a is in the frame, b is not, so b is senior - (false, true) => break (a, b), // b is in the frame, a is not, so a is senior - (false, false) => { - // chase up the stack - #[allow(clippy::comparison_chain)] - // test to see if the frame under consideration is a west frame - if stack_pointer < alloc_pointer { - stack_pointer = *(frame_pointer.sub(STACK + 1)) as *const u64; - alloc_pointer = *(frame_pointer.sub(ALLOC + 1)) as *const u64; - frame_pointer = *(frame_pointer.sub(FRAME + 1)) as *const u64; - - // both pointers are in the PMA, pick arbitrarily (lower in mem) - if frame_pointer.is_null() { - break lower_pointer_first(a, b); - }; - - // previous allocation pointer - high_pointer = alloc_pointer; - // "previous previous" stack pointer. this is the other boundary of the previous allocation arena - low_pointer = *(frame_pointer.add(STACK)) as *const u64; - } else if stack_pointer > alloc_pointer { - stack_pointer = *(frame_pointer.add(STACK)) as *const u64; - alloc_pointer = *(frame_pointer.add(ALLOC)) as *const u64; - frame_pointer = *(frame_pointer.add(FRAME)) as *const u64; - - // both pointers are in the PMA, pick arbitrarily (lower in mem) - if frame_pointer.is_null() { - break lower_pointer_first(a, b); - }; - - // previous allocation pointer - low_pointer = alloc_pointer; - // "previous previous" stack pointer. 
this is the other boundary of the previous allocation arena - high_pointer = *(frame_pointer.sub(STACK + 1)) as *const u64; - } else { - panic!("senior_pointer_first: stack_pointer == alloc_pointer"); - } - } - } - } -} - -fn lower_pointer_first(a: *const u64, b: *const u64) -> (*const u64, *const u64) { - if a < b { - (a, b) - } else { - (b, a) - } -} - impl NounAllocator for NockStack { unsafe fn alloc_indirect(&mut self, words: usize) -> *mut u64 { self.indirect_alloc(words) diff --git a/rust/ares/src/noun.rs b/rust/ares/src/noun.rs index 72272fd..0c5008b 100644 --- a/rust/ares/src/noun.rs +++ b/rust/ares/src/noun.rs @@ -446,6 +446,11 @@ impl IndirectAtom { unsafe { *(self.to_raw_pointer().add(1)) as usize } } + /** Memory size of an indirect atom (including size + metadata fields) in 64-bit words */ + pub fn raw_size(&self) -> usize { + self.size() + 2 + } + pub fn bit_size(&self) -> usize { unsafe { ((self.size() - 1) << 6) + 64 @@ -906,6 +911,21 @@ impl Atom { *self } } + + /** Make an atom from a raw u64 + * + * # Safety + * + * Note that the [u64] parameter is *not*, in general, the value of the atom! + * + * In particular, anything with the high bit set will be treated as a tagged pointer. + * This method is only to be used to restore an atom from the raw [u64] representation + * returned by [Noun::as_raw], and should only be used if we are sure the restored noun is in + * fact an atom. + */ + pub unsafe fn from_raw(raw: u64) -> Atom { + Atom { raw } + } } impl fmt::Display for Atom { diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs new file mode 100644 index 0000000..f13abc5 --- /dev/null +++ b/rust/ares/src/persist.rs @@ -0,0 +1,311 @@ +use crate::mem::NockStack; +use crate::noun::{Allocated, Atom, Cell, CellMemory, IndirectAtom, Noun}; +use ares_pma::*; +use either::Either::{Left, Right}; +use std::convert::TryInto; +use std::ffi::{c_void, CString}; +use std::mem::size_of; +use std::path::PathBuf; +use std::ptr::copy_nonoverlapping; +use std::sync::OnceLock; + +const PMA_MODE: mode_t = 0o600; // RW for user only +const PMA_FLAGS: ULONG = 0; // ignored for now + +const NOUN_MARKED: u64 = 1 << 63; + +/// Handle to a PMA +#[derive(Copy, Clone)] +struct PMAState(u64); // this is idiotic but necessary for Rust to let us put this in a oncelock + +static PMA: OnceLock = OnceLock::new(); + +fn get_pma_state() -> Option<*mut BT_state> { + PMA.get().map(|r| r.0 as *mut BT_state) +} + +fn pma_state_err() -> std::io::Error { + std::io::Error::new(std::io::ErrorKind::AlreadyExists, "PMA") +} + +#[cfg(unix)] +pub fn pma_open(path: PathBuf) -> Result<(), std::io::Error> { + let mut state: *mut BT_state = std::ptr::null_mut(); + + // correct for Unix thus cfg gated + let path_cstring = CString::new(path.into_os_string().as_encoded_bytes())?; + unsafe { + bt_state_new(&mut state); + let err = bt_state_open(state, path_cstring.as_ptr(), PMA_FLAGS, PMA_MODE); + if err == 0 { + PMA.set(PMAState(state as u64)) + .map_err(|state| state.0 as *mut BT_state) + .expect("PMA state already initialized to:"); + assert!(get_pma_state().is_some()); + Ok(()) + } else { + // XX need to free the state + Err(std::io::Error::from_raw_os_error(err)) + } + } +} + +#[cfg(windows)] +pub fn pma_open(path: PathBuf) -> Result { + unimplemented!() +} + +pub fn pma_close() -> Result<(), std::io::Error> { + // XX need a way to free the state after + let err = unsafe { bt_state_close(get_pma_state().ok_or_else(pma_state_err)?) 
}; + if err == 0 { + Ok(()) + } else { + Err(std::io::Error::from_raw_os_error(err)) + } +} + +#[inline] +pub fn pma_meta_get(field: usize) -> u64 { + unsafe { bt_meta_get(get_pma_state().unwrap(), field) } +} + +#[inline] +pub fn pma_meta_set(field: usize, val: u64) { + unsafe { bt_meta_set(get_pma_state().unwrap(), field, val) }; +} + +pub unsafe fn pma_contains(ptr: *const T, count: usize) -> bool { + if let Some(pma_state) = get_pma_state() { + bt_inbounds(pma_state, ptr as *mut c_void) != 0 + && bt_inbounds(pma_state, ptr.add(count) as *mut c_void) != 0 + } else { + false + } +} + +pub fn pma_sync() { + unsafe { + if bt_sync(get_pma_state().unwrap()) != 0 { + panic!("PMA sync failed but did not abort: this should never happen."); + } + } +} + +pub unsafe fn pma_dirty(ptr: *mut T, count: usize) { + let lo = bt_page_round_down(ptr); + let hi = bt_page_round_up(ptr.add(count)); + let e = bt_dirty(get_pma_state().unwrap(), lo, hi); + assert!(e == 0); +} + +/** + * This trait defines operations for copying a structure into the PMA. + * + * This is done in two phases. The [space_needed] phase counts how much space the structure needs in + * the PMA, not counting referenced structures already in the PMA. Then a buffer is allocated in + * the PMA of at least the computed size, and the [copy_to_buffer] phase copies the structure into + * this buffer. + * + * The phases are separated so that instances of the trait may compose, while still allocating a + * single buffer. Thus, in the instance for a HAMT, the [space_needed] method for the HAMT will + * call the [space_needed] method on each noun key, and on each value, as well as computing the + * size of the HAMT's own structures. Similarly, the [copy_to_buffer] method for the HAMT will call + * the [copy_to_buffer] method for the keys and values as it copies its own structures in. + */ +pub trait Persist { + /// Count how much space is needed, in bytes. May set marks so long as marks are cleaned up by + /// [copy_into_buffer] + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize; + + /// Copy into the provided buffer, which may be assumed to be at least as large as the size + /// returned by [space_needed] on the same structure. + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8); + + /// Persist an object into the PMA using [space_needed] and [copy_to_buffer], returning + /// a [u64] (probably a pointer or tagged pointer) that can be saved into metadata. 
+ unsafe fn save_to_pma(&mut self, stack: &mut NockStack) -> u64 { + unsafe { + let space = self.space_needed(stack); + + if space == 0 { + return self.handle_to_u64(); + } + + let space_as_pages = (space + (BT_PAGESIZE as usize - 1)) >> BT_PAGEBITS; + + let mut buffer = bt_malloc(get_pma_state().unwrap(), space_as_pages) as *mut u8; + let orig_buffer = buffer; + self.copy_to_buffer(stack, &mut buffer); + let space_isize: isize = space.try_into().unwrap(); + assert!(buffer.offset_from(orig_buffer) == space_isize); + self.handle_to_u64() + } + } + + unsafe fn handle_to_u64(&self) -> u64; + unsafe fn handle_from_u64(meta_handle: u64) -> Self; +} + +/// Ensure an allocated noun is marked and return if it was already marked +unsafe fn mark(a: Allocated) -> bool { + let metadata = a.get_metadata(); + a.set_metadata(metadata | NOUN_MARKED); + metadata & NOUN_MARKED != 0 +} + +/// Unmark an allocated noun +unsafe fn unmark(a: Allocated) { + let metadata = a.get_metadata(); + a.set_metadata(metadata & !NOUN_MARKED); +} + +impl Persist for Atom { + unsafe fn space_needed(&mut self, _stack: &mut NockStack) -> usize { + if let Ok(indirect) = self.as_indirect() { + let count = indirect.raw_size(); + if !pma_contains(indirect.to_raw_pointer(), count) && !mark(indirect.as_allocated()) { + return count * size_of::(); + } + } + 0 + } + + unsafe fn copy_to_buffer(&mut self, _stack: &mut NockStack, buffer: &mut *mut u8) { + if let Ok(mut indirect) = self.as_indirect() { + let count = indirect.raw_size(); + if !pma_contains(indirect.to_raw_pointer(), count) { + if let Some(forward) = indirect.forwarding_pointer() { + *self = forward.as_atom(); + } else { + let indirect_buffer_ptr = *buffer as *mut u64; + copy_nonoverlapping(indirect.to_raw_pointer(), indirect_buffer_ptr, count); + *buffer = indirect_buffer_ptr.add(count) as *mut u8; + + indirect.set_forwarding_pointer(indirect_buffer_ptr); + + *self = IndirectAtom::from_raw_pointer(indirect_buffer_ptr).as_atom(); + } + } + } + } + + unsafe fn handle_to_u64(&self) -> u64 { + self.as_noun().as_raw() + } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + Atom::from_raw(meta_handle) + } +} + +impl Persist for Noun { + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { + let mut space = 0usize; + stack.frame_push(0); + *(stack.push::()) = *self; + loop { + if stack.stack_is_empty() { + break; + } + let noun = *(stack.top::()); + stack.pop::(); + + match noun.as_either_atom_cell() { + Left(mut atom) => { + space += atom.space_needed(stack); + } + Right(cell) => { + if !pma_contains(cell.to_raw_pointer(), 1) && !mark(cell.as_allocated()) { + space += size_of::(); + (*stack.push::()) = cell.tail(); + (*stack.push::()) = cell.head(); + } + } + } + } + stack.frame_pop(); + space + } + + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { + let mut buffer_u64 = (*buffer) as *mut u64; + stack.frame_push(0); + *(stack.push::<*mut Noun>()) = self as *mut Noun; + + loop { + if stack.stack_is_empty() { + break; + } + + let dest = *(stack.top::<*mut Noun>()); + stack.pop::<*mut Noun>(); + + match (*dest).as_either_direct_allocated() { + Left(_direct) => {} + Right(allocated) => { + if let Some(a) = allocated.forwarding_pointer() { + *dest = a.as_noun(); + continue; + } + + match allocated.as_either() { + Left(mut indirect) => { + let count = indirect.raw_size(); + if pma_contains(indirect.to_raw_pointer(), count) { + continue; + } + + unmark(allocated); + copy_nonoverlapping(indirect.to_raw_pointer(), buffer_u64, 
count); + indirect.set_forwarding_pointer(buffer_u64); + *dest = IndirectAtom::from_raw_pointer(buffer_u64).as_noun(); + buffer_u64 = buffer_u64.add(count); + } + Right(mut cell) => { + if pma_contains(cell.to_raw_pointer(), 1) { + continue; + } + + unmark(allocated); + + let new_cell_mem = buffer_u64 as *mut CellMemory; + copy_nonoverlapping(cell.to_raw_pointer(), new_cell_mem, 1); + cell.set_forwarding_pointer(new_cell_mem); + + *dest = Cell::from_raw_pointer(new_cell_mem).as_noun(); + + *(stack.push::<*mut Noun>()) = &mut (*new_cell_mem).tail; + *(stack.push::<*mut Noun>()) = &mut (*new_cell_mem).head; + + buffer_u64 = new_cell_mem.add(1) as *mut u64; + } + } + } + } + } + *buffer = buffer_u64 as *mut u8; + stack.frame_pop(); + } + + unsafe fn handle_to_u64(&self) -> u64 { + self.as_raw() + } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + Noun::from_raw(meta_handle) + } +} + +/** Mask to mask out pointer bits not aligned with a BT_PAGESIZE page */ +const BT_PAGEBITS_MASK_OUT: u64 = !((1 << BT_PAGEBITS) - 1); + +// round an address down to a page boundary +fn bt_page_round_down(ptr: *mut T) -> *mut c_void { + ((ptr as u64) & BT_PAGEBITS_MASK_OUT) as *mut c_void +} + +// round an address up to a page boundary +fn bt_page_round_up(ptr: *mut T) -> *mut c_void { + (((ptr as u64) + (BT_PAGESIZE as u64) - 1) & BT_PAGEBITS_MASK_OUT) as *mut c_void +} diff --git a/rust/ares/src/pma/README.md b/rust/ares/src/pma/README.md deleted file mode 100644 index a2cd3d1..0000000 --- a/rust/ares/src/pma/README.md +++ /dev/null @@ -1,8 +0,0 @@ -## PMA - TODO - -Ported from development in a -[separate repo](https://github.com/ashelkovnykov/pma_malloc). README will be -updated after the final implementation is complete, which replaces the -array-based page directory with a B+ Tree one. Until then, please refer to the -README in the above-linked directory. - diff --git a/rust/ares/src/pma/malloc.c b/rust/ares/src/pma/malloc.c deleted file mode 100644 index 399e563..0000000 --- a/rust/ares/src/pma/malloc.c +++ /dev/null @@ -1,2167 +0,0 @@ -/** - * ---------------------------------------------------------------------------- - * "THE BEER-WARE LICENSE" (Revision 42): - * wrote this file. As long as you retain this notice you - * can do whatever you want with this stuff. If we meet some day, and you think - * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp - * ---------------------------------------------------------------------------- - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "includes/checksum.h" -#include "malloc.h" - -//============================================================================== -// CONFIGURABLE MACROS -//============================================================================== - -/** - * PMA_PAGE_SIZE = 1 << PMA_PAGE_SHIFT - * - * Should be configured to native page size. - */ -#define PMA_PAGE_SHIFT 12U - -/** - * PMA_MIN_ALLOC_SIZE = 1 << PMA_MIN_ALLOC_SHIFT - * - * Note that types/sizes in PMASharedPageHeader are currently hardcoded to this - * value being 4. - */ -#define PMA_MIN_ALLOC_SHIFT 4U - -/** - * How many bits per bitmap element. Change only if not 8 bits/byte - */ -#define PMA_BITMAP_BITS (8 * sizeof(uint8_t)) - -/** - * Increment block size for resizing the snapshot backing file (4 GiB in bytes). 
- * This is just the default increment; the backing file is extended by the - * smallest multiple of this value sufficient to fit the new allocation. - */ -#define PMA_SNAPSHOT_RESIZE_INC 0x100000000 - -//============================================================================== -// AUTO MACROS (do not manually configure) -//============================================================================== - -/** - * Number bytes per page - */ -#define PMA_PAGE_SIZE (1UL << PMA_PAGE_SHIFT) - -/** - * A mask for the offset of an address inside a page - */ -#define PMA_PAGE_MASK (PMA_PAGE_SIZE - 1) - -/** - * Minimum size of an allocation in bytes - * - * If this is too small, it's too much work to manage small allocations. - */ -#define PMA_MIN_ALLOC_SIZE (1U << PMA_MIN_ALLOC_SHIFT) - -/** - * PMA_MAX_SHARED_ALLOC = 1 << PMA_MAX_SHARED_SHIFT - * - * Should be log_2 of 1/4 of page size. Also the number of buckets in the array - * of shared page pointers. - */ -#define PMA_MAX_SHARED_SHIFT (PMA_PAGE_SHIFT - 2U) - -/** - * Max slot size (in bytes) for shared page allocations - * - * In the original phk_malloc code, this was set to 1/2 the size of a page. - * However, since shared page metadata is stored as a header inside the page - * itself, an allocation of 1/2 a page will use a full page anyway. Therefore, - * the limit is set to 1/4 of a page to remove the overhead of dealing with - * the shared page header for a page containing a single allocation. - */ -#define PMA_MAX_SHARED_ALLOC (1UL << PMA_MAX_SHARED_SHIFT) - -/** - * Number of buckets for shared page linked lists in the metadata page - */ -#define PMA_SHARED_BUCKETS (PMA_MAX_SHARED_SHIFT - PMA_MIN_ALLOC_SHIFT + 1) - -/** - * Round address down to beginning of containing page - */ -#define PAGE_ROUND_DOWN(foo) (foo & (~PMA_PAGE_MASK)) - -/** - * Round address up to beginning of next page - */ -#define PAGE_ROUND_UP(foo) ((foo + PMA_PAGE_MASK) & (~PMA_PAGE_MASK)) - -/** - * Convert pointer to index in page directory - */ -#define PTR_TO_INDEX(foo) ((((uint64_t)(foo)) - ((uint64_t)_pma_state->metadata->arena_start)) >> PMA_PAGE_SHIFT) - -/** - * Convert index in page directory to pointer - */ -#define INDEX_TO_PTR(foo) (void *)((char *)_pma_state->metadata->arena_start + ((foo) * PMA_PAGE_SIZE)) - -/** - * Flags to use for all mmap operations, excluding initial metadata page mapping - * - * We don't care to what memory the metadata pages are mapped, so long as it's - * before the memory arena, because we track it in the PMA process itself. - * However, to retain consistent pointers between ship shutdown & relaunch, we - * want all memory arena mmap mappings to go to the exact address to which we - * tell them. Another mapping already existing at one of those addresses is a - * fatal error. - * - * For more info, see https://www.man7.org/linux/man-pages/man2/mmap.2.html. - */ -#ifdef __linux__ - #define PMA_MMAP_FLAGS (MAP_SHARED | MAP_FIXED_NOREPLACE) -#else - #define PMA_MMAP_FLAGS (MAP_SHARED | MAP_FIXED) -#endif - -/** - * Magic code that identifies a file as an event snapshot file - */ -#define PMA_MAGIC_CODE 0xBADDECAFC0FFEE00 // i.e. 
all decaf coffee - -/** - * Version of the persistent memory arena which created an event snapshot (in - * case of breaking changes) - */ -#define PMA_DATA_VERSION 1 - -/** - * Representation of an empty byte for a byte in a bitmap (1 = empty, 0 = full) - */ -#define PMA_EMPTY_BITMAP 0xFF - -/** - * See PMASharedPageHeader for explanation - */ -#define PMA_BITMAP_SIZE 32 - -/** - * Max number of dpage offsets that can fit into a cache of free dpages stored - * as an array in a single page (when factoring in space used by metadata). - * - * 511 for 4 KiB page - */ -#define PMA_DPAGE_CACHE_SIZE ((PMA_PAGE_SIZE - sizeof(PMADPageCache)) / sizeof(uint64_t)) - -/** - * Max number of dirty page entries that can be stored in the extra space of the - * metadata page. Caching the dirty page entries and writing them as a part of - * the metadata allows us to solve the problem of desynchronization between the - * metadata and page directory without using B+ Trees. - * - * 164 for 4 KiB page - */ -// #define PMA_DIRTY_PAGE_LIMIT ((PMA_PAGE_SIZE - sizeof(PMAMetadata)) / sizeof(PMADirtyPageEntry)) -#define PMA_DIRTY_PAGE_LIMIT 164 - -/** - * Default settings for new PMA backing files - * - * See https://www.man7.org/linux/man-pages/man2/chmod.2.html for more info - * about individual flags. - * - * Start with a page directory big enough to hold 1 GiB of data: - * - * 1 GiB = 262144 page entries - * (up to) 16 bytes per page dir entry - * 4096 / 16 = 256 entries per page - * 262144 / 256 = 1024 pages - * 1024 * 4096 = 4194304 bytes - * - * Maximum size of page directory = 340 GiB - */ -#define PMA_SNAPSHOT_FILENAME "snap.bin" -#define PMA_PAGE_DIR_FILENAME "page.bin" -#define PMA_DEFAULT_DIR_NAME ".bin" -#define PMA_NEW_FILE_FLAGS (O_RDWR | O_CREAT) -#define PMA_LOAD_FILE_FLAGS (O_RDWR) -#define PMA_DIR_PERMISSIONS (S_IRWXU | S_IRWXG | S_IRWXO) -#define PMA_FILE_PERMISSIONS (S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP) -#define PMA_INIT_SNAP_SIZE 0x40000000 -#define PMA_INIT_DIR_SIZE 0x400000 - -/** - * Maximum possible size of the page directory. This is how big the page - * directory would need to be to reach all addressable virtual memory in Linux. - */ -#define PMA_MAXIMUM_DIR_SIZE 0x5500000000 - -/** - * Base address for the PMA. Lowest address not reserved by Linux. - */ -#ifdef __linux__ - #define PMA_SNAPSHOT_ADDR 0x10000 -#else - #define PMA_SNAPSHOT_ADDR 0x28000000000 -#endif - -/** - * Maximum file size on disk for the filesystem (16 TiB for ext4). - * - * TODO: need to automatically discover this and set it accordingly - */ -#define PMA_MAX_DISK_FILE_SIZE 0x100000000000 - -/** - * Maximum multiplier for resizing the snapshot backing file. - */ -#define PMA_MAX_RESIZE_FACTOR (PMA_MAX_DISK_FILE_SIZE / PMA_SNAPSHOT_RESIZE_INC) - -//============================================================================== -// HELPER MACROS -//============================================================================== - -/* TODO: these should just be funlike macros. 
The "save line" and goto is - unnecessary */ -/** - * Log error and return failure during new PMA bootstrap - */ -#define INIT_ERROR do { err_line = __LINE__; goto init_error; } while(0) - -/** - * Log error and return failure during existing PMA load - */ -#define LOAD_ERROR do { err_line = __LINE__; goto load_error; } while(0) - -/** - * Log error and return failure during PMA sync - */ -#define SYNC_ERROR do { err_line = __LINE__; goto sync_error; } while(0) - -/** - * Log warning to console - */ -#define WARNING(foo) _pma_warning(foo, address, __LINE__) - -//============================================================================== -// TYPES -//============================================================================== - -/** - * Page statuses used in page directory - */ -enum PMAPageStatus { - UNALLOCATED, - FREE, - SHARED, - FIRST, - FOLLOW -}; -typedef enum PMAPageStatus PMAPageStatus; - -/** - * Directory entry for a page in virtual memory - */ -typedef struct PMAPageDirEntry PMAPageDirEntry; -struct PMAPageDirEntry { - uint64_t offset; // Offset for page in backing file - PMAPageStatus status; // Status of page -}; - -/** - * Directory of pages in virtual memory - */ -typedef struct PMAPageDir PMAPageDir; -struct PMAPageDir { - uint64_t size; // Number of slots currently supported by page directory - uint64_t next_index; // Index of next open slot in (makes it easier to resize) - PMAPageDirEntry *entries; // Address to start of page directory as an array of entries -}; - -/** - * Shared allocation page - * - * A shared page is an array of slots of a single size. The metadata for each - * page is stored as a header within the page itself. - * - * On a 64-bit system, the alignment of this struct is 8. This is relevant to - * the currently hard-coded values for simplifying how slots work. The ideal - * size of a hard-coded bitmap, given the number of slots available in a page - * after subtracting the header, is 32 bytes: - * - * X = max # slots in page (min slot size = 16-bytes) - * (4096 - (11 + ceil(X/8))) > 16X - * (4096 - (11 + (X/8) + 1)) > 16X - * 4084 - X/8 > 16X - * 32672 - X > 128X - * 32672 > 129X - * 253.27 > X - * X = 253 - * bitmap bytes = ceil(253 div 8) = ceil(31.625) = 32 - * - * However, the alignment adds padding bytes in between the scalar and array - * struct members: - * (253 * 16) + 11 + 5 + 32 = 4096 - * - * In this case, this doesn't affect the total number of - * available slots, but it could if the members of the PMASharedPageHeader change. - */ -typedef struct PMASharedPageHeader PMASharedPageHeader; -struct PMASharedPageHeader { - struct PMASharedPageHeader *next; // Next shared page; forms a stack as additional pages of the same slot size are allocated - uint8_t dirty; // Dirty bit; necessary when allocating twice to the same page in one event - uint8_t size; // Slot size for this page = 2^size - uint8_t free; // Number of free slots in page - uint8_t bits[PMA_BITMAP_SIZE]; // Bitmap of which slots are free -}; - -/** - * Update to page directory state for an allocation. A limited number of such - * updates can be stored behind the header in the metadata page, allowing - * simultaneous copy-on-write semantics for the metadata and updates to the page - * directory without a B+ Tree. 
- */ -typedef struct PMADirtyPageEntry PMADirtyPageEntry; -struct PMADirtyPageEntry { - uint64_t index; // Index in page directory - uint64_t offset; // Offset on disk backing file - uint32_t num_pages; // Number of pages marked dirty (for multi-page allocations) - PMAPageStatus status; // Page status after sync -}; - -/** - * Free page cache node - * - * Nodes form a linked list of single free pages. A free page is an allocated - * page already backed by disk, but available for use (the old values were - * freed). - * - * Free pages are purposely not merged into runs, because two pages being - * adjacent in virtual memory does not mean that they are adjacent on disk, and - * disk locality is preferable for multi-page allocations. - * - * The caches for free single pages and free multi-page runs are split to save - * time: any free page will do for a shared page or single page allocation, but - * exact ranges are preferable for multi-page allocations. - */ -typedef struct PMASinglePageCache PMASinglePageCache; -struct PMASinglePageCache { - PMASinglePageCache *next; // Next node in list - void *page; // Pointer to free page -}; - -/** - * Free page run cache node - * - * Nodes form a linked list of free multi-page runs. A free page is an allocated - * page already backed by disk, but available for use (the old values were - * freed). - * - * Free pages are purposely not merged into runs, because two pages being - * adjacent in virtual memory does not mean that they are adjacent on disk, and - * disk locality is preferable for multi-page allocations (typically, when the - * OS experiences a page miss, the OS/hardware will fetch not just the missing - * page, but also several of the following [nearby?] pages). - * - * The caches for free single pages and free multi-page runs are split to save - * time: any free page will do for a shared page or single page allocation, but - * exact ranges are preferable for multi-page allocations. - */ -typedef struct PMAPageRunCache PMAPageRunCache; -struct PMAPageRunCache { - PMAPageRunCache *next; // Next node in list - void *page; // Pointer to start of page run - uint64_t length; // Number of pages in run -}; - -/** - * Free dpage cache - * - * A dpage is a page-sized block already allocated to the snapshot file on disk - * but without memory mapped to it. Reusing free dpages allows allocations - * without growing the backing file. - * - * The cache contains only individual dpages. Since multi-page allocations are - * never moved, their corresponding dpage allocations never change. When freed, - * multi-page allocations in the free page run cache still refer to the same - * contiguous block of dpages that they were assigned upon initial allocation. - */ -typedef struct PMADPageCache PMADPageCache; -struct PMADPageCache { - uint8_t dirty; // Has dpage cache already been copied to a new page with PROT_WRITE - uint16_t size; // Number of entries in queue - uint16_t head; // Index of front of queue - uint16_t tail; // Index of back of queue - uint64_t queue[]; // Cache of free dpages as queue; array of size PMA_DPAGE_CACHE_SIZE -}; - -/** - * Persistent Memory Arena/event snapshot metadata - */ -typedef struct PMAMetadata PMAMetadata; -struct PMAMetadata { - uint64_t magic_code; // Stamp identifying a file as a New Mars PMA file - uint32_t checksum; // Checksum value to detect corruption - uint32_t version; // Version of Vere (New Mars?) 
used to produce the backing file - uint64_t epoch; // Epoch ID of the most recently processed event - uint64_t event; // ID of the most recently processed event - uint64_t root; // Root after most recent event - void *arena_start; // Beginning of mapped address space - void *arena_end; // End of mapped address space (first address beyond mapped range) - PMASharedPageHeader *shared_pages[PMA_SHARED_BUCKETS]; // Shared allocation pages - PMADPageCache *dpage_cache; // Cache of free dpges as queue - uint64_t snapshot_size; // Size of the backing file - uint64_t next_offset; // Next open dpage in the backing file - uint8_t num_dirty_pages; // Counter of dirty page entries - uint64_t padding[2]; // sizeof(PMAMetadata) must be PMA_PAGE_SIZE - PMADirtyPageEntry dirty_pages[PMA_DIRTY_PAGE_LIMIT]; // Queue of changes not yet synced to page directory -}; -static_assert(sizeof(PMAMetadata) == PMA_PAGE_SIZE, "PMAMetadata must be a page in length"); - -/** - * Struct containing global data used by PMA - * - * Containment zone for what would otherwise be global variables. Global state - * stored in struct and passed around to functions that need it. Data that - * could otherwise go into the metadata, but is recomputable as derived state - * should go here. - */ -typedef struct PMAState PMAState; -struct PMAState { - PMAMetadata *metadata; // Metadata; contains current status of snapshot - uint64_t meta_page_offset; // Offset on disk of next metadata page to be replaced - PMAPageDir page_directory; // Page directory; maps virtual memory addresses to pages on disk - int snapshot_fd; // File descriptor for PMA backing file - int page_dir_fd; // File descriptor for page directory - PMASinglePageCache *free_pages; // Cache of free single pages - PMAPageRunCache *free_page_runs; // Cache of free multi-page runs -}; - - -//============================================================================== -// GLOBALS -//============================================================================== - -PMAState *_pma_state = NULL; - - -//============================================================================== -// FORWARD DECLARATIONS -//============================================================================== - -int _pma_verify_checksum(PMAMetadata *meta_page); -int _pma_sync_dirty_pages(int fd, uint8_t num_dirty_pages, PMADirtyPageEntry *dirty_pages); -int _pma_write_page_status(int fd, uint64_t index, PMAPageStatus status); -int _pma_write_page_offset(int fd, uint64_t index, uint64_t offset); -int _pma_update_free_pages(uint8_t num_dirty_pages, PMADirtyPageEntry *dirty_pages); -void *_pma_malloc_bytes(size_t size); -int _pma_malloc_shared_page(uint8_t bucket); -void *_pma_malloc_pages(size_t size); -void *_pma_malloc_single_page(PMAPageStatus status); -void *_pma_malloc_multi_pages(uint64_t num_pages); -void *_pma_get_cached_pages(uint64_t num_pages); -void *_pma_get_new_page(PMAPageStatus status); -void *_pma_get_new_pages(uint64_t num_pages); -int _pma_free_pages(void *address); -int _pma_free_bytes(void *address); -int _pma_copy_shared_page(void *address); -uint64_t _pma_get_single_dpage(void); -uint64_t _pma_get_cached_dpage(void); -int _pma_copy_dpage_cache(void); -uint64_t _pma_get_disk_dpage(void); -void _pma_copy_page(void *address, uint64_t offset, PMAPageStatus status, int fd); -void _pma_mark_page_dirty(uint64_t index, uint64_t offset, PMAPageStatus status, uint32_t num_pages); -int _pma_extend_snapshot_file(uint32_t multiplier); -void _pma_warning(const char *p, void *a, int l); -void 
_pma_state_free(void); -int _pma_state_malloc(void); - - -//============================================================================== -// PUBLIC FUNCTIONS -//============================================================================== - -// TODO: Replace errno codes with our own error codes - -// TODO: Inconsistent abort() calls; should better define when an error is fatal - -int -pma_init(const char *path) { - DIR *dir; - char *filepath; - PMAMetadata *meta_pages = 0; - void *page_dir = 0; - uint64_t meta_bytes; - int err; - int err_line; - int page_dir_fd = 0; - int snapshot_fd = 0; - - // Precompute metadata and page directory sizes in bytes - meta_bytes = 2 * PMA_PAGE_SIZE; - - // Allocate memory for state - if (_pma_state_malloc()) return -1; - - // - // Create backing files - // - - // Initialize dir and file path buffer - filepath = malloc( - strlen(path) + 1 + - strlen(PMA_DEFAULT_DIR_NAME) + 1 + - strlen(PMA_SNAPSHOT_FILENAME) + 1); - - // Create input directory, if necessary - dir = opendir(path); - if (dir == NULL) { - // Error if opening dir failed for reason other than it doesn't exist - if (ENOENT != errno) INIT_ERROR; - - // Error if creating dir failed - if (mkdir(path, PMA_DIR_PERMISSIONS)) INIT_ERROR; - } - - // Create file path for dir of backing files - sprintf(filepath, "%s/%s", path, PMA_DEFAULT_DIR_NAME); - - // Create dir for backing files, if necessary - dir = opendir(filepath); - if (dir == NULL) { - // Error if opening dir failed for reason other than it doesn't exist - if (ENOENT != errno) INIT_ERROR; - - // Error if creating dir failed - if (mkdir(filepath, PMA_DIR_PERMISSIONS)) INIT_ERROR; - } - - // Create backing file for snapshot - sprintf(filepath, "%s/%s/%s", path, PMA_DEFAULT_DIR_NAME, PMA_SNAPSHOT_FILENAME); - snapshot_fd = open(filepath, PMA_NEW_FILE_FLAGS, PMA_FILE_PERMISSIONS); - if (snapshot_fd == -1) INIT_ERROR; - - // Create backing file for page directory - sprintf(filepath, "%s/%s/%s", path, PMA_DEFAULT_DIR_NAME, PMA_PAGE_DIR_FILENAME); - page_dir_fd = open(filepath, PMA_NEW_FILE_FLAGS, PMA_FILE_PERMISSIONS); - if (page_dir_fd == -1) INIT_ERROR; - - // - // Set initial sizes for backing files - // - - // Set initial size of snapshot file - err = lseek(snapshot_fd, (PMA_INIT_SNAP_SIZE - 1), SEEK_SET); - if (err == -1) INIT_ERROR; - err = write(snapshot_fd, "", 1); - if (err != 1) INIT_ERROR; - - // Set initial size of page directory - err = lseek(page_dir_fd, (PMA_INIT_DIR_SIZE - 1), SEEK_SET); - if (err == -1) INIT_ERROR; - err = write(page_dir_fd, "", 1); - if (err != 1) INIT_ERROR; - - // - // Initialize snapshot and page directory - // - - /* - * The following links are useful for understanding the layout of virtual memory for a Linux process: - * https://www.sobyte.net/post/2022-08/linux-virtual-memory/ - * https://blog.holbertonschool.com/hack-the-virtual-memory-malloc-the-heap-the-program-break/ - * Chapters 2 & 3 - * - * Practically, on my machine, this translates to the following virtual memory layout: - * - ??? 
= 0x0000 0000 0000 - 0x0000 0000 ffff    64 KiB
- *    - empty = 0x0000 0001 0000 - 0x559f ffff ffff   ~85 TiB
- *    - data  = 0x55a0 0000 0000 - 0x560f ffff ffff   448 GiB
- *    - heap  = 0x5610 0000 0000 - 0x7f3f ffff ffff   ~41 TiB
- *    - libs  = 0x7f40 0000 0000 - 0x7f9f ffff ffff   384 GiB
- *    - stack = 0x7fa0 0000 0000 - 0x7ffb ffff ffff   368 GiB
- *    - vdso  = 0x7ffc 0000 0000 - 0x7fff ffff ffff    16 GiB
- * Note that these address ranges are rough approximations and the sizes are vastly larger for sections like 'data'
- * and 'vdso' than the actual memory section for the process because I'm documenting the range in which the section
- * can be found. Identical Linux processes will not have identical memory layouts due to Address Space Layout
- * Randomization.
- *
- * Without explicit arguments, calls to mmap will return addresses in the above 'stack' range, and successive calls
- * will grow down. I presume that this is due to the implementation of this proposal: https://lwn.net/Articles/91829/
- *
- * Given these circumstances, probably the easiest things to do are:
- *   1. mmap the snapshot to a low address (i.e. 0x1 0000) so that it can use all of the available space before the
- *      'data' section
- *   2. mmap the page directory using its maximum possible size (at least on Linux, it's okay to mmap a file to more
- *      pages than it actually occupies and have it grow into the space). Doing so eliminates the need to ever
- *      resize the mapping using mremap.
- *   3. mmap the page directory without a location hint. How big is this mmap? Given the above estimate of virtual
- *      memory available to the snapshot (85 TiB) and the ratio of snapshot size to page directory size (256:1), this
- *      mapping would be 340 GiB in size. Even assuming the kernel were not smart enough to work around the linked
- *      libs, this is still small enough to fit into the stack, according to the above memory section size estimates.
- */ - - // Init metadata pages - meta_pages = mmap( - NULL, - meta_bytes, - PROT_READ | PROT_WRITE, - MAP_SHARED, - snapshot_fd, - 0); - if (meta_pages == MAP_FAILED) INIT_ERROR; - - // Init page directory - page_dir = mmap( - NULL, - PMA_MAXIMUM_DIR_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED, - page_dir_fd, - 0); - if (page_dir == MAP_FAILED) INIT_ERROR; - - // Initialize simple metadata state - _pma_state->metadata->magic_code = PMA_MAGIC_CODE; - _pma_state->metadata->checksum = 0; - _pma_state->metadata->version = PMA_DATA_VERSION; - _pma_state->metadata->epoch = 0; - _pma_state->metadata->event = 0; - _pma_state->metadata->root = 0; - - // Initialize shared pages stacks - for(uint8_t i = 0; i < PMA_SHARED_BUCKETS; ++i) { - _pma_state->metadata->shared_pages[i] = NULL; - } - - // Initialize dirty page array - for(uint8_t i = 0; i < PMA_DIRTY_PAGE_LIMIT; ++i) { - _pma_state->metadata->dirty_pages[i].index = 0; - _pma_state->metadata->dirty_pages[i].offset = 0; - _pma_state->metadata->dirty_pages[i].num_pages = 0; - } - _pma_state->metadata->num_dirty_pages = 0; - - // Initialize snapshot page info - _pma_state->metadata->snapshot_size = PMA_INIT_SNAP_SIZE; - _pma_state->metadata->next_offset = meta_bytes + PMA_PAGE_SIZE; - - // Initialize arena start pointer - _pma_state->metadata->arena_start = (void *)PMA_SNAPSHOT_ADDR; - - // Manually allocate a page for the dpage cache - _pma_state->metadata->dpage_cache = mmap( - _pma_state->metadata->arena_start, - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - PMA_MMAP_FLAGS, - snapshot_fd, - meta_bytes); - if (_pma_state->metadata->dpage_cache == MAP_FAILED) INIT_ERROR; - - // Initialize arena end pointer - _pma_state->metadata->arena_end = ((char*)_pma_state->metadata->arena_start + PMA_PAGE_SIZE); - - // Setup initial dpage cache values - _pma_state->metadata->dpage_cache->dirty = 0; - _pma_state->metadata->dpage_cache->size = 0; - _pma_state->metadata->dpage_cache->head = 0; - _pma_state->metadata->dpage_cache->tail = 0; - - // - // Setup page directory - // - - _pma_state->page_directory.size = PMA_INIT_DIR_SIZE; - _pma_state->page_directory.next_index = 1; - _pma_state->page_directory.entries = (PMAPageDirEntry *)page_dir; - - // First page used by dpage cache - _pma_state->page_directory.entries[0].status = FIRST; - _pma_state->page_directory.entries[0].offset = meta_bytes; - - // - // Setup transient state - // - - // Replace the first metadata page, since they're identical - _pma_state->meta_page_offset = 0; - - // Initialize file descriptors - _pma_state->snapshot_fd = snapshot_fd; - _pma_state->page_dir_fd = page_dir_fd; - - // Initialize free page caches - _pma_state->free_pages = NULL; - _pma_state->free_page_runs = NULL; - - // - // Sync initial PMA state to disk - // - - // Sync dpage cache - err = msync( - _pma_state->metadata->dpage_cache, - PMA_PAGE_SIZE, - MS_SYNC); - if (err) INIT_ERROR; - - // Sync page directory - err = msync(_pma_state->page_directory.entries, PMA_PAGE_SIZE, MS_SYNC); - if (err) INIT_ERROR; - - // Compute checksum for metadata - _pma_state->metadata->checksum = crc_32((unsigned char*)_pma_state->metadata, PMA_PAGE_SIZE); - - // Copy and sync metadata to both buffers - memset(meta_pages, 0, meta_bytes); - memcpy(&meta_pages[0], _pma_state->metadata, PMA_PAGE_SIZE); - memcpy(&meta_pages[1], _pma_state->metadata, PMA_PAGE_SIZE); - if (msync(meta_pages, meta_bytes, MS_SYNC)) INIT_ERROR; - - // Remove PROT_WRITE permissions from snapshot and page directory - if (mprotect(meta_pages, meta_bytes, PROT_READ)) 
INIT_ERROR; - if (mprotect(_pma_state->metadata->dpage_cache, PMA_PAGE_SIZE, PROT_READ)) INIT_ERROR; - if (mprotect(page_dir, PMA_PAGE_SIZE, PROT_READ)) INIT_ERROR; - - // - // Done - // - - // Clean up - free(filepath); - munmap(meta_pages, meta_bytes); - - return 0; - -init_error: - fprintf(stderr, "(L%d) PMA initialization error: %s\n", err_line, strerror(errno)); - - if (meta_pages) munmap(meta_pages, meta_bytes); - if (page_dir) munmap(page_dir, PMA_INIT_DIR_SIZE); - if (snapshot_fd) close(snapshot_fd); - if (page_dir_fd) close(page_dir_fd); - free(filepath); - _pma_state_free(); - - return -1; -} - -PMARootState -pma_load(const char *path) { - PMAMetadata *newer_page; - PMAMetadata *older_page; - char *filepath; - void *address; - PMAMetadata *meta_pages = 0; - uint64_t index; - uint64_t meta_bytes; - int err; - int err_line; - int page_dir_fd = 0; - int snapshot_fd = 0; - - // Precompute metadata and page directory sizes in bytes - meta_bytes = 2 * PMA_PAGE_SIZE; - - // Allocate memory for state - if (_pma_state_malloc()) return (PMARootState){0}; - - // - // Open backing files - // - - // Initialize dir and file path buffer - filepath = malloc( - strlen(path) + 1 + - strlen(PMA_DEFAULT_DIR_NAME) + 1 + - strlen(PMA_SNAPSHOT_FILENAME) + 1); - - // Open backing file for snapshot - sprintf(filepath, "%s/%s/%s", path, PMA_DEFAULT_DIR_NAME, PMA_SNAPSHOT_FILENAME); - snapshot_fd = open(filepath, PMA_LOAD_FILE_FLAGS, PMA_FILE_PERMISSIONS); - if (snapshot_fd == -1) LOAD_ERROR; - _pma_state->snapshot_fd = snapshot_fd; - - // Open backing file for page directory - sprintf(filepath, "%s/%s/%s", path, PMA_DEFAULT_DIR_NAME, PMA_PAGE_DIR_FILENAME); - page_dir_fd = open(filepath, PMA_LOAD_FILE_FLAGS, PMA_FILE_PERMISSIONS); - if (page_dir_fd == -1) LOAD_ERROR; - _pma_state->page_dir_fd = page_dir_fd; - - // - // Verify file can be loaded - // - - // Read magic code - if (-1 == read(snapshot_fd, &_pma_state->metadata->magic_code, sizeof(uint64_t))) { - LOAD_ERROR; - } else if (_pma_state->metadata->magic_code != PMA_MAGIC_CODE) { - errno = EILSEQ; - LOAD_ERROR; - } - - // Read version - if (-1 == pread(snapshot_fd, &_pma_state->metadata->version, sizeof(uint32_t), 12)) { - LOAD_ERROR; - } else if (_pma_state->metadata->version != PMA_DATA_VERSION) { - // TODO: possibly upgrade - errno = EILSEQ; - LOAD_ERROR; - } - - // Load metadata pages - meta_pages = mmap( - NULL, - meta_bytes, - PROT_READ, - MAP_SHARED, - snapshot_fd, - 0); - if (meta_pages == MAP_FAILED) LOAD_ERROR; - - // Determine newer metadata page - newer_page = &meta_pages[0]; - older_page = &meta_pages[1]; - assert(newer_page->magic_code == PMA_MAGIC_CODE); assert(older_page->magic_code == PMA_MAGIC_CODE); - if ((newer_page->epoch < older_page->epoch) - || ((newer_page->epoch == older_page->epoch) - && (newer_page->event < older_page->event))) { - newer_page = &meta_pages[1]; - older_page = &meta_pages[0]; - } - - // Verify checksum for either page - if (!_pma_verify_checksum(newer_page)) { - if (_pma_verify_checksum(older_page)) { - newer_page = older_page; - } else { - errno = EILSEQ; - LOAD_ERROR; - } - } - - // Next page replaced is the older of the two pages - _pma_state->meta_page_offset = (newer_page == meta_pages) ? 
PMA_PAGE_SIZE : 0; - - // Update page directory using metadata dirty page list - err = _pma_sync_dirty_pages(page_dir_fd, _pma_state->metadata->num_dirty_pages, _pma_state->metadata->dirty_pages); - if (err) LOAD_ERROR; - - _pma_state->metadata->num_dirty_pages = 0; - - // - // Load page directory - // - - // mmap page directory - _pma_state->page_directory.entries = mmap( - NULL, - PMA_MAXIMUM_DIR_SIZE, - PROT_READ, - MAP_SHARED, - page_dir_fd, - 0); - if (_pma_state->page_directory.entries == MAP_FAILED) LOAD_ERROR; - - // - // Map pages and compute free page caches - // - - // get total number of indices - struct stat st; - fstat(page_dir_fd, &st); - _pma_state->page_directory.size = (st.st_size / sizeof(PMAPageDirEntry)) - 1; - - - index = 0; - while (index < _pma_state->page_directory.size) { - uint64_t count = 1; - - switch (_pma_state->page_directory.entries[index].status) { - case UNALLOCATED: - ++index; - continue; - - case FREE: - // While pages have FREE status AND are contiguous on disk, scan forward - ++index; - while ( - _pma_state->page_directory.entries[index].status == FREE && - _pma_state->page_directory.entries[index].offset == (_pma_state->page_directory.entries[index - 1].offset + PMA_PAGE_SIZE)) { - ++count; - ++index; - } - - // Add to appropriate free page cache - if (count == 1) { - PMASinglePageCache *free_page = malloc(sizeof *free_page); - - // Add it to the single-page cache - free_page->next = _pma_state->free_pages; - free_page->page = INDEX_TO_PTR(index - 1); - _pma_state->free_pages = free_page; - - } else { - PMAPageRunCache *page_run = malloc(sizeof *page_run); - - page_run->next = _pma_state->free_page_runs; - page_run->page = INDEX_TO_PTR(index - count); - page_run->length = count; - _pma_state->free_page_runs = page_run; - } - - // Map free pages (they're expected to be mapped but read only) - address = mmap( - INDEX_TO_PTR(index - count), - (PMA_PAGE_SIZE * count), - PROT_READ, - PMA_MMAP_FLAGS, - snapshot_fd, - _pma_state->page_directory.entries[index - count].offset); - - continue; - - case SHARED: - // Map immediately - address = mmap( - INDEX_TO_PTR(index), - PMA_PAGE_SIZE, - PROT_READ, - PMA_MMAP_FLAGS, - snapshot_fd, - _pma_state->page_directory.entries[index].offset); - if (address == MAP_FAILED) LOAD_ERROR; - - ++index; - - continue; - - case FIRST: - // While pages have FOLLOW status, scan forward - ++index; - while (_pma_state->page_directory.entries[index].status == FOLLOW) { - ++count; - ++index; - } - - // mmap entire block - address = mmap( - INDEX_TO_PTR(index - count), - (count * PMA_PAGE_SIZE), - PROT_READ, - PMA_MMAP_FLAGS, - snapshot_fd, - _pma_state->page_directory.entries[index - count].offset); - if (address == MAP_FAILED) LOAD_ERROR; - - continue; - - case FOLLOW: - // FOLLOW pages should be passed over correctly by FIRST case - default: - fprintf(stderr, "Index %" PRIu64 " invalid\n", index); - errno = EINVAL; - LOAD_ERROR; - } - } - - // Get next free index - _pma_state->page_directory.next_index = index; - - // - // Done - // - - // Clean up - munmap(meta_pages, meta_bytes); - free(filepath); - - return (PMARootState){ - .epoch = _pma_state->metadata->epoch, - .event = _pma_state->metadata->event, - .root = _pma_state->metadata->root, - }; - -load_error: - fprintf(stderr, "(L%d) Error loading PMA from %s: %s\n", err_line, path, strerror(errno)); - - if (meta_pages) munmap(meta_pages, meta_bytes); - if (_pma_state->page_directory.entries) { - munmap(_pma_state->page_directory.entries, PMA_MAXIMUM_DIR_SIZE); - } - if 
(_pma_state->metadata && _pma_state->metadata->arena_start) { - munmap(_pma_state->metadata->arena_start, - (uintptr_t)_pma_state->metadata->arena_end - - (uintptr_t)_pma_state->metadata->arena_start); - } - if (snapshot_fd > 0) close(snapshot_fd); - if (page_dir_fd > 0) close(page_dir_fd); - free(filepath); - _pma_state_free(); - - return (PMARootState){0}; -} - -int -pma_close(uint64_t epoch, uint64_t event, uint64_t root) { - // Sync changes to disk - if (pma_sync(epoch, event, root)) { - return -1; - } - - // Unmap page directory - munmap(_pma_state->page_directory.entries, PMA_MAXIMUM_DIR_SIZE); - - // Unmap snapshot - // XX should just be end - start? - munmap(_pma_state->metadata->arena_start, _pma_state->metadata->snapshot_size); - - // Close file descriptors - close(_pma_state->page_dir_fd); - close(_pma_state->snapshot_fd); - - // free pma state - _pma_state_free(); - - return 0; -} - -void * -pma_malloc(size_t size) { - void *result = NULL; - - /* MALLOC_LOCK */ - - if (!size) { - /* MALLOC_UNLOCK */ - return result; - } else if ((size + PMA_PAGE_SIZE) < size) { // Check for overflow - errno = ENOMEM; - } else if (size <= PMA_MAX_SHARED_ALLOC) { - result = _pma_malloc_bytes(size); - } else { - result = _pma_malloc_pages(size); - } - - /* MALLOC_UNLOCK */ - - return result; -} - -int -pma_free(void *address) { - uint64_t index; - - // TODO: This is legal for POSIX free, but would this ever happen for pma_free? - if (address == NULL) return 0; - - if (address < _pma_state->metadata->arena_start) { - WARNING("address too low to make sense"); - errno = EINVAL; - return -1; - } - if (address >= _pma_state->metadata->arena_end) { - WARNING("address too high to make sense"); - errno = EINVAL; - return -1; - } - - index = PTR_TO_INDEX(address); - switch (_pma_state->page_directory.entries[index].status) { - case UNALLOCATED: - // Something has definitely gone wrong if an address between arena_start - // and arena_end, with an index between 0 and next_free_index is - // unallocated - WARNING("address marked unallocated"); - errno = EINVAL; - return -1; - - case FREE: - WARNING("address already free"); - errno = EINVAL; - return -1; - - case SHARED: - return _pma_free_bytes(address); - - case FIRST: - return _pma_free_pages(address); - - case FOLLOW: - WARNING("address points to middle of multi-page allocation"); - errno = EINVAL; - return -1; - } - - return 0; -} - -int -pma_sync(uint64_t epoch, uint64_t event, uint64_t root) { - PMADPageCache *dpage_cache = _pma_state->metadata->dpage_cache; - ssize_t bytes_out; - int err; - int err_line; - - // Epoch & event may only increase - if ( - (epoch < _pma_state->metadata->epoch) || - ((epoch == _pma_state->metadata->epoch) && (event <= _pma_state->metadata->event))) { - errno = EINVAL; - return -1; - } - - // Clear dpage cache dirty bit and compute new size. This is the only place - // where the dpage cache active size should ever increase! 
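
Before the code below, a minimal standalone sketch of the wrap-around size computation it performs. CACHE_SIZE and queue_size are stand-in names (the real code uses PMA_DPAGE_CACHE_SIZE and works directly on the PMADPageCache fields); the point is that with uint16_t head/tail indices, tail - head underflows once the queue has wrapped, and adding the queue length back recovers the element count.

#include <assert.h>
#include <stdint.h>

#define CACHE_SIZE 511u   /* stands in for PMA_DPAGE_CACHE_SIZE */

static uint16_t queue_size(uint16_t head, uint16_t tail) {
  uint16_t size = (uint16_t)(tail - head);
  if (size > CACHE_SIZE) {   /* underflow: tail has already wrapped past head */
    size += CACHE_SIZE;      /* truncation back to uint16_t yields the count  */
  }
  return size;
}

int main(void) {
  assert(queue_size(3, 10) == 7);      /* no wrap: 7 entries                      */
  assert(queue_size(500, 4) == 15);    /* wrapped: slots 500..510 and 0..3 = 15   */
  return 0;
}
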
- if (dpage_cache->dirty) { - dpage_cache->dirty = 0; - dpage_cache->size = (dpage_cache->tail - dpage_cache->head); - if (dpage_cache->size > PMA_DPAGE_CACHE_SIZE) { - // Simple correction of integer underflow when queue wraps around - dpage_cache->size += PMA_DPAGE_CACHE_SIZE; - } - } - - // Sync dirty pages - for (uint8_t i = 0; i < _pma_state->metadata->num_dirty_pages; ++i) { - void *address = INDEX_TO_PTR(_pma_state->metadata->dirty_pages[i].index); - uint64_t bytes = (_pma_state->metadata->dirty_pages[i].num_pages * PMA_PAGE_SIZE); - - // Clear dirty bit for shared pages - if (_pma_state->metadata->dirty_pages[i].status == SHARED) { - ((PMASharedPageHeader*)address)->dirty = 0; - } - - err = msync(address, bytes, MS_SYNC); - if (err) SYNC_ERROR; - - if (mprotect(address, bytes, PROT_READ)) SYNC_ERROR; - } - - // Compute checksum - _pma_state->metadata->epoch = epoch; - _pma_state->metadata->event = event; - _pma_state->metadata->root = root; - _pma_state->metadata->checksum = 0; - _pma_state->metadata->checksum - = crc_32((unsigned char *)_pma_state->metadata, PMA_PAGE_SIZE); - - // Sync metadata - // - // Note: It's a long-standing Unix convention that while both write and - // pwrite return the number of bytes written, when operating on a file - // (as opposed to a pipe or socket) it is assumed that the entire - // buffer will be written. If this isn't the case, an error has - // occurred. - bytes_out = pwrite( - _pma_state->snapshot_fd, - _pma_state->metadata, - PMA_PAGE_SIZE, - _pma_state->meta_page_offset); - if (bytes_out != PMA_PAGE_SIZE) SYNC_ERROR; - - _pma_state->meta_page_offset = _pma_state->meta_page_offset ? 0 : PMA_PAGE_SIZE; - - // Sync dirty pages in page directory - err = _pma_sync_dirty_pages( - _pma_state->page_dir_fd, - _pma_state->metadata->num_dirty_pages, - _pma_state->metadata->dirty_pages); - if (err) SYNC_ERROR; - - // Update free page caches - err = _pma_update_free_pages(_pma_state->metadata->num_dirty_pages, _pma_state->metadata->dirty_pages); - if (err) SYNC_ERROR; - - // Reset dirty page array - _pma_state->metadata->num_dirty_pages = 0; - - return 0; - -sync_error: - fprintf(stderr, "(L%d) Error syncing PMA: %s\n", err_line, strerror(errno)); - - return -1; -} - -bool -pma_in_arena(void *address) { - return (address >= _pma_state->metadata->arena_start) - && (address < _pma_state->metadata->arena_end); -} - -//============================================================================== -// PRIVATE FUNCTIONS -//============================================================================== - -/** - * Verify that the checksum of a metadata page is valid - * - * Corruption or malicious interference is rare, so we assume that the checksum - * is correct and copy it into the global state in advance, then confirm its - * correctness there. - * - * @param meta_page Pointer to a metadata page loaded from disk - * - * @return Boolean (as int) for whether checksums match or not - */ -int -_pma_verify_checksum(PMAMetadata *meta_page) { - uint32_t checksum; - - // Copy metadata in advance of using it, since: 1) we expect the checksum to - // be valid; 2) we need to set the value of the checksum in the metadata to 0. - memcpy(_pma_state->metadata, meta_page, PMA_PAGE_SIZE); - - // Since we're computing the checksum on the object which itself includes the - // checksum, we treat the checksum as 0. 
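
A minimal standalone sketch of the zero-then-hash round trip described in the comment above. The Page struct and sum32() are stand-ins rather than PMAMetadata and crc_32(); only the pattern is the same: the stored checksum is computed over the page with its own checksum field zeroed, so verification must zero that field again before recomputing.

#include <assert.h>
#include <stdint.h>
#include <string.h>

typedef struct {
  uint64_t magic;
  uint32_t checksum;
  uint32_t version;
  uint64_t payload[13];
} Page;                                   /* stand-in, not PMAMetadata */

static uint32_t sum32(const unsigned char *buf, size_t len) {
  uint32_t s = 0;                         /* toy checksum, not crc_32  */
  for (size_t i = 0; i < len; i++) s = (s * 31) + buf[i];
  return s;
}

int main(void) {
  Page on_disk = { .magic = 0xBADDECAFC0FFEE00ULL, .version = 1, .payload = {7} };

  /* What sync does: zero the field, hash the whole page, store the result. */
  on_disk.checksum = 0;
  on_disk.checksum = sum32((unsigned char *)&on_disk, sizeof on_disk);

  /* What verification does: copy, zero the field again, recompute, compare. */
  Page scratch;
  memcpy(&scratch, &on_disk, sizeof scratch);
  scratch.checksum = 0;
  assert(sum32((unsigned char *)&scratch, sizeof scratch) == on_disk.checksum);
  return 0;
}
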
- _pma_state->metadata->checksum = 0; - - // Compute checksum - checksum = crc_32((unsigned char *)_pma_state->metadata, PMA_PAGE_SIZE); - - // Compare checksums - return (checksum == meta_page->checksum); -} - -/** - * Sync updates from the dirty page cache in the metadata page to the page - * directory - * - * This sync is technically the *first* step of a new event, since the page - * directory for a snapshot is not valid until all of the changes from the dirty - * page cache have been applied. The PMA can crash at any moment, therefore - * applying the changes to the page directory from the previous event is - * required before processing a new one. Note that applying these changes to the - * page directory is an idempotent operation - which is good since we could - * theoretically crash on the same event repeatedly. - * - * @param fd Page directory file descriptor - * @param num_dirty_pages Size of dirty page cache - * @param dirty_pages Dirty page cache as array - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_sync_dirty_pages(int fd, uint8_t num_dirty_pages, PMADirtyPageEntry *dirty_pages) { - PMAPageStatus cont_status; - uint64_t init_offset; - uint64_t index; - - for (uint8_t i = 0; i < num_dirty_pages; ++i) { - cont_status = (dirty_pages[i].status == FIRST) ? FOLLOW : dirty_pages[i].status; - init_offset = dirty_pages[i].offset; - index = dirty_pages[i].index; - - if (_pma_write_page_status(fd, index, dirty_pages[i].status)) return -1; - // Offset of 0 is code for "leave it alone" - if (init_offset) { - if (_pma_write_page_offset(fd, index, init_offset)) return -1; - } - - // The offset on disk doesn't actually matter for the continuation pages of - // a multi-page allocation, but it does matter for free page runs: just - // because two page runs are contiguous in memory, it doesn't mean they are - // contiguous on disk. An order of events like: - // - // [multi-page allocation] -> [shared-page allocation] -> [multi-page allocation] - // - // could produce a situation where the two multi-page allocations are - // adjacent in memory, but separated by one page on disk (because of - // copy-on-write using a new dpage during the shared page allocation). 
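
A small sketch of the byte offsets targeted by the pwrite() calls in _pma_write_page_status() and _pma_write_page_offset() further below, assuming the usual C layout in which the enum status field starts right after the 8-byte offset field. DirEntry is a stand-in type mirroring PMAPageDirEntry.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

enum Status { UNALLOCATED, FREE, SHARED, FIRST, FOLLOW };
struct DirEntry { uint64_t offset; enum Status status; };   /* mirrors PMAPageDirEntry */

int main(void) {
  uint64_t index = 42;

  /* Byte positions the two pwrite() calls would use for entry 42. */
  size_t offset_pos = index * sizeof(struct DirEntry);
  size_t status_pos = index * sizeof(struct DirEntry) + sizeof(uint64_t);

  /* status_pos lands on the status field only because the uint64_t offset
   * field comes first and is 8 bytes wide. */
  assert(status_pos - offset_pos == offsetof(struct DirEntry, status));
  return 0;
}
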
- for (uint32_t j = 1; j < dirty_pages[i].num_pages; ++j) { - assert((dirty_pages[i].status == FIRST) || (cont_status == FREE)); - - if (_pma_write_page_status(fd, (index + j), cont_status)) return -1; - // Offset of 0 is code for "leave it alone" - if (init_offset) { - if (_pma_write_page_offset(fd, (index + j), (init_offset + (j * PMA_PAGE_SIZE)))) return -1; - } - } - } - - return 0; -} - -/** - * Update page status of entry in page directory - * - * @param fd Page directory file descriptor - * @param index Directory index of entry - * @param status Page status - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_write_page_status(int fd, uint64_t index, PMAPageStatus status) { - ssize_t bytes_out; - - bytes_out = pwrite( - fd, - (const void *)&status, - sizeof(PMAPageStatus), - ((index * sizeof(PMAPageDirEntry)) + sizeof(uint64_t))); - if (bytes_out < 1) { - return -1; - } - - return 0; -} - -/** - * Update page offset of entry in page directory - * - * @param fd Page directory file descriptor - * @param index Directory index of entry - * @param offset Page offset on disk - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_write_page_offset(int fd, uint64_t index, uint64_t offset) { - ssize_t bytes_out; - - bytes_out = pwrite( - fd, - (const void *)&offset, - sizeof(uint64_t), - (index * sizeof(PMAPageDirEntry))); - if (bytes_out < 1) { - return -1; - } - - return 0; -} - -/** - * Add newly freed pages and page runs to the free page caches - * - * @param num_dirty_pages Size of dirty page cache - * @param dirty_pages Dirty page cache as array - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_update_free_pages(uint8_t num_dirty_pages, PMADirtyPageEntry *dirty_pages) { - PMASinglePageCache *free_page; - PMAPageRunCache *page_run; - - // TODO: Pull out common code between here and pma_load - for (uint8_t i = 0; i < num_dirty_pages; ++i) { - if (dirty_pages[i].status != FREE) continue; - - if (dirty_pages[i].num_pages > 1) { - page_run = malloc(sizeof *page_run); - if (page_run == NULL) return -1; - - page_run->next = _pma_state->free_page_runs; - page_run->page = INDEX_TO_PTR(dirty_pages[i].index); - page_run->length = dirty_pages[i].num_pages; - _pma_state->free_page_runs = page_run; - - } else { - free_page = malloc(sizeof *free_page); - if (free_page == NULL) return -1; - - free_page->next = _pma_state->free_pages; - free_page->page = INDEX_TO_PTR(dirty_pages[i].index); - _pma_state->free_pages = free_page; - } - } - - return 0; -} - -/** - * Allocate memory within a shared allocation page. 
- * - * @param size Size in bytes to allocate (must be <= 1/4 page) - * - * @return NULL failure; errno set to error code - * @return void* address of the newly allocated memory - */ -void * -_pma_malloc_bytes(size_t size) -{ - PMASharedPageHeader *shared_page; - uint16_t i, slot_size; - uint8_t bucket, byte, bit; - - assert(size <= PMA_MAX_SHARED_ALLOC); - - // Don't bother with anything less than the minimum allocation size - if (size < PMA_MIN_ALLOC_SIZE) { - size = PMA_MIN_ALLOC_SIZE; - } - - // Find the right bucket - bucket = 1; - if (size) { - i = size - 1; - while (i >>= 1) bucket++; - } - slot_size = (1 << bucket); - bucket = bucket - PMA_MIN_ALLOC_SHIFT; - - // Search for a shared page with open slots - shared_page = _pma_state->metadata->shared_pages[bucket]; - while ((shared_page != NULL) && (shared_page->free == 0)) { - shared_page = shared_page->next; - } - - // Make a new shared page if necessary - if (shared_page == NULL) { - if (_pma_malloc_shared_page(bucket)) { - return NULL; - } - - shared_page = _pma_state->metadata->shared_pages[bucket]; - - } else { - if (_pma_copy_shared_page(shared_page)) { - return NULL; - } - } - - assert(shared_page->free); - - // Find first empty slot using bitmap (1 = empty, 0 = full) - byte = 0; - while (shared_page->bits[byte] == 0) { - assert(byte < PMA_BITMAP_SIZE); - ++byte; - } - i = shared_page->bits[byte]; - bit = 0; - while (~i & 1U) { - i >>= 1; - ++bit; - } - - // Mark slot full - shared_page->bits[byte] -= (1 << bit); - --(shared_page->free); - - // Return slot - return (char *)shared_page + - (sizeof(PMASharedPageHeader)) + - (slot_size * ((PMA_BITMAP_BITS * byte) + bit)); -} - -/** - * Allocate a new shared allocation page. - * - * @param bucket Into which bucket in the shared allocation pages array the new - * page will go (which also corresponds to the size of the slots - * in the page) - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_malloc_shared_page(uint8_t bucket) -{ - PMASharedPageHeader *shared_page; - uint8_t shift; - - assert(bucket <= PMA_SHARED_BUCKETS); - - // Get a new writeable page - shared_page = (PMASharedPageHeader *)_pma_malloc_single_page(SHARED); - if (shared_page == NULL) { - return -1; - } - - // Compute shift - shift = bucket + PMA_MIN_ALLOC_SHIFT; - - // Initialize header for shared page - shared_page->dirty = 1; - shared_page->size = shift; - shared_page->free = ((PMA_PAGE_SIZE - sizeof(PMASharedPageHeader)) / (1 << shift)); - for (uint8_t i = 0; i < PMA_BITMAP_SIZE; ++i) { - shared_page->bits[i] = PMA_EMPTY_BITMAP; - } - - // Add new shared page to top of stack - shared_page->next = _pma_state->metadata->shared_pages[bucket]; - _pma_state->metadata->shared_pages[bucket] = shared_page; - - return 0; -} - -/** - * Allocate memory for a large object in one or more pages. - * - * @param size Size in bytes to allocate (must be > 1/4 page) - * - * @return NULL failure; errno set to error code - * @return void* address of the newly allocated memory - */ -void * -_pma_malloc_pages(size_t size) -{ - void *address; - uint64_t num_pages; - - assert(size > PMA_MAX_SHARED_ALLOC); - - // Round size up to nearest page boundary - size = PAGE_ROUND_UP(size); - num_pages = size >> PMA_PAGE_SHIFT; - - if (num_pages == 1) { - address = _pma_malloc_single_page(FIRST); - } else { - address = _pma_malloc_multi_pages(num_pages); - } - - return address; -} - -/** - * Allocate a single new page - * - * Reuse pages from the free page cache, if any are available. 
These pages are - * used for shared allocations and for "large" allocations that are between 1/4 - * and 1 page in size: (0.25, 1]. - * - * @param status Page status after allocation (SHARED or FIRST) - * - * @return NULL failure; errno set to error code - * @return void* address of the newly allocated memory - */ -void * -_pma_malloc_single_page(PMAPageStatus status) { - void *address; - PMASinglePageCache *free_page = _pma_state->free_pages; - - // Get an existing free page from cache, if available - if (free_page != NULL) { - address = free_page->page; - _pma_state->free_pages = free_page->next; - free(free_page); - - // Make the page writeable - mprotect(address, PMA_PAGE_SIZE, (PROT_READ | PROT_WRITE)); - - // Add page to dirty list - _pma_mark_page_dirty(PTR_TO_INDEX(address), 0, status, 1); - } else { - // Otherwise, allocate a new page - address = _pma_get_new_page(status); - } - - assert((((uint64_t)address) % PMA_PAGE_SIZE) == 0); - - return address; -} - -/** - * Allocate a contiguous block of multiple pages - * - * Reuse pages from the free page run cache, if any are available. - * - * @param num_pages # pages to allocate - * - * @return NULL failure; errno set to error code - * @return void* address of the newly allocated memory - */ -void * -_pma_malloc_multi_pages(uint64_t num_pages) { - void *address; - - address = _pma_get_cached_pages(num_pages); - if (!address) { - address = _pma_get_new_pages(num_pages); - } - - return address; -} - -/** - * Pull existing free pages from the free page run cache - * - * Does a pass over the entire cache to see if there is an exactly-sized page - * run. If so, it's used immediately. Otherwise, keeps track of the smallest - * page run that can be split to accommodate the requested allocation. - * - * @param num_pages # pages to allocate - * - * @return void* address of the newly allocated memory (NULL if none available) - */ -void * -_pma_get_cached_pages(uint64_t num_pages) { - PMAPageRunCache **pre_valid_ptr = NULL; - PMAPageRunCache **prev_node_ptr = &(_pma_state->free_page_runs); - PMAPageRunCache *page_run_cache = _pma_state->free_page_runs; - PMAPageRunCache *valid_page_run = NULL; - void *address = NULL; - - // Do a pass looking for an exactly-sized run. While doing this, also record the smallest run still big enough to fit - // our data. - while (page_run_cache != NULL) { - uint64_t run_length = page_run_cache->length; - - if (run_length == num_pages) { - valid_page_run = page_run_cache; - pre_valid_ptr = prev_node_ptr; - break; - - } else if (run_length > num_pages ) { - if ((valid_page_run == NULL) || (valid_page_run->length > run_length)) { - valid_page_run = page_run_cache; - pre_valid_ptr = prev_node_ptr; - } - } - - prev_node_ptr = &(page_run_cache->next); - page_run_cache = page_run_cache->next; - } - - // If run found... - if (valid_page_run != NULL) { - // Use it - address = valid_page_run->page; - - // If run larger than necessary by two pages... - if (valid_page_run->length > (num_pages + 1)) { - // Reduce it - valid_page_run->page = (uint8_t*)valid_page_run->page + (num_pages * PMA_PAGE_SIZE); - valid_page_run->length -= num_pages; - - // Otherwise... - } else { - // Update cache pointers: we're going to use the whole run or we're going - // to move the remaining page to the single-page cache. Either way, we're - // going to free the run object. - *pre_valid_ptr = valid_page_run->next; - - // If there's a page left... 
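        // (At this point valid_page_run->length is either num_pages or
        //  num_pages + 1: any strictly larger run was shrunk in place above,
        //  so at most one trailing page can remain, and it is pushed onto the
        //  single-page cache below.)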
- if (valid_page_run->length == (num_pages + 1)) { - PMASinglePageCache *trailing_page = (PMASinglePageCache *)malloc(sizeof(PMASinglePageCache)); - - // Add it to the single-page cache - trailing_page->next = _pma_state->free_pages; - trailing_page->page = ((char *)address + (num_pages * PMA_PAGE_SIZE)); - _pma_state->free_pages = trailing_page; - } - - free(valid_page_run); - } - - // Make pages writeable - mprotect(address, (num_pages * PMA_PAGE_SIZE), (PROT_READ | PROT_WRITE)); - - // Add pages to dirty list - _pma_mark_page_dirty(PTR_TO_INDEX(address), 0, FIRST, num_pages); - } - - return address; -} - -/** - * Allocate a single new page - * - * Allocates a new page in virtual memory. May or may not use a new dpage. - * - * @param status Page status after allocation (SHARED or FIRST) - * - * @return NULL failure; errno set to error code - * @return void* address of the newly allocated memory - */ -void * -_pma_get_new_page(PMAPageStatus status) { - void *address; - uint64_t offset; - - // Get a dpage to which to map the address - offset = _pma_get_single_dpage(); - if (!offset) { - return NULL; - } - - // Try to map next open memory address to dpage - address = mmap( - _pma_state->metadata->arena_end, - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - PMA_MMAP_FLAGS, - _pma_state->snapshot_fd, - offset); - if (address == MAP_FAILED) { - address = _pma_state->metadata->arena_end; - WARNING("mmap failed"); - abort(); - } - - assert(address == _pma_state->metadata->arena_end); - - // Record PMA expansion - _pma_state->metadata->arena_end = (uint8_t*)_pma_state->metadata->arena_end + PMA_PAGE_SIZE; - - // Add page to dirty list - _pma_mark_page_dirty(PTR_TO_INDEX(address), offset, status, 1); - - return address; -} - -/** - * Allocate multiple new pages - * - * Allocate 2 or more pages in virtual memory. May or may not use new dpages. - * - * @param num_pages # pages to allocate - * - * @return NULL failure; errno set to error code - * @return void* address of the newly allocated memory - */ -void * -_pma_get_new_pages(uint64_t num_pages) { - void *address; - uint64_t bytes = (num_pages * PMA_PAGE_SIZE); - uint64_t offset = _pma_state->metadata->next_offset; - uint64_t size = _pma_state->metadata->snapshot_size; - uint64_t new_size = (offset + bytes); - - // Get new dpages. Extend snapshot backing file first, if necessary. 
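
A quick standalone check of the resize-multiplier arithmetic used just below when a multi-page allocation overruns the current snapshot size. INC stands in for PMA_SNAPSHOT_RESIZE_INC (0x100000000, i.e. 4 GiB, per the header); the claim being exercised is that ((new_size - size) / INC) + 1 increments are always enough to cover new_size.

#include <assert.h>
#include <stdint.h>

#define INC 0x100000000ULL   /* stand-in for PMA_SNAPSHOT_RESIZE_INC (4 GiB) */

int main(void) {
  uint64_t size = 0x40000000ULL;                 /* current file size, 1 GiB */
  uint64_t new_size = size + 10 * INC + 123;     /* hypothetical requirement */

  uint32_t multiplier = (uint32_t)(((new_size - size) / INC) + 1);

  assert(multiplier == 11);                               /* 10 full increments + 1 */
  assert(size + (uint64_t)multiplier * INC >= new_size);  /* new size is covered    */
  return 0;
}
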
- if (new_size >= size) { - // Multi-page allocations maybe larger than the snapshot resize increment - uint32_t multiplier = ((new_size - size) / PMA_SNAPSHOT_RESIZE_INC) + 1; - - // Fail if snapshot file couldn't be extended - if (_pma_extend_snapshot_file(multiplier)) return NULL; - } - - // Try to map dpages to address - address = mmap( - _pma_state->metadata->arena_end, - bytes, - PROT_READ | PROT_WRITE, - PMA_MMAP_FLAGS, - _pma_state->snapshot_fd, - offset); - if (address == MAP_FAILED) { - address = _pma_state->metadata->arena_end; - WARNING("mmap failed"); - abort(); - } - - assert(address == _pma_state->metadata->arena_end); - - // Update offset of next open dpage - _pma_state->metadata->next_offset += bytes; - _pma_state->metadata->arena_end = (uint8_t*)_pma_state->metadata->arena_end + bytes; - - // Add allocated pages to dirty list - _pma_mark_page_dirty(PTR_TO_INDEX(address), offset, FIRST, num_pages); - - return address; -} - -/** - * Deallocate one or more pages of allocated memory - * - * @param address Address of block to deallocated - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_free_pages(void *address) { - uint32_t index = PTR_TO_INDEX(address); - uint32_t num_pages = 0; - - if ((uint64_t)address & PMA_PAGE_MASK) { - WARNING("address does not point to the root of a page"); - errno = EINVAL; - return -1; - } - - assert(_pma_state->page_directory.entries[index].status == FIRST); - - // Count number of pages in allocation - do { - ++num_pages; - } while (_pma_state->page_directory.entries[index + num_pages].status == FOLLOW); - - // Mark pages dirty - _pma_mark_page_dirty(index, 0, FREE, num_pages); - - return 0; -} - -/** - * Deallocate a block of memory in a shared allocation page. - * - * @param address Address of block to deallocated - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_free_bytes(void *address) { - PMASharedPageHeader *header = (PMASharedPageHeader *)((uint64_t)address & (~PMA_PAGE_MASK)); - uint8_t slot = ((((uint64_t)address & PMA_PAGE_MASK) - sizeof(PMASharedPageHeader)) / (1 << header->size)); - uint8_t byte = slot / PMA_BITMAP_BITS; - uint8_t bit = slot % PMA_BITMAP_BITS; - - // Copy-on-write - _pma_copy_shared_page(header); - - if (header->bits[byte] & (1 << bit)) { - WARNING("bucketized address already free"); - errno = EINVAL; - return -1; - } - - header->bits[byte] += (1 << bit); - ++header->free; - - return 0; -} - -/** - * Copy a shared allocation page - * - * @param address Virtual memory address of shared allocation page - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_copy_shared_page(void *address) { - PMASharedPageHeader *shared_page; - uint64_t offset; - - // Check if page has already been copied - shared_page = (PMASharedPageHeader*)address; - if (shared_page->dirty) { - return 0; - } - - offset = _pma_get_single_dpage(); - if (!offset) { - return -1; - } - - // Make sure dpage cache is writeable - if (!_pma_state->metadata->dpage_cache->dirty) { - if (_pma_copy_dpage_cache()) { - WARNING("dpage cache copy failed"); - abort(); - } - } - - // Copy page - _pma_copy_page(address, offset, SHARED, _pma_state->snapshot_fd); - - // Mark page dirty so it isn't copied again - shared_page->dirty = 1; - - return 0; -} - -/** - * Allocate a new dpage (disk page) - * - * Reuse a page from the free dpage cache, if any are available. 
- * - * @return 0 failure; errno set to error code - * @return uint64_t offset of new page in backing file - */ -uint64_t -_pma_get_single_dpage(void) { - uint64_t offset; - - // Get a cached dpage, if one is available - offset = _pma_get_cached_dpage(); - if (!offset) { - // Otherwise, get a new dpage from disk - // - // XX returns 0 on failure, should assert - offset = _pma_get_disk_dpage(); - } - - assert((offset % PMA_PAGE_SIZE) == 0); - - return offset; -} - -/** - * Pull a free dpage from the dpage cache - * - * @return offset of new page in backing file (0 if cache empty) - */ -uint64_t -_pma_get_cached_dpage(void) { - uint64_t offset; - uint16_t dirty = _pma_state->metadata->dpage_cache->dirty; - uint16_t size = _pma_state->metadata->dpage_cache->size; - uint16_t head; - - // If the cache is empty, or there's only one page in the cache and the cache - // hasn't been touched yet, then exit early. If the cache hasn't been touched - // yet, we'll need to copy-on-write the cache as well, so if there's only one - // page, don't even bother. - if ((size == 0) || ((size == 1) && !dirty)) { - return 0; - } - - // Special copy-on-write for dpage cache - if (!dirty) { - if (_pma_copy_dpage_cache()) { - void *address = _pma_state->metadata->dpage_cache; - WARNING(strerror(errno)); - abort(); - } - } - - // TODO: macros for dealing with cache? - // Pop page off queue; head can't be assigned earlier as _pma_copy_dpage_cache - // may also try to pop a page off of the queue - head = _pma_state->metadata->dpage_cache->head; - offset = _pma_state->metadata->dpage_cache->queue[head]; - assert(offset != 0); - _pma_state->metadata->dpage_cache->size -= 1; - _pma_state->metadata->dpage_cache->head = ((head + 1) % PMA_DPAGE_CACHE_SIZE); - - return offset; -} - -/** - * Copy the free dpage cache - * - * Free dpage cache needs to be copied using copy-on-write semantics when pages - * are added or removed. - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_copy_dpage_cache(void) { - void *address; - uint64_t offset; - uint16_t dirty = _pma_state->metadata->dpage_cache->dirty; - uint16_t size = _pma_state->metadata->dpage_cache->size; - uint16_t head = _pma_state->metadata->dpage_cache->head; - - assert(!dirty); - - address = _pma_state->metadata->dpage_cache; - - // If pages available in cache... - if (size) { - // Use a page from the cache and record that it was used afterwards - offset = _pma_state->metadata->dpage_cache->queue[head]; - assert(offset != 0); - - _pma_copy_page(address, offset, FIRST, _pma_state->snapshot_fd); - - _pma_state->metadata->dpage_cache->size -= 1; - _pma_state->metadata->dpage_cache->head = ((head + 1) % PMA_DPAGE_CACHE_SIZE); - - } else { - // Otherwise, get a brand new page from disk - offset = _pma_get_disk_dpage(); - if (!offset) return -1; - - _pma_copy_page(address, offset, FIRST, _pma_state->snapshot_fd); - } - - // Mark dpage cache dirty (aka writeable) - _pma_state->metadata->dpage_cache->dirty = 1; - - return 0; -} - -/** - * Get a new free dpage on disk - * - * May require extending the snapshot backing file on disk. - * - * @return offset of new page in backing file (0 if failure) - */ -uint64_t -_pma_get_disk_dpage(void) { - uint64_t offset = _pma_state->metadata->next_offset; - uint64_t size = _pma_state->metadata->snapshot_size; - - // Get a new dpage. Extend snapshot backing file first, if necessary. 
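
A standalone sketch of the "seek past the end and write one byte" idiom this file relies on whenever a backing file has to grow (pma_init() above and _pma_extend_snapshot_file() below do exactly this). The path used here is hypothetical; the sketch only demonstrates that the lseek()/write() pair materializes the new file size.

#include <assert.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void) {
  const char *path = "/tmp/pma_grow_demo.bin";       /* hypothetical path */
  int fd = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
  assert(fd != -1);

  off_t target = 1 << 20;                            /* grow to 1 MiB          */
  assert(lseek(fd, target - 1, SEEK_SET) != -1);     /* seek past current EOF  */
  assert(write(fd, "", 1) == 1);                     /* materialize the size   */

  struct stat st;
  assert(fstat(fd, &st) == 0 && st.st_size == target);

  close(fd);
  unlink(path);
  return 0;
}
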
- if (offset == size) { - // Fail if snapshot file couldn't be extended - if (_pma_extend_snapshot_file(1)) return 0; - } - - // Update offset of next open dpage - _pma_state->metadata->next_offset += PMA_PAGE_SIZE; - - return offset; -} - -/** - * Copy an existing page to a new dpage - * - * Core copy-on-write implementation. - * - * @param address Virtual memory address of existing page - * @param offset Offset of dpage in backing file to which to copy - * @param status Page status after copy (SHARED or FIRST) - * @param fd PMA file descriptor - */ -void -_pma_copy_page(void *address, uint64_t offset, PMAPageStatus status, int fd) { - void *new_address; - ssize_t bytes_out; - uint64_t index = PTR_TO_INDEX(address); - uint16_t tail = _pma_state->metadata->dpage_cache->tail; - - bytes_out = pwrite(fd, address, PMA_PAGE_SIZE, offset); - if (bytes_out != PMA_PAGE_SIZE) { - WARNING(strerror(errno)); - abort(); - } - - new_address = mmap( - address, - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - offset); - if (new_address == MAP_FAILED) { - WARNING(strerror(errno)); - abort(); - } - - assert(new_address == address); - - // Add previous dpage to cache - // Note: the dpage cache should always be writeable here, either because the - // dpage cache is the page we just copied, or because it was made - // writeable in advance by _pma_copy_shared_page - assert(_pma_state->page_directory.entries[index].offset != 0); - _pma_state->metadata->dpage_cache->queue[tail] = _pma_state->page_directory.entries[index].offset; - _pma_state->metadata->dpage_cache->tail = ((tail + 1) % PMA_DPAGE_CACHE_SIZE); - - // Add page to dirty page list - _pma_mark_page_dirty(index, offset, status, 1); -} - -/** - * Add entry to the dirty page store - * - * @param index Index of page in page directory - * @param offset Offset of page in PMA file - * @param status Status of pages - * @param num_pages Number of pages represented by this entry - */ -void -_pma_mark_page_dirty(uint64_t index, uint64_t offset, PMAPageStatus status, uint32_t num_pages) { - PMADirtyPageEntry *dirty_page = _pma_state->metadata->dirty_pages; - - dirty_page += _pma_state->metadata->num_dirty_pages++; - - assert(_pma_state->metadata->num_dirty_pages <= PMA_DIRTY_PAGE_LIMIT); - - dirty_page->index = index; - dirty_page->offset = offset; - dirty_page->status = status; - dirty_page->num_pages = num_pages; -} - -/** - * Extend the size of the PMA backing file on disk - * - * Note: while it's possible that a multiplier larger than 2^32 could be valid - * (i.e. using ZFS is the file system, so the backing file can be up to - * 16 EiB in size, and the PMA backing file extension increment is less - * than 4 GiB), it almost certainly would never be encountered (the user - * needs to allocate a 2 EiB file to the loom?). 
- * - * @param multiplier New size = old size * multiplier - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_extend_snapshot_file(uint32_t multiplier) { - off_t err; - ssize_t bytes; - uint64_t new_snapshot_size; - - // Reject invalid multipliers - if (!multiplier || (multiplier > PMA_MAX_RESIZE_FACTOR)) return -1; - - // Update size in metadata - new_snapshot_size = _pma_state->metadata->snapshot_size + (multiplier * PMA_SNAPSHOT_RESIZE_INC); - - // Extend snapshot file - err = lseek(_pma_state->snapshot_fd, (new_snapshot_size - 1), SEEK_SET); - if (err == -1) return -1; - - bytes = write(_pma_state->snapshot_fd, "", 1); - if (bytes < 1) return -1; - - _pma_state->metadata->snapshot_size = new_snapshot_size; - return 0; -} - -/** - * Log warning message to console. - * - * @param s Error message - * @param p Address which caused error - * @param l Line number - */ -void -_pma_warning(const char *s, void *p, int l) { - fprintf(stderr, "*** %d: %p - %s\n", l, p, s); -} - -/** - * Helper function to deallocate PMA state on shutdown. - */ -void -_pma_state_free(void) -{ - if (_pma_state) { - if (_pma_state->metadata) free(_pma_state->metadata); - free(_pma_state); - _pma_state = NULL; - } -} - -/** - * Helper function to allocate memory for PMA state. - * - * @return 1 allocated PMA state already exists - * @return 0 memory for new PMA state successfully allocated - */ -int -_pma_state_malloc(void) -{ - if (_pma_state != NULL) return 1; - PMAState *ret = calloc(1, sizeof *ret); - ret->metadata = calloc(1, sizeof *ret->metadata); - _pma_state = ret; - return 0; -} diff --git a/rust/ares/src/pma/malloc.h b/rust/ares/src/pma/malloc.h deleted file mode 100644 index bfb7d82..0000000 --- a/rust/ares/src/pma/malloc.h +++ /dev/null @@ -1,118 +0,0 @@ -/** - * Persistent Memory Arena for the New Mars Nock virtualization engine. 
- */ - -#pragma once - -#include -#include -#include - -//============================================================================== -// PROTOTYPES -//============================================================================== - -/** - * Struct returned from pma_load() - */ -typedef struct PMARootState PMARootState; -struct PMARootState { - uint64_t epoch; // Epoch ID of the most recently processed event - uint64_t event; // ID of the most recently processed event - uint64_t root; // Root after most recent event -}; - -/** - * Initialize a brand new PMA environment and event snapshot - * - * @param path File directory in which to create backing files for snapshot and - * page directory - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -pma_init(const char *path); - -/** - * Load an existing PMA environment and event snapshot - * - * @param path File directory from which to load the backing files for the - * snapshot and page directory - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -PMARootState -pma_load(const char *path); - -/** - * Safely unload the PMA after syncing changes to PMA state - * - * @param epoch Epoch of latest event successfully applied to state snapshot - * @param event Event number of latest event successfully applied to state - * snapshot - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -pma_close(uint64_t epoch, uint64_t event, uint64_t root); - -/** - * Allocate a new block of memory in the PMA - * - * @param size Size in bytes to allocate - * - * @return NULL failure; errno set to error code - * @return void* address of the newly allocated memory - */ -void * -pma_malloc(size_t size); - -/** - * Deallocate an existing block of memory in the PMA - * - * @param address Address of block to deallocated - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -pma_free(void *address); - -/** - * Sync changes to PMA state - * - * @param epoch Epoch of latest event successfully applied to state snapshot - * @param event Event number of latest event successfully applied to state - * snapshot - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -pma_sync(uint64_t epoch, uint64_t event, uint64_t root); - -/** - * True if the address is in the PMA - */ -bool -pma_in_arena(void *address); - -/* - bp(X) where X is false will raise a SIGTRAP. If the process is being run - inside a debugger, this can be caught and ignored. It's equivalent to a - breakpoint. 
If run without a debugger, it will dump core, like an assert -*/ -#if defined(__i386__) || defined(__x86_64__) -#define bp(x) do { if(!(x)) __asm__ volatile("int $3"); } while (0) -#elif defined(__thumb__) -#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xde01"); } while (0) -#elif defined(__aarch64__) -#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xd4200000"); } while (0) -#elif defined(__arm__) -#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xe7f001f0"); } while (0) -#else -STATIC_ASSERT(0, "debugger break instruction unimplemented"); -#endif diff --git a/rust/ares/src/pma/test/internals.h b/rust/ares/src/pma/test/internals.h deleted file mode 100644 index cc0e343..0000000 --- a/rust/ares/src/pma/test/internals.h +++ /dev/null @@ -1,198 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -//============================================================================== -// MACROS -//============================================================================== - -#define PMA_PAGE_SHIFT 12U -#define PMA_MIN_ALLOC_SHIFT 4U -#define PMA_BITMAP_BITS (8 * sizeof(uint8_t)) -#define PMA_SNAPSHOT_RESIZE_INC 0x100000000 -#define PMA_PAGE_SIZE (1UL << PMA_PAGE_SHIFT) -#define PMA_PAGE_MASK (PMA_PAGE_SIZE - 1) -#define PMA_MIN_ALLOC_SIZE (1U << PMA_MIN_ALLOC_SHIFT) -#define PMA_MAX_SHARED_SHIFT (PMA_PAGE_SHIFT - 2U) -#define PMA_MAX_SHARED_ALLOC (1UL << PMA_MAX_SHARED_SHIFT) -#define PMA_SHARED_BUCKETS (PMA_MAX_SHARED_SHIFT - PMA_MIN_ALLOC_SHIFT + 1) -#define PAGE_ROUND_DOWN(foo) (foo & (~PMA_PAGE_MASK)) -#define PAGE_ROUND_UP(foo) ((foo + PMA_PAGE_MASK) & (~PMA_PAGE_MASK)) -#define PTR_TO_INDEX(foo) ((((uint64_t)(foo)) - ((uint64_t)_pma_state->metadata->arena_start)) >> PMA_PAGE_SHIFT) -#define INDEX_TO_PTR(foo) (void *)((char *)_pma_state->metadata->arena_start + ((foo) * PMA_PAGE_SIZE)) -#ifdef __linux__ - #define PMA_MMAP_FLAGS (MAP_SHARED | MAP_FIXED_NOREPLACE) -#else - #define PMA_MMAP_FLAGS (MAP_SHARED | MAP_FIXED) -#endif -#define PMA_MAGIC_CODE 0xBADDECAFC0FFEE00 // i.e. 
all decaf coffee -#define PMA_DATA_VERSION 1 -#define PMA_EMPTY_BITMAP 0xFF -#define PMA_BITMAP_SIZE 32 -#define PMA_DPAGE_CACHE_SIZE ((PMA_PAGE_SIZE - sizeof(PMADPageCache)) / sizeof(uint64_t)) -#define PMA_DIRTY_PAGE_LIMIT 164 -#define PMA_SNAPSHOT_FILENAME "snap.bin" -#define PMA_PAGE_DIR_FILENAME "page.bin" -#define PMA_DEFAULT_DIR_NAME ".bin" -#define PMA_NEW_FILE_FLAGS (O_RDWR | O_CREAT) -#define PMA_LOAD_FILE_FLAGS (O_RDWR -#define PMA_DIR_PERMISSIONS (S_IRWXU | S_IRWXG | S_IRWXO) -#define PMA_FILE_PERMISSIONS (S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP) -#define PMA_INIT_SNAP_SIZE 0x40000000 -#define PMA_INIT_DIR_SIZE 0x400000 -#define PMA_MAXIMUM_DIR_SIZE 0x5500000000 -#ifdef __linux__ - #define PMA_SNAPSHOT_ADDR 0x10000 -#else - #define PMA_SNAPSHOT_ADDR 0x28000000000 -#endif -#define PMA_MAX_DISK_FILE_SIZE 0x100000000000 -#define PMA_MAX_RESIZE_FACTOR (PMA_MAX_DISK_FILE_SIZE / PMA_SNAPSHOT_RESIZE_INC) - - -//============================================================================== -// TYPES -//============================================================================== - -enum PMAPageStatus { - UNALLOCATED, - FREE, - SHARED, - FIRST, - FOLLOW -}; -typedef enum PMAPageStatus PMAPageStatus; - -typedef struct PMAPageDirEntry PMAPageDirEntry; -struct PMAPageDirEntry { - uint64_t offset; - PMAPageStatus status; -}; - -typedef struct PMAPageDir PMAPageDir; -struct PMAPageDir { - uint64_t size; - uint64_t next_index; - PMAPageDirEntry *entries; -}; - -typedef struct PMASharedPageHeader PMASharedPageHeader; -struct PMASharedPageHeader { - struct PMASharedPageHeader *next; - uint8_t dirty; - uint8_t size; - uint8_t free; - uint8_t bits[PMA_BITMAP_SIZE]; -}; - -typedef struct PMADirtyPageEntry PMADirtyPageEntry; -struct PMADirtyPageEntry { - uint64_t index; - uint64_t offset; - uint32_t num_pages; - PMAPageStatus status; -}; - -typedef struct PMASinglePageCache PMASinglePageCache; -struct PMASinglePageCache { - PMASinglePageCache *next; - void *page; -}; - -typedef struct PMAPageRunCache PMAPageRunCache; -struct PMAPageRunCache { - PMAPageRunCache *next; - void *page; - uint64_t length; -}; - -typedef struct PMADPageCache PMADPageCache; -struct PMADPageCache { - uint8_t dirty; - uint16_t size; - uint16_t head; - uint16_t tail; - uint64_t queue[]; -}; - -typedef struct PMAMetadata PMAMetadata; -struct PMAMetadata { - uint64_t magic_code; - uint32_t checksum; - uint32_t version; - uint64_t epoch; - uint64_t event; - uint64_t root; - void *arena_start; - void *arena_end; - PMASharedPageHeader *shared_pages[PMA_SHARED_BUCKETS]; - PMADPageCache *dpage_cache; - uint64_t snapshot_size; - uint64_t next_offset; - uint8_t num_dirty_pages; - uint64_t padding[2]; - PMADirtyPageEntry dirty_pages[PMA_DIRTY_PAGE_LIMIT]; -}; -static_assert(sizeof(PMAMetadata) == PMA_PAGE_SIZE, "PMAMetadata must be a page in length"); - -typedef struct PMAState PMAState; -struct PMAState { - PMAMetadata *metadata; - uint64_t meta_page_offset; - PMAPageDir page_directory; - int snapshot_fd; - int page_dir_fd; - PMASinglePageCache *free_pages; - PMAPageRunCache *free_page_runs; -}; - - -//============================================================================== -// GLOBALS -//============================================================================== - -extern PMAState *_pma_state; - - -//============================================================================== -// FUNCTIONS -//============================================================================== - -int _pma_verify_checksum(PMAMetadata *meta_page); 
-int _pma_sync_dirty_pages(int fd, uint8_t num_dirty_pages, PMADirtyPageEntry *dirty_pages); -int _pma_write_page_status(int fd, uint64_t index, PMAPageStatus status); -int _pma_write_page_offset(int fd, uint64_t index, uint64_t offset); -int _pma_update_free_pages(uint8_t num_dirty_pages, PMADirtyPageEntry *dirty_pages); -void *_pma_malloc_bytes(size_t size); -int _pma_malloc_shared_page(uint8_t bucket); -void *_pma_malloc_pages(size_t size); -void *_pma_malloc_single_page(PMAPageStatus status); -void *_pma_malloc_multi_pages(uint64_t num_pages); -void *_pma_get_cached_pages(uint64_t num_pages); -void *_pma_get_new_page(PMAPageStatus status); -void *_pma_get_new_pages(uint64_t num_pages); -int _pma_free_pages(void *address); -int _pma_free_bytes(void *address); -int _pma_copy_shared_page(void *address); -uint64_t _pma_get_single_dpage(void); -uint64_t _pma_get_cached_dpage(void); -int _pma_copy_dpage_cache(void); -uint64_t _pma_get_disk_dpage(void); -void _pma_copy_page(void *address, uint64_t offset, PMAPageStatus status, int fd); -void _pma_mark_page_dirty(uint64_t index, uint64_t offset, PMAPageStatus status, uint32_t num_pages); -int _pma_extend_snapshot_file(uint32_t multiplier); -void _pma_warning(const char *p, void *a, int l); -void _pma_state_free(void); -int _pma_state_malloc(void); diff --git a/rust/ares/src/pma/test/malloc.c b/rust/ares/src/pma/test/malloc.c deleted file mode 100644 index 158b5f7..0000000 --- a/rust/ares/src/pma/test/malloc.c +++ /dev/null @@ -1,1511 +0,0 @@ -#include -#include -#include -#include -#include - -#include "../malloc.h" -#include "../includes/checksum.h" -#include "internals.h" - -//============================================================================== -// CONFIGURABLE MACROS -//============================================================================== - -#define TEST_PMA_SNAPSHOT_TEMPLATE "test-snapshot-XXXXXX.bin" -#define TEST_PMA_SNAPSHOT_SUFFIX 4 - - -//============================================================================== -// TYPES -//============================================================================== - -typedef struct TestState TestState; -struct TestState { - char *dir; // Directory in which to generate test files -}; - - -//============================================================================== -// GLOBALS -//============================================================================== - -TestState *_test_state = NULL; - - -//============================================================================== -// FORWARD DECLARATIONS -//============================================================================== - -void test_pma_state_malloc_and_free(void); -void test_pma_extend_snapshot_file(void); -void test_pma_mark_page_dirty(void); -void test_pma_copy_page(void); -void test_pma_get_disk_dpage(void); -void test_pma_copy_dpage_cache(void); -void test_pma_get_cached_dpage(void); -void test_pma_copy_shared_page(void); -void test_pma_free_bytes(void); -void test_pma_free_pages(void); -void test_pma_get_new_pages(void); -void test_pma_get_new_page(void); -void test_pma_get_cached_pages(void); -void test_pma_malloc_single_page(void); -void test_pma_malloc_shared_page(void); -void test_pma_update_free_pages(void); -void test_pma_verify_checksum(void); -void test_pma_in_arena(void); -void test_pma_init(void); -void test_pma_sync(void); -void test_pma_load(void); - - -//============================================================================== -// MAIN & HELPERS 
-//============================================================================== - -void -test_pma(char* test_dir) { - // Set up test state - _test_state = malloc(sizeof(TestState)); - _test_state->dir = test_dir; - - // Run tests - test_pma_state_malloc_and_free(); - test_pma_extend_snapshot_file(); - test_pma_mark_page_dirty(); - test_pma_copy_page(); - test_pma_get_disk_dpage(); - test_pma_copy_dpage_cache(); - test_pma_get_cached_dpage(); - test_pma_copy_shared_page(); - test_pma_free_bytes(); - test_pma_free_pages(); - test_pma_get_new_pages(); - test_pma_get_new_page(); - test_pma_get_cached_pages(); - test_pma_malloc_single_page(); - test_pma_malloc_shared_page(); - test_pma_update_free_pages(); - test_pma_verify_checksum(); - test_pma_in_arena(); - test_pma_init(); - test_pma_sync(); - test_pma_load(); - - // Clean up - free(_test_state); - - // Done - printf("Unit tests PASSED\n"); -} - -int -_generate_test_snapshot(char **filename) { - size_t dir_len; - size_t file_len; - int fd; - - dir_len = strlen(_test_state->dir); - file_len = strlen(TEST_PMA_SNAPSHOT_TEMPLATE); - - *filename = malloc(dir_len + file_len + 1); - strcpy(*filename, _test_state->dir); - strcpy((*filename + dir_len), TEST_PMA_SNAPSHOT_TEMPLATE); - assert(*filename); - fd = mkstemps(*filename, TEST_PMA_SNAPSHOT_SUFFIX); - assert(fd > 0); - - return fd; -} - -void -_clean_up_test_snapshot(int fd, char *filename) { - close(fd); - unlink(filename); - free(filename); -} - - -//============================================================================== -// TESTS -//============================================================================== - -void -test_pma_state_malloc_and_free(void) { - int res = -1; - - // pre state malloc - assert(!_pma_state); - - // state malloc - res = _pma_state_malloc(); - assert(!res); - assert(_pma_state); - assert(_pma_state->metadata); - - // try state malloc again - res = _pma_state_malloc(); - assert(res == 1); - - // state free - _pma_state_free(); - assert(!_pma_state); - - // try state free again - _pma_state_free(); - - // free metadata separately - res = _pma_state_malloc(); - free(_pma_state->metadata); - _pma_state->metadata = NULL; - _pma_state_free(); -} - -void -test_pma_extend_snapshot_file(void) { - struct stat statbuf; - uint64_t multiplier; - int fd; - int ret; - char *filename = NULL; - - // Test 1: 0 multiplier - ret = _pma_extend_snapshot_file(0); - assert(ret == -1); - - // Test 2: massive multiplier - ret = _pma_extend_snapshot_file(0xffffffff); - assert(ret == -1); - - // Set up state & locals - _pma_state_malloc(); - _pma_state->metadata->snapshot_size = 0; - multiplier = 10; - - // Test 3: lseek fails; snapshot file doesn't exist - ret = _pma_extend_snapshot_file(multiplier); - assert(ret == -1); - assert(errno == ESPIPE); - - // Set up fd - errno = 0; - fd = _generate_test_snapshot(&filename); - close(fd); - fd = open(filename, O_RDONLY); - assert(fd > 0); - _pma_state->snapshot_fd = fd; - - // Test 4: write fails; snapshot file read only - errno = 0; - ret = _pma_extend_snapshot_file(multiplier); - assert(ret == -1); - assert(errno == EBADF); - close(fd); - - // Reset fd - fd = open(filename, O_RDWR); - assert(fd > 0); - _pma_state->snapshot_fd = fd; - - // Test 5: Successful - errno = 0; - ret = _pma_extend_snapshot_file(multiplier); - assert(ret == 0); - assert(errno == 0); - assert(fstat(fd, &statbuf) == 0); - assert((uint64_t)statbuf.st_size == (multiplier * PMA_SNAPSHOT_RESIZE_INC)); - assert((uint64_t)statbuf.st_size == 
_pma_state->metadata->snapshot_size); - - // Clean up - _clean_up_test_snapshot(fd, filename); - _pma_state_free(); -} - -void -test_pma_mark_page_dirty(void) { - PMADirtyPageEntry *dirty_page; - - // Set up state & locals - _pma_state_malloc(); - _pma_state->metadata->num_dirty_pages = 10; - dirty_page = (_pma_state->metadata->dirty_pages + 10); - dirty_page->index = 1; - dirty_page->offset = 2; - dirty_page->num_pages = 3; - dirty_page->status = FREE; - - // Test 1: mark page dirty - _pma_mark_page_dirty(4, 5, FIRST, 6); - assert(_pma_state->metadata->num_dirty_pages == 11); - assert(dirty_page->index == 4); - assert(dirty_page->offset == 5); - assert(dirty_page->num_pages == 6); - assert(dirty_page->status == FIRST); - - // Clean up - _pma_state_free(); -} - -void -test_pma_copy_page(void) { - const uint64_t page_uno_offset = 0; - const uint64_t page_dos_offset = PMA_PAGE_SIZE; - const uint64_t page_tre_offset = (2 * PMA_PAGE_SIZE); - const uint64_t file_size = (3 * PMA_PAGE_SIZE); - const uint16_t end_of_dpage_cache = (PMA_DPAGE_CACHE_SIZE - 1); - ssize_t bytes; - const int strlen = 6; - int fd; - const char *text_alpha = "ALPHA"; - const char *text_bravo = "BRAVO"; - const char *text_delta = "DELTA"; - char *filename; - char text_test[6] = { 0 }; - void *address; - - // Set up backing file - fd = _generate_test_snapshot(&filename); - assert(12287 == lseek(fd, (file_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - assert(6 == pwrite(fd, text_alpha, strlen, 0)); - assert(6 == pwrite(fd, text_bravo, strlen, PMA_PAGE_SIZE)); - assert(6 == pwrite(fd, text_delta, strlen, (2 * PMA_PAGE_SIZE))); - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - - _pma_state->metadata->dpage_cache = calloc(1, PMA_PAGE_SIZE); - _pma_state->metadata->dpage_cache->tail = end_of_dpage_cache; - _pma_state->metadata->dpage_cache->queue[end_of_dpage_cache] = 0; - - _pma_state->page_directory.entries = calloc(2, sizeof(PMAPageDirEntry)); - _pma_state->page_directory.entries[1].offset = page_dos_offset; - - // Set up address - address = mmap( - INDEX_TO_PTR(1), - PMA_PAGE_SIZE, - PROT_READ, - MAP_SHARED | MAP_FIXED, - fd, - page_dos_offset); - assert(MAP_FAILED != address); - - // Test 1: copy page in backing file - _pma_copy_page(address, page_tre_offset, FIRST, fd); - assert(0 == _pma_state->metadata->dpage_cache->tail); - assert(4096 == _pma_state->metadata->dpage_cache->queue[end_of_dpage_cache]); - bytes = pread(fd, text_test, strlen, page_uno_offset); - assert(6 == bytes); - assert(0 == strcmp(text_alpha, text_test)); - bytes = pread(fd, text_test, strlen, page_dos_offset); - assert(6 == bytes); - assert(0 == strcmp(text_bravo, text_test)); - bytes = pread(fd, text_test, strlen, page_tre_offset); - assert(6 == bytes); - assert(0 == strcmp(text_bravo, text_test)); - - // Clean up - munmap(INDEX_TO_PTR(0), file_size); - free(_pma_state->metadata->dpage_cache); - _clean_up_test_snapshot(fd, filename); - _pma_state_free(); -} - -void -test_pma_get_disk_dpage(void) { - struct stat statbuf; - uint64_t init_size = 2 * PMA_PAGE_SIZE; - uint64_t next_offset; - int fd; - char *filename; - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->next_offset = init_size - PMA_PAGE_SIZE; - _pma_state->metadata->snapshot_size = init_size; - - // Test 1: get next dpage without extending snapshot backing file - next_offset = _pma_get_disk_dpage(); - assert(4096 == next_offset); - assert(8192 == _pma_state->metadata->next_offset); - - // Test 2: failure to 
extend backing file - next_offset = _pma_get_disk_dpage(); - assert(0 == next_offset); - assert(8192 == _pma_state->metadata->next_offset); - - // Set up backing file - fd = _generate_test_snapshot(&filename); - assert(8191 == lseek(fd, (init_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - _pma_state->snapshot_fd = fd; - - // Test 3: get next dpage after extending snapshot backing file - next_offset = _pma_get_disk_dpage(); - assert(8192 == next_offset); - assert(12288 == _pma_state->metadata->next_offset); - assert(0 == fstat(fd, &statbuf)); - assert((uint64_t)statbuf.st_size == (PMA_SNAPSHOT_RESIZE_INC + init_size)); - - // Clean up - free(_pma_state->page_directory.entries); - _clean_up_test_snapshot(fd, filename); - _pma_state_free(); -} - -void -test_pma_copy_dpage_cache(void) { - const uint64_t page_uno_offset = PMA_PAGE_SIZE; - const uint64_t page_dos_offset = (2 * PMA_PAGE_SIZE); - const uint64_t page_tre_offset = (3 * PMA_PAGE_SIZE); - const uint64_t init_size = 4 * PMA_PAGE_SIZE; - const uint64_t test_code = 0xcafebabe8008135; - uint64_t data_buffer; - ssize_t bytes; - int fd = 0; - char *filename = NULL; - - // Set up backing file - fd = _generate_test_snapshot(&filename); - assert(16383 == lseek(fd, (init_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - _pma_state->metadata->snapshot_size = init_size; - _pma_state->metadata->dpage_cache = mmap( - INDEX_TO_PTR(0), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - page_uno_offset); - _pma_state->metadata->dpage_cache->dirty = 0; - _pma_state->metadata->dpage_cache->size = 0; - _pma_state->metadata->dpage_cache->head = 1; - _pma_state->metadata->dpage_cache->tail = 2; - _pma_state->metadata->dpage_cache->queue[0] = test_code; - _pma_state->metadata->dpage_cache->queue[1] = page_dos_offset; - _pma_state->page_directory.entries = malloc(sizeof(PMAPageDirEntry)); - _pma_state->page_directory.entries[0].offset = page_uno_offset; - - // Test 1: free page cache empty, getting new page fails - _pma_state->metadata->next_offset = init_size; - assert(_pma_copy_dpage_cache()); - - // Test 2: free page cache empty, getting new page succeeds - _pma_state->snapshot_fd = fd; - _pma_state->metadata->next_offset = page_tre_offset; - assert(0 == _pma_copy_dpage_cache()); - assert(16384 == _pma_state->metadata->next_offset); - bytes = pread(fd, &data_buffer, 8, (page_tre_offset + 8)); - assert(8 == bytes); - assert(0xcafebabe8008135 == data_buffer); - - // Reset dpage cache dirty bit - _pma_state->metadata->dpage_cache->dirty = 0; - - // Test 3: free page cache has a page - _pma_state->metadata->dpage_cache->size = 1; - assert(0 == _pma_copy_dpage_cache()); - bytes = pread(fd, &data_buffer, 8, (page_dos_offset + 8)); - assert(8 == bytes); - assert(0xcafebabe8008135 == data_buffer); - - // Clean up - munmap(INDEX_TO_PTR(0), init_size); - free(_pma_state->page_directory.entries); - _clean_up_test_snapshot(fd, filename); - _pma_state_free(); -} - -void -test_pma_get_cached_dpage(void) { - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->dpage_cache = calloc(1, sizeof(PMADPageCache)); - - // Test 1: no pages in cache - _pma_state->metadata->dpage_cache->dirty = 1; - _pma_state->metadata->dpage_cache->size = 0; - assert(0 == _pma_get_cached_dpage()); - - // Test 2: only one page in cache and cache uncopied - _pma_state->metadata->dpage_cache->dirty = 0; - _pma_state->metadata->dpage_cache->size 
= 1; - assert(0 == _pma_get_cached_dpage()); - - // Test 3: successfully get page - _pma_state->metadata->dpage_cache->dirty = 1; - _pma_state->metadata->dpage_cache->size = 2; - _pma_state->metadata->dpage_cache->head = 0; - _pma_state->metadata->dpage_cache->tail = 1; - _pma_state->metadata->dpage_cache->queue[0] = 0xcafebabe8008135; - assert(0xcafebabe8008135 == _pma_get_cached_dpage()); - assert(1 == _pma_state->metadata->dpage_cache->size); - assert(1 == _pma_state->metadata->dpage_cache->head); - assert(1 == _pma_state->metadata->dpage_cache->tail); - - // Test 4: successfully get page & loop queue - _pma_state->metadata->dpage_cache->head = PMA_DPAGE_CACHE_SIZE - 1; - _pma_state->metadata->dpage_cache->queue[PMA_DPAGE_CACHE_SIZE - 1] = 0xdefaced0facade; - assert(0xdefaced0facade == _pma_get_cached_dpage()); - assert(0 == _pma_state->metadata->dpage_cache->size); - assert(0 == _pma_state->metadata->dpage_cache->head); - assert(1 == _pma_state->metadata->dpage_cache->tail); - - // Clean up - free(_pma_state->metadata->dpage_cache); - _pma_state_free(); -} - -void -test_pma_copy_shared_page(void) { - PMASharedPageHeader *clean_shared_page; - PMASharedPageHeader *dirty_shared_page; - ssize_t bytes; - const uint64_t init_size = 4 * PMA_PAGE_SIZE; - const uint64_t page_nul_offset = 0; - const uint64_t page_uno_offset = PMA_PAGE_SIZE; - const uint64_t page_dos_offset = (2 * PMA_PAGE_SIZE); - const uint64_t page_tre_offset = (3 * PMA_PAGE_SIZE); - const uint8_t page_uno_size = 10; - const uint8_t page_dos_size = 20; - uint8_t data_buffer; - int fd = 0; - char *filename = NULL; - - // Set up backing file - fd = _generate_test_snapshot(&filename); - assert(16383 == lseek(fd, (init_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - _pma_state->metadata->snapshot_size = init_size; - _pma_state->metadata->dpage_cache = mmap( - INDEX_TO_PTR(0), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - page_nul_offset); - _pma_state->metadata->dpage_cache->dirty = 1; - _pma_state->page_directory.entries = calloc(3, sizeof(PMAPageDirEntry)); - _pma_state->page_directory.entries[1].offset = page_uno_offset; - _pma_state->page_directory.entries[1].status = SHARED; - _pma_state->page_directory.entries[2].offset = page_dos_offset; - _pma_state->page_directory.entries[2].status = SHARED; - - // Set up shared pages - dirty_shared_page = mmap( - INDEX_TO_PTR(1), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - page_uno_offset); - dirty_shared_page->dirty = 1; - dirty_shared_page->size = page_uno_size; - - clean_shared_page = mmap( - INDEX_TO_PTR(2), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - page_dos_offset); - clean_shared_page->dirty = 0; - clean_shared_page->size = page_dos_size; - - // Test 1: don't copy if shared page already dirty - assert(0 == _pma_copy_shared_page(dirty_shared_page)); - - // Test 2: fail if a new dpage couldn't be acquired - _pma_state->metadata->dpage_cache->size = 0; - _pma_state->metadata->dpage_cache->head = 0; - _pma_state->metadata->dpage_cache->tail = 0; - assert(-1 == _pma_copy_shared_page(clean_shared_page)); - - // Test 3: success - _pma_state->snapshot_fd = fd; - _pma_state->metadata->dpage_cache->size = 1; - _pma_state->metadata->dpage_cache->tail = 1; - _pma_state->metadata->dpage_cache->queue[0] = page_tre_offset; - assert(0 == _pma_copy_shared_page(clean_shared_page)); - bytes = 
pread(fd, &data_buffer, 1, (page_uno_offset + 9)); - assert(1 == bytes); - assert(10 == data_buffer); - bytes = pread(fd, &data_buffer, 1, (page_dos_offset + 9)); - assert(1 == bytes); - assert(20 == data_buffer); - bytes = pread(fd, &data_buffer, 1, (page_tre_offset + 9)); - assert(1 == bytes); - assert(20 == data_buffer); - - // Clean up - free(_pma_state->page_directory.entries); - munmap(PMA_SNAPSHOT_ADDR, init_size); - _pma_state_free(); - _clean_up_test_snapshot(fd, filename); -} - -void -test_pma_free_bytes(void) { - PMASharedPageHeader *shared_page_16; - PMASharedPageHeader *shared_page_64; - PMASharedPageHeader *shared_page_256; - const uint64_t init_size = 3 * PMA_PAGE_SIZE; - const uint64_t page_uno_offset = 0; - const uint64_t page_dos_offset = PMA_PAGE_SIZE; - const uint64_t page_tre_offset = (2 * PMA_PAGE_SIZE); - const uint8_t page_uno_size = 4; - const uint8_t page_dos_size = 6; - const uint8_t page_tre_size = 8; - int fd = 0; - int ret; - char *filename = NULL; - - // Set up backing file - fd = _generate_test_snapshot(&filename); - assert(12287 == lseek(fd, (init_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - _pma_state->metadata->snapshot_size = init_size; - - // Set up shared pages - shared_page_16 = mmap( - INDEX_TO_PTR(0), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - page_uno_offset); - shared_page_16->dirty = 1; - shared_page_16->size = page_uno_size; - shared_page_16->free = 0; - for (int i = 0; i < PMA_BITMAP_SIZE; ++i) { - shared_page_16->bits[i] = 0; - } - - shared_page_64 = mmap( - INDEX_TO_PTR(1), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - page_dos_offset); - shared_page_64->dirty = 1; - shared_page_64->size = page_dos_size; - shared_page_64->free = 0; - for (int i = 0; i < PMA_BITMAP_SIZE; ++i) { - shared_page_64->bits[i] = 0; - } - - shared_page_256 = mmap( - INDEX_TO_PTR(2), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - page_tre_offset); - shared_page_256->dirty = 1; - shared_page_256->size = page_tre_size; - shared_page_256->free = 0; - for (int i = 0; i < PMA_BITMAP_SIZE; ++i) { - shared_page_256->bits[i] = 0; - } - - // Test 1: free slot 0 of shared page with slot size 16 - ret = _pma_free_bytes((char*)shared_page_16 + sizeof(PMASharedPageHeader)); - assert(0 == ret); - assert(1 == shared_page_16->free); - assert(0x01 == shared_page_16->bits[0]); - - // Test 2: free slot 8 of shared page with slot size 64 - ret = _pma_free_bytes((char*)shared_page_64 + sizeof(PMASharedPageHeader) + 448); - assert(0 == ret); - assert(1 == shared_page_64->free); - assert(0x80 == shared_page_64->bits[0]); - - // Test 3: free slot 15 of shared page with slot size 256 - ret = _pma_free_bytes((char*)shared_page_256 + sizeof(PMASharedPageHeader) + 3584); - assert(0 == ret); - assert(1 == shared_page_256->free); - assert(0x40 == shared_page_256->bits[1]); - - // Test 4: failure when freeing an already free slot - ret = _pma_free_bytes((char*)shared_page_16 + sizeof(PMASharedPageHeader)); - assert(-1 == ret); - - // Clean up - munmap(PMA_SNAPSHOT_ADDR, init_size); - _pma_state_free(); - _clean_up_test_snapshot(fd, filename); -} - -void -test_pma_free_pages(void) { - const uint64_t init_size = 3 * PMA_PAGE_SIZE; - const uint64_t solo_page_offset = 0; - const uint64_t duo_page_offset = PMA_PAGE_SIZE; - int fd = 0; - char *filename = NULL; - void *solo_page; - void *duo_page; - - 
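The `test_pma_free_bytes` cases above encode the shared-page bookkeeping: each shared page keeps a small bitmap in its header where a set bit marks a free slot, `size` is the log2 of the slot width, and `free` counts available slots. A simplified Rust model of just that bitmap arithmetic, consistent with the offsets asserted in the tests; the struct and function are stand-ins, not the real `PMASharedPageHeader` layout:

```rust
const BITMAP_BYTES: usize = 32;

struct SharedPage {
    size: u8,                 // log2 of the slot size (4 => 16-byte slots)
    free: u8,                 // count of free slots
    bits: [u8; BITMAP_BYTES], // one bit per slot; 1 = free
}

fn free_slot(page: &mut SharedPage, byte_offset_in_data: usize) -> Result<(), ()> {
    let slot = byte_offset_in_data >> page.size; // offset / slot_size
    let (byte, bit) = (slot / 8, slot % 8);
    if page.bits[byte] & (1 << bit) != 0 {
        return Err(()); // slot already marked free: double free
    }
    page.bits[byte] |= 1 << bit;
    page.free += 1;
    Ok(())
}
```

Test 2 above, for example, frees byte offset 448 in a page of 64-byte slots: 448 >> 6 is slot 7, which is bit 7 of bits[0], hence the asserted 0x80.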
// Set up backing file - fd = _generate_test_snapshot(&filename); - assert(12287 == lseek(fd, (init_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - _pma_state->metadata->snapshot_size = init_size; - _pma_state->page_directory.entries = calloc(3, sizeof(PMAPageDirEntry)); - _pma_state->page_directory.entries[0].status = FIRST; - _pma_state->page_directory.entries[0].offset = solo_page_offset; - _pma_state->page_directory.entries[1].status = FIRST; - _pma_state->page_directory.entries[1].offset = duo_page_offset; - _pma_state->page_directory.entries[2].status = FOLLOW; - _pma_state->page_directory.entries[2].offset = duo_page_offset + PMA_PAGE_SIZE; - - // Set up pages - solo_page = mmap( - INDEX_TO_PTR(0), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - solo_page_offset); - - duo_page = mmap( - INDEX_TO_PTR(1), - 2 * PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - duo_page_offset); - - // Test 1: fail when pointing to middle of page - assert(-1 == _pma_free_pages(solo_page + 1)); - - // Test 2: free single page allocation - assert(0 == _pma_free_pages(solo_page)); - - // test 3: free multi-page allocation - assert(0 == _pma_free_pages(duo_page)); - - // Clean up - munmap(PMA_SNAPSHOT_ADDR, init_size); - free(_pma_state->page_directory.entries); - _pma_state_free(); - _clean_up_test_snapshot(fd, filename); -} - -void -test_pma_get_new_pages(void) { - const uint64_t init_size = PMA_PAGE_SIZE; - const uint64_t num_pages = 2; - int fd = 0; - char *filename = NULL; - void* const address = PMA_SNAPSHOT_ADDR + PMA_PAGE_SIZE; - void* const arena_end = address + (2 * PMA_PAGE_SIZE); - - // Set up backing file - fd = _generate_test_snapshot(&filename); - assert(4095 == lseek(fd, (init_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - - // Set up state - _pma_state_malloc(); - _pma_state->snapshot_fd = fd; - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - _pma_state->metadata->arena_end = address; - _pma_state->metadata->snapshot_size = init_size; - _pma_state->metadata->next_offset = init_size; - - // Test 1: allocate new pages - assert(address == _pma_get_new_pages(num_pages)); - assert(12288 == _pma_state->metadata->next_offset); - assert(arena_end == _pma_state->metadata->arena_end); - - // Clean Up - munmap(address, num_pages * PMA_PAGE_SIZE); - _pma_state_free(); - _clean_up_test_snapshot(fd, filename); -} - -void -test_pma_get_new_page(void) { - const uint64_t init_size = 2 * PMA_PAGE_SIZE; - const uint64_t init_offset = PMA_PAGE_SIZE; - int fd = 0; - char *filename = NULL; - void* const address = PMA_SNAPSHOT_ADDR; - void* const arena_end = address + PMA_PAGE_SIZE; - - // Set up backing file - fd = _generate_test_snapshot(&filename); - assert(8191 == lseek(fd, (init_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - - // Set up state - _pma_state_malloc(); - _pma_state->snapshot_fd = fd; - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - _pma_state->metadata->arena_end = address; - _pma_state->metadata->snapshot_size = init_size; - _pma_state->metadata->next_offset = init_offset; - - _pma_state->metadata->dpage_cache = calloc(1, sizeof(PMADPageCache)); - _pma_state->metadata->dpage_cache->size = 0; - - // Test 1: allocate new pages - assert(address == _pma_get_new_page(FIRST)); - assert(8192 == _pma_state->metadata->next_offset); - assert(arena_end == _pma_state->metadata->arena_end); - - // Clean 
Up - munmap(address, PMA_PAGE_SIZE); - free(_pma_state->metadata->dpage_cache); - _pma_state_free(); - _clean_up_test_snapshot(fd, filename); -} - -void -test_pma_get_cached_pages(void) { - PMAPageRunCache *test_0_cache; - PMAPageRunCache *test_1_cache; - PMAPageRunCache *test_2_cache; - PMAPageRunCache *test_3_cache; - PMAPageRunCache *test_4_cache; - PMAPageRunCache *test_5_cache; - PMAPageRunCache *wip_ptr; - void *address; - - // Set up state - _pma_state_malloc(); - - // Set up run caches for test - test_0_cache = NULL; - - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x30000; - wip_ptr->length = 6; - wip_ptr->next = NULL; - test_1_cache = wip_ptr; - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x20000; - wip_ptr->length = 5; - wip_ptr->next = test_1_cache; - test_1_cache = wip_ptr; - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x10000; - wip_ptr->length = 4; - wip_ptr->next = test_1_cache; - test_1_cache = wip_ptr; - - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x30000; - wip_ptr->length = 6; - wip_ptr->next = NULL; - test_2_cache = wip_ptr; - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x20000; - wip_ptr->length = 4; - wip_ptr->next = test_2_cache; - test_2_cache = wip_ptr; - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x10000; - wip_ptr->length = 5; - wip_ptr->next = test_2_cache; - test_2_cache = wip_ptr; - - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x30000; - wip_ptr->length = 4; - wip_ptr->next = NULL; - test_3_cache = wip_ptr; - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x20000; - wip_ptr->length = 5; - wip_ptr->next = test_3_cache; - test_3_cache = wip_ptr; - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x10000; - wip_ptr->length = 6; - wip_ptr->next = test_3_cache; - test_3_cache = wip_ptr; - - test_4_cache = calloc(1, sizeof(PMAPageRunCache)); - test_4_cache->page = 0x40000; - test_4_cache->length = 2; - // Invalid pointer; used to confirm that we stop searching when we find exact run - test_4_cache->next = 0x8fffffffffffffff; - - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x50000; - wip_ptr->length = 3; - wip_ptr->next = NULL; - test_5_cache = wip_ptr; - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x99000; - wip_ptr->length = 1; - wip_ptr->next = test_5_cache; - test_5_cache = wip_ptr; - - // Test 0: page run cache empty - _pma_state->free_page_runs = test_0_cache; - address = _pma_get_cached_pages(2); - assert(NULL == address); - - // Test 1: find run bigger than requested, by two pages, at the very beginning - _pma_state->free_page_runs = test_1_cache; - address = _pma_get_cached_pages(2); - assert(0x10000 == address); - assert(2 == _pma_state->free_page_runs->length); - assert(0x12000 == _pma_state->free_page_runs->page); - assert(5 == _pma_state->free_page_runs->next->length); - assert(0x20000 == _pma_state->free_page_runs->next->page); - assert(6 == _pma_state->free_page_runs->next->next->length); - assert(0x30000 == _pma_state->free_page_runs->next->next->page); - assert(NULL == _pma_state->free_page_runs->next->next->next); - - // Test 2: find run bigger than request, by two pages, in the middle - _pma_state->free_page_runs = test_2_cache; - address = _pma_get_cached_pages(2); - assert(0x20000 == address); - assert(5 == _pma_state->free_page_runs->length); - assert(0x10000 == _pma_state->free_page_runs->page); - assert(2 == 
_pma_state->free_page_runs->next->length); - assert(0x22000 == _pma_state->free_page_runs->next->page); - assert(6 == _pma_state->free_page_runs->next->next->length); - assert(0x30000 == _pma_state->free_page_runs->next->next->page); - assert(NULL == _pma_state->free_page_runs->next->next->next); - - // Test 3: find run bigger than requested, by two pages, at the very end - _pma_state->free_page_runs = test_3_cache; - address = _pma_get_cached_pages(2); - assert(0x30000 == address); - assert(6 == _pma_state->free_page_runs->length); - assert(0x10000 == _pma_state->free_page_runs->page); - assert(5 == _pma_state->free_page_runs->next->length); - assert(0x20000 == _pma_state->free_page_runs->next->page); - assert(2 == _pma_state->free_page_runs->next->next->length); - assert(0x32000 == _pma_state->free_page_runs->next->next->page); - assert(NULL == _pma_state->free_page_runs->next->next->next); - - // Test 4: find exactly sized run, as only entry in cache, and stop looking - _pma_state->free_page_runs = test_4_cache; - address = _pma_get_cached_pages(2); - assert(0x40000 == address); - assert(0x8fffffffffffffff == _pma_state->free_page_runs); - - // Test 5: find run bigger than request, by a single page - _pma_state->free_page_runs = test_5_cache; - address = _pma_get_cached_pages(2); - assert(0x50000 == address); - assert(1 == _pma_state->free_page_runs->length); - assert(0x99000 == _pma_state->free_page_runs->page); - assert(NULL == _pma_state->free_page_runs->next); - assert(0x52000 == _pma_state->free_pages->page); - assert(NULL == _pma_state->free_pages->next); - - // Clean up - while (test_1_cache != NULL) { - wip_ptr = test_1_cache; - test_1_cache = test_1_cache->next; - free(wip_ptr); - } - while (test_2_cache != NULL) { - wip_ptr = test_2_cache; - test_2_cache = test_2_cache->next; - free(wip_ptr); - } - while (test_3_cache != NULL) { - wip_ptr = test_3_cache; - test_3_cache = test_3_cache->next; - free(wip_ptr); - } - free(_pma_state->free_pages); - free(_pma_state->free_page_runs); - _pma_state_free(); -} - -void -test_pma_malloc_single_page(void) { - PMASinglePageCache *wip_ptr; - - // Set up state - _pma_state_malloc(); - - // Set up free page cache - wip_ptr = calloc(1, sizeof(PMASinglePageCache)); - wip_ptr->page = 0x20000; - wip_ptr->next = NULL; - _pma_state->free_pages = wip_ptr; - wip_ptr = calloc(1, sizeof(PMASinglePageCache)); - wip_ptr->page = 0x10000; - wip_ptr->next = _pma_state->free_pages; - _pma_state->free_pages = wip_ptr; - - // Test 1: get page from free page cache - assert(0x10000 == _pma_malloc_single_page(FIRST)); - assert(0x20000 == _pma_state->free_pages->page); - assert(NULL == _pma_state->free_pages->next); - - // Case when no pages in free page cache tested by test_pma_get_new_page - - // Clean up - free(_pma_state->free_pages); - _pma_state_free(); -} - -void -test_pma_malloc_shared_page(void) { - PMASinglePageCache *free_pages; - PMASinglePageCache *wip_ptr; - const uint64_t mmap_size = 2 * PMA_PAGE_SIZE; - const uint8_t test_1_bucket_size = 0; - const uint8_t test_2_bucket_size = 0; - const uint8_t test_3_bucket_size = 6; - void *shared_pages; - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - _pma_state->metadata->snapshot_size = PMA_PAGE_SIZE; - _pma_state->metadata->next_offset = PMA_PAGE_SIZE; - _pma_state->free_pages = NULL; - - _pma_state->metadata->dpage_cache = calloc(1, sizeof(PMADPageCache)); - _pma_state->metadata->dpage_cache->size = 0; - - // Set up shared pages - shared_pages = mmap( - 
PMA_SNAPSHOT_ADDR, - mmap_size, - PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, - -1, - 0); - assert(MAP_FAILED != shared_pages); - - // Set up free page cache - wip_ptr = calloc(1, sizeof(PMASinglePageCache)); - wip_ptr->page = (shared_pages + PMA_PAGE_SIZE); - wip_ptr->next = NULL; - free_pages = wip_ptr; - wip_ptr = calloc(1, sizeof(PMASinglePageCache)); - wip_ptr->page = shared_pages; - wip_ptr->next = free_pages; - free_pages = wip_ptr; - - // Test 1: could not allocate page - assert(-1 == _pma_malloc_shared_page(test_1_bucket_size)); - - // Test 2: 16 byte slots - _pma_state->free_pages = free_pages; - assert(0 == _pma_malloc_shared_page(test_2_bucket_size)); - assert(NULL != _pma_state->metadata->shared_pages[test_2_bucket_size]); - assert(1 == _pma_state->metadata->shared_pages[test_2_bucket_size]->dirty); - assert(4 == _pma_state->metadata->shared_pages[test_2_bucket_size]->size); - assert(253 == _pma_state->metadata->shared_pages[test_2_bucket_size]->free); - for (uint8_t i = 0; i < PMA_BITMAP_SIZE; ++i) { - assert(PMA_EMPTY_BITMAP == _pma_state->metadata->shared_pages[test_2_bucket_size]->bits[i]); - } - assert(NULL != _pma_state->free_pages); - assert((shared_pages + PMA_PAGE_SIZE) == _pma_state->free_pages->page); - assert(NULL == _pma_state->free_pages->next); - - // Test 3: 1024 byte slots - assert(0 == _pma_malloc_shared_page(test_3_bucket_size)); - assert(NULL != _pma_state->metadata->shared_pages[test_3_bucket_size]); - assert(1 == _pma_state->metadata->shared_pages[test_3_bucket_size]->dirty); - assert(10 == _pma_state->metadata->shared_pages[test_3_bucket_size]->size); - assert(3 == _pma_state->metadata->shared_pages[test_3_bucket_size]->free); - for (uint8_t i = 0; i < PMA_BITMAP_SIZE; ++i) { - assert(PMA_EMPTY_BITMAP == _pma_state->metadata->shared_pages[test_3_bucket_size]->bits[i]); - } - assert(NULL == _pma_state->free_pages); - - // Clean up - munmap(shared_pages, mmap_size); - _pma_state_free(); -} - -void -test_pma_update_free_pages(void) { - PMADirtyPageEntry test_1_dirty_pages[2]; - PMADirtyPageEntry test_2_dirty_page; - PMADirtyPageEntry test_3_dirty_page; - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - - // Set up dirty pages - test_1_dirty_pages[0].index = 1; - test_1_dirty_pages[0].num_pages = 1; - test_1_dirty_pages[0].status = SHARED; - test_1_dirty_pages[1].index = 1; - test_1_dirty_pages[1].num_pages = 2; - test_1_dirty_pages[1].status = FIRST; - - test_2_dirty_page.index = 2; - test_2_dirty_page.num_pages = 1; - test_2_dirty_page.status = FREE; - - test_3_dirty_page.index = 3; - test_3_dirty_page.num_pages = 2; - test_3_dirty_page.status = FREE; - - // Test 1: all dirty pages have non-free status - assert(0 == _pma_update_free_pages(2, test_1_dirty_pages)); - assert(NULL == _pma_state->free_pages); - assert(NULL == _pma_state->free_page_runs); - - // Test 2: add single page to free page cache - assert(0 == _pma_update_free_pages(1, &test_2_dirty_page)); - assert(NULL != _pma_state->free_pages); - assert(INDEX_TO_PTR(2) == _pma_state->free_pages->page); - assert(NULL == _pma_state->free_pages->next); - - // Test 3: add multiple free pages to free page runs cache - assert(0 == _pma_update_free_pages(1, &test_3_dirty_page)); - assert(NULL != _pma_state->free_page_runs); - assert(INDEX_TO_PTR(3) == _pma_state->free_page_runs->page); - assert(2 == _pma_state->free_page_runs->length); - assert(NULL == _pma_state->free_page_runs->next); - - // Clean up - free(_pma_state->free_pages); - 
free(_pma_state->free_page_runs); - _pma_state_free(); -} - -void -test_pma_verify_checksum(void) { - PMAMetadata fake_metadata_page; - - // Set up state - _pma_state_malloc(); - - // Test 1: good checksum - fake_metadata_page.checksum = 0; - fake_metadata_page.checksum = crc_32( - (unsigned char *)(&fake_metadata_page), - PMA_PAGE_SIZE); - assert(1 == _pma_verify_checksum(&fake_metadata_page)); - - // Test 2: bad checksum - fake_metadata_page.checksum = 0xbaddecaf; - assert(0 == _pma_verify_checksum(&fake_metadata_page)); - - // Clean up - _pma_state_free(); -} - -void -test_pma_in_arena(void) { - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = 0x7fffffff; - _pma_state->metadata->arena_end = 0x80000001; - - // Test 1: before arena start - assert(0 == pma_in_arena(0x10000000)); - - // Test 2: equal to arena start - assert(1 == pma_in_arena(0x7fffffff)); - - // Test 3: in arena - assert(1 == pma_in_arena(0x80000000)); - - // Test 4: equal to arena end - assert(0 == pma_in_arena(0x80000001)); - - // Test 5: after arena end - assert(0 == pma_in_arena(0xffffffff)); - - // Clean up - _pma_state_free(); -} - -void -test_pma_init(void) { - struct stat page_dir_statbuf; - struct stat page_dir_statbuf_v; - struct stat snapshot_statbuf; - struct stat snapshot_statbuf_v; - size_t dir_len; - uint32_t checksum; - char *page_dir_path; - char *snapshot_path; - - // Set up - dir_len = strlen(_test_state->dir); - - page_dir_path = malloc(dir_len + 15); - sprintf(page_dir_path, "%s/%s/%s", _test_state->dir, PMA_DEFAULT_DIR_NAME, PMA_PAGE_DIR_FILENAME); - - snapshot_path = malloc(dir_len + 15); - sprintf(snapshot_path, "%s/%s/%s", _test_state->dir, PMA_DEFAULT_DIR_NAME, PMA_SNAPSHOT_FILENAME); - - // Test 1: successful initialization - assert(0 == pma_init(_test_state->dir)); - - fstat(_pma_state->page_dir_fd, &page_dir_statbuf); - stat(page_dir_path, &page_dir_statbuf_v); - assert(page_dir_statbuf_v.st_dev == page_dir_statbuf.st_dev); - assert(page_dir_statbuf_v.st_ino == page_dir_statbuf.st_ino); - - fstat(_pma_state->snapshot_fd, &snapshot_statbuf); - stat(snapshot_path, &snapshot_statbuf_v); - assert(snapshot_statbuf_v.st_dev == snapshot_statbuf.st_dev); - assert(snapshot_statbuf_v.st_ino == snapshot_statbuf.st_ino); - - assert(0x400000 == page_dir_statbuf.st_size); - assert(0x40000000 == snapshot_statbuf.st_size); - - assert(NULL == _pma_state->free_pages); - assert(NULL == _pma_state->free_page_runs); - assert(0 == _pma_state->meta_page_offset); - - assert(0x400000 == _pma_state->page_directory.size); - assert(1 == _pma_state->page_directory.next_index); - assert(FIRST == _pma_state->page_directory.entries[0].status); - assert(8192 == _pma_state->page_directory.entries[0].offset); - - assert(0xBADDECAFC0FFEE00 == _pma_state->metadata->magic_code); - assert(1 == _pma_state->metadata->version); - assert(0 == _pma_state->metadata->epoch); - assert(0 == _pma_state->metadata->event); - assert(0 == _pma_state->metadata->root); - assert(0x10000 == _pma_state->metadata->arena_start); - assert(0x11000 == _pma_state->metadata->arena_end); - assert(12288 == _pma_state->metadata->next_offset); - assert(0x10000 == _pma_state->metadata->dpage_cache); - assert(0 == _pma_state->metadata->dpage_cache->dirty); - assert(0 == _pma_state->metadata->dpage_cache->size); - assert(0 == _pma_state->metadata->dpage_cache->head); - assert(0 == _pma_state->metadata->dpage_cache->tail); - assert(0 == _pma_state->metadata->num_dirty_pages); - assert(0 == _pma_state->metadata->dirty_pages[0].index); - 
assert(0 == _pma_state->metadata->dirty_pages[0].offset); - assert(0 == _pma_state->metadata->dirty_pages[0].num_pages); - - checksum = _pma_state->metadata->checksum; - _pma_state->metadata->checksum = 0; - assert(checksum == crc_32((unsigned char*)_pma_state->metadata, PMA_PAGE_SIZE)); - - // Clean up - munmap(_pma_state->metadata->arena_start, _pma_state->metadata->snapshot_size); - munmap(_pma_state->page_directory.entries, PMA_MAXIMUM_DIR_SIZE); - - _pma_state_free(); - - unlink(snapshot_path); - free(snapshot_path); - - unlink(page_dir_path); - free(page_dir_path); -} - -void -test_pma_sync(void) { - PMAMetadata *metadata_page_1; - PMAMetadata *metadata_page_2; - PMASharedPageHeader *shared_page_16b; - size_t dir_len; - char *page_dir_path; - char *snapshot_path; - - // Set up - dir_len = strlen(_test_state->dir); - - page_dir_path = malloc(dir_len + 15); - sprintf(page_dir_path, "%s/%s/%s", _test_state->dir, PMA_DEFAULT_DIR_NAME, PMA_PAGE_DIR_FILENAME); - - snapshot_path = malloc(dir_len + 15); - sprintf(snapshot_path, "%s/%s/%s", _test_state->dir, PMA_DEFAULT_DIR_NAME, PMA_SNAPSHOT_FILENAME); - - pma_init(_test_state->dir); - _pma_state->metadata->epoch = 1; - _pma_state->metadata->event = 1; - - // Test 1: good event, bad epoch - assert(-1 == pma_sync(0, 2, 0)); - - // Test 2: good epoch, bad event - assert(-1 == pma_sync(1, 0, 0)); - - // Test 3: successful sync - _pma_state->metadata->epoch = 0; - _pma_state->metadata->event = 0; - - pma_malloc(16); - assert(1 == _pma_state->metadata->num_dirty_pages); - - assert(0 == pma_sync(1, 2, 3)); - assert(1 == _pma_state->metadata->epoch); - assert(2 == _pma_state->metadata->event); - assert(3 == _pma_state->metadata->root); - assert(0x12000 == _pma_state->metadata->arena_end); - assert(0x11000 == _pma_state->metadata->shared_pages[0]); - assert(NULL == _pma_state->metadata->shared_pages[1]); - assert(NULL == _pma_state->metadata->shared_pages[2]); - assert(NULL == _pma_state->metadata->shared_pages[3]); - assert(NULL == _pma_state->metadata->shared_pages[4]); - assert(NULL == _pma_state->metadata->shared_pages[5]); - assert(NULL == _pma_state->metadata->shared_pages[6]); - assert(0x10000 == _pma_state->metadata->dpage_cache); - assert(0 == _pma_state->metadata->num_dirty_pages); - assert(16384 == _pma_state->metadata->next_offset); - - metadata_page_1 = mmap( - NULL, - PMA_PAGE_SIZE, - PROT_READ, - MAP_SHARED, - _pma_state->snapshot_fd, - 0); - metadata_page_2 = mmap( - NULL, - PMA_PAGE_SIZE, - PROT_READ, - MAP_SHARED, - _pma_state->snapshot_fd, - 4096); - shared_page_16b = mmap( - NULL, - PMA_PAGE_SIZE, - PROT_READ, - MAP_SHARED, - _pma_state->snapshot_fd, - 12288); - - assert(metadata_page_1->magic_code == _pma_state->metadata->magic_code); - assert(metadata_page_1->checksum == _pma_state->metadata->checksum); - assert(metadata_page_1->version == _pma_state->metadata->version); - assert(metadata_page_1->epoch == _pma_state->metadata->epoch); - assert(metadata_page_1->event == _pma_state->metadata->event); - assert(metadata_page_1->root == _pma_state->metadata->root); - assert(metadata_page_1->arena_start == _pma_state->metadata->arena_start); - assert(metadata_page_1->arena_end == _pma_state->metadata->arena_end); - assert(metadata_page_1->dpage_cache == _pma_state->metadata->dpage_cache); - assert(metadata_page_1->snapshot_size == _pma_state->metadata->snapshot_size); - assert(metadata_page_1->next_offset == _pma_state->metadata->next_offset); - - assert(1 == metadata_page_1->num_dirty_pages); - assert(1 == 
metadata_page_1->dirty_pages[0].index); - assert(12288 == metadata_page_1->dirty_pages[0].offset); - assert(1 == metadata_page_1->dirty_pages[0].num_pages); - assert(SHARED == metadata_page_1->dirty_pages[0].status); - - assert(0 == metadata_page_2->epoch); - assert(0 == metadata_page_2->event); - assert(0 == metadata_page_2->root); - assert(0x11000 == metadata_page_2->arena_end); - assert(NULL == metadata_page_2->shared_pages[0]); - assert(NULL == metadata_page_2->shared_pages[1]); - assert(NULL == metadata_page_2->shared_pages[2]); - assert(NULL == metadata_page_2->shared_pages[3]); - assert(NULL == metadata_page_2->shared_pages[4]); - assert(NULL == metadata_page_2->shared_pages[5]); - assert(NULL == metadata_page_2->shared_pages[6]); - assert(0x10000 == metadata_page_2->dpage_cache); - assert(0 == metadata_page_2->num_dirty_pages); - assert(12288 == metadata_page_2->next_offset); - - assert(NULL == shared_page_16b->next); - assert(0 == shared_page_16b->dirty); - assert(4 == shared_page_16b->size); - assert(252 == shared_page_16b->free); - - // Clean up - munmap(metadata_page_1, PMA_PAGE_SIZE); - munmap(metadata_page_2, PMA_PAGE_SIZE); - - munmap(_pma_state->metadata->arena_start, _pma_state->metadata->snapshot_size); - munmap(_pma_state->page_directory.entries, PMA_MAXIMUM_DIR_SIZE); - - _pma_state_free(); - - unlink(snapshot_path); - free(snapshot_path); - - unlink(page_dir_path); - free(page_dir_path); -} - -void -test_pma_load(void) { - PMARootState res; - size_t dir_len; - const uint64_t bad_code = 0x600DDECAFC0FFEE0; - const uint64_t old_event = 0; - const uint32_t bad_checksum = 0; - const uint32_t bad_version = 1337; - int snapshot_fd; - char *bin_path; - char *page_dir_path; - char *snapshot_path; - - // Set up - dir_len = strlen(_test_state->dir); - - bin_path = malloc(dir_len + 6); - sprintf(bin_path, "%s/%s", _test_state->dir, PMA_DEFAULT_DIR_NAME); - - page_dir_path = malloc(dir_len + 15); - sprintf(page_dir_path, "%s/%s", bin_path, PMA_PAGE_DIR_FILENAME); - - snapshot_path = malloc(dir_len + 15); - sprintf(snapshot_path, "%s/%s", bin_path, PMA_SNAPSHOT_FILENAME); - - // Test 1: dir doesn't exist - rmdir(bin_path); - res = pma_load(_test_state->dir); - assert(0 == res.epoch); - assert(0 == res.event); - assert(0 == res.root); - assert(2 == errno); - errno = 0; - - // Test 2: snapshot doesn't exist - pma_init(_test_state->dir); - assert(0 == pma_close(0, 1, 0)); - unlink(snapshot_path); - - res = pma_load(_test_state->dir); - assert(0 == res.epoch); - assert(0 == res.event); - assert(0 == res.root); - assert(2 == errno); - - errno = 0; - _pma_state_free(); - unlink(page_dir_path); - - // Test 3: page directory doesn't exist - pma_init(_test_state->dir); - assert(0 == pma_close(0, 1, 0)); - unlink(page_dir_path); - - res = pma_load(_test_state->dir); - assert(0 == res.epoch); - assert(0 == res.event); - assert(0 == res.root); - assert(2 == errno); - - errno = 0; - _pma_state_free(); - unlink(snapshot_path); - - // Test 4: bad magic code - pma_init(_test_state->dir); - assert(0 == pma_close(0, 1, 0)); - snapshot_fd = open(snapshot_path, PMA_NEW_FILE_FLAGS, PMA_FILE_PERMISSIONS); - assert(0 < snapshot_fd); - pwrite(snapshot_fd, &bad_code, 8, 0); - - res = pma_load(_test_state->dir); - assert(0 == res.epoch); - assert(0 == res.event); - assert(0 == res.root); - assert(EILSEQ == errno); - - errno = 0; - close(snapshot_fd); - unlink(snapshot_path); - unlink(page_dir_path); - - // Test 5: bad version - pma_init(_test_state->dir); - assert(0 == pma_close(0, 1, 0)); - snapshot_fd = 
open(snapshot_path, PMA_NEW_FILE_FLAGS, PMA_FILE_PERMISSIONS); - assert(0 < snapshot_fd); - pwrite(snapshot_fd, &bad_version, 4, 12); - - res = pma_load(_test_state->dir); - assert(0 == res.epoch); - assert(0 == res.event); - assert(0 == res.root); - assert(EILSEQ == errno); - - errno = 0; - close(snapshot_fd); - unlink(snapshot_path); - unlink(page_dir_path); - - // Test 6: both metadata pages have invalid checksum - pma_init(_test_state->dir); - assert(0 == pma_close(0, 1, 0)); - snapshot_fd = open(snapshot_path, PMA_NEW_FILE_FLAGS, PMA_FILE_PERMISSIONS); - assert(0 < snapshot_fd); - pwrite(snapshot_fd, &bad_checksum, 4, 8); - pwrite(snapshot_fd, &bad_checksum, 4, (PMA_PAGE_SIZE + 8)); - - res = pma_load(_test_state->dir); - assert(0 == res.epoch); - assert(0 == res.event); - assert(0 == res.root); - assert(EILSEQ == errno); - - errno = 0; - close(snapshot_fd); - unlink(snapshot_path); - unlink(page_dir_path); - - // Test 7: first metadata page is newer but has bad checksum - pma_init(_test_state->dir); - assert(0 == pma_close(1, 2, 3)); - snapshot_fd = open(snapshot_path, PMA_NEW_FILE_FLAGS, PMA_FILE_PERMISSIONS); - assert(0 < snapshot_fd); - pwrite(snapshot_fd, &bad_checksum, 4, 8); - pwrite(snapshot_fd, &old_event, 8, (PMA_PAGE_SIZE + 24)); - - res = pma_load(_test_state->dir); - assert(0 == res.epoch); - assert(0 == res.event); - assert(0 == res.root); - assert(0 == _pma_state->meta_page_offset); - - assert(0 == pma_close(4, 4, 4)); - close(snapshot_fd); - unlink(snapshot_path); - unlink(page_dir_path); - - // Test 8: second metadata page is newer - pma_init(_test_state->dir); - assert(0 == pma_sync(1, 2, 3)); - assert(0 == pma_close(4, 5, 6)); - - res = pma_load(_test_state->dir); - assert(4 == res.epoch); - assert(5 == res.event); - assert(6 == res.root); - assert(0 == _pma_state->meta_page_offset); - - assert(0 == pma_close(7, 8, 9)); - close(snapshot_fd); - unlink(snapshot_path); - unlink(page_dir_path); - - // Clean up - free(bin_path); - free(snapshot_path); - free(page_dir_path); -} diff --git a/rust/ares/src/serf.rs b/rust/ares/src/serf.rs index 89c70be..b553771 100644 --- a/rust/ares/src/serf.rs +++ b/rust/ares/src/serf.rs @@ -10,13 +10,16 @@ use crate::mem::NockStack; use crate::mug::*; use crate::newt::Newt; use crate::noun::{Atom, Cell, DirectAtom, Noun, Slots, D, T}; +use crate::persist::pma_meta_set; +use crate::persist::{pma_meta_get, pma_open, pma_sync, Persist}; use crate::trace::*; use ares_macros::tas; use signal_hook; use signal_hook::consts::SIGINT; use std::fs::create_dir_all; use std::io; -use std::path::{Path, PathBuf}; +use std::mem::size_of; +use std::path::PathBuf; use std::result::Result; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; @@ -26,6 +29,57 @@ crate::gdb!(); const FLAG_TRACE: u32 = 1 << 8; +#[repr(usize)] +enum BTMetaField { + SnapshotVersion = 0, + Snapshot = 1, +} +struct Snapshot(pub *mut SnapshotMem); + +impl Persist for Snapshot { + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { + let mut arvo = (*(self.0)).arvo; + let mut cold = (*(self.0)).cold; + let arvo_space_needed = arvo.space_needed(stack); + let cold_space_needed = cold.space_needed(stack); + (((size_of::() + 7) >> 3) << 3) + arvo_space_needed + cold_space_needed + } + + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { + let snapshot_buffer = *buffer as *mut SnapshotMem; + std::ptr::copy_nonoverlapping(self.0, snapshot_buffer, 1); + *self = Snapshot(snapshot_buffer); + *buffer = snapshot_buffer.add(1) as *mut 
u8; + + let mut arvo = (*snapshot_buffer).arvo; + arvo.copy_to_buffer(stack, buffer); + (*snapshot_buffer).arvo = arvo; + + let mut cold = (*snapshot_buffer).cold; + cold.copy_to_buffer(stack, buffer); + (*snapshot_buffer).cold = cold; + } + + unsafe fn handle_to_u64(&self) -> u64 { + self.0 as u64 + } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + Snapshot(meta_handle as *mut SnapshotMem) + } +} + +#[repr(C)] +#[repr(packed)] +struct SnapshotMem { + pub epoch: u64, + pub event_num: u64, + pub arvo: Noun, + pub cold: Cold, +} + +const PMA_CURRENT_SNAPSHOT_VERSION: u64 = 1; + struct Context { epoch: u64, event_num: u64, @@ -35,27 +89,87 @@ struct Context { } impl Context { - pub fn new( - _snap_path: &Path, + pub fn load( + snap_path: PathBuf, trace_info: Option, constant_hot_state: &[HotEntry], + ) -> Context { + pma_open(snap_path).expect("serf: pma open failed"); + + let snapshot_version = pma_meta_get(BTMetaField::SnapshotVersion as usize); + + let snapshot = match snapshot_version { + 0 => None, + 1 => Some(unsafe { + Snapshot::handle_from_u64(pma_meta_get(BTMetaField::Snapshot as usize)) + }), + _ => panic!("Unsupported snapshot version"), + }; + + Context::new(trace_info, snapshot, constant_hot_state) + } + + pub unsafe fn save(&mut self) { + let handle = { + let mut snapshot = Snapshot({ + let snapshot_mem_ptr: *mut SnapshotMem = self.nock_context.stack.struct_alloc(1); + + // Save into PMA (does not sync) + (*snapshot_mem_ptr).epoch = self.epoch; + (*snapshot_mem_ptr).event_num = self.event_num; + (*snapshot_mem_ptr).arvo = self.arvo; + (*snapshot_mem_ptr).cold = self.nock_context.cold; + snapshot_mem_ptr + }); + + let handle = snapshot.save_to_pma(&mut self.nock_context.stack); + + self.epoch = (*snapshot.0).epoch; + self.arvo = (*snapshot.0).arvo; + self.event_num = (*snapshot.0).event_num; + self.nock_context.cold = (*snapshot.0).cold; + + handle + }; + pma_meta_set( + BTMetaField::SnapshotVersion as usize, + PMA_CURRENT_SNAPSHOT_VERSION, + ); + pma_meta_set(BTMetaField::Snapshot as usize, handle); + } + + fn new( + trace_info: Option, + snapshot: Option, + constant_hot_state: &[HotEntry], ) -> Self { - // TODO: switch to Pma when ready - let mut stack = NockStack::new(512 << 10 << 10, 0); + let mut stack = NockStack::new(1024 << 10 << 10, 0); + let newt = Newt::new(); + let cache = Hamt::::new(&mut stack); + + let (epoch, event_num, arvo, mut cold) = unsafe { + match snapshot { + Some(snapshot) => ( + (*(snapshot.0)).epoch, + (*(snapshot.0)).event_num, + (*(snapshot.0)).arvo, + (*(snapshot.0)).cold, + ), + None => (0, 0, D(0), Cold::new(&mut stack)), + } + }; - let cold = Cold::new(&mut stack); let hot = Hot::init(&mut stack, constant_hot_state); - - let (epoch, event_num, arvo) = (0, 0, D(0)); + let warm = Warm::init(&mut stack, &mut cold, &hot); let mug = mug_u32(&mut stack, arvo); let nock_context = interpreter::Context { stack, - newt: Newt::new(), + newt, cold, - warm: Warm::new(), + warm, hot, - cache: Hamt::::new(), + cache, scry_stack: D(0), trace_info, }; @@ -73,20 +187,35 @@ impl Context { // Setters // - pub fn event_update(&mut self, new_event_num: u64, new_arvo: Noun) { + /// + /// ## Safety + /// + /// calls save(), which invalidates all nouns not in the context + /// until [preserve_event_update_leftovers] is called to resolve forwarding pointers. 
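The `Persist` implementation above follows a simple contract: `space_needed` returns a conservative byte count with struct sizes rounded up to 8-byte words (the `((size_of::<T>() + 7) >> 3) << 3` idiom), and `copy_to_buffer` copies the object to the caller's cursor and then advances that cursor for whatever gets copied next. A minimal, hedged stand-in for that pattern; `Pair`, `word_aligned`, and `copy_pair_to_buffer` are illustrative names, not the crate's API:

```rust
use std::mem::size_of;

/// Round a byte count up to a whole number of 8-byte words, matching the
/// ((size + 7) >> 3) << 3 expression in Snapshot::space_needed above.
fn word_aligned(bytes: usize) -> usize {
    ((bytes + 7) >> 3) << 3
}

#[repr(C)]
#[derive(Clone, Copy)]
struct Pair {
    a: u64,
    b: u32,
}

/// Copy `val` to the front of the buffer and advance the cursor by the
/// word-aligned size so the next object stays 8-byte aligned.
unsafe fn copy_pair_to_buffer(val: &Pair, buffer: &mut *mut u8) -> *mut Pair {
    let dest = *buffer as *mut Pair;
    std::ptr::copy_nonoverlapping(val as *const Pair, dest, 1);
    *buffer = (*buffer).add(word_aligned(size_of::<Pair>()));
    dest
}
```

`Snapshot::copy_to_buffer` above does the same dance for `SnapshotMem` and then recurses into the `arvo` and `cold` fields with the same cursor, so the whole snapshot ends up in one contiguous PMA allocation sized by `space_needed`.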
+ pub unsafe fn event_update(&mut self, new_event_num: u64, new_arvo: Noun) { // XX: assert event numbers are continuous self.arvo = new_arvo; self.event_num = new_event_num; + self.save(); + + self.nock_context.cache = Hamt::new(&mut self.nock_context.stack); + self.nock_context.scry_stack = D(0); + + // XX save to PMA self.mug = mug_u32(&mut self.nock_context.stack, self.arvo); } - // - // Snapshot functions - // - - pub fn sync(&mut self) { - // TODO actually sync - eprintln!("serf: TODO sync"); + /// + /// ## Safety + /// + /// Preserves nouns and jet states in context and then calls [flip_top_frame]. + /// Other stack-allocated objects needing preservation should be preserved between + /// [event_update] and invocation of this function + pub unsafe fn preserve_event_update_leftovers(&mut self) { + let stack = &mut self.nock_context.stack; + stack.preserve(&mut self.nock_context.warm); + stack.preserve(&mut self.nock_context.hot); + stack.flip_top_frame(0); } // @@ -208,13 +337,13 @@ pub fn serf(constant_hot_state: &[HotEntry]) -> io::Result<()> { } } - let mut context = Context::new(&snap_path, trace_info, constant_hot_state); + let mut context = Context::load(snap_path, trace_info, constant_hot_state); context.ripe(); // Can't use for loop because it borrows newt while let Some(writ) = context.next() { // Reset the local cache and scry handler stack - context.nock_context.cache = Hamt::::new(); + context.nock_context.cache = Hamt::::new(&mut context.nock_context.stack); context.nock_context.scry_stack = D(0); let tag = slot(writ, 2)?.as_direct().unwrap(); @@ -229,8 +358,7 @@ pub fn serf(constant_hot_state: &[HotEntry]) -> io::Result<()> { } tas!(b"save") => { // XX what is eve for? - eprintln!("\r %save"); - context.sync(); + pma_sync(); } tas!(b"meld") => eprintln!("\r %meld: not implemented"), tas!(b"pack") => eprintln!("\r %pack: not implemented"), @@ -261,18 +389,6 @@ pub fn serf(constant_hot_state: &[HotEntry]) -> io::Result<()> { }; clear_interrupt(); - - // Persist data that should survive between events - // XX: Such data should go in the PMA once that's available, except - // the warm and hot state which should survive between events but not interpreter runs - unsafe { - let stack = &mut context.nock_context.stack; - stack.preserve(&mut context.arvo); - stack.preserve(&mut context.nock_context.cold); - stack.preserve(&mut context.nock_context.warm); - stack.preserve(&mut context.nock_context.hot); - stack.flip_top_frame(0); - } } Ok(()) @@ -365,7 +481,10 @@ fn play_life(context: &mut Context, eve: Noun) { let eved = lent(eve).expect("serf: play: boot event number failure") as u64; let arvo = slot(gat, 7).expect("serf: play: lifecycle didn't return initial Arvo"); - context.event_update(eved, arvo); + unsafe { + context.event_update(eved, arvo); + context.preserve_event_update_leftovers(); + } context.play_done(); } Err(error) => match error { @@ -384,6 +503,7 @@ fn play_list(context: &mut Context, mut lit: Noun) { let mut eve = context.event_num; while let Ok(cell) = lit.as_cell() { let ovo = cell.head(); + lit = cell.tail(); let trace_name = if context.nock_context.trace_info.is_some() { Some(format!("play [{}]", eve)) } else { @@ -398,13 +518,16 @@ fn play_list(context: &mut Context, mut lit: Noun) { .tail(); eve += 1; - context.event_update(eve, arvo); + unsafe { + context.event_update(eve, arvo); + context.nock_context.stack.preserve(&mut lit); + context.preserve_event_update_leftovers(); + } } Err(goof) => { return context.play_bail(goof); } } - lit = cell.tail(); } 
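+    // Every event in the list has been played; report completion.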
context.play_done(); } @@ -427,10 +550,14 @@ fn work(context: &mut Context, job: Noun) { match soft(context, job, trace_name) { Ok(res) => { let cell = res.as_cell().expect("serf: work: +slam returned atom"); - let fec = cell.head(); + let mut fec = cell.head(); let eve = context.event_num; - context.event_update(eve + 1, cell.tail()); + unsafe { + context.event_update(eve + 1, cell.tail()); + context.nock_context.stack.preserve(&mut fec); + context.preserve_event_update_leftovers(); + } context.work_done(fec); } Err(goof) => { @@ -447,14 +574,14 @@ fn work_swap(context: &mut Context, job: Noun, goof: Noun) { clear_interrupt(); let stack = &mut context.nock_context.stack; - context.nock_context.cache = Hamt::::new(); + context.nock_context.cache = Hamt::::new(stack); // crud ovo = [+(now) [%$ %arvo ~] [%crud goof ovo]] let job_cell = job.as_cell().expect("serf: work: job not a cell"); let job_now = job_cell.head().as_atom().expect("serf: work: now not atom"); let now = inc(stack, job_now).as_noun(); let wire = T(stack, &[D(0), D(tas!(b"arvo")), D(0)]); let crud = DirectAtom::new_panic(tas!(b"crud")); - let ovo = T(stack, &[now, wire, crud.as_noun(), goof, job_cell.tail()]); + let mut ovo = T(stack, &[now, wire, crud.as_noun(), goof, job_cell.tail()]); let trace_name = if context.nock_context.trace_info.is_some() { Some(work_trace_name( &mut context.nock_context.stack, @@ -468,10 +595,15 @@ fn work_swap(context: &mut Context, job: Noun, goof: Noun) { match soft(context, ovo, trace_name) { Ok(res) => { let cell = res.as_cell().expect("serf: work: crud +slam returned atom"); - let fec = cell.head(); + let mut fec = cell.head(); let eve = context.event_num; - context.event_update(eve + 1, cell.tail()); + unsafe { + context.event_update(eve + 1, cell.tail()); + context.nock_context.stack.preserve(&mut ovo); + context.nock_context.stack.preserve(&mut fec); + context.preserve_event_update_leftovers(); + } context.work_swap(ovo, fec); } Err(goof_crud) => { diff --git a/rust/ares/src/unifying_equality.rs b/rust/ares/src/unifying_equality.rs new file mode 100644 index 0000000..267a2f0 --- /dev/null +++ b/rust/ares/src/unifying_equality.rs @@ -0,0 +1,254 @@ +use crate::assert_acyclic; +use crate::assert_no_forwarding_pointers; +use crate::assert_no_junior_pointers; +use crate::mem::{NockStack, ALLOC, FRAME, STACK}; +use crate::noun::Noun; +use crate::persist::{pma_contains, pma_dirty}; +use either::Either::*; +use libc::{c_void, memcmp}; + +#[cfg(feature = "check_junior")] +#[macro_export] +macro_rules! assert_no_junior_pointers { + ( $x:expr, $y:expr ) => { + assert_no_alloc::permit_alloc(|| { + assert!($x.no_junior_pointers($y)); + }) + }; +} + +#[cfg(not(feature = "check_junior"))] +#[macro_export] +macro_rules! assert_no_junior_pointers { + ( $x:expr, $y:expr ) => {}; +} + +pub unsafe fn unifying_equality(stack: &mut NockStack, a: *mut Noun, b: *mut Noun) -> bool { + /* This version of unifying equality is not like that of vere. + * Vere does a tree comparison (accelerated by pointer equality and short-circuited by mug + * equality) and then unifies the nouns at the top level if they are equal. + * + * Here we recursively attempt to unify nouns. Pointer-equal nouns are already unified. + * Disequal mugs again short-circuit the unification and equality check. + * + * Since we expect atoms to be normalized, direct and indirect atoms do not unify with each + * other. For direct atoms, no unification is possible as there is no pointer involved in their + * representation. 
Equality is simply direct equality on the word representation. Indirect + * atoms require equality first of the size and then of the memory buffers' contents. + * + * Cell equality is tested (after mug and pointer equality) by attempting to unify the heads and tails, + * respectively, of cells, and then re-testing. If unification succeeds then the heads and + * tails will be pointer-wise equal and the cell itself can be unified. A failed unification of + * the head or the tail will already short-circuit the unification/equality test, so we will + * not return to re-test the pointer equality. + * + * When actually mutating references for unification, we must be careful to respect seniority. + * A reference to a more junior noun should always be replaced with a reference to a more + * senior noun, *never vice versa*, to avoid introducing references from more senior frames + * into more junior frames, which would result in incorrect operation of the copier. + */ + assert_acyclic!(*a); + assert_acyclic!(*b); + assert_no_forwarding_pointers!(*a); + assert_no_forwarding_pointers!(*b); + assert_no_junior_pointers!(stack, *a); + assert_no_junior_pointers!(stack, *b); + + // If the nouns are already word-equal we have nothing to do + if (*a).raw_equals(*b) { + return true; + }; + // If the nouns have cached mugs which are disequal we have nothing to do + if let (Ok(a_alloc), Ok(b_alloc)) = ((*a).as_allocated(), (*b).as_allocated()) { + if let (Some(a_mug), Some(b_mug)) = (a_alloc.get_cached_mug(), b_alloc.get_cached_mug()) { + if a_mug != b_mug { + return false; + }; + }; + }; + stack.frame_push(0); + *(stack.push::<(*mut Noun, *mut Noun)>()) = (a, b); + loop { + if stack.stack_is_empty() { + break; + }; + let (x, y): (*mut Noun, *mut Noun) = *(stack.top()); + if (*x).raw_equals(*y) { + stack.pop::<(*mut Noun, *mut Noun)>(); + continue; + }; + if let (Ok(x_alloc), Ok(y_alloc)) = ( + // equal direct atoms return true for raw_equals() + (*x).as_allocated(), + (*y).as_allocated(), + ) { + if let (Some(x_mug), Some(y_mug)) = (x_alloc.get_cached_mug(), y_alloc.get_cached_mug()) + { + if x_mug != y_mug { + break; // short-circuit, the mugs differ therefore the nouns must differ + } + }; + match (x_alloc.as_either(), y_alloc.as_either()) { + (Left(x_indirect), Left(y_indirect)) => { + let x_as_ptr = x_indirect.to_raw_pointer(); + let y_as_ptr = y_indirect.to_raw_pointer(); + if x_indirect.size() == y_indirect.size() + && memcmp( + x_indirect.data_pointer() as *const c_void, + y_indirect.data_pointer() as *const c_void, + x_indirect.size() << 3, + ) == 0 + { + let (_senior, junior) = senior_pointer_first(stack, x_as_ptr, y_as_ptr); + if x_as_ptr == junior { + if pma_contains(x, 1) { + pma_dirty(x, 1); + } + *x = *y; + } else { + if pma_contains(y, 1) { + pma_dirty(y, 1); + } + *y = *x; + } + stack.pop::<(*mut Noun, *mut Noun)>(); + continue; + } else { + break; + } + } + (Right(x_cell), Right(y_cell)) => { + let x_as_ptr = x_cell.to_raw_pointer() as *const u64; + let y_as_ptr = y_cell.to_raw_pointer() as *const u64; + if x_cell.head().raw_equals(y_cell.head()) + && x_cell.tail().raw_equals(y_cell.tail()) + { + let (_senior, junior) = senior_pointer_first(stack, x_as_ptr, y_as_ptr); + if x_as_ptr == junior { + if pma_contains(x, 1) { + pma_dirty(x, 1); + } + *x = *y; + } else { + if pma_contains(y, 1) { + pma_dirty(y, 1); + } + *y = *x; + } + stack.pop::<(*mut Noun, *mut Noun)>(); + continue; + } else { + /* THIS ISN'T AN INFINITE LOOP + * If we discover a disequality in either side, we will + * 
short-circuit the entire loop and reset the work stack. + * + * If both sides are equal, then we will discover pointer + * equality when we return and unify the cell. + */ + *(stack.push::<(*mut Noun, *mut Noun)>()) = + (x_cell.tail_as_mut(), y_cell.tail_as_mut()); + *(stack.push::<(*mut Noun, *mut Noun)>()) = + (x_cell.head_as_mut(), y_cell.head_as_mut()); + continue; + } + } + (_, _) => { + break; // cells don't unify with atoms + } + } + } else { + break; // direct atom not raw equal, so short circuit + } + } + stack.frame_pop(); + + assert_acyclic!(*a); + assert_acyclic!(*b); + assert_no_forwarding_pointers!(*a); + assert_no_forwarding_pointers!(*b); + assert_no_junior_pointers!(stack, *a); + assert_no_junior_pointers!(stack, *b); + + (*a).raw_equals(*b) +} + +unsafe fn senior_pointer_first( + stack: &NockStack, + a: *const u64, + b: *const u64, +) -> (*const u64, *const u64) { + let mut frame_pointer: *const u64 = stack.get_frame_pointer(); + let mut stack_pointer: *const u64 = stack.get_stack_pointer(); + let mut alloc_pointer: *const u64 = stack.get_alloc_pointer(); + let prev_stack_pointer = *(stack.prev_stack_pointer_pointer()); + + let (mut high_pointer, mut low_pointer): (*const u64, *const u64) = if stack.is_west() { + (prev_stack_pointer, alloc_pointer) + } else { + (alloc_pointer, prev_stack_pointer) + }; + + loop { + if low_pointer.is_null() || high_pointer.is_null() { + // we found the bottom of the stack; check entirety of the stack + low_pointer = stack.get_start(); + high_pointer = stack.get_start().add(stack.get_size()); + } + + match ( + a < high_pointer && a >= low_pointer, + b < high_pointer && b >= low_pointer, + ) { + (true, true) => { + // both pointers are in the same frame, pick arbitrarily (lower in mem) + break lower_pointer_first(a, b); + } + (true, false) => break (b, a), // a is in the frame, b is not, so b is senior + (false, true) => break (a, b), // b is in the frame, a is not, so a is senior + (false, false) => { + // chase up the stack + #[allow(clippy::comparison_chain)] + // test to see if the frame under consideration is a west frame + if stack_pointer < alloc_pointer { + stack_pointer = *(frame_pointer.sub(STACK + 1)) as *const u64; + alloc_pointer = *(frame_pointer.sub(ALLOC + 1)) as *const u64; + frame_pointer = *(frame_pointer.sub(FRAME + 1)) as *const u64; + + // both pointers are in the PMA, pick arbitrarily (lower in mem) + if frame_pointer.is_null() { + break lower_pointer_first(a, b); + }; + + // previous allocation pointer + high_pointer = alloc_pointer; + // "previous previous" stack pointer. this is the other boundary of the previous allocation arena + low_pointer = *(frame_pointer.add(STACK)) as *const u64; + } else if stack_pointer > alloc_pointer { + stack_pointer = *(frame_pointer.add(STACK)) as *const u64; + alloc_pointer = *(frame_pointer.add(ALLOC)) as *const u64; + frame_pointer = *(frame_pointer.add(FRAME)) as *const u64; + + // both pointers are in the PMA, pick arbitrarily (lower in mem) + if frame_pointer.is_null() { + break lower_pointer_first(a, b); + }; + + // previous allocation pointer + low_pointer = alloc_pointer; + // "previous previous" stack pointer. 
this is the other boundary of the previous allocation arena + high_pointer = *(frame_pointer.sub(STACK + 1)) as *const u64; + } else { + panic!("senior_pointer_first: stack_pointer == alloc_pointer"); + } + } + } + } +} + +fn lower_pointer_first(a: *const u64, b: *const u64) -> (*const u64, *const u64) { + if a < b { + (a, b) + } else { + (b, a) + } +} diff --git a/rust/ares_pma/Cargo.lock b/rust/ares_pma/Cargo.lock new file mode 100644 index 0000000..a4dfb7b --- /dev/null +++ b/rust/ares_pma/Cargo.lock @@ -0,0 +1,454 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + +[[package]] +name = "ares_pma" +version = "0.1.0" +dependencies = [ + "bindgen", + "cc", +] + +[[package]] +name = "bindgen" +version = "0.69.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn", + "which", +] + +[[package]] +name = "bitflags" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" + +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "libc", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clang-sys" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] +name = "errno" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + +[[package]] +name = "home" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb" +dependencies = [ + "windows-sys 0.48.0", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "libc" +version = "0.2.150" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" + +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "969488b55f8ac402214f3f5fd243ebb7206cf82de60d3172994707a4bcc2b829" + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + +[[package]] +name = "memchr" +version = "2.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "once_cell" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" + +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + +[[package]] +name = "prettyplease" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae005bd773ab59b4725093fd7df83fd7892f7d8eafb48dbd7de6e024e4215f9d" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39278fbbf5fb4f646ce651690877f89d1c5811a3d4acb27700c1cb3cdb78fd3b" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustix" +version = "0.38.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc99bc2d4f1fed22595588a013687477aedf3cdcfb26558c559edb67b4d9b22e" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.48.0", +] + +[[package]] +name = "shlex" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380" + +[[package]] +name = "syn" +version = "2.0.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23e78b90f2fcf45d3e842032ce32e3f2d1545ba6636271dcbf24fa306d87be7a" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.0", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + 
"windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" diff --git a/rust/ares_pma/Cargo.toml b/rust/ares_pma/Cargo.toml new file mode 100644 index 0000000..94612e4 --- /dev/null +++ b/rust/ares_pma/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "ares_pma" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] + +[build-dependencies] +bindgen = "0.69.1" +cc = "1.0" + +[features] +debug_prints = [] diff --git a/rust/ares_pma/build.rs 
b/rust/ares_pma/build.rs new file mode 100644 index 0000000..22ec4be --- /dev/null +++ b/rust/ares_pma/build.rs @@ -0,0 +1,83 @@ +extern crate bindgen; + +use std::env; +use std::path::PathBuf; + +fn main() { + let opt_level = env::var("OPT_LEVEL").unwrap(); + let define_debug = if env::var("CARGO_FEATURE_DEBUG_PRINTS").is_ok() { + "-DDEBUG" + } else { + "-UDEBUG" + }; + + // This is the directory where the `c` library is located. + let libdir_path = PathBuf::from("c-src") + // Canonicalize the path as `rustc-link-search` requires an absolute + // path. + .canonicalize() + .expect("cannot canonicalize path"); + let libdir_path_str = libdir_path.to_str().expect("Path is not a valid string"); + + // This is the path to the `c` headers file. + let headers_path = libdir_path.join("wrapper.h"); + let headers_path_str = headers_path.to_str().expect("Path is not a valid string"); + + println!("cargo:rerun-if-changed={}", libdir_path_str); + + let res = cc::Build::new() + .file( + libdir_path + .join("btree.c") + .to_str() + .expect("Path is not a valid string"), + ) + .file( + libdir_path + .join("lib") + .join("checksum.c") + .to_str() + .expect("Path is not a valid string"), + ) + .flag(format!("-O{}", opt_level).as_ref()) + .flag(define_debug) + .flag("-g3") + .flag("-Wall") + .flag("-Wextra") + .flag("-Wpedantic") + .flag("-Wformat=2") + .flag("-Wno-unused-parameter") + .flag("-Wshadow") + .flag("-Wwrite-strings") + .flag("-Wstrict-prototypes") + .flag("-Wold-style-definition") + .flag("-Wredundant-decls") + .flag("-Wnested-externs") + .flag("-Wmissing-include-dirs") + .try_compile("btree"); + + if let Err(err) = res { + panic!("{}", err); + } + + // The bindgen::Builder is the main entry point + // to bindgen, and lets you build up options for + // the resulting bindings. + let bindings = bindgen::Builder::default() + // The input header we would like to generate + // bindings for. + .header(headers_path_str) + // Tell cargo to invalidate the built crate whenever any of the + // included header files changed. + .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) + // Finish the builder and generate the bindings. + .generate() + // Unwrap the Result and panic on failure. + .expect("Unable to generate bindings"); + + // Write the bindings to the $OUT_DIR/bindings.rs file. 
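+    // OUT_DIR is set by cargo for build-script output files.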
+ let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs"); + bindings + .write_to_file(out_path) + .expect("Couldn't write bindings!"); +} diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c new file mode 100644 index 0000000..0191df2 --- /dev/null +++ b/rust/ares_pma/c-src/btest.c @@ -0,0 +1,298 @@ +#include "btree.h" +#include "btree.c" + +#include +#include + +static void +_test_nodeinteg(BT_state *state, BT_findpath *path, + vaof_t lo, vaof_t hi, pgno_t pg) +{ + size_t childidx = 0; + BT_page *parent = 0; + + assert(SUCC(_bt_find(state, path, lo, hi))); + parent = path->path[path->depth]; + childidx = path->idx[path->depth]; + assert(parent->datk[childidx].fo == pg); + assert(parent->datk[childidx].va == lo); + assert(parent->datk[childidx+1].va == hi); +} + +static size_t +_mlist_sizep(BT_mlistnode *head) +/* calculate the size of the mlist in pages */ +{ + size_t sz = 0; + while (head) { + size_t sz_p = addr2off(head->hi) - addr2off(head->lo); + sz += sz_p; + head = head->next; + } + return sz; +} + +static size_t +_flist_sizep(BT_flistnode *head) +/* calculate the size of the flist in pages */ +{ + size_t sz = 0; + while (head) { + size_t sz_p = head->hi - head->lo; + sz += sz_p; + head = head->next; + } + return sz; +} + +static BT_mlistnode * +_mlist_copy(BT_state *state) +{ + BT_mlistnode *head = state->mlist; + BT_mlistnode *ret, *prev; + ret = prev = calloc(1, sizeof *ret); + memcpy(ret, head, sizeof *head); + ret->next = 0; + head = head->next; + while (head) { + BT_mlistnode *copy = calloc(1, sizeof *copy); + memcpy(copy, head, sizeof *head); + prev->next = copy; + prev = copy; + head = head->next; + } + return ret; +} + +static BT_nlistnode * +_nlist_copy(BT_state *state) +{ + BT_nlistnode *head = state->nlist; + BT_nlistnode *ret, *prev; + ret = prev = calloc(1, sizeof *ret); + memcpy(ret, head, sizeof *head); + ret->next = 0; + head = head->next; + while (head) { + BT_nlistnode *copy = calloc(1, sizeof *copy); + memcpy(copy, head, sizeof *head); + prev->next = copy; + prev = copy; + head = head->next; + } + return ret; +} + +static BT_flistnode * +_flist_copy(BT_state *state) +{ + BT_flistnode *head = state->flist; + BT_flistnode *ret, *prev; + ret = prev = calloc(1, sizeof *ret); + memcpy(ret, head, sizeof *head); + ret->next = 0; + head = head->next; + while (head) { + BT_flistnode *copy = calloc(1, sizeof *copy); + memcpy(copy, head, sizeof *head); + prev->next = copy; + prev = copy; + head = head->next; + } + return ret; +} + +static int +_mlist_eq(BT_mlistnode *l, BT_mlistnode *r) +{ + while (l && r) { + if (l->lo != r->lo) + bp(0); + if (l->hi != r->hi) + bp(0); + l = l->next; r = r->next; + } + if (l == 0 && r == 0) + return 1; + bp(0); +} + +static int +_nlist_eq(BT_nlistnode *l, BT_nlistnode *r) +{ + while (l && r) { + if (l->lo != r->lo) + bp(0); + if (l->hi != r->hi) + bp(0); + l = l->next; r = r->next; + } + if (l == 0 && r == 0) + return 1; + bp(0); +} + +static int +_flist_eq(BT_flistnode *l, BT_flistnode *r) +{ + while (l && r) { + if (l->lo != r->lo) + bp(0); + if (l->hi != r->hi) + bp(0); + l = l->next; r = r->next; + } + if (l == 0 && r == 0) + return 1; + bp(0); +} + +int main(int argc, char *argv[]) +{ + DPRINTF("PMA Max Storage: %lld", ((uint64_t)UINT32_MAX * BT_PAGESIZE) - BLK_BASE_LEN_TOTAL); + DPUTS("PMA Tests"); + + BT_state *state1; + BT_findpath path = {0}; + int rc = 0; + + + DPUTS("== test 1: insert"); + + bt_state_new(&state1); + if (mkdir("./pmatest1", 0774) == -1) + return errno; + 
assert(SUCC(bt_state_open(state1, "./pmatest1", 0, 0644))); + +#define LOWEST_ADDR 0x2aaa80; + vaof_t lo = LOWEST_ADDR; + vaof_t hi = 0xDEADBEEF; + pgno_t pg = 1; /* dummy value */ + for (size_t i = 0; i < BT_DAT_MAXKEYS * 4; ++i) { + _bt_insert(state1, lo, hi, pg); + _test_nodeinteg(state1, &path, lo, hi, pg); + lo++; pg++; + } + + bt_state_close(state1); + + + DPUTS("== test 2: malloc"); + BT_state *state2; + + bt_state_new(&state2); + if (mkdir("./pmatest2", 0774) == -1) + return errno; + assert(SUCC(bt_state_open(state2, "./pmatest2", 0, 0644))); + + void *t2a = bt_malloc(state2, 10); + bt_free(state2, t2a, (BT_page*)t2a + 10); + void *t2b = bt_malloc(state2, 10); + /* should have pulled the same pointer due to eager mlist coalescing */ + assert(t2a == t2b); + ZERO(&path, sizeof path); + _bt_find(state2, &path, addr2off(t2b), addr2off((BT_page *)t2b + 10)); +#define T2P1_PRNT0 (path.path[path.depth]) +#define T2P1_CIDX0 (path.idx[path.depth]) +#define T2P1_CIDX1 (path.idx[path.depth] + 1) + /* check length as represented in btree */ + assert(T2P1_PRNT0->datk[T2P1_CIDX1].va + - T2P1_PRNT0->datk[T2P1_CIDX0].va + == 10); + bt_free(state2, t2b, (BT_page*)t2b + 10); + ZERO(&path, sizeof path); + _bt_find(state2, &path, addr2off(t2b), addr2off((BT_page *)t2b + 10)); + /* fo should be zero (free) */ + assert(path.path[path.depth]->datk[path.idx[path.depth]].fo == 0); + /* should invoke deletion coalescing - 10 page free range in btree */ + void *t2c = bt_malloc(state2, 20); + + bt_state_close(state2); + + + DPUTS("== test 3: ephemeral structure restoration"); + BT_state *state3; + + bt_state_new(&state3); + if (mkdir("./pmatest3", 0774) == -1) + return errno; + assert(SUCC(bt_state_open(state3, "./pmatest3", 0, 0644))); + + typedef struct lohi_pair lohi_pair; + struct lohi_pair + { + BT_page *lo; + BT_page *hi; + }; + +#define ITERATIONS 1000 +#define MAXALLOCPG 0xFF + lohi_pair allocs[ITERATIONS] = {0}; + size_t alloc_sizp = 0; + size_t flist_sizp = _flist_sizep(state3->flist); + size_t mlist_sizp = _mlist_sizep(state3->mlist); + BT_meta *meta = state3->meta_pages[state3->which]; + BT_page *root = _node_get(state3, meta->root); + size_t N; + for (size_t i = 0; i < ITERATIONS; i++) { + /* malloc a random number of pages <= 256 and store in the allocs array */ + int pages = random(); + pages &= MAXALLOCPG; + pages += 1; + allocs[i].lo = bt_malloc(state3, pages); + allocs[i].hi = allocs[i].lo + pages; + alloc_sizp += pages; + /* validate size changes to mlist and flist */ + assert(_flist_sizep(state3->flist) + == (flist_sizp - alloc_sizp)); + assert(_mlist_sizep(state3->mlist) + == (mlist_sizp - alloc_sizp)); + N = _bt_numkeys(root); + assert(root->datk[N-2].fo == 0); + } + + /* sync the state */ + /* bt_sync(state3); */ + + /* TODO: close and reopen state. 
validate ephemeral structures */ + + flist_sizp = _flist_sizep(state3->flist); + mlist_sizp = _mlist_sizep(state3->mlist); + alloc_sizp = 0; + /* for (size_t i = 0; i < ITERATIONS / 2; i++) { */ + /* /\* free half of the allocations *\/ */ + /* bt_free(state3, allocs[i].lo, allocs[i].hi); */ + /* alloc_sizp += allocs[i].hi - allocs[i].lo; */ + /* /\* validate size changes to mlist *\/ */ + /* assert(_mlist_sizep(state3->mlist) */ + /* == (mlist_sizp + alloc_sizp)); */ + /* } */ + + /* copy ephemeral structures */ + BT_mlistnode *mlist_copy = _mlist_copy(state3); + BT_nlistnode *nlist_copy = _nlist_copy(state3); + BT_flistnode *flist_copy = _flist_copy(state3); + assert(_mlist_eq(mlist_copy, state3->mlist)); + assert(_nlist_eq(nlist_copy, state3->nlist)); + assert(_flist_eq(flist_copy, state3->flist)); + + meta = state3->meta_pages[state3->which]; + BT_meta metacopy = {0}; + memcpy(&metacopy, meta, sizeof metacopy); + + bt_state_close(state3); + + bt_state_new(&state3); + + assert(SUCC(bt_state_open(state3, "./pmatest3", 0, 0644))); + + /* compare for equality copies of ephemeral structures with restored ephemeral + structures */ + meta = state3->meta_pages[state3->which]; + assert(meta->root == metacopy.root); + assert(_mlist_eq(mlist_copy, state3->mlist)); + assert(_nlist_eq(nlist_copy, state3->nlist)); + assert(_flist_eq(flist_copy, state3->flist)); + + return 0; +} diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c new file mode 100644 index 0000000..e5c9b0e --- /dev/null +++ b/rust/ares_pma/c-src/btree.c @@ -0,0 +1,3199 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "btree.h" +#include "lib/checksum.h" + +typedef uint32_t pgno_t; /* a page number */ +typedef uint32_t vaof_t; /* a virtual address offset */ +typedef uint32_t flag_t; +typedef unsigned char BYTE; + +//// =========================================================================== +//// tmp tmp tmp tmp tmp +/* ;;: remove -- for debugging */ +/* + bp(X) where X is false will raise a SIGTRAP. If the process is being run + inside a debugger, this can be caught and ignored. It's equivalent to a + breakpoint. If run without a debugger, it will dump core, like an assert +*/ +#ifdef DEBUG +#if defined(__i386__) || defined(__x86_64__) +#define bp(x) do { if(!(x)) __asm__ volatile("int $3"); } while (0) +#elif defined(__thumb__) +#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xde01"); } while (0) +#elif defined(__aarch64__) +#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xd4200000"); } while (0) +#elif defined(__arm__) +#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xe7f001f0"); } while (0) +#else +STATIC_ASSERT(0, "debugger break instruction unimplemented"); +#endif +#else +#define bp(x) ((void)(0)) +#endif + +/* coalescing of memory freelist currently prohibited since we haven't + implemented coalescing of btree nodes (necessary) */ +#define CAN_COALESCE 0 +/* ;;: remove once confident in logic and delete all code dependencies on + state->node_freelist */ + +/* prints a node before and after a call to _bt_insertdat */ +#define DEBUG_PRINTNODE 0 + +#define ZERO(s, n) memset((s), 0, (n)) + +#define S7(A, B, C, D, E, F, G) A##B##C##D##E##F##G +#define S6(A, B, C, D, E, F, ...) S7(A, B, C, D, E, F, __VA_ARGS__) +#define S5(A, B, C, D, E, ...) S6(A, B, C, D, E, __VA_ARGS__) +#define S4(A, B, C, D, ...) S5(A, B, C, D, __VA_ARGS__) +#define S3(A, B, C, ...) S4(A, B, C, __VA_ARGS__) +#define S2(A, B, ...) 
S3(A, B, __VA_ARGS__) +#define S(A, ...) S2(A, __VA_ARGS__) + +#define KBYTES(x) ((size_t)(x) << 10) +#define MBYTES(x) ((size_t)(x) << 20) +#define GBYTES(x) ((size_t)(x) << 30) +#define TBYTES(x) ((size_t)(x) << 40) +#define PBYTES(x) ((size_t)(x) << 50) + +/* 4K page in bytes */ +#define P2BYTES(x) ((size_t)(x) << BT_PAGEBITS) +/* the opposite of P2BYTES */ +#define B2PAGES(x) ((size_t)(x) >> BT_PAGEBITS) + + +#define __packed __attribute__((__packed__)) +#define UNUSED(x) ((void)(x)) + +#ifdef DEBUG +# define DPRINTF(fmt, ...) \ + fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, __VA_ARGS__) +#else +# define DPRINTF(fmt, ...) ((void) 0) +#endif +#define DPUTS(arg) DPRINTF("%s", arg) +#define TRACE(...) DPUTS("") + +#define BT_SUCC 0 +#define SUCC(x) ((x) == BT_SUCC) + +/* given a pointer p returns the low page-aligned addr */ +#define LO_ALIGN_PAGE(p) ((BT_page *)(((uintptr_t)p) & ~(BT_PAGESIZE - 1))) + + +#define BT_MAPADDR ((BYTE *) S(0x1000,0000,0000)) + +static inline vaof_t +addr2off(void *p) +/* convert a pointer into a 32-bit page offset */ +{ + uintptr_t pu = (uintptr_t)p; + assert(pu >= (uintptr_t)BT_MAPADDR); + pu -= (uintptr_t)BT_MAPADDR; + assert((pu & ((1 << BT_PAGEBITS) - 1)) == 0); /* p must be page-aligned */ + return (vaof_t)(pu >> BT_PAGEBITS); +} + +static inline void * +off2addr(vaof_t off) +/* convert a 32-bit page offset into a pointer */ +{ + uintptr_t pu = (uintptr_t)off << BT_PAGEBITS; + pu += (uintptr_t)BT_MAPADDR; + return (void *)pu; +} + +#define BT_PAGEWORD 32ULL +#define BT_NUMMETAS 2 /* 2 metapages */ +#define BT_META_SECTION_WIDTH (BT_NUMMETAS * BT_PAGESIZE) +#define BT_ADDRSIZE (BT_PAGESIZE << BT_PAGEWORD) +#define PMA_GROW_SIZE (BT_PAGESIZE * 1024 * 64) + +#define BT_NOPAGE 0 + +#define BT_PROT_CLEAN (PROT_READ) +#define BT_FLAG_CLEAN (MAP_FIXED | MAP_SHARED) +#define BT_PROT_FREE (PROT_NONE) +#define BT_FLAG_FREE (MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED | MAP_NORESERVE) +#define BT_PROT_DIRTY (PROT_READ | PROT_WRITE) +#define BT_FLAG_DIRTY (MAP_FIXED | MAP_SHARED) + +/* + FO2BY: file offset to byte + get byte INDEX into pma map from file offset +*/ +#define FO2BY(fo) \ + ((uint64_t)(fo) << BT_PAGEBITS) + +/* + BY2FO: byte to file offset + get pgno from byte INDEX into pma map +*/ +#define BY2FO(p) \ + ((pgno_t)((p) >> BT_PAGEBITS)) + +/* + FO2PA: file offset to page + get a reference to a BT_page from a file offset + + ;;: can simplify: + + ((BT_page*)state->map)[fo] +*/ +#define FO2PA(map, fo) \ + ((BT_page *)&(map)[FO2BY(fo)]) + +/* NMEMB: number of members in array, a */ +#define NMEMB(a) \ + (sizeof(a) / sizeof(a[0])) + +#define offsetof(st, m) \ + __builtin_offsetof(st, m) + + +//// =========================================================================== +//// btree types + +/* + btree page header. all pages share this header. Though for metapages, you can + expect it to be zeroed out. 
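+  The dirty field is a bitmap with one bit per child entry: its 256 bytes hold
+  2048 bits (the bound asserted in _bt_ischilddirty/_bt_dirtychild), and bit i
+  is kept at dirty[i >> 3] under mask (1 << (i & 0x7)).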
+*/ +typedef struct BT_pageheader BT_pageheader; +struct BT_pageheader { + uint8_t dirty[256]; /* dirty bit map */ +} __packed; + +/* + btree key/value data format + + BT_dat is used to provide a view of the data section in a BT_page where data is + stored like: + va fo va fo + bytes 0 4 8 12 + + The convenience macros given an index into the data array do the following: + BT_dat_lo(i) returns ith va (low addr) + BT_dat_hi(i) returns i+1th va (high addr) + BT_dat_fo(i) returns ith file offset +*/ +typedef union BT_dat BT_dat; +union BT_dat { + vaof_t va; /* virtual address offset */ + pgno_t fo; /* file offset */ +}; + +/* like BT_dat but when a struct is more useful than a union */ +typedef struct BT_kv BT_kv; +struct BT_kv { + vaof_t va; + pgno_t fo; +}; + +/* ;;: todo, perhaps rather than an index, return the data directly and typecast?? */ +#define BT_dat_lo(i) ((i) * 2) +#define BT_dat_fo(i) ((i) * 2 + 1) +#define BT_dat_hi(i) ((i) * 2 + 2) + +#define BT_dat_lo2(I, dat) +#define BT_dat_fo2(I, dat) +#define BT_dat_hi2(I, dat) + +/* BT_dat_maxva: pointer to highest va in page data section */ +#define BT_dat_maxva(p) \ + ((void *)&(p)->datd[BT_dat_lo(BT_DAT_MAXKEYS)]) + +/* BT_dat_maxfo: pointer to highest fo in page data section */ +#define BT_dat_maxfo(p) \ + ((void *)&(p)->datd[BT_dat_fo(BT_DAT_MAXVALS)]) + +#define BT_DAT_MAXBYTES (BT_PAGESIZE - sizeof(BT_pageheader)) +#define BT_DAT_MAXENTRIES (BT_DAT_MAXBYTES / sizeof(BT_dat)) +#define BT_DAT_MAXKEYS (BT_DAT_MAXENTRIES / 2) +/* #define BT_DAT_MAXKEYS 10 */ +#define BT_DAT_MAXVALS BT_DAT_MAXKEYS +static_assert(BT_DAT_MAXENTRIES % 2 == 0); +/* we assume off_t is 64 bit */ +static_assert(sizeof(off_t) == sizeof(uint64_t)); + +/* + all pages in the memory arena consist of a header and data section +*/ +typedef struct BT_page BT_page; +struct BT_page { + BT_pageheader head; /* header */ + union { /* data section */ + BT_dat datd[BT_DAT_MAXENTRIES]; /* union view */ + BT_kv datk[BT_DAT_MAXKEYS]; /* struct view */ + BYTE datc[BT_DAT_MAXBYTES]; /* byte-level view */ + }; +}; +static_assert(sizeof(BT_page) == BT_PAGESIZE); +static_assert(BT_DAT_MAXBYTES % sizeof(BT_dat) == 0); + +#define BT_MAGIC 0xBADDBABE +#define BT_VERSION 1 +/* + a meta page is like any other page, but the data section is used to store + additional information +*/ +#define BLK_BASE_LEN0 (MBYTES(2) - BT_META_SECTION_WIDTH) +#define BLK_BASE_LEN1 (MBYTES(8)) +#define BLK_BASE_LEN2 (BLK_BASE_LEN1 * 4) +#define BLK_BASE_LEN3 (BLK_BASE_LEN2 * 4) +#define BLK_BASE_LEN4 (BLK_BASE_LEN3 * 4) +#define BLK_BASE_LEN5 (BLK_BASE_LEN4 * 4) +#define BLK_BASE_LEN6 (BLK_BASE_LEN5 * 4) +#define BLK_BASE_LEN7 (BLK_BASE_LEN6 * 4) +#define BLK_BASE_LEN_TOTAL ( \ + BT_META_SECTION_WIDTH + \ + BLK_BASE_LEN0 + \ + BLK_BASE_LEN1 + \ + BLK_BASE_LEN2 + \ + BLK_BASE_LEN3 + \ + BLK_BASE_LEN4 + \ + BLK_BASE_LEN5 + \ + BLK_BASE_LEN6 + \ + BLK_BASE_LEN7) +typedef struct BT_meta BT_meta; +struct BT_meta { +#define BT_NUMROOTS 32 + uint32_t magic; + uint32_t version; + pgno_t last_pg; /* last page used in file */ + uint32_t _pad0; + uint64_t txnid; + void *fix_addr; /* fixed addr of btree */ + pgno_t blk_base[8]; /* block base array for striped node partition */ + /* ;;: for the blk_base array, code may be simpler if this were an array of + BT_page *. 
*/ + uint8_t blk_cnt; /* currently highest valid block base */ + uint8_t depth; /* tree depth */ +#define BP_META ((uint8_t)0x02) + uint8_t flags; + uint8_t _pad1; + pgno_t root; + /* 64bit alignment manually checked - 72 bytes total above */ + uint64_t roots[BT_NUMROOTS]; /* for usage by ares */ + uint32_t chk; /* checksum */ +} __packed; +static_assert(sizeof(BT_meta) <= BT_DAT_MAXBYTES); + +/* the length of the metapage up to but excluding the checksum */ +#define BT_META_LEN (offsetof(BT_meta, chk)) + +#define BT_roots_bytelen (sizeof(BT_meta) - offsetof(BT_meta, roots)) + +typedef struct BT_mlistnode BT_mlistnode; +struct BT_mlistnode { + /* ;;: lo and hi might as well by (BT_page *) because we don't have any reason + to have finer granularity */ + BYTE *lo; /* low virtual address */ + BYTE *hi; /* high virtual address */ + BT_mlistnode *next; /* next freelist node */ +}; + +typedef struct BT_nlistnode BT_nlistnode; +struct BT_nlistnode { + BT_page *lo; /* low virtual address */ + BT_page *hi; /* high virtual address */ + BT_nlistnode *next; /* next freelist node */ +}; + +typedef struct BT_flistnode BT_flistnode; +struct BT_flistnode { + pgno_t lo; /* low pgno in persistent file */ + pgno_t hi; /* high pgno in persistent file */ + BT_flistnode *next; /* next freelist node */ +}; + +/* macro to access the metadata stored in a page's data section */ +#define METADATA(p) ((BT_meta *)(void *)(p)->datc) + +typedef struct BT_state BT_state; +struct BT_state { + int data_fd; + char *path; + void *fixaddr; + BYTE *map; + BT_meta *meta_pages[2]; /* double buffered */ + /* ;;: note, while meta_pages[which]->root stores a pgno, we may want to just + store a pointer to root in state in addition to avoid a _node_find on it + every time it's referenced */ + /* BT_page *root; */ + off_t file_size; /* the size of the pma file in bytes */ + pgno_t frontier; /* last non-free page in use by pma (exclusive) */ + unsigned int which; /* which double-buffered db are we using? */ + BT_nlistnode *nlist; /* node freelist */ + BT_mlistnode *mlist; /* memory freelist */ + BT_flistnode *flist; /* pma file freelist */ + BT_flistnode *pending_flist; + BT_nlistnode *pending_nlist; +}; + +/* + ;;: wrt to frontier: if you need to allocate space for data, push the frontier + out by that amount allocated. If you're allocating a new stripe, push it to + the end of that stripe. +*/ + + +//// =========================================================================== +//// btree internal routines + +static void _bt_printnode(BT_page *node) __attribute__((unused)); /* ;;: tmp */ +static int +_bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo, + BT_page *parent, size_t childidx); /* ;;: tmp */ + +static int _bt_flip_meta(BT_state *); + + +#define BT_MAXDEPTH 4 /* ;;: todo derive it */ +typedef struct BT_findpath BT_findpath; +struct BT_findpath { + BT_page *path[BT_MAXDEPTH]; + size_t idx[BT_MAXDEPTH]; + uint8_t depth; +}; + +/* _node_get: get a pointer to a node stored at file offset pgno */ +static BT_page * +_node_get(BT_state *state, pgno_t pgno) +{ + /* TODO: eventually, once we can store more than 2M of nodes, this will need + to reference the meta page's blk_base array to determine where a node is + mapped. i.e: + + - receive pgno + - find first pgno in blk_base that exceeds pgno : i + - sector that contains node is i-1 + - appropriately offset into i-1th fixed size partition: 2M, 8M, 16M, ... 
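+       (the fixed partition lengths are given by the BLK_BASE_LEN* defines above:
+       2M, then 8M, with each later stripe 4x the previous, i.e. 32M, 128M, ...)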
+ + */ + + /* for now, this works because the 2M sector is at the beginning of both the + memory arena and pma file + */ + if (pgno <= 1) return 0; /* no nodes stored at 0 and 1 (metapages) */ + /* TODO: when partition striping is implemented, a call beyond the furthest + block base should result in the allocation of a new block base */ + assert((pgno * BT_PAGESIZE) < MBYTES(2)); + return FO2PA(state->map, pgno); +} + +/* ;;: I don't think we should need this if _bt_nalloc also returns a disc offset */ +static pgno_t +_fo_get(BT_state *state, BT_page *node) +{ + uintptr_t vaddr = (uintptr_t)node; + uintptr_t start = (uintptr_t)state->map; + return BY2FO(vaddr - start); +} + +static void +_mlist_record_alloc(BT_state *state, void *lo, void *hi) +{ + BT_mlistnode **head = &state->mlist; + BYTE *lob = lo; + BYTE *hib = hi; + while (*head) { + /* found chunk */ + if ((*head)->lo <= lob && (*head)->hi >= hib) + break; + assert((*head)->next); + head = &(*head)->next; + } + + if (hib < (*head)->hi) { + if (lob > (*head)->lo) { + BT_mlistnode *left = *head; + BT_mlistnode *right = calloc(1, sizeof *right); + right->hi = left->hi; + right->lo = hib; + right->next = left->next; + left->hi = lob; + left->next = right; + } + else { + /* lob equal */ + (*head)->lo = hib; + } + } + else if (lob > (*head)->lo) { + /* hib equal */ + (*head)->hi = lob; + } + else { + /* equals */ + BT_mlistnode *next = (*head)->next; + free(*head); + *head = next; + } +} + +static void +_nlist_record_alloc(BT_state *state, BT_page *lo) +{ + BT_nlistnode **head = &state->nlist; + BT_page *hi = lo + 1; + while (*head) { + /* found chunk */ + if ((*head)->lo <= lo && (*head)->hi >= hi) + break; + assert((*head)->next); + head = &(*head)->next; + } + + if (hi < (*head)->hi) { + if (lo > (*head)->lo) { + BT_nlistnode *left = *head; + BT_nlistnode *right = calloc(1, sizeof *right); + right->hi = left->hi; + right->lo = hi; + right->next = left->next; + left->hi = lo; + left->next = right; + } + else { + /* lo equal */ + (*head)->lo = hi; + } + } + else if (lo > (*head)->lo) { + /* hi equal */ + (*head)->hi = lo; + } + else { + /* equals */ + BT_nlistnode *next = (*head)->next; + free(*head); + *head = next; + } +} + +static void +_flist_record_alloc(BT_state *state, pgno_t lo, pgno_t hi) +{ + BT_flistnode **head = &state->flist; + while (*head) { + /* found chunk */ + if ((*head)->lo <= lo && (*head)->hi >= hi) + break; + assert((*head)->next); + head = &(*head)->next; + } + + if (hi < (*head)->hi) { + if (lo > (*head)->lo) { + BT_flistnode *left = *head; + BT_flistnode *right = calloc(1, sizeof *right); + right->hi = left->hi; + right->lo = hi; + right->next = left->next; + left->hi = lo; + left->next = right; + } + else { + /* lo equal */ + (*head)->lo = hi; + } + } + else if (lo > (*head)->lo) { + /* hi equal */ + (*head)->hi = lo; + } + else { + /* equals */ + BT_flistnode *next = (*head)->next; + free(*head); + *head = next; + } +} + +static BT_page * +_bt_nalloc(BT_state *state) +/* allocate a node in the node freelist */ +{ + /* TODO: maybe change _bt_nalloc to return both a file and a node offset as + params to the function and make actual return value an error code. This is + to avoid forcing some callers to immediately use _fo_get */ + BT_nlistnode **n = &state->nlist; + BT_page *ret = 0; + + for (; *n; n = &(*n)->next) { + size_t sz_p = (*n)->hi - (*n)->lo; + + /* ;;: refactor? 
this is ridiculous */ + if (sz_p >= 1) { + ret = (*n)->lo; + _nlist_record_alloc(state, ret); + break; + } + } + + if (ret == 0) { + DPUTS("nlist out of mem!"); + return 0; + } + + /* make node writable */ + if (mprotect(ret, sizeof(BT_page), BT_PROT_DIRTY) != 0) { + DPRINTF("mprotect of node: %p failed with %s", ret, strerror(errno)); + abort(); + } + + return ret; +} + +static int +_node_cow(BT_state *state, BT_page *node, pgno_t *pgno) +{ + BT_page *ret = _bt_nalloc(state); /* ;;: todo: assert node has no dirty entries */ + memcpy(ret->datk, node->datk, sizeof node->datk[0] * BT_DAT_MAXKEYS); + *pgno = _fo_get(state, ret); + return BT_SUCC; +} + +static void * +_bt_bsearch(BT_page *page, vaof_t va) __attribute((unused)); + +/* binary search a page's data section for a va. Returns a pointer to the found BT_dat */ +static void * +_bt_bsearch(BT_page *page, vaof_t va) +{ + /* ;;: todo: actually bsearch rather than linear */ + for (BT_kv *kv = &page->datk[0]; kv <= (BT_kv *)BT_dat_maxva(page); kv++) { + if (kv->va == va) + return kv; + } + + return 0; +} + +static size_t +_bt_childidx(BT_page *node, vaof_t lo, vaof_t hi) +/* looks up the child index in a parent node. If not found, return is + BT_DAT_MAXKEYS */ +{ + size_t i = 0; + for (; i < BT_DAT_MAXKEYS - 1; i++) { + vaof_t llo = node->datk[i].va; + vaof_t hhi = node->datk[i+1].va; + if (llo <= lo && hhi >= hi) + return i; + } + return BT_DAT_MAXKEYS; +} + +/* ;;: find returns a path to nodes that things should be in if they are there. */ +/* a leaf has a meta page depth eq to findpath depth */ +static int +_bt_find2(BT_state *state, + BT_page *node, + BT_findpath *path, + uint8_t maxdepth, + vaof_t lo, + vaof_t hi) +{ + /* ;;: meta node stores depth (node or leaf?) + look at root node and binsearch BT_dats where low is <= lo and high is >= hi + If at depth of metapage (a leaf), then done + otherwise grab node, increment depth, save node in path + */ + if (path->depth > maxdepth) + return ENOENT; + + assert(node != 0); + + size_t i; + if ((i = _bt_childidx(node, lo, hi)) == BT_DAT_MAXKEYS) + return ENOENT; + + if (path->depth == maxdepth) { + path->idx[path->depth] = i; + path->path[path->depth] = node; + return BT_SUCC; + } + /* then branch */ + else { + pgno_t fo = node->datk[i].fo; + BT_page *child = _node_get(state, fo); + path->idx[path->depth] = i; + path->path[path->depth] = node; + path->depth++; + return _bt_find2(state, child, path, maxdepth, lo, hi); + } +} + +static void +_bt_root_new(BT_meta *meta, BT_page *root) +{ + /* The first usable address in the PMA is just beyond the btree segment */ + root->datk[0].va = B2PAGES(BLK_BASE_LEN_TOTAL); + root->datk[0].fo = 0; + root->datk[1].va = UINT32_MAX; + root->datk[1].fo = 0; +} + +static int +_bt_find(BT_state *state, BT_findpath *path, vaof_t lo, vaof_t hi) +{ + path->depth = 1; + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + uint8_t maxdepth = meta->depth; + return _bt_find2(state, root, path, maxdepth, lo, hi); +} + +static int +_bt_findpath_is_root(BT_findpath *path) __attribute((unused)); + +static int +_bt_findpath_is_root(BT_findpath *path) +{ + assert(path != 0); + return path->depth == 0; +} + +/* _bt_numkeys: find next empty space in node's data section. Returned as + index into node->datk. 
If the node is full, return is BT_DAT_MAXKEYS */ +static size_t +_bt_numkeys(BT_page *node) +{ + size_t i = 1; + for (; i < BT_DAT_MAXKEYS; i++) { + if (node->datk[i].va == 0) break; + } + return i; +} + +static int +_bt_datshift(BT_page *node, size_t i, size_t n) +/* shift data segment at i over by n KVs */ +{ + assert(i+n < BT_DAT_MAXKEYS); /* check buffer overflow */ + size_t siz = sizeof node->datk[0]; + size_t bytelen = (BT_DAT_MAXKEYS - i - n) * siz; + memmove(&node->datk[i+n], &node->datk[i], bytelen); + ZERO(&node->datk[i], n * siz); /* NB: not completely necessary */ + return BT_SUCC; +} + +/* _bt_split_datcopy: copy right half of left node to right node */ +static int +_bt_split_datcopy(BT_page *left, BT_page *right) +{ + size_t mid = BT_DAT_MAXKEYS / 2; + size_t bytelen = mid * sizeof(left->datk[0]); + /* copy rhs of left to right */ + memcpy(right->datk, &left->datk[mid], bytelen); + /* zero rhs of left */ + ZERO(&left->datk[mid], bytelen); /* ;;: note, this would be unnecessary if we stored node.N */ + /* the last entry in left should be the first entry in right */ + left->datk[mid].va = right->datk[0].va; + + return BT_SUCC; +} + +static int +_bt_ischilddirty(BT_page *parent, size_t child_idx) +{ + assert(child_idx < 2048); + uint8_t flag = parent->head.dirty[child_idx >> 3]; + return flag & (1 << (child_idx & 0x7)); +} + +/* ;;: todo: name the 0x8 and 4 literals and/or generalize */ +static int +_bt_dirtychild(BT_page *parent, size_t child_idx) +{ + assert(child_idx < 2048); + /* although there's nothing theoretically wrong with dirtying a dirty node, + there's probably a bug if we do it since a we only dirty a node when it's + alloced after a split or CoWed */ + assert(!_bt_ischilddirty(parent, child_idx)); + uint8_t *flag = &parent->head.dirty[child_idx >> 3]; + *flag |= 1 << (child_idx & 0x7); + return BT_SUCC; +} + +static int +_bt_cleanchild(BT_page *parent, size_t child_idx) +{ + assert(_bt_ischilddirty(parent, child_idx)); + uint8_t *flag = &parent->head.dirty[child_idx >> 3]; + *flag ^= 1 << (child_idx & 0x7); + return BT_SUCC; +} + +/* ;:: assert that the node is dirty when splitting */ +static int +_bt_split_child(BT_state *state, BT_page *parent, size_t i, pgno_t *newchild) +{ + /* ;;: todo: better error handling */ + assert(_bt_ischilddirty(parent, i)); + + int rc = BT_SUCC; + size_t N; + BT_page *left = _node_get(state, parent->datk[i].fo); + BT_page *right = _bt_nalloc(state); + if (right == 0) + return ENOMEM; + if (!SUCC(rc = _bt_split_datcopy(left, right))) + return rc; + + /* adjust high address of left node in parent */ + N = _bt_numkeys(left); + + /* insert reference to right child into parent node */ + N = _bt_numkeys(right); + vaof_t lo = right->datk[0].va; + vaof_t hi = right->datk[N-1].va; + + _bt_insertdat(lo, hi, _fo_get(state, right), parent, i); + + /* dirty right child */ + size_t ridx = _bt_childidx(parent, lo, hi); + assert(ridx == i+1); /* 0x100000020100;;: tmp? 
*/ + _bt_dirtychild(parent, ridx); + + /* ;;: fix this */ + *newchild = _fo_get(state, right); + + return BT_SUCC; +} + +static int +_bt_rebalance(BT_state *state, BT_page *node) __attribute((unused)); + +static int +_bt_rebalance(BT_state *state, BT_page *node) +{ + return 255; +} + +/* insert lo, hi, and fo in parent's data section for childidx */ +static int +_bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo, + BT_page *parent, size_t childidx) +{ +#if DEBUG_PRINTNODE + DPRINTF("BEFORE INSERT lo %" PRIu32 " hi %" PRIu32 " fo %" PRIu32, lo, hi, fo); + _bt_printnode(parent); +#endif + + /* ;;: TODO confirm this logic is appropriate for branch nodes. (It /should/ + be correct for leaf nodes) */ + vaof_t llo = parent->datk[childidx].va; + vaof_t hhi = parent->datk[childidx+1].va; + + /* NB: it can be assumed that llo <= lo and hi <= hhi because this routine is + called using an index found with _bt_childidx */ + + /* duplicate */ + if (llo == lo && hhi == hi) { + parent->datk[childidx].fo = fo; + return BT_SUCC; + } + + if (llo == lo) { + _bt_datshift(parent, childidx + 1, 1); + vaof_t oldfo = parent->datk[childidx].fo; + parent->datk[childidx].fo = fo; + parent->datk[childidx+1].va = hi; + parent->datk[childidx+1].fo = (oldfo == 0) + ? 0 + : oldfo + (hi - llo); + } + else if (hhi == hi) { + _bt_datshift(parent, childidx + 1, 1); + parent->datk[childidx+1].va = lo; + parent->datk[childidx+1].fo = fo; + } + else { + _bt_datshift(parent, childidx + 1, 2); + parent->datk[childidx+1].va = lo; + parent->datk[childidx+1].fo = fo; + parent->datk[childidx+2].va = hi; + pgno_t lfo = parent->datk[childidx].fo; + vaof_t lva = parent->datk[childidx].va; + parent->datk[childidx+2].fo = (lfo == 0) + ? 0 + : lfo + (hi - lva); + } + +#if DEBUG_PRINTNODE + DPUTS("AFTER INSERT"); + _bt_printnode(parent); +#endif + return BT_SUCC; +} + + +//// =========================================================================== +//// wip - deletion coalescing + +/* ;;: todo: rename routines */ + +int +_bt_delco_1pass_0(BT_state *state, vaof_t lo, vaof_t hi, + BT_page *node, uint8_t depth, uint8_t maxdepth) +{ + /* Perform a dfs search on all ranges that fall within lo and hi */ + + size_t N = _bt_numkeys(node); + size_t loidx = 0; + size_t hiidx = 0; + + /* first find the entry that matches lo */ + size_t i; + for (i = 0; i < N-1; i++) { + vaof_t hhi = node->datk[i+1].va; + if (hhi > lo) { + loidx = i; + break; + } + } + + /* and then the entry that matches hi */ + for (; i < N; i++) { + vaof_t hhi = node->datk[i].va; + if (hhi >= hi) { + hiidx = i; + break; + } + } + + /* node->datk[loidx] - node->datk[hiidx] are the bounds on which to perform + the dfs */ + for (i = loidx; i < hiidx; i++) { + pgno_t pg = node->datk[i].fo; + + /* if at the leaf level, terminate with failure if pg is not free */ + if (depth == maxdepth) { + if (pg != 0) return 1; + else continue; + } + + /* otherwise, dfs the child node */ + BT_page *child = _node_get(state, pg); + if (!SUCC(_bt_delco_1pass_0(state, lo, hi, child, depth+1, maxdepth))) + return 1; + } + + /* whether we're at a leaf or a branch, by now all pages corresponding to the + hi-lo range must be free */ + return BT_SUCC; +} + +/* ;;: since this is called by another recursive function _bt_delco that first + finds if a split exists, this /could/ take a pgno to avoid unnecessarily + rewalking the tree. not a big deal though as is. */ +static int +_bt_delco_1pass(BT_state *state, vaof_t lo, vaof_t hi) +/* returns true if the leaves in the given range are all free (pgno of 0). 
false + otherwise. This must be the case for an insert into an overlapping range to + succeed */ +{ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + return _bt_delco_1pass_0(state, lo, hi, root, 1, meta->depth); +} + +static void +_mlist_insert(BT_state *state, void *lo, void *hi) +{ + BT_mlistnode **dst = &state->mlist; + BT_mlistnode **prev_dst = 0; + BYTE *lob = lo; + BYTE *hib = hi; + + while(*dst) { + if (hib == (*dst)->lo) { + (*dst)->lo = lob; + /* check if we can coalesce with left neighbor */ + if (prev_dst != 0) { + bp(0); /* ;;: note, this case should not hit. keeping for debugging. */ + /* dst equals &(*prev_dst)->next */ + assert(*prev_dst != 0); + if ((*prev_dst)->hi == lob) { + (*prev_dst)->hi = (*dst)->hi; + (*prev_dst)->next = (*dst)->next; + free(*dst); + } + } + return; + } + if (lob == (*dst)->hi) { + (*dst)->hi = hi; + /* check if we can coalesce with right neighbor */ + if ((*dst)->next != 0) { + if (hib == (*dst)->next->lo) { + (*dst)->hi = (*dst)->next->hi; + BT_mlistnode *dst_next = (*dst)->next; + (*dst)->next = (*dst)->next->next; + free(dst_next); + } + } + return; + } + if (hib > (*dst)->lo) { + assert(lob > (*dst)->hi); + assert(hib > (*dst)->hi); + prev_dst = dst; + dst = &(*dst)->next; + continue; + } + + /* otherwise, insert discontinuous node */ + BT_mlistnode *new = calloc(1, sizeof *new); + new->lo = lob; + new->hi = hib; + new->next = *dst; + *dst = new; + return; + } + + /* found end of list */ + BT_mlistnode *new = calloc(1, sizeof *new); + new->lo = lob; + new->hi = hib; + new->next = 0; + (*dst) = new; +} + +static void +_nlist_insert2(BT_state *state, BT_nlistnode **dst, BT_page *lo, BT_page *hi) +{ + BT_nlistnode **prev_dst = 0; + + while(*dst) { + if (hi == (*dst)->lo) { + (*dst)->lo = lo; + /* check if we can coalesce with left neighbor */ + if (prev_dst != 0) { + bp(0); /* ;;: note, this case should not hit. keeping for debugging. 
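+             If the freed run also bordered the previous node, the
+             lo == (*dst)->hi case would have matched on the prior iteration
+             and returned there, so reaching this branch implies a gap on the
+             left.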
*/ + /* dst equals &(*prev_dst)->next */ + assert(*prev_dst != 0); + if ((*prev_dst)->hi == lo) { + (*prev_dst)->hi = (*dst)->hi; + (*prev_dst)->next = (*dst)->next; + free(*dst); + } + } + return; + } + if (lo == (*dst)->hi) { + (*dst)->hi = hi; + /* check if we can coalesce with right neighbor */ + if ((*dst)->next != 0) { + if (hi == (*dst)->next->lo) { + (*dst)->hi = (*dst)->next->hi; + BT_nlistnode *dst_next = (*dst)->next; + (*dst)->next = (*dst)->next->next; + free(dst_next); + } + } + return; + } + if (hi > (*dst)->lo) { + assert(lo > (*dst)->hi); + assert(hi > (*dst)->hi); + prev_dst = dst; + dst = &(*dst)->next; + continue; + } + + /* otherwise, insert discontinuous node */ + BT_nlistnode *new = calloc(1, sizeof *new); + new->lo = lo; + new->hi = hi; + new->next = *dst; + *dst = new; + return; + } +} + +static void +_nlist_insert(BT_state *state, BT_nlistnode **dst, pgno_t nodepg) +{ + BT_page *lo = _node_get(state, nodepg); + BT_page *hi = _node_get(state, nodepg+1); + _nlist_insert2(state, dst, lo, hi); +} + +static void +_pending_nlist_merge(BT_state *state) +{ + BT_nlistnode *src_head = state->pending_nlist; + BT_nlistnode *prev = 0; + while (src_head) { + _nlist_insert2(state, &state->nlist, src_head->lo, src_head->hi); + prev = src_head; + src_head = src_head->next; + free(prev); + } +} + +static void +_flist_insert(BT_flistnode **dst, pgno_t lo, pgno_t hi) +{ + BT_flistnode **prev_dst = 0; + + while(*dst) { + if (hi == (*dst)->lo) { + (*dst)->lo = lo; + /* check if we can coalesce with left neighbor */ + if (prev_dst != 0) { + bp(0); /* ;;: note, this case should not hit. keeping for debugging. */ + /* dst equals &(*prev_dst)->next */ + assert(*prev_dst != 0); + if ((*prev_dst)->hi == lo) { + (*prev_dst)->hi = (*dst)->hi; + (*prev_dst)->next = (*dst)->next; + free(*dst); + } + } + return; + } + if (lo == (*dst)->hi) { + (*dst)->hi = hi; + /* check if we can coalesce with right neighbor */ + if ((*dst)->next != 0) { + if (hi == (*dst)->next->lo) { + (*dst)->hi = (*dst)->next->hi; + BT_flistnode *dst_next = (*dst)->next; + (*dst)->next = (*dst)->next->next; + free(dst_next); + } + } + return; + } + if (hi > (*dst)->lo) { + assert(lo > (*dst)->hi); + assert(hi > (*dst)->hi); + prev_dst = dst; + dst = &(*dst)->next; + continue; + } + + /* otherwise, insert discontinuous node */ + BT_flistnode *new = calloc(1, sizeof *new); + new->lo = lo; + new->hi = hi; + new->next = *dst; + *dst = new; + return; + } +} + +static void +_pending_flist_merge(BT_state *state) +{ + BT_flistnode *src_head = state->pending_flist; + BT_flistnode *prev = 0; + while (src_head) { + _flist_insert(&state->flist, src_head->lo, src_head->hi); + prev = src_head; + src_head = src_head->next; + free(prev); + } +} + + +/* ;;: todo move shit around */ +static void +_bt_delco_droptree2(BT_state *state, pgno_t nodepg, + uint8_t depth, uint8_t maxdepth, int isdirty) +{ + int ischilddirty = 0; + + /* branch */ + if (depth != maxdepth) { + BT_page *node = _node_get(state, nodepg); + for (size_t i = 0; i < BT_DAT_MAXKEYS; i++) { + BT_kv entry = node->datk[i]; + if (entry.fo == 0) + break; /* done */ + ischilddirty = _bt_ischilddirty(node, i); + _bt_delco_droptree2(state, entry.fo, depth+1, maxdepth, ischilddirty); + } + } + + /* branch and leaf */ + if (isdirty) { + _nlist_insert(state, &state->nlist, nodepg); + } + else { + _nlist_insert(state, &state->pending_nlist, nodepg); + } +} + +static void +_bt_delco_droptree(BT_state *state, pgno_t nodepg, uint8_t depth, int isdirty) +{ + /* completely drop a tree. 
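+     Every node in the subtree is handed back to a node freelist: the live
+     nlist if that node was dirtied in this transaction, the pending_nlist
+     otherwise (the latter only becomes reusable after the next sync).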
Assume that all leaves under the tree are free + (pgno = 0) */ + assert(nodepg >= 2); + BT_meta *meta = state->meta_pages[state->which]; + _bt_delco_droptree2(state, nodepg, depth, meta->depth, isdirty); +} + +static void +_bt_delco_trim_rsubtree_lhs2(BT_state *state, vaof_t lo, vaof_t hi, + pgno_t nodepg, uint8_t depth, uint8_t maxdepth) +{ + BT_page *node = _node_get(state, nodepg); + size_t hiidx = 0; + size_t N = _bt_numkeys(node); + + /* find hi idx of range */ + size_t i; + for (i = 0; i < N; i++) { + vaof_t hhi = node->datk[i].va; + if (hhi >= hi) { + hiidx = i; + break; + } + } + + /* set the lo address of datk[hiidx] to hi */ + node->datk[hiidx-1].va = hi; + + /* drop the subtrees left of the range */ + if (depth != maxdepth) { + for (i = 0; i < hiidx-1; i++) { + pgno_t childpg = node->datk[i].fo; + if (childpg == 0) + break; + int ischilddirty = _bt_ischilddirty(node, i); + _bt_delco_droptree(state, childpg, depth+1, ischilddirty); + } + } + + /* memmove the buffer so the found range is the first in the node */ + BYTE *dst = (BYTE *)&node->datk[0].va; + BYTE *src = (BYTE *)&node->datk[hiidx-1].va; + BYTE *end = (BYTE *)&node->datk[BT_DAT_MAXKEYS-1].fo; + size_t len = end - src; + + memmove(dst, src, len); + + /* ;;: TODO add temporary asserts for testing? */ + + /* and now zero the moved range */ + ZERO(dst+len, end-(dst+len)); + + /* done if this is a leaf */ + if (depth == maxdepth) + return; + /* otherwise, recur on subtree */ + pgno_t rsubtree = node->datk[hiidx].fo; + _bt_delco_trim_rsubtree_lhs2(state, lo, hi, rsubtree, depth+1, maxdepth); +} + +static void +_bt_delco_trim_rsubtree_lhs(BT_state *state, vaof_t lo, vaof_t hi, + pgno_t nodepg, uint8_t depth) +{ + BT_meta *meta = state->meta_pages[state->which]; + _bt_delco_trim_rsubtree_lhs2(state, lo, hi, nodepg, depth, meta->depth); +} + +static void +_bt_delco_trim_lsubtree_rhs2(BT_state *state, vaof_t lo, vaof_t hi, + pgno_t nodepg, uint8_t depth, uint8_t maxdepth) +{ + BT_page *node = _node_get(state, nodepg); + size_t N = _bt_numkeys(node); + size_t loidx = 0; + + /* find low idx of range */ + size_t i; + for (i = 0; i < N-1; i++) { + vaof_t hhi = node->datk[i+1].va; + if (hhi > lo) { + loidx = i; + break; + } + } + + /* set the hi address of datk[loidx] to hi */ + node->datk[loidx+1].va = hi; + + /* drop the subtrees right of the range */ + if (depth != maxdepth) { + /* recur and droptree for branches */ + for (i = loidx+1; i < N-1; i++) { + pgno_t childpg = node->datk[i].fo; + if (childpg == 0) + break; + int ischilddirty = _bt_ischilddirty(node, i); + _bt_delco_droptree(state, childpg, depth+1, ischilddirty); + } + } + + /* always zero rhs whether node is a leaf or a branch */ + BYTE *beg = (BYTE *)&node->datk[loidx+1].fo; + BYTE *end = (BYTE *)&node->datk[BT_DAT_MAXKEYS-1].fo; + size_t len = end - beg; + + ZERO(beg, len); + /* ;;: this won't zero the last fo, but that should be fine. 
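+     The assert immediately below documents that expectation.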
remove the assert + when you're confident it /is/ fine */ + assert(node->datk[BT_DAT_MAXKEYS-1].fo == 0); + + /* done if this is a leaf */ + if (depth == maxdepth) + return; + /* otherwise, recur on the left subtree */ + pgno_t lsubtree = node->datk[loidx].fo; + _bt_delco_trim_lsubtree_rhs2(state, lo, hi, lsubtree, depth+1, maxdepth); +} + +static void +_bt_delco_trim_lsubtree_rhs(BT_state *state, vaof_t lo, vaof_t hi, + pgno_t nodepg, uint8_t depth) +{ + BT_meta *meta = state->meta_pages[state->which]; + _bt_delco_trim_lsubtree_rhs2(state, lo, hi, nodepg, depth, meta->depth); +} + +static void +_bt_delco(BT_state *state, vaof_t lo, vaof_t hi, + pgno_t nodepg, uint8_t depth, uint8_t maxdepth) +{ + /* ;;: "find_internal_splits" in the original algorithm */ + BT_page *node = _node_get(state, nodepg); + size_t N = _bt_numkeys(node); + + size_t loidx = 0; + size_t hiidx = 0; + pgno_t lsubtree = 0; + pgno_t rsubtree = 0; + + /* find low idx of range */ + for (size_t i = 0; i < N-1; i++) { + vaof_t hhi = node->datk[i+1].va; + if (hhi > lo) { + loidx = i; + break; + } + } + + /* find high idx of range */ + for (size_t i = loidx; i < N; i++) { + vaof_t hhi = node->datk[i].va; + if (hhi >= hi) { + assert(i > 0); + hiidx = i - 1; + break; + } + } + + /* non-split range and at leaf. done */ + if (depth == maxdepth + && hiidx == loidx) { + return; + } + + lsubtree = node->datk[loidx].fo; + rsubtree = node->datk[hiidx].fo; + + if (depth < maxdepth) { + /* guarantee path is dirty by CoWing node if not */ + + /* ;;: refactor? code duplication?? */ + if (!_bt_ischilddirty(node, loidx)) { + BT_page *child = _node_get(state, lsubtree); + pgno_t newpg; + _node_cow(state, child, &newpg); + lsubtree = node->datk[loidx].fo = newpg; + _bt_dirtychild(node, loidx); + } + + if (!_bt_ischilddirty(node, hiidx)) { + BT_page *child = _node_get(state, rsubtree); + pgno_t newpg; + _node_cow(state, child, &newpg); + rsubtree = node->datk[hiidx].fo = newpg; + _bt_dirtychild(node, hiidx); + } + } + + /* non-split range, recurse to child tree */ + if (hiidx == loidx) { + pgno_t childpg = node->datk[loidx].fo; + _bt_delco(state, lo, hi, childpg, depth+1, maxdepth); + } + + /* split range discovered */ + if (hiidx > loidx) { + /* run first pass to guarantee range is completely free */ + if (!SUCC(_bt_delco_1pass(state, lo, hi))) { + /* attempted insert on split range that cannot be coalesced */ + assert(0); + } + + /* set leftmost boundary va to hi */ + node->datk[loidx+1].va = hi; + + /* set the lo side of the right boundary to hi */ + node->datk[hiidx].va = hi; + + /* drop all trees between the two subtrees */ + for (size_t i = loidx+1; i < hiidx; i++) { + pgno_t childpg = node->datk[i].fo; + int ischilddirty = _bt_ischilddirty(node, i); + _bt_delco_droptree(state, childpg, depth+1, ischilddirty); + } + + /* move buffer */ + BYTE *dst = (BYTE *)&node->datk[loidx+1].va; + BYTE *src = (BYTE *)&node->datk[hiidx].va; + BYTE *end = (BYTE *)&node->datk[BT_DAT_MAXKEYS-1].fo; + size_t len = end - src; + memmove(dst, src, len); + ZERO(dst+len, end-(dst+len)); + + /* unless at leaf trim left subtree then trim right subtree */ + if (depth < maxdepth) { + _bt_delco_trim_lsubtree_rhs(state, lo, hi, lsubtree, depth+1); + _bt_delco_trim_rsubtree_lhs(state, lo, hi, rsubtree, depth+1); + } + + /* done */ + return; + } +} + +/* ;;: todo, update meta->depth when we add a row. Should this be done in + _bt_rebalance? 
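+   NB: the root-split path in _bt_insert already bumps meta->depth; this todo
+   concerns depth changes driven by rebalancing/merging.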
*/
+static int
+_bt_insert2(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo,
+            BT_page *node, size_t depth)
+{
+  /* ;;: to be written in such a way that node is guaranteed both dirty and
+     non-full */
+
+  /* ;;: remember:
+     - You need to CoW+dirty a node when you insert a non-dirty node.
+     - You need to insert into a node when:
+       - It's a leaf
+       - It's a branch and you CoWed the child
+     - Hence, all nodes in a path to a leaf being inserted into need to already
+       be dirty or explicitly CoWed. Splitting doesn't actually factor into this
+       decision afaict.
+  */
+
+  assert(node);
+
+  int rc = 255;
+  size_t N = 0;
+  size_t childidx = _bt_childidx(node, lo, hi);
+  assert(childidx != BT_DAT_MAXKEYS);
+  BT_meta *meta = state->meta_pages[state->which];
+
+  if (depth < meta->depth) {
+    pgno_t childpgno = node->datk[childidx].fo;
+    BT_page *child = _node_get(state, childpgno);
+    N = _bt_numkeys(child);
+  }
+
+  /* nullcond: node is a leaf */
+  if (meta->depth == depth) {
+    /* guaranteed non-full and dirty by n-1 recursive call, so just insert */
+    return _bt_insertdat(lo, hi, fo, node, childidx);
+  }
+
+  /* do we need to CoW the child node? (copy the child, not this node) */
+  if (!_bt_ischilddirty(node, childidx)) {
+    BT_page *child = _node_get(state, node->datk[childidx].fo);
+    pgno_t pgno;
+    _node_cow(state, child, &pgno);
+    node->datk[childidx].fo = pgno;
+    _bt_dirtychild(node, childidx);
+  }
+
+  /* do we need to split the child node? */
+  if (N >= BT_DAT_MAXKEYS - 2) {
+    pgno_t rchild_pgno;
+    if (!SUCC(rc = _bt_split_child(state, node, childidx, &rchild_pgno)))
+      return rc;
+
+    /* since we split the child's data, recalculate the child idx */
+    /* ;;: note, this can be simplified into a conditional i++ */
+    childidx = _bt_childidx(node, lo, hi);
+
+  }
+
+  /* the child is now guaranteed non-full (split) and dirty. Recurse */
+  BT_page *child = _node_get(state, node->datk[childidx].fo);
+  return _bt_insert2(state, lo, hi, fo, child, depth+1);
+}
+
+static int
+_bt_insert(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo)
+/* handles CoWing/splitting of the root page since it's special cased. Then
+   passes the child matching hi/lo to _bt_insert2 */
+{
+
+  BT_meta *meta = state->meta_pages[state->which];
+  BT_page *root = _node_get(state, meta->root);
+
+  /* the root MUST be dirty (zero checksum in metapage) */
+  assert(meta->chk == 0);
+
+  size_t N = _bt_numkeys(root);
+
+  /* perform deletion coalescing (and preemptively guarantee path is dirty) if
+     inserting a non-zero (non-free) page */
+  if (fo != 0) {
+    _bt_delco(state, lo, hi, meta->root, 1, meta->depth);
+  }
+
+  /* CoW root's child if it isn't already dirty */
+  size_t childidx = _bt_childidx(root, lo, hi);
+  assert(childidx != BT_DAT_MAXKEYS); /* ;;: this should catch the case of
+                                         improperly inserting into a split
+                                         range. Should we do it earlier or
+                                         differently?
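+                                         (_bt_childidx returns BT_DAT_MAXKEYS
+                                         only when no single entry spans
+                                         [lo, hi), i.e. the range still
+                                         straddles a split that _bt_delco did
+                                         not coalesce.)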
*/ + if (meta->depth > 1 + && !_bt_ischilddirty(root, childidx)) { + BT_page *child = _node_get(state, root->datk[childidx].fo); + pgno_t newchildpg; + _node_cow(state, child, &newchildpg); + root->datk[childidx].fo = newchildpg; + _bt_dirtychild(root, childidx); + } + + /* before calling into recursive insert, handle root splitting since it's + special cased (2 allocs) */ + if (N >= BT_DAT_MAXKEYS - 2) { /* ;;: remind, fix all these conditions to be - 2 */ + pgno_t pg = 0; + + /* the old root is now the left child of the new root */ + BT_page *left = root; + BT_page *right = _bt_nalloc(state); + BT_page *rootnew = _bt_nalloc(state); + + /* split root's data across left and right nodes */ + _bt_split_datcopy(left, right); + /* save left and right in new root's .data */ + pg = _fo_get(state, left); + rootnew->datk[0].fo = pg; + rootnew->datk[0].va = 0; + pg = _fo_get(state, right); + rootnew->datk[1].fo = pg; + rootnew->datk[1].va = right->datk[0].va; + rootnew->datk[2].va = UINT32_MAX; + /* dirty new root's children */ + _bt_dirtychild(rootnew, 0); + _bt_dirtychild(rootnew, 1); + /* update meta page information. (root and depth) */ + pg = _fo_get(state, rootnew); + meta->root = pg; + meta->depth += 1; + root = rootnew; + } + + /* + meta is dirty + root is dirty and split if necessary + root's child in insert path is dirty and split if necessary + finally, recurse on child + */ + return _bt_insert2(state, lo, hi, fo, root, 1); + /* return _bt_insert2(state, lo, hi, fo, child, 1); */ +} + +/* ;;: wip */ +/* ;;: inspired by lmdb's MDB_pageparent. While seemingly unnecessary for + _bt_insert, this may be useful for _bt_delete when we implement deletion + coalescing */ +typedef struct BT_ppage BT_ppage; +struct BT_ppage { + BT_page *node; + BT_page *parent; +}; + +static int +_bt_delete(BT_state *state, vaof_t lo, vaof_t hi) __attribute((unused)); + +static int +_bt_delete(BT_state *state, vaof_t lo, vaof_t hi) +{ + /* ;;: tmp, implement coalescing of zero ranges and merging/rebalancing of + nodes */ + return _bt_insert(state, lo, hi, 0); +} + +static int +_mlist_new(BT_state *state) +{ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + /* assert(root->datk[0].fo == 0); */ + size_t N = _bt_numkeys(root); + + vaof_t lo = root->datk[0].va; + vaof_t hi = root->datk[N-1].va; + + BT_mlistnode *head = calloc(1, sizeof *head); + + head->next = 0; + head->lo = off2addr(lo); + head->hi = off2addr(hi); + state->mlist = head; + + return BT_SUCC; +} + +#if 0 +static int +_flist_grow(BT_state *state, BT_flistnode *space) +/* growing the flist consists of expanding the backing persistent file, pushing + that space onto the disk freelist, and updating the dimension members in + BT_state */ +{ + /* ;;: I don't see any reason to grow the backing file non-linearly, but we + may want to adjust the size of the amount grown based on performance + testing. */ + if (-1 == lseek(state->data_fd, state->file_size + PMA_GROW_SIZE, SEEK_SET)) + return errno; + if (-1 == write(state->data_fd, "", 1)) + return errno; + + + /* find the last node in the disk freelist */ + BT_flistnode *tail = state->flist; + for (; tail->next; tail = tail->next) + ; + + pgno_t lastpgfree = tail->hi; + + /* ;;: TODO, make sure you are certain of this logic. Further, add assertions + regarding relative positions of state->file_size, state->frontier, and + lastpgfree + + we MAY call into this routine even if there is freespace on the end + because it's possible that freespace isn't large enough. 
We may also call + into this routine when the frontier exceeds the last free pg because + that's just how freelists work. ofc, frontier should never exceed + file_size. what other assertions?? + + */ + + /* if the frontier (last pg in use) is less than the last page free, we should + coalesce the new node with the tail. */ + if (state->frontier <= lastpgfree) { + tail->hi += PMA_GROW_SIZE; /* ;;: THIS IS INCORRECT */ + } + /* otherwise, a new node needs to be allocated */ + else { + BT_flistnode *new = calloc(1, sizeof *new); + /* since the frontier exceeds the last pg free, new freespace should + naturally be allocated at the frontier */ + new->pg = state->frontier; + new->hi = PMA_GROW_SIZE; + tail->next = new; + } + + /* finally, update the file size */ + state->file_size += PMA_GROW_SIZE; + + return BT_SUCC; +} +#endif + +static int +_flist_new(BT_state *state) +#define FLIST_PG_START ((BT_META_SECTION_WIDTH + BLK_BASE_LEN0) / BT_PAGESIZE) +{ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + /* assert(root->datk[0].fo == 0); */ + size_t N = _bt_numkeys(root); + + vaof_t lo = root->datk[0].va; + vaof_t hi = root->datk[N-1].va; + size_t len = hi - lo; + + BT_flistnode *head = calloc(1, sizeof *head); + head->next = 0; + head->lo = FLIST_PG_START; + head->hi = FLIST_PG_START + len; + state->flist = head; + + return BT_SUCC; +} + +static int +_nlist_new(BT_state *state) +{ + BT_nlistnode *head = calloc(1, sizeof *head); + + /* the size of a new node freelist is just the first stripe length */ + head->lo = &((BT_page *)state->map)[BT_NUMMETAS]; + head->hi = head->lo + B2PAGES(BLK_BASE_LEN0); + head->next = 0; + + state->nlist = head; + + return BT_SUCC; +} + +static int +_nlist_delete(BT_state *state) +{ + BT_nlistnode *head, *prev; + head = prev = state->nlist; + while (head->next) { + prev = head; + head = head->next; + free(prev); + } + state->nlist = 0; + return BT_SUCC; +} + +#if 0 +static BT_nlistnode * +_nlist_read_prev(BT_nlistnode *head, BT_nlistnode *curr) +{ + /* find nlist node preceding curr and return it */ + BT_nlistnode *p, *n; + p = head; + n = head->next; + for (; n; p = n, n = n->next) { + if (n == curr) + return p; + } + return 0; +} + +/* TODO this is a pretty bad algorithm in terms of time complexity. It should be + fixed, but isn't necessary now as our nlist is quite small. You may want to + consider making nlist doubly linked or incorporate a sort and merge step. */ +static int +_nlist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, + BT_nlistnode *head, uint8_t depth) +/* recursively walk all nodes in the btree. Allocating new nlist nodes when a + node is found to be in a stripe unaccounted for. 
For each node found, + split/shrink the appropriate node to account for the allocated page */ +{ + BT_nlistnode *p, *n; + p = head; + n = head->next; + + /* find the nlist node that fits the current btree node */ + for (; n; p = n, n = n->next) { + if (p->va <= node && p->va + p->sz > node) + break; + } + + /* if the nlist node is only one page wide, it needs to be freed */ + if (p->sz == 1) { + BT_nlistnode *prev = _nlist_read_prev(head, p); + prev->next = p->next; + free(p); + goto e; + } + + /* if the btree node resides at the end of the nlist node, just shrink it */ + BT_page *last = p->va + p->sz - 1; + if (last == node) { + p->sz -= 1; + goto e; + } + + /* if the btree node resides at the start of the nlist node, likewise shrink + it and update the va */ + if (p->va == node) { + p->sz -= 1; + p->va += 1; + goto e; + } + + /* otherwise, need to split the current nlist node */ + BT_nlistnode *right = calloc(1, sizeof *right); + size_t lsz = node - p->va; + size_t rsz = (p->va + p->sz) - node; + /* remove 1 page from the right nlist node's size to account for the allocated + btree node */ + rsz -= 1; + assert(lsz > 0 && rsz > 0); + + /* update the size of the left node. And set the size and va of the right + node. Finally, insert the new nlist node into the nlist. */ + p->sz = lsz; + right->sz = rsz; + right->va = node + 1; + right->next = p->next; + p->next = right; + + e: + /* if at a leaf, we're finished */ + if (depth == maxdepth) { + return BT_SUCC; + } + + /* otherwise iterate over all child nodes, recursively constructing the + list */ + int rc = BT_SUCC; + for (size_t i = 0; i < BT_DAT_MAXKEYS; i++) { + BT_kv kv = node->datk[i]; + BT_page *child = _node_get(state, node->datk[i].fo); + if (!child) continue; + if (!SUCC(rc = _nlist_read2(state, + child, + maxdepth, + head, + depth+1))) + return rc; + } + + /* all children traversed */ + return BT_SUCC; +} + +static int +_nlist_read(BT_state *state) +{ + /* ;;: this should theoretically be simpler than _mlist_read. right? We can + derive the stripes that contain nodes from the block base array stored in + the metapage. What else do we need to know? -- the parts of each stripe + that are free or in use. How can we discover that? + + 1) Without storing any per-page metadata, we could walk the entire tree + from the root. Check the page number of the node. And modify the freelist + accordingly. + + 2) If we stored per-page metadata, this would be simpler. Linearly traverse + each stripe and check if the page is BT_NODE or BT_FREE. + + -- are there downsides to (2)? The only advantage to this would be quicker + startup. So for now, going to traverse all nodes and for each node, + traverse the nlist and split it appropriately. 
+ */ + + int rc = BT_SUCC; + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + + /* ;;: since partition striping isn't implemented yet, simplifying code by + assuming all nodes reside in the 2M region */ + BT_nlistnode *head = calloc(1, sizeof *head); + head->sz = BLK_BASE_LEN0; + head->va = &((BT_page *)state->map)[BT_NUMMETAS]; + head->next = 0; + + if (!SUCC(rc = _nlist_read2(state, root, meta->depth, head, 1))) + return rc; + + state->nlist = head; + + return rc; +} + +static BT_mlistnode * +_mlist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth) +{ + /* leaf */ + if (depth == maxdepth) { + BT_mlistnode *head, *prev; + head = prev = calloc(1, sizeof *head); + + size_t i = 0; + BT_kv *kv = &node->datk[i]; + while (i < BT_DAT_MAXKEYS - 1) { +#if CAN_COALESCE + /* free and contiguous with previous mlist node: merge */ + if (kv->fo == 0 + && addr2off(prev->va) + prev->sz == kv->va) { + vaof_t hi = node->datk[i+1].va; + vaof_t lo = kv->va; + size_t len = hi - lo; + prev->sz += len; + } + /* free but not contiguous with previous mlist node: append new node */ + else if (kv->fo == 0) { +#endif + BT_mlistnode *new = calloc(1, sizeof *new); + vaof_t hi = node->datk[i+1].va; + vaof_t lo = kv->va; + size_t len = hi - lo; + new->sz = len; + new->va = off2addr(lo); + prev->next = new; + prev = new; +#if CAN_COALESCE + } +#endif + + kv = &node->datk[++i]; + } + return head; + } + + /* branch */ + size_t i = 0; + BT_mlistnode *head, *prev; + head = prev = 0; + for (; i < BT_DAT_MAXKEYS; ++i) { + BT_kv kv = node->datk[i]; + if (kv.fo == BT_NOPAGE) + continue; + BT_page *child = _node_get(state, kv.fo); + BT_mlistnode *new = _mlist_read2(state, child, maxdepth, depth+1); + if (head == 0) { + head = prev = new; + } + else { + /* just blindly append and unify the ends afterward */ + prev->next = new; + } + } + return 0; +} + +static int +_mlist_read(BT_state *state) +{ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + uint8_t maxdepth = meta->depth; + BT_mlistnode *head = _mlist_read2(state, root, maxdepth, 1); + + /* + trace the full freelist and unify nodes one last time + NB: linking the leaf nodes would make this unnecessary + */ +#if CAN_COALESCE + BT_mlistnode *p = head; + BT_mlistnode *n = head->next; + while (n) { + size_t llen = P2BYTES(p->sz); + uintptr_t laddr = (uintptr_t)p->va; + uintptr_t raddr = (uintptr_t)n->va; + /* contiguous: unify */ + if (laddr + llen == raddr) { + p->sz += n->sz; + p->next = n->next; + free(n); + } + } +#endif + + state->mlist = head; + return BT_SUCC; +} +#endif + +static int +_mlist_delete(BT_state *state) +{ + BT_mlistnode *head, *prev; + head = prev = state->mlist; + while (head->next) { + prev = head; + head = head->next; + free(prev); + } + state->mlist = 0; + return BT_SUCC; +} + +#if 0 +BT_flistnode * +_flist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth) +{ + size_t N = _bt_numkeys(node); + /* leaf */ + if (depth == maxdepth) { + BT_flistnode *head, *prev; + head = prev = calloc(1, sizeof(*head)); + + /* ;;: fixme the head won't get populated in this logic */ + size_t i = 0; + BT_kv *kv = &node->datk[i]; + while (i < N-1) { + /* Just blindly append nodes since they aren't guaranteed sorted */ + BT_flistnode *new = calloc(1, sizeof *new); + vaof_t hi = node->datk[i+1].va; + vaof_t lo = kv->va; + size_t len = hi - lo; + pgno_t fo = kv->fo; + new->sz = len; + new->pg = fo; + prev->next = new; + prev = new; + + kv = 
&node->datk[++i]; + } + for (size_t i = 0; i < N-1; i++) { + vaof_t hi = node->datk[i+1].va; + vaof_t lo = node->datk[i].va; + size_t len = hi - lo; + pgno_t fo = node->datk[i].fo; + /* not free */ + if (fo != 0) + continue; + } + return head; + } + + /* branch */ + size_t i = 0; + BT_flistnode *head, *prev; + head = prev = 0; + for (; i < N; ++i) { + BT_kv kv = node->datk[i]; + if (kv.fo == BT_NOPAGE) + continue; + BT_page *child = _node_get(state, kv.fo); + BT_flistnode *new = _flist_read2(state, child, maxdepth, depth+1); + if (head == 0) { + head = prev = new; + } + else { + /* just blindly append and unify the ends afterward */ + prev->next = new; + } + } + return 0; +} + +static int +_flist_read(BT_state *state) +{ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + uint8_t maxdepth = meta->depth; + BT_flistnode *head = _flist_read2(state, root, maxdepth, 1); + /* ;;: infinite loop with proper starting depth of 1. -- fix that! */ + /* BT_flistnode *head = _flist_read2(state, root, maxdepth, 1); */ + + if (head == 0) + return BT_SUCC; + + /* sort the freelist */ + _flist_mergesort(head); + + /* merge contiguous regions after sorting */ + BT_flistnode *p = head; + BT_flistnode *n = head->next; + while (n) { + size_t llen = p->sz; + pgno_t lfo = p->pg; + pgno_t rfo = n->pg; + /* contiguous: unify */ + if (lfo + llen == rfo) { + p->sz += n->sz; + p->next = n->next; + free(n); + } + } + + state->flist = head; + return BT_SUCC; +} +#endif + +static int +_flist_delete(BT_state *state) +{ + BT_flistnode *head, *prev; + head = prev = state->flist; + while (head->next) { + prev = head; + head = head->next; + free(prev); + } + state->flist = 0; + return BT_SUCC; +} + +#define CLOSE_FD(fd) \ + do { \ + close(fd); \ + fd = -1; \ + } while(0) + +/* TODO: move to lib */ +static uint32_t +nonzero_crc_32(void *dat, size_t len) +{ + unsigned char nonce = 0; + uint32_t chk = crc_32(dat, len); + + do { + if (nonce > 8) + abort(); + chk = update_crc_32(chk, nonce++); + } while (chk == 0); + + return chk; +} + +static void +_bt_state_restore_maps2(BT_state *state, BT_page *node, + uint8_t depth, uint8_t maxdepth) +{ + size_t N = _bt_numkeys(node); + + /* leaf */ + if (depth == maxdepth) { + for (size_t i = 0; i < N-1; i++) { + vaof_t lo = node->datk[i].va; + vaof_t hi = node->datk[i+1].va; + pgno_t pg = node->datk[i].fo; + + BYTE *loaddr = off2addr(lo); + BYTE *hiaddr = off2addr(hi); + size_t bytelen = hiaddr - loaddr; + off_t offset = P2BYTES(pg); + + if (pg != 0) { + /* not freespace, map readonly data on disk */ + if (loaddr != + mmap(loaddr, + bytelen, + BT_PROT_CLEAN, + BT_FLAG_CLEAN, + state->data_fd, + offset)) { + DPRINTF("mmap: failed to map at addr %p, errno: %s", loaddr, strerror(errno)); + abort(); + } + } + else { + /* freespace, map no access */ + if (loaddr != + mmap(loaddr, + bytelen, + BT_PROT_FREE, + BT_FLAG_FREE, + 0, 0)) { + DPRINTF("mmap: failed to map at addr %p, errno: %s", loaddr, strerror(errno)); + abort(); + } + } + } + return; + } + + /* branch - dfs all subtrees */ + for (size_t i = 0; i < N-1; i++) { + /* ;;: assuming node stripes when partition striping is implemented will be + 1:1 mapped to disk for simplicity. If that is not the case, they should + be handled here. 
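+       As written, only leaf-level data ranges get fresh mmap calls here;
+       interior nodes are reached through the fixed node-stripe mapping
+       established in _bt_state_load.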
*/ + pgno_t pg = node->datk[i].fo; + BT_page *child = _node_get(state, pg); + _bt_state_restore_maps2(state, child, depth+1, maxdepth); + } +} + +static void +_bt_state_restore_maps(BT_state *state) +/* restores the memory map of the btree since data can be arbitrarily located */ +{ + /* TODO: add checks to ensure data isn't mapped into an invalid location + (e.g. a node stripe) */ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + _bt_state_restore_maps2(state, root, 1, meta->depth); +} + +static int +_bt_state_meta_which(BT_state *state) +{ + BT_meta *m1 = state->meta_pages[0]; + BT_meta *m2 = state->meta_pages[1]; + int which = -1; + + if (m1->chk == 0) { + /* first is dirty */ + which = 1; + } + else if (m2->chk == 0) { + /* second is dirty */ + which = 0; + } + else if (m1->txnid > m2->txnid) { + /* first is most recent */ + which = 0; + } + else if (m1->txnid < m2->txnid) { + /* second is most recent */ + which = 1; + } + else { + /* invalid state */ + return EINVAL; + } + + /* checksum the metapage found and abort if checksum doesn't match */ + BT_meta *meta = state->meta_pages[which]; + uint32_t chk = nonzero_crc_32(meta, BT_META_LEN); + if (chk != meta->chk) { + abort(); + } + + /* set which in state */ + state->which = which; + + return BT_SUCC; +} + +static int +_bt_state_read_header(BT_state *state) +{ + BT_meta *m1, *m2; + int rc = 1; + BYTE metas[BT_PAGESIZE*2] = {0}; + m1 = state->meta_pages[0]; + m2 = state->meta_pages[1]; + + TRACE(); + + if (pread(state->data_fd, metas, BT_PAGESIZE*2, 0) + != BT_PAGESIZE*2) { + /* new pma */ + return ENOENT; + } + + /* validate magic */ + if (m1->magic != BT_MAGIC) { + DPRINTF("metapage 0x%pX inconsistent magic: 0x%" PRIX32, m1, m1->magic); + return EINVAL; + } + if (m2->magic != BT_MAGIC) { + DPRINTF("metapage 0x%pX inconsistent magic: 0x%" PRIX32, m2, m2->magic); + return EINVAL; + } + + /* validate flags */ + if ((m1->flags & BP_META) != BP_META) { + DPRINTF("metapage 0x%pX missing meta page flag", m1); + return EINVAL; + } + if ((m2->flags & BP_META) != BP_META) { + DPRINTF("metapage 0x%pX missing meta page flag", m2); + return EINVAL; + } + + /* validate binary version */ + if (m1->version != BT_VERSION) { + DPRINTF("version mismatch on metapage: 0x%pX, metapage version: %" PRIu32 ", binary version %u", + m1, m1->version, BT_VERSION); + return EINVAL; + } + + /* validate binary version */ + if (m2->version != BT_VERSION) { + DPRINTF("version mismatch on metapage: 0x%pX, metapage version: %" PRIu32 ", binary version %u", + m2, m2->version, BT_VERSION); + return EINVAL; + } + + if (!SUCC(rc = _bt_state_meta_which(state))) + return rc; + + return BT_SUCC; +} + +static int +_bt_state_meta_new(BT_state *state) +#define INITIAL_ROOTPG 2 +{ + BT_page *p1, *p2, *root; + BT_meta meta = {0}; + + TRACE(); + + /* open the metapage region for writing */ + if (mprotect(BT_MAPADDR, BT_META_SECTION_WIDTH, + BT_PROT_DIRTY) != 0) { + DPRINTF("mprotect of metapage section failed with %s", strerror(errno)); + abort(); + } + + /* initialize the block base array */ + meta.blk_base[0] = BT_PAGESIZE * BT_NUMMETAS; + + root = _bt_nalloc(state); + _bt_root_new(&meta, root); + + /* initialize meta struct */ + meta.magic = BT_MAGIC; + meta.version = BT_VERSION; + meta.last_pg = 1; + meta.txnid = 0; + meta.fix_addr = BT_MAPADDR; + meta.blk_cnt = 1; + meta.depth = 1; + meta.flags = BP_META; + meta.root = _fo_get(state, root); + assert(meta.root == INITIAL_ROOTPG); /* ;;: remove?? 
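+      Pages 0 and 1 are the two metapages, so the first node handed out by
+      _bt_nalloc should land at pgno 2.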
*/ + + /* initialize the metapages */ + p1 = &((BT_page *)state->map)[0]; + p2 = &((BT_page *)state->map)[1]; + + /* copy the metadata into the metapages */ + memcpy(METADATA(p1), &meta, sizeof meta); + /* ;;: todo, should the second metapage actually share a .root with the + first?? */ + memcpy(METADATA(p2), &meta, sizeof meta); + + /* only the active metapage should be writable (first page) */ + if (mprotect(BT_MAPADDR, BT_META_SECTION_WIDTH, BT_PROT_CLEAN) != 0) { + DPRINTF("mprotect of metapage section failed with %s", strerror(errno)); + abort(); + } + if (mprotect(BT_MAPADDR, BT_PAGESIZE, + BT_PROT_DIRTY) != 0) { + DPRINTF("mprotect of current metapage failed with %s", strerror(errno)); + abort(); + } + + return BT_SUCC; +} + +static void +_freelist_restore2(BT_state *state, BT_page *node, + uint8_t depth, uint8_t maxdepth) +{ + size_t N = _bt_numkeys(node); + + /* leaf */ + if (depth == maxdepth) { + for (size_t i = 0; i < N-1; i++) { + /* if allocated */ + if (node->datk[i].fo != 0) { + /* record allocated memory range */ + BT_page *lo = off2addr(node->datk[i].va); + BT_page *hi = off2addr(node->datk[i+1].va); + _mlist_record_alloc(state, lo, hi); + /* record allocated file range */ + ssize_t siz_p = hi - lo; + assert(siz_p > 0); + assert(siz_p < UINT32_MAX); + pgno_t lofo = node->datk[i].fo; + pgno_t hifo = lofo + (pgno_t)siz_p; + _flist_record_alloc(state, lofo, hifo); + } + } + return; + } + /* branch */ + for (size_t i = 0; i < N-1; i++) { + pgno_t fo = node->datk[i].fo; + if (fo != 0) { + /* record allocated node */ + BT_page *child = _node_get(state, fo); + _nlist_record_alloc(state, child); + _freelist_restore2(state, child, depth+1, maxdepth); + } + } +} + +static void +_freelist_restore(BT_state *state) +/* restores the mlist, nlist, and mlist */ +{ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + assert(SUCC(_nlist_new(state))); + assert(SUCC(_mlist_new(state))); + assert(SUCC(_flist_new(state))); + /* first record root's allocation */ + _nlist_record_alloc(state, root); + _freelist_restore2(state, root, 1, meta->depth); +} + +static int +_bt_state_load(BT_state *state) +{ + int rc; + int new = 0; + BT_page *p; + struct stat stat; + + TRACE(); + + /* map first node stripe (along with metapages) as read only */ + state->map = mmap(BT_MAPADDR, + BT_META_SECTION_WIDTH + BLK_BASE_LEN0, + BT_PROT_CLEAN, + BT_FLAG_CLEAN, + state->data_fd, + 0); + + p = (BT_page *)state->map; + state->meta_pages[0] = METADATA(p); + state->meta_pages[1] = METADATA(p + 1); + + if (!SUCC(rc = _bt_state_read_header(state))) { + if (rc != ENOENT) return rc; + DPUTS("creating new db"); + state->file_size = PMA_GROW_SIZE; + new = 1; + if(ftruncate(state->data_fd, PMA_GROW_SIZE)) { + return errno; + } + } + + if (state->map != BT_MAPADDR) { + DPRINTF("mmap: failed to map at addr %p, errno: %s", BT_MAPADDR, strerror(errno)); + abort(); + } + + BYTE *nullspace_addr = BT_MAPADDR + (BT_META_SECTION_WIDTH + BLK_BASE_LEN0); + size_t nullspace_len = BLK_BASE_LEN_TOTAL - (BT_META_SECTION_WIDTH + BLK_BASE_LEN0); + if (nullspace_addr != mmap(nullspace_addr, + nullspace_len, + BT_PROT_FREE, + BT_FLAG_FREE, + 0, 0)) { + DPRINTF("mmap: failed to map at addr %p, errno: %s", nullspace_addr, strerror(errno)); + abort(); + } + + /* new db, so populate metadata */ + if (new) { + /* ;;: move this logic to _flist_new */ + if (-1 == lseek(state->data_fd, state->file_size, SEEK_SET)) + return errno; + if (-1 == write(state->data_fd, "", 1)) + return errno; + + 
state->file_size = PMA_GROW_SIZE; + + assert(SUCC(_nlist_new(state))); + + if (!SUCC(rc = _bt_state_meta_new(state))) { + munmap(state->map, BT_ADDRSIZE); + return rc; + } + + assert(SUCC(_mlist_new(state))); + assert(SUCC(_flist_new(state))); + } + else { + /* restore data memory maps */ + _bt_state_restore_maps(state); + + /* restore ephemeral freelists */ + _freelist_restore(state); + + /* Dirty the metapage and root page */ + assert(SUCC(_bt_flip_meta(state))); + + /* Set the file length */ + // XX make sure the flist is updated with this! + if (fstat(state->data_fd, &stat) != 0) + return errno; + + state->file_size = stat.st_size; + } + + return BT_SUCC; +} + +/* ;;: TODO, when persistence has been implemented, _bt_falloc will probably + need to handle extension of the file with appropriate striping. i.e. if no + space is found on the freelist, save the last entry, expand the file size, + and set last_entry->next to a new node representing the newly added file + space */ +static pgno_t +_bt_falloc(BT_state *state, size_t pages) +{ + /* walk the persistent file freelist and return a pgno with sufficient + contiguous space for pages */ + BT_flistnode **n = &state->flist; + pgno_t ret = 0; + + /* first fit */ + for (; *n; n = &(*n)->next) { + size_t sz_p = (*n)->hi - (*n)->lo; + + if (sz_p >= pages) { + ret = (*n)->lo; + pgno_t hi = ret + pages; + _flist_record_alloc(state, ret, hi); + break; + } + } + + if (ret == 0) { + DPUTS("flist out of mem!"); + return UINT32_MAX; + } + + return ret; +} + +static int +_bt_sync_hasdirtypage(BT_state *state, BT_page *node) __attribute((unused)); + +static int +_bt_sync_hasdirtypage(BT_state *state, BT_page *node) +/* ;;: could be more efficiently replaced by a gcc vectorized builtin */ +{ + for (size_t i = 0; i < NMEMB(node->head.dirty); i++) { + if (node->head.dirty[i] != 0) + return 1; + } + + return 0; +} + +static int +_bt_sync_leaf(BT_state *state, BT_page *node) +{ + /* msync all of a leaf's data that is dirty. The caller is expected to sync + the node itself and mark it as clean in the parent. */ + size_t i = 0; + size_t N = _bt_numkeys(node); + + for (i = 0; i < N-1; i++) { + if (!_bt_ischilddirty(node, i)) + continue; /* not dirty. nothing to do */ + + /* ;;: we don't actually need the page, do we? */ + /* pgno_t pg = node->datk[i].fo; */ + vaof_t lo = node->datk[i].va; + vaof_t hi = node->datk[i+1].va; + size_t bytelen = P2BYTES(hi - lo); + void *addr = off2addr(lo); + + /* sync the page */ + if (msync(addr, bytelen, MS_SYNC) != 0) { + DPRINTF("msync of leaf: %p failed with %s", addr, strerror(errno)); + abort(); + } + + /* mprotect the data */ + if (mprotect(addr, bytelen, BT_PROT_CLEAN) != 0) { + DPRINTF("mprotect of leaf data failed with %s", strerror(errno)); + abort(); + } + + /* and clean the dirty bit */ + _bt_cleanchild(node, i); + } + + /* ;;: all data pages synced. should we now sync the node as well? No, I think + that should be the caller's responsibility */ + + /* ;;: it is probably faster to scan the dirty bit set and derive the datk idx + rather than iterate over the full datk array and check if it is dirty. This + was simpler to implement for now though. */ + /* while (_bt_sync_hasdirtypage(state, node)) { */ + /* ... */ + /* } */ + + return BT_SUCC; +} + +static int +_bt_sync_meta(BT_state *state) +/* syncs the metapage and performs necessary checksumming. 
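+   The commit protocol, roughly: (1) _bt_sync msyncs dirty data and nodes
+   bottom-up; (2) this routine stamps the active metapage with a fresh txnid
+   and a non-zero checksum and msyncs it; (3) _bt_flip_meta zeroes the other
+   metapage's chk, copies the metadata over, CoWs a new root, and makes that
+   metapage the active (writable) one. On restore, _bt_state_meta_which picks
+   the metapage with a non-zero chk, preferring the higher txnid.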
Additionally, flips + the which */ +{ + BT_meta *meta = state->meta_pages[state->which]; + uint32_t chk; + int rc; + + /* increment the txnid */ + meta->txnid += 1; + + /* checksum the metapage */ + chk = nonzero_crc_32(meta, BT_META_LEN); + /* ;;: todo: guarantee the chk cannot be zero */ + + meta->chk = chk; + + /* sync the metapage */ + if (msync(LO_ALIGN_PAGE(meta), sizeof(BT_page), MS_SYNC) != 0) { + DPRINTF("msync of metapage: %p failed with %s", meta, strerror(errno)); + abort(); + } + + // ensure we have a new dirty metapage and root node + /* finally, make old metapage clean */ + rc = _bt_flip_meta(state); + + if (mprotect(LO_ALIGN_PAGE(meta), sizeof(BT_page), BT_PROT_CLEAN) != 0) { + DPRINTF("mprotect of old metapage failed with %s", strerror(errno)); + abort(); + } + + return rc; +} + +static int _bt_flip_meta(BT_state *state) { + BT_meta *meta = state->meta_pages[state->which]; + BT_meta *newmeta; + int newwhich; + + /* zero the new metapage's checksum */ + newwhich = state->which ? 0 : 1; + newmeta = state->meta_pages[newwhich]; + + /* mprotect dirty new metapage */ + if (mprotect(LO_ALIGN_PAGE(newmeta), sizeof(BT_page), BT_PROT_DIRTY) != 0) { + DPRINTF("mprotect of new metapage failed with %s", strerror(errno)); + abort(); + } + + newmeta->chk = 0; + + /* copy over metapage to new metapage excluding the checksum */ + memcpy(newmeta, meta, BT_META_LEN); + + /* CoW a new root since the root referred to by the metapage should always be + dirty */ + BT_page *root; + pgno_t newrootpg; + root = _node_get(state, newmeta->root); + if (!SUCC(_node_cow(state, root, &newrootpg))) + abort(); + + newmeta->root = newrootpg; + + /* switch the metapage we're referring to */ + state->which = newwhich; + + return BT_SUCC; +} + +static int +_bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth) +/* recursively syncs the subtree under node. The caller is expected to sync node + itself and mark it clean. */ +{ + int rc = 0; + size_t N = _bt_numkeys(node); + + /* leaf */ + if (depth == maxdepth) { + _bt_sync_leaf(state, node); + goto e; + } + + /* do dfs */ + for (size_t i = 0; i < N-1; i++) { + if (!_bt_ischilddirty(node, i)) + continue; /* not dirty. 
nothing to do */ + + BT_page *child = _node_get(state, node->datk[i].fo); + + /* recursively sync the child's data */ + if ((rc = _bt_sync(state, child, depth+1, maxdepth))) + return rc; + + /* sync the child node */ + if (msync(child, sizeof(BT_page), MS_SYNC) != 0) { + DPRINTF("msync of child node: %p failed with %s", child, strerror(errno)); + abort(); + } + + /* unset child dirty bit */ + _bt_cleanchild(node, i); + } + + e: + /* all modifications done in node, mark it read-only */ + if (mprotect(node, sizeof(BT_page), BT_PROT_CLEAN) != 0) { + DPRINTF("mprotect of node failed with %s", strerror(errno)); + abort(); + } + + return BT_SUCC; +} + + +//// =========================================================================== +//// btree external routines + +int +bt_state_new(BT_state **state) +{ + // TRACE(); + + BT_state *s = calloc(1, sizeof *s); + s->data_fd = -1; + s->fixaddr = BT_MAPADDR; + *state = s; + return BT_SUCC; +} + +#define DATANAME "/data.pma" +int +bt_state_open(BT_state *state, const char *path, ULONG flags, mode_t mode) +{ + int oflags, rc; + char *dpath; + + TRACE(); + UNUSED(flags); + + oflags = O_RDWR | O_CREAT; + dpath = malloc(strlen(path) + sizeof(DATANAME)); + if (!dpath) return ENOMEM; + sprintf(dpath, "%s" DATANAME, path); + + if ((state->data_fd = open(dpath, oflags, mode)) == -1) + return errno; + + if (!SUCC(rc = _bt_state_load(state))) + goto e; + + state->path = strdup(dpath); + + e: + /* cleanup FDs stored in state if anything failed */ + if (!SUCC(rc)) { + if (state->data_fd != -1) CLOSE_FD(state->data_fd); + } + + free(dpath); + return rc; +} + +int +bt_state_close(BT_state *state) +{ + int rc; + bt_sync(state); + + _mlist_delete(state); + _flist_delete(state); + _nlist_delete(state); + + if ((rc = munmap(state->map, BT_ADDRSIZE)) != 0) { + rc = errno; + return rc; + } + if (state->data_fd != -1) CLOSE_FD(state->data_fd); + + ZERO(state, sizeof *state); + + return BT_SUCC; +} + +void * +bt_malloc(BT_state *state, size_t pages) +{ + BT_mlistnode **n = &state->mlist; + void *ret = 0; + /* first fit */ + for (; *n; n = &(*n)->next) { + size_t sz_p = addr2off((*n)->hi) - addr2off((*n)->lo); + + if (sz_p >= pages) { + ret = (*n)->lo; + BT_page *hi = ((BT_page *)ret) + pages; + _mlist_record_alloc(state, ret, hi); + break; + } + // XX return early if nothing suitable found in freelist + } + if (ret == 0) { + DPUTS("mlist out of mem!"); + return 0; + } + + pgno_t pgno = _bt_falloc(state, pages); + bp(pgno != 0); + _bt_insert(state, + addr2off(ret), + addr2off(ret) + pages, + pgno); + + DPRINTF("map %p to offset 0x%zx bytes (0x%zx pages)\n", ret, P2BYTES(pgno), pages); + if (ret != + mmap(ret, + P2BYTES(pages), + BT_PROT_DIRTY, + BT_FLAG_DIRTY, + state->data_fd, + P2BYTES(pgno))) { + DPRINTF("mmap: failed to map at addr %p, errno: %s", ret, strerror(errno)); + abort(); + } + bp(ret != 0); + return ret; +} + +// XX need to mmap fixed/anon/no_reserve and prot_none +void +bt_free(BT_state *state, void *lo, void *hi) +{ + vaof_t looff = addr2off(lo); + vaof_t hioff = addr2off(hi); + pgno_t lopg, hipg; + BT_findpath path = {0}; + + if (!SUCC(_bt_find(state, &path, looff, hioff))) { + DPRINTF("Failed to find range: (%p, %p)", lo, hi); + abort(); + } + + /* insert null into btree */ + _bt_insert(state, looff, hioff, 0); + /* insert freed range into mlist */ + _mlist_insert(state, lo, hi); + /* insert freed range into flist */ + BT_page *leaf = path.path[path.depth]; + size_t childidx = path.idx[path.depth]; + int isdirty = _bt_ischilddirty(leaf, childidx); + BT_kv 
kv = leaf->datk[childidx]; + vaof_t offset = looff - kv.va; + lopg = kv.fo + offset; + hipg = lopg + (looff - hioff); + if (isdirty) { + _flist_insert(&state->flist, lopg, hipg); + } + else { + _flist_insert(&state->pending_flist, lopg, hipg); + } + + /* ;;: is this correct? Shouldn't this actually happen when we merge the + pending_mlist on sync? */ + size_t bytelen = (BYTE *)hi - (BYTE *)lo; + + if (lo != + mmap(lo, + bytelen, + BT_PROT_FREE, + BT_FLAG_FREE, + 0, 0)) { + DPRINTF("mmap: failed to map at addr %p, errno: %s", lo, strerror(errno)); + abort(); + } +} + +// XX need to mprotect PROT_READ all ranges synced including root/meta +int +bt_sync(BT_state *state) +{ + /* as is often the case, handling the metapage/root is a special case, which + is done here. Syncing any other page of the tree is done in _bt_sync */ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + int rc = 0; + + if ((rc = _bt_sync(state, root, 1, meta->depth))) + return rc; + + /* merge the pending freelists */ + _pending_nlist_merge(state); + _pending_flist_merge(state); + + /* sync the root page */ + if (msync(root, sizeof(BT_page), MS_SYNC) != 0) { + DPRINTF("msync of root: %p failed with %s", root, strerror(errno)); + abort(); + } + + /* make root read-only */ + if (mprotect(root, sizeof(BT_page), BT_PROT_CLEAN) != 0) { + DPRINTF("mprotect of root failed with %s", strerror(errno)); + abort(); + } + + /* then sync the metapage */ + if ((rc = _bt_sync_meta(state))) + return rc; + + return BT_SUCC; +} + +uint64_t +bt_meta_get(BT_state *state, size_t idx) +{ + BT_meta *meta = state->meta_pages[state->which]; + assert((uintptr_t)&(meta->roots[idx]) - (uintptr_t)meta <= sizeof *meta); + return meta->roots[idx]; +} + +void +bt_meta_set(BT_state *state, size_t idx, uint64_t val) +{ + BT_meta *meta = state->meta_pages[state->which]; + assert((uintptr_t)&(meta->roots[idx]) - (uintptr_t)meta <= sizeof *meta); + meta->roots[idx] = val; +} + +int +_bt_range_of(BT_state *state, vaof_t p, vaof_t **lo, vaof_t **hi, + pgno_t nodepg, uint8_t depth, uint8_t maxdepth) +{ + BT_page *node = _node_get(state, nodepg); + size_t N = _bt_numkeys(node); + + vaof_t llo = 0; + vaof_t hhi = 0; + pgno_t pg = 0; + size_t i; + for (i = 0; i < N-1; i++) { + llo = node->datk[i].va; + hhi = node->datk[i+1].va; + pg = node->datk[i].fo; + if (llo <= p && hhi > p) { + break; + } + } + /* not found */ + if (i == N-1) + return 1; + + if (depth == maxdepth) { + **lo = llo; + **hi = hhi; + return BT_SUCC; + } + + return _bt_range_of(state, p, lo, hi, pg, depth+1, maxdepth); +} + +int +bt_range_of(BT_state *state, void *p, void **lo, void **hi) +{ + /* traverse tree looking for lo <= p and hi > p. 
return that range as a pair + of pointers NOT as two vaof_t + + 0: succ (found) + 1: otherwise + */ + + BT_meta *meta = state->meta_pages[state->which]; + pgno_t root = meta->root; + vaof_t *loret = 0; + vaof_t *hiret = 0; + vaof_t poff = addr2off(p); + int rc = 0; + if (!SUCC(rc = _bt_range_of(state, poff, &loret, &hiret, root, 1, meta->depth))) { + return rc; + } + *lo = off2addr(*loret); + *hi = off2addr(*hiret); + return BT_SUCC; +} + +/** + +pseudocode from ed: + +bt_dirty(btree, lo, hi): + loop: + (range_lo, range_hi) = find_range_for_pointer(btree, lo); + dirty_hi = min(hi, range_hi); + new_start_fo = data_cow(btree, lo, dirty_hi); + lo := range_hi; + if dirty_hi == hi then break; + +// precondition: given range does not cross a tree boundary +data_cow(btree, lo, hi): + (range_lo, range_hi, fo) = bt_find(btree, lo, hi); + size = lo - hi; + new_fo = data_alloc(btree.data_free, size); + + // puts data in the unified buffer cache without having to map virtual memory + write(fd, new_fo, size * BT_PAGESIZE, to_ptr(lo)); + + // maps new file offset with same data back into same memory + mmap(fd, new_fo, size, to_ptr(lo)); + + bt_insert(btree, lo, hi, new_fo); + + offset = lo - range_lo; + freelist_insert(btree.pending_data_flist, fo + offset, fo + offset + size); + return new_fo + +**/ + +static pgno_t +_bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi, pgno_t pg) +{ + size_t len = hi - lo; + size_t bytelen = P2BYTES(len); + pgno_t newpg = _bt_falloc(state, len); + BYTE *loaddr = off2addr(lo); + off_t offset = P2BYTES(newpg); + + /* write call puts data in the unified buffer cache without having to map + virtual memory */ + if (pwrite(state->data_fd, loaddr, bytelen, offset) != (ssize_t)bytelen) + abort(); + + /* maps new file offset with same data back into memory */ + if (loaddr != + mmap(loaddr, + bytelen, + BT_PROT_DIRTY, + BT_FLAG_DIRTY, + state->data_fd, + offset)) { + DPRINTF("mmap: failed to map at addr %p, errno: %s", loaddr, strerror(errno)); + abort(); + } + + _bt_insert(state, lo, hi, newpg); + + _flist_insert(&state->pending_flist, pg, pg + len); + + return newpg; +} + +#define MIN(x, y) ((x) > (y) ? (y) : (x)) + +static int +_bt_dirty(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg, + uint8_t depth, uint8_t maxdepth) +{ + BT_page *node = _node_get(state, nodepg); + size_t N = _bt_numkeys(node); + size_t loidx = BT_DAT_MAXKEYS; // 0 is a valid loidx! + size_t hiidx = 0; + + /* find loidx of range */ + for (size_t i = 0; i < N-1; i++) { + vaof_t hhi = node->datk[i+1].va; + if (hhi > lo) { + loidx = i; + break; + } + } + assert(loidx < BT_DAT_MAXKEYS); + + /* find hiidx (exclusive) of range */ + for (size_t i = loidx+1; i < N; i++) { + vaof_t hhi = node->datk[i].va; + if (hhi >= hi) { + hiidx = i; + break; + } + } + assert(hiidx != 0); + + /* found a range in node that contains (lo-hi). May span multiple entries */ + /* leaf: base case. 
cow the data */ + if (depth == maxdepth) { + for (size_t i = loidx; i < hiidx; i++) { + vaof_t llo = node->datk[i].va; + vaof_t hhi = MIN(node->datk[i+1].va, hi); + pgno_t pg = node->datk[i].fo; + pgno_t newpg = _bt_data_cow(state, llo, hhi, pg); + _bt_insert(state, llo, hhi, newpg); + } + } else { + for (size_t i = loidx; i < hiidx; i++) { + /* branch: recursive case */ + pgno_t childpg = node->datk[i].fo; + /* iteratively recurse on all entries */ + _bt_dirty(state, lo, hi, childpg, depth+1, maxdepth); + } + } + return BT_SUCC; +} + +int +bt_dirty(BT_state *state, void *lo, void *hi) +{ + /* takes a range and ensures that entire range is CoWed */ + /* if part of the range is free then return 1 */ + BT_meta *meta = state->meta_pages[state->which]; + vaof_t looff = addr2off(lo); + vaof_t hioff = addr2off(hi); + + return _bt_dirty(state, looff, hioff, meta->root, 1, meta->depth); +} + +int +bt_next_alloc(BT_state *state, void *p, void **lo, void **hi) +/* if p is free, sets lo and hi to the bounds of the next adjacent allocated + space. If p is allocated, sets lo and hi to the bounds of the allocated space + it falls in. */ +{ + BT_mlistnode *head = state->mlist; + BYTE *pb = p; + BYTE* pma_end; + while (head) { + /* at last free block, different logic applies */ + if (head->next == 0) + goto end; + + /* p is in a free range, return the allocated hole after it */ + if (head->lo <= pb + && head->hi > pb) { + goto found; + } + + /* p is alloced, return this hole */ + if (head->next->lo > pb + && head->hi <= pb) { + goto found; + } + + head = head->next; + } + + /* not found */ + return 1; + + found: + /* the alloced space begins at the end of the free block */ + *lo = head->hi; + /* ... and ends at the start of the next free block */ + *hi = head->next->lo; + return BT_SUCC; + + end: + pma_end = (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE); + assert(head->hi <= pma_end); + /* no alloced region between tail of freelist and end of pma memory space */ + if (head->hi == pma_end) + return 1; + + /* otherwise, return the alloced region between the tail of the freelist and + the end of the memory arena */ + *lo = head->hi; + *hi = pma_end; + return BT_SUCC; +} + +void +bt_bounds(BT_state *state, void **lo, void **hi) +{ + *lo = BT_MAPADDR; + *hi = (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE); +} + +int +bt_inbounds(BT_state *state, void *p) +/* 1: if in the bounds of the PMA, 0 otherwise */ +{ + return p >= (void *)BT_MAPADDR + && p < (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE); +} + + +//// =========================================================================== +//// tests + +/* ;;: obv this should be moved to a separate file */ +static void +_sham_sync_clean(BT_page *node) +{ + for (uint8_t *dit = &node->head.dirty[0] + ; dit < &node->head.dirty[sizeof(node->head.dirty) - 1] + ; dit++) { + *dit = 0; + } +} + +static void +_sham_sync2(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth) +{ + if (depth == maxdepth) return; + + /* clean node */ + _sham_sync_clean(node); + + /* then recurse and clean all children with DFS */ + size_t N = _bt_numkeys(node); + for (size_t i = 1; i < N; ++i) { + BT_kv kv = node->datk[i]; + pgno_t childpg = kv.fo; + BT_page *child = _node_get(state, childpg); + _sham_sync2(state, child, depth+1, maxdepth); + } +} + +static void +_sham_sync(BT_state *state) __attribute((unused)); + +static void +_sham_sync(BT_state *state) +{ + /* walk the tree and unset the dirty bit from all pages */ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = 
+
+
+//// ===========================================================================
+//// tests
+
+/* ;;: obviously this should be moved to a separate file */
+static void
+_sham_sync_clean(BT_page *node)
+{
+  for (uint8_t *dit = &node->head.dirty[0]
+         ; dit < &node->head.dirty[sizeof(node->head.dirty) - 1]
+         ; dit++) {
+    *dit = 0;
+  }
+}
+
+static void
+_sham_sync2(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth)
+{
+  if (depth == maxdepth) return;
+
+  /* clean node */
+  _sham_sync_clean(node);
+
+  /* then recurse and clean all children with DFS */
+  size_t N = _bt_numkeys(node);
+  for (size_t i = 1; i < N; ++i) {
+    BT_kv kv = node->datk[i];
+    pgno_t childpg = kv.fo;
+    BT_page *child = _node_get(state, childpg);
+    _sham_sync2(state, child, depth+1, maxdepth);
+  }
+}
+
+static void
+_sham_sync(BT_state *state) __attribute__((unused));
+
+static void
+_sham_sync(BT_state *state)
+{
+  /* walk the tree and unset the dirty bit on all pages */
+  BT_meta *meta = state->meta_pages[state->which];
+  BT_page *root = _node_get(state, meta->root);
+  meta->chk = nonzero_crc_32(meta, BT_META_LEN);
+  _sham_sync2(state, root, 1, meta->depth);
+}
+
+static void
+_bt_printnode(BT_page *node)
+{
+  fprintf(stderr, "node: %p\n", (void*)node);
+  fprintf(stderr, "data: \n");
+  for (size_t i = 0; i < BT_DAT_MAXKEYS; ++i) {
+    if (i && node->datk[i].va == 0)
+      break;
+    fprintf(stderr, "[%5zu] %10x %10x\n", i, node->datk[i].va, node->datk[i].fo);
+  }
+}
+
+/*
+  _bt_state_restore_maps2
+  if pg 0:
+    mmap MAP_ANONYMOUS | MAP_FIXED | MAP_NORESERVE
+         PROT_NONE
+
+  if pg !0:
+    mmap MAP_SHARED | MAP_FIXED
+         PROT_READ
+
+
+  ------------------
+
+  the three routines that modify the data maps are:
+
+  bt_malloc:
+
+    MAP_SHARED | MAP_FIXED
+    PROT_READ | PROT_WRITE
+
+  _bt_data_cow:
+
+    MAP_SHARED | MAP_FIXED
+    PROT_READ | PROT_WRITE
+
+  bt_sync:
+
+    (mprotect)
+    PROT_READ
+
+  bt_free:
+
+    MAP_ANONYMOUS | MAP_FIXED | MAP_NORESERVE
+    PROT_NONE
+
+  -----------------
+
+  8 linear mappings (striping)
+
+  when we _bt_nalloc: mprotect(PROT_READ | PROT_WRITE)
+
+  when we free a node: mprotect(PROT_NONE)
+
+  additionally, when we sync, all allocated nodes: mprotect(PROT_READ)
+*/
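The comment above summarizes the mapping and protection protocol for the data region. As a rough, Linux-flavored sketch of what those states look like as calls (the helper names are invented; MAP_PRIVATE is added only because MAP_ANONYMOUS needs a sharing flag, and everything else follows the comment), assuming a page-aligned range backed by fd at file offset off:

#include <stddef.h>
#include <sys/types.h>
#include <sys/mman.h>

/* free region: anonymous, unreserved, and inaccessible until reused */
static int demo_map_free(void *lo, size_t len)
{
  void *p = mmap(lo, len, PROT_NONE,
                 MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE,
                 -1, 0);
  return p == MAP_FAILED ? -1 : 0;
}

/* allocated or CoWed region: shared file mapping, readable and writable */
static int demo_map_dirty(void *lo, size_t len, int fd, off_t off)
{
  void *p = mmap(lo, len, PROT_READ | PROT_WRITE,
                 MAP_SHARED | MAP_FIXED, fd, off);
  return p == MAP_FAILED ? -1 : 0;
}

/* after a sync: drop write permission so the synced snapshot stays read-only */
static int demo_protect_synced(void *lo, size_t len)
{
  return mprotect(lo, len, PROT_READ);
}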
diff --git a/rust/ares_pma/c-src/btree.h b/rust/ares_pma/c-src/btree.h
new file mode 100644
index 0000000..aad81e0
--- /dev/null
+++ b/rust/ares_pma/c-src/btree.h
@@ -0,0 +1,87 @@
+#ifndef __BTREE_H__
+#define __BTREE_H__
+#include <sys/types.h>
+#include <stdint.h>
+
+struct BT_state;
+typedef struct BT_state BT_state;
+
+#define BT_PAGEBITS 14ULL
+#define BT_PAGESIZE (1ULL << BT_PAGEBITS) /* 16K */
+
+typedef unsigned long ULONG;
+
+//// ===========================================================================
+//// btree external routines
+
+/**
+ * Instantiate an opaque BT_state handle
+ */
+int bt_state_new(BT_state **state);
+
+/**
+ * Open the persistent state, or create it if one doesn't exist
+ */
+int bt_state_open(BT_state *state, const char *path, ULONG flags, mode_t mode);
+
+/**
+ * Close the persistent state
+ */
+int bt_state_close(BT_state *state);
+
+/**
+ * Allocate persistent memory space
+ */
+void * bt_malloc(BT_state *state, size_t pages);
+
+/**
+ * Free persistent memory space
+ */
+void bt_free(BT_state *state, void *lo, void *hi);
+
+/**
+ * Sync a snapshot of the persistent memory to disk.
+ * This will **exit the process** on failure to avoid data corruption.
+ */
+int bt_sync(BT_state *state);
+
+/**
+ * Get a metadata entry
+ */
+uint64_t bt_meta_get(BT_state *state, size_t idx);
+
+/**
+ * Set a metadata entry
+ */
+void bt_meta_set(BT_state *state, size_t idx, uint64_t val);
+
+/**
+ * Return the allocation range in the btree that a pointer lives in
+ */
+int bt_range_of(BT_state *state, void *p, void **lo, void **hi);
+
+/**
+ * Ensure a region of memory is "dirty", i.e. can be mutated.
+ *
+ * A successful call to bt_dirty ensures that the memory range can be mutated
+ * until the next call to `bt_sync()`
+ */
+int bt_dirty(BT_state *state, void *lo, void *hi);
+
+/**
+ * Given a pointer, return the containing region of allocated memory, or the
+ * next highest allocated region if the pointer points to free memory
+ */
+int bt_next_alloc(BT_state *state, void *p, void **lo, void **hi);
+
+/**
+ * Return the memory bounds of the persistent-memory B-tree
+ */
+void bt_bounds(BT_state *state, void **lo, void **hi);
+
+/**
+ * Return whether a pointer is within the persistent-memory B-tree
+ */
+int bt_inbounds(BT_state *state, void *p);
+
+#endif
diff --git a/rust/ares/src/pma/includes/checksum.c b/rust/ares_pma/c-src/lib/checksum.c
similarity index 100%
rename from rust/ares/src/pma/includes/checksum.c
rename to rust/ares_pma/c-src/lib/checksum.c
diff --git a/rust/ares/src/pma/includes/checksum.h b/rust/ares_pma/c-src/lib/checksum.h
similarity index 100%
rename from rust/ares/src/pma/includes/checksum.h
rename to rust/ares_pma/c-src/lib/checksum.h
diff --git a/rust/ares_pma/c-src/wrapper.h b/rust/ares_pma/c-src/wrapper.h
new file mode 100644
index 0000000..5c56c79
--- /dev/null
+++ b/rust/ares_pma/c-src/wrapper.h
@@ -0,0 +1 @@
+#include "btree.h"
diff --git a/rust/ares_pma/src/lib.rs b/rust/ares_pma/src/lib.rs
new file mode 100644
index 0000000..a38a13a
--- /dev/null
+++ b/rust/ares_pma/src/lib.rs
@@ -0,0 +1,5 @@
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+
+include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
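To make the new header concrete, a minimal usage sketch follows. Only the function names, signatures, and the 16K page size come from btree.h; the path, flags, and mode arguments, and the convention that 0 signals success, are assumptions.

#include <assert.h>
#include <string.h>
#include "btree.h"

int main(void)
{
  BT_state *state;
  assert(bt_state_new(&state) == 0);                 /* assuming 0 == success */
  assert(bt_state_open(state, "./pma-demo", 0, 0644) == 0);

  /* allocations are made in whole pages (BT_PAGESIZE = 16K) */
  char *buf = bt_malloc(state, 2);
  memset(buf, 0x2a, 2 * BT_PAGESIZE);

  assert(bt_sync(state) == 0);                       /* snapshot to disk */

  /* per the bt_dirty/bt_sync contract above, a synced range must be
     re-dirtied before it is written again */
  assert(bt_dirty(state, buf, buf + 2 * BT_PAGESIZE) == 0);
  buf[0] = 0;

  bt_free(state, buf, buf + 2 * BT_PAGESIZE);
  return bt_state_close(state);
}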