From 41253e984811ee5f8951d6e0a463aa164f9dc29b Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Wed, 29 Nov 2023 10:15:46 -0600 Subject: [PATCH 001/128] pma: wip: copy in PMA source and create bindgen crate --- rust/ares_pma/Cargo.lock | 444 +++++ rust/ares_pma/Cargo.toml | 12 + rust/ares_pma/build.rs | 104 ++ rust/ares_pma/c-src/btest.c | 0 rust/ares_pma/c-src/btree.c | 2673 ++++++++++++++++++++++++++++ rust/ares_pma/c-src/btree.h | 26 + rust/ares_pma/c-src/lib/checksum.c | 134 ++ rust/ares_pma/c-src/wrapper.h | 1 + rust/ares_pma/src/lib.rs | 14 + 9 files changed, 3408 insertions(+) create mode 100644 rust/ares_pma/Cargo.lock create mode 100644 rust/ares_pma/Cargo.toml create mode 100644 rust/ares_pma/build.rs create mode 100644 rust/ares_pma/c-src/btest.c create mode 100644 rust/ares_pma/c-src/btree.c create mode 100644 rust/ares_pma/c-src/btree.h create mode 100644 rust/ares_pma/c-src/lib/checksum.c create mode 100644 rust/ares_pma/c-src/wrapper.h create mode 100644 rust/ares_pma/src/lib.rs diff --git a/rust/ares_pma/Cargo.lock b/rust/ares_pma/Cargo.lock new file mode 100644 index 0000000..1b2b535 --- /dev/null +++ b/rust/ares_pma/Cargo.lock @@ -0,0 +1,444 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + +[[package]] +name = "ares_pma" +version = "0.1.0" +dependencies = [ + "bindgen", +] + +[[package]] +name = "bindgen" +version = "0.69.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn", + "which", +] + +[[package]] +name = "bitflags" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clang-sys" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] +name = "errno" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + 
+[[package]] +name = "home" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb" +dependencies = [ + "windows-sys 0.48.0", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "libc" +version = "0.2.150" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" + +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "969488b55f8ac402214f3f5fd243ebb7206cf82de60d3172994707a4bcc2b829" + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + +[[package]] +name = "memchr" +version = "2.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "once_cell" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" + +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + +[[package]] +name = "prettyplease" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae005bd773ab59b4725093fd7df83fd7892f7d8eafb48dbd7de6e024e4215f9d" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39278fbbf5fb4f646ce651690877f89d1c5811a3d4acb27700c1cb3cdb78fd3b" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + 
+[[package]] +name = "regex-automata" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustix" +version = "0.38.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc99bc2d4f1fed22595588a013687477aedf3cdcfb26558c559edb67b4d9b22e" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.48.0", +] + +[[package]] +name = "shlex" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380" + +[[package]] +name = "syn" +version = "2.0.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23e78b90f2fcf45d3e842032ce32e3f2d1545ba6636271dcbf24fa306d87be7a" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.0", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + 
"windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" diff --git a/rust/ares_pma/Cargo.toml b/rust/ares_pma/Cargo.toml new file mode 100644 index 0000000..d81f601 --- /dev/null 
+++ b/rust/ares_pma/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "ares_pma" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] + +[build-dependencies] +bindgen = "0.69.1" + diff --git a/rust/ares_pma/build.rs b/rust/ares_pma/build.rs new file mode 100644 index 0000000..bb17c44 --- /dev/null +++ b/rust/ares_pma/build.rs @@ -0,0 +1,104 @@ +extern crate bindgen; + +use std::env; +use std::path::PathBuf; + +use bindgen::CargoCallbacks; + +fn main() { + // This is the directory where the `c` library is located. + let libdir_path = PathBuf::from("c-src") + // Canonicalize the path as `rustc-link-search` requires an absolute + // path. + .canonicalize() + .expect("cannot canonicalize path"); + + // This is the path to the `c` headers file. + let headers_path = libdir_path.join("wrapper.h"); + let headers_path_str = headers_path.to_str().expect("Path is not a valid string"); + + // This is the path to the intermediate object file for our library. + let btree_obj_path = libdir_path.join("btree.o"); + let checksum_obj_path = libdir_path.join("lib").join("checksum.o"); + // This is the path to the static library file. + let lib_path = libdir_path.join("btree.a"); + + // Tell cargo to look for shared libraries in the specified directory + println!("cargo:rustc-link-search={}", libdir_path.to_str().unwrap()); + + // Tell cargo to tell rustc to link our `btree` library. Cargo will + // automatically know it must look for a `libbtree.a` file. + println!("cargo:rustc-link-lib=btree"); + + // Tell cargo to invalidate the built crate whenever the header changes. + println!("cargo:rerun-if-changed={}", headers_path_str); + + // Run `clang` to compile the `btree.c` file into a `btree.o` object file. + // Unwrap if it is not possible to spawn the process. + if !std::process::Command::new("clang") + .arg("-c") + .arg("-o") + .arg(&btree_obj_path) + .arg(libdir_path.join("btree.c")) + .output() + .expect("could not spawn `clang`") + .status + .success() + { + // Panic if the command was not successful. + panic!("could not compile object file"); + } + + // Run `clang` to compile the `btree.c` file into a `btree.o` object file. + // Unwrap if it is not possible to spawn the process. + if !std::process::Command::new("clang") + .arg("-c") + .arg("-o") + .arg(&checksum_obj_path) + .arg(libdir_path.join("lib").join("checksum.c")) + .output() + .expect("could not spawn `clang`") + .status + .success() + { + // Panic if the command was not successful. + panic!("could not compile object file"); + } + + // Run `ar` to generate the `libbtree.a` file from the `btree.o` file. + // Unwrap if it is not possible to spawn the process. + if !std::process::Command::new("ar") + .arg("rcs") + .arg(lib_path) + .arg(btree_obj_path) + .arg(checksum_obj_path) + .output() + .expect("could not spawn `ar`") + .status + .success() + { + // Panic if the command was not successful. + panic!("could not emit library file"); + } + + // The bindgen::Builder is the main entry point + // to bindgen, and lets you build up options for + // the resulting bindings. + let bindings = bindgen::Builder::default() + // The input header we would like to generate + // bindings for. + .header(headers_path_str) + // Tell cargo to invalidate the built crate whenever any of the + // included header files changed. + .parse_callbacks(Box::new(CargoCallbacks)) + // Finish the builder and generate the bindings. 
+ .generate() + // Unwrap the Result and panic on failure. + .expect("Unable to generate bindings"); + + // Write the bindings to the $OUT_DIR/bindings.rs file. + let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs"); + bindings + .write_to_file(out_path) + .expect("Couldn't write bindings!"); +} diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c new file mode 100644 index 0000000..e69de29 diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c new file mode 100644 index 0000000..8393e68 --- /dev/null +++ b/rust/ares_pma/c-src/btree.c @@ -0,0 +1,2673 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lib/checksum.h" + +typedef uint32_t pgno_t; /* a page number */ +typedef uint32_t vaof_t; /* a virtual address offset */ +typedef uint32_t flag_t; +typedef unsigned char BYTE; +typedef unsigned long ULONG; + +//// =========================================================================== +//// tmp tmp tmp tmp tmp +/* ;;: remove -- for debugging */ +/* + bp(X) where X is false will raise a SIGTRAP. If the process is being run + inside a debugger, this can be caught and ignored. It's equivalent to a + breakpoint. If run without a debugger, it will dump core, like an assert +*/ +#ifdef DEBUG +#if defined(__i386__) || defined(__x86_64__) +#define bp(x) do { if(!(x)) __asm__ volatile("int $3"); } while (0) +#elif defined(__thumb__) +#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xde01"); } while (0) +#elif defined(__aarch64__) +#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xd4200000"); } while (0) +#elif defined(__arm__) +#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xe7f001f0"); } while (0) +#else +STATIC_ASSERT(0, "debugger break instruction unimplemented"); +#endif +#else +#define bp(x) ((void)(0)) +#endif + +/* coalescing of memory freelist currently prohibited since we haven't + implemented coalescing of btree nodes (necessary) */ +#define CAN_COALESCE 0 +/* ;;: remove once confident in logic and delete all code dependencies on + state->node_freelist */ +#define USE_NLIST 1 +#if USE_NLIST +/* ;;: obviously this should be removed once we've fully switched over to the + nlist. And calls to _node_alloc should be updated to calls to _bt_nalloc */ +#define _node_alloc(...) _bt_nalloc(__VA_ARGS__) +#endif + +#define ZERO(s, n) memset((s), 0, (n)) + +#define S7(A, B, C, D, E, F, G) A##B##C##D##E##F##G +#define S6(A, B, C, D, E, F, ...) S7(A, B, C, D, E, F, __VA_ARGS__) +#define S5(A, B, C, D, E, ...) S6(A, B, C, D, E, __VA_ARGS__) +#define S4(A, B, C, D, ...) S5(A, B, C, D, __VA_ARGS__) +#define S3(A, B, C, ...) S4(A, B, C, __VA_ARGS__) +#define S2(A, B, ...) S3(A, B, __VA_ARGS__) +#define S(A, ...) S2(A, __VA_ARGS__) + +#define KBYTES(x) ((size_t)(x) << 10) +#define MBYTES(x) ((size_t)(x) << 20) +#define GBYTES(x) ((size_t)(x) << 30) +#define TBYTES(x) ((size_t)(x) << 40) +#define PBYTES(x) ((size_t)(x) << 50) + +/* 4K page in bytes */ +#define P2BYTES(x) ((size_t)(x) << 14) +/* the opposite of P2BYTES */ +#define B2PAGES(x) ((size_t)(x) >> 14) + + +#define __packed __attribute__((__packed__)) +#define UNUSED(x) ((void)(x)) + +#ifdef DEBUG +# define DPRINTF(fmt, ...) \ + fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, __VA_ARGS__) +#else +# define DPRINTF(fmt, ...) ((void) 0) +#endif +#define DPUTS(arg) DPRINTF("%s", arg) +#define TRACE(...) 
DPUTS("") + +#define BT_SUCC 0 +#define SUCC(x) ((x) == BT_SUCC) + + +#define BT_MAPADDR ((void *) S(0x1000,0000,0000)) + +/* convert addr offset to raw address */ +#define OFF2ADDR(x) ((void *)((uintptr_t)(BT_MAPADDR) + (x))) +/* convert raw memory address to offset */ +#define ADDR2OFF(a) ((vaof_t)((uintptr_t)(a) - (uintptr_t)BT_MAPADDR)) + +#define BT_PAGEBITS 14ULL +#define BT_PAGEWORD 32ULL +#define BT_PAGESIZE (1ULL << BT_PAGEBITS) /* 16K */ +#define BT_NUMMETAS 2 /* 2 metapages */ +#define BT_ADDRSIZE (BT_PAGESIZE << BT_PAGEWORD) +#define PMA_GROW_SIZE (BT_PAGESIZE * 1024) + +#define BT_NOPAGE 0 + +/* + FO2BY: file offset to byte + get byte INDEX into pma map from file offset +*/ +#define FO2BY(fo) \ + ((uint64_t)(fo) << BT_PAGEBITS) + +/* + BY2FO: byte to file offset + get pgno from byte INDEX into pma map +*/ +#define BY2FO(p) \ + ((pgno_t)((p) >> BT_PAGEBITS)) + +/* + FO2PA: file offset to page + get a reference to a BT_page from a file offset +*/ +#define FO2PA(map, fo) \ + ((BT_page *)&(map)[FO2BY(fo)]) + +/* NMEMB: number of members in array, a */ +#define NMEMB(a) \ + (sizeof(a[0]) / sizeof(a)) + +#define offsetof(st, m) \ + __builtin_offsetof(st, m) + + +//// =========================================================================== +//// btree types + +/* + btree page header. all pages share this header. Though for metapages, you can + expect it to be zeroed out. +*/ +typedef struct BT_pageheader BT_pageheader; +struct BT_pageheader { + uint8_t dirty[256]; /* dirty bit map */ +} __packed; + +/* + btree key/value data format + +/* + BT_dat is used to provide a view of the data section in a BT_page where data is + stored like: + va fo va fo + bytes 0 4 8 12 + + The convenience macros given an index into the data array do the following: + BT_dat_lo(i) returns ith va (low addr) + BT_dat_hi(i) returns i+1th va (high addr) + BT_dat_fo(i) returns ith file offset +*/ +typedef union BT_dat BT_dat; +union BT_dat { + vaof_t va; /* virtual address offset */ + pgno_t fo; /* file offset */ +}; + +/* like BT_dat but when a struct is more useful than a union */ +typedef struct BT_kv BT_kv; +struct BT_kv { + vaof_t va; + pgno_t fo; +}; + +/* ;;: todo, perhaps rather than an index, return the data directly and typecast?? 
*/ +#define BT_dat_lo(i) ((i) * 2) +#define BT_dat_fo(i) ((i) * 2 + 1) +#define BT_dat_hi(i) ((i) * 2 + 2) + +#define BT_dat_lo2(I, dat) +#define BT_dat_fo2(I, dat) +#define BT_dat_hi2(I, dat) + +/* BT_dat_maxva: pointer to highest va in page data section */ +#define BT_dat_maxva(p) \ + ((void *)&(p)->datd[BT_dat_lo(BT_DAT_MAXKEYS)]) + +/* BT_dat_maxfo: pointer to highest fo in page data section */ +#define BT_dat_maxfo(p) \ + ((void *)&(p)->datd[BT_dat_fo(BT_DAT_MAXVALS)]) + +#define BT_DAT_MAXBYTES (BT_PAGESIZE - sizeof(BT_pageheader)) +#define BT_DAT_MAXENTRIES (BT_DAT_MAXBYTES / sizeof(BT_dat)) +#define BT_DAT_MAXKEYS (BT_DAT_MAXENTRIES / 2) +/* #define BT_DAT_MAXKEYS 10 */ +#define BT_DAT_MAXVALS BT_DAT_MAXKEYS +static_assert(BT_DAT_MAXENTRIES % 2 == 0); + +/* + all pages in the memory arena consist of a header and data section +*/ +typedef struct BT_page BT_page; +struct BT_page { + BT_pageheader head; /* ;;: TODO remove header and store all header data in BT_meta */ + union { /* data section */ + BT_dat datd[BT_DAT_MAXENTRIES]; /* union view */ + BT_kv datk[0]; /* struct view */ + BYTE datc[0]; /* byte-level view */ + }; +}; +static_assert(sizeof(BT_page) == BT_PAGESIZE); +static_assert(BT_DAT_MAXBYTES % sizeof(BT_dat) == 0); + +#define BT_MAGIC 0xBADDBABE +#define BT_VERSION 1 +/* + a meta page is like any other page, but the data section is used to store + additional information +*/ +#define BLK_BASE_LEN0 (MBYTES(2) - (BT_PAGESIZE * BT_NUMMETAS)) +#define BLK_BASE_LEN1 (BLK_BASE_LEN0 * 4) +#define BLK_BASE_LEN2 (BLK_BASE_LEN1 * 4) +#define BLK_BASE_LEN3 (BLK_BASE_LEN2 * 4) +#define BLK_BASE_LEN4 (BLK_BASE_LEN3 * 4) +#define BLK_BASE_LEN5 (BLK_BASE_LEN4 * 4) +#define BLK_BASE_LEN6 (BLK_BASE_LEN5 * 4) +#define BLK_BASE_LEN7 (BLK_BASE_LEN6 * 4) +typedef struct BT_meta BT_meta; +struct BT_meta { + uint32_t magic; + uint32_t version; + pgno_t last_pg; /* last page used in file */ + uint32_t _pad0; + uint64_t txnid; + void *fix_addr; /* fixed addr of btree */ + + pgno_t blk_base[8]; /* block base array for striped node partition */ + + /* ;;: for the blk_base array, code may be simpler if this were an array of + BT_page *. */ + + uint8_t blk_cnt; /* currently highest valid block base */ + uint8_t depth; /* tree depth */ +/* #define BP_DIRTY ((uint8_t)0x01) /\* ;;: TODO remove dirty flag *\/ */ +#define BP_META ((uint8_t)0x02) + uint8_t flags; + uint8_t _pad1; + pgno_t root; + /* ;;: confirm: shouldn't the checksum actually follow the roots array? 
*/ + uint32_t chk; /* checksum */ + /* 64bit alignment manually checked */ + uint64_t roots[]; /* for usage by ares */ + + /* ;;: TODO: ensure the crc_32 checksum cannot be zero */ + +} __packed; +static_assert(sizeof(BT_meta) <= BT_DAT_MAXBYTES); + +/* the length of the metapage up to but excluding the checksum */ +#define BT_META_LEN (offsetof(BT_meta, chk)) + +#define BT_roots_bytelen (sizeof(BT_meta) - offsetof(BT_meta, roots)) + +typedef struct BT_mlistnode BT_mlistnode; +struct BT_mlistnode { + void *va; /* virtual address */ + size_t sz; /* size in pages */ + BT_mlistnode *next; /* next freelist node */ +}; + +typedef struct BT_nlistnode BT_nlistnode; +struct BT_nlistnode { + BT_page *va; /* virtual address */ + size_t sz; /* size in pages */ + BT_nlistnode *next; /* next freelist node */ +}; + +typedef struct BT_flistnode BT_flistnode; +struct BT_flistnode { + pgno_t pg; /* pgno - an offset in the persistent file */ + size_t sz; /* size in pages */ + BT_flistnode *next; /* next freelist node */ +}; + +/* macro to access the metadata stored in a page's data section */ +#define METADATA(p) ((BT_meta *)(void *)(p)->datc) + +typedef struct BT_state BT_state; +struct BT_state { + uint16_t flags; /* ;;: rem */ + int data_fd; + int meta_fd; /* ;;: confident can be removed because we're not explicitly calling write() */ + char *path; + ULONG branch_page_cnt; /* ;;: rem */ + ULONG leaf_page_cnt; /* ;;: rem */ + void *fixaddr; + BYTE *map; + BT_page *node_freelist; + BT_meta *meta_pages[2]; /* double buffered */ + /* ;;: note, while meta_pages[which]->root stores a pgno, we may want to just + store a pointer to root in state in addition to avoid a _node_find on it + every time it's referenced */ + /* BT_page *root; */ + off_t file_size; /* the size of the pma file in bytes */ + pgno_t frontier; /* last non-free page in use by pma (exclusive) */ + unsigned int which; /* which double-buffered db are we using? */ + BT_nlistnode *nlist; /* node freelist */ + BT_mlistnode *mlist; /* memory freelist */ + BT_flistnode *flist; /* pma file freelist */ + /* ;;: for deletion coalescing: + + when freeing data, push onto the pending flist and mlist. When pushing onto + the mlist, you can preemptively coalesce. You don't need to coalesce at all + in the pending flist. + + when inserting and coalescing, if you can free a node then push onto the + pending nlist + + */ + + BT_flistnode *pending_flist; + BT_nlistnode *pending_nlist; +}; + +/* + ;;: wrt to frontier: if you need to allocate space for data, push the frontier + out by that amount allocated. If you're allocating a new stripe, push it to + the end of that stripe. +*/ + + +//// =========================================================================== +//// btree internal routines + +static void _bt_printnode(BT_page *node); /* ;;: tmp */ +static int +_bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo, + BT_page *parent, size_t childidx); /* ;;: tmp */ + +#define BT_MAXDEPTH 4 /* ;;: todo derive it */ +typedef struct BT_findpath BT_findpath; +struct BT_findpath { + BT_page *path[BT_MAXDEPTH]; + size_t idx[BT_MAXDEPTH]; + uint8_t depth; +}; + +/* _node_get: get a pointer to a node stored at file offset pgno */ +static BT_page * +_node_get(BT_state *state, pgno_t pgno) +{ + /* TODO: eventually, once we can store more than 2M of nodes, this will need + to reference the meta page's blk_base array to determine where a node is + mapped. 
i.e: + + - receive pgno + - find first pgno in blk_base that exceeds pgno : i + - sector that contains node is i-1 + - appropriately offset into i-1th fixed size partition: 2M, 8M, 16M, ... + + */ + + /* for now, this works because the 2M sector is at the beginning of both the + memory arena and pma file + */ + if (pgno <= 1) return 0; /* no nodes stored at 0 and 1 (metapages) */ + /* TODO: when partition striping is implemented, a call beyond the furthest + block base should result in the allocation of a new block base */ + assert((pgno * BT_PAGESIZE) < MBYTES(2)); + return FO2PA(state->map, pgno); +} + +/* ;;: I don't think we should need this if _node_alloc also returns a disc offset */ +static pgno_t +_fo_get(BT_state *state, BT_page *node) +{ + uintptr_t vaddr = (uintptr_t)node; + uintptr_t start = (uintptr_t)state->map; + return BY2FO(vaddr - start); +} + +#ifndef USE_NLIST +static BT_page * /* ;;: change to return both a file and node offset as params to function. actual return value is error code */ +_node_alloc(BT_state *state) +{ + /* TODO: will eventually need to walk a node freelist that allocs space for + the striped node partitions. Since this is unimplemented, just allocating + space from first 2M */ + + /* ;;: when node freelist is implemented, will we need to return the file + offset of the node as well? This is important for splitting where we + allocate a new node and need to store its file offset in the parent's + data index */ + size_t width = (BYTE *)state->node_freelist - state->map; + assert(width < MBYTES(2)); + /* ;;: todo confirm data sections are zeroed */ + /* ZERO(state->node_freelist, BT_PAGESIZE); */ + return ++state->node_freelist; +} +#endif + +static BT_page * +_bt_nalloc(BT_state *state) +/* allocate a node in the node freelist */ +{ + BT_nlistnode **n = &state->nlist; + + for (; *n; n = &(*n)->next) { + /* ;;: this assert is temporary. When partition striping is + implemented. Rather than assert, conditionally check if we're at the + end of the current stripe. If so, allocate a new region and append that + to the freelist. */ + size_t width = (BYTE *)state->nlist - state->map; + assert(width < MBYTES(2)); + /* perfect fit */ + if ((*n)->sz == 1) { + BT_page *ret; + ret = (*n)->va; + *n = (*n)->next; + return ret; + } + /* larger than necessary: shrink the node */ + if ((*n)->sz > 1) { + BT_page *ret; + ret = (*n)->va; + (*n)->sz -= 1; + (*n)->va = (*n)->va + 1; + return ret; + } + } +} + +/* ;;: from our usage, _node_cow no longer needs to take indirect pointer to + newnode. We don't ever do anything with it */ +static int +_node_cow(BT_state *state, BT_page *node, BT_page **newnode, pgno_t *pgno) +{ + BT_page *ret = _node_alloc(state); + memcpy(ret->datk, node->datk, sizeof node->datk[0] * BT_DAT_MAXENTRIES); + *pgno = _fo_get(state, ret); + *newnode = ret; + return BT_SUCC; +} + +/* binary search a page's data section for a va. Returns a pointer to the found BT_dat */ +static void * +_bt_bsearch(BT_page *page, vaof_t va) +{ + /* ;;: todo: actually bsearch rather than linear */ + for (BT_kv *kv = &page->datk[0]; kv <= BT_dat_maxva(page); kv++) { + if (kv->va == va) + return kv; + } + + return 0; +} + +static size_t +_bt_childidx(BT_page *node, vaof_t lo, vaof_t hi) +/* looks up the child index in a parent node. 
If not found, return is + BT_DAT_MAXKEYS */ +{ + size_t i = 0; + for (; i < BT_DAT_MAXKEYS - 1; i++) { + vaof_t llo = node->datk[i].va; + vaof_t hhi = node->datk[i+1].va; + if (llo <= lo && hhi >= hi) + return i; + } + return BT_DAT_MAXKEYS; +} + +/* ;;: find returns a path to nodes that things should be in if they are there. */ +/* a leaf has a meta page depth eq to findpath depth */ +static int +_bt_find2(BT_state *state, + BT_page *node, + BT_findpath *path, + uint8_t maxdepth, + vaof_t lo, + vaof_t hi) +{ + /* ;;: meta node stores depth (node or leaf?) + look at root node and binsearch BT_dats where low is <= lo and high is >= hi + If at depth of metapage (a leaf), then done + otherwise grab node, increment depth, save node in path + */ + if (path->depth > maxdepth) + return ENOENT; + + assert(node != 0); + + size_t i; + if ((i = _bt_childidx(node, lo, hi)) == BT_DAT_MAXKEYS) + return ENOENT; + + if (path->depth == maxdepth) { + path->idx[path->depth] = i; + path->path[path->depth] = node; + return BT_SUCC; + } + /* then branch */ + else { + pgno_t fo = node->datk[i].fo; + BT_page *child = _node_get(state, fo); + path->idx[path->depth] = i; + path->path[path->depth] = node; + path->depth++; + return _bt_find2(state, child, path, maxdepth, lo, hi); + } +} + +static void +_bt_root_new(BT_page *root) +{ + root->datk[0].va = 0; + root->datk[0].fo = 0; + root->datk[1].va = UINT32_MAX; + root->datk[1].fo = 0; +} + +static int +_bt_find(BT_state *state, BT_findpath *path, vaof_t lo, vaof_t hi) +{ + path->depth = 1; + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + uint8_t maxdepth = meta->depth; + return _bt_find2(state, root, path, maxdepth, lo, hi); +} + +static int +_bt_findpath_is_root(BT_findpath *path) +{ + assert(path != 0); + return path->depth == 0; +} + +/* _bt_numkeys: find next empty space in node's data section. Returned as + index into node->datk. 
If the node is full, return is BT_DAT_MAXKEYS */ +static size_t +_bt_numkeys(BT_page *node) +{ + size_t i = 1; + for (; i < BT_DAT_MAXKEYS; i++) { + if (node->datk[i].va == 0) break; + } + return i; +} + +static int +_bt_datshift(BT_page *node, size_t i, size_t n) +/* shift data segment at i over by n KVs */ +{ + assert(i+n < BT_DAT_MAXKEYS); /* check buffer overflow */ + size_t siz = sizeof node->datk[0]; + size_t bytelen = (BT_DAT_MAXKEYS - i - n) * siz; + memmove(&node->datk[i+n], &node->datk[i], bytelen); + ZERO(&node->datk[i], n * siz); + return BT_SUCC; +} + +/* _bt_split_datcopy: copy right half of left node to right node */ +static int +_bt_split_datcopy(BT_page *left, BT_page *right) +{ + size_t mid = BT_DAT_MAXKEYS / 2; + size_t bytelen = mid * sizeof(left->datk[0]); + /* copy rhs of left to right */ + memcpy(right->datk, &left->datk[mid], bytelen); + /* zero rhs of left */ + ZERO(&left->datk[mid], bytelen); /* ;;: note, this would be unnecessary if we stored node.N */ + /* the last entry in left should be the first entry in right */ + left->datk[mid].va = right->datk[0].va; + + return BT_SUCC; +} + +static int +_bt_ischilddirty(BT_page *parent, size_t child_idx) +{ + assert(child_idx < 2048); + uint8_t flag = parent->head.dirty[child_idx >> 3]; + return flag & (1 << (child_idx & 0x7)); +} + +/* ;;: todo: name the 0x8 and 4 literals and/or generalize */ +static int +_bt_dirtychild(BT_page *parent, size_t child_idx) +{ + assert(child_idx < 2048); + /* although there's nothing theoretically wrong with dirtying a dirty node, + there's probably a bug if we do it since a we only dirty a node when it's + alloced after a split or CoWed */ + assert(!_bt_ischilddirty(parent, child_idx)); + uint8_t *flag = &parent->head.dirty[child_idx >> 3]; + *flag |= 1 << (child_idx & 0x7); + return BT_SUCC; +} + +static int +_bt_cleanchild(BT_page *parent, size_t child_idx) +{ + assert(_bt_ischilddirty(parent, child_idx)); + uint8_t *flag = &parent->head.dirty[child_idx >> 3]; + *flag ^= 1 << (child_idx & 0x7); + return BT_SUCC; +} + +/* ;:: assert that the node is dirty when splitting */ +static int +_bt_split_child(BT_state *state, BT_page *parent, size_t i, pgno_t *newchild) +{ + /* ;;: todo: better error handling */ + /* ;;: todo: assert parent and left is dirty */ + int rc = BT_SUCC; + size_t N; + BT_page *left = _node_get(state, parent->datk[i].fo); + BT_page *right = _node_alloc(state); + if (right == 0) + return ENOMEM; + if (!SUCC(rc = _bt_split_datcopy(left, right))) + return rc; + + /* adjust high address of left node in parent */ + N = _bt_numkeys(left); + /* parent->datk[i+1].va = left->datk[N-1].va; /\* ;;: is this necessary? *\/ */ + + /* insert reference to right child into parent node */ + N = _bt_numkeys(right); + vaof_t lo = right->datk[0].va; + vaof_t hi = right->datk[N-1].va; + + _bt_insertdat(lo, hi, _fo_get(state, right), parent, i); + + /* dirty right child */ + size_t ridx = _bt_childidx(parent, lo, hi); + assert(ridx == i+1); /* 0x100000020100;;: tmp? 
*/ + _bt_dirtychild(parent, ridx); + + /* ;;: fix this */ + *newchild = _fo_get(state, right); + + return BT_SUCC; +} + +/* ;;: since we won't be rebalancing on delete, but rather on insert, you should add rebalance logic to _bt_insert2 which checks the degree of a node and rebalances if less than minimum */ + +static int +_bt_rebalance(BT_state *state, BT_page *node) +{ + return 255; +} + +/* insert lo, hi, and fo in parent's data section for childidx */ +static int +_bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo, + BT_page *parent, size_t childidx) +{ + DPRINTF("BEFORE INSERT lo %" PRIu32 " hi %" PRIu32 " fo %" PRIu32, lo, hi, fo); + /* _bt_printnode(parent); */ + + /* ;;: TODO confirm this logic is appropriate for branch nodes. (It /should/ + be correct for leaf nodes) */ + vaof_t llo = parent->datk[childidx].va; + vaof_t hhi = parent->datk[childidx+1].va; + + /* duplicate */ + if (llo == lo && hhi == hi) { + parent->datk[childidx].fo = fo; + return BT_SUCC; + } + + if (llo == lo) { + _bt_datshift(parent, childidx + 1, 1); + vaof_t oldfo = parent->datk[childidx].fo; + parent->datk[childidx].fo = fo; + parent->datk[childidx+1].va = hi; + parent->datk[childidx+1].fo = oldfo + (hi - llo); + } + else if (hhi == hi) { + _bt_datshift(parent, childidx + 1, 1); + parent->datk[childidx+1].va = lo; + parent->datk[childidx+1].fo = fo; + } + else { + _bt_datshift(parent, childidx + 1, 2); + parent->datk[childidx+1].va = lo; + parent->datk[childidx+1].fo = fo; + parent->datk[childidx+2].va = hi; + pgno_t lfo = parent->datk[childidx].fo; + vaof_t lva = parent->datk[childidx].va; + parent->datk[childidx+2].fo = (lfo == 0) + ? 0 + : lfo + (hi - lva); + } + + DPUTS("AFTER INSERT"); + /* _bt_printnode(parent); */ + return BT_SUCC; +} + + +//// =========================================================================== +//// wip - deletion coalescing + +/* ;;: todo: rename routines */ + +int +_bt_delco_1pass_0(BT_state *state, vaof_t lo, vaof_t hi, + BT_page *node, uint8_t depth, uint8_t maxdepth) +{ + /* Perform a dfs search on all ranges that fall within lo and hi */ + + /* ;;: we can't use bt_childidx because the range of lo-hi may overlap ofc */ + size_t loidx = 0; + size_t hiidx = 0; + + /* first find the entry that matches lo */ + size_t i; + for (i = 0; i < BT_DAT_MAXKEYS-1; i++) { + vaof_t llo = node->datk[i].va; + if (llo <= lo) { + loidx = i; + break; + } + } + + /* and then the entry that matches hi */ + for (; i < BT_DAT_MAXKEYS-1; i++) { + vaof_t hhi = node->datk[i].va; + if (hhi >= hi) { + hiidx = hi; + break; + } + } + + /* node->datk[loidx] - node->datk[hiidx] are the bounds on which to perform + the dfs */ + for (i = loidx; i < hiidx; i++) { + vaof_t llo = node->datk[i].va; + pgno_t pg = node->datk[i].va; + + /* if at the leaf level, terminate with failure if pg is not free */ + if (depth == maxdepth) { + if (pg != 0) return 1; + else continue; + } + + /* otherwise, dfs the child node */ + BT_page *child = _node_get(state, pg); + if (!SUCC(_bt_delco_1pass_0(state, lo, hi, child, depth+1, maxdepth))) + return 1; + } + + /* whether we're at a leaf or a branch, by now all pages corresponding to the + hi-lo range must be free */ + return BT_SUCC; +} + +/* ;;: since this is called by another recursive function _bt_delco that first + finds if a split exists, this /could/ take a pgno to avoid unnecessarily + rewalking the tree. not a big deal though as is. 
*/ +static int +_bt_delco_1pass(BT_state *state, vaof_t lo, vaof_t hi) +/* returns true if the leaves in the given range are all free (pgno of 0). false + otherwise. This must be the case for an insert into an overlapping range to + succeed */ +{ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + return _bt_delco_1pass_0(state, lo, hi, root, 1, meta->depth); +} + +static void +_pending_nlist_insert(BT_state *state, pgno_t nodepg) +{ + /* ;;: todo: need to account for a null head */ + BT_nlistnode *head = state->pending_nlist; + BT_page *va = _node_get(state, nodepg); + + /* we don't need to account for a freelist node's size because we aren't + coalescing the pending freelists */ + while (head->next) { + if (head->next->va > va) + break; + head = head->next; + } + + /* head->next is either null or has a higher address than va */ + BT_nlistnode *new = calloc(1, sizeof new); + new->next = head->next; + new->sz = 1; + new->va = va; + + head->next = new; +} + +static void +_pending_nlist_clear(BT_state *state) +{ + /* there's no need for a pending freelist "pop" routine as we only clear nodes + from it after all have been merged with the real freelists */ + BT_nlistnode *prev = state->pending_nlist; + BT_nlistnode *next = prev->next; + while (prev) { + free(prev); + prev = next; + next = next->next; + } + state->pending_nlist = 0; +} + +static BT_nlistnode * +_nlist_find(BT_nlistnode *head, BT_page *va) +/* find a node */ +{ + +} + +static void +_pending_nlist_merge(BT_state *state) +/* merge state->pending_nlist with state->nlist. To be called when syncing */ +{ + BT_nlistnode *src_head = state->pending_nlist; + BT_nlistnode *dst_head = state->nlist; + + while (src_head) { + /* ;;: todo refactor */ + while (dst_head) { + BT_page *dst_va = dst_head->va; + BT_page *src_va = src_head->va; + if (dst_head->va <= src_head->va + && dst_head->va + dst_head->sz >= src_head->va) { + /* found node in nlist that fits node in pending nlist */ + + dst_head->sz += 1; + break; + } + else if (dst_head->va + dst_head->sz < src_head->va + && dst_head->next->va > src_head->va) { + /* pending nlist node belongs between two nlist nodes */ + BT_nlistnode *new = calloc(1, sizeof *new); + memcpy(new, src_head, sizeof *src_head); + new->sz = 1; + new->va = src_head->va; + /* insert */ + new->next = dst_head->next; + dst_head->next = new; + break; + } + dst_head = dst_head->next; + } + if (!dst_head) { + /* need to track prev */ + } + + + src_head = src_head->next; + } + + _pending_nlist_clear(state); +} + + +/* ;;: todo move shit around */ +static void +_bt_delco_droptree2(BT_state *state, pgno_t nodepg, uint8_t depth, uint8_t maxdepth) +{ + /* branch */ + if (depth != maxdepth) { + BT_page *node = _node_get(state, nodepg); + for (size_t i = 0; i < BT_DAT_MAXKEYS; i++) { + BT_kv entry = node->datk[i]; + if (entry.fo == 0) + break; /* done */ + _bt_delco_droptree2(state, entry.fo, depth+1, maxdepth); + } + } + + _pending_nlist_insert(state, nodepg); +} + +static void +_bt_delco_droptree(BT_state *state, pgno_t nodepg, uint8_t depth) +{ + /* completely drop a tree. 
Assume that all leaves under the tree are free + (pgno = 0) */ + assert(nodepg >= 2); + BT_meta *meta = state->meta_pages[state->which]; + return _bt_delco_droptree2(state, nodepg, depth, meta->depth); +} + +static void +_bt_delco_trim_rsubtree_lhs2(BT_state *state, vaof_t lo, vaof_t hi, + pgno_t nodepg, uint8_t depth, uint8_t maxdepth) +{ + BT_page *node = _node_get(state, nodepg); + size_t hiidx = 0; + + /* find hi idx of range */ + size_t i; + for (i = 0; i < BT_DAT_MAXKEYS-1; i++) { + vaof_t hhi = node->datk[i].va; + if (hhi >= hi) { + hiidx = i; + break; + } + } + + /* set the lo address of datk[hiidx] to hi */ + node->datk[hiidx-1].va = hi; + + /* drop the subtrees left of the range */ + if (depth != maxdepth) { + for (i = 0; i < hiidx-1; i++) { + pgno_t childpg = node->datk[i].fo; + if (childpg == 0) + break; + _bt_delco_droptree(state, childpg, depth+1); + } + } + + /* memmove the buffer so the found range is the first in the node */ + BYTE *dst = (BYTE *)&node->datk[0].va; + BYTE *src = (BYTE *)&node->datk[hiidx-1].va; + BYTE *end = (BYTE *)&node->datk[BT_DAT_MAXKEYS-1].fo; + size_t len = end - src; + + memmove(dst, src, len); + + /* ;;: TODO add temporary asserts for testing? */ + + /* and now zero the moved range */ + ZERO(dst+len, end-(dst+len)); + + /* done if this is a leaf */ + if (depth == maxdepth) + return; + /* otherwise, recur on subtree */ + pgno_t rsubtree = node->datk[hiidx].fo; + return _bt_delco_trim_rsubtree_lhs2(state, lo, hi, rsubtree, depth+1, maxdepth); +} + +static void +_bt_delco_trim_rsubtree_lhs(BT_state *state, vaof_t lo, vaof_t hi, + pgno_t nodepg, uint8_t depth) +{ + BT_meta *meta = state->meta_pages[state->which]; + return _bt_delco_trim_rsubtree_lhs2(state, lo, hi, nodepg, depth, meta->depth); +} + +static void +_bt_delco_trim_lsubtree_rhs2(BT_state *state, vaof_t lo, vaof_t hi, + pgno_t nodepg, uint8_t depth, uint8_t maxdepth) +{ + BT_page *node = _node_get(state, nodepg); + size_t loidx = 0; + + /* find low idx of range */ + size_t i; + for (i = 0; i < BT_DAT_MAXKEYS-1; i++) { + vaof_t llo = node->datk[i].va; + if (llo <= lo) { + loidx = i; + break; + } + } + + /* set the hi address of datk[loidx] to hi */ + node->datk[loidx+1].va = hi; + + /* drop the subtrees right of the range */ + if (depth != maxdepth) { + /* recur and droptree for branches */ + for (i = loidx+1; i < BT_DAT_MAXKEYS-1; i++) { + pgno_t childpg = node->datk[i].fo; + if (childpg == 0) + break; + _bt_delco_droptree(state, childpg, depth+1); + } + } + + /* always zero rhs whether node is a leaf or a branch */ + BYTE *beg = (BYTE *)&node->datk[loidx+1].fo; + BYTE *end = (BYTE *)&node->datk[BT_DAT_MAXKEYS-1].fo; + size_t len = end - beg; + + ZERO(beg, len); + /* ;;: this won't zero the last fo, but that should be fine. 
remove the assert + when you're confident it /is/ fine */ + assert(node->datk[BT_DAT_MAXKEYS-1].fo == 0); + + /* done if this is a leaf */ + if (depth == maxdepth) + return; + /* otherwise, recur on the left subtree */ + pgno_t lsubtree = node->datk[loidx].fo; + return _bt_delco_trim_lsubtree_rhs2(state, lo, hi, lsubtree, depth+1, maxdepth); +} + +static void +_bt_delco_trim_lsubtree_rhs(BT_state *state, vaof_t lo, vaof_t hi, + pgno_t nodepg, uint8_t depth) +{ + BT_meta *meta = state->meta_pages[state->which]; + return _bt_delco_trim_lsubtree_rhs2(state, lo, hi, nodepg, depth, meta->depth); +} + +static void +_bt_delco(BT_state *state, vaof_t lo, vaof_t hi, + pgno_t nodepg, uint8_t depth, uint8_t maxdepth) +{ + /* ;;: "find_internal_splits" in the original algorithm */ + BT_page *node = _node_get(state, nodepg); + size_t N = _bt_numkeys(node); + + size_t loidx = 0; + size_t hiidx = 0; + pgno_t lsubtree = 0; + pgno_t rsubtree = 0; + + /* find low idx of range */ + for (size_t i = 0; i < BT_DAT_MAXKEYS-1; i++) { + vaof_t llo = node->datk[i].va; + if (llo <= lo) { + loidx = i; + break; + } + } + + /* find high idx of range */ + for (size_t i = loidx; i < BT_DAT_MAXKEYS-1; i++) { + vaof_t hhi = node->datk[i].va; + if (hhi >= hi) { + assert(i > 0); + hiidx = i - 1; + break; + } + } + + /* non-split range and at leaf. done */ + if (depth == maxdepth + && hiidx == loidx) { + return; + } + + lsubtree = node->datk[loidx].fo; + rsubtree = node->datk[hiidx].fo; + + if (depth < maxdepth) { + /* guarantee path is dirty by CoWing node if not */ + + /* ;;: refactor? code duplication?? */ + if (!_bt_ischilddirty(node, loidx)) { + BT_page *child = _node_get(state, lsubtree); + BT_page *new; + pgno_t newpg; + _node_cow(state, child, &new, &newpg); + lsubtree = node->datk[loidx].fo = newpg; + _bt_dirtychild(node, loidx); + } + + if (!_bt_ischilddirty(node, hiidx)) { + BT_page *child = _node_get(state, rsubtree); + BT_page *new; + pgno_t newpg; + _node_cow(state, child, &new, &newpg); + rsubtree = node->datk[hiidx].fo = newpg; + _bt_dirtychild(node, hiidx); + } + } + + /* non-split range, recurse to child tree */ + if (hiidx == loidx) { + pgno_t childpg = node->datk[loidx].fo; + _bt_delco(state, lo, hi, childpg, depth+1, maxdepth); + } + + /* split range discovered */ + if (hiidx > loidx) { + /* run first pass to guarantee range is completely free */ + if (!SUCC(_bt_delco_1pass(state, lo, hi))) { + /* attempted insert on split range that cannot be coalesced */ + assert(0); + } + + /* set leftmost boundary va to hi */ + node->datk[loidx+1].va = hi; + + /* set the lo side of the right boundary to hi */ + node->datk[hiidx].va = hi; + + /* drop all trees between the two subtrees */ + for (size_t i = loidx+1; i < hiidx; i++) { + pgno_t childpg = node->datk[i].fo; + _bt_delco_droptree(state, childpg, depth); + } + + /* move buffer */ + BYTE *dst = (BYTE *)&node->datk[loidx+1].va; + BYTE *src = (BYTE *)&node->datk[hiidx].va; + BYTE *end = (BYTE *)&node->datk[BT_DAT_MAXKEYS-1].fo; + size_t len = end - src; + memmove(dst, src, len); + ZERO(dst+len, end-(dst+len)); + + /* trim left subtree then trim right subtree */ + _bt_delco_trim_lsubtree_rhs(state, lo, hi, lsubtree, depth+1); + _bt_delco_trim_rsubtree_lhs(state, lo, hi, rsubtree, depth+1); + + /* done */ + return; + } +} + +/* ;;: todo, update meta->depth when we add a row. Should this be done in + _bt_rebalance? 
*/ +static int +_bt_insert2(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo, + BT_page *node, size_t depth) +{ + /* ;;: to be written in such a way that node is guaranteed both dirty and + non-full */ + + /* ;;: remember: + - You need to CoW+dirty a node when you insert a non-dirty node. + - You need to insert into a node when: + - It's a leaf + - It's a branch and you CoWed the child + - Hence, all nodes in a path to a leaf being inserted into need to already + be dirty or explicitly Cowed. Splitting doesn't actually factor into this + decision afaict. + */ + + assert(node); + + int rc = 255; + size_t N = 0; + size_t childidx = _bt_childidx(node, lo, hi); + assert(childidx != BT_DAT_MAXKEYS); + BT_meta *meta = state->meta_pages[state->which]; + + if (depth < meta->depth) { + pgno_t childpgno = node->datk[childidx].fo; + BT_page *child = _node_get(state, childpgno); + N = _bt_numkeys(child); + } + + /* nullcond: node is a leaf */ + if (meta->depth == depth) { + /* guaranteed non-full and dirty by n-1 recursive call, so just insert */ + return _bt_insertdat(lo, hi, fo, node, childidx); + } + + /* do we need to CoW the child node? */ + if (!_bt_ischilddirty(node, childidx)) { + BT_page *newchild; + pgno_t pgno; + _node_cow(state, node, &newchild, &pgno); + node->datk[childidx].fo = pgno; + _bt_dirtychild(node, childidx); + } + + /* do we need to split the child node? */ + if (N >= BT_DAT_MAXKEYS - 2) { + pgno_t rchild_pgno; + if (!SUCC(rc = _bt_split_child(state, node, childidx, &rchild_pgno))) + return rc; + + /* since we split the child's data, recalculate the child idx */ + /* ;;: note, this can be simplified into a conditional i++ */ + childidx = _bt_childidx(node, lo, hi); + + } + + /* the child is now guaranteed non-full (split) and dirty. Recurse */ + BT_page *child = _node_get(state, node->datk[childidx].fo); + return _bt_insert2(state, lo, hi, fo, child, depth+1); +} + +static int +_bt_insert(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo) +/* handles CoWing/splitting of the root page since it's special cased. Then + passes the child matching hi/lo to _bt_insert2 */ +{ + int rc; + + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + + /* the root MUST be dirty (zero checksum in metapage) */ + assert(meta->chk == 0); + + size_t N = _bt_numkeys(root); + + /* perform deletion coalescing (and preemptively guarantee path is dirty) if + inserting a non-zero (non-free) page */ + if (fo != 0) { + _bt_delco(state, lo, hi, meta->root, 1, meta->depth); + } + + /* CoW root's child if it isn't already dirty */ + size_t childidx = _bt_childidx(root, lo, hi); + assert(childidx != BT_DAT_MAXKEYS); /* ;;: this should catch the case of + improperly inserting into a split + range. Should we do it earlier or + differently? 
*/ + if (meta->depth > 1 + && !_bt_ischilddirty(root, childidx)) { + BT_page *child = _node_get(state, root->datk[childidx].fo); + BT_page *newchild; + pgno_t newchildpg; + _node_cow(state, child, &newchild, &newchildpg); + root->datk[childidx].fo = newchildpg; + _bt_dirtychild(root, childidx); + } + + /* before calling into recursive insert, handle root splitting since it's + special cased (2 allocs) */ + if (N >= BT_DAT_MAXKEYS - 2) { /* ;;: remind, fix all these conditions to be - 2 */ + pgno_t pg = 0; + + /* the old root is now the left child of the new root */ + BT_page *left = root; + BT_page *right = _node_alloc(state); + BT_page *rootnew = _node_alloc(state); + + /* split root's data across left and right nodes */ + _bt_split_datcopy(left, right); + /* save left and right in new root's .data */ + pg = _fo_get(state, left); + rootnew->datk[0].fo = pg; + rootnew->datk[0].va = 0; + pg = _fo_get(state, right); + rootnew->datk[1].fo = pg; + rootnew->datk[1].va = right->datk[0].va; + rootnew->datk[2].va = UINT32_MAX; + /* dirty new root's children */ + _bt_dirtychild(rootnew, 0); + _bt_dirtychild(rootnew, 1); + /* update meta page information. (root and depth) */ + pg = _fo_get(state, rootnew); + meta->root = pg; + meta->depth += 1; + root = rootnew; + } + + /* + meta is dirty + root is dirty and split if necessary + root's child in insert path is dirty and split if necessary + finally, recurse on child + */ + return _bt_insert2(state, lo, hi, fo, root, 1); + /* return _bt_insert2(state, lo, hi, fo, child, 1); */ +} + +/* ;;: wip */ +/* ;;: inspired by lmdb's MDB_pageparent. While seemingly unnecessary for + _bt_insert, this may be useful for _bt_delete when we implement deletion + coalescing */ +typedef struct BT_ppage BT_ppage; +struct BT_ppage { + BT_page *node; + BT_page *parent; +}; + +static int +_bt_delete(BT_state *state, vaof_t lo, vaof_t hi) +{ + /* ;;: tmp, implement coalescing of zero ranges and merging/rebalancing of + nodes */ + return _bt_insert(state, lo, hi, 0); +} + +static int +_mlist_new(BT_state *state) +{ + /* implemented separate from _mlist_read since _mlist_read uses lo va == 0 to + stop parsing node's data. This, however, is a valid starting condition when + freshly creating the btree */ + + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + assert(root->datk[0].fo == 0); + + vaof_t lo = root->datk[0].va; + vaof_t hi = root->datk[1].va; + size_t len = B2PAGES(hi - lo); + + BT_mlistnode *head = calloc(1, sizeof *head); + + head->next = 0; + head->sz = len; + head->va = OFF2ADDR(lo); + + state->mlist = head; + + return BT_SUCC; +} + +static int +_flist_grow(BT_state *state, BT_flistnode *space) +/* growing the flist consists of expanding the backing persistent file, pushing + that space onto the disk freelist, and updating the dimension members in + BT_state */ +{ + /* ;;: I don't see any reason to grow the backing file non-linearly, but we + may want to adjust the size of the amount grown based on performance + testing. */ + if (-1 == lseek(state->data_fd, state->file_size + PMA_GROW_SIZE, SEEK_SET)) + return errno; + if (-1 == write(state->data_fd, "", 1)) + return errno; + + + /* find the last node in the disk freelist */ + BT_flistnode *tail = state->flist; + for (; tail->next; tail = tail->next) + ; + + pgno_t lastpgfree = tail->pg + tail->sz; + + /* ;;: TODO, make sure you are certain of this logic. 
Further, add assertions + regarding relative positions of state->file_size, state->frontier, and + lastpgfree + + we MAY call into this routine even if there is freespace on the end + because it's possible that freespace isn't large enough. We may also call + into this routine when the frontier exceeds the last free pg because + that's just how freelists work. ofc, frontier should never exceed + file_size. what other assertions?? + + */ + + /* if the frontier (last pg in use) is less than the last page free, we should + coalesce the new node with the tail. */ + if (state->frontier <= lastpgfree) { + tail->sz += PMA_GROW_SIZE; + } + /* otherwise, a new node needs to be allocated */ + else { + BT_flistnode *new = calloc(1, sizeof *new); + /* since the frontier exceeds the last pg free, new freespace should + naturally be allocated at the frontier */ + new->pg = state->frontier; + new->sz = PMA_GROW_SIZE; + tail->next = new; + } + + /* finally, update the file size */ + state->file_size += PMA_GROW_SIZE; + + return BT_SUCC; +} + +static int +_flist_new(BT_state *state) +{ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + assert(root->datk[0].fo == 0); + + vaof_t lo = root->datk[0].va; + vaof_t hi = root->datk[1].va; + size_t len = B2PAGES(hi - lo); + + BT_flistnode *head = calloc(1, sizeof *head); + + head->next = 0; + head->sz = len; + head->pg = PMA_GROW_SIZE; /* ;;: should we invoke logic to expand the backing file + here? probably. implement it */ /* */ + state->flist = head; + + return BT_SUCC; +} + +#if USE_NLIST +static int +_nlist_new(BT_state *state) +#define NLIST_PG_START 2 /* the third page */ +{ + BT_meta *meta = state->meta_pages[state->which]; + BT_nlistnode *head = calloc(1, sizeof *head); + + /* the size of a new node freelist is just the first stripe length */ + head->sz = BLK_BASE_LEN0; + head->va = &((BT_page *)state->map)[BT_NUMMETAS]; + head->next = 0; + + state->nlist = head; + + return BT_SUCC; +} + +static BT_nlistnode * +_nlist_read_prev(BT_nlistnode *head, BT_nlistnode *curr) +{ + /* find nlist node preceding curr and return it */ + BT_nlistnode *p, *n; + p = head; + n = head->next; + for (; n; p = n, n = n->next) { + if (n == curr) + return p; + } + return 0; +} + +/* TODO this is a pretty bad algorithm in terms of time complexity. It should be + fixed, but isn't necessary now as our nlist is quite small. You may want to + consider making nlist doubly linked or incorporate a sort and merge step. */ +static int +_nlist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, + BT_nlistnode *head, uint8_t depth) +/* recursively walk all nodes in the btree. Allocating new nlist nodes when a + node is found to be in a stripe unaccounted for. 
For each node found, + split/shrink the appropriate node to account for the allocated page */ +{ + BT_nlistnode *p, *n; + p = head; + n = head->next; + + /* find the nlist node that fits the current btree node */ + for (; n; p = n, n = n->next) { + if (p->va <= node && p->va + p->sz > node) + break; + } + + /* if the nlist node is only one page wide, it needs to be freed */ + if (p->sz == 1) { + BT_nlistnode *prev = _nlist_read_prev(head, p); + prev->next = p->next; + free(p); + goto e; + } + + /* if the btree node resides at the end of the nlist node, just shrink it */ + BT_page *last = p->va + p->sz - 1; + if (last == node) { + p->sz -= 1; + goto e; + } + + /* if the btree node resides at the start of the nlist node, likewise shrink + it and update the va */ + if (p->va == node) { + p->sz -= 1; + p->va += 1; + goto e; + } + + /* otherwise, need to split the current nlist node */ + BT_nlistnode *right = calloc(1, sizeof *right); + size_t lsz = node - p->va; + size_t rsz = (p->va + p->sz) - node; + /* remove 1 page from the right nlist node's size to account for the allocated + btree node */ + rsz -= 1; + assert(lsz > 0 && rsz > 0); + + /* update the size of the left node. And set the size and va of the right + node. Finally, insert the new nlist node into the nlist. */ + p->sz = lsz; + right->sz = rsz; + right->va = node + 1; + right->next = p->next; + p->next = right; + + e: + /* if at a leaf, we're finished */ + if (depth == maxdepth) { + return BT_SUCC; + } + + /* otherwise iterate over all child nodes, recursively constructing the + list */ + int rc = BT_SUCC; + for (size_t i = 0; i < BT_DAT_MAXKEYS; i++) { + BT_kv kv = node->datk[i]; + BT_page *child = _node_get(state, node->datk[i].fo); + if (!child) continue; + if (!SUCC(rc = _nlist_read2(state, + child, + maxdepth, + head, + depth+1))) + return rc; + } + + /* all children traversed */ + return BT_SUCC; +} + +static int +_nlist_read(BT_state *state) +{ + /* ;;: this should theoretically be simpler than _mlist_read. right? We can + derive the stripes that contain nodes from the block base array stored in + the metapage. What else do we need to know? -- the parts of each stripe + that are free or in use. How can we discover that? + + 1) Without storing any per-page metadata, we could walk the entire tree + from the root. Check the page number of the node. And modify the freelist + accordingly. + + 2) If we stored per-page metadata, this would be simpler. Linearly traverse + each stripe and check if the page is BT_NODE or BT_FREE. + + -- are there downsides to (2)? The only advantage to this would be quicker + startup. So for now, going to traverse all nodes and for each node, + traverse the nlist and split it appropriately. 
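+
+     For reference, a rough sketch of option (2) -- not implemented here; the
+     per-page flag lookup pgflag() and the helper _nlist_append() are purely
+     hypothetical -- would be a single linear pass per stripe (shown for the
+     first stripe only):
+
+       BT_page *base = &((BT_page *)state->map)[BT_NUMMETAS];
+       for (size_t i = 0; i < BLK_BASE_LEN0; i++) {
+         if (pgflag(state, &base[i]) == BT_FREE)
+           _nlist_append(state, &base[i], 1);
+       }
+
+     which would make startup cost linear in the stripe length rather than in
+     the number of live tree nodes.
+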
+ */ + + int rc = BT_SUCC; + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + + /* ;;: since partition striping isn't implemented yet, simplifying code by + assuming all nodes reside in the 2M region */ + BT_nlistnode *head = calloc(1, sizeof *head); + head->sz = BLK_BASE_LEN0; + head->va = &((BT_page *)state->map)[BT_NUMMETAS]; + head->next = 0; + + if (!SUCC(rc = _nlist_read2(state, root, meta->depth, head, 1))) + return rc; + + state->nlist = head; + + return rc; +} +#endif + +static BT_mlistnode * +_mlist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth) +{ + /* leaf */ + if (depth == maxdepth) { + BT_mlistnode *head, *prev; + head = prev = calloc(1, sizeof *head); + + size_t i = 0; + BT_kv *kv = &node->datk[i]; + while (i < BT_DAT_MAXKEYS - 1) { +#if CAN_COALESCE + /* free and contiguous with previous mlist node: merge */ + if (kv->fo == 0 + && ADDR2OFF(prev->va) + P2BYTES(prev->sz) == kv->va) { + vaof_t hi = node->datk[i+1].va; + vaof_t lo = kv->va; + size_t len = B2PAGES(hi - lo); + prev->sz += len; + } + /* free but not contiguous with previous mlist node: append new node */ + else if (kv->fo == 0) { +#endif + BT_mlistnode *new = calloc(1, sizeof *new); + vaof_t hi = node->datk[i+1].va; + vaof_t lo = kv->va; + size_t len = B2PAGES(hi - lo); + new->sz = len; + new->va = OFF2ADDR(lo); + prev->next = new; + prev = new; +#if CAN_COALESCE + } +#endif + + kv = &node->datk[++i]; + } + return head; + } + + /* branch */ + size_t i = 0; + BT_mlistnode *head, *prev; + head = prev = 0; + for (; i < BT_DAT_MAXKEYS; ++i) { + BT_kv kv = node->datk[i]; + if (kv.fo == BT_NOPAGE) + continue; + BT_page *child = _node_get(state, kv.fo); + BT_mlistnode *new = _mlist_read2(state, child, maxdepth, depth+1); + if (head == 0) { + head = prev = new; + } + else { + /* just blindly append and unify the ends afterward */ + prev->next = new; + } + } + return 0; +} + +static int +_mlist_read(BT_state *state) +{ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + uint8_t maxdepth = meta->depth; + BT_mlistnode *head = _mlist_read2(state, root, maxdepth, 1); + + /* + trace the full freelist and unify nodes one last time + NB: linking the leaf nodes would make this unnecessary + */ +#if CAN_COALESCE + BT_mlistnode *p = head; + BT_mlistnode *n = head->next; + while (n) { + size_t llen = P2BYTES(p->sz); + uintptr_t laddr = (uintptr_t)p->va; + uintptr_t raddr = (uintptr_t)n->va; + /* contiguous: unify */ + if (laddr + llen == raddr) { + p->sz += n->sz; + p->next = n->next; + free(n); + } + } +#endif + + state->mlist = head; + return BT_SUCC; +} + +static int +_mlist_delete(BT_state *state) +{ + BT_mlistnode *head, *prev; + head = prev = state->mlist; + while (head->next) { + prev = head; + head = head->next; + free(prev); + } + state->mlist = 0; + return BT_SUCC; +} + +static void +_flist_split(BT_flistnode *head, BT_flistnode **left, BT_flistnode **right) +/* split flist starting at head into two lists, left and right at the midpoint + of head */ +{ + assert(head != 0); + BT_flistnode *slow, *fast; + slow = head; fast = head->next; + + while (fast) { + fast = fast->next; + if (fast) { + slow = slow->next; + fast = fast->next; + } + } + + *left = head; + *right = slow->next; + slow->next = 0; +} + +static BT_flistnode * +_flist_merge2(BT_flistnode *l, BT_flistnode *r) +/* returns the furthest node in l that has a pg less than the first node in r */ +{ + assert(l); + assert(r); + + BT_flistnode *curr, 
*prev; + prev = l; + curr = l->next; + + while (curr) { + if (curr->pg < r->pg) { + prev = curr; + curr = curr->next; + } + } + + if (prev->pg < r->pg) + return prev; + + return 0; +} + +static BT_flistnode * +_flist_merge(BT_flistnode *l, BT_flistnode *r) +/* merge two sorted flists, l and r and return the sorted result */ +{ + BT_flistnode *head; + + if (!l) return r; + if (!r) return l; + + while (l && r) { + if (l->next == 0) { + l->next = r; + break; + } + if (r->next == 0) { + break; + } + + BT_flistnode *ll = _flist_merge2(l, r); + BT_flistnode *rnext = r->next; + /* insert head of r into appropriate spot in l */ + r->next = ll->next; + ll->next = r; + /* adjust l and r heads */ + l = ll->next; + r = rnext; + } + + return head; +} + +BT_flistnode * +_flist_mergesort(BT_flistnode *head) +{ + if (head == 0 || head->next == 0) + return head; + + BT_flistnode *l, *r; + _flist_split(head, &l, &r); + + /* ;;: todo, make it non-recursive. Though, shouldn't matter as much here + since O(log n). merge already non-recursive */ + _flist_mergesort(l); + _flist_mergesort(r); + + return _flist_merge(l, r); +} + +BT_flistnode * +_flist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth) +{ + /* leaf */ + if (depth == maxdepth) { + BT_flistnode *head, *prev; + head = prev = calloc(1, sizeof(*head)); + + /* ;;: fixme the head won't get populated in this logic */ + size_t i = 0; + BT_kv *kv = &node->datk[i]; + while (i < BT_DAT_MAXKEYS - 1) { + /* Just blindly append nodes since they aren't guaranteed sorted */ + BT_flistnode *new = calloc(1, sizeof *new); + vaof_t hi = node->datk[i+1].va; + vaof_t lo = kv->va; + size_t len = B2PAGES(hi - lo); + pgno_t fo = kv->fo; + new->sz = len; + new->pg = fo; + prev->next = new; + prev = new; + + kv = &node->datk[++i]; + } + return head; + } + + /* branch */ + size_t i = 0; + BT_flistnode *head, *prev; + head = prev = 0; + for (; i < BT_DAT_MAXKEYS; ++i) { + BT_kv kv = node->datk[i]; + if (kv.fo == BT_NOPAGE) + continue; + BT_page *child = _node_get(state, kv.fo); + BT_flistnode *new = _flist_read2(state, child, maxdepth, depth+1); + if (head == 0) { + head = prev = new; + } + else { + /* just blindly append and unify the ends afterward */ + prev->next = new; + } + } + return 0; +} + +static int +_flist_read(BT_state *state) +{ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + uint8_t maxdepth = meta->depth; + BT_flistnode *head = _flist_read2(state, root, maxdepth, 0); + /* ;;: infinite loop with proper starting depth of 1. -- fix that! 
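+     (note: _mlist_read and _nlist_read both call their recursive helpers
+     with a starting depth of 1; this call should match once that is fixed)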
*/ + /* BT_flistnode *head = _flist_read2(state, root, maxdepth, 1); */ + + if (head == 0) + return BT_SUCC; + + /* sort the freelist */ + _flist_mergesort(head); + + /* merge contiguous regions after sorting */ + BT_flistnode *p = head; + BT_flistnode *n = head->next; + while (n) { + size_t llen = p->sz; + pgno_t lfo = p->pg; + pgno_t rfo = n->pg; + /* contiguous: unify */ + if (lfo + llen == rfo) { + p->sz += n->sz; + p->next = n->next; + free(n); + } + } + + state->flist = head; + return BT_SUCC; +} + +static int +_flist_delete(BT_state *state) +{ + BT_flistnode *head, *prev; + head = prev = state->flist; + while (head->next) { + prev = head; + head = head->next; + free(prev); + } + state->flist = 0; + return BT_SUCC; +} + +#define CLOSE_FD(fd) \ + do { \ + close(fd); \ + fd = -1; \ + } while(0) + +/* TODO: move to lib */ +static uint32_t +nonzero_crc_32(void *dat, size_t len) +{ + unsigned char nonce = 0; + uint32_t chk = crc_32(dat, len); + + do { + if (nonce > 8) + abort(); + chk = update_crc_32(chk, nonce++); + } while (chk == 0); + + return chk; +} + +static int +_bt_state_meta_which(BT_state *state, int *which) +{ + BT_meta *m1 = state->meta_pages[0]; + BT_meta *m2 = state->meta_pages[1]; + *which = -1; + + if (m1->flags == 0) { + /* first is dirty */ + *which = 1; + } + else if (m2->flags == 0) { + /* second is dirty */ + *which = 0; + } + else if (m1->txnid > m2->txnid) { + /* first is most recent */ + *which = 0; + } + else if (m1->txnid < m2->txnid) { + /* second is most recent */ + *which = 1; + } + else { + /* invalid state */ + return EINVAL; + } + + /* checksum the metapage found and abort if checksum doesn't match */ + BT_meta *meta = state->meta_pages[*which]; + uint32_t chk = nonzero_crc_32(meta, BT_META_LEN); + if (chk != meta->chk) { + abort(); + } + + return BT_SUCC; +} + +static int +_bt_state_read_header(BT_state *state) +{ + /* TODO: actually read the header and copy the data to meta when we implement + persistence */ + BT_page metas[2]; + int rc, len, which; + BT_meta *m1, *m2; + + /* pma already exists, parse metadata file */ + m1 = state->meta_pages[0]; + m2 = state->meta_pages[1]; + + /* ;;: TODO, need to store last page in use by pma in both metadata pages. 
choose the frontier after _bt_state_meta_which and store it in state */ + TRACE(); + + if ((len = pread(state->data_fd, metas, BT_PAGESIZE*2, 0)) + != BT_PAGESIZE*2) { + /* new pma */ + return ENOENT; + } + + /* validate magic */ + if (m1->magic != BT_MAGIC) { + DPRINTF("metapage 0x%pX inconsistent magic: 0x%" PRIX32, m1, m1->magic); + return EINVAL; + } + if (m2->magic != BT_MAGIC) { + DPRINTF("metapage 0x%pX inconsistent magic: 0x%" PRIX32, m2, m2->magic); + return EINVAL; + } + + /* validate flags */ + if (m1->flags & BP_META != BP_META) { + DPRINTF("metapage 0x%pX missing meta page flag", m1); + return EINVAL; + } + if (m2->flags & BP_META != BP_META) { + DPRINTF("metapage 0x%pX missing meta page flag", m2); + return EINVAL; + } + + /* validate binary version */ + if (m1->version != BT_VERSION) { + DPRINTF("version mismatch on metapage: 0x%pX, metapage version: %" PRIu32 ", binary version %u", + m1, m1->version, BT_VERSION); + return EINVAL; + } + + /* validate binary version */ + if (m2->version != BT_VERSION) { + DPRINTF("version mismatch on metapage: 0x%pX, metapage version: %" PRIu32 ", binary version %u", + m2, m2->version, BT_VERSION); + return EINVAL; + } + + if (!SUCC(rc = _bt_state_meta_which(state, &which))) + return rc; + + state->which = which; + + return BT_SUCC; +} + +static int +_bt_state_meta_new(BT_state *state) +#define INITIAL_ROOTPG 2 +{ + BT_page *p1, *p2, *root; + BT_meta meta = {0}; + int rc, pagesize; + + TRACE(); + + /* ;;: HERE HERE HERE: call node_alloc */ + root = _node_alloc(state); + _bt_root_new(root); + + pagesize = sizeof *p1; + + /* initialize meta struct */ + meta.magic = BT_MAGIC; + meta.version = BT_VERSION; + meta.last_pg = 1; + meta.txnid = 0; + meta.fix_addr = BT_MAPADDR; + meta.blk_cnt = 1; + meta.depth = 1; + meta.flags = BP_META; + meta.root = _fo_get(state, root); + assert(meta.root == INITIAL_ROOTPG); /* ;;: remove?? */ + + /* initialize the block base array */ + meta.blk_base[0] = BT_NUMMETAS + 1; + + /* initialize the metapages */ + p1 = &((BT_page *)state->map)[0]; + p2 = &((BT_page *)state->map)[1]; + + /* copy the metadata into the metapages */ + memcpy(METADATA(p1), &meta, sizeof meta); + /* ;;: todo, should the second metapage actually share a .root with the + first?? */ + memcpy(METADATA(p2), &meta, sizeof meta); + + return BT_SUCC; +} + +static int +_bt_state_load(BT_state *state) +{ + int rc; + int new = 0; + BT_page *p; + struct stat stat; + + TRACE(); + + if (!SUCC(rc = _bt_state_read_header(state))) { + if (rc != ENOENT) return rc; + DPUTS("creating new db"); + state->file_size = PMA_GROW_SIZE; + new = 1; + } + + state->map = mmap(BT_MAPADDR, + BT_ADDRSIZE, + PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_SHARED, + state->data_fd, + 0); + + p = (BT_page *)state->map; + state->meta_pages[0] = METADATA(p); + state->meta_pages[0] = METADATA(p + 1); + +#ifndef USE_NLIST + state->node_freelist = &((BT_page *)state->map)[3]; /* begin allocating nodes + on third page (first two + are for metadata) -- this + was quite dumb. This is + the fourth page of + course. 
But it worked, + because in _bt_root_new + we use the third page + without calling the + allocation function */ +#endif + + /* new db, so populate metadata */ + if (new) { + /* ;;: move this logic to _flist_new */ + if (-1 == lseek(state->data_fd, state->file_size, SEEK_SET)) + return errno; + if (-1 == write(state->data_fd, "", 1)) + return errno; + + state->file_size = PMA_GROW_SIZE; + +#if USE_NLIST + /* ;;: necessary to call this before _bt_state_meta_new */ + assert(SUCC(_nlist_new(state))); +#endif + + if (!SUCC(rc = _bt_state_meta_new(state))) { + munmap(state->map, BT_ADDRSIZE); + return rc; + } + } + else { + if (fstat(state->data_fd, &stat) != 0) + return errno; + + state->file_size = stat.st_size; + } + + if (new) { + assert(SUCC(_mlist_new(state))); + assert(SUCC(_flist_new(state))); + } + else { + assert(SUCC(_mlist_read(state))); + assert(SUCC(_flist_read(state))); +#if USE_NLIST + /* ;;: this might need to be re-ordered given that _nlist_new needs to be + called before _bt_state_meta_new. Haven't thought about it yet. */ + assert(SUCC(_nlist_read(state))); +#endif + } + + return BT_SUCC; +} + +/* ;;: TODO, when persistence has been implemented, _bt_falloc will probably + need to handle extension of the file with appropriate striping. i.e. if no + space is found on the freelist, save the last entry, expand the file size, + and set last_entry->next to a new node representing the newly added file + space */ +static pgno_t +_bt_falloc(BT_state *state, size_t pages) +{ + /* walk the persistent file freelist and return a pgno with sufficient + contiguous space for pages */ + BT_flistnode **n = &state->flist; + pgno_t ret = 0; + + /* first fit */ + /* ;;: is there any reason to use a different allocation strategy for disk? */ + for (; *n; n = &(*n)->next) { + /* perfect fit */ + if ((*n)->sz == pages) { + pgno_t ret; + ret = (*n)->pg; + *n = (*n)->next; + return ret; + } + /* larger than necessary: shrink the node */ + if ((*n)->sz > pages) { + pgno_t ret; + ret = (*n)->pg; + (*n)->sz -= pages; + (*n)->pg = (*n)->pg + pages; + return ret; + } + } + + return 0; +} + +static int +_bt_sync_hasdirtypage(BT_state *state, BT_page *node) +/* ;;: could be more efficiently replaced by a gcc vectorized builtin */ +{ + for (size_t i = 0; i < NMEMB(node->head.dirty); i++) { + if (node->head.dirty[i] != 0) + return 1; + } + + return 0; +} + +static int +_bt_sync_leaf(BT_state *state, BT_page *node) +{ + /* msync all of a leaf's data that is dirty. The caller is expected to sync + the node itself and mark it as clean in the parent. */ + pgno_t pg; + size_t i = 0; + + for (size_t i = 0; i < BT_DAT_MAXKEYS-1; i++) { + if (!_bt_ischilddirty(node, i)) + continue; /* not dirty. nothing to do */ + + /* ;;: we don't actually need the page, do we? */ + /* pgno_t pg = node->datk[i].fo; */ + vaof_t lo = node->datk[i].va; + vaof_t hi = node->datk[i+1].va; + size_t bytelen = hi - lo; + void *addr = OFF2ADDR(lo); + + /* sync the page */ + if (msync(addr, bytelen, MS_SYNC)) + return errno; + + /* and clean the dirty bit */ + _bt_cleanchild(node, i); + } + + /* ;;: all data pages synced. should we now sync the node as well? No, I think + that should be the caller's responsibility */ + + /* ;;: it is probably faster to scan the dirty bit set and derive the datk idx + rather than iterate over the full datk array and check if it is dirty. This + was simpler to implement for now though. */ + /* while (_bt_sync_hasdirtypage(state, node)) { */ + /* ... 
*/ + /* } */ + + return BT_SUCC; +} + +static int +_bt_sync_meta(BT_state *state) +/* syncs the metapage and performs necessary checksumming. Additionally, flips + the which */ +{ + BT_meta *meta = state->meta_pages[state->which]; + BT_meta *newmeta; + uint32_t chk; + int newwhich; + + /* checksum the metapage */ + chk = nonzero_crc_32(meta, BT_META_LEN); + /* ;;: todo: guarantee the chk cannot be zero */ + + meta->chk = chk; + + /* sync the metapage */ + if (msync(meta, sizeof(BT_page), MS_SYNC)) + return errno; + + /* zero the new metapage's checksum */ + newwhich = state->which ? 0 : 1; + newmeta = state->meta_pages[newwhich]; + newmeta->chk = 0; + + /* copy over metapage to new metapage excluding the checksum */ + memcpy(newmeta, meta, BT_META_LEN); + + /* CoW a new root since the root referred to by the metapage should always be + dirty */ + BT_page *root, *newroot; + pgno_t newrootpg; + root = _node_get(state, newmeta->root); + if (!SUCC(_node_cow(state, root, &newroot, &newrootpg))) + abort(); + + newmeta->root = newrootpg; + + /* finally, switch the metapage we're referring to */ + state->which = newwhich; + + return BT_SUCC; +} + +static int +_bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth) +/* recursively syncs the subtree under node. The caller is expected to sync node + itself and mark it clean. */ +{ + int rc = 0; + + /* leaf */ + if (depth == maxdepth) { + _bt_sync_leaf(state, node); + return BT_SUCC; + } + + /* do dfs */ + for (size_t i = 0; i < BT_DAT_MAXKEYS-1; i++) { + if (!_bt_ischilddirty(node, i)) + continue; /* not dirty. nothing to do */ + + BT_page *child = _node_get(state, node->datk[i].fo); + + /* recursively sync the child's data */ + if (rc = _bt_sync(state, child, depth+1, maxdepth)) + return rc; + + /* sync the child node */ + if (msync(child, sizeof(BT_page), MS_SYNC)) + return errno; + + /* clean the child */ + _bt_cleanchild(node, i); + } + + return BT_SUCC; +} + + +//// =========================================================================== +//// btree external routines + +int +bt_state_new(BT_state **state) +{ + TRACE(); + + BT_state *s = calloc(1, sizeof *s); + s->meta_fd = s->data_fd = -1; + s->fixaddr = BT_MAPADDR; + *state = s; + return BT_SUCC; +} + +#define DATANAME "/data.pma" +int +bt_state_open(BT_state *state, const char *path, ULONG flags, mode_t mode) +{ + int oflags, rc; + char *dpath; + + TRACE(); + UNUSED(flags); + + oflags = O_RDWR | O_CREAT; + dpath = malloc(strlen(path) + sizeof(DATANAME)); + if (!dpath) return ENOMEM; + sprintf(dpath, "%s" DATANAME, path); + + if (mkdir(path, 0774) == -1) + return errno; + + if ((state->data_fd = open(dpath, oflags, mode)) == -1) + return errno; + + if (!SUCC(rc = _bt_state_load(state))) + goto e; + + /* ;;: this may be entirely unnecessary */ + oflags |= O_DSYNC; /* see man 2 open */ + if ((state->meta_fd = open(dpath, oflags, mode)) == -1) { + rc = errno; + goto e; + } + + state->path = strdup(dpath); + + e: + /* cleanup FDs stored in state if anything failed */ + if (!SUCC(rc)) { + if (state->data_fd != -1) CLOSE_FD(state->data_fd); + if (state->meta_fd != -1) CLOSE_FD(state->meta_fd); + } + + free(dpath); + return rc; +} + +int +bt_state_close(BT_state *state) +{ + int rc; + if (state->data_fd != -1) CLOSE_FD(state->data_fd); + if (state->meta_fd != -1) CLOSE_FD(state->meta_fd); + + _mlist_delete(state); + _flist_delete(state); + + /* ;;: wip delete the file because we haven't implemented persistence yet */ + if (!SUCC(rc = remove(state->path))) + return rc; + + return 
BT_SUCC; +} + +void * +bt_malloc(BT_state *state, size_t pages) +{ + BT_mlistnode **n = &state->mlist; + void *ret = 0; + /* first fit */ + for (; *n; n = &(*n)->next) { + /* perfect fit */ + if ((*n)->sz == pages) { + ret = (*n)->va; + *n = (*n)->next; + break; + } + /* larger than necessary: shrink the node */ + if ((*n)->sz > pages) { + ret = (*n)->va; + (*n)->sz -= pages; + (*n)->va = (BT_page *)(*n)->va + pages; + break; + } + } + + pgno_t pgno = _bt_falloc(state, pages); + bp(pgno != 0); + _bt_insert(state, + ADDR2OFF(ret), + ADDR2OFF(ret) + P2BYTES(pages), + pgno); + + bp(ret != 0); + return ret; +} + +void +bt_free(BT_state *state, void *lo, void *hi) +{ + vaof_t looff = ADDR2OFF(lo); + vaof_t hioff = ADDR2OFF(hi); + _bt_insert(state, looff, hioff, 0); + /* ;;: and now add freespace to state->flist. coalescing when you do so */ +} + +int +bt_sync(BT_state *state) +{ + /* as is often the case, handling the metapage/root is a special case, which + is done here. Syncing any other page of the tree is done in _bt_sync */ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + int rc = 0; + + if (rc = _bt_sync(state, root, 1, meta->depth)) + return rc; + + /* sync the root page */ + if (msync(root, sizeof(BT_page), MS_SYNC)) + return errno; + + /* then sync the metapage */ + if (rc = _bt_sync_meta(state)) + return rc; + + return BT_SUCC; +} + +uint64_t +bt_meta_get(BT_state *state, size_t idx) +{ + BT_meta *meta = state->meta_pages[state->which]; + assert((uintptr_t)&meta->roots[idx] - (uintptr_t)&meta <= sizeof *meta); + return meta->roots[idx]; +} + +void +bt_meta_set(BT_state *state, size_t idx, uint64_t val) +{ + BT_meta *meta = state->meta_pages[state->which]; + assert((uintptr_t)&meta->roots[idx] - (uintptr_t)&meta <= sizeof *meta); + meta->roots[idx] = val; +} + + +//// =========================================================================== +//// tests + +/* ;;: obv this should be moved to a separate file */ +static void +_sham_sync_clean(BT_page *node) +{ + for (uint8_t *dit = &node->head.dirty[0] + ; dit < &node->head.dirty[sizeof(node->head.dirty) - 1] + ; dit++) { + *dit = 0; + } +} + +static void +_sham_sync2(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth) +{ + if (depth == maxdepth) return; + + /* clean node */ + _sham_sync_clean(node); + + /* then recurse and clean all children with DFS */ + size_t N = _bt_numkeys(node); + for (size_t i = 1; i < N; ++i) { + BT_kv kv = node->datk[i]; + pgno_t childpg = kv.fo; + BT_page *child = _node_get(state, childpg); + _sham_sync2(state, child, depth+1, maxdepth); + } +} + +static void +_sham_sync(BT_state *state) +{ + /* walk the tree and unset the dirty bit from all pages */ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + meta->chk = nonzero_crc_32(meta, BT_META_LEN); + _sham_sync2(state, root, 1, meta->depth); +} + +static void +_bt_printnode(BT_page *node) +{ + printf("node: %p\n", node); + printf("data: \n"); + for (size_t i = 0; i < BT_DAT_MAXKEYS; ++i) { + if (i && node->datk[i].va == 0) + break; + printf("[%5zu] %10x %10x\n", i, node->datk[i].va, node->datk[i].fo); + } +} + +static void +_test_nodeinteg(BT_state *state, BT_findpath *path, + vaof_t lo, vaof_t hi, pgno_t pg) +{ + size_t childidx = 0; + BT_page *parent = 0; + + assert(SUCC(_bt_find(state, path, lo, hi))); + parent = path->path[path->depth]; + /* _bt_printnode(parent); */ + childidx = path->idx[path->depth]; + assert(parent->datk[childidx].fo == pg); 
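+  /* the entry's recorded bounds must likewise match the inserted range */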
+ assert(parent->datk[childidx].va == lo); + assert(parent->datk[childidx+1].va == hi); +} + +int main(int argc, char *argv[]) +{ + BT_state *state; + BT_findpath path = {0}; + int rc = 0; + + +//// =========================================================================== +//// test0 wip + + /* deletion coalescing */ + bt_state_new(&state); + assert(SUCC(bt_state_open(state, "./pmatest", 0, 0644))); + + /* enable coalescing of the memory freelist */ +#undef CAN_COALESCE +#define CAN_COALESCE 1 + + /* ;;: disabling for now as I don't have an answer to the "how to find the hi + address on a bt_free call so that _bt_delete can be called" question */ +#if 0 + void *t0a = bt_malloc(state, 10); + void *t0b = bt_malloc(state, 10); + bt_free(state, t0a); + bt_free(state, t0b); + /* memory freelist got coallesced. next malloc call should find the same range + and result in attempting to insert a range that overlaps a non-coallesced + region */ + void *t0ab = bt_malloc(state, 20); + /* t0a should have the same address as t0ab */ + assert(t0a == t0ab); +#endif + + /* ;;: can still suitably test by calling insert and delete routines directly */ + _bt_insert(state, 0x1000, 0x4000, 4); + _bt_insert(state, 0x4000, 0x8000, 4); + _bt_delete(state, 0x1000, 0x4000); + _bt_delete(state, 0x4000, 0x8000); + _bt_insert(state, 0x1000, 0x7000, 7); + + + //// =========================================================================== + //// test1 + + bt_state_new(&state); + assert(SUCC(bt_state_open(state, "./pmatest", 0, 0644))); + void * xxx = bt_malloc(state, 10); /* tmp - testing malloc logic */ + + + /* splitting tests. Insert sufficient data to force splitting. breakpoint before + that split is performed */ + + /* the hhi == hi case for more predictable splitting math */ + vaof_t lo = 10; + /* vaof_t hi = BT_DAT_MAXKEYS * 4; */ + vaof_t hi = 0xDEADBEEF; + pgno_t pg = 1; /* dummy value */ + for (size_t i = 0; i < BT_DAT_MAXKEYS * 4; ++i) { + /* if (i % (BT_DAT_MAXKEYS - 2) == 0) */ + /* bp(0); /\* breakpoint on split case *\/ */ + _bt_insert(state, lo, hi, pg); + _test_nodeinteg(state, &path, lo, hi, pg); + lo++; pg++; + } + + int which = state->which; + /* sham sync and re-run insertions */ + _sham_sync(state); + for (size_t i = 0; i < BT_DAT_MAXKEYS * 4; ++i) { + _bt_insert(state, lo, hi, pg); + _test_nodeinteg(state, &path, lo++, hi, pg++); + } + assert(which != state->which); + + assert(SUCC(bt_state_close(state))); + + + +//// =========================================================================== +//// test2 + + assert(SUCC(bt_state_open(state, "./pmatest", 0, 644))); + _mlist_read(state); + _flist_read(state); + + /* varieties of insert */ + + /* 2.1 exact match */ + lo = 0x10; + hi = 0x20; + pg = 0xFFFFFFFF; + + bp(0); + _bt_insert(state, lo, hi, pg); + _bt_insert(state, lo, hi, pg); + + /* ;;: you should also probably assert the data is laid out in datk at you expect */ + _test_nodeinteg(state, &path, lo, hi, pg); + + _bt_delete(state, lo, hi); + + /* 2.2 neither bounds match */ + bp(0); + _bt_insert(state, lo, hi, pg); + _bt_insert(state, lo+2, hi-2, pg-1); + + _test_nodeinteg(state, &path, lo, hi, pg); + _test_nodeinteg(state, &path, lo+2, hi-2, pg-1); + + _bt_delete(state, lo, hi); + _bt_delete(state, lo+2, hi-2); + + /* 2.3 space to right */ + bp(0); + _bt_insert(state, lo, hi, pg); + _bt_insert(state, lo, hi-2, pg-1); + + _test_nodeinteg(state, &path, lo, hi, pg); + _test_nodeinteg(state, &path, lo, hi-2, pg-1); + + _bt_delete(state, lo, hi); + _bt_delete(state, lo, hi-2); + + /* 2.4 space 
to left */ + bp(0); + + _bt_insert(state, lo, hi, pg); + _bt_insert(state, lo+2, hi, pg-1); + + _test_nodeinteg(state, &path, lo, hi, pg); + _test_nodeinteg(state, &path, lo+2, hi, pg-1); + + _bt_delete(state, lo, hi); + _bt_delete(state, lo+2, hi); + + assert(SUCC(bt_state_close(state))); + + return 0; +} + + +/* ;;: + + 1) checksum m1 + 2) sync m1 + 3) zero m2 + 4) copy all of m1 to m2 excluding m1 + + The current dirty metapage should have a zero checksum so that it happens to + be synced by the OS, it won't be valid. + +*/ + +/* ;;: + + Check if root page is dirty from metapage. if not, exit sync + + Create a queue of dirty pages. + + BFS the tree. Add root page. Add all pages in dirty bit set. Advance read + head to next page (index 1) and do the same until read head and write head + are equal. + + queue consists of pairs of memory address and length. + + if length field is zero, we'll msync length 1 page. -- which means this is a + node. if when iterating over queue, we find a zero length entry, then add + that node's dirty page. + + --- + + this /was/ the initial plan after some discussion. But after further + discussion, we can actually do a depth first search. To make implementation + even more simple, we can do an iterative dfs where we start from the root + each time. Why? Because the bulk of time to execute is going to be disc + io. + + after each msync of a page, descend to the deepest dirty page. msync that + page. set that page's dirty bit in the parent to non-dirty. repeat. once + you're at the root page and there are no dirty bits set, sync the + root. Finally, sync the metapage (with checksumming). + + */ diff --git a/rust/ares_pma/c-src/btree.h b/rust/ares_pma/c-src/btree.h new file mode 100644 index 0000000..54f6aa2 --- /dev/null +++ b/rust/ares_pma/c-src/btree.h @@ -0,0 +1,26 @@ +#ifndef __BTREE_H__ +#define __BTREE_H__ +#include +#include +typedef unsigned long ULONG; + +//// =========================================================================== +//// btree external routines + +int bt_state_new(BT_state **state); + +int bt_state_open(BT_state *state, const char *path, ULONG flags, mode_t mode); + +int bt_state_close(BT_state *state); + +void * bt_malloc(BT_state *state, size_t pages); + +void bt_free(BT_state *state, void *lo, void *hi); + +int bt_sync(BT_state *state); + +uint64_t bt_meta_get(BT_state *state, size_t idx); + +void bt_meta_set(BT_state *state, size_t idx, uint64_t val); + +#endif diff --git a/rust/ares_pma/c-src/lib/checksum.c b/rust/ares_pma/c-src/lib/checksum.c new file mode 100644 index 0000000..faa69c9 --- /dev/null +++ b/rust/ares_pma/c-src/lib/checksum.c @@ -0,0 +1,134 @@ +/* + * Library: libcrc + * File: src/crc32.c (herein src/includes/checksum.c) + * Author: Lammert Bies + * + * This file is licensed under the MIT License as stated below + * + * Copyright (c) 1999-2016 Lammert Bies + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * ----------- + * Accessed in 2023 by Alex Shelkovnykov on behalf of Tlon Corporation from + * https://github.com/lammertb/libcrc/tree/v2.0. + * + * Description + * ----------- + * The source file src/includes/checksum.c contains the routines which are + * needed to calculate a 32 bit CRC value of a sequence of bytes. + */ + +#include +#include +#include "checksum.h" + +static void init_crc32_tab( void ); + +static bool crc_tab32_init = false; +static uint32_t crc_tab32[256]; + +/* + * uint32_t crc_32( const unsigned char *input_str, size_t num_bytes ); + * + * The function crc_32() calculates in one pass the common 32 bit CRC value for + * a byte string that is passed to the function together with a parameter + * indicating the length. + */ +uint32_t crc_32( const unsigned char *input_str, size_t num_bytes ) { + + uint32_t crc; + uint32_t tmp; + uint32_t long_c; + const unsigned char *ptr; + size_t a; + + if ( ! crc_tab32_init ) init_crc32_tab(); + + crc = CRC_START_32; + ptr = input_str; + + if ( ptr != NULL ) for (a=0; a> 8) ^ crc_tab32[ tmp & 0xff ]; + + ptr++; + } + + crc ^= 0xffffffffL; + + return crc & 0xffffffffL; + +} /* crc_32 */ + +/* + * uint32_t update_crc_32( uint32_t crc, unsigned char c ); + * + * The function update_crc_32() calculates a new CRC-32 value based on the + * previous value of the CRC and the next byte of the data to be checked. + */ + +uint32_t update_crc_32( uint32_t crc, unsigned char c ) { + + uint32_t tmp; + uint32_t long_c; + + long_c = 0x000000ffL & (uint32_t) c; + + if ( ! crc_tab32_init ) init_crc32_tab(); + + tmp = crc ^ long_c; + crc = (crc >> 8) ^ crc_tab32[ tmp & 0xff ]; + + return crc & 0xffffffffL;; + +} /* update_crc_32 */ + +/* + * static void init_crc32_tab( void ); + * + * For optimal speed, the CRC32 calculation uses a table with pre-calculated + * bit patterns which are used in the XOR operations in the program. This table + * is generated once, the first time the CRC update routine is called. 
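+ *
+ * With the table in place, consuming one input byte reduces to a shift, a
+ * mask, and a single table lookup, as in update_crc_32() above:
+ *
+ *     crc = (crc >> 8) ^ crc_tab32[ (crc ^ byte) & 0xff ];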
+ */ + +static void init_crc32_tab( void ) { + + uint32_t i; + uint32_t j; + uint32_t crc; + + for (i=0; i<256; i++) { + + crc = i; + + for (j=0; j<8; j++) { + + if ( crc & 0x00000001L ) crc = ( crc >> 1 ) ^ CRC_POLY_32; + else crc = crc >> 1; + } + + crc_tab32[i] = crc; + } + + crc_tab32_init = true; + +} /* init_crc32_tab */ diff --git a/rust/ares_pma/c-src/wrapper.h b/rust/ares_pma/c-src/wrapper.h new file mode 100644 index 0000000..48206a7 --- /dev/null +++ b/rust/ares_pma/c-src/wrapper.h @@ -0,0 +1 @@ +#include diff --git a/rust/ares_pma/src/lib.rs b/rust/ares_pma/src/lib.rs new file mode 100644 index 0000000..7d12d9a --- /dev/null +++ b/rust/ares_pma/src/lib.rs @@ -0,0 +1,14 @@ +pub fn add(left: usize, right: usize) -> usize { + left + right +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + } +} From add377b0076a4dcd2ca25d0fc59f653ddefb1167 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Wed, 29 Nov 2023 10:36:19 -0600 Subject: [PATCH 002/128] pma: export bindgen bindings from lib --- rust/ares_pma/src/lib.rs | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/rust/ares_pma/src/lib.rs b/rust/ares_pma/src/lib.rs index 7d12d9a..8448cc2 100644 --- a/rust/ares_pma/src/lib.rs +++ b/rust/ares_pma/src/lib.rs @@ -1,14 +1,6 @@ -pub fn add(left: usize, right: usize) -> usize { - left + right -} +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] -#[cfg(test)] -mod tests { - use super::*; +include!(concat!(env!("OUT_DIR"), "/bindings.rs")); - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } -} From accf255b1ad2c89d87261c09f7779052a4711ca0 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Wed, 29 Nov 2023 11:41:31 -0600 Subject: [PATCH 003/128] pma: wip: forgot checksum.h --- rust/ares_pma/c-src/lib/checksum.h | 66 ++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 rust/ares_pma/c-src/lib/checksum.h diff --git a/rust/ares_pma/c-src/lib/checksum.h b/rust/ares_pma/c-src/lib/checksum.h new file mode 100644 index 0000000..0269131 --- /dev/null +++ b/rust/ares_pma/c-src/lib/checksum.h @@ -0,0 +1,66 @@ +/* + * Library: libcrc + * File: include/checksum.h (herein src/includes/checksum.h) + * Author: Lammert Bies + * + * This file is licensed under the MIT License as stated below + * + * Copyright (c) 1999-2016 Lammert Bies + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * ----------- + * Accessed in 2023 by Alex Shelkovnykov on behalf of Tlon Corporation from + * https://github.com/lammertb/libcrc/tree/v2.0. + * + * Description + * ----------- + * The headerfile src/includes/checksum.h contains the definitions and + * prototypes for routines that can be used to calculate several kinds of + * checksums. + */ + +#ifndef DEF_LIBCRC_CHECKSUM_H +#define DEF_LIBCRC_CHECKSUM_H + +#include + +/* + * #define CRC_POLY_xxxx + * + * The constants of the form CRC_POLY_xxxx define the polynomials for some well + * known CRC calculations. + */ +#define CRC_POLY_32 0xEDB88320L + +/* + * #define CRC_START_xxxx + * + * The constants of the form CRC_START_xxxx define the values that are used for + * initialization of a CRC value for common used calculation methods. + */ +#define CRC_START_32 0xFFFFFFFFL + +/* + * Prototype list of global functions + */ +uint32_t crc_32(const unsigned char *input_str, size_t num_bytes); +uint32_t update_crc_32(uint32_t crc, unsigned char c); + +#endif // DEF_LIBCRC_CHECKSUM_H From 391166ca556a98bf0da7299af9694ec23493ce52 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Wed, 29 Nov 2023 11:41:55 -0600 Subject: [PATCH 004/128] pma: update .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 917314b..e90d778 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ ships/ *.backup urbit *.jam.out +*.o +*.a From 187b28330816ac428bd2761576bbcb46ba884590 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Wed, 29 Nov 2023 11:43:15 -0600 Subject: [PATCH 005/128] pma: fix wrapper.h --- rust/ares_pma/c-src/wrapper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/ares_pma/c-src/wrapper.h b/rust/ares_pma/c-src/wrapper.h index 48206a7..5c56c79 100644 --- a/rust/ares_pma/c-src/wrapper.h +++ b/rust/ares_pma/c-src/wrapper.h @@ -1 +1 @@ -#include +#include "btree.h" From 13963724a1ba88c8bd1b47cd77a2b6d985432cf9 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Wed, 29 Nov 2023 12:46:46 -0500 Subject: [PATCH 006/128] pma: declare BT_state in btree.h --- rust/ares_pma/c-src/btree.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rust/ares_pma/c-src/btree.h b/rust/ares_pma/c-src/btree.h index 54f6aa2..5ebf8b4 100644 --- a/rust/ares_pma/c-src/btree.h +++ b/rust/ares_pma/c-src/btree.h @@ -2,6 +2,11 @@ #define __BTREE_H__ #include #include +#include + +struct BT_state; +typedef struct BT_state BT_state; + typedef unsigned long ULONG; //// =========================================================================== From 857c0051d9edb37cb0462141b69d34907203cbdb Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Wed, 29 Nov 2023 12:52:42 -0500 Subject: [PATCH 007/128] pma: don't include stdlib.h in btree.h --- rust/ares_pma/c-src/btree.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rust/ares_pma/c-src/btree.h b/rust/ares_pma/c-src/btree.h index 5ebf8b4..1e53eec 100644 --- a/rust/ares_pma/c-src/btree.h +++ b/rust/ares_pma/c-src/btree.h @@ -1,8 +1,7 @@ #ifndef __BTREE_H__ #define __BTREE_H__ -#include +#include #include -#include struct BT_state; typedef struct BT_state BT_state; From 96b31ffe7f8cd952aa58128d593cf6b338be5bdb Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Wed, 29 
Nov 2023 13:03:57 -0500 Subject: [PATCH 008/128] pma: move BT_meta.chk to end of struct. hardcode 32 roots --- rust/ares_pma/c-src/btree.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 8393e68..12410cd 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -233,6 +233,7 @@ static_assert(BT_DAT_MAXBYTES % sizeof(BT_dat) == 0); #define BLK_BASE_LEN7 (BLK_BASE_LEN6 * 4) typedef struct BT_meta BT_meta; struct BT_meta { +#define BT_NUMROOTS 32 uint32_t magic; uint32_t version; pgno_t last_pg; /* last page used in file */ @@ -252,13 +253,9 @@ struct BT_meta { uint8_t flags; uint8_t _pad1; pgno_t root; - /* ;;: confirm: shouldn't the checksum actually follow the roots array? */ + /* 64bit alignment manually checked - 72 bytes total above */ + uint64_t roots[BT_NUMROOTS]; /* for usage by ares */ uint32_t chk; /* checksum */ - /* 64bit alignment manually checked */ - uint64_t roots[]; /* for usage by ares */ - - /* ;;: TODO: ensure the crc_32 checksum cannot be zero */ - } __packed; static_assert(sizeof(BT_meta) <= BT_DAT_MAXBYTES); From f7ffa0de9732e8fceeff2cbd17e71f6f80aecb71 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Wed, 29 Nov 2023 14:05:22 -0500 Subject: [PATCH 009/128] pma: add additional interface stubs. implement later --- rust/ares_pma/c-src/btree.c | 47 +++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 12410cd..50cc165 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2414,6 +2414,53 @@ bt_meta_set(BT_state *state, size_t idx, uint64_t val) meta->roots[idx] = val; } +int +bt_range_of(void *p, void **lo, void **hi) +{ + /* traverse tree looking for lo <= p and hi > p. return that range as a pair + of pointers NOT as two vaof_t + + 0: succ (found) + 1: otherwise + */ +} + +int +bt_dirty(void *lo, void *hi) +{ + /* takes a range and ensures that entire range is CoWed */ + /* if part of the range is free then return 1 */ +} + +int +bt_next_alloc(void *p, void **lo, void **hi) +{ + /* if p is in the mlist, return the next hole in the mlist */ + + /* if p is allocated, then return the hole that it is contained in */ +} + +/* also: accessors for the virtual memory of the pma low and high */ + +/* #define BT_MAPADDR ((void *) S(0x1000,0000,0000)) */ +/* #define BT_ADDRSIZE (BT_PAGESIZE << BT_PAGEWORD) */ + +/* i.e. 
MAP_ADDDR - MAP_ADDR + ADDRSIZE */ + +/* and a function that given a pointer tests if in range */ + +void +bt_bounds(void **lo, void **hi) +{ + +} + +int +bt_inbounds(void *p) +{ + /* 1: if in bounds of PMA (those returned by bt_bounds) */ +} + //// =========================================================================== //// tests From 093aff035689150c38aae4b27c7858a6e4f6fd22 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Wed, 29 Nov 2023 16:36:30 -0600 Subject: [PATCH 010/128] pma: use cc crate in build.rs --- rust/ares_pma/Cargo.lock | 10 ++++ rust/ares_pma/Cargo.toml | 1 + rust/ares_pma/build.rs | 100 ++++++++++++++++----------------------- 3 files changed, 51 insertions(+), 60 deletions(-) diff --git a/rust/ares_pma/Cargo.lock b/rust/ares_pma/Cargo.lock index 1b2b535..a4dfb7b 100644 --- a/rust/ares_pma/Cargo.lock +++ b/rust/ares_pma/Cargo.lock @@ -16,6 +16,7 @@ name = "ares_pma" version = "0.1.0" dependencies = [ "bindgen", + "cc", ] [[package]] @@ -47,6 +48,15 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "libc", +] + [[package]] name = "cexpr" version = "0.6.0" diff --git a/rust/ares_pma/Cargo.toml b/rust/ares_pma/Cargo.toml index d81f601..b7ccdb4 100644 --- a/rust/ares_pma/Cargo.toml +++ b/rust/ares_pma/Cargo.toml @@ -9,4 +9,5 @@ edition = "2018" [build-dependencies] bindgen = "0.69.1" +cc = "1.0" diff --git a/rust/ares_pma/build.rs b/rust/ares_pma/build.rs index bb17c44..c4270bf 100644 --- a/rust/ares_pma/build.rs +++ b/rust/ares_pma/build.rs @@ -1,84 +1,64 @@ extern crate bindgen; use std::env; +use std::fs::create_dir_all; use std::path::PathBuf; use bindgen::CargoCallbacks; fn main() { + let profile = env::var("PROFILE").unwrap(); + let opt_level = match profile.as_ref() { + "debug" => 0, + "release" => 3, + _ => panic!("Unknown profile: {}", profile), + }; + // This is the directory where the `c` library is located. let libdir_path = PathBuf::from("c-src") // Canonicalize the path as `rustc-link-search` requires an absolute // path. .canonicalize() .expect("cannot canonicalize path"); + let libdir_path_str = libdir_path.to_str().expect("Path is not a valid string"); // This is the path to the `c` headers file. let headers_path = libdir_path.join("wrapper.h"); let headers_path_str = headers_path.to_str().expect("Path is not a valid string"); - // This is the path to the intermediate object file for our library. - let btree_obj_path = libdir_path.join("btree.o"); - let checksum_obj_path = libdir_path.join("lib").join("checksum.o"); - // This is the path to the static library file. 
- let lib_path = libdir_path.join("btree.a"); + println!("cargo:rerun-if-changed={}", libdir_path_str); - // Tell cargo to look for shared libraries in the specified directory - println!("cargo:rustc-link-search={}", libdir_path.to_str().unwrap()); + let res = cc::Build::new() + .file( + libdir_path + .join("btree.c") + .to_str() + .expect("Path is not a valid string"), + ) + .file( + libdir_path + .join("lib") + .join("checksum.c") + .to_str() + .expect("Path is not a valid string"), + ) + .flag("-g3") + .flag("-Wall") + .flag("-Wextra") + .flag("-Wpedantic") + .flag("-Wformat=2") + .flag("-Wno-unused-parameter") + .flag("-Wshadow") + .flag("-Wwrite-strings") + .flag("-Wstrict-prototypes") + .flag("-Wold-style-definition") + .flag("-Wredundant-decls") + .flag("-Wnested-externs") + .flag("-Wmissing-include-dirs") + .try_compile("btree"); - // Tell cargo to tell rustc to link our `btree` library. Cargo will - // automatically know it must look for a `libbtree.a` file. - println!("cargo:rustc-link-lib=btree"); - - // Tell cargo to invalidate the built crate whenever the header changes. - println!("cargo:rerun-if-changed={}", headers_path_str); - - // Run `clang` to compile the `btree.c` file into a `btree.o` object file. - // Unwrap if it is not possible to spawn the process. - if !std::process::Command::new("clang") - .arg("-c") - .arg("-o") - .arg(&btree_obj_path) - .arg(libdir_path.join("btree.c")) - .output() - .expect("could not spawn `clang`") - .status - .success() - { - // Panic if the command was not successful. - panic!("could not compile object file"); - } - - // Run `clang` to compile the `btree.c` file into a `btree.o` object file. - // Unwrap if it is not possible to spawn the process. - if !std::process::Command::new("clang") - .arg("-c") - .arg("-o") - .arg(&checksum_obj_path) - .arg(libdir_path.join("lib").join("checksum.c")) - .output() - .expect("could not spawn `clang`") - .status - .success() - { - // Panic if the command was not successful. - panic!("could not compile object file"); - } - - // Run `ar` to generate the `libbtree.a` file from the `btree.o` file. - // Unwrap if it is not possible to spawn the process. - if !std::process::Command::new("ar") - .arg("rcs") - .arg(lib_path) - .arg(btree_obj_path) - .arg(checksum_obj_path) - .output() - .expect("could not spawn `ar`") - .status - .success() - { - // Panic if the command was not successful. - panic!("could not emit library file"); + if let Err(err) = res { + panic!("{}", err); } // The bindgen::Builder is the main entry point From 6944b9b29512eb36b0c3d43be775102a4b9090d4 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Wed, 29 Nov 2023 16:36:58 -0600 Subject: [PATCH 011/128] pma: format lib.rs --- rust/ares_pma/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/rust/ares_pma/src/lib.rs b/rust/ares_pma/src/lib.rs index 8448cc2..a38a13a 100644 --- a/rust/ares_pma/src/lib.rs +++ b/rust/ares_pma/src/lib.rs @@ -3,4 +3,3 @@ #![allow(non_snake_case)] include!(concat!(env!("OUT_DIR"), "/bindings.rs")); - From 1f61db551bab98b5f2f8c5fe80aa59c8a2fc0111 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Thu, 30 Nov 2023 11:35:10 -0500 Subject: [PATCH 012/128] pma: fix external routine stubs. 
add comments to header --- rust/ares_pma/c-src/btree.c | 24 ++++++----------- rust/ares_pma/c-src/btree.h | 54 +++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 16 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 50cc165..596c445 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -12,13 +12,13 @@ #include #include +#include "btree.h" #include "lib/checksum.h" typedef uint32_t pgno_t; /* a page number */ typedef uint32_t vaof_t; /* a virtual address offset */ typedef uint32_t flag_t; typedef unsigned char BYTE; -typedef unsigned long ULONG; //// =========================================================================== //// tmp tmp tmp tmp tmp @@ -2415,7 +2415,7 @@ bt_meta_set(BT_state *state, size_t idx, uint64_t val) } int -bt_range_of(void *p, void **lo, void **hi) +bt_range_of(BT_state *state, void *p, void **lo, void **hi) { /* traverse tree looking for lo <= p and hi > p. return that range as a pair of pointers NOT as two vaof_t @@ -2426,37 +2426,29 @@ bt_range_of(void *p, void **lo, void **hi) } int -bt_dirty(void *lo, void *hi) +bt_dirty(BT_state *state, void *lo, void *hi) { /* takes a range and ensures that entire range is CoWed */ /* if part of the range is free then return 1 */ } int -bt_next_alloc(void *p, void **lo, void **hi) +bt_next_alloc(BT_state *state, void *p, void **lo, void **hi) { /* if p is in the mlist, return the next hole in the mlist */ /* if p is allocated, then return the hole that it is contained in */ } -/* also: accessors for the virtual memory of the pma low and high */ - -/* #define BT_MAPADDR ((void *) S(0x1000,0000,0000)) */ -/* #define BT_ADDRSIZE (BT_PAGESIZE << BT_PAGEWORD) */ - -/* i.e. MAP_ADDDR - MAP_ADDR + ADDRSIZE */ - -/* and a function that given a pointer tests if in range */ - void -bt_bounds(void **lo, void **hi) +bt_bounds(BT_state *state, void **lo, void **hi) { - + *lo = BT_MAPADDR; + *hi = (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE); } int -bt_inbounds(void *p) +bt_inbounds(BT_state *state, void *p) { /* 1: if in bounds of PMA (those returned by bt_bounds) */ } diff --git a/rust/ares_pma/c-src/btree.h b/rust/ares_pma/c-src/btree.h index 1e53eec..94b964b 100644 --- a/rust/ares_pma/c-src/btree.h +++ b/rust/ares_pma/c-src/btree.h @@ -11,20 +11,74 @@ typedef unsigned long ULONG; //// =========================================================================== //// btree external routines +/** + * instantiate an opaque BT_state handle + */ int bt_state_new(BT_state **state); +/** + * Open the persistent state or create if one doesn't exist + */ int bt_state_open(BT_state *state, const char *path, ULONG flags, mode_t mode); +/** + * Close the persistent state + */ int bt_state_close(BT_state *state); +/** + * Allocate persistent memory space + */ void * bt_malloc(BT_state *state, size_t pages); +/** + * Free persistent memory space + */ void bt_free(BT_state *state, void *lo, void *hi); +/** + * Sync a snapshot of the persistent memory to disk + * This will **exit the process** on failure to avoid data corruption + */ int bt_sync(BT_state *state); +/** + * Get a metadata entry + */ uint64_t bt_meta_get(BT_state *state, size_t idx); +/** + * Set a metadata entry + */ void bt_meta_set(BT_state *state, size_t idx, uint64_t val); +/** + * Give the allocation range in the btree that a pointer lives in + */ +int bt_range_of(BT_state *state, void *p, void **lo, void **hi); + +/** + * Ensure a region of memory is "dirty" i.e. 
can be mutated + * + * A successful call to bt_dirty ensures that the memory range can be mutated + * until the next call to `bt_sync()` + */ +int bt_dirty(BT_state *state, void *lo, void *hi); + +/** + * Given a pointer, give the containing region of allocated memory, or the next + * highest if the pointer is to free memory + */ +int bt_next_alloc(BT_state *state, void *p, void **lo, void **hi); + +/** + * Return the memory bounds of the persistent-memory B-tree + */ +void bt_bounds(BT_state *state, void **lo, void **hi); + +/** + * Return whether a pointer is within the persistent-memory B-tree + */ +int bt_inbounds(BT_state *state, void *p); + #endif From debabd9ec4a0998d89c9d5f3794ae93f9f1e816c Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Thu, 30 Nov 2023 17:16:15 -0500 Subject: [PATCH 013/128] pma: implement bt_range_of --- rust/ares_pma/c-src/btree.c | 45 +++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 596c445..a3aab71 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2414,6 +2414,38 @@ bt_meta_set(BT_state *state, size_t idx, uint64_t val) meta->roots[idx] = val; } +int +_bt_range_of(BT_state *state, vaof_t p, vaof_t **lo, vaof_t **hi, + pgno_t nodepg, uint8_t depth, uint8_t maxdepth) +{ + BT_page *node = _node_get(state, nodepg); + size_t N = _bt_numkeys(node); + + vaof_t llo = 0; + vaof_t hhi = 0; + pgno_t pg = 0; + size_t i; + for (i = 0; i < N-1; i++) { + llo = node->datk[i].va; + hhi = node->datk[i+1].va; + pg = node->datk[i].fo; + if (llo <= p && hhi > p) { + break; + } + } + /* not found */ + if (i == N-1) + return 1; + + if (depth == maxdepth) { + **lo = llo; + **hi = hhi; + return BT_SUCC; + } + + return _bt_range_of(state, p, lo, hi, pg, depth+1, maxdepth); +} + int bt_range_of(BT_state *state, void *p, void **lo, void **hi) { @@ -2423,6 +2455,19 @@ bt_range_of(BT_state *state, void *p, void **lo, void **hi) 0: succ (found) 1: otherwise */ + + BT_meta *meta = state->meta_pages[state->which]; + pgno_t root = meta->root; + vaof_t *loret = 0; + vaof_t *hiret = 0; + vaof_t poff = ADDR2OFF(p); + int rc = 0; + if (!SUCC(rc = _bt_range_of(state, poff, &loret, &hiret, root, 1, meta->depth))) { + return rc; + } + *lo = OFF2ADDR(*loret); + *hi = OFF2ADDR(*hiret); + return BT_SUCC; } int From 2aae3d046bd735ad9ffe00af46a5b95bfae20372 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 1 Dec 2023 16:30:14 -0500 Subject: [PATCH 014/128] pma: implement bt_next_alloc --- rust/ares_pma/c-src/btree.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index a3aab71..5d960fb 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2479,10 +2479,36 @@ bt_dirty(BT_state *state, void *lo, void *hi) int bt_next_alloc(BT_state *state, void *p, void **lo, void **hi) +/* if p is free, sets lo and hi to the bounds of the next adjacent allocated + space. If p is allocated, sets lo and hi to the bounds of the allocated space + it falls in. 
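+
+   Minimal caller-side sketch (alo/ahi are illustrative names only):
+
+     void *alo, *ahi;
+     int rc = bt_next_alloc(state, p, &alo, &ahi);
+
+   on success (SUCC(rc)), [alo, ahi) bounds that allocated region; if no
+   allocated region exists at or after p, rc is non-zero.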
*/ { - /* if p is in the mlist, return the next hole in the mlist */ + BT_mlistnode *head = state->mlist; + while (head) { + /* p is in a free range, return the allocated hole after it */ + if (head->va <= p + && head->va + head->sz > p) { + goto found; + } - /* if p is allocated, then return the hole that it is contained in */ + /* p is alloced, return this hole */ + if (head->next->va > p + && head->va + head->sz <= p) { + goto found; + } + + head = head->next; + } + + /* not found */ + return 1; + + found: + /* the alloced space begins at the end of the free block */ + *lo = head->va + head->sz; + /* ... and ends at the start of the next free block */ + *hi = head->next->va; + return BT_SUCC; } void From dc760add87e8e4a80264e8d370c887ebedf406b7 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 1 Dec 2023 16:37:18 -0500 Subject: [PATCH 015/128] pma: amend bt_next_alloc impl --- rust/ares_pma/c-src/btree.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 5d960fb..4304f16 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2485,6 +2485,10 @@ bt_next_alloc(BT_state *state, void *p, void **lo, void **hi) { BT_mlistnode *head = state->mlist; while (head) { + /* at last free block, different logic applies */ + if (head->next == 0) + goto end; + /* p is in a free range, return the allocated hole after it */ if (head->va <= p && head->va + head->sz > p) { @@ -2509,6 +2513,19 @@ bt_next_alloc(BT_state *state, void *p, void **lo, void **hi) /* ... and ends at the start of the next free block */ *hi = head->next->va; return BT_SUCC; + + end: + void *pma_end = (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE); + assert(head->va + head->sz <= pma_end); + /* no alloced region between tail of freelist and end of pma memory space */ + if (head->va + head->sz == pma_end) + return 1; + + /* otherwise, return the alloced region between the tail of the freelist and + the end of the memory arena */ + *lo = head->va + head->sz; + *hi = pma_end; + return BT_SUCC; } void From d7ffdc47deb681721da8b65ef0a0d7e88a0dd28e Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 1 Dec 2023 23:17:38 -0500 Subject: [PATCH 016/128] pma: bt_dirty partial implm --- rust/ares_pma/c-src/btree.c | 115 ++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 4304f16..4f75a13 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -1024,6 +1024,9 @@ _bt_delco(BT_state *state, vaof_t lo, vaof_t hi, pgno_t rsubtree = 0; /* find low idx of range */ + + /* ;;: !!! fixme this is not incorrect. find first hi greater than lo. 
the lo + of that entry is the loidx */ for (size_t i = 0; i < BT_DAT_MAXKEYS-1; i++) { vaof_t llo = node->datk[i].va; if (llo <= lo) { @@ -2470,11 +2473,123 @@ bt_range_of(BT_state *state, void *p, void **lo, void **hi) return BT_SUCC; } +/** + +pseudocode from ed: + +bt_dirty(btree, lo, hi): + loop: + (range_lo, range_hi) = find_range_for_pointer(btree, lo); + dirty_hi = min(hi, range_hi); + new_start_fo = data_cow(btree, lo, dirty_hi); + bt_insert(btree, lo, dirty_hi, new_start_fo); + lo := range_hi; + if dirty_hi == hi then break; + +// precondition: given range does not cross a tree boundary +data_cow(btree, lo, hi): + (range_lo, range_hi, fo) = bt_find(btree, lo, hi); + size = lo - hi; + new_fo = data_alloc(btree.data_free, size); + + // puts data in the unified buffer cache without having to map virtual memory + write(fd, new_fo, size * BT_PAGESIZE, to_ptr(lo)); + + // maps new file offset with same data back into same memory + mmap(fd, new_fo, size, to_ptr(lo)); + + bt_insert(btree, lo, hi, new_fo); + + offset = lo - range_lo; + freelist_insert(btree.pending_data_flist, fo + offset, fo + offset + size); + return new_fo + +**/ + +/* pgno_t */ +/* _bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi) */ +/* { */ + +/* } */ + +static pgno_t +_bt_data_cow(BT_state *state, BT_page *leaf, size_t i) +/* cow the data referenced by ith leaf entry. remap the new data to the same + offset. insert the old data into the pending data freelist. finally, return + the pgno of the new data */ +{ + vaof_t lo = leaf->datk[i].va; + vaof_t hi = leaf->datk[i+1].va; + size_t len = B2PAGES(hi - lo); + pgno_t newpg = _bt_falloc(state, len); + /* ;;: todo: perform write call without having to map memory */ + + /* ;;: todo: and now the mmap call */ + + /* insert into pending data freelist the old file chunk */ + + return newpg; +} + +static int +_bt_dirty(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg, + uint8_t depth, uint8_t maxdepth) +{ + BT_page *node = _node_get(state, nodepg); + size_t N = _bt_numkeys(node); + size_t loidx = 0; + size_t hiidx = 0; + + /* find loidx of range */ + for (size_t i = 0; i < N-1; i++) { + vaof_t hhi = node->datk[i+1].va; + if (hhi > lo) { + loidx = i; + break; + } + } + assert(loidx != 0); + + /* find hiidx of range */ + for (size_t i = loidx; i < N-1; i++) { + vaof_t hhi = node->datk[i+1].va; + if (hhi >= hi) { + hiidx = i; + break; + } + } + assert(hiidx != 0); + + /* leaf: base case */ + if (depth == maxdepth) { + + } + + /* found a range in node that contains (lo-hi). May span multiple entries */ + for (size_t i = loidx; i < hiidx; i++) { + /* leaf: base case. 
cow the data */ + if (depth == maxdepth) { + pgno_t newpg = _bt_data_cow(state, node, i); + _bt_insert(state, node->datk[i].va, node->datk[i+1].va, newpg); + } + + /* branch: recursive case */ + pgno_t childpg = node->datk[i].fo; + /* iteratively recurse on all entries */ + _bt_dirty(state, lo, hi, childpg, depth+1, maxdepth); + } +} + int bt_dirty(BT_state *state, void *lo, void *hi) { /* takes a range and ensures that entire range is CoWed */ /* if part of the range is free then return 1 */ + BT_meta *meta = state->meta_pages[state->which]; + vaof_t looff = ADDR2OFF(lo); + vaof_t hioff = ADDR2OFF(hi); + + return _bt_dirty(state, looff, hioff, meta->root, 1, meta->depth); } int From 3c0d34dd11a6d82be932cd017532d3785abd6b1c Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Sat, 2 Dec 2023 08:02:14 -0500 Subject: [PATCH 017/128] pma: modifications to bt_dirty and _bt_data_cow --- rust/ares_pma/c-src/btree.c | 56 ++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 4f75a13..4530346 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2506,31 +2506,46 @@ data_cow(btree, lo, hi): **/ -/* pgno_t */ -/* _bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi) */ -/* { */ - -/* } */ - static pgno_t -_bt_data_cow(BT_state *state, BT_page *leaf, size_t i) -/* cow the data referenced by ith leaf entry. remap the new data to the same - offset. insert the old data into the pending data freelist. finally, return - the pgno of the new data */ +_bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi, pgno_t pg) { - vaof_t lo = leaf->datk[i].va; - vaof_t hi = leaf->datk[i+1].va; - size_t len = B2PAGES(hi - lo); - pgno_t newpg = _bt_falloc(state, len); - /* ;;: todo: perform write call without having to map memory */ + size_t byte_len = hi - lo; + pgno_t newpg = _bt_falloc(state, B2PAGES(byte_len)); + BYTE *loaddr = OFF2ADDR(lo); - /* ;;: todo: and now the mmap call */ + vaof_t arena_start = ADDR2OFF(BT_MAPADDR); + off_t offset = lo - arena_start; - /* insert into pending data freelist the old file chunk */ + /* write call puts data in the unified buffer cache without having to map + virtual memory */ + if (pwrite(state->data_fd, loaddr, byte_len, offset) != byte_len) + abort(); + + /* BYTE *arena_start = BT_MAPADDR; */ + /* BYTE *map_loc = arena_start + lo; */ + + /* maps new file offset with same data back into memory */ + mmap(BT_MAPADDR, + byte_len, + PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_SHARED, + state->data_fd, + offset); /* ;;: using an offset here rather than + supplying the address directly. correct?? + check. */ + + /* ;;: ps. noticed a duplicate _bt_insert call in the bt_dirty and data_cow + psuedocode. Does the order matter? Should it happen in data_cow or + bt_dirty? afaict, we might as well do it here and let _bt_data_cow + return void. No opinion really */ + + /* ;;: todo: insert into freelist */ return newpg; } +#define MIN(x, y) ((x) > (y) ? (y) : (x)) + static int _bt_dirty(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg, uint8_t depth, uint8_t maxdepth) @@ -2569,8 +2584,11 @@ _bt_dirty(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg, for (size_t i = loidx; i < hiidx; i++) { /* leaf: base case. 
cow the data */ if (depth == maxdepth) { - pgno_t newpg = _bt_data_cow(state, node, i); - _bt_insert(state, node->datk[i].va, node->datk[i+1].va, newpg); + vaof_t llo = node->datk[i].va; + vaof_t hhi = MIN(node->datk[i+1].va, hi); + pgno_t pg = node->datk[i].fo; + pgno_t newpg = _bt_data_cow(state, llo, hhi, pg); + _bt_insert(state, llo, hhi, newpg); } /* branch: recursive case */ From 14f82b45a82c6e254d6d4d53c0ef6dc013a61392 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Mon, 4 Dec 2023 12:37:53 -0500 Subject: [PATCH 018/128] pma: bt_inbounds --- rust/ares_pma/c-src/btree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 4530346..4c5a762 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2670,8 +2670,10 @@ bt_bounds(BT_state *state, void **lo, void **hi) int bt_inbounds(BT_state *state, void *p) +/* 1: if in the bounds of the PMA, 0 otherwise */ { - /* 1: if in bounds of PMA (those returned by bt_bounds) */ + return p >= BT_MAPADDR + && p < (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE); } From db5ee1e371beba3be4e38fd82fd77ec418158ed2 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Mon, 4 Dec 2023 13:36:50 -0500 Subject: [PATCH 019/128] pma: fix ADDR2OFF and OFF2ADDR. amend all sites that improperly handle vaof_t this should be closely inspected. additionally fixed _bt_data_cow implementation and a few other misc things --- rust/ares_pma/c-src/btree.c | 83 ++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 4c5a762..abbd7dd 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -96,10 +96,24 @@ STATIC_ASSERT(0, "debugger break instruction unimplemented"); #define BT_MAPADDR ((void *) S(0x1000,0000,0000)) -/* convert addr offset to raw address */ -#define OFF2ADDR(x) ((void *)((uintptr_t)(BT_MAPADDR) + (x))) -/* convert raw memory address to offset */ -#define ADDR2OFF(a) ((vaof_t)((uintptr_t)(a) - (uintptr_t)BT_MAPADDR)) +static inline vaof_t +addr2off(void *p) +/* convert a pointer into a 32-bit page offset */ +{ + uintptr_t pu = (uintptr_t)p; + assert((pu & ((1 << 14) - 1)) == 0); /* p must be page-aligned */ + uintptr_t off = pu - (uintptr_t)BT_MAPADDR; + return (vaof_t)(pu >> 14); +} + +static inline void * +off2addr(vaof_t off) +/* convert a 32-bit page offset into a pointer */ +{ + uintptr_t pu = (uintptr_t)off << 14; + pu += (uintptr_t)BT_MAPADDR; + return (void *)pu; +} #define BT_PAGEBITS 14ULL #define BT_PAGEWORD 32ULL @@ -1293,13 +1307,13 @@ _mlist_new(BT_state *state) vaof_t lo = root->datk[0].va; vaof_t hi = root->datk[1].va; - size_t len = B2PAGES(hi - lo); + size_t len = hi - lo; BT_mlistnode *head = calloc(1, sizeof *head); head->next = 0; head->sz = len; - head->va = OFF2ADDR(lo); + head->va = off2addr(lo); state->mlist = head; @@ -1370,7 +1384,7 @@ _flist_new(BT_state *state) vaof_t lo = root->datk[0].va; vaof_t hi = root->datk[1].va; - size_t len = B2PAGES(hi - lo); + size_t len = hi - lo; BT_flistnode *head = calloc(1, sizeof *head); @@ -1554,10 +1568,10 @@ _mlist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth) #if CAN_COALESCE /* free and contiguous with previous mlist node: merge */ if (kv->fo == 0 - && ADDR2OFF(prev->va) + P2BYTES(prev->sz) == kv->va) { + && addr2off(prev->va) + prev->sz == kv->va) { vaof_t hi = node->datk[i+1].va; vaof_t lo = kv->va; - size_t len = B2PAGES(hi - lo); + size_t 
len = hi - lo; prev->sz += len; } /* free but not contiguous with previous mlist node: append new node */ @@ -1566,9 +1580,9 @@ _mlist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth) BT_mlistnode *new = calloc(1, sizeof *new); vaof_t hi = node->datk[i+1].va; vaof_t lo = kv->va; - size_t len = B2PAGES(hi - lo); + size_t len = hi - lo; new->sz = len; - new->va = OFF2ADDR(lo); + new->va = off2addr(lo); prev->next = new; prev = new; #if CAN_COALESCE @@ -1757,7 +1771,7 @@ _flist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth) BT_flistnode *new = calloc(1, sizeof *new); vaof_t hi = node->datk[i+1].va; vaof_t lo = kv->va; - size_t len = B2PAGES(hi - lo); + size_t len = hi - lo; pgno_t fo = kv->fo; new->sz = len; new->pg = fo; @@ -2156,8 +2170,8 @@ _bt_sync_leaf(BT_state *state, BT_page *node) /* pgno_t pg = node->datk[i].fo; */ vaof_t lo = node->datk[i].va; vaof_t hi = node->datk[i+1].va; - size_t bytelen = hi - lo; - void *addr = OFF2ADDR(lo); + size_t bytelen = P2BYTES(hi - lo); + void *addr = off2addr(lo); /* sync the page */ if (msync(addr, bytelen, MS_SYNC)) @@ -2361,8 +2375,8 @@ bt_malloc(BT_state *state, size_t pages) pgno_t pgno = _bt_falloc(state, pages); bp(pgno != 0); _bt_insert(state, - ADDR2OFF(ret), - ADDR2OFF(ret) + P2BYTES(pages), + addr2off(ret), + addr2off(ret) + pages, pgno); bp(ret != 0); @@ -2372,8 +2386,8 @@ bt_malloc(BT_state *state, size_t pages) void bt_free(BT_state *state, void *lo, void *hi) { - vaof_t looff = ADDR2OFF(lo); - vaof_t hioff = ADDR2OFF(hi); + vaof_t looff = addr2off(lo); + vaof_t hioff = addr2off(hi); _bt_insert(state, looff, hioff, 0); /* ;;: and now add freespace to state->flist. coalescing when you do so */ } @@ -2463,13 +2477,13 @@ bt_range_of(BT_state *state, void *p, void **lo, void **hi) pgno_t root = meta->root; vaof_t *loret = 0; vaof_t *hiret = 0; - vaof_t poff = ADDR2OFF(p); + vaof_t poff = addr2off(p); int rc = 0; if (!SUCC(rc = _bt_range_of(state, poff, &loret, &hiret, root, 1, meta->depth))) { return rc; } - *lo = OFF2ADDR(*loret); - *hi = OFF2ADDR(*hiret); + *lo = off2addr(*loret); + *hi = off2addr(*hiret); return BT_SUCC; } @@ -2507,18 +2521,19 @@ data_cow(btree, lo, hi): **/ static pgno_t -_bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi, pgno_t pg) +_bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi) { - size_t byte_len = hi - lo; - pgno_t newpg = _bt_falloc(state, B2PAGES(byte_len)); - BYTE *loaddr = OFF2ADDR(lo); + size_t len = hi - lo; + size_t bytelen = P2BYTES(len); + pgno_t newpg = _bt_falloc(state, len); + BYTE *loaddr = off2addr(lo); - vaof_t arena_start = ADDR2OFF(BT_MAPADDR); + vaof_t arena_start = addr2off(BT_MAPADDR); off_t offset = lo - arena_start; /* write call puts data in the unified buffer cache without having to map virtual memory */ - if (pwrite(state->data_fd, loaddr, byte_len, offset) != byte_len) + if (pwrite(state->data_fd, loaddr, bytelen, offset) != bytelen) abort(); /* BYTE *arena_start = BT_MAPADDR; */ @@ -2526,7 +2541,7 @@ _bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi, pgno_t pg) /* maps new file offset with same data back into memory */ mmap(BT_MAPADDR, - byte_len, + bytelen, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, state->data_fd, @@ -2575,19 +2590,13 @@ _bt_dirty(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg, } assert(hiidx != 0); - /* leaf: base case */ - if (depth == maxdepth) { - - } - /* found a range in node that contains (lo-hi). May span multiple entries */ for (size_t i = loidx; i < hiidx; i++) { /* leaf: base case. 
cow the data */ if (depth == maxdepth) { vaof_t llo = node->datk[i].va; vaof_t hhi = MIN(node->datk[i+1].va, hi); - pgno_t pg = node->datk[i].fo; - pgno_t newpg = _bt_data_cow(state, llo, hhi, pg); + pgno_t newpg = _bt_data_cow(state, llo, hhi); _bt_insert(state, llo, hhi, newpg); } @@ -2604,8 +2613,8 @@ bt_dirty(BT_state *state, void *lo, void *hi) /* takes a range and ensures that entire range is CoWed */ /* if part of the range is free then return 1 */ BT_meta *meta = state->meta_pages[state->which]; - vaof_t looff = ADDR2OFF(lo); - vaof_t hioff = ADDR2OFF(hi); + vaof_t looff = addr2off(lo); + vaof_t hioff = addr2off(hi); return _bt_dirty(state, looff, hioff, meta->root, 1, meta->depth); } From 6a41cc156c65dd19eda68dcc1b6dbdd3f9cd80cb Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Mon, 4 Dec 2023 13:47:33 -0500 Subject: [PATCH 020/128] pma: amend _bt_data_cow --- rust/ares_pma/c-src/btree.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index abbd7dd..b515574 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2496,7 +2496,6 @@ bt_dirty(btree, lo, hi): (range_lo, range_hi) = find_range_for_pointer(btree, lo); dirty_hi = min(hi, range_hi); new_start_fo = data_cow(btree, lo, dirty_hi); - bt_insert(btree, lo, dirty_hi, new_start_fo); lo := range_hi; if dirty_hi == hi then break; @@ -2549,12 +2548,9 @@ _bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi) supplying the address directly. correct?? check. */ - /* ;;: ps. noticed a duplicate _bt_insert call in the bt_dirty and data_cow - psuedocode. Does the order matter? Should it happen in data_cow or - bt_dirty? afaict, we might as well do it here and let _bt_data_cow - return void. No opinion really */ + _bt_insert(state, lo, hi, newpg); - /* ;;: todo: insert into freelist */ + /* ;;: todo: insert into pending disk freelist state->pending_flist */ return newpg; } From 89eb301914ab92550957e3d66d31f3dbecdde7c6 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Mon, 4 Dec 2023 15:28:25 -0500 Subject: [PATCH 021/128] pma: some pending_flist operations. still need merge --- rust/ares_pma/c-src/btree.c | 83 ++++++++++++++++++++++++++++++++++--- 1 file changed, 77 insertions(+), 6 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index b515574..9392970 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -796,6 +796,14 @@ _pending_nlist_insert(BT_state *state, pgno_t nodepg) BT_nlistnode *head = state->pending_nlist; BT_page *va = _node_get(state, nodepg); + /* freelist may be empty. create head */ + if (head == 0) { + state->pending_nlist = calloc(1, sizeof *state->pending_nlist); + state->pending_nlist->sz = 1; + state->pending_nlist->va = va; + return; + } + /* we don't need to account for a freelist node's size because we aren't coalescing the pending freelists */ while (head->next) { @@ -805,11 +813,10 @@ _pending_nlist_insert(BT_state *state, pgno_t nodepg) } /* head->next is either null or has a higher address than va */ - BT_nlistnode *new = calloc(1, sizeof new); - new->next = head->next; + BT_nlistnode *new = calloc(1, sizeof *new); new->sz = 1; new->va = va; - + new->next = head->next; head->next = new; } @@ -879,6 +886,69 @@ _pending_nlist_merge(BT_state *state) _pending_nlist_clear(state); } +static void +_pending_flist_insert(BT_state *state, pgno_t pg, size_t sz) +{ + BT_flistnode *head = state->pending_flist; + + /* freelist may be empty. 
create head */ + if (head == 0) { + state->pending_flist = calloc(1, sizeof *state->pending_flist); + state->pending_flist->pg = pg; + state->pending_flist->sz = sz; + return; + } + + while (head->next) { + /* next node starts at pg higher than this freechunk's termination */ + if (head->next->pg >= pg + sz) { + break; + } + head = head->next; + } + + /* if freed chunk follows head, expand head */ + if (head->pg + head->sz == pg) { + head->sz += sz; + return; + } + + /* if the freed chunk precedes next, expand next and pull pg back */ + if (head->next->pg == pg + sz) { + head->next->pg = pg; + head->next->sz += sz; + return; + } + + /* otherwise, insert a new node following head */ + BT_flistnode *new = calloc(1, sizeof *new); + new->pg = pg; + new->sz = sz; + new->next = head->next; + head->next = new; +} + +static void +_pending_flist_clear(BT_state *state) +{ + /* as with _pending_flist_clear. We only remove nodes from this list if it's + fully merged with state->flist */ + BT_flistnode *prev = state->pending_flist; + BT_flistnode *next = prev->next; + while (prev) { + free(prev); + prev = next; + next = next->next; + } + state->pending_flist = 0; +} + +static void +_pending_flist_merge(BT_state *state) +{ + +} + /* ;;: todo move shit around */ static void @@ -2520,7 +2590,7 @@ data_cow(btree, lo, hi): **/ static pgno_t -_bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi) +_bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi, pgno_t pg) { size_t len = hi - lo; size_t bytelen = P2BYTES(len); @@ -2550,7 +2620,7 @@ _bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi) _bt_insert(state, lo, hi, newpg); - /* ;;: todo: insert into pending disk freelist state->pending_flist */ + _pending_flist_insert(state, pg, len); return newpg; } @@ -2592,7 +2662,8 @@ _bt_dirty(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg, if (depth == maxdepth) { vaof_t llo = node->datk[i].va; vaof_t hhi = MIN(node->datk[i+1].va, hi); - pgno_t newpg = _bt_data_cow(state, llo, hhi); + pgno_t pg = node->datk[i].fo; + pgno_t newpg = _bt_data_cow(state, llo, hhi, pg); _bt_insert(state, llo, hhi, newpg); } From 520f8133aba966b766237d9467c03edee881ad94 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Mon, 4 Dec 2023 16:26:51 -0500 Subject: [PATCH 022/128] pma: _pending_flist_merge did I do this right? need to review. additionally, need to rewrite _pendling_nlist_merge --- rust/ares_pma/c-src/btree.c | 80 +++++++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 16 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 9392970..1786a39 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -875,8 +875,8 @@ _pending_nlist_merge(BT_state *state) } dst_head = dst_head->next; } - if (!dst_head) { - /* need to track prev */ + if (src_head) { + /* need to track prev or use double indirection */ } @@ -928,25 +928,73 @@ _pending_flist_insert(BT_state *state, pgno_t pg, size_t sz) head->next = new; } -static void -_pending_flist_clear(BT_state *state) -{ - /* as with _pending_flist_clear. We only remove nodes from this list if it's - fully merged with state->flist */ - BT_flistnode *prev = state->pending_flist; - BT_flistnode *next = prev->next; - while (prev) { - free(prev); - prev = next; - next = next->next; - } - state->pending_flist = 0; -} +/* static void */ +/* _pending_flist_clear(BT_state *state) */ +/* { */ +/* /\* as with _pending_flist_clear. 
We only remove nodes from this list if it's */ +/* fully merged with state->flist *\/ */ +/* BT_flistnode *prev = state->pending_flist; */ +/* BT_flistnode *next = prev->next; */ +/* while (prev) { */ +/* free(prev); */ +/* prev = next; */ +/* next = next->next; */ +/* } */ +/* state->pending_flist = 0; */ +/* } */ static void _pending_flist_merge(BT_state *state) { + BT_flistnode **src_head = &state->pending_flist; + BT_flistnode **dst_head = &state->flist; + while (*dst_head) { + /* src cleared. done */ + if (!*src_head) { + return; + } + + /* check if src node should be merged with dst **************************/ + pgno_t dst_pg = (*dst_head)->pg; + size_t dst_sz = (*dst_head)->sz; + pgno_t src_pg = (*src_head)->pg; + size_t src_sz = (*src_head)->sz; + pgno_t dst_next_pg = *dst_head ? (*dst_head)->next->pg : 0; + + /* source node immediately follows dst node's termination */ + if (dst_pg + dst_sz == src_pg) { + (*dst_head)->sz += src_sz; /* widen dst node */ + /* advance src node and free previous */ + BT_flistnode *prev = *src_head; + src_head = &(*src_head)->next; + free(prev); + } + /* source node's termination immediately precedes next dst node */ + else if (dst_next_pg == src_pg + src_sz) { + (*dst_head)->next->pg = src_pg; /* pull page back */ + (*dst_head)->next->sz += src_sz; /* widen node */ + /* advance src node and free previous */ + BT_flistnode *prev = *src_head; + src_head = &(*src_head)->next; + free(prev); + } + /* src node lies between but isn't contiguous with dst */ + else if (dst_next_pg > src_pg + src_sz + && dst_pg + dst_sz < src_pg) { + /* link src node in */ + (*src_head)->next = (*dst_head)->next; + (*dst_head)->next = *src_head; + /* and advance src node */ + src_head = &(*src_head)->next; + } + /* otherwise, advance dst node */ + else { + dst_head = &(*dst_head)->next; + } + } + /* merge what remains of src if anything */ + dst_head = src_head; } From 9708b47eca3a8b64d01dbd9be2abc09afca1aeb6 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Mon, 4 Dec 2023 16:31:49 -0500 Subject: [PATCH 023/128] pma: _pending_flist_merge one line change --- rust/ares_pma/c-src/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 1786a39..775f73b 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -994,7 +994,7 @@ _pending_flist_merge(BT_state *state) } } /* merge what remains of src if anything */ - dst_head = src_head; + *dst_head = *src_head; } From bbc1c88120dcc7c02087573d892394b57f280333 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Mon, 4 Dec 2023 17:07:38 -0500 Subject: [PATCH 024/128] pma: amend _pending_flist_merge --- rust/ares_pma/c-src/btree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 775f73b..001b77b 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -970,10 +970,10 @@ _pending_flist_merge(BT_state *state) src_head = &(*src_head)->next; free(prev); } - /* source node's termination immediately precedes next dst node */ - else if (dst_next_pg == src_pg + src_sz) { - (*dst_head)->next->pg = src_pg; /* pull page back */ - (*dst_head)->next->sz += src_sz; /* widen node */ + /* source node's termination immediately precedes dst node */ + else if (src_pg + src_sz == dst_pg) { + (*dst_head)->pg = src_pg; /* pull page back */ + (*dst_head)->sz += src_sz; /* widen node */ /* advance src node and free previous */ BT_flistnode *prev = 
*src_head; src_head = &(*src_head)->next; From 5beb2da584797c0476bad0c7e469f2b89be14cfa Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Mon, 4 Dec 2023 17:07:48 -0500 Subject: [PATCH 025/128] pma: rewrite _pending_nlist_merge --- rust/ares_pma/c-src/btree.c | 111 +++++++++++++++--------------------- 1 file changed, 47 insertions(+), 64 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 001b77b..9e86d51 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -820,21 +820,6 @@ _pending_nlist_insert(BT_state *state, pgno_t nodepg) head->next = new; } -static void -_pending_nlist_clear(BT_state *state) -{ - /* there's no need for a pending freelist "pop" routine as we only clear nodes - from it after all have been merged with the real freelists */ - BT_nlistnode *prev = state->pending_nlist; - BT_nlistnode *next = prev->next; - while (prev) { - free(prev); - prev = next; - next = next->next; - } - state->pending_nlist = 0; -} - static BT_nlistnode * _nlist_find(BT_nlistnode *head, BT_page *va) /* find a node */ @@ -844,46 +829,59 @@ _nlist_find(BT_nlistnode *head, BT_page *va) static void _pending_nlist_merge(BT_state *state) -/* merge state->pending_nlist with state->nlist. To be called when syncing */ { - BT_nlistnode *src_head = state->pending_nlist; - BT_nlistnode *dst_head = state->nlist; + BT_nlistnode **src_head = &state->pending_nlist; + BT_nlistnode **dst_head = &state->nlist; - while (src_head) { - /* ;;: todo refactor */ - while (dst_head) { - BT_page *dst_va = dst_head->va; - BT_page *src_va = src_head->va; - if (dst_head->va <= src_head->va - && dst_head->va + dst_head->sz >= src_head->va) { - /* found node in nlist that fits node in pending nlist */ - - dst_head->sz += 1; - break; - } - else if (dst_head->va + dst_head->sz < src_head->va - && dst_head->next->va > src_head->va) { - /* pending nlist node belongs between two nlist nodes */ - BT_nlistnode *new = calloc(1, sizeof *new); - memcpy(new, src_head, sizeof *src_head); - new->sz = 1; - new->va = src_head->va; - /* insert */ - new->next = dst_head->next; - dst_head->next = new; - break; - } - dst_head = dst_head->next; - } - if (src_head) { - /* need to track prev or use double indirection */ + while (*dst_head) { + /* src cleared. done */ + if (!*src_head) { + return; } + /* check if src node should be merged with dst **************************/ + BT_page *dst_va = (*dst_head)->va; + size_t dst_sz = (*dst_head)->sz; + BT_page *src_va = (*src_head)->va; + /* NB: while we don't currently coalesce the pending nlist, it's not that + hard to account for if we did, so might as well generalize the merge + algorithm */ + size_t src_sz = (*src_head)->sz; + BT_page *dst_next_va = *dst_head ? 
(*dst_head)->next->va : 0; - src_head = src_head->next; + /* source node immediately follows dst node's termination */ + if (dst_va + dst_sz == src_va) { + (*dst_head)->sz += src_sz; /* widen dst node */ + /* advance src node and free previous */ + BT_nlistnode *prev = *src_head; + src_head = &(*src_head)->next; + free(prev); + } + /* source node's termination immediately precedes dst node */ + else if (dst_next_va == src_va + src_sz) { + (*dst_head)->va = src_va; /* pull va back */ + (*dst_head)->sz += src_sz; /* widen node */ + /* advance src node and free previous */ + BT_nlistnode *prev = *src_head; + src_head = &(*src_head)->next; + free(prev); + } + /* src node lies between but isn't contiguous with dst */ + else if (src_va > dst_va + dst_sz + && src_va + src_sz < dst_next_va) { + /* link src node in */ + (*src_head)->next = (*dst_head)->next; + (*dst_head)->next = *src_head; + /* and advance src node */ + src_head = &(*src_head)->next; + } + /* otherwise, advance dst node */ + else { + dst_head = &(*dst_head)->next; + } } - - _pending_nlist_clear(state); + /* merge what remains of src if anything */ + *dst_head = *src_head; } static void @@ -928,21 +926,6 @@ _pending_flist_insert(BT_state *state, pgno_t pg, size_t sz) head->next = new; } -/* static void */ -/* _pending_flist_clear(BT_state *state) */ -/* { */ -/* /\* as with _pending_flist_clear. We only remove nodes from this list if it's */ -/* fully merged with state->flist *\/ */ -/* BT_flistnode *prev = state->pending_flist; */ -/* BT_flistnode *next = prev->next; */ -/* while (prev) { */ -/* free(prev); */ -/* prev = next; */ -/* next = next->next; */ -/* } */ -/* state->pending_flist = 0; */ -/* } */ - static void _pending_flist_merge(BT_state *state) { From 7935f531ffdc1a17a14c5dd94a861ba3b20c7bd3 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Mon, 4 Dec 2023 17:11:54 -0500 Subject: [PATCH 026/128] pma: call pending freelist merge routines in bt_sync --- rust/ares_pma/c-src/btree.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 9e86d51..97c830e 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2505,6 +2505,10 @@ bt_sync(BT_state *state) if (rc = _bt_sync(state, root, 1, meta->depth)) return rc; + /* merge the pending freelists */ + _pending_nlist_merge(state); + _pending_flist_merge(state); + /* sync the root page */ if (msync(root, sizeof(BT_page), MS_SYNC)) return errno; From ebb5a4063d2511c43a477ad2783c230b9046f526 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 4 Dec 2023 14:08:16 -0600 Subject: [PATCH 027/128] snapshot: strip out old snapshotting code to make way for PMA --- rust/ares/src/lib.rs | 1 - rust/ares/src/main.rs | 3 - rust/ares/src/mem.rs | 74 ------------ rust/ares/src/serf.rs | 22 ++-- rust/ares/src/snapshot.rs | 13 -- rust/ares/src/snapshot/double_jam.rs | 171 --------------------------- rust/ares/src/snapshot/pma.rs | 123 ------------------- 7 files changed, 10 insertions(+), 397 deletions(-) delete mode 100644 rust/ares/src/snapshot.rs delete mode 100644 rust/ares/src/snapshot/double_jam.rs delete mode 100644 rust/ares/src/snapshot/pma.rs diff --git a/rust/ares/src/lib.rs b/rust/ares/src/lib.rs index 558006e..8393ff9 100644 --- a/rust/ares/src/lib.rs +++ b/rust/ares/src/lib.rs @@ -13,7 +13,6 @@ pub mod noun; pub mod serf; //pub mod bytecode; pub mod serialization; -pub mod snapshot; pub mod trace; /** Introduce useful functions for debugging diff --git a/rust/ares/src/main.rs 
b/rust/ares/src/main.rs index 874b55e..6175ad2 100644 --- a/rust/ares/src/main.rs +++ b/rust/ares/src/main.rs @@ -40,9 +40,6 @@ fn main() -> io::Result<()> { ares::noun::use_gdb(); ares::serf::use_gdb(); ares::serialization::use_gdb(); - ares::snapshot::use_gdb(); - ares::snapshot::double_jam::use_gdb(); - ares::snapshot::pma::use_gdb(); } if filename == "serf" { diff --git a/rust/ares/src/mem.rs b/rust/ares/src/mem.rs index f53b63b..be06b31 100644 --- a/rust/ares/src/mem.rs +++ b/rust/ares/src/mem.rs @@ -2,7 +2,6 @@ use crate::assert_acyclic; use crate::assert_no_forwarding_pointers; use crate::assert_no_junior_pointers; use crate::noun::{Atom, Cell, CellMemory, IndirectAtom, Noun, NounAllocator}; -use crate::snapshot::pma::{pma_in_arena, pma_malloc_w}; use assert_no_alloc::permit_alloc; use either::Either::{self, Left, Right}; use ibig::Stack; @@ -526,79 +525,6 @@ impl NockStack { } } - pub unsafe fn copy_pma(&mut self, noun: &mut Noun) { - // copy_pma() should only be called when there is a single stack - // frame; these asserts assure that. - assert!( - self.is_west() - && (*(self.prev_stack_pointer_pointer())).is_null() - && (*(self.prev_frame_pointer_pointer())).is_null() - ); - assert!(self.stack_is_empty()); - let noun_ptr = noun as *mut Noun; - *(self.push::()) = *noun; - *(self.push::<*mut Noun>()) = noun_ptr; - loop { - if self.stack_is_empty() { - break; - } - - let next_dest = *(self.top::<*mut Noun>()); - self.pop::<*mut Noun>(); - let next_noun = *(self.top::()); - self.pop::(); - - match next_noun.as_either_direct_allocated() { - Either::Left(_direct) => { - *next_dest = next_noun; - } - Either::Right(allocated) => match allocated.forwarding_pointer() { - Option::Some(new_allocated) => { - *next_dest = new_allocated.as_noun(); - } - Option::None => { - if pma_in_arena(allocated.to_raw_pointer()) { - *next_dest = allocated.as_noun(); - } else { - match allocated.as_either() { - Either::Left(mut indirect) => { - let new_indirect_alloc = - pma_malloc_w(indirect_raw_size(indirect)); - - copy_nonoverlapping( - indirect.to_raw_pointer(), - new_indirect_alloc, - indirect_raw_size(indirect), - ); - - indirect.set_forwarding_pointer(new_indirect_alloc); - - *next_dest = IndirectAtom::from_raw_pointer(new_indirect_alloc) - .as_noun(); - } - Either::Right(mut cell) => { - let new_cell_alloc: *mut CellMemory = - pma_malloc_w(word_size_of::()); - - (*new_cell_alloc).metadata = (*cell.to_raw_pointer()).metadata; - - *(self.push::()) = cell.tail(); - *(self.push::<*mut Noun>()) = &mut (*new_cell_alloc).tail; - *(self.push::()) = cell.head(); - *(self.push::<*mut Noun>()) = &mut (*new_cell_alloc).head; - - cell.set_forwarding_pointer(new_cell_alloc); - - *next_dest = Cell::from_raw_pointer(new_cell_alloc).as_noun(); - } - } - } - } - }, - } - } - } - pub unsafe fn frame_pop(&mut self) { let prev_frame_ptr = *self.prev_frame_pointer_pointer(); let prev_stack_ptr = *self.prev_stack_pointer_pointer(); diff --git a/rust/ares/src/serf.rs b/rust/ares/src/serf.rs index 35a1bab..f3940ef 100644 --- a/rust/ares/src/serf.rs +++ b/rust/ares/src/serf.rs @@ -10,8 +10,6 @@ use crate::mem::NockStack; use crate::mug::*; use crate::newt::Newt; use crate::noun::{Atom, Cell, DirectAtom, Noun, Slots, D, T}; -use crate::snapshot::double_jam::DoubleJam; -use crate::snapshot::Snapshot; use crate::trace::*; use ares_macros::tas; use signal_hook; @@ -31,7 +29,7 @@ const FLAG_TRACE: u32 = 1 << 8; struct Context { epoch: u64, event_num: u64, - snapshot: DoubleJam, + snapshot: (), arvo: Noun, mug: u32, nock_context: 
interpreter::Context, @@ -41,16 +39,18 @@ impl Context { pub fn new(snap_path: &PathBuf, trace_info: Option) -> Self { // TODO: switch to Pma when ready // let snap = &mut snapshot::pma::Pma::new(snap_path); - let mut snapshot = DoubleJam::new(snap_path); let mut stack = NockStack::new(1024 << 10 << 10, 0); + let snapshot = (); // XX PMA let newt = Newt::new(); let cache = Hamt::::new(); - let cold = Cold::new(&mut stack); - let warm = Warm::new(); - let hot = Hot::init(&mut stack); - let (epoch, event_num, arvo) = snapshot.load(&mut stack).unwrap_or((0, 0, D(0))); + let (epoch, event_num, arvo, mut cold) = (0, 0, D(0), Cold::new(&mut stack)); // XX to load + // from PMA; + + let mut hot = Hot::init(&mut stack); + let warm = Warm::init(&mut stack, &mut cold, &mut hot); + let mug = mug_u32(&mut stack, arvo); let nock_context = interpreter::Context { @@ -82,8 +82,7 @@ impl Context { // XX: assert event numbers are continuous self.arvo = new_arvo; self.event_num = new_event_num; - self.snapshot - .save(&mut self.nock_context.stack, &mut self.arvo); + // XX save to PMA self.mug = mug_u32(&mut self.nock_context.stack, self.arvo); } @@ -92,8 +91,7 @@ impl Context { // pub fn sync(&mut self) { - self.snapshot - .sync(&mut self.nock_context.stack, self.epoch, self.event_num); + // XX save to PMA } // diff --git a/rust/ares/src/snapshot.rs b/rust/ares/src/snapshot.rs deleted file mode 100644 index a8de436..0000000 --- a/rust/ares/src/snapshot.rs +++ /dev/null @@ -1,13 +0,0 @@ -use crate::mem::NockStack; -use crate::noun::Noun; - -pub mod double_jam; -pub mod pma; - -crate::gdb!(); - -pub trait Snapshot { - fn load(&mut self, stack: &mut NockStack) -> std::io::Result<(u64, u64, Noun)>; - fn save(&mut self, stack: &mut NockStack, noun: &mut Noun); - fn sync(&mut self, stack: &mut NockStack, epoch: u64, event: u64); -} diff --git a/rust/ares/src/snapshot/double_jam.rs b/rust/ares/src/snapshot/double_jam.rs deleted file mode 100644 index e021d2e..0000000 --- a/rust/ares/src/snapshot/double_jam.rs +++ /dev/null @@ -1,171 +0,0 @@ -/** Jam-based snapshotting - * - * This is a simple checkpoint system that should be safe but has (very) poor performance. This is - * intended as a working placeholder until the real PMA is hooked up. - * - * This keeps two files, .urb/chk/snapshot-0.jam and .urbit/chk/snapshot-1.jam. Each of these - * contains 64 bits for a mug checksum, then 64 bits for the event number, then a jam of the state. - * We alternate between writing these two files, so that at least one is always valid. - * - * When we start up, we read both files and pick the one with the higher event number. If either - * is corrupted, we use the other. 
- */ -use super::Snapshot; -use crate::mem::NockStack; -use crate::mug::mug_u32; -use crate::noun::{IndirectAtom, Noun, D}; -use crate::serialization::{cue, jam}; -use either::Either; -use memmap::Mmap; -use memmap::MmapMut; -use std::fs::{File, OpenOptions}; -use std::io; -use std::mem; -use std::path::{Path, PathBuf}; -use std::ptr::copy_nonoverlapping; -use std::ptr::write_bytes; - -crate::gdb!(); - -pub struct DoubleJam { - path: PathBuf, - noun: Noun, -} - -impl DoubleJam { - pub fn new>(path: P) -> Self { - Self { - path: path.as_ref().to_path_buf(), - noun: D(0), - } - } - - fn latest_snapshot(&self, stack: &mut NockStack) -> io::Result<(u8, u64, IndirectAtom)> { - let res0 = self.load_snapshot(stack, 0); - let res1 = self.load_snapshot(stack, 1); - - match (res0, res1) { - (Ok((event_number_0, state_0)), Ok((event_number_1, state_1))) => { - if event_number_0 > event_number_1 { - Ok((0, event_number_0, state_0)) - } else { - Ok((1, event_number_1, state_1)) - } - } - (Ok((event_number_0, state_0)), Err(_)) => Ok((0, event_number_0, state_0)), - (Err(_), Ok((event_number_1, state_1))) => Ok((1, event_number_1, state_1)), - (Err(_), Err(_)) => Err(io::Error::new( - io::ErrorKind::NotFound, - "no valid snapshot found", - )), - } - } - - fn load_snapshot(&self, stack: &mut NockStack, number: u8) -> io::Result<(u64, IndirectAtom)> { - let path = self.path.join(format!("snapshot-{}.jam", number)); - - eprintln!("\rload: snapshot at {:?}", path); - - let f = File::open(path)?; - - let in_len = f.metadata().unwrap().len() - 8; - let word_len = (in_len + 7) >> 3; - let (event_number, state) = unsafe { - let in_map = Mmap::map(&f).unwrap(); - let in_ptr = in_map.as_ptr(); - let (mut state, dest) = IndirectAtom::new_raw_mut(stack, word_len as usize); - let mugged = (*in_ptr.add(0) as u32) - | ((*in_ptr.add(1) as u32) << 8) - | ((*in_ptr.add(2) as u32) << 16) - | ((*in_ptr.add(3) as u32) << 24); - write_bytes(dest.add(word_len as usize - 1), 0, 1); - copy_nonoverlapping(in_ptr.add(8), dest as *mut u8, in_len as usize); - mem::drop(in_map); - state.normalize(); // know it's not direct because first word is event number - - if mug_u32(stack, state.as_noun()) != mugged { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "snapshot checksum mismatch", - )); - } - - (*state.data_pointer(), state) - }; - - Ok((event_number, state)) - } -} - -impl Snapshot for DoubleJam { - fn load(&mut self, stack: &mut NockStack) -> io::Result<(u64, u64, Noun)> { - let (_num, event_number, state) = self.latest_snapshot(stack)?; - - let jammed_arvo = - unsafe { IndirectAtom::new_raw(stack, state.size() - 1, state.data_pointer().add(1)) }; - - let arvo = cue(stack, jammed_arvo.as_atom()); - - Ok((0, event_number, arvo)) - } - - fn save(&mut self, _stack: &mut NockStack, noun: &mut Noun) { - // XX: I don't think this needs to be mut - self.noun = *noun; - } - - fn sync(&mut self, stack: &mut NockStack, _epoch: u64, event_number: u64) { - // Find the latest valid snapshot, and write to the other file. 
- let prev_snap = if let Ok((prev_snap, _, _)) = self.latest_snapshot(stack) { - prev_snap - } else { - 0 - }; - let snap_number = if prev_snap == 0 { 1 } else { 0 }; - let path = self.path.join(format!("snapshot-{}.jam", snap_number)); - - let jammed_arvo = jam(stack, self.noun); - let state = unsafe { - let (mut state, dest) = IndirectAtom::new_raw_mut(stack, jammed_arvo.size() + 1); - dest.write(event_number); - match jammed_arvo.as_either() { - Either::Left(direct) => { - copy_nonoverlapping(&direct.data() as *const u64, dest.add(1), 1); - } - Either::Right(indirect) => { - copy_nonoverlapping(indirect.data_pointer(), dest.add(1), jammed_arvo.size()); - } - }; - state.normalize_as_atom() - }; - - let mugged = mug_u32(stack, state.as_noun()); - - let f = OpenOptions::new() - .read(true) - .write(true) - .create(true) - .open(path) - .unwrap(); - - f.set_len(((state.size() + 1) << 3) as u64).unwrap(); - unsafe { - let mut out_map = MmapMut::map_mut(&f).unwrap(); - let out_ptr = out_map.as_mut_ptr(); - out_ptr.add(0).write(mugged as u8); - out_ptr.add(1).write((mugged >> 8) as u8); - out_ptr.add(2).write((mugged >> 16) as u8); - out_ptr.add(3).write((mugged >> 24) as u8); - copy_nonoverlapping( - state.data_pointer() as *mut u8, - out_ptr.add(8), - state.size() << 3, - ); - out_map.flush().unwrap(); - - // This appears to match c3/portable.h: fdatasync for linux, fcntl with F_FULLFSYNC for for - // macos, and fsync for some other platforms. - f.sync_data().unwrap(); - }; - } -} diff --git a/rust/ares/src/snapshot/pma.rs b/rust/ares/src/snapshot/pma.rs deleted file mode 100644 index 28fba4b..0000000 --- a/rust/ares/src/snapshot/pma.rs +++ /dev/null @@ -1,123 +0,0 @@ -use super::Snapshot; -use crate::mem::NockStack; -use crate::mug::mug_u32; -use crate::noun::{Noun, D}; -use libc::{c_char, c_int, c_void, size_t}; -use std::ffi::CString; -use std::path::{Path, PathBuf}; - -crate::gdb!(); - -mod raw { - use super::*; - - #[repr(C)] - pub struct RootState { - pub epoch: u64, - pub event: u64, - pub root: u64, - } - - #[link(name = "pma_malloc", kind = "static")] - extern "C" { - pub(super) fn pma_init(path: *const c_char) -> c_int; - pub(super) fn pma_load(path: *const c_char) -> RootState; - pub(super) fn pma_close(epoch: u64, event: u64, root: u64) -> c_int; - pub(super) fn pma_malloc(size: size_t) -> *mut c_void; - pub(super) fn pma_free(ptr: *mut c_void) -> c_int; - pub(super) fn pma_sync(epoch: u64, event: u64, root: u64) -> c_int; - pub(super) fn pma_in_arena(ptr: *const c_void) -> bool; - } - - #[link(name = "test_pma_malloc_unit", kind = "static")] - extern "C" { - pub(super) fn test_pma(path: *const c_char) -> c_void; - } -} - -unsafe fn pma_init>(path: P) -> i32 { - let path = CString::new(path.as_ref().to_str().unwrap()).unwrap(); - raw::pma_init(path.as_ptr()) -} - -unsafe fn pma_load>(path: P) -> (u64, u64, Noun) { - let path = CString::new(path.as_ref().to_str().unwrap()).unwrap(); - let rs = raw::pma_load(path.as_ptr()); - (rs.epoch, rs.event, Noun::from_raw(rs.root)) -} - -#[allow(dead_code)] -unsafe fn pma_close(epoch: u64, event: u64, root: Noun) -> i32 { - raw::pma_close(epoch, event, root.as_raw()) -} - -pub fn pma_malloc(size: usize) -> *mut T { - unsafe { raw::pma_malloc(size as size_t) as *mut T } -} - -/** Allocate a block of memory from the persistent memory arena. - * - * Size is in *words*, unlike the underlying pma_malloc. 
- */ -pub fn pma_malloc_w(size: usize) -> *mut T { - unsafe { raw::pma_malloc(size << 3 as size_t) as *mut T } -} - -#[allow(dead_code)] -unsafe fn pma_free(ptr: *mut T) -> i32 { - raw::pma_free(ptr as *mut c_void) -} - -unsafe fn pma_sync(epoch: u64, event: u64, root: Noun) -> i32 { - raw::pma_sync(epoch, event, root.as_raw()) -} - -pub fn pma_in_arena(ptr: *const T) -> bool { - unsafe { raw::pma_in_arena(ptr as *const c_void) } -} - -#[allow(dead_code)] -unsafe fn test_pma>(path: P) { - let path = CString::new(path.as_ref().to_str().unwrap()).unwrap(); - raw::test_pma(path.as_ptr()); -} - -pub struct Pma { - path: PathBuf, - noun: Noun, -} - -impl Pma { - pub fn new>(path: P) -> Self { - let path = path.as_ref().to_path_buf(); - Self { path, noun: D(0) } - } -} - -impl Snapshot for Pma { - fn save(&mut self, stack: &mut NockStack, noun: &mut Noun) { - // Required so everything in the PMA has a cached mug, otherwise we would try to write - let _mug = mug_u32(stack, *noun); - - unsafe { stack.copy_pma(noun) }; - self.noun = *noun; - } - - fn sync(&mut self, _stack: &mut NockStack, epoch: u64, event: u64) { - unsafe { - pma_sync(epoch, event, self.noun); - } - } - - fn load(&mut self, _stack: &mut NockStack) -> std::io::Result<(u64, u64, Noun)> { - let path = self.path.join(".bin/page.bin"); - if path.is_file() { - eprintln!("\rload: found snapshot at {:?}", path); - unsafe { Ok(pma_load(&self.path)) } - } else { - eprintln!("\rload: creating snapshot at {:?}", path); - unsafe { pma_init(&self.path) }; - Ok((0, 0, D(0))) - } - } -} From 267a1a3cd522b4e8c217d96e9a9f89f74402e84d Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 4 Dec 2023 16:24:39 -0600 Subject: [PATCH 028/128] pma: skeleton of integration into ares --- rust/ares/Cargo.lock | 34 ++++++++- rust/ares/Cargo.toml | 1 + rust/ares/src/jets/cold.rs | 13 ++++ rust/ares/src/lib.rs | 1 + rust/ares/src/persist.rs | 147 ++++++++++++++++++++++++++++++++++++ rust/ares_pma/c-src/btree.c | 2 - rust/ares_pma/c-src/btree.h | 3 + 7 files changed, 198 insertions(+), 3 deletions(-) create mode 100644 rust/ares/src/persist.rs diff --git a/rust/ares/Cargo.lock b/rust/ares/Cargo.lock index 7bee7f8..679043f 100644 --- a/rust/ares/Cargo.lock +++ b/rust/ares/Cargo.lock @@ -22,6 +22,7 @@ name = "ares" version = "0.1.0" dependencies = [ "ares_macros", + "ares_pma", "assert_no_alloc", "autotools", "bitvec", @@ -50,6 +51,14 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "ares_pma" +version = "0.1.0" +dependencies = [ + "bindgen 0.69.1", + "cc", +] + [[package]] name = "assert_no_alloc" version = "1.1.2" @@ -105,6 +114,29 @@ dependencies = [ "which", ] +[[package]] +name = "bindgen" +version = "0.69.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2" +dependencies = [ + "bitflags 2.4.1", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.39", + "which", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -827,7 +859,7 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced751f95a527a3458eb67c75e4ae7093d41585edaa7565f5769101502473019" dependencies = [ - "bindgen", + "bindgen 0.68.1", "pkg-config", ] diff --git a/rust/ares/Cargo.toml b/rust/ares/Cargo.toml index 27d1199..23316f7 100644 --- a/rust/ares/Cargo.toml +++ b/rust/ares/Cargo.toml @@ -12,6 +12,7 @@ edition = 
"2018" # Please keep these alphabetized [dependencies] ares_macros = { path = "../ares_macros" } +ares_pma = { path = "../ares_pma" } assert_no_alloc = "1.1.2" # use this when debugging requires allocation (e.g. eprintln) # assert_no_alloc = {version="1.1.2", features=["warn_debug"]} diff --git a/rust/ares/src/jets/cold.rs b/rust/ares/src/jets/cold.rs index 29a8564..c904cdc 100644 --- a/rust/ares/src/jets/cold.rs +++ b/rust/ares/src/jets/cold.rs @@ -4,6 +4,7 @@ use crate::noun; use crate::noun::{Atom, DirectAtom, Noun, Slots, D, T}; use std::ptr::copy_nonoverlapping; use std::ptr::null_mut; +use crate::persist::{Persist, PMA}; pub enum Error { NoParent, @@ -267,6 +268,7 @@ impl Iterator for NounList { } } +#[derive(Copy,Clone)] pub struct Cold(*mut ColdMem); struct ColdMem { @@ -291,6 +293,17 @@ struct ColdMem { path_to_batteries: Hamt, } +impl Persist for Cold { + unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { + todo!() + } + + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: *mut u8) -> u64 { + todo!() + } +} + + impl Preserve for Cold { unsafe fn assert_in_stack(&self, stack: &NockStack) { stack.assert_struct_is_in(self.0, 1); diff --git a/rust/ares/src/lib.rs b/rust/ares/src/lib.rs index 8393ff9..c0bb7a3 100644 --- a/rust/ares/src/lib.rs +++ b/rust/ares/src/lib.rs @@ -14,6 +14,7 @@ pub mod serf; //pub mod bytecode; pub mod serialization; pub mod trace; +pub mod persist; /** Introduce useful functions for debugging * diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs new file mode 100644 index 0000000..4a8b138 --- /dev/null +++ b/rust/ares/src/persist.rs @@ -0,0 +1,147 @@ +use ares_pma::*; +use crate::mem::NockStack; +use crate::noun::Noun; +use crate::jets::cold::Cold; +use std::path::PathBuf; +use std::ffi::CString; +use std::mem::size_of; + +const PMA_MODE: mode_t = 0o600; // RW for user only +const PMA_FLAGS: ULONG = 0; // ignored for now + +const PMA_CURRENT_SNAPSHOT_VERSION: u64 = 1; + +/// Handle to a PMA +pub struct PMA(*mut BT_state); + +pub struct Snapshot(*mut SnapshotMem); + +#[repr(C)] +#[repr(packed)] +pub struct SnapshotMem { + pub epoch: u64, + pub event_num: u64, + pub arvo: Noun, + pub cold: Cold, + pub mug: u32, +} + +#[repr(usize)] +enum BTMetaField { + SnapshotVersion = 0, + Snapshot = 1, +} + +impl PMA { + #[cfg(unix)] + pub fn open(path: PathBuf) -> Result { + let mut state: *mut BT_state = std::ptr::null_mut(); + + // correct for Unix thus cfg gated + let path_cstring = CString::new(path.into_os_string().as_encoded_bytes())?; + unsafe { + bt_state_new(&mut state); + let err = bt_state_open(state, path_cstring.as_ptr(), PMA_FLAGS, PMA_MODE); + if err == 0 { + Ok(PMA(state)) + } else { + // XX need to free the state + Err(std::io::Error::from_raw_os_error(err)) + } + } + } + + #[cfg(windows)] + pub fn open(path: PathBuf) -> Result { + todo!() + } + + #[inline] + fn meta_get(&self, field: BTMetaField) -> u64 { + unsafe { bt_meta_get(self.0, field as usize) } + } + + #[inline] + fn meta_set(&self, field: BTMetaField, val: u64) { + unsafe { bt_meta_set(self.0, field as usize, val) }; + } + + pub fn load(&self) -> Snapshot { + let snapshot_version = self.meta_get(BTMetaField::SnapshotVersion); + + match snapshot_version { + 1 => { + Snapshot(self.meta_get(BTMetaField::Snapshot) as *mut SnapshotMem) + } + _ => panic!("Unsupported snapshot version") + } + } + + pub fn save(&self, stack: &mut NockStack, snapshot: &mut Snapshot) { + self.meta_set(BTMetaField::SnapshotVersion, 
PMA_CURRENT_SNAPSHOT_VERSION); + self.meta_set(BTMetaField::Snapshot, snapshot.save_to_pma(stack, self)); + } + + pub fn sync(&self) { + unsafe { + if bt_sync(self.0) != 0 { + panic!("PMA sync failed but did not abort: this should never happen."); + } + } + } + + pub fn close(self) -> Result<(), std::io::Error> { + // XX need a way to free the state after + let err = unsafe { bt_state_close(self.0) }; + if err == 0 { + Ok(()) + } else { + Err(std::io::Error::from_raw_os_error(err)) + } + } +} + +pub trait Persist { + /// Count how much space is needed, in bytes. May set marks so long as marks are cleaned up by + /// [copy_into_buffer] + unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize; + + /// Copy into the provided buffer, which may be assumed to be at least as large as the size + /// returned by [space_needed] on the same structure. Return a u64 handle that could be saved + /// in metadata + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: *mut u8) -> u64; + + /// Persist an object into the PMA using [space_needed] and [copy_to_buffer], returning + /// a [u64] (probably a pointer or tagged pointer) that can be saved into + fn save_to_pma(&mut self, stack: &mut NockStack, pma: &PMA) -> u64 { + unsafe { + let space_as_pages = self.space_needed(stack, pma) + (BT_PAGESIZE as usize - 1) >> BT_PAGEBITS; + let buffer = bt_malloc(pma.0, space_as_pages) as *mut u8; + self.copy_to_buffer(stack, pma, buffer) + } + } +} + +impl Persist for Snapshot { + unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { + let mut arvo = (*(self.0)).arvo; + let mut cold = (*(self.0)).cold; + let arvo_space_needed = arvo.space_needed(stack, pma); + let cold_space_needed = cold.space_needed(stack, pma); + ((size_of::() + 7 >> 3) << 3) + arvo_space_needed + cold_space_needed + } + + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: *mut u8) -> u64 { + todo!() + } +} + +impl Persist for Noun { + unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { + todo!() + } + + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: *mut u8) -> u64 { + todo!() + } +} diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 97c830e..18a6bbc 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -115,9 +115,7 @@ off2addr(vaof_t off) return (void *)pu; } -#define BT_PAGEBITS 14ULL #define BT_PAGEWORD 32ULL -#define BT_PAGESIZE (1ULL << BT_PAGEBITS) /* 16K */ #define BT_NUMMETAS 2 /* 2 metapages */ #define BT_ADDRSIZE (BT_PAGESIZE << BT_PAGEWORD) #define PMA_GROW_SIZE (BT_PAGESIZE * 1024) diff --git a/rust/ares_pma/c-src/btree.h b/rust/ares_pma/c-src/btree.h index 94b964b..aad81e0 100644 --- a/rust/ares_pma/c-src/btree.h +++ b/rust/ares_pma/c-src/btree.h @@ -6,6 +6,9 @@ struct BT_state; typedef struct BT_state BT_state; +#define BT_PAGEBITS 14ULL +#define BT_PAGESIZE (1ULL << BT_PAGEBITS) /* 16K */ + typedef unsigned long ULONG; //// =========================================================================== From 7112890b9689ead233cfc8d5585d1660ac4bb1be Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 4 Dec 2023 18:07:51 -0600 Subject: [PATCH 029/128] pma: take snapshots from serf --- rust/ares/src/jets/cold.rs | 18 +++++++--- rust/ares/src/lib.rs | 2 +- rust/ares/src/persist.rs | 74 ++++++++++++++++++++++++++++---------- rust/ares/src/serf.rs | 56 ++++++++++++++++++++++++----- 4 files changed, 116 insertions(+), 34 deletions(-) 
diff --git a/rust/ares/src/jets/cold.rs b/rust/ares/src/jets/cold.rs index c904cdc..08c427d 100644 --- a/rust/ares/src/jets/cold.rs +++ b/rust/ares/src/jets/cold.rs @@ -2,9 +2,9 @@ use crate::hamt::Hamt; use crate::mem::{unifying_equality, NockStack, Preserve}; use crate::noun; use crate::noun::{Atom, DirectAtom, Noun, Slots, D, T}; +use crate::persist::{Persist, PMA}; use std::ptr::copy_nonoverlapping; use std::ptr::null_mut; -use crate::persist::{Persist, PMA}; pub enum Error { NoParent, @@ -268,7 +268,7 @@ impl Iterator for NounList { } } -#[derive(Copy,Clone)] +#[derive(Copy, Clone)] pub struct Cold(*mut ColdMem); struct ColdMem { @@ -297,12 +297,20 @@ impl Persist for Cold { unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { todo!() } - - unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: *mut u8) -> u64 { + + unsafe fn copy_to_buffer( + &mut self, + stack: &mut NockStack, + pma: &PMA, + buffer: *mut u8, + ) -> (u64, *mut u8) { todo!() } -} + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + Cold(meta_handle as *mut ColdMem) + } +} impl Preserve for Cold { unsafe fn assert_in_stack(&self, stack: &NockStack) { diff --git a/rust/ares/src/lib.rs b/rust/ares/src/lib.rs index c0bb7a3..c90cb87 100644 --- a/rust/ares/src/lib.rs +++ b/rust/ares/src/lib.rs @@ -12,9 +12,9 @@ pub mod newt; pub mod noun; pub mod serf; //pub mod bytecode; +pub mod persist; pub mod serialization; pub mod trace; -pub mod persist; /** Introduce useful functions for debugging * diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index 4a8b138..99820cc 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -1,20 +1,20 @@ -use ares_pma::*; +use crate::jets::cold::Cold; use crate::mem::NockStack; use crate::noun::Noun; -use crate::jets::cold::Cold; -use std::path::PathBuf; +use ares_pma::*; use std::ffi::CString; use std::mem::size_of; +use std::path::PathBuf; const PMA_MODE: mode_t = 0o600; // RW for user only const PMA_FLAGS: ULONG = 0; // ignored for now - + const PMA_CURRENT_SNAPSHOT_VERSION: u64 = 1; /// Handle to a PMA pub struct PMA(*mut BT_state); -pub struct Snapshot(*mut SnapshotMem); +pub struct Snapshot(pub *mut SnapshotMem); #[repr(C)] #[repr(packed)] @@ -23,7 +23,6 @@ pub struct SnapshotMem { pub event_num: u64, pub arvo: Noun, pub cold: Cold, - pub mug: u32, } #[repr(usize)] @@ -70,10 +69,8 @@ impl PMA { let snapshot_version = self.meta_get(BTMetaField::SnapshotVersion); match snapshot_version { - 1 => { - Snapshot(self.meta_get(BTMetaField::Snapshot) as *mut SnapshotMem) - } - _ => panic!("Unsupported snapshot version") + 1 => Snapshot(self.meta_get(BTMetaField::Snapshot) as *mut SnapshotMem), + _ => panic!("Unsupported snapshot version"), } } @@ -83,7 +80,7 @@ impl PMA { } pub fn sync(&self) { - unsafe { + unsafe { if bt_sync(self.0) != 0 { panic!("PMA sync failed but did not abort: this should never happen."); } @@ -109,17 +106,25 @@ pub trait Persist { /// Copy into the provided buffer, which may be assumed to be at least as large as the size /// returned by [space_needed] on the same structure. 
Return a u64 handle that could be saved /// in metadata - unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: *mut u8) -> u64; + unsafe fn copy_to_buffer( + &mut self, + stack: &mut NockStack, + pma: &PMA, + buffer: *mut u8, + ) -> (u64, *mut u8); /// Persist an object into the PMA using [space_needed] and [copy_to_buffer], returning - /// a [u64] (probably a pointer or tagged pointer) that can be saved into + /// a [u64] (probably a pointer or tagged pointer) that can be saved into fn save_to_pma(&mut self, stack: &mut NockStack, pma: &PMA) -> u64 { unsafe { - let space_as_pages = self.space_needed(stack, pma) + (BT_PAGESIZE as usize - 1) >> BT_PAGEBITS; + let space_as_pages = + self.space_needed(stack, pma) + (BT_PAGESIZE as usize - 1) >> BT_PAGEBITS; let buffer = bt_malloc(pma.0, space_as_pages) as *mut u8; - self.copy_to_buffer(stack, pma, buffer) + self.copy_to_buffer(stack, pma, buffer).0 } } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self; } impl Persist for Snapshot { @@ -128,11 +133,33 @@ impl Persist for Snapshot { let mut cold = (*(self.0)).cold; let arvo_space_needed = arvo.space_needed(stack, pma); let cold_space_needed = cold.space_needed(stack, pma); - ((size_of::() + 7 >> 3) << 3) + arvo_space_needed + cold_space_needed + (((size_of::() + 7) >> 3) << 3) + arvo_space_needed + cold_space_needed } - unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: *mut u8) -> u64 { - todo!() + unsafe fn copy_to_buffer( + &mut self, + stack: &mut NockStack, + pma: &PMA, + buffer: *mut u8, + ) -> (u64, *mut u8) { + let snapshot_buffer = buffer as *mut SnapshotMem; + let arvo_buffer = buffer.add(((size_of::() + 7) >> 3) << 3); + std::ptr::copy_nonoverlapping(self.0, snapshot_buffer, 1); + *self = Snapshot(snapshot_buffer); + + let mut arvo = (*snapshot_buffer).arvo; + let (_, cold_buffer) = arvo.copy_to_buffer(stack, pma, arvo_buffer); + (*snapshot_buffer).arvo = arvo; + + let mut cold = (*snapshot_buffer).cold; + let (_, rest_buffer) = cold.copy_to_buffer(stack, pma, cold_buffer); + (*snapshot_buffer).cold = cold; + + (snapshot_buffer as u64, rest_buffer) + } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + Snapshot(meta_handle as *mut SnapshotMem) } } @@ -141,7 +168,16 @@ impl Persist for Noun { todo!() } - unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: *mut u8) -> u64 { + unsafe fn copy_to_buffer( + &mut self, + stack: &mut NockStack, + pma: &PMA, + buffer: *mut u8, + ) -> (u64, *mut u8) { todo!() } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + Noun::from_raw(meta_handle) + } } diff --git a/rust/ares/src/serf.rs b/rust/ares/src/serf.rs index f3940ef..12f071b 100644 --- a/rust/ares/src/serf.rs +++ b/rust/ares/src/serf.rs @@ -10,6 +10,7 @@ use crate::mem::NockStack; use crate::mug::*; use crate::newt::Newt; use crate::noun::{Atom, Cell, DirectAtom, Noun, Slots, D, T}; +use crate::persist::{Snapshot, SnapshotMem, PMA}; use crate::trace::*; use ares_macros::tas; use signal_hook; @@ -29,24 +30,30 @@ const FLAG_TRACE: u32 = 1 << 8; struct Context { epoch: u64, event_num: u64, - snapshot: (), + pma: PMA, arvo: Noun, mug: u32, nock_context: interpreter::Context, } impl Context { - pub fn new(snap_path: &PathBuf, trace_info: Option) -> Self { + pub fn new(snap_path: PathBuf, trace_info: Option) -> Self { // TODO: switch to Pma when ready // let snap = &mut snapshot::pma::Pma::new(snap_path); let mut stack = NockStack::new(1024 << 10 << 10, 0); - let snapshot = (); // XX PMA let newt = 
Newt::new(); let cache = Hamt::::new(); + let pma = PMA::open(snap_path).unwrap(); - - let (epoch, event_num, arvo, mut cold) = (0, 0, D(0), Cold::new(&mut stack)); // XX to load - // from PMA; + let (epoch, event_num, arvo, mut cold) = unsafe { + let snapshot = pma.load(); + ( + (*(snapshot.0)).epoch, + (*(snapshot.0)).event_num, + (*(snapshot.0)).arvo, + (*(snapshot.0)).cold, + ) + }; let mut hot = Hot::init(&mut stack); let warm = Warm::init(&mut stack, &mut cold, &mut hot); @@ -67,7 +74,7 @@ impl Context { Context { epoch, event_num, - snapshot, + pma, arvo, mug, nock_context, @@ -82,6 +89,37 @@ impl Context { // XX: assert event numbers are continuous self.arvo = new_arvo; self.event_num = new_event_num; + let snapshot = unsafe { + let snapshot_mem_ptr: *mut SnapshotMem = self.nock_context.stack.struct_alloc(1); + + // Save into PMA (does not sync) + (*snapshot_mem_ptr).epoch = self.epoch; + (*snapshot_mem_ptr).event_num = self.event_num; + (*snapshot_mem_ptr).arvo = self.arvo; + (*snapshot_mem_ptr).cold = self.nock_context.cold; + let mut snapshot = Snapshot(snapshot_mem_ptr); + self.pma.save(&mut self.nock_context.stack, &mut snapshot); + snapshot + }; + + // reset pointers in context to PMA + unsafe { + self.arvo = (*(snapshot.0)).arvo; + self.nock_context.cold = (*(snapshot.0)).cold; + } + + // Reset the nock stack, freeing all memory used to compute the event + self.nock_context.stack.reset(0); + + // XX some things were invalidated when we reset the stack + self.nock_context.warm = Warm::init( + &mut self.nock_context.stack, + &mut self.nock_context.cold, + &mut self.nock_context.hot, + ); + self.nock_context.cache = Hamt::new(); + self.nock_context.scry_stack = D(0); + // XX save to PMA self.mug = mug_u32(&mut self.nock_context.stack, self.arvo); } @@ -91,7 +129,7 @@ impl Context { // pub fn sync(&mut self) { - // XX save to PMA + self.pma.sync() } // @@ -212,7 +250,7 @@ pub fn serf() -> io::Result<()> { } } - let mut context = Context::new(&snap_path, trace_info); + let mut context = Context::new(snap_path, trace_info); context.ripe(); // Can't use for loop because it borrows newt From 1395d4b88579499671bc0711747b66d4f95f1c39 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 4 Dec 2023 18:18:29 -0600 Subject: [PATCH 030/128] pma: remove previous PMA code --- rust/ares/build.rs | 66 - rust/ares/src/pma/README.md | 8 - rust/ares/src/pma/includes/checksum.c | 134 -- rust/ares/src/pma/includes/checksum.h | 66 - rust/ares/src/pma/malloc.c | 2167 ------------------------- rust/ares/src/pma/malloc.h | 118 -- rust/ares/src/pma/test/internals.h | 198 --- rust/ares/src/pma/test/malloc.c | 1511 ----------------- 8 files changed, 4268 deletions(-) delete mode 100644 rust/ares/build.rs delete mode 100644 rust/ares/src/pma/README.md delete mode 100644 rust/ares/src/pma/includes/checksum.c delete mode 100644 rust/ares/src/pma/includes/checksum.h delete mode 100644 rust/ares/src/pma/malloc.c delete mode 100644 rust/ares/src/pma/malloc.h delete mode 100644 rust/ares/src/pma/test/internals.h delete mode 100644 rust/ares/src/pma/test/malloc.c diff --git a/rust/ares/build.rs b/rust/ares/build.rs deleted file mode 100644 index 9b2e41e..0000000 --- a/rust/ares/build.rs +++ /dev/null @@ -1,66 +0,0 @@ -fn main() { - use std::env; - let profile = env::var("PROFILE").unwrap(); - - println!("cargo:rerun-if-changed=build.rs"); - println!("cargo:rerun-if-changed=./src/pma"); - - match profile.as_ref() { - "debug" => debug(), - "release" => release(), - _ => { - println!("unknown profile: {}", 
profile); - std::process::exit(-1); - } - } - - cc::Build::new() - .file("./src/pma/test/malloc.c") - .opt_level(0) - .flag("-g3") - .flag("-Wno-int-conversion") - .flag("-w") - .compile("test_pma_malloc_unit"); -} - -fn debug() { - cc::Build::new() - .file("./src/pma/malloc.c") - .file("./src/pma/includes/checksum.c") - .opt_level(0) - .flag("-g3") - .flag("-Wall") - .flag("-Wextra") - .flag("-Wpedantic") - .flag("-Wformat=2") - .flag("-Wno-unused-parameter") - .flag("-Wshadow") - .flag("-Wwrite-strings") - .flag("-Wstrict-prototypes") - .flag("-Wold-style-definition") - .flag("-Wredundant-decls") - .flag("-Wnested-externs") - .flag("-Wmissing-include-dirs") - .compile("pma_malloc"); -} - -fn release() { - cc::Build::new() - .file("./src/pma/malloc.c") - .file("./src/pma/includes/checksum.c") - .warnings_into_errors(true) - .opt_level(3) - .flag("-Wall") - .flag("-Wextra") - .flag("-Wpedantic") - .flag("-Wformat=2") - .flag("-Wno-unused-parameter") - .flag("-Wshadow") - .flag("-Wwrite-strings") - .flag("-Wstrict-prototypes") - .flag("-Wold-style-definition") - .flag("-Wredundant-decls") - .flag("-Wnested-externs") - .flag("-Wmissing-include-dirs") - .compile("pma_malloc"); -} diff --git a/rust/ares/src/pma/README.md b/rust/ares/src/pma/README.md deleted file mode 100644 index a2cd3d1..0000000 --- a/rust/ares/src/pma/README.md +++ /dev/null @@ -1,8 +0,0 @@ -## PMA - TODO - -Ported from development in a -[separate repo](https://github.com/ashelkovnykov/pma_malloc). README will be -updated after the final implementation is complete, which replaces the -array-based page directory with a B+ Tree one. Until then, please refer to the -README in the above-linked directory. - diff --git a/rust/ares/src/pma/includes/checksum.c b/rust/ares/src/pma/includes/checksum.c deleted file mode 100644 index faa69c9..0000000 --- a/rust/ares/src/pma/includes/checksum.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Library: libcrc - * File: src/crc32.c (herein src/includes/checksum.c) - * Author: Lammert Bies - * - * This file is licensed under the MIT License as stated below - * - * Copyright (c) 1999-2016 Lammert Bies - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * ----------- - * Accessed in 2023 by Alex Shelkovnykov on behalf of Tlon Corporation from - * https://github.com/lammertb/libcrc/tree/v2.0. - * - * Description - * ----------- - * The source file src/includes/checksum.c contains the routines which are - * needed to calculate a 32 bit CRC value of a sequence of bytes. 
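 *
 * Within the PMA, crc_32() is used to checksum the metadata page (with its
 * checksum field zeroed out) both when a snapshot is synced and when it is
 * validated on load; see pma_sync() and _pma_verify_checksum() in malloc.c.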
- */ - -#include -#include -#include "checksum.h" - -static void init_crc32_tab( void ); - -static bool crc_tab32_init = false; -static uint32_t crc_tab32[256]; - -/* - * uint32_t crc_32( const unsigned char *input_str, size_t num_bytes ); - * - * The function crc_32() calculates in one pass the common 32 bit CRC value for - * a byte string that is passed to the function together with a parameter - * indicating the length. - */ -uint32_t crc_32( const unsigned char *input_str, size_t num_bytes ) { - - uint32_t crc; - uint32_t tmp; - uint32_t long_c; - const unsigned char *ptr; - size_t a; - - if ( ! crc_tab32_init ) init_crc32_tab(); - - crc = CRC_START_32; - ptr = input_str; - - if ( ptr != NULL ) for (a=0; a> 8) ^ crc_tab32[ tmp & 0xff ]; - - ptr++; - } - - crc ^= 0xffffffffL; - - return crc & 0xffffffffL; - -} /* crc_32 */ - -/* - * uint32_t update_crc_32( uint32_t crc, unsigned char c ); - * - * The function update_crc_32() calculates a new CRC-32 value based on the - * previous value of the CRC and the next byte of the data to be checked. - */ - -uint32_t update_crc_32( uint32_t crc, unsigned char c ) { - - uint32_t tmp; - uint32_t long_c; - - long_c = 0x000000ffL & (uint32_t) c; - - if ( ! crc_tab32_init ) init_crc32_tab(); - - tmp = crc ^ long_c; - crc = (crc >> 8) ^ crc_tab32[ tmp & 0xff ]; - - return crc & 0xffffffffL;; - -} /* update_crc_32 */ - -/* - * static void init_crc32_tab( void ); - * - * For optimal speed, the CRC32 calculation uses a table with pre-calculated - * bit patterns which are used in the XOR operations in the program. This table - * is generated once, the first time the CRC update routine is called. - */ - -static void init_crc32_tab( void ) { - - uint32_t i; - uint32_t j; - uint32_t crc; - - for (i=0; i<256; i++) { - - crc = i; - - for (j=0; j<8; j++) { - - if ( crc & 0x00000001L ) crc = ( crc >> 1 ) ^ CRC_POLY_32; - else crc = crc >> 1; - } - - crc_tab32[i] = crc; - } - - crc_tab32_init = true; - -} /* init_crc32_tab */ diff --git a/rust/ares/src/pma/includes/checksum.h b/rust/ares/src/pma/includes/checksum.h deleted file mode 100644 index 0269131..0000000 --- a/rust/ares/src/pma/includes/checksum.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Library: libcrc - * File: include/checksum.h (herein src/includes/checksum.h) - * Author: Lammert Bies - * - * This file is licensed under the MIT License as stated below - * - * Copyright (c) 1999-2016 Lammert Bies - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * ----------- - * Accessed in 2023 by Alex Shelkovnykov on behalf of Tlon Corporation from - * https://github.com/lammertb/libcrc/tree/v2.0. - * - * Description - * ----------- - * The headerfile src/includes/checksum.h contains the definitions and - * prototypes for routines that can be used to calculate several kinds of - * checksums. - */ - -#ifndef DEF_LIBCRC_CHECKSUM_H -#define DEF_LIBCRC_CHECKSUM_H - -#include - -/* - * #define CRC_POLY_xxxx - * - * The constants of the form CRC_POLY_xxxx define the polynomials for some well - * known CRC calculations. - */ -#define CRC_POLY_32 0xEDB88320L - -/* - * #define CRC_START_xxxx - * - * The constants of the form CRC_START_xxxx define the values that are used for - * initialization of a CRC value for common used calculation methods. - */ -#define CRC_START_32 0xFFFFFFFFL - -/* - * Prototype list of global functions - */ -uint32_t crc_32(const unsigned char *input_str, size_t num_bytes); -uint32_t update_crc_32(uint32_t crc, unsigned char c); - -#endif // DEF_LIBCRC_CHECKSUM_H diff --git a/rust/ares/src/pma/malloc.c b/rust/ares/src/pma/malloc.c deleted file mode 100644 index 399e563..0000000 --- a/rust/ares/src/pma/malloc.c +++ /dev/null @@ -1,2167 +0,0 @@ -/** - * ---------------------------------------------------------------------------- - * "THE BEER-WARE LICENSE" (Revision 42): - * wrote this file. As long as you retain this notice you - * can do whatever you want with this stuff. If we meet some day, and you think - * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp - * ---------------------------------------------------------------------------- - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "includes/checksum.h" -#include "malloc.h" - -//============================================================================== -// CONFIGURABLE MACROS -//============================================================================== - -/** - * PMA_PAGE_SIZE = 1 << PMA_PAGE_SHIFT - * - * Should be configured to native page size. - */ -#define PMA_PAGE_SHIFT 12U - -/** - * PMA_MIN_ALLOC_SIZE = 1 << PMA_MIN_ALLOC_SHIFT - * - * Note that types/sizes in PMASharedPageHeader are currently hardcoded to this - * value being 4. - */ -#define PMA_MIN_ALLOC_SHIFT 4U - -/** - * How many bits per bitmap element. Change only if not 8 bits/byte - */ -#define PMA_BITMAP_BITS (8 * sizeof(uint8_t)) - -/** - * Increment block size for resizing the snapshot backing file (4 GiB in bytes). - * This is just the default increment; the backing file is extended by the - * smallest multiple of this value sufficient to fit the new allocation. - */ -#define PMA_SNAPSHOT_RESIZE_INC 0x100000000 - -//============================================================================== -// AUTO MACROS (do not manually configure) -//============================================================================== - -/** - * Number bytes per page - */ -#define PMA_PAGE_SIZE (1UL << PMA_PAGE_SHIFT) - -/** - * A mask for the offset of an address inside a page - */ -#define PMA_PAGE_MASK (PMA_PAGE_SIZE - 1) - -/** - * Minimum size of an allocation in bytes - * - * If this is too small, it's too much work to manage small allocations. - */ -#define PMA_MIN_ALLOC_SIZE (1U << PMA_MIN_ALLOC_SHIFT) - -/** - * PMA_MAX_SHARED_ALLOC = 1 << PMA_MAX_SHARED_SHIFT - * - * Should be log_2 of 1/4 of page size. 
Also the number of buckets in the array - * of shared page pointers. - */ -#define PMA_MAX_SHARED_SHIFT (PMA_PAGE_SHIFT - 2U) - -/** - * Max slot size (in bytes) for shared page allocations - * - * In the original phk_malloc code, this was set to 1/2 the size of a page. - * However, since shared page metadata is stored as a header inside the page - * itself, an allocation of 1/2 a page will use a full page anyway. Therefore, - * the limit is set to 1/4 of a page to remove the overhead of dealing with - * the shared page header for a page containing a single allocation. - */ -#define PMA_MAX_SHARED_ALLOC (1UL << PMA_MAX_SHARED_SHIFT) - -/** - * Number of buckets for shared page linked lists in the metadata page - */ -#define PMA_SHARED_BUCKETS (PMA_MAX_SHARED_SHIFT - PMA_MIN_ALLOC_SHIFT + 1) - -/** - * Round address down to beginning of containing page - */ -#define PAGE_ROUND_DOWN(foo) (foo & (~PMA_PAGE_MASK)) - -/** - * Round address up to beginning of next page - */ -#define PAGE_ROUND_UP(foo) ((foo + PMA_PAGE_MASK) & (~PMA_PAGE_MASK)) - -/** - * Convert pointer to index in page directory - */ -#define PTR_TO_INDEX(foo) ((((uint64_t)(foo)) - ((uint64_t)_pma_state->metadata->arena_start)) >> PMA_PAGE_SHIFT) - -/** - * Convert index in page directory to pointer - */ -#define INDEX_TO_PTR(foo) (void *)((char *)_pma_state->metadata->arena_start + ((foo) * PMA_PAGE_SIZE)) - -/** - * Flags to use for all mmap operations, excluding initial metadata page mapping - * - * We don't care to what memory the metadata pages are mapped, so long as it's - * before the memory arena, because we track it in the PMA process itself. - * However, to retain consistent pointers between ship shutdown & relaunch, we - * want all memory arena mmap mappings to go to the exact address to which we - * tell them. Another mapping already existing at one of those addresses is a - * fatal error. - * - * For more info, see https://www.man7.org/linux/man-pages/man2/mmap.2.html. - */ -#ifdef __linux__ - #define PMA_MMAP_FLAGS (MAP_SHARED | MAP_FIXED_NOREPLACE) -#else - #define PMA_MMAP_FLAGS (MAP_SHARED | MAP_FIXED) -#endif - -/** - * Magic code that identifies a file as an event snapshot file - */ -#define PMA_MAGIC_CODE 0xBADDECAFC0FFEE00 // i.e. all decaf coffee - -/** - * Version of the persistent memory arena which created an event snapshot (in - * case of breaking changes) - */ -#define PMA_DATA_VERSION 1 - -/** - * Representation of an empty byte for a byte in a bitmap (1 = empty, 0 = full) - */ -#define PMA_EMPTY_BITMAP 0xFF - -/** - * See PMASharedPageHeader for explanation - */ -#define PMA_BITMAP_SIZE 32 - -/** - * Max number of dpage offsets that can fit into a cache of free dpages stored - * as an array in a single page (when factoring in space used by metadata). - * - * 511 for 4 KiB page - */ -#define PMA_DPAGE_CACHE_SIZE ((PMA_PAGE_SIZE - sizeof(PMADPageCache)) / sizeof(uint64_t)) - -/** - * Max number of dirty page entries that can be stored in the extra space of the - * metadata page. Caching the dirty page entries and writing them as a part of - * the metadata allows us to solve the problem of desynchronization between the - * metadata and page directory without using B+ Trees. - * - * 164 for 4 KiB page - */ -// #define PMA_DIRTY_PAGE_LIMIT ((PMA_PAGE_SIZE - sizeof(PMAMetadata)) / sizeof(PMADirtyPageEntry)) -#define PMA_DIRTY_PAGE_LIMIT 164 - -/** - * Default settings for new PMA backing files - * - * See https://www.man7.org/linux/man-pages/man2/chmod.2.html for more info - * about individual flags. 
- * - * Start with a page directory big enough to hold 1 GiB of data: - * - * 1 GiB = 262144 page entries - * (up to) 16 bytes per page dir entry - * 4096 / 16 = 256 entries per page - * 262144 / 256 = 1024 pages - * 1024 * 4096 = 4194304 bytes - * - * Maximum size of page directory = 340 GiB - */ -#define PMA_SNAPSHOT_FILENAME "snap.bin" -#define PMA_PAGE_DIR_FILENAME "page.bin" -#define PMA_DEFAULT_DIR_NAME ".bin" -#define PMA_NEW_FILE_FLAGS (O_RDWR | O_CREAT) -#define PMA_LOAD_FILE_FLAGS (O_RDWR) -#define PMA_DIR_PERMISSIONS (S_IRWXU | S_IRWXG | S_IRWXO) -#define PMA_FILE_PERMISSIONS (S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP) -#define PMA_INIT_SNAP_SIZE 0x40000000 -#define PMA_INIT_DIR_SIZE 0x400000 - -/** - * Maximum possible size of the page directory. This is how big the page - * directory would need to be to reach all addressable virtual memory in Linux. - */ -#define PMA_MAXIMUM_DIR_SIZE 0x5500000000 - -/** - * Base address for the PMA. Lowest address not reserved by Linux. - */ -#ifdef __linux__ - #define PMA_SNAPSHOT_ADDR 0x10000 -#else - #define PMA_SNAPSHOT_ADDR 0x28000000000 -#endif - -/** - * Maximum file size on disk for the filesystem (16 TiB for ext4). - * - * TODO: need to automatically discover this and set it accordingly - */ -#define PMA_MAX_DISK_FILE_SIZE 0x100000000000 - -/** - * Maximum multiplier for resizing the snapshot backing file. - */ -#define PMA_MAX_RESIZE_FACTOR (PMA_MAX_DISK_FILE_SIZE / PMA_SNAPSHOT_RESIZE_INC) - -//============================================================================== -// HELPER MACROS -//============================================================================== - -/* TODO: these should just be funlike macros. The "save line" and goto is - unnecessary */ -/** - * Log error and return failure during new PMA bootstrap - */ -#define INIT_ERROR do { err_line = __LINE__; goto init_error; } while(0) - -/** - * Log error and return failure during existing PMA load - */ -#define LOAD_ERROR do { err_line = __LINE__; goto load_error; } while(0) - -/** - * Log error and return failure during PMA sync - */ -#define SYNC_ERROR do { err_line = __LINE__; goto sync_error; } while(0) - -/** - * Log warning to console - */ -#define WARNING(foo) _pma_warning(foo, address, __LINE__) - -//============================================================================== -// TYPES -//============================================================================== - -/** - * Page statuses used in page directory - */ -enum PMAPageStatus { - UNALLOCATED, - FREE, - SHARED, - FIRST, - FOLLOW -}; -typedef enum PMAPageStatus PMAPageStatus; - -/** - * Directory entry for a page in virtual memory - */ -typedef struct PMAPageDirEntry PMAPageDirEntry; -struct PMAPageDirEntry { - uint64_t offset; // Offset for page in backing file - PMAPageStatus status; // Status of page -}; - -/** - * Directory of pages in virtual memory - */ -typedef struct PMAPageDir PMAPageDir; -struct PMAPageDir { - uint64_t size; // Number of slots currently supported by page directory - uint64_t next_index; // Index of next open slot in (makes it easier to resize) - PMAPageDirEntry *entries; // Address to start of page directory as an array of entries -}; - -/** - * Shared allocation page - * - * A shared page is an array of slots of a single size. The metadata for each - * page is stored as a header within the page itself. - * - * On a 64-bit system, the alignment of this struct is 8. This is relevant to - * the currently hard-coded values for simplifying how slots work. 
The ideal - * size of a hard-coded bitmap, given the number of slots available in a page - * after subtracting the header, is 32 bytes: - * - * X = max # slots in page (min slot size = 16-bytes) - * (4096 - (11 + ceil(X/8))) > 16X - * (4096 - (11 + (X/8) + 1)) > 16X - * 4084 - X/8 > 16X - * 32672 - X > 128X - * 32672 > 129X - * 253.27 > X - * X = 253 - * bitmap bytes = ceil(253 div 8) = ceil(31.625) = 32 - * - * However, the alignment adds padding bytes in between the scalar and array - * struct members: - * (253 * 16) + 11 + 5 + 32 = 4096 - * - * In this case, this doesn't affect the total number of - * available slots, but it could if the members of the PMASharedPageHeader change. - */ -typedef struct PMASharedPageHeader PMASharedPageHeader; -struct PMASharedPageHeader { - struct PMASharedPageHeader *next; // Next shared page; forms a stack as additional pages of the same slot size are allocated - uint8_t dirty; // Dirty bit; necessary when allocating twice to the same page in one event - uint8_t size; // Slot size for this page = 2^size - uint8_t free; // Number of free slots in page - uint8_t bits[PMA_BITMAP_SIZE]; // Bitmap of which slots are free -}; - -/** - * Update to page directory state for an allocation. A limited number of such - * updates can be stored behind the header in the metadata page, allowing - * simultaneous copy-on-write semantics for the metadata and updates to the page - * directory without a B+ Tree. - */ -typedef struct PMADirtyPageEntry PMADirtyPageEntry; -struct PMADirtyPageEntry { - uint64_t index; // Index in page directory - uint64_t offset; // Offset on disk backing file - uint32_t num_pages; // Number of pages marked dirty (for multi-page allocations) - PMAPageStatus status; // Page status after sync -}; - -/** - * Free page cache node - * - * Nodes form a linked list of single free pages. A free page is an allocated - * page already backed by disk, but available for use (the old values were - * freed). - * - * Free pages are purposely not merged into runs, because two pages being - * adjacent in virtual memory does not mean that they are adjacent on disk, and - * disk locality is preferable for multi-page allocations. - * - * The caches for free single pages and free multi-page runs are split to save - * time: any free page will do for a shared page or single page allocation, but - * exact ranges are preferable for multi-page allocations. - */ -typedef struct PMASinglePageCache PMASinglePageCache; -struct PMASinglePageCache { - PMASinglePageCache *next; // Next node in list - void *page; // Pointer to free page -}; - -/** - * Free page run cache node - * - * Nodes form a linked list of free multi-page runs. A free page is an allocated - * page already backed by disk, but available for use (the old values were - * freed). - * - * Free pages are purposely not merged into runs, because two pages being - * adjacent in virtual memory does not mean that they are adjacent on disk, and - * disk locality is preferable for multi-page allocations (typically, when the - * OS experiences a page miss, the OS/hardware will fetch not just the missing - * page, but also several of the following [nearby?] pages). - * - * The caches for free single pages and free multi-page runs are split to save - * time: any free page will do for a shared page or single page allocation, but - * exact ranges are preferable for multi-page allocations. 
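 *
 * Note that a freed run still refers to the contiguous block of dpages it was
 * assigned when first allocated, so reusing a cached run for a later
 * multi-page allocation consumes no additional space in the backing file.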
- */ -typedef struct PMAPageRunCache PMAPageRunCache; -struct PMAPageRunCache { - PMAPageRunCache *next; // Next node in list - void *page; // Pointer to start of page run - uint64_t length; // Number of pages in run -}; - -/** - * Free dpage cache - * - * A dpage is a page-sized block already allocated to the snapshot file on disk - * but without memory mapped to it. Reusing free dpages allows allocations - * without growing the backing file. - * - * The cache contains only individual dpages. Since multi-page allocations are - * never moved, their corresponding dpage allocations never change. When freed, - * multi-page allocations in the free page run cache still refer to the same - * contiguous block of dpages that they were assigned upon initial allocation. - */ -typedef struct PMADPageCache PMADPageCache; -struct PMADPageCache { - uint8_t dirty; // Has dpage cache already been copied to a new page with PROT_WRITE - uint16_t size; // Number of entries in queue - uint16_t head; // Index of front of queue - uint16_t tail; // Index of back of queue - uint64_t queue[]; // Cache of free dpages as queue; array of size PMA_DPAGE_CACHE_SIZE -}; - -/** - * Persistent Memory Arena/event snapshot metadata - */ -typedef struct PMAMetadata PMAMetadata; -struct PMAMetadata { - uint64_t magic_code; // Stamp identifying a file as a New Mars PMA file - uint32_t checksum; // Checksum value to detect corruption - uint32_t version; // Version of Vere (New Mars?) used to produce the backing file - uint64_t epoch; // Epoch ID of the most recently processed event - uint64_t event; // ID of the most recently processed event - uint64_t root; // Root after most recent event - void *arena_start; // Beginning of mapped address space - void *arena_end; // End of mapped address space (first address beyond mapped range) - PMASharedPageHeader *shared_pages[PMA_SHARED_BUCKETS]; // Shared allocation pages - PMADPageCache *dpage_cache; // Cache of free dpges as queue - uint64_t snapshot_size; // Size of the backing file - uint64_t next_offset; // Next open dpage in the backing file - uint8_t num_dirty_pages; // Counter of dirty page entries - uint64_t padding[2]; // sizeof(PMAMetadata) must be PMA_PAGE_SIZE - PMADirtyPageEntry dirty_pages[PMA_DIRTY_PAGE_LIMIT]; // Queue of changes not yet synced to page directory -}; -static_assert(sizeof(PMAMetadata) == PMA_PAGE_SIZE, "PMAMetadata must be a page in length"); - -/** - * Struct containing global data used by PMA - * - * Containment zone for what would otherwise be global variables. Global state - * stored in struct and passed around to functions that need it. Data that - * could otherwise go into the metadata, but is recomputable as derived state - * should go here. 
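 *
 * Unlike PMAMetadata, none of this survives a restart: pma_load() reopens the
 * file descriptors, remaps the page directory, and rebuilds the free page
 * caches by scanning the directory entries.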
- */ -typedef struct PMAState PMAState; -struct PMAState { - PMAMetadata *metadata; // Metadata; contains current status of snapshot - uint64_t meta_page_offset; // Offset on disk of next metadata page to be replaced - PMAPageDir page_directory; // Page directory; maps virtual memory addresses to pages on disk - int snapshot_fd; // File descriptor for PMA backing file - int page_dir_fd; // File descriptor for page directory - PMASinglePageCache *free_pages; // Cache of free single pages - PMAPageRunCache *free_page_runs; // Cache of free multi-page runs -}; - - -//============================================================================== -// GLOBALS -//============================================================================== - -PMAState *_pma_state = NULL; - - -//============================================================================== -// FORWARD DECLARATIONS -//============================================================================== - -int _pma_verify_checksum(PMAMetadata *meta_page); -int _pma_sync_dirty_pages(int fd, uint8_t num_dirty_pages, PMADirtyPageEntry *dirty_pages); -int _pma_write_page_status(int fd, uint64_t index, PMAPageStatus status); -int _pma_write_page_offset(int fd, uint64_t index, uint64_t offset); -int _pma_update_free_pages(uint8_t num_dirty_pages, PMADirtyPageEntry *dirty_pages); -void *_pma_malloc_bytes(size_t size); -int _pma_malloc_shared_page(uint8_t bucket); -void *_pma_malloc_pages(size_t size); -void *_pma_malloc_single_page(PMAPageStatus status); -void *_pma_malloc_multi_pages(uint64_t num_pages); -void *_pma_get_cached_pages(uint64_t num_pages); -void *_pma_get_new_page(PMAPageStatus status); -void *_pma_get_new_pages(uint64_t num_pages); -int _pma_free_pages(void *address); -int _pma_free_bytes(void *address); -int _pma_copy_shared_page(void *address); -uint64_t _pma_get_single_dpage(void); -uint64_t _pma_get_cached_dpage(void); -int _pma_copy_dpage_cache(void); -uint64_t _pma_get_disk_dpage(void); -void _pma_copy_page(void *address, uint64_t offset, PMAPageStatus status, int fd); -void _pma_mark_page_dirty(uint64_t index, uint64_t offset, PMAPageStatus status, uint32_t num_pages); -int _pma_extend_snapshot_file(uint32_t multiplier); -void _pma_warning(const char *p, void *a, int l); -void _pma_state_free(void); -int _pma_state_malloc(void); - - -//============================================================================== -// PUBLIC FUNCTIONS -//============================================================================== - -// TODO: Replace errno codes with our own error codes - -// TODO: Inconsistent abort() calls; should better define when an error is fatal - -int -pma_init(const char *path) { - DIR *dir; - char *filepath; - PMAMetadata *meta_pages = 0; - void *page_dir = 0; - uint64_t meta_bytes; - int err; - int err_line; - int page_dir_fd = 0; - int snapshot_fd = 0; - - // Precompute metadata and page directory sizes in bytes - meta_bytes = 2 * PMA_PAGE_SIZE; - - // Allocate memory for state - if (_pma_state_malloc()) return -1; - - // - // Create backing files - // - - // Initialize dir and file path buffer - filepath = malloc( - strlen(path) + 1 + - strlen(PMA_DEFAULT_DIR_NAME) + 1 + - strlen(PMA_SNAPSHOT_FILENAME) + 1); - - // Create input directory, if necessary - dir = opendir(path); - if (dir == NULL) { - // Error if opening dir failed for reason other than it doesn't exist - if (ENOENT != errno) INIT_ERROR; - - // Error if creating dir failed - if (mkdir(path, PMA_DIR_PERMISSIONS)) INIT_ERROR; - } - - // Create file 
path for dir of backing files - sprintf(filepath, "%s/%s", path, PMA_DEFAULT_DIR_NAME); - - // Create dir for backing files, if necessary - dir = opendir(filepath); - if (dir == NULL) { - // Error if opening dir failed for reason other than it doesn't exist - if (ENOENT != errno) INIT_ERROR; - - // Error if creating dir failed - if (mkdir(filepath, PMA_DIR_PERMISSIONS)) INIT_ERROR; - } - - // Create backing file for snapshot - sprintf(filepath, "%s/%s/%s", path, PMA_DEFAULT_DIR_NAME, PMA_SNAPSHOT_FILENAME); - snapshot_fd = open(filepath, PMA_NEW_FILE_FLAGS, PMA_FILE_PERMISSIONS); - if (snapshot_fd == -1) INIT_ERROR; - - // Create backing file for page directory - sprintf(filepath, "%s/%s/%s", path, PMA_DEFAULT_DIR_NAME, PMA_PAGE_DIR_FILENAME); - page_dir_fd = open(filepath, PMA_NEW_FILE_FLAGS, PMA_FILE_PERMISSIONS); - if (page_dir_fd == -1) INIT_ERROR; - - // - // Set initial sizes for backing files - // - - // Set initial size of snapshot file - err = lseek(snapshot_fd, (PMA_INIT_SNAP_SIZE - 1), SEEK_SET); - if (err == -1) INIT_ERROR; - err = write(snapshot_fd, "", 1); - if (err != 1) INIT_ERROR; - - // Set initial size of page directory - err = lseek(page_dir_fd, (PMA_INIT_DIR_SIZE - 1), SEEK_SET); - if (err == -1) INIT_ERROR; - err = write(page_dir_fd, "", 1); - if (err != 1) INIT_ERROR; - - // - // Initialize snapshot and page directory - // - - /* - * The following links are useful for understanding the layout of virtual memory for a Linux process: - * https://www.sobyte.net/post/2022-08/linux-virtual-memory/ - * https://blog.holbertonschool.com/hack-the-virtual-memory-malloc-the-heap-the-program-break/ - * Chapters 2 & 3 - * - * Practically, on my machine, this translates to the following virtual memory layout: - * - ??? = 0x0000 0000 0000 - 0x0000 0000 ffff 64 KiB - * - empty = 0x0000 0001 0000 - 0x559f ffff ffff ~85 TiB - * - data = 0x55a0 0000 0000 - 0x560f ffff ffff 448 GiB - * - heap = 0x5610 0000 0000 - 0x7f3f ffff ffff ~41 TiB - * - libs = 0x7f40 0000 0000 - 0x7f9f ffff ffff 384 GiB - * - stack = 0x7fa0 0000 0000 - 0x7ffb ffff ffff 368 GiB - * - vdso = 0x7ffc 0000 0000 - 0x7fff ffff ffff 16 GiB - * Note that these address ranges are rough approximations and the sizes are vastly larger for sections like 'data' - * and 'vdso' than the actual memory section for the process because I'm documenting the range in which the section - * can be found. Identical Linux processes will not have identical memory layouts due to Address Space Layout - * Randomization. - * - * Without explicit arguments, calls to mmap will return addresses in the above 'stack' range, and successive calls - * will grow down. I presume that this is due to the implementation of this proposal: https://lwn.net/Articles/91829/ - * - * Given these circumstances, probably the easiest things to do are: - * 1. mmap the snapshot to a low address (i.e. 0x1 0000) so that it can use all of the available space before the - * 'data' section - * 2. mmap the page directory using its maximum possible size (at least on Linux, it's okay to mmap a file to more - * pages than it actually occupies and have it grow into the space). Doing so on eliminates the need to ever - * resize the mapping using mremap. - * 3. mmap the page directory without a location hint. How big is this mmap? Given the above estimate of virtual - * memory available to the snapshot (85 TiB) and the ratio of snapshot size to page directory size (256:1), this - * mapping would be 340 GiB in size. 
Even assuming the kernel were not smart enough to work around the linked - * libs, this is still small enough to fit into the stack, according to the above memory section size estimates. - */ - - // Init metadata pages - meta_pages = mmap( - NULL, - meta_bytes, - PROT_READ | PROT_WRITE, - MAP_SHARED, - snapshot_fd, - 0); - if (meta_pages == MAP_FAILED) INIT_ERROR; - - // Init page directory - page_dir = mmap( - NULL, - PMA_MAXIMUM_DIR_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED, - page_dir_fd, - 0); - if (page_dir == MAP_FAILED) INIT_ERROR; - - // Initialize simple metadata state - _pma_state->metadata->magic_code = PMA_MAGIC_CODE; - _pma_state->metadata->checksum = 0; - _pma_state->metadata->version = PMA_DATA_VERSION; - _pma_state->metadata->epoch = 0; - _pma_state->metadata->event = 0; - _pma_state->metadata->root = 0; - - // Initialize shared pages stacks - for(uint8_t i = 0; i < PMA_SHARED_BUCKETS; ++i) { - _pma_state->metadata->shared_pages[i] = NULL; - } - - // Initialize dirty page array - for(uint8_t i = 0; i < PMA_DIRTY_PAGE_LIMIT; ++i) { - _pma_state->metadata->dirty_pages[i].index = 0; - _pma_state->metadata->dirty_pages[i].offset = 0; - _pma_state->metadata->dirty_pages[i].num_pages = 0; - } - _pma_state->metadata->num_dirty_pages = 0; - - // Initialize snapshot page info - _pma_state->metadata->snapshot_size = PMA_INIT_SNAP_SIZE; - _pma_state->metadata->next_offset = meta_bytes + PMA_PAGE_SIZE; - - // Initialize arena start pointer - _pma_state->metadata->arena_start = (void *)PMA_SNAPSHOT_ADDR; - - // Manually allocate a page for the dpage cache - _pma_state->metadata->dpage_cache = mmap( - _pma_state->metadata->arena_start, - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - PMA_MMAP_FLAGS, - snapshot_fd, - meta_bytes); - if (_pma_state->metadata->dpage_cache == MAP_FAILED) INIT_ERROR; - - // Initialize arena end pointer - _pma_state->metadata->arena_end = ((char*)_pma_state->metadata->arena_start + PMA_PAGE_SIZE); - - // Setup initial dpage cache values - _pma_state->metadata->dpage_cache->dirty = 0; - _pma_state->metadata->dpage_cache->size = 0; - _pma_state->metadata->dpage_cache->head = 0; - _pma_state->metadata->dpage_cache->tail = 0; - - // - // Setup page directory - // - - _pma_state->page_directory.size = PMA_INIT_DIR_SIZE; - _pma_state->page_directory.next_index = 1; - _pma_state->page_directory.entries = (PMAPageDirEntry *)page_dir; - - // First page used by dpage cache - _pma_state->page_directory.entries[0].status = FIRST; - _pma_state->page_directory.entries[0].offset = meta_bytes; - - // - // Setup transient state - // - - // Replace the first metadata page, since they're identical - _pma_state->meta_page_offset = 0; - - // Initialize file descriptors - _pma_state->snapshot_fd = snapshot_fd; - _pma_state->page_dir_fd = page_dir_fd; - - // Initialize free page caches - _pma_state->free_pages = NULL; - _pma_state->free_page_runs = NULL; - - // - // Sync initial PMA state to disk - // - - // Sync dpage cache - err = msync( - _pma_state->metadata->dpage_cache, - PMA_PAGE_SIZE, - MS_SYNC); - if (err) INIT_ERROR; - - // Sync page directory - err = msync(_pma_state->page_directory.entries, PMA_PAGE_SIZE, MS_SYNC); - if (err) INIT_ERROR; - - // Compute checksum for metadata - _pma_state->metadata->checksum = crc_32((unsigned char*)_pma_state->metadata, PMA_PAGE_SIZE); - - // Copy and sync metadata to both buffers - memset(meta_pages, 0, meta_bytes); - memcpy(&meta_pages[0], _pma_state->metadata, PMA_PAGE_SIZE); - memcpy(&meta_pages[1], _pma_state->metadata, PMA_PAGE_SIZE); 
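  // Two identical metadata copies live at offsets 0 and PMA_PAGE_SIZE of the
  // snapshot file. pma_sync() overwrites them alternately (tracked by
  // meta_page_offset), so a crash mid-write always leaves at least one
  // complete, checksummed copy for pma_load() to fall back on.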
- if (msync(meta_pages, meta_bytes, MS_SYNC)) INIT_ERROR; - - // Remove PROT_WRITE permissions from snapshot and page directory - if (mprotect(meta_pages, meta_bytes, PROT_READ)) INIT_ERROR; - if (mprotect(_pma_state->metadata->dpage_cache, PMA_PAGE_SIZE, PROT_READ)) INIT_ERROR; - if (mprotect(page_dir, PMA_PAGE_SIZE, PROT_READ)) INIT_ERROR; - - // - // Done - // - - // Clean up - free(filepath); - munmap(meta_pages, meta_bytes); - - return 0; - -init_error: - fprintf(stderr, "(L%d) PMA initialization error: %s\n", err_line, strerror(errno)); - - if (meta_pages) munmap(meta_pages, meta_bytes); - if (page_dir) munmap(page_dir, PMA_INIT_DIR_SIZE); - if (snapshot_fd) close(snapshot_fd); - if (page_dir_fd) close(page_dir_fd); - free(filepath); - _pma_state_free(); - - return -1; -} - -PMARootState -pma_load(const char *path) { - PMAMetadata *newer_page; - PMAMetadata *older_page; - char *filepath; - void *address; - PMAMetadata *meta_pages = 0; - uint64_t index; - uint64_t meta_bytes; - int err; - int err_line; - int page_dir_fd = 0; - int snapshot_fd = 0; - - // Precompute metadata and page directory sizes in bytes - meta_bytes = 2 * PMA_PAGE_SIZE; - - // Allocate memory for state - if (_pma_state_malloc()) return (PMARootState){0}; - - // - // Open backing files - // - - // Initialize dir and file path buffer - filepath = malloc( - strlen(path) + 1 + - strlen(PMA_DEFAULT_DIR_NAME) + 1 + - strlen(PMA_SNAPSHOT_FILENAME) + 1); - - // Open backing file for snapshot - sprintf(filepath, "%s/%s/%s", path, PMA_DEFAULT_DIR_NAME, PMA_SNAPSHOT_FILENAME); - snapshot_fd = open(filepath, PMA_LOAD_FILE_FLAGS, PMA_FILE_PERMISSIONS); - if (snapshot_fd == -1) LOAD_ERROR; - _pma_state->snapshot_fd = snapshot_fd; - - // Open backing file for page directory - sprintf(filepath, "%s/%s/%s", path, PMA_DEFAULT_DIR_NAME, PMA_PAGE_DIR_FILENAME); - page_dir_fd = open(filepath, PMA_LOAD_FILE_FLAGS, PMA_FILE_PERMISSIONS); - if (page_dir_fd == -1) LOAD_ERROR; - _pma_state->page_dir_fd = page_dir_fd; - - // - // Verify file can be loaded - // - - // Read magic code - if (-1 == read(snapshot_fd, &_pma_state->metadata->magic_code, sizeof(uint64_t))) { - LOAD_ERROR; - } else if (_pma_state->metadata->magic_code != PMA_MAGIC_CODE) { - errno = EILSEQ; - LOAD_ERROR; - } - - // Read version - if (-1 == pread(snapshot_fd, &_pma_state->metadata->version, sizeof(uint32_t), 12)) { - LOAD_ERROR; - } else if (_pma_state->metadata->version != PMA_DATA_VERSION) { - // TODO: possibly upgrade - errno = EILSEQ; - LOAD_ERROR; - } - - // Load metadata pages - meta_pages = mmap( - NULL, - meta_bytes, - PROT_READ, - MAP_SHARED, - snapshot_fd, - 0); - if (meta_pages == MAP_FAILED) LOAD_ERROR; - - // Determine newer metadata page - newer_page = &meta_pages[0]; - older_page = &meta_pages[1]; - assert(newer_page->magic_code == PMA_MAGIC_CODE); assert(older_page->magic_code == PMA_MAGIC_CODE); - if ((newer_page->epoch < older_page->epoch) - || ((newer_page->epoch == older_page->epoch) - && (newer_page->event < older_page->event))) { - newer_page = &meta_pages[1]; - older_page = &meta_pages[0]; - } - - // Verify checksum for either page - if (!_pma_verify_checksum(newer_page)) { - if (_pma_verify_checksum(older_page)) { - newer_page = older_page; - } else { - errno = EILSEQ; - LOAD_ERROR; - } - } - - // Next page replaced is the older of the two pages - _pma_state->meta_page_offset = (newer_page == meta_pages) ? 
PMA_PAGE_SIZE : 0; - - // Update page directory using metadata dirty page list - err = _pma_sync_dirty_pages(page_dir_fd, _pma_state->metadata->num_dirty_pages, _pma_state->metadata->dirty_pages); - if (err) LOAD_ERROR; - - _pma_state->metadata->num_dirty_pages = 0; - - // - // Load page directory - // - - // mmap page directory - _pma_state->page_directory.entries = mmap( - NULL, - PMA_MAXIMUM_DIR_SIZE, - PROT_READ, - MAP_SHARED, - page_dir_fd, - 0); - if (_pma_state->page_directory.entries == MAP_FAILED) LOAD_ERROR; - - // - // Map pages and compute free page caches - // - - // get total number of indices - struct stat st; - fstat(page_dir_fd, &st); - _pma_state->page_directory.size = (st.st_size / sizeof(PMAPageDirEntry)) - 1; - - - index = 0; - while (index < _pma_state->page_directory.size) { - uint64_t count = 1; - - switch (_pma_state->page_directory.entries[index].status) { - case UNALLOCATED: - ++index; - continue; - - case FREE: - // While pages have FREE status AND are contiguous on disk, scan forward - ++index; - while ( - _pma_state->page_directory.entries[index].status == FREE && - _pma_state->page_directory.entries[index].offset == (_pma_state->page_directory.entries[index - 1].offset + PMA_PAGE_SIZE)) { - ++count; - ++index; - } - - // Add to appropriate free page cache - if (count == 1) { - PMASinglePageCache *free_page = malloc(sizeof *free_page); - - // Add it to the single-page cache - free_page->next = _pma_state->free_pages; - free_page->page = INDEX_TO_PTR(index - 1); - _pma_state->free_pages = free_page; - - } else { - PMAPageRunCache *page_run = malloc(sizeof *page_run); - - page_run->next = _pma_state->free_page_runs; - page_run->page = INDEX_TO_PTR(index - count); - page_run->length = count; - _pma_state->free_page_runs = page_run; - } - - // Map free pages (they're expected to be mapped but read only) - address = mmap( - INDEX_TO_PTR(index - count), - (PMA_PAGE_SIZE * count), - PROT_READ, - PMA_MMAP_FLAGS, - snapshot_fd, - _pma_state->page_directory.entries[index - count].offset); - - continue; - - case SHARED: - // Map immediately - address = mmap( - INDEX_TO_PTR(index), - PMA_PAGE_SIZE, - PROT_READ, - PMA_MMAP_FLAGS, - snapshot_fd, - _pma_state->page_directory.entries[index].offset); - if (address == MAP_FAILED) LOAD_ERROR; - - ++index; - - continue; - - case FIRST: - // While pages have FOLLOW status, scan forward - ++index; - while (_pma_state->page_directory.entries[index].status == FOLLOW) { - ++count; - ++index; - } - - // mmap entire block - address = mmap( - INDEX_TO_PTR(index - count), - (count * PMA_PAGE_SIZE), - PROT_READ, - PMA_MMAP_FLAGS, - snapshot_fd, - _pma_state->page_directory.entries[index - count].offset); - if (address == MAP_FAILED) LOAD_ERROR; - - continue; - - case FOLLOW: - // FOLLOW pages should be passed over correctly by FIRST case - default: - fprintf(stderr, "Index %" PRIu64 " invalid\n", index); - errno = EINVAL; - LOAD_ERROR; - } - } - - // Get next free index - _pma_state->page_directory.next_index = index; - - // - // Done - // - - // Clean up - munmap(meta_pages, meta_bytes); - free(filepath); - - return (PMARootState){ - .epoch = _pma_state->metadata->epoch, - .event = _pma_state->metadata->event, - .root = _pma_state->metadata->root, - }; - -load_error: - fprintf(stderr, "(L%d) Error loading PMA from %s: %s\n", err_line, path, strerror(errno)); - - if (meta_pages) munmap(meta_pages, meta_bytes); - if (_pma_state->page_directory.entries) { - munmap(_pma_state->page_directory.entries, PMA_MAXIMUM_DIR_SIZE); - } - if 
(_pma_state->metadata && _pma_state->metadata->arena_start) { - munmap(_pma_state->metadata->arena_start, - (uintptr_t)_pma_state->metadata->arena_end - - (uintptr_t)_pma_state->metadata->arena_start); - } - if (snapshot_fd > 0) close(snapshot_fd); - if (page_dir_fd > 0) close(page_dir_fd); - free(filepath); - _pma_state_free(); - - return (PMARootState){0}; -} - -int -pma_close(uint64_t epoch, uint64_t event, uint64_t root) { - // Sync changes to disk - if (pma_sync(epoch, event, root)) { - return -1; - } - - // Unmap page directory - munmap(_pma_state->page_directory.entries, PMA_MAXIMUM_DIR_SIZE); - - // Unmap snapshot - // XX should just be end - start? - munmap(_pma_state->metadata->arena_start, _pma_state->metadata->snapshot_size); - - // Close file descriptors - close(_pma_state->page_dir_fd); - close(_pma_state->snapshot_fd); - - // free pma state - _pma_state_free(); - - return 0; -} - -void * -pma_malloc(size_t size) { - void *result = NULL; - - /* MALLOC_LOCK */ - - if (!size) { - /* MALLOC_UNLOCK */ - return result; - } else if ((size + PMA_PAGE_SIZE) < size) { // Check for overflow - errno = ENOMEM; - } else if (size <= PMA_MAX_SHARED_ALLOC) { - result = _pma_malloc_bytes(size); - } else { - result = _pma_malloc_pages(size); - } - - /* MALLOC_UNLOCK */ - - return result; -} - -int -pma_free(void *address) { - uint64_t index; - - // TODO: This is legal for POSIX free, but would this ever happen for pma_free? - if (address == NULL) return 0; - - if (address < _pma_state->metadata->arena_start) { - WARNING("address too low to make sense"); - errno = EINVAL; - return -1; - } - if (address >= _pma_state->metadata->arena_end) { - WARNING("address too high to make sense"); - errno = EINVAL; - return -1; - } - - index = PTR_TO_INDEX(address); - switch (_pma_state->page_directory.entries[index].status) { - case UNALLOCATED: - // Something has definitely gone wrong if an address between arena_start - // and arena_end, with an index between 0 and next_free_index is - // unallocated - WARNING("address marked unallocated"); - errno = EINVAL; - return -1; - - case FREE: - WARNING("address already free"); - errno = EINVAL; - return -1; - - case SHARED: - return _pma_free_bytes(address); - - case FIRST: - return _pma_free_pages(address); - - case FOLLOW: - WARNING("address points to middle of multi-page allocation"); - errno = EINVAL; - return -1; - } - - return 0; -} - -int -pma_sync(uint64_t epoch, uint64_t event, uint64_t root) { - PMADPageCache *dpage_cache = _pma_state->metadata->dpage_cache; - ssize_t bytes_out; - int err; - int err_line; - - // Epoch & event may only increase - if ( - (epoch < _pma_state->metadata->epoch) || - ((epoch == _pma_state->metadata->epoch) && (event <= _pma_state->metadata->event))) { - errno = EINVAL; - return -1; - } - - // Clear dpage cache dirty bit and compute new size. This is the only place - // where the dpage cache active size should ever increase! 
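  //
  // Worked example (illustrative, assuming the documented 511-entry cache and
  // 16-bit head/tail fields): with head = 500 and tail = 3 the queue holds 14
  // entries; tail - head underflows to 65039, which trips the
  // size > PMA_DPAGE_CACHE_SIZE check below and is corrected by adding 511
  // back, wrapping to 14.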
- if (dpage_cache->dirty) { - dpage_cache->dirty = 0; - dpage_cache->size = (dpage_cache->tail - dpage_cache->head); - if (dpage_cache->size > PMA_DPAGE_CACHE_SIZE) { - // Simple correction of integer underflow when queue wraps around - dpage_cache->size += PMA_DPAGE_CACHE_SIZE; - } - } - - // Sync dirty pages - for (uint8_t i = 0; i < _pma_state->metadata->num_dirty_pages; ++i) { - void *address = INDEX_TO_PTR(_pma_state->metadata->dirty_pages[i].index); - uint64_t bytes = (_pma_state->metadata->dirty_pages[i].num_pages * PMA_PAGE_SIZE); - - // Clear dirty bit for shared pages - if (_pma_state->metadata->dirty_pages[i].status == SHARED) { - ((PMASharedPageHeader*)address)->dirty = 0; - } - - err = msync(address, bytes, MS_SYNC); - if (err) SYNC_ERROR; - - if (mprotect(address, bytes, PROT_READ)) SYNC_ERROR; - } - - // Compute checksum - _pma_state->metadata->epoch = epoch; - _pma_state->metadata->event = event; - _pma_state->metadata->root = root; - _pma_state->metadata->checksum = 0; - _pma_state->metadata->checksum - = crc_32((unsigned char *)_pma_state->metadata, PMA_PAGE_SIZE); - - // Sync metadata - // - // Note: It's a long-standing Unix convention that while both write and - // pwrite return the number of bytes written, when operating on a file - // (as opposed to a pipe or socket) it is assumed that the entire - // buffer will be written. If this isn't the case, an error has - // occurred. - bytes_out = pwrite( - _pma_state->snapshot_fd, - _pma_state->metadata, - PMA_PAGE_SIZE, - _pma_state->meta_page_offset); - if (bytes_out != PMA_PAGE_SIZE) SYNC_ERROR; - - _pma_state->meta_page_offset = _pma_state->meta_page_offset ? 0 : PMA_PAGE_SIZE; - - // Sync dirty pages in page directory - err = _pma_sync_dirty_pages( - _pma_state->page_dir_fd, - _pma_state->metadata->num_dirty_pages, - _pma_state->metadata->dirty_pages); - if (err) SYNC_ERROR; - - // Update free page caches - err = _pma_update_free_pages(_pma_state->metadata->num_dirty_pages, _pma_state->metadata->dirty_pages); - if (err) SYNC_ERROR; - - // Reset dirty page array - _pma_state->metadata->num_dirty_pages = 0; - - return 0; - -sync_error: - fprintf(stderr, "(L%d) Error syncing PMA: %s\n", err_line, strerror(errno)); - - return -1; -} - -bool -pma_in_arena(void *address) { - return (address >= _pma_state->metadata->arena_start) - && (address < _pma_state->metadata->arena_end); -} - -//============================================================================== -// PRIVATE FUNCTIONS -//============================================================================== - -/** - * Verify that the checksum of a metadata page is valid - * - * Corruption or malicious interference is rare, so we assume that the checksum - * is correct and copy it into the global state in advance, then confirm its - * correctness there. - * - * @param meta_page Pointer to a metadata page loaded from disk - * - * @return Boolean (as int) for whether checksums match or not - */ -int -_pma_verify_checksum(PMAMetadata *meta_page) { - uint32_t checksum; - - // Copy metadata in advance of using it, since: 1) we expect the checksum to - // be valid; 2) we need to set the value of the checksum in the metadata to 0. - memcpy(_pma_state->metadata, meta_page, PMA_PAGE_SIZE); - - // Since we're computing the checksum on the object which itself includes the - // checksum, we treat the checksum as 0. 
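  // This mirrors pma_sync(), which likewise zeroes the checksum field before
  // running crc_32() over the full metadata page, so both computations cover
  // exactly the same bytes.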
- _pma_state->metadata->checksum = 0; - - // Compute checksum - checksum = crc_32((unsigned char *)_pma_state->metadata, PMA_PAGE_SIZE); - - // Compare checksums - return (checksum == meta_page->checksum); -} - -/** - * Sync updates from the dirty page cache in the metadata page to the page - * directory - * - * This sync is technically the *first* step of a new event, since the page - * directory for a snapshot is not valid until all of the changes from the dirty - * page cache have been applied. The PMA can crash at any moment, therefore - * applying the changes to the page directory from the previous event is - * required before processing a new one. Note that applying these changes to the - * page directory is an idempotent operation - which is good since we could - * theoretically crash on the same event repeatedly. - * - * @param fd Page directory file descriptor - * @param num_dirty_pages Size of dirty page cache - * @param dirty_pages Dirty page cache as array - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_sync_dirty_pages(int fd, uint8_t num_dirty_pages, PMADirtyPageEntry *dirty_pages) { - PMAPageStatus cont_status; - uint64_t init_offset; - uint64_t index; - - for (uint8_t i = 0; i < num_dirty_pages; ++i) { - cont_status = (dirty_pages[i].status == FIRST) ? FOLLOW : dirty_pages[i].status; - init_offset = dirty_pages[i].offset; - index = dirty_pages[i].index; - - if (_pma_write_page_status(fd, index, dirty_pages[i].status)) return -1; - // Offset of 0 is code for "leave it alone" - if (init_offset) { - if (_pma_write_page_offset(fd, index, init_offset)) return -1; - } - - // The offset on disk doesn't actually matter for the continuation pages of - // a multi-page allocation, but it does matter for free page runs: just - // because two page runs are contiguous in memory, it doesn't mean they are - // contiguous on disk. An order of events like: - // - // [multi-page allocation] -> [shared-page allocation] -> [multi-page allocation] - // - // could produce a situation where the two multi-page allocations are - // adjacent in memory, but separated by one page on disk (because of - // copy-on-write using a new dpage during the shared page allocation). 
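  //
  // Continuation pages therefore get explicit sequential offsets: page j of
  // the run is recorded at init_offset + (j * PMA_PAGE_SIZE), matching the
  // contiguous block of dpages reserved when the run was first allocated.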
- for (uint32_t j = 1; j < dirty_pages[i].num_pages; ++j) { - assert((dirty_pages[i].status == FIRST) || (cont_status == FREE)); - - if (_pma_write_page_status(fd, (index + j), cont_status)) return -1; - // Offset of 0 is code for "leave it alone" - if (init_offset) { - if (_pma_write_page_offset(fd, (index + j), (init_offset + (j * PMA_PAGE_SIZE)))) return -1; - } - } - } - - return 0; -} - -/** - * Update page status of entry in page directory - * - * @param fd Page directory file descriptor - * @param index Directory index of entry - * @param status Page status - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_write_page_status(int fd, uint64_t index, PMAPageStatus status) { - ssize_t bytes_out; - - bytes_out = pwrite( - fd, - (const void *)&status, - sizeof(PMAPageStatus), - ((index * sizeof(PMAPageDirEntry)) + sizeof(uint64_t))); - if (bytes_out < 1) { - return -1; - } - - return 0; -} - -/** - * Update page offset of entry in page directory - * - * @param fd Page directory file descriptor - * @param index Directory index of entry - * @param offset Page offset on disk - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_write_page_offset(int fd, uint64_t index, uint64_t offset) { - ssize_t bytes_out; - - bytes_out = pwrite( - fd, - (const void *)&offset, - sizeof(uint64_t), - (index * sizeof(PMAPageDirEntry))); - if (bytes_out < 1) { - return -1; - } - - return 0; -} - -/** - * Add newly freed pages and page runs to the free page caches - * - * @param num_dirty_pages Size of dirty page cache - * @param dirty_pages Dirty page cache as array - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_update_free_pages(uint8_t num_dirty_pages, PMADirtyPageEntry *dirty_pages) { - PMASinglePageCache *free_page; - PMAPageRunCache *page_run; - - // TODO: Pull out common code between here and pma_load - for (uint8_t i = 0; i < num_dirty_pages; ++i) { - if (dirty_pages[i].status != FREE) continue; - - if (dirty_pages[i].num_pages > 1) { - page_run = malloc(sizeof *page_run); - if (page_run == NULL) return -1; - - page_run->next = _pma_state->free_page_runs; - page_run->page = INDEX_TO_PTR(dirty_pages[i].index); - page_run->length = dirty_pages[i].num_pages; - _pma_state->free_page_runs = page_run; - - } else { - free_page = malloc(sizeof *free_page); - if (free_page == NULL) return -1; - - free_page->next = _pma_state->free_pages; - free_page->page = INDEX_TO_PTR(dirty_pages[i].index); - _pma_state->free_pages = free_page; - } - } - - return 0; -} - -/** - * Allocate memory within a shared allocation page. 
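 *
 * Requests are rounded up to the nearest power-of-two slot size, with a
 * 16-byte minimum. For example (assuming the default 4 KiB pages): a 100-byte
 * request is served from a 128-byte slot in bucket 3, and a 1024-byte request
 * (exactly 1/4 page) from a 1024-byte slot in bucket 6, the last shared
 * bucket.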
- * - * @param size Size in bytes to allocate (must be <= 1/4 page) - * - * @return NULL failure; errno set to error code - * @return void* address of the newly allocated memory - */ -void * -_pma_malloc_bytes(size_t size) -{ - PMASharedPageHeader *shared_page; - uint16_t i, slot_size; - uint8_t bucket, byte, bit; - - assert(size <= PMA_MAX_SHARED_ALLOC); - - // Don't bother with anything less than the minimum allocation size - if (size < PMA_MIN_ALLOC_SIZE) { - size = PMA_MIN_ALLOC_SIZE; - } - - // Find the right bucket - bucket = 1; - if (size) { - i = size - 1; - while (i >>= 1) bucket++; - } - slot_size = (1 << bucket); - bucket = bucket - PMA_MIN_ALLOC_SHIFT; - - // Search for a shared page with open slots - shared_page = _pma_state->metadata->shared_pages[bucket]; - while ((shared_page != NULL) && (shared_page->free == 0)) { - shared_page = shared_page->next; - } - - // Make a new shared page if necessary - if (shared_page == NULL) { - if (_pma_malloc_shared_page(bucket)) { - return NULL; - } - - shared_page = _pma_state->metadata->shared_pages[bucket]; - - } else { - if (_pma_copy_shared_page(shared_page)) { - return NULL; - } - } - - assert(shared_page->free); - - // Find first empty slot using bitmap (1 = empty, 0 = full) - byte = 0; - while (shared_page->bits[byte] == 0) { - assert(byte < PMA_BITMAP_SIZE); - ++byte; - } - i = shared_page->bits[byte]; - bit = 0; - while (~i & 1U) { - i >>= 1; - ++bit; - } - - // Mark slot full - shared_page->bits[byte] -= (1 << bit); - --(shared_page->free); - - // Return slot - return (char *)shared_page + - (sizeof(PMASharedPageHeader)) + - (slot_size * ((PMA_BITMAP_BITS * byte) + bit)); -} - -/** - * Allocate a new shared allocation page. - * - * @param bucket Into which bucket in the shared allocation pages array the new - * page will go (which also corresponds to the size of the slots - * in the page) - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_malloc_shared_page(uint8_t bucket) -{ - PMASharedPageHeader *shared_page; - uint8_t shift; - - assert(bucket <= PMA_SHARED_BUCKETS); - - // Get a new writeable page - shared_page = (PMASharedPageHeader *)_pma_malloc_single_page(SHARED); - if (shared_page == NULL) { - return -1; - } - - // Compute shift - shift = bucket + PMA_MIN_ALLOC_SHIFT; - - // Initialize header for shared page - shared_page->dirty = 1; - shared_page->size = shift; - shared_page->free = ((PMA_PAGE_SIZE - sizeof(PMASharedPageHeader)) / (1 << shift)); - for (uint8_t i = 0; i < PMA_BITMAP_SIZE; ++i) { - shared_page->bits[i] = PMA_EMPTY_BITMAP; - } - - // Add new shared page to top of stack - shared_page->next = _pma_state->metadata->shared_pages[bucket]; - _pma_state->metadata->shared_pages[bucket] = shared_page; - - return 0; -} - -/** - * Allocate memory for a large object in one or more pages. - * - * @param size Size in bytes to allocate (must be > 1/4 page) - * - * @return NULL failure; errno set to error code - * @return void* address of the newly allocated memory - */ -void * -_pma_malloc_pages(size_t size) -{ - void *address; - uint64_t num_pages; - - assert(size > PMA_MAX_SHARED_ALLOC); - - // Round size up to nearest page boundary - size = PAGE_ROUND_UP(size); - num_pages = size >> PMA_PAGE_SHIFT; - - if (num_pages == 1) { - address = _pma_malloc_single_page(FIRST); - } else { - address = _pma_malloc_multi_pages(num_pages); - } - - return address; -} - -/** - * Allocate a single new page - * - * Reuse pages from the free page cache, if any are available. 
These pages are - * used for shared allocations and for "large" allocations that are between 1/4 - * and 1 page in size: (0.25, 1]. - * - * @param status Page status after allocation (SHARED or FIRST) - * - * @return NULL failure; errno set to error code - * @return void* address of the newly allocated memory - */ -void * -_pma_malloc_single_page(PMAPageStatus status) { - void *address; - PMASinglePageCache *free_page = _pma_state->free_pages; - - // Get an existing free page from cache, if available - if (free_page != NULL) { - address = free_page->page; - _pma_state->free_pages = free_page->next; - free(free_page); - - // Make the page writeable - mprotect(address, PMA_PAGE_SIZE, (PROT_READ | PROT_WRITE)); - - // Add page to dirty list - _pma_mark_page_dirty(PTR_TO_INDEX(address), 0, status, 1); - } else { - // Otherwise, allocate a new page - address = _pma_get_new_page(status); - } - - assert((((uint64_t)address) % PMA_PAGE_SIZE) == 0); - - return address; -} - -/** - * Allocate a contiguous block of multiple pages - * - * Reuse pages from the free page run cache, if any are available. - * - * @param num_pages # pages to allocate - * - * @return NULL failure; errno set to error code - * @return void* address of the newly allocated memory - */ -void * -_pma_malloc_multi_pages(uint64_t num_pages) { - void *address; - - address = _pma_get_cached_pages(num_pages); - if (!address) { - address = _pma_get_new_pages(num_pages); - } - - return address; -} - -/** - * Pull existing free pages from the free page run cache - * - * Does a pass over the entire cache to see if there is an exactly-sized page - * run. If so, it's used immediately. Otherwise, keeps track of the smallest - * page run that can be split to accommodate the requested allocation. - * - * @param num_pages # pages to allocate - * - * @return void* address of the newly allocated memory (NULL if none available) - */ -void * -_pma_get_cached_pages(uint64_t num_pages) { - PMAPageRunCache **pre_valid_ptr = NULL; - PMAPageRunCache **prev_node_ptr = &(_pma_state->free_page_runs); - PMAPageRunCache *page_run_cache = _pma_state->free_page_runs; - PMAPageRunCache *valid_page_run = NULL; - void *address = NULL; - - // Do a pass looking for an exactly-sized run. While doing this, also record the smallest run still big enough to fit - // our data. - while (page_run_cache != NULL) { - uint64_t run_length = page_run_cache->length; - - if (run_length == num_pages) { - valid_page_run = page_run_cache; - pre_valid_ptr = prev_node_ptr; - break; - - } else if (run_length > num_pages ) { - if ((valid_page_run == NULL) || (valid_page_run->length > run_length)) { - valid_page_run = page_run_cache; - pre_valid_ptr = prev_node_ptr; - } - } - - prev_node_ptr = &(page_run_cache->next); - page_run_cache = page_run_cache->next; - } - - // If run found... - if (valid_page_run != NULL) { - // Use it - address = valid_page_run->page; - - // If run larger than necessary by two pages... - if (valid_page_run->length > (num_pages + 1)) { - // Reduce it - valid_page_run->page = (uint8_t*)valid_page_run->page + (num_pages * PMA_PAGE_SIZE); - valid_page_run->length -= num_pages; - - // Otherwise... - } else { - // Update cache pointers: we're going to use the whole run or we're going - // to move the remaining page to the single-page cache. Either way, we're - // going to free the run object. - *pre_valid_ptr = valid_page_run->next; - - // If there's a page left... 
- if (valid_page_run->length == (num_pages + 1)) { - PMASinglePageCache *trailing_page = (PMASinglePageCache *)malloc(sizeof(PMASinglePageCache)); - - // Add it to the single-page cache - trailing_page->next = _pma_state->free_pages; - trailing_page->page = ((char *)address + (num_pages * PMA_PAGE_SIZE)); - _pma_state->free_pages = trailing_page; - } - - free(valid_page_run); - } - - // Make pages writeable - mprotect(address, (num_pages * PMA_PAGE_SIZE), (PROT_READ | PROT_WRITE)); - - // Add pages to dirty list - _pma_mark_page_dirty(PTR_TO_INDEX(address), 0, FIRST, num_pages); - } - - return address; -} - -/** - * Allocate a single new page - * - * Allocates a new page in virtual memory. May or may not use a new dpage. - * - * @param status Page status after allocation (SHARED or FIRST) - * - * @return NULL failure; errno set to error code - * @return void* address of the newly allocated memory - */ -void * -_pma_get_new_page(PMAPageStatus status) { - void *address; - uint64_t offset; - - // Get a dpage to which to map the address - offset = _pma_get_single_dpage(); - if (!offset) { - return NULL; - } - - // Try to map next open memory address to dpage - address = mmap( - _pma_state->metadata->arena_end, - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - PMA_MMAP_FLAGS, - _pma_state->snapshot_fd, - offset); - if (address == MAP_FAILED) { - address = _pma_state->metadata->arena_end; - WARNING("mmap failed"); - abort(); - } - - assert(address == _pma_state->metadata->arena_end); - - // Record PMA expansion - _pma_state->metadata->arena_end = (uint8_t*)_pma_state->metadata->arena_end + PMA_PAGE_SIZE; - - // Add page to dirty list - _pma_mark_page_dirty(PTR_TO_INDEX(address), offset, status, 1); - - return address; -} - -/** - * Allocate multiple new pages - * - * Allocate 2 or more pages in virtual memory. May or may not use new dpages. - * - * @param num_pages # pages to allocate - * - * @return NULL failure; errno set to error code - * @return void* address of the newly allocated memory - */ -void * -_pma_get_new_pages(uint64_t num_pages) { - void *address; - uint64_t bytes = (num_pages * PMA_PAGE_SIZE); - uint64_t offset = _pma_state->metadata->next_offset; - uint64_t size = _pma_state->metadata->snapshot_size; - uint64_t new_size = (offset + bytes); - - // Get new dpages. Extend snapshot backing file first, if necessary. 
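-  // (Illustration with assumed numbers, not from the original comments: with
-  // PMA_SNAPSHOT_RESIZE_INC at 4 GiB, an allocation that overshoots the
-  // current file size by 9 GiB computes multiplier = (9 GiB / 4 GiB) + 1 = 3,
-  // growing the file by 12 GiB.)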
- if (new_size >= size) { - // Multi-page allocations maybe larger than the snapshot resize increment - uint32_t multiplier = ((new_size - size) / PMA_SNAPSHOT_RESIZE_INC) + 1; - - // Fail if snapshot file couldn't be extended - if (_pma_extend_snapshot_file(multiplier)) return NULL; - } - - // Try to map dpages to address - address = mmap( - _pma_state->metadata->arena_end, - bytes, - PROT_READ | PROT_WRITE, - PMA_MMAP_FLAGS, - _pma_state->snapshot_fd, - offset); - if (address == MAP_FAILED) { - address = _pma_state->metadata->arena_end; - WARNING("mmap failed"); - abort(); - } - - assert(address == _pma_state->metadata->arena_end); - - // Update offset of next open dpage - _pma_state->metadata->next_offset += bytes; - _pma_state->metadata->arena_end = (uint8_t*)_pma_state->metadata->arena_end + bytes; - - // Add allocated pages to dirty list - _pma_mark_page_dirty(PTR_TO_INDEX(address), offset, FIRST, num_pages); - - return address; -} - -/** - * Deallocate one or more pages of allocated memory - * - * @param address Address of block to deallocated - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_free_pages(void *address) { - uint32_t index = PTR_TO_INDEX(address); - uint32_t num_pages = 0; - - if ((uint64_t)address & PMA_PAGE_MASK) { - WARNING("address does not point to the root of a page"); - errno = EINVAL; - return -1; - } - - assert(_pma_state->page_directory.entries[index].status == FIRST); - - // Count number of pages in allocation - do { - ++num_pages; - } while (_pma_state->page_directory.entries[index + num_pages].status == FOLLOW); - - // Mark pages dirty - _pma_mark_page_dirty(index, 0, FREE, num_pages); - - return 0; -} - -/** - * Deallocate a block of memory in a shared allocation page. - * - * @param address Address of block to deallocated - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_free_bytes(void *address) { - PMASharedPageHeader *header = (PMASharedPageHeader *)((uint64_t)address & (~PMA_PAGE_MASK)); - uint8_t slot = ((((uint64_t)address & PMA_PAGE_MASK) - sizeof(PMASharedPageHeader)) / (1 << header->size)); - uint8_t byte = slot / PMA_BITMAP_BITS; - uint8_t bit = slot % PMA_BITMAP_BITS; - - // Copy-on-write - _pma_copy_shared_page(header); - - if (header->bits[byte] & (1 << bit)) { - WARNING("bucketized address already free"); - errno = EINVAL; - return -1; - } - - header->bits[byte] += (1 << bit); - ++header->free; - - return 0; -} - -/** - * Copy a shared allocation page - * - * @param address Virtual memory address of shared allocation page - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_copy_shared_page(void *address) { - PMASharedPageHeader *shared_page; - uint64_t offset; - - // Check if page has already been copied - shared_page = (PMASharedPageHeader*)address; - if (shared_page->dirty) { - return 0; - } - - offset = _pma_get_single_dpage(); - if (!offset) { - return -1; - } - - // Make sure dpage cache is writeable - if (!_pma_state->metadata->dpage_cache->dirty) { - if (_pma_copy_dpage_cache()) { - WARNING("dpage cache copy failed"); - abort(); - } - } - - // Copy page - _pma_copy_page(address, offset, SHARED, _pma_state->snapshot_fd); - - // Mark page dirty so it isn't copied again - shared_page->dirty = 1; - - return 0; -} - -/** - * Allocate a new dpage (disk page) - * - * Reuse a page from the free dpage cache, if any are available. 
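- *
- * (Clarifying note, inferred from the code rather than the original
- * comments: a "dpage" is a page-sized slot in the snapshot backing file,
- * identified by its byte offset. With an empty cache this falls through to
- * _pma_get_disk_dpage(), which returns metadata->next_offset and advances it
- * by PMA_PAGE_SIZE.)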
- * - * @return 0 failure; errno set to error code - * @return uint64_t offset of new page in backing file - */ -uint64_t -_pma_get_single_dpage(void) { - uint64_t offset; - - // Get a cached dpage, if one is available - offset = _pma_get_cached_dpage(); - if (!offset) { - // Otherwise, get a new dpage from disk - // - // XX returns 0 on failure, should assert - offset = _pma_get_disk_dpage(); - } - - assert((offset % PMA_PAGE_SIZE) == 0); - - return offset; -} - -/** - * Pull a free dpage from the dpage cache - * - * @return offset of new page in backing file (0 if cache empty) - */ -uint64_t -_pma_get_cached_dpage(void) { - uint64_t offset; - uint16_t dirty = _pma_state->metadata->dpage_cache->dirty; - uint16_t size = _pma_state->metadata->dpage_cache->size; - uint16_t head; - - // If the cache is empty, or there's only one page in the cache and the cache - // hasn't been touched yet, then exit early. If the cache hasn't been touched - // yet, we'll need to copy-on-write the cache as well, so if there's only one - // page, don't even bother. - if ((size == 0) || ((size == 1) && !dirty)) { - return 0; - } - - // Special copy-on-write for dpage cache - if (!dirty) { - if (_pma_copy_dpage_cache()) { - void *address = _pma_state->metadata->dpage_cache; - WARNING(strerror(errno)); - abort(); - } - } - - // TODO: macros for dealing with cache? - // Pop page off queue; head can't be assigned earlier as _pma_copy_dpage_cache - // may also try to pop a page off of the queue - head = _pma_state->metadata->dpage_cache->head; - offset = _pma_state->metadata->dpage_cache->queue[head]; - assert(offset != 0); - _pma_state->metadata->dpage_cache->size -= 1; - _pma_state->metadata->dpage_cache->head = ((head + 1) % PMA_DPAGE_CACHE_SIZE); - - return offset; -} - -/** - * Copy the free dpage cache - * - * Free dpage cache needs to be copied using copy-on-write semantics when pages - * are added or removed. - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -_pma_copy_dpage_cache(void) { - void *address; - uint64_t offset; - uint16_t dirty = _pma_state->metadata->dpage_cache->dirty; - uint16_t size = _pma_state->metadata->dpage_cache->size; - uint16_t head = _pma_state->metadata->dpage_cache->head; - - assert(!dirty); - - address = _pma_state->metadata->dpage_cache; - - // If pages available in cache... - if (size) { - // Use a page from the cache and record that it was used afterwards - offset = _pma_state->metadata->dpage_cache->queue[head]; - assert(offset != 0); - - _pma_copy_page(address, offset, FIRST, _pma_state->snapshot_fd); - - _pma_state->metadata->dpage_cache->size -= 1; - _pma_state->metadata->dpage_cache->head = ((head + 1) % PMA_DPAGE_CACHE_SIZE); - - } else { - // Otherwise, get a brand new page from disk - offset = _pma_get_disk_dpage(); - if (!offset) return -1; - - _pma_copy_page(address, offset, FIRST, _pma_state->snapshot_fd); - } - - // Mark dpage cache dirty (aka writeable) - _pma_state->metadata->dpage_cache->dirty = 1; - - return 0; -} - -/** - * Get a new free dpage on disk - * - * May require extending the snapshot backing file on disk. - * - * @return offset of new page in backing file (0 if failure) - */ -uint64_t -_pma_get_disk_dpage(void) { - uint64_t offset = _pma_state->metadata->next_offset; - uint64_t size = _pma_state->metadata->snapshot_size; - - // Get a new dpage. Extend snapshot backing file first, if necessary. 
- if (offset == size) { - // Fail if snapshot file couldn't be extended - if (_pma_extend_snapshot_file(1)) return 0; - } - - // Update offset of next open dpage - _pma_state->metadata->next_offset += PMA_PAGE_SIZE; - - return offset; -} - -/** - * Copy an existing page to a new dpage - * - * Core copy-on-write implementation. - * - * @param address Virtual memory address of existing page - * @param offset Offset of dpage in backing file to which to copy - * @param status Page status after copy (SHARED or FIRST) - * @param fd PMA file descriptor - */ -void -_pma_copy_page(void *address, uint64_t offset, PMAPageStatus status, int fd) { - void *new_address; - ssize_t bytes_out; - uint64_t index = PTR_TO_INDEX(address); - uint16_t tail = _pma_state->metadata->dpage_cache->tail; - - bytes_out = pwrite(fd, address, PMA_PAGE_SIZE, offset); - if (bytes_out != PMA_PAGE_SIZE) { - WARNING(strerror(errno)); - abort(); - } - - new_address = mmap( - address, - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - offset); - if (new_address == MAP_FAILED) { - WARNING(strerror(errno)); - abort(); - } - - assert(new_address == address); - - // Add previous dpage to cache - // Note: the dpage cache should always be writeable here, either because the - // dpage cache is the page we just copied, or because it was made - // writeable in advance by _pma_copy_shared_page - assert(_pma_state->page_directory.entries[index].offset != 0); - _pma_state->metadata->dpage_cache->queue[tail] = _pma_state->page_directory.entries[index].offset; - _pma_state->metadata->dpage_cache->tail = ((tail + 1) % PMA_DPAGE_CACHE_SIZE); - - // Add page to dirty page list - _pma_mark_page_dirty(index, offset, status, 1); -} - -/** - * Add entry to the dirty page store - * - * @param index Index of page in page directory - * @param offset Offset of page in PMA file - * @param status Status of pages - * @param num_pages Number of pages represented by this entry - */ -void -_pma_mark_page_dirty(uint64_t index, uint64_t offset, PMAPageStatus status, uint32_t num_pages) { - PMADirtyPageEntry *dirty_page = _pma_state->metadata->dirty_pages; - - dirty_page += _pma_state->metadata->num_dirty_pages++; - - assert(_pma_state->metadata->num_dirty_pages <= PMA_DIRTY_PAGE_LIMIT); - - dirty_page->index = index; - dirty_page->offset = offset; - dirty_page->status = status; - dirty_page->num_pages = num_pages; -} - -/** - * Extend the size of the PMA backing file on disk - * - * Note: while it's possible that a multiplier larger than 2^32 could be valid - * (i.e. using ZFS is the file system, so the backing file can be up to - * 16 EiB in size, and the PMA backing file extension increment is less - * than 4 GiB), it almost certainly would never be encountered (the user - * needs to allocate a 2 EiB file to the loom?). 
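- *
- *       Worked example with illustrative numbers: PMA_SNAPSHOT_RESIZE_INC is
- *       4 GiB, so _pma_extend_snapshot_file(3) grows a 4 GiB snapshot file to
- *       4 GiB + (3 * 4 GiB) = 16 GiB.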
- *
- * @param multiplier Number of PMA_SNAPSHOT_RESIZE_INC increments by which to
- *                   grow the file (new size = old size +
- *                   (multiplier * PMA_SNAPSHOT_RESIZE_INC))
- *
- * @return 0 success
- * @return -1 failure; errno set to error code
- */
-int
-_pma_extend_snapshot_file(uint32_t multiplier) {
-  off_t err;
-  ssize_t bytes;
-  uint64_t new_snapshot_size;
-
-  // Reject invalid multipliers
-  if (!multiplier || (multiplier > PMA_MAX_RESIZE_FACTOR)) return -1;
-
-  // Update size in metadata
-  new_snapshot_size = _pma_state->metadata->snapshot_size + (multiplier * PMA_SNAPSHOT_RESIZE_INC);
-
-  // Extend snapshot file
-  err = lseek(_pma_state->snapshot_fd, (new_snapshot_size - 1), SEEK_SET);
-  if (err == -1) return -1;
-
-  bytes = write(_pma_state->snapshot_fd, "", 1);
-  if (bytes < 1) return -1;
-
-  _pma_state->metadata->snapshot_size = new_snapshot_size;
-  return 0;
-}
-
-/**
- * Log warning message to console.
- *
- * @param s Error message
- * @param p Address which caused error
- * @param l Line number
- */
-void
-_pma_warning(const char *s, void *p, int l) {
-  fprintf(stderr, "*** %d: %p - %s\n", l, p, s);
-}
-
-/**
- * Helper function to deallocate PMA state on shutdown.
- */
-void
-_pma_state_free(void)
-{
-  if (_pma_state) {
-    if (_pma_state->metadata) free(_pma_state->metadata);
-    free(_pma_state);
-    _pma_state = NULL;
-  }
-}
-
-/**
- * Helper function to allocate memory for PMA state.
- *
- * @return 1 allocated PMA state already exists
- * @return 0 memory for new PMA state successfully allocated
- */
-int
-_pma_state_malloc(void)
-{
-  if (_pma_state != NULL) return 1;
-  PMAState *ret = calloc(1, sizeof *ret);
-  ret->metadata = calloc(1, sizeof *ret->metadata);
-  _pma_state = ret;
-  return 0;
-}
diff --git a/rust/ares/src/pma/malloc.h b/rust/ares/src/pma/malloc.h
deleted file mode 100644
index bfb7d82..0000000
--- a/rust/ares/src/pma/malloc.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/**
- * Persistent Memory Arena for the New Mars Nock virtualization engine.
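- *
- * Typical lifecycle, sketched against the declarations below (the path and
- * sizes are illustrative, and epoch/event/root stand for the caller's event
- * log bookkeeping; this sketch is not taken from the original sources):
- *
- *   pma_init("/path/to/pier");         // brand new arena (or pma_load())
- *   void *buf = pma_malloc(48);        // small sizes land in shared pages
- *   ...                                // mutate buf
- *   pma_sync(epoch, event, root);      // persist outstanding dirty pages
- *   pma_free(buf);
- *   pma_close(epoch, event, root);     // final sync and shutdown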
- */ - -#pragma once - -#include -#include -#include - -//============================================================================== -// PROTOTYPES -//============================================================================== - -/** - * Struct returned from pma_load() - */ -typedef struct PMARootState PMARootState; -struct PMARootState { - uint64_t epoch; // Epoch ID of the most recently processed event - uint64_t event; // ID of the most recently processed event - uint64_t root; // Root after most recent event -}; - -/** - * Initialize a brand new PMA environment and event snapshot - * - * @param path File directory in which to create backing files for snapshot and - * page directory - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -pma_init(const char *path); - -/** - * Load an existing PMA environment and event snapshot - * - * @param path File directory from which to load the backing files for the - * snapshot and page directory - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -PMARootState -pma_load(const char *path); - -/** - * Safely unload the PMA after syncing changes to PMA state - * - * @param epoch Epoch of latest event successfully applied to state snapshot - * @param event Event number of latest event successfully applied to state - * snapshot - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -pma_close(uint64_t epoch, uint64_t event, uint64_t root); - -/** - * Allocate a new block of memory in the PMA - * - * @param size Size in bytes to allocate - * - * @return NULL failure; errno set to error code - * @return void* address of the newly allocated memory - */ -void * -pma_malloc(size_t size); - -/** - * Deallocate an existing block of memory in the PMA - * - * @param address Address of block to deallocated - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -pma_free(void *address); - -/** - * Sync changes to PMA state - * - * @param epoch Epoch of latest event successfully applied to state snapshot - * @param event Event number of latest event successfully applied to state - * snapshot - * - * @return 0 success - * @return -1 failure; errno set to error code - */ -int -pma_sync(uint64_t epoch, uint64_t event, uint64_t root); - -/** - * True if the address is in the PMA - */ -bool -pma_in_arena(void *address); - -/* - bp(X) where X is false will raise a SIGTRAP. If the process is being run - inside a debugger, this can be caught and ignored. It's equivalent to a - breakpoint. 
If run without a debugger, it will dump core, like an assert -*/ -#if defined(__i386__) || defined(__x86_64__) -#define bp(x) do { if(!(x)) __asm__ volatile("int $3"); } while (0) -#elif defined(__thumb__) -#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xde01"); } while (0) -#elif defined(__aarch64__) -#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xd4200000"); } while (0) -#elif defined(__arm__) -#define bp(x) do { if(!(x)) __asm__ volatile(".inst 0xe7f001f0"); } while (0) -#else -STATIC_ASSERT(0, "debugger break instruction unimplemented"); -#endif diff --git a/rust/ares/src/pma/test/internals.h b/rust/ares/src/pma/test/internals.h deleted file mode 100644 index cc0e343..0000000 --- a/rust/ares/src/pma/test/internals.h +++ /dev/null @@ -1,198 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -//============================================================================== -// MACROS -//============================================================================== - -#define PMA_PAGE_SHIFT 12U -#define PMA_MIN_ALLOC_SHIFT 4U -#define PMA_BITMAP_BITS (8 * sizeof(uint8_t)) -#define PMA_SNAPSHOT_RESIZE_INC 0x100000000 -#define PMA_PAGE_SIZE (1UL << PMA_PAGE_SHIFT) -#define PMA_PAGE_MASK (PMA_PAGE_SIZE - 1) -#define PMA_MIN_ALLOC_SIZE (1U << PMA_MIN_ALLOC_SHIFT) -#define PMA_MAX_SHARED_SHIFT (PMA_PAGE_SHIFT - 2U) -#define PMA_MAX_SHARED_ALLOC (1UL << PMA_MAX_SHARED_SHIFT) -#define PMA_SHARED_BUCKETS (PMA_MAX_SHARED_SHIFT - PMA_MIN_ALLOC_SHIFT + 1) -#define PAGE_ROUND_DOWN(foo) (foo & (~PMA_PAGE_MASK)) -#define PAGE_ROUND_UP(foo) ((foo + PMA_PAGE_MASK) & (~PMA_PAGE_MASK)) -#define PTR_TO_INDEX(foo) ((((uint64_t)(foo)) - ((uint64_t)_pma_state->metadata->arena_start)) >> PMA_PAGE_SHIFT) -#define INDEX_TO_PTR(foo) (void *)((char *)_pma_state->metadata->arena_start + ((foo) * PMA_PAGE_SIZE)) -#ifdef __linux__ - #define PMA_MMAP_FLAGS (MAP_SHARED | MAP_FIXED_NOREPLACE) -#else - #define PMA_MMAP_FLAGS (MAP_SHARED | MAP_FIXED) -#endif -#define PMA_MAGIC_CODE 0xBADDECAFC0FFEE00 // i.e. 
all decaf coffee -#define PMA_DATA_VERSION 1 -#define PMA_EMPTY_BITMAP 0xFF -#define PMA_BITMAP_SIZE 32 -#define PMA_DPAGE_CACHE_SIZE ((PMA_PAGE_SIZE - sizeof(PMADPageCache)) / sizeof(uint64_t)) -#define PMA_DIRTY_PAGE_LIMIT 164 -#define PMA_SNAPSHOT_FILENAME "snap.bin" -#define PMA_PAGE_DIR_FILENAME "page.bin" -#define PMA_DEFAULT_DIR_NAME ".bin" -#define PMA_NEW_FILE_FLAGS (O_RDWR | O_CREAT) -#define PMA_LOAD_FILE_FLAGS (O_RDWR -#define PMA_DIR_PERMISSIONS (S_IRWXU | S_IRWXG | S_IRWXO) -#define PMA_FILE_PERMISSIONS (S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP) -#define PMA_INIT_SNAP_SIZE 0x40000000 -#define PMA_INIT_DIR_SIZE 0x400000 -#define PMA_MAXIMUM_DIR_SIZE 0x5500000000 -#ifdef __linux__ - #define PMA_SNAPSHOT_ADDR 0x10000 -#else - #define PMA_SNAPSHOT_ADDR 0x28000000000 -#endif -#define PMA_MAX_DISK_FILE_SIZE 0x100000000000 -#define PMA_MAX_RESIZE_FACTOR (PMA_MAX_DISK_FILE_SIZE / PMA_SNAPSHOT_RESIZE_INC) - - -//============================================================================== -// TYPES -//============================================================================== - -enum PMAPageStatus { - UNALLOCATED, - FREE, - SHARED, - FIRST, - FOLLOW -}; -typedef enum PMAPageStatus PMAPageStatus; - -typedef struct PMAPageDirEntry PMAPageDirEntry; -struct PMAPageDirEntry { - uint64_t offset; - PMAPageStatus status; -}; - -typedef struct PMAPageDir PMAPageDir; -struct PMAPageDir { - uint64_t size; - uint64_t next_index; - PMAPageDirEntry *entries; -}; - -typedef struct PMASharedPageHeader PMASharedPageHeader; -struct PMASharedPageHeader { - struct PMASharedPageHeader *next; - uint8_t dirty; - uint8_t size; - uint8_t free; - uint8_t bits[PMA_BITMAP_SIZE]; -}; - -typedef struct PMADirtyPageEntry PMADirtyPageEntry; -struct PMADirtyPageEntry { - uint64_t index; - uint64_t offset; - uint32_t num_pages; - PMAPageStatus status; -}; - -typedef struct PMASinglePageCache PMASinglePageCache; -struct PMASinglePageCache { - PMASinglePageCache *next; - void *page; -}; - -typedef struct PMAPageRunCache PMAPageRunCache; -struct PMAPageRunCache { - PMAPageRunCache *next; - void *page; - uint64_t length; -}; - -typedef struct PMADPageCache PMADPageCache; -struct PMADPageCache { - uint8_t dirty; - uint16_t size; - uint16_t head; - uint16_t tail; - uint64_t queue[]; -}; - -typedef struct PMAMetadata PMAMetadata; -struct PMAMetadata { - uint64_t magic_code; - uint32_t checksum; - uint32_t version; - uint64_t epoch; - uint64_t event; - uint64_t root; - void *arena_start; - void *arena_end; - PMASharedPageHeader *shared_pages[PMA_SHARED_BUCKETS]; - PMADPageCache *dpage_cache; - uint64_t snapshot_size; - uint64_t next_offset; - uint8_t num_dirty_pages; - uint64_t padding[2]; - PMADirtyPageEntry dirty_pages[PMA_DIRTY_PAGE_LIMIT]; -}; -static_assert(sizeof(PMAMetadata) == PMA_PAGE_SIZE, "PMAMetadata must be a page in length"); - -typedef struct PMAState PMAState; -struct PMAState { - PMAMetadata *metadata; - uint64_t meta_page_offset; - PMAPageDir page_directory; - int snapshot_fd; - int page_dir_fd; - PMASinglePageCache *free_pages; - PMAPageRunCache *free_page_runs; -}; - - -//============================================================================== -// GLOBALS -//============================================================================== - -extern PMAState *_pma_state; - - -//============================================================================== -// FUNCTIONS -//============================================================================== - -int _pma_verify_checksum(PMAMetadata *meta_page); 
-int _pma_sync_dirty_pages(int fd, uint8_t num_dirty_pages, PMADirtyPageEntry *dirty_pages); -int _pma_write_page_status(int fd, uint64_t index, PMAPageStatus status); -int _pma_write_page_offset(int fd, uint64_t index, uint64_t offset); -int _pma_update_free_pages(uint8_t num_dirty_pages, PMADirtyPageEntry *dirty_pages); -void *_pma_malloc_bytes(size_t size); -int _pma_malloc_shared_page(uint8_t bucket); -void *_pma_malloc_pages(size_t size); -void *_pma_malloc_single_page(PMAPageStatus status); -void *_pma_malloc_multi_pages(uint64_t num_pages); -void *_pma_get_cached_pages(uint64_t num_pages); -void *_pma_get_new_page(PMAPageStatus status); -void *_pma_get_new_pages(uint64_t num_pages); -int _pma_free_pages(void *address); -int _pma_free_bytes(void *address); -int _pma_copy_shared_page(void *address); -uint64_t _pma_get_single_dpage(void); -uint64_t _pma_get_cached_dpage(void); -int _pma_copy_dpage_cache(void); -uint64_t _pma_get_disk_dpage(void); -void _pma_copy_page(void *address, uint64_t offset, PMAPageStatus status, int fd); -void _pma_mark_page_dirty(uint64_t index, uint64_t offset, PMAPageStatus status, uint32_t num_pages); -int _pma_extend_snapshot_file(uint32_t multiplier); -void _pma_warning(const char *p, void *a, int l); -void _pma_state_free(void); -int _pma_state_malloc(void); diff --git a/rust/ares/src/pma/test/malloc.c b/rust/ares/src/pma/test/malloc.c deleted file mode 100644 index 158b5f7..0000000 --- a/rust/ares/src/pma/test/malloc.c +++ /dev/null @@ -1,1511 +0,0 @@ -#include -#include -#include -#include -#include - -#include "../malloc.h" -#include "../includes/checksum.h" -#include "internals.h" - -//============================================================================== -// CONFIGURABLE MACROS -//============================================================================== - -#define TEST_PMA_SNAPSHOT_TEMPLATE "test-snapshot-XXXXXX.bin" -#define TEST_PMA_SNAPSHOT_SUFFIX 4 - - -//============================================================================== -// TYPES -//============================================================================== - -typedef struct TestState TestState; -struct TestState { - char *dir; // Directory in which to generate test files -}; - - -//============================================================================== -// GLOBALS -//============================================================================== - -TestState *_test_state = NULL; - - -//============================================================================== -// FORWARD DECLARATIONS -//============================================================================== - -void test_pma_state_malloc_and_free(void); -void test_pma_extend_snapshot_file(void); -void test_pma_mark_page_dirty(void); -void test_pma_copy_page(void); -void test_pma_get_disk_dpage(void); -void test_pma_copy_dpage_cache(void); -void test_pma_get_cached_dpage(void); -void test_pma_copy_shared_page(void); -void test_pma_free_bytes(void); -void test_pma_free_pages(void); -void test_pma_get_new_pages(void); -void test_pma_get_new_page(void); -void test_pma_get_cached_pages(void); -void test_pma_malloc_single_page(void); -void test_pma_malloc_shared_page(void); -void test_pma_update_free_pages(void); -void test_pma_verify_checksum(void); -void test_pma_in_arena(void); -void test_pma_init(void); -void test_pma_sync(void); -void test_pma_load(void); - - -//============================================================================== -// MAIN & HELPERS 
-//============================================================================== - -void -test_pma(char* test_dir) { - // Set up test state - _test_state = malloc(sizeof(TestState)); - _test_state->dir = test_dir; - - // Run tests - test_pma_state_malloc_and_free(); - test_pma_extend_snapshot_file(); - test_pma_mark_page_dirty(); - test_pma_copy_page(); - test_pma_get_disk_dpage(); - test_pma_copy_dpage_cache(); - test_pma_get_cached_dpage(); - test_pma_copy_shared_page(); - test_pma_free_bytes(); - test_pma_free_pages(); - test_pma_get_new_pages(); - test_pma_get_new_page(); - test_pma_get_cached_pages(); - test_pma_malloc_single_page(); - test_pma_malloc_shared_page(); - test_pma_update_free_pages(); - test_pma_verify_checksum(); - test_pma_in_arena(); - test_pma_init(); - test_pma_sync(); - test_pma_load(); - - // Clean up - free(_test_state); - - // Done - printf("Unit tests PASSED\n"); -} - -int -_generate_test_snapshot(char **filename) { - size_t dir_len; - size_t file_len; - int fd; - - dir_len = strlen(_test_state->dir); - file_len = strlen(TEST_PMA_SNAPSHOT_TEMPLATE); - - *filename = malloc(dir_len + file_len + 1); - strcpy(*filename, _test_state->dir); - strcpy((*filename + dir_len), TEST_PMA_SNAPSHOT_TEMPLATE); - assert(*filename); - fd = mkstemps(*filename, TEST_PMA_SNAPSHOT_SUFFIX); - assert(fd > 0); - - return fd; -} - -void -_clean_up_test_snapshot(int fd, char *filename) { - close(fd); - unlink(filename); - free(filename); -} - - -//============================================================================== -// TESTS -//============================================================================== - -void -test_pma_state_malloc_and_free(void) { - int res = -1; - - // pre state malloc - assert(!_pma_state); - - // state malloc - res = _pma_state_malloc(); - assert(!res); - assert(_pma_state); - assert(_pma_state->metadata); - - // try state malloc again - res = _pma_state_malloc(); - assert(res == 1); - - // state free - _pma_state_free(); - assert(!_pma_state); - - // try state free again - _pma_state_free(); - - // free metadata separately - res = _pma_state_malloc(); - free(_pma_state->metadata); - _pma_state->metadata = NULL; - _pma_state_free(); -} - -void -test_pma_extend_snapshot_file(void) { - struct stat statbuf; - uint64_t multiplier; - int fd; - int ret; - char *filename = NULL; - - // Test 1: 0 multiplier - ret = _pma_extend_snapshot_file(0); - assert(ret == -1); - - // Test 2: massive multiplier - ret = _pma_extend_snapshot_file(0xffffffff); - assert(ret == -1); - - // Set up state & locals - _pma_state_malloc(); - _pma_state->metadata->snapshot_size = 0; - multiplier = 10; - - // Test 3: lseek fails; snapshot file doesn't exist - ret = _pma_extend_snapshot_file(multiplier); - assert(ret == -1); - assert(errno == ESPIPE); - - // Set up fd - errno = 0; - fd = _generate_test_snapshot(&filename); - close(fd); - fd = open(filename, O_RDONLY); - assert(fd > 0); - _pma_state->snapshot_fd = fd; - - // Test 4: write fails; snapshot file read only - errno = 0; - ret = _pma_extend_snapshot_file(multiplier); - assert(ret == -1); - assert(errno == EBADF); - close(fd); - - // Reset fd - fd = open(filename, O_RDWR); - assert(fd > 0); - _pma_state->snapshot_fd = fd; - - // Test 5: Successful - errno = 0; - ret = _pma_extend_snapshot_file(multiplier); - assert(ret == 0); - assert(errno == 0); - assert(fstat(fd, &statbuf) == 0); - assert((uint64_t)statbuf.st_size == (multiplier * PMA_SNAPSHOT_RESIZE_INC)); - assert((uint64_t)statbuf.st_size == 
_pma_state->metadata->snapshot_size); - - // Clean up - _clean_up_test_snapshot(fd, filename); - _pma_state_free(); -} - -void -test_pma_mark_page_dirty(void) { - PMADirtyPageEntry *dirty_page; - - // Set up state & locals - _pma_state_malloc(); - _pma_state->metadata->num_dirty_pages = 10; - dirty_page = (_pma_state->metadata->dirty_pages + 10); - dirty_page->index = 1; - dirty_page->offset = 2; - dirty_page->num_pages = 3; - dirty_page->status = FREE; - - // Test 1: mark page dirty - _pma_mark_page_dirty(4, 5, FIRST, 6); - assert(_pma_state->metadata->num_dirty_pages == 11); - assert(dirty_page->index == 4); - assert(dirty_page->offset == 5); - assert(dirty_page->num_pages == 6); - assert(dirty_page->status == FIRST); - - // Clean up - _pma_state_free(); -} - -void -test_pma_copy_page(void) { - const uint64_t page_uno_offset = 0; - const uint64_t page_dos_offset = PMA_PAGE_SIZE; - const uint64_t page_tre_offset = (2 * PMA_PAGE_SIZE); - const uint64_t file_size = (3 * PMA_PAGE_SIZE); - const uint16_t end_of_dpage_cache = (PMA_DPAGE_CACHE_SIZE - 1); - ssize_t bytes; - const int strlen = 6; - int fd; - const char *text_alpha = "ALPHA"; - const char *text_bravo = "BRAVO"; - const char *text_delta = "DELTA"; - char *filename; - char text_test[6] = { 0 }; - void *address; - - // Set up backing file - fd = _generate_test_snapshot(&filename); - assert(12287 == lseek(fd, (file_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - assert(6 == pwrite(fd, text_alpha, strlen, 0)); - assert(6 == pwrite(fd, text_bravo, strlen, PMA_PAGE_SIZE)); - assert(6 == pwrite(fd, text_delta, strlen, (2 * PMA_PAGE_SIZE))); - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - - _pma_state->metadata->dpage_cache = calloc(1, PMA_PAGE_SIZE); - _pma_state->metadata->dpage_cache->tail = end_of_dpage_cache; - _pma_state->metadata->dpage_cache->queue[end_of_dpage_cache] = 0; - - _pma_state->page_directory.entries = calloc(2, sizeof(PMAPageDirEntry)); - _pma_state->page_directory.entries[1].offset = page_dos_offset; - - // Set up address - address = mmap( - INDEX_TO_PTR(1), - PMA_PAGE_SIZE, - PROT_READ, - MAP_SHARED | MAP_FIXED, - fd, - page_dos_offset); - assert(MAP_FAILED != address); - - // Test 1: copy page in backing file - _pma_copy_page(address, page_tre_offset, FIRST, fd); - assert(0 == _pma_state->metadata->dpage_cache->tail); - assert(4096 == _pma_state->metadata->dpage_cache->queue[end_of_dpage_cache]); - bytes = pread(fd, text_test, strlen, page_uno_offset); - assert(6 == bytes); - assert(0 == strcmp(text_alpha, text_test)); - bytes = pread(fd, text_test, strlen, page_dos_offset); - assert(6 == bytes); - assert(0 == strcmp(text_bravo, text_test)); - bytes = pread(fd, text_test, strlen, page_tre_offset); - assert(6 == bytes); - assert(0 == strcmp(text_bravo, text_test)); - - // Clean up - munmap(INDEX_TO_PTR(0), file_size); - free(_pma_state->metadata->dpage_cache); - _clean_up_test_snapshot(fd, filename); - _pma_state_free(); -} - -void -test_pma_get_disk_dpage(void) { - struct stat statbuf; - uint64_t init_size = 2 * PMA_PAGE_SIZE; - uint64_t next_offset; - int fd; - char *filename; - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->next_offset = init_size - PMA_PAGE_SIZE; - _pma_state->metadata->snapshot_size = init_size; - - // Test 1: get next dpage without extending snapshot backing file - next_offset = _pma_get_disk_dpage(); - assert(4096 == next_offset); - assert(8192 == _pma_state->metadata->next_offset); - - // Test 2: failure to 
extend backing file - next_offset = _pma_get_disk_dpage(); - assert(0 == next_offset); - assert(8192 == _pma_state->metadata->next_offset); - - // Set up backing file - fd = _generate_test_snapshot(&filename); - assert(8191 == lseek(fd, (init_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - _pma_state->snapshot_fd = fd; - - // Test 3: get next dpage after extending snapshot backing file - next_offset = _pma_get_disk_dpage(); - assert(8192 == next_offset); - assert(12288 == _pma_state->metadata->next_offset); - assert(0 == fstat(fd, &statbuf)); - assert((uint64_t)statbuf.st_size == (PMA_SNAPSHOT_RESIZE_INC + init_size)); - - // Clean up - free(_pma_state->page_directory.entries); - _clean_up_test_snapshot(fd, filename); - _pma_state_free(); -} - -void -test_pma_copy_dpage_cache(void) { - const uint64_t page_uno_offset = PMA_PAGE_SIZE; - const uint64_t page_dos_offset = (2 * PMA_PAGE_SIZE); - const uint64_t page_tre_offset = (3 * PMA_PAGE_SIZE); - const uint64_t init_size = 4 * PMA_PAGE_SIZE; - const uint64_t test_code = 0xcafebabe8008135; - uint64_t data_buffer; - ssize_t bytes; - int fd = 0; - char *filename = NULL; - - // Set up backing file - fd = _generate_test_snapshot(&filename); - assert(16383 == lseek(fd, (init_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - _pma_state->metadata->snapshot_size = init_size; - _pma_state->metadata->dpage_cache = mmap( - INDEX_TO_PTR(0), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - page_uno_offset); - _pma_state->metadata->dpage_cache->dirty = 0; - _pma_state->metadata->dpage_cache->size = 0; - _pma_state->metadata->dpage_cache->head = 1; - _pma_state->metadata->dpage_cache->tail = 2; - _pma_state->metadata->dpage_cache->queue[0] = test_code; - _pma_state->metadata->dpage_cache->queue[1] = page_dos_offset; - _pma_state->page_directory.entries = malloc(sizeof(PMAPageDirEntry)); - _pma_state->page_directory.entries[0].offset = page_uno_offset; - - // Test 1: free page cache empty, getting new page fails - _pma_state->metadata->next_offset = init_size; - assert(_pma_copy_dpage_cache()); - - // Test 2: free page cache empty, getting new page succeeds - _pma_state->snapshot_fd = fd; - _pma_state->metadata->next_offset = page_tre_offset; - assert(0 == _pma_copy_dpage_cache()); - assert(16384 == _pma_state->metadata->next_offset); - bytes = pread(fd, &data_buffer, 8, (page_tre_offset + 8)); - assert(8 == bytes); - assert(0xcafebabe8008135 == data_buffer); - - // Reset dpage cache dirty bit - _pma_state->metadata->dpage_cache->dirty = 0; - - // Test 3: free page cache has a page - _pma_state->metadata->dpage_cache->size = 1; - assert(0 == _pma_copy_dpage_cache()); - bytes = pread(fd, &data_buffer, 8, (page_dos_offset + 8)); - assert(8 == bytes); - assert(0xcafebabe8008135 == data_buffer); - - // Clean up - munmap(INDEX_TO_PTR(0), init_size); - free(_pma_state->page_directory.entries); - _clean_up_test_snapshot(fd, filename); - _pma_state_free(); -} - -void -test_pma_get_cached_dpage(void) { - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->dpage_cache = calloc(1, sizeof(PMADPageCache)); - - // Test 1: no pages in cache - _pma_state->metadata->dpage_cache->dirty = 1; - _pma_state->metadata->dpage_cache->size = 0; - assert(0 == _pma_get_cached_dpage()); - - // Test 2: only one page in cache and cache uncopied - _pma_state->metadata->dpage_cache->dirty = 0; - _pma_state->metadata->dpage_cache->size 
= 1; - assert(0 == _pma_get_cached_dpage()); - - // Test 3: successfully get page - _pma_state->metadata->dpage_cache->dirty = 1; - _pma_state->metadata->dpage_cache->size = 2; - _pma_state->metadata->dpage_cache->head = 0; - _pma_state->metadata->dpage_cache->tail = 1; - _pma_state->metadata->dpage_cache->queue[0] = 0xcafebabe8008135; - assert(0xcafebabe8008135 == _pma_get_cached_dpage()); - assert(1 == _pma_state->metadata->dpage_cache->size); - assert(1 == _pma_state->metadata->dpage_cache->head); - assert(1 == _pma_state->metadata->dpage_cache->tail); - - // Test 4: successfully get page & loop queue - _pma_state->metadata->dpage_cache->head = PMA_DPAGE_CACHE_SIZE - 1; - _pma_state->metadata->dpage_cache->queue[PMA_DPAGE_CACHE_SIZE - 1] = 0xdefaced0facade; - assert(0xdefaced0facade == _pma_get_cached_dpage()); - assert(0 == _pma_state->metadata->dpage_cache->size); - assert(0 == _pma_state->metadata->dpage_cache->head); - assert(1 == _pma_state->metadata->dpage_cache->tail); - - // Clean up - free(_pma_state->metadata->dpage_cache); - _pma_state_free(); -} - -void -test_pma_copy_shared_page(void) { - PMASharedPageHeader *clean_shared_page; - PMASharedPageHeader *dirty_shared_page; - ssize_t bytes; - const uint64_t init_size = 4 * PMA_PAGE_SIZE; - const uint64_t page_nul_offset = 0; - const uint64_t page_uno_offset = PMA_PAGE_SIZE; - const uint64_t page_dos_offset = (2 * PMA_PAGE_SIZE); - const uint64_t page_tre_offset = (3 * PMA_PAGE_SIZE); - const uint8_t page_uno_size = 10; - const uint8_t page_dos_size = 20; - uint8_t data_buffer; - int fd = 0; - char *filename = NULL; - - // Set up backing file - fd = _generate_test_snapshot(&filename); - assert(16383 == lseek(fd, (init_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - _pma_state->metadata->snapshot_size = init_size; - _pma_state->metadata->dpage_cache = mmap( - INDEX_TO_PTR(0), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - page_nul_offset); - _pma_state->metadata->dpage_cache->dirty = 1; - _pma_state->page_directory.entries = calloc(3, sizeof(PMAPageDirEntry)); - _pma_state->page_directory.entries[1].offset = page_uno_offset; - _pma_state->page_directory.entries[1].status = SHARED; - _pma_state->page_directory.entries[2].offset = page_dos_offset; - _pma_state->page_directory.entries[2].status = SHARED; - - // Set up shared pages - dirty_shared_page = mmap( - INDEX_TO_PTR(1), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - page_uno_offset); - dirty_shared_page->dirty = 1; - dirty_shared_page->size = page_uno_size; - - clean_shared_page = mmap( - INDEX_TO_PTR(2), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - page_dos_offset); - clean_shared_page->dirty = 0; - clean_shared_page->size = page_dos_size; - - // Test 1: don't copy if shared page already dirty - assert(0 == _pma_copy_shared_page(dirty_shared_page)); - - // Test 2: fail if a new dpage couldn't be acquired - _pma_state->metadata->dpage_cache->size = 0; - _pma_state->metadata->dpage_cache->head = 0; - _pma_state->metadata->dpage_cache->tail = 0; - assert(-1 == _pma_copy_shared_page(clean_shared_page)); - - // Test 3: success - _pma_state->snapshot_fd = fd; - _pma_state->metadata->dpage_cache->size = 1; - _pma_state->metadata->dpage_cache->tail = 1; - _pma_state->metadata->dpage_cache->queue[0] = page_tre_offset; - assert(0 == _pma_copy_shared_page(clean_shared_page)); - bytes = 
pread(fd, &data_buffer, 1, (page_uno_offset + 9)); - assert(1 == bytes); - assert(10 == data_buffer); - bytes = pread(fd, &data_buffer, 1, (page_dos_offset + 9)); - assert(1 == bytes); - assert(20 == data_buffer); - bytes = pread(fd, &data_buffer, 1, (page_tre_offset + 9)); - assert(1 == bytes); - assert(20 == data_buffer); - - // Clean up - free(_pma_state->page_directory.entries); - munmap(PMA_SNAPSHOT_ADDR, init_size); - _pma_state_free(); - _clean_up_test_snapshot(fd, filename); -} - -void -test_pma_free_bytes(void) { - PMASharedPageHeader *shared_page_16; - PMASharedPageHeader *shared_page_64; - PMASharedPageHeader *shared_page_256; - const uint64_t init_size = 3 * PMA_PAGE_SIZE; - const uint64_t page_uno_offset = 0; - const uint64_t page_dos_offset = PMA_PAGE_SIZE; - const uint64_t page_tre_offset = (2 * PMA_PAGE_SIZE); - const uint8_t page_uno_size = 4; - const uint8_t page_dos_size = 6; - const uint8_t page_tre_size = 8; - int fd = 0; - int ret; - char *filename = NULL; - - // Set up backing file - fd = _generate_test_snapshot(&filename); - assert(12287 == lseek(fd, (init_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - _pma_state->metadata->snapshot_size = init_size; - - // Set up shared pages - shared_page_16 = mmap( - INDEX_TO_PTR(0), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - page_uno_offset); - shared_page_16->dirty = 1; - shared_page_16->size = page_uno_size; - shared_page_16->free = 0; - for (int i = 0; i < PMA_BITMAP_SIZE; ++i) { - shared_page_16->bits[i] = 0; - } - - shared_page_64 = mmap( - INDEX_TO_PTR(1), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - page_dos_offset); - shared_page_64->dirty = 1; - shared_page_64->size = page_dos_size; - shared_page_64->free = 0; - for (int i = 0; i < PMA_BITMAP_SIZE; ++i) { - shared_page_64->bits[i] = 0; - } - - shared_page_256 = mmap( - INDEX_TO_PTR(2), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - page_tre_offset); - shared_page_256->dirty = 1; - shared_page_256->size = page_tre_size; - shared_page_256->free = 0; - for (int i = 0; i < PMA_BITMAP_SIZE; ++i) { - shared_page_256->bits[i] = 0; - } - - // Test 1: free slot 0 of shared page with slot size 16 - ret = _pma_free_bytes((char*)shared_page_16 + sizeof(PMASharedPageHeader)); - assert(0 == ret); - assert(1 == shared_page_16->free); - assert(0x01 == shared_page_16->bits[0]); - - // Test 2: free slot 8 of shared page with slot size 64 - ret = _pma_free_bytes((char*)shared_page_64 + sizeof(PMASharedPageHeader) + 448); - assert(0 == ret); - assert(1 == shared_page_64->free); - assert(0x80 == shared_page_64->bits[0]); - - // Test 3: free slot 15 of shared page with slot size 256 - ret = _pma_free_bytes((char*)shared_page_256 + sizeof(PMASharedPageHeader) + 3584); - assert(0 == ret); - assert(1 == shared_page_256->free); - assert(0x40 == shared_page_256->bits[1]); - - // Test 4: failure when freeing an already free slot - ret = _pma_free_bytes((char*)shared_page_16 + sizeof(PMASharedPageHeader)); - assert(-1 == ret); - - // Clean up - munmap(PMA_SNAPSHOT_ADDR, init_size); - _pma_state_free(); - _clean_up_test_snapshot(fd, filename); -} - -void -test_pma_free_pages(void) { - const uint64_t init_size = 3 * PMA_PAGE_SIZE; - const uint64_t solo_page_offset = 0; - const uint64_t duo_page_offset = PMA_PAGE_SIZE; - int fd = 0; - char *filename = NULL; - void *solo_page; - void *duo_page; - - 
// Set up backing file - fd = _generate_test_snapshot(&filename); - assert(12287 == lseek(fd, (init_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - _pma_state->metadata->snapshot_size = init_size; - _pma_state->page_directory.entries = calloc(3, sizeof(PMAPageDirEntry)); - _pma_state->page_directory.entries[0].status = FIRST; - _pma_state->page_directory.entries[0].offset = solo_page_offset; - _pma_state->page_directory.entries[1].status = FIRST; - _pma_state->page_directory.entries[1].offset = duo_page_offset; - _pma_state->page_directory.entries[2].status = FOLLOW; - _pma_state->page_directory.entries[2].offset = duo_page_offset + PMA_PAGE_SIZE; - - // Set up pages - solo_page = mmap( - INDEX_TO_PTR(0), - PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - solo_page_offset); - - duo_page = mmap( - INDEX_TO_PTR(1), - 2 * PMA_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, - fd, - duo_page_offset); - - // Test 1: fail when pointing to middle of page - assert(-1 == _pma_free_pages(solo_page + 1)); - - // Test 2: free single page allocation - assert(0 == _pma_free_pages(solo_page)); - - // test 3: free multi-page allocation - assert(0 == _pma_free_pages(duo_page)); - - // Clean up - munmap(PMA_SNAPSHOT_ADDR, init_size); - free(_pma_state->page_directory.entries); - _pma_state_free(); - _clean_up_test_snapshot(fd, filename); -} - -void -test_pma_get_new_pages(void) { - const uint64_t init_size = PMA_PAGE_SIZE; - const uint64_t num_pages = 2; - int fd = 0; - char *filename = NULL; - void* const address = PMA_SNAPSHOT_ADDR + PMA_PAGE_SIZE; - void* const arena_end = address + (2 * PMA_PAGE_SIZE); - - // Set up backing file - fd = _generate_test_snapshot(&filename); - assert(4095 == lseek(fd, (init_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - - // Set up state - _pma_state_malloc(); - _pma_state->snapshot_fd = fd; - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - _pma_state->metadata->arena_end = address; - _pma_state->metadata->snapshot_size = init_size; - _pma_state->metadata->next_offset = init_size; - - // Test 1: allocate new pages - assert(address == _pma_get_new_pages(num_pages)); - assert(12288 == _pma_state->metadata->next_offset); - assert(arena_end == _pma_state->metadata->arena_end); - - // Clean Up - munmap(address, num_pages * PMA_PAGE_SIZE); - _pma_state_free(); - _clean_up_test_snapshot(fd, filename); -} - -void -test_pma_get_new_page(void) { - const uint64_t init_size = 2 * PMA_PAGE_SIZE; - const uint64_t init_offset = PMA_PAGE_SIZE; - int fd = 0; - char *filename = NULL; - void* const address = PMA_SNAPSHOT_ADDR; - void* const arena_end = address + PMA_PAGE_SIZE; - - // Set up backing file - fd = _generate_test_snapshot(&filename); - assert(8191 == lseek(fd, (init_size - 1), SEEK_SET)); - assert(1 == write(fd, "", 1)); - - // Set up state - _pma_state_malloc(); - _pma_state->snapshot_fd = fd; - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - _pma_state->metadata->arena_end = address; - _pma_state->metadata->snapshot_size = init_size; - _pma_state->metadata->next_offset = init_offset; - - _pma_state->metadata->dpage_cache = calloc(1, sizeof(PMADPageCache)); - _pma_state->metadata->dpage_cache->size = 0; - - // Test 1: allocate new pages - assert(address == _pma_get_new_page(FIRST)); - assert(8192 == _pma_state->metadata->next_offset); - assert(arena_end == _pma_state->metadata->arena_end); - - // Clean 
Up - munmap(address, PMA_PAGE_SIZE); - free(_pma_state->metadata->dpage_cache); - _pma_state_free(); - _clean_up_test_snapshot(fd, filename); -} - -void -test_pma_get_cached_pages(void) { - PMAPageRunCache *test_0_cache; - PMAPageRunCache *test_1_cache; - PMAPageRunCache *test_2_cache; - PMAPageRunCache *test_3_cache; - PMAPageRunCache *test_4_cache; - PMAPageRunCache *test_5_cache; - PMAPageRunCache *wip_ptr; - void *address; - - // Set up state - _pma_state_malloc(); - - // Set up run caches for test - test_0_cache = NULL; - - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x30000; - wip_ptr->length = 6; - wip_ptr->next = NULL; - test_1_cache = wip_ptr; - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x20000; - wip_ptr->length = 5; - wip_ptr->next = test_1_cache; - test_1_cache = wip_ptr; - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x10000; - wip_ptr->length = 4; - wip_ptr->next = test_1_cache; - test_1_cache = wip_ptr; - - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x30000; - wip_ptr->length = 6; - wip_ptr->next = NULL; - test_2_cache = wip_ptr; - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x20000; - wip_ptr->length = 4; - wip_ptr->next = test_2_cache; - test_2_cache = wip_ptr; - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x10000; - wip_ptr->length = 5; - wip_ptr->next = test_2_cache; - test_2_cache = wip_ptr; - - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x30000; - wip_ptr->length = 4; - wip_ptr->next = NULL; - test_3_cache = wip_ptr; - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x20000; - wip_ptr->length = 5; - wip_ptr->next = test_3_cache; - test_3_cache = wip_ptr; - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x10000; - wip_ptr->length = 6; - wip_ptr->next = test_3_cache; - test_3_cache = wip_ptr; - - test_4_cache = calloc(1, sizeof(PMAPageRunCache)); - test_4_cache->page = 0x40000; - test_4_cache->length = 2; - // Invalid pointer; used to confirm that we stop searching when we find exact run - test_4_cache->next = 0x8fffffffffffffff; - - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x50000; - wip_ptr->length = 3; - wip_ptr->next = NULL; - test_5_cache = wip_ptr; - wip_ptr = calloc(1, sizeof(PMAPageRunCache)); - wip_ptr->page = 0x99000; - wip_ptr->length = 1; - wip_ptr->next = test_5_cache; - test_5_cache = wip_ptr; - - // Test 0: page run cache empty - _pma_state->free_page_runs = test_0_cache; - address = _pma_get_cached_pages(2); - assert(NULL == address); - - // Test 1: find run bigger than requested, by two pages, at the very beginning - _pma_state->free_page_runs = test_1_cache; - address = _pma_get_cached_pages(2); - assert(0x10000 == address); - assert(2 == _pma_state->free_page_runs->length); - assert(0x12000 == _pma_state->free_page_runs->page); - assert(5 == _pma_state->free_page_runs->next->length); - assert(0x20000 == _pma_state->free_page_runs->next->page); - assert(6 == _pma_state->free_page_runs->next->next->length); - assert(0x30000 == _pma_state->free_page_runs->next->next->page); - assert(NULL == _pma_state->free_page_runs->next->next->next); - - // Test 2: find run bigger than request, by two pages, in the middle - _pma_state->free_page_runs = test_2_cache; - address = _pma_get_cached_pages(2); - assert(0x20000 == address); - assert(5 == _pma_state->free_page_runs->length); - assert(0x10000 == _pma_state->free_page_runs->page); - assert(2 == 
_pma_state->free_page_runs->next->length); - assert(0x22000 == _pma_state->free_page_runs->next->page); - assert(6 == _pma_state->free_page_runs->next->next->length); - assert(0x30000 == _pma_state->free_page_runs->next->next->page); - assert(NULL == _pma_state->free_page_runs->next->next->next); - - // Test 3: find run bigger than requested, by two pages, at the very end - _pma_state->free_page_runs = test_3_cache; - address = _pma_get_cached_pages(2); - assert(0x30000 == address); - assert(6 == _pma_state->free_page_runs->length); - assert(0x10000 == _pma_state->free_page_runs->page); - assert(5 == _pma_state->free_page_runs->next->length); - assert(0x20000 == _pma_state->free_page_runs->next->page); - assert(2 == _pma_state->free_page_runs->next->next->length); - assert(0x32000 == _pma_state->free_page_runs->next->next->page); - assert(NULL == _pma_state->free_page_runs->next->next->next); - - // Test 4: find exactly sized run, as only entry in cache, and stop looking - _pma_state->free_page_runs = test_4_cache; - address = _pma_get_cached_pages(2); - assert(0x40000 == address); - assert(0x8fffffffffffffff == _pma_state->free_page_runs); - - // Test 5: find run bigger than request, by a single page - _pma_state->free_page_runs = test_5_cache; - address = _pma_get_cached_pages(2); - assert(0x50000 == address); - assert(1 == _pma_state->free_page_runs->length); - assert(0x99000 == _pma_state->free_page_runs->page); - assert(NULL == _pma_state->free_page_runs->next); - assert(0x52000 == _pma_state->free_pages->page); - assert(NULL == _pma_state->free_pages->next); - - // Clean up - while (test_1_cache != NULL) { - wip_ptr = test_1_cache; - test_1_cache = test_1_cache->next; - free(wip_ptr); - } - while (test_2_cache != NULL) { - wip_ptr = test_2_cache; - test_2_cache = test_2_cache->next; - free(wip_ptr); - } - while (test_3_cache != NULL) { - wip_ptr = test_3_cache; - test_3_cache = test_3_cache->next; - free(wip_ptr); - } - free(_pma_state->free_pages); - free(_pma_state->free_page_runs); - _pma_state_free(); -} - -void -test_pma_malloc_single_page(void) { - PMASinglePageCache *wip_ptr; - - // Set up state - _pma_state_malloc(); - - // Set up free page cache - wip_ptr = calloc(1, sizeof(PMASinglePageCache)); - wip_ptr->page = 0x20000; - wip_ptr->next = NULL; - _pma_state->free_pages = wip_ptr; - wip_ptr = calloc(1, sizeof(PMASinglePageCache)); - wip_ptr->page = 0x10000; - wip_ptr->next = _pma_state->free_pages; - _pma_state->free_pages = wip_ptr; - - // Test 1: get page from free page cache - assert(0x10000 == _pma_malloc_single_page(FIRST)); - assert(0x20000 == _pma_state->free_pages->page); - assert(NULL == _pma_state->free_pages->next); - - // Case when no pages in free page cache tested by test_pma_get_new_page - - // Clean up - free(_pma_state->free_pages); - _pma_state_free(); -} - -void -test_pma_malloc_shared_page(void) { - PMASinglePageCache *free_pages; - PMASinglePageCache *wip_ptr; - const uint64_t mmap_size = 2 * PMA_PAGE_SIZE; - const uint8_t test_1_bucket_size = 0; - const uint8_t test_2_bucket_size = 0; - const uint8_t test_3_bucket_size = 6; - void *shared_pages; - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - _pma_state->metadata->snapshot_size = PMA_PAGE_SIZE; - _pma_state->metadata->next_offset = PMA_PAGE_SIZE; - _pma_state->free_pages = NULL; - - _pma_state->metadata->dpage_cache = calloc(1, sizeof(PMADPageCache)); - _pma_state->metadata->dpage_cache->size = 0; - - // Set up shared pages - shared_pages = mmap( - 
PMA_SNAPSHOT_ADDR, - mmap_size, - PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, - -1, - 0); - assert(MAP_FAILED != shared_pages); - - // Set up free page cache - wip_ptr = calloc(1, sizeof(PMASinglePageCache)); - wip_ptr->page = (shared_pages + PMA_PAGE_SIZE); - wip_ptr->next = NULL; - free_pages = wip_ptr; - wip_ptr = calloc(1, sizeof(PMASinglePageCache)); - wip_ptr->page = shared_pages; - wip_ptr->next = free_pages; - free_pages = wip_ptr; - - // Test 1: could not allocate page - assert(-1 == _pma_malloc_shared_page(test_1_bucket_size)); - - // Test 2: 16 byte slots - _pma_state->free_pages = free_pages; - assert(0 == _pma_malloc_shared_page(test_2_bucket_size)); - assert(NULL != _pma_state->metadata->shared_pages[test_2_bucket_size]); - assert(1 == _pma_state->metadata->shared_pages[test_2_bucket_size]->dirty); - assert(4 == _pma_state->metadata->shared_pages[test_2_bucket_size]->size); - assert(253 == _pma_state->metadata->shared_pages[test_2_bucket_size]->free); - for (uint8_t i = 0; i < PMA_BITMAP_SIZE; ++i) { - assert(PMA_EMPTY_BITMAP == _pma_state->metadata->shared_pages[test_2_bucket_size]->bits[i]); - } - assert(NULL != _pma_state->free_pages); - assert((shared_pages + PMA_PAGE_SIZE) == _pma_state->free_pages->page); - assert(NULL == _pma_state->free_pages->next); - - // Test 3: 1024 byte slots - assert(0 == _pma_malloc_shared_page(test_3_bucket_size)); - assert(NULL != _pma_state->metadata->shared_pages[test_3_bucket_size]); - assert(1 == _pma_state->metadata->shared_pages[test_3_bucket_size]->dirty); - assert(10 == _pma_state->metadata->shared_pages[test_3_bucket_size]->size); - assert(3 == _pma_state->metadata->shared_pages[test_3_bucket_size]->free); - for (uint8_t i = 0; i < PMA_BITMAP_SIZE; ++i) { - assert(PMA_EMPTY_BITMAP == _pma_state->metadata->shared_pages[test_3_bucket_size]->bits[i]); - } - assert(NULL == _pma_state->free_pages); - - // Clean up - munmap(shared_pages, mmap_size); - _pma_state_free(); -} - -void -test_pma_update_free_pages(void) { - PMADirtyPageEntry test_1_dirty_pages[2]; - PMADirtyPageEntry test_2_dirty_page; - PMADirtyPageEntry test_3_dirty_page; - - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = PMA_SNAPSHOT_ADDR; - - // Set up dirty pages - test_1_dirty_pages[0].index = 1; - test_1_dirty_pages[0].num_pages = 1; - test_1_dirty_pages[0].status = SHARED; - test_1_dirty_pages[1].index = 1; - test_1_dirty_pages[1].num_pages = 2; - test_1_dirty_pages[1].status = FIRST; - - test_2_dirty_page.index = 2; - test_2_dirty_page.num_pages = 1; - test_2_dirty_page.status = FREE; - - test_3_dirty_page.index = 3; - test_3_dirty_page.num_pages = 2; - test_3_dirty_page.status = FREE; - - // Test 1: all dirty pages have non-free status - assert(0 == _pma_update_free_pages(2, test_1_dirty_pages)); - assert(NULL == _pma_state->free_pages); - assert(NULL == _pma_state->free_page_runs); - - // Test 2: add single page to free page cache - assert(0 == _pma_update_free_pages(1, &test_2_dirty_page)); - assert(NULL != _pma_state->free_pages); - assert(INDEX_TO_PTR(2) == _pma_state->free_pages->page); - assert(NULL == _pma_state->free_pages->next); - - // Test 3: add multiple free pages to free page runs cache - assert(0 == _pma_update_free_pages(1, &test_3_dirty_page)); - assert(NULL != _pma_state->free_page_runs); - assert(INDEX_TO_PTR(3) == _pma_state->free_page_runs->page); - assert(2 == _pma_state->free_page_runs->length); - assert(NULL == _pma_state->free_page_runs->next); - - // Clean up - free(_pma_state->free_pages); - 
free(_pma_state->free_page_runs); - _pma_state_free(); -} - -void -test_pma_verify_checksum(void) { - PMAMetadata fake_metadata_page; - - // Set up state - _pma_state_malloc(); - - // Test 1: good checksum - fake_metadata_page.checksum = 0; - fake_metadata_page.checksum = crc_32( - (unsigned char *)(&fake_metadata_page), - PMA_PAGE_SIZE); - assert(1 == _pma_verify_checksum(&fake_metadata_page)); - - // Test 2: bad checksum - fake_metadata_page.checksum = 0xbaddecaf; - assert(0 == _pma_verify_checksum(&fake_metadata_page)); - - // Clean up - _pma_state_free(); -} - -void -test_pma_in_arena(void) { - // Set up state - _pma_state_malloc(); - _pma_state->metadata->arena_start = 0x7fffffff; - _pma_state->metadata->arena_end = 0x80000001; - - // Test 1: before arena start - assert(0 == pma_in_arena(0x10000000)); - - // Test 2: equal to arena start - assert(1 == pma_in_arena(0x7fffffff)); - - // Test 3: in arena - assert(1 == pma_in_arena(0x80000000)); - - // Test 4: equal to arena end - assert(0 == pma_in_arena(0x80000001)); - - // Test 5: after arena end - assert(0 == pma_in_arena(0xffffffff)); - - // Clean up - _pma_state_free(); -} - -void -test_pma_init(void) { - struct stat page_dir_statbuf; - struct stat page_dir_statbuf_v; - struct stat snapshot_statbuf; - struct stat snapshot_statbuf_v; - size_t dir_len; - uint32_t checksum; - char *page_dir_path; - char *snapshot_path; - - // Set up - dir_len = strlen(_test_state->dir); - - page_dir_path = malloc(dir_len + 15); - sprintf(page_dir_path, "%s/%s/%s", _test_state->dir, PMA_DEFAULT_DIR_NAME, PMA_PAGE_DIR_FILENAME); - - snapshot_path = malloc(dir_len + 15); - sprintf(snapshot_path, "%s/%s/%s", _test_state->dir, PMA_DEFAULT_DIR_NAME, PMA_SNAPSHOT_FILENAME); - - // Test 1: successful initialization - assert(0 == pma_init(_test_state->dir)); - - fstat(_pma_state->page_dir_fd, &page_dir_statbuf); - stat(page_dir_path, &page_dir_statbuf_v); - assert(page_dir_statbuf_v.st_dev == page_dir_statbuf.st_dev); - assert(page_dir_statbuf_v.st_ino == page_dir_statbuf.st_ino); - - fstat(_pma_state->snapshot_fd, &snapshot_statbuf); - stat(snapshot_path, &snapshot_statbuf_v); - assert(snapshot_statbuf_v.st_dev == snapshot_statbuf.st_dev); - assert(snapshot_statbuf_v.st_ino == snapshot_statbuf.st_ino); - - assert(0x400000 == page_dir_statbuf.st_size); - assert(0x40000000 == snapshot_statbuf.st_size); - - assert(NULL == _pma_state->free_pages); - assert(NULL == _pma_state->free_page_runs); - assert(0 == _pma_state->meta_page_offset); - - assert(0x400000 == _pma_state->page_directory.size); - assert(1 == _pma_state->page_directory.next_index); - assert(FIRST == _pma_state->page_directory.entries[0].status); - assert(8192 == _pma_state->page_directory.entries[0].offset); - - assert(0xBADDECAFC0FFEE00 == _pma_state->metadata->magic_code); - assert(1 == _pma_state->metadata->version); - assert(0 == _pma_state->metadata->epoch); - assert(0 == _pma_state->metadata->event); - assert(0 == _pma_state->metadata->root); - assert(0x10000 == _pma_state->metadata->arena_start); - assert(0x11000 == _pma_state->metadata->arena_end); - assert(12288 == _pma_state->metadata->next_offset); - assert(0x10000 == _pma_state->metadata->dpage_cache); - assert(0 == _pma_state->metadata->dpage_cache->dirty); - assert(0 == _pma_state->metadata->dpage_cache->size); - assert(0 == _pma_state->metadata->dpage_cache->head); - assert(0 == _pma_state->metadata->dpage_cache->tail); - assert(0 == _pma_state->metadata->num_dirty_pages); - assert(0 == _pma_state->metadata->dirty_pages[0].index); - 
assert(0 == _pma_state->metadata->dirty_pages[0].offset); - assert(0 == _pma_state->metadata->dirty_pages[0].num_pages); - - checksum = _pma_state->metadata->checksum; - _pma_state->metadata->checksum = 0; - assert(checksum == crc_32((unsigned char*)_pma_state->metadata, PMA_PAGE_SIZE)); - - // Clean up - munmap(_pma_state->metadata->arena_start, _pma_state->metadata->snapshot_size); - munmap(_pma_state->page_directory.entries, PMA_MAXIMUM_DIR_SIZE); - - _pma_state_free(); - - unlink(snapshot_path); - free(snapshot_path); - - unlink(page_dir_path); - free(page_dir_path); -} - -void -test_pma_sync(void) { - PMAMetadata *metadata_page_1; - PMAMetadata *metadata_page_2; - PMASharedPageHeader *shared_page_16b; - size_t dir_len; - char *page_dir_path; - char *snapshot_path; - - // Set up - dir_len = strlen(_test_state->dir); - - page_dir_path = malloc(dir_len + 15); - sprintf(page_dir_path, "%s/%s/%s", _test_state->dir, PMA_DEFAULT_DIR_NAME, PMA_PAGE_DIR_FILENAME); - - snapshot_path = malloc(dir_len + 15); - sprintf(snapshot_path, "%s/%s/%s", _test_state->dir, PMA_DEFAULT_DIR_NAME, PMA_SNAPSHOT_FILENAME); - - pma_init(_test_state->dir); - _pma_state->metadata->epoch = 1; - _pma_state->metadata->event = 1; - - // Test 1: good event, bad epoch - assert(-1 == pma_sync(0, 2, 0)); - - // Test 2: good epoch, bad event - assert(-1 == pma_sync(1, 0, 0)); - - // Test 3: successful sync - _pma_state->metadata->epoch = 0; - _pma_state->metadata->event = 0; - - pma_malloc(16); - assert(1 == _pma_state->metadata->num_dirty_pages); - - assert(0 == pma_sync(1, 2, 3)); - assert(1 == _pma_state->metadata->epoch); - assert(2 == _pma_state->metadata->event); - assert(3 == _pma_state->metadata->root); - assert(0x12000 == _pma_state->metadata->arena_end); - assert(0x11000 == _pma_state->metadata->shared_pages[0]); - assert(NULL == _pma_state->metadata->shared_pages[1]); - assert(NULL == _pma_state->metadata->shared_pages[2]); - assert(NULL == _pma_state->metadata->shared_pages[3]); - assert(NULL == _pma_state->metadata->shared_pages[4]); - assert(NULL == _pma_state->metadata->shared_pages[5]); - assert(NULL == _pma_state->metadata->shared_pages[6]); - assert(0x10000 == _pma_state->metadata->dpage_cache); - assert(0 == _pma_state->metadata->num_dirty_pages); - assert(16384 == _pma_state->metadata->next_offset); - - metadata_page_1 = mmap( - NULL, - PMA_PAGE_SIZE, - PROT_READ, - MAP_SHARED, - _pma_state->snapshot_fd, - 0); - metadata_page_2 = mmap( - NULL, - PMA_PAGE_SIZE, - PROT_READ, - MAP_SHARED, - _pma_state->snapshot_fd, - 4096); - shared_page_16b = mmap( - NULL, - PMA_PAGE_SIZE, - PROT_READ, - MAP_SHARED, - _pma_state->snapshot_fd, - 12288); - - assert(metadata_page_1->magic_code == _pma_state->metadata->magic_code); - assert(metadata_page_1->checksum == _pma_state->metadata->checksum); - assert(metadata_page_1->version == _pma_state->metadata->version); - assert(metadata_page_1->epoch == _pma_state->metadata->epoch); - assert(metadata_page_1->event == _pma_state->metadata->event); - assert(metadata_page_1->root == _pma_state->metadata->root); - assert(metadata_page_1->arena_start == _pma_state->metadata->arena_start); - assert(metadata_page_1->arena_end == _pma_state->metadata->arena_end); - assert(metadata_page_1->dpage_cache == _pma_state->metadata->dpage_cache); - assert(metadata_page_1->snapshot_size == _pma_state->metadata->snapshot_size); - assert(metadata_page_1->next_offset == _pma_state->metadata->next_offset); - - assert(1 == metadata_page_1->num_dirty_pages); - assert(1 == 
metadata_page_1->dirty_pages[0].index); - assert(12288 == metadata_page_1->dirty_pages[0].offset); - assert(1 == metadata_page_1->dirty_pages[0].num_pages); - assert(SHARED == metadata_page_1->dirty_pages[0].status); - - assert(0 == metadata_page_2->epoch); - assert(0 == metadata_page_2->event); - assert(0 == metadata_page_2->root); - assert(0x11000 == metadata_page_2->arena_end); - assert(NULL == metadata_page_2->shared_pages[0]); - assert(NULL == metadata_page_2->shared_pages[1]); - assert(NULL == metadata_page_2->shared_pages[2]); - assert(NULL == metadata_page_2->shared_pages[3]); - assert(NULL == metadata_page_2->shared_pages[4]); - assert(NULL == metadata_page_2->shared_pages[5]); - assert(NULL == metadata_page_2->shared_pages[6]); - assert(0x10000 == metadata_page_2->dpage_cache); - assert(0 == metadata_page_2->num_dirty_pages); - assert(12288 == metadata_page_2->next_offset); - - assert(NULL == shared_page_16b->next); - assert(0 == shared_page_16b->dirty); - assert(4 == shared_page_16b->size); - assert(252 == shared_page_16b->free); - - // Clean up - munmap(metadata_page_1, PMA_PAGE_SIZE); - munmap(metadata_page_2, PMA_PAGE_SIZE); - - munmap(_pma_state->metadata->arena_start, _pma_state->metadata->snapshot_size); - munmap(_pma_state->page_directory.entries, PMA_MAXIMUM_DIR_SIZE); - - _pma_state_free(); - - unlink(snapshot_path); - free(snapshot_path); - - unlink(page_dir_path); - free(page_dir_path); -} - -void -test_pma_load(void) { - PMARootState res; - size_t dir_len; - const uint64_t bad_code = 0x600DDECAFC0FFEE0; - const uint64_t old_event = 0; - const uint32_t bad_checksum = 0; - const uint32_t bad_version = 1337; - int snapshot_fd; - char *bin_path; - char *page_dir_path; - char *snapshot_path; - - // Set up - dir_len = strlen(_test_state->dir); - - bin_path = malloc(dir_len + 6); - sprintf(bin_path, "%s/%s", _test_state->dir, PMA_DEFAULT_DIR_NAME); - - page_dir_path = malloc(dir_len + 15); - sprintf(page_dir_path, "%s/%s", bin_path, PMA_PAGE_DIR_FILENAME); - - snapshot_path = malloc(dir_len + 15); - sprintf(snapshot_path, "%s/%s", bin_path, PMA_SNAPSHOT_FILENAME); - - // Test 1: dir doesn't exist - rmdir(bin_path); - res = pma_load(_test_state->dir); - assert(0 == res.epoch); - assert(0 == res.event); - assert(0 == res.root); - assert(2 == errno); - errno = 0; - - // Test 2: snapshot doesn't exist - pma_init(_test_state->dir); - assert(0 == pma_close(0, 1, 0)); - unlink(snapshot_path); - - res = pma_load(_test_state->dir); - assert(0 == res.epoch); - assert(0 == res.event); - assert(0 == res.root); - assert(2 == errno); - - errno = 0; - _pma_state_free(); - unlink(page_dir_path); - - // Test 3: page directory doesn't exist - pma_init(_test_state->dir); - assert(0 == pma_close(0, 1, 0)); - unlink(page_dir_path); - - res = pma_load(_test_state->dir); - assert(0 == res.epoch); - assert(0 == res.event); - assert(0 == res.root); - assert(2 == errno); - - errno = 0; - _pma_state_free(); - unlink(snapshot_path); - - // Test 4: bad magic code - pma_init(_test_state->dir); - assert(0 == pma_close(0, 1, 0)); - snapshot_fd = open(snapshot_path, PMA_NEW_FILE_FLAGS, PMA_FILE_PERMISSIONS); - assert(0 < snapshot_fd); - pwrite(snapshot_fd, &bad_code, 8, 0); - - res = pma_load(_test_state->dir); - assert(0 == res.epoch); - assert(0 == res.event); - assert(0 == res.root); - assert(EILSEQ == errno); - - errno = 0; - close(snapshot_fd); - unlink(snapshot_path); - unlink(page_dir_path); - - // Test 5: bad version - pma_init(_test_state->dir); - assert(0 == pma_close(0, 1, 0)); - snapshot_fd = 
open(snapshot_path, PMA_NEW_FILE_FLAGS, PMA_FILE_PERMISSIONS); - assert(0 < snapshot_fd); - pwrite(snapshot_fd, &bad_version, 4, 12); - - res = pma_load(_test_state->dir); - assert(0 == res.epoch); - assert(0 == res.event); - assert(0 == res.root); - assert(EILSEQ == errno); - - errno = 0; - close(snapshot_fd); - unlink(snapshot_path); - unlink(page_dir_path); - - // Test 6: both metadata pages have invalid checksum - pma_init(_test_state->dir); - assert(0 == pma_close(0, 1, 0)); - snapshot_fd = open(snapshot_path, PMA_NEW_FILE_FLAGS, PMA_FILE_PERMISSIONS); - assert(0 < snapshot_fd); - pwrite(snapshot_fd, &bad_checksum, 4, 8); - pwrite(snapshot_fd, &bad_checksum, 4, (PMA_PAGE_SIZE + 8)); - - res = pma_load(_test_state->dir); - assert(0 == res.epoch); - assert(0 == res.event); - assert(0 == res.root); - assert(EILSEQ == errno); - - errno = 0; - close(snapshot_fd); - unlink(snapshot_path); - unlink(page_dir_path); - - // Test 7: first metadata page is newer but has bad checksum - pma_init(_test_state->dir); - assert(0 == pma_close(1, 2, 3)); - snapshot_fd = open(snapshot_path, PMA_NEW_FILE_FLAGS, PMA_FILE_PERMISSIONS); - assert(0 < snapshot_fd); - pwrite(snapshot_fd, &bad_checksum, 4, 8); - pwrite(snapshot_fd, &old_event, 8, (PMA_PAGE_SIZE + 24)); - - res = pma_load(_test_state->dir); - assert(0 == res.epoch); - assert(0 == res.event); - assert(0 == res.root); - assert(0 == _pma_state->meta_page_offset); - - assert(0 == pma_close(4, 4, 4)); - close(snapshot_fd); - unlink(snapshot_path); - unlink(page_dir_path); - - // Test 8: second metadata page is newer - pma_init(_test_state->dir); - assert(0 == pma_sync(1, 2, 3)); - assert(0 == pma_close(4, 5, 6)); - - res = pma_load(_test_state->dir); - assert(4 == res.epoch); - assert(5 == res.event); - assert(6 == res.root); - assert(0 == _pma_state->meta_page_offset); - - assert(0 == pma_close(7, 8, 9)); - close(snapshot_fd); - unlink(snapshot_path); - unlink(page_dir_path); - - // Clean up - free(bin_path); - free(snapshot_path); - free(page_dir_path); -} From bbe034e3827219eceecea7aad22b8bc5432287f0 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Mon, 4 Dec 2023 19:50:20 -0500 Subject: [PATCH 031/128] pma: _mlist_insert --- rust/ares_pma/c-src/btree.c | 47 ++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 18a6bbc..f74e2ca 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -787,6 +787,51 @@ _bt_delco_1pass(BT_state *state, vaof_t lo, vaof_t hi) return _bt_delco_1pass_0(state, lo, hi, root, 1, meta->depth); } +static void +_mlist_insert(BT_state *state, void *lo, void *hi) +{ + BT_mlistnode *head = state->mlist; + BYTE *lob = lo; + BYTE *hib = hi; + + assert(head); + + while (head->next) { + BYTE *vob = head->va; + size_t siz = head->sz; + BYTE *nob = head->next->va; + + /* freed chunk immediately precedes head */ + if (hi == vob) { + head->va = lo; + head->sz += (hib - lob); + return; + } + /* freed chunk immediately follows termination of head */ + if (vob + siz == lo) { + head->sz += (hib - lob); + return; + } + /* freed chunk between head and next but not contiguous */ + if (lob > vob + siz + && hib < nob) { + BT_mlistnode *new = calloc(1, sizeof *new); + new->sz = (hib - lob); + new->va = lob; + new->next = head->next; + head->next = new; + return; + } + head = head->next; + } + /* freelist completely searched. 
Chunk must be at tail and not contiguous */ + BT_mlistnode *new = calloc(1, sizeof *new); + new->sz = (hib - lob); + new->va = lob; + new->next = head->next; + head->next = new; +} + static void _pending_nlist_insert(BT_state *state, pgno_t nodepg) { @@ -2488,7 +2533,7 @@ bt_free(BT_state *state, void *lo, void *hi) vaof_t looff = addr2off(lo); vaof_t hioff = addr2off(hi); _bt_insert(state, looff, hioff, 0); - /* ;;: and now add freespace to state->flist. coalescing when you do so */ + _mlist_insert(state, lo, hi); } int From d1ee3f673dc5391e227b539d514ef5d000e5a8a0 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Mon, 4 Dec 2023 20:13:33 -0500 Subject: [PATCH 032/128] pma: misc cleans up code. clearing comments and unnecessary struct members also fixed lowidx calculation bug in deletion coalescing routines --- rust/ares_pma/c-src/btree.c | 76 +++++++++++-------------------------- 1 file changed, 23 insertions(+), 53 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index f74e2ca..65967f4 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -219,8 +219,8 @@ static_assert(BT_DAT_MAXENTRIES % 2 == 0); */ typedef struct BT_page BT_page; struct BT_page { - BT_pageheader head; /* ;;: TODO remove header and store all header data in BT_meta */ - union { /* data section */ + BT_pageheader head; /* header */ + union { /* data section */ BT_dat datd[BT_DAT_MAXENTRIES]; /* union view */ BT_kv datk[0]; /* struct view */ BYTE datc[0]; /* byte-level view */ @@ -260,7 +260,6 @@ struct BT_meta { uint8_t blk_cnt; /* currently highest valid block base */ uint8_t depth; /* tree depth */ -/* #define BP_DIRTY ((uint8_t)0x01) /\* ;;: TODO remove dirty flag *\/ */ #define BP_META ((uint8_t)0x02) uint8_t flags; uint8_t _pad1; @@ -302,12 +301,9 @@ struct BT_flistnode { typedef struct BT_state BT_state; struct BT_state { - uint16_t flags; /* ;;: rem */ int data_fd; int meta_fd; /* ;;: confident can be removed because we're not explicitly calling write() */ char *path; - ULONG branch_page_cnt; /* ;;: rem */ - ULONG leaf_page_cnt; /* ;;: rem */ void *fixaddr; BYTE *map; BT_page *node_freelist; @@ -322,17 +318,6 @@ struct BT_state { BT_nlistnode *nlist; /* node freelist */ BT_mlistnode *mlist; /* memory freelist */ BT_flistnode *flist; /* pma file freelist */ - /* ;;: for deletion coalescing: - - when freeing data, push onto the pending flist and mlist. When pushing onto - the mlist, you can preemptively coalesce. You don't need to coalesce at all - in the pending flist. - - when inserting and coalescing, if you can free a node then push onto the - pending nlist - - */ - BT_flistnode *pending_flist; BT_nlistnode *pending_nlist; }; @@ -402,10 +387,6 @@ _node_alloc(BT_state *state) the striped node partitions. Since this is unimplemented, just allocating space from first 2M */ - /* ;;: when node freelist is implemented, will we need to return the file - offset of the node as well? This is important for splitting where we - allocate a new node and need to store its file offset in the parent's - data index */ size_t width = (BYTE *)state->node_freelist - state->map; assert(width < MBYTES(2)); /* ;;: todo confirm data sections are zeroed */ @@ -445,15 +426,12 @@ _bt_nalloc(BT_state *state) } } -/* ;;: from our usage, _node_cow no longer needs to take indirect pointer to - newnode. 
We don't ever do anything with it */ static int -_node_cow(BT_state *state, BT_page *node, BT_page **newnode, pgno_t *pgno) +_node_cow(BT_state *state, BT_page *node, pgno_t *pgno) { BT_page *ret = _node_alloc(state); memcpy(ret->datk, node->datk, sizeof node->datk[0] * BT_DAT_MAXENTRIES); *pgno = _fo_get(state, ret); - *newnode = ret; return BT_SUCC; } @@ -627,7 +605,8 @@ static int _bt_split_child(BT_state *state, BT_page *parent, size_t i, pgno_t *newchild) { /* ;;: todo: better error handling */ - /* ;;: todo: assert parent and left is dirty */ + assert(_bt_ischilddirty(parent, i)); + int rc = BT_SUCC; size_t N; BT_page *left = _node_get(state, parent->datk[i].fo); @@ -639,7 +618,6 @@ _bt_split_child(BT_state *state, BT_page *parent, size_t i, pgno_t *newchild) /* adjust high address of left node in parent */ N = _bt_numkeys(left); - /* parent->datk[i+1].va = left->datk[N-1].va; /\* ;;: is this necessary? *\/ */ /* insert reference to right child into parent node */ N = _bt_numkeys(right); @@ -659,8 +637,6 @@ _bt_split_child(BT_state *state, BT_page *parent, size_t i, pgno_t *newchild) return BT_SUCC; } -/* ;;: since we won't be rebalancing on delete, but rather on insert, you should add rebalance logic to _bt_insert2 which checks the degree of a node and rebalances if less than minimum */ - static int _bt_rebalance(BT_state *state, BT_page *node) { @@ -727,22 +703,22 @@ _bt_delco_1pass_0(BT_state *state, vaof_t lo, vaof_t hi, { /* Perform a dfs search on all ranges that fall within lo and hi */ - /* ;;: we can't use bt_childidx because the range of lo-hi may overlap ofc */ + size_t N = _bt_numkeys(node); size_t loidx = 0; size_t hiidx = 0; /* first find the entry that matches lo */ size_t i; - for (i = 0; i < BT_DAT_MAXKEYS-1; i++) { - vaof_t llo = node->datk[i].va; - if (llo <= lo) { + for (i = 0; i < N-1; i++) { + vaof_t hhi = node->datk[i+1].va; + if (hhi > lo) { loidx = i; break; } } /* and then the entry that matches hi */ - for (; i < BT_DAT_MAXKEYS-1; i++) { + for (; i < N-1; i++) { vaof_t hhi = node->datk[i].va; if (hhi >= hi) { hiidx = hi; @@ -1116,13 +1092,14 @@ _bt_delco_trim_lsubtree_rhs2(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg, uint8_t depth, uint8_t maxdepth) { BT_page *node = _node_get(state, nodepg); + size_t N = _bt_numkeys(node); size_t loidx = 0; /* find low idx of range */ size_t i; - for (i = 0; i < BT_DAT_MAXKEYS-1; i++) { - vaof_t llo = node->datk[i].va; - if (llo <= lo) { + for (i = 0; i < N-1; i++) { + vaof_t hhi = node->datk[i+1].va; + if (hhi > lo) { loidx = i; break; } @@ -1182,12 +1159,9 @@ _bt_delco(BT_state *state, vaof_t lo, vaof_t hi, pgno_t rsubtree = 0; /* find low idx of range */ - - /* ;;: !!! fixme this is not incorrect. find first hi greater than lo. the lo - of that entry is the loidx */ - for (size_t i = 0; i < BT_DAT_MAXKEYS-1; i++) { - vaof_t llo = node->datk[i].va; - if (llo <= lo) { + for (size_t i = 0; i < N-1; i++) { + vaof_t hhi = node->datk[i+1].va; + if (hhi > lo) { loidx = i; break; } @@ -1218,18 +1192,16 @@ _bt_delco(BT_state *state, vaof_t lo, vaof_t hi, /* ;;: refactor? code duplication?? 
*/ if (!_bt_ischilddirty(node, loidx)) { BT_page *child = _node_get(state, lsubtree); - BT_page *new; pgno_t newpg; - _node_cow(state, child, &new, &newpg); + _node_cow(state, child, &newpg); lsubtree = node->datk[loidx].fo = newpg; _bt_dirtychild(node, loidx); } if (!_bt_ischilddirty(node, hiidx)) { BT_page *child = _node_get(state, rsubtree); - BT_page *new; pgno_t newpg; - _node_cow(state, child, &new, &newpg); + _node_cow(state, child, &newpg); rsubtree = node->datk[hiidx].fo = newpg; _bt_dirtychild(node, hiidx); } @@ -1319,9 +1291,8 @@ _bt_insert2(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo, /* do we need to CoW the child node? */ if (!_bt_ischilddirty(node, childidx)) { - BT_page *newchild; pgno_t pgno; - _node_cow(state, node, &newchild, &pgno); + _node_cow(state, node, &pgno); node->datk[childidx].fo = pgno; _bt_dirtychild(node, childidx); } @@ -1373,9 +1344,8 @@ _bt_insert(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo) if (meta->depth > 1 && !_bt_ischilddirty(root, childidx)) { BT_page *child = _node_get(state, root->datk[childidx].fo); - BT_page *newchild; pgno_t newchildpg; - _node_cow(state, child, &newchild, &newchildpg); + _node_cow(state, child, &newchildpg); root->datk[childidx].fo = newchildpg; _bt_dirtychild(root, childidx); } @@ -2368,10 +2338,10 @@ _bt_sync_meta(BT_state *state) /* CoW a new root since the root referred to by the metapage should always be dirty */ - BT_page *root, *newroot; + BT_page *root; pgno_t newrootpg; root = _node_get(state, newmeta->root); - if (!SUCC(_node_cow(state, root, &newroot, &newrootpg))) + if (!SUCC(_node_cow(state, root, &newrootpg))) abort(); newmeta->root = newrootpg; From 1c88d390e3d533adce94b69e3e37cc9408f86695 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 4 Dec 2023 20:50:07 -0600 Subject: [PATCH 033/128] pma: space_needed for Noun --- rust/ares/src/noun.rs | 5 ++++ rust/ares/src/persist.rs | 54 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/rust/ares/src/noun.rs b/rust/ares/src/noun.rs index 94d21a2..0e4b5af 100644 --- a/rust/ares/src/noun.rs +++ b/rust/ares/src/noun.rs @@ -423,6 +423,11 @@ impl IndirectAtom { unsafe { *(self.to_raw_pointer().add(1)) as usize } } + /** Memory size of an indirect atom (including size + metadata fields) in 64-bit words */ + pub fn raw_size(&self) -> usize { + self.size() + 2 + } + pub fn bit_size(&self) -> usize { unsafe { ((self.size() - 1) << 6) + 64 diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index 99820cc..43ef1e1 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -1,8 +1,9 @@ use crate::jets::cold::Cold; use crate::mem::NockStack; -use crate::noun::Noun; +use crate::noun::{Allocated, CellMemory, Noun}; use ares_pma::*; -use std::ffi::CString; +use either::Either::{Left, Right}; +use std::ffi::{c_void, CString}; use std::mem::size_of; use std::path::PathBuf; @@ -65,6 +66,11 @@ impl PMA { unsafe { bt_meta_set(self.0, field as usize, val) }; } + pub unsafe fn contains(&self, ptr: *const T, count: usize) -> bool { + bt_inbounds(self.0, ptr as *mut c_void) != 0 + && bt_inbounds(self.0, ptr.add(count) as *mut c_void) != 0 + } + pub fn load(&self) -> Snapshot { let snapshot_version = self.meta_get(BTMetaField::SnapshotVersion); @@ -163,9 +169,51 @@ impl Persist for Snapshot { } } +/// Ensure an allocated noun is marked and return if it was already marked +fn mark(a: Allocated) -> bool { + todo!() +} + +/// Unmark an allocated noun +fn unmark_noun(a: Allocated) -> bool { + todo!() +} + 
impl Persist for Noun { unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { - todo!() + let mut space = 0usize; + stack.frame_push(0); + *(stack.push::()) = *self; + loop { + if stack.stack_is_empty() { + break; + } + let noun = *(stack.top::()); + stack.pop::(); + + if let Ok(allocated) = noun.as_allocated() { + // not counting direct atoms, they go in + match allocated.as_either() { + Left(indirect) => { + let count = indirect.raw_size(); + if !pma.contains(indirect.to_raw_pointer(), count) { + if !mark(allocated) { + space += count * size_of::(); + } + } + } + Right(cell) => { + if !pma.contains(cell.to_raw_pointer(), 1) { + if !mark(allocated) { + space += size_of::(); + } + } + } + } + } + } + stack.frame_pop(); + space } unsafe fn copy_to_buffer( From 35301c6e121442b23551159f7e568c6bb7c760cf Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 4 Dec 2023 22:55:32 -0600 Subject: [PATCH 034/128] pma: copy nouns in --- rust/ares/src/persist.rs | 79 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 6 deletions(-) diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index 43ef1e1..b1dfe7d 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -1,17 +1,20 @@ use crate::jets::cold::Cold; use crate::mem::NockStack; -use crate::noun::{Allocated, CellMemory, Noun}; +use crate::noun::{Allocated, Cell, CellMemory, IndirectAtom, Noun}; use ares_pma::*; use either::Either::{Left, Right}; use std::ffi::{c_void, CString}; use std::mem::size_of; use std::path::PathBuf; +use std::ptr::copy_nonoverlapping; const PMA_MODE: mode_t = 0o600; // RW for user only const PMA_FLAGS: ULONG = 0; // ignored for now const PMA_CURRENT_SNAPSHOT_VERSION: u64 = 1; +const NOUN_MARKED: u64 = 1 << 63; + /// Handle to a PMA pub struct PMA(*mut BT_state); @@ -170,13 +173,16 @@ impl Persist for Snapshot { } /// Ensure an allocated noun is marked and return if it was already marked -fn mark(a: Allocated) -> bool { - todo!() +unsafe fn mark(a: Allocated) -> bool { + let metadata = a.get_metadata(); + a.set_metadata(metadata | NOUN_MARKED); + metadata & NOUN_MARKED != 0 } /// Unmark an allocated noun -fn unmark_noun(a: Allocated) -> bool { - todo!() +unsafe fn unmark(a: Allocated) { + let metadata = a.get_metadata(); + a.set_metadata(metadata & !NOUN_MARKED); } impl Persist for Noun { @@ -222,7 +228,68 @@ impl Persist for Noun { pma: &PMA, buffer: *mut u8, ) -> (u64, *mut u8) { - todo!() + let mut buffer_u64 = buffer as *mut u64; + stack.frame_push(0); + *(stack.push::<(Noun, *mut Noun)>()) = (*self, self as *mut Noun); + + loop { + if stack.stack_is_empty() { + break; + } + + let (noun, dest) = *(stack.top::<(Noun, *mut Noun)>()); + + match noun.as_either_direct_allocated() { + Left(direct) => { + *dest = noun; + } + Right(allocated) => { + if let Some(a) = allocated.forwarding_pointer() { + *dest = a.as_noun(); + continue; + } + + match allocated.as_either() { + Left(mut indirect) => { + let count = indirect.raw_size(); + if pma.contains(indirect.to_raw_pointer(), count) { + *dest = noun; + continue; + } + + unmark(allocated); + copy_nonoverlapping(indirect.to_raw_pointer(), buffer_u64, count); + indirect.set_forwarding_pointer(buffer_u64); + *dest = IndirectAtom::from_raw_pointer(buffer_u64).as_noun(); + buffer_u64 = buffer_u64.add(count); + } + Right(mut cell) => { + if pma.contains(cell.to_raw_pointer(), 1) { + *dest = noun; + continue; + } + + unmark(allocated); + + let new_cell_mem = buffer_u64 as *mut CellMemory; + 
copy_nonoverlapping(cell.to_raw_pointer(), new_cell_mem, 1); + cell.set_forwarding_pointer(new_cell_mem); + + *dest = Cell::from_raw_pointer(new_cell_mem).as_noun(); + + *(stack.push::<(Noun, *mut Noun)>()) = + (cell.tail(), &mut (*new_cell_mem).tail); + *(stack.push::<(Noun, *mut Noun)>()) = + (cell.head(), &mut (*new_cell_mem).head); + + buffer_u64 = new_cell_mem.add(1) as *mut u64; + } + } + } + } + } + + (self.as_raw(), buffer_u64 as *mut u8) } unsafe fn handle_from_u64(meta_handle: u64) -> Self { From 984b7dcdfba21a60bcd18c4e98e243e96a2df6ed Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Tue, 5 Dec 2023 17:24:46 -0500 Subject: [PATCH 035/128] pma: fix mmap call in _bt_data_cow --- rust/ares_pma/c-src/btree.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 65967f4..28d4a99 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -213,6 +213,8 @@ struct BT_kv { /* #define BT_DAT_MAXKEYS 10 */ #define BT_DAT_MAXVALS BT_DAT_MAXKEYS static_assert(BT_DAT_MAXENTRIES % 2 == 0); +/* we assume off_t is 64 bit */ +static_assert(sizeof(off_t) == sizeof(uint64_t)); /* all pages in the memory arena consist of a header and data section @@ -2644,27 +2646,20 @@ _bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi, pgno_t pg) size_t bytelen = P2BYTES(len); pgno_t newpg = _bt_falloc(state, len); BYTE *loaddr = off2addr(lo); - - vaof_t arena_start = addr2off(BT_MAPADDR); - off_t offset = lo - arena_start; + off_t offset = P2BYTES(newpg); /* write call puts data in the unified buffer cache without having to map virtual memory */ if (pwrite(state->data_fd, loaddr, bytelen, offset) != bytelen) abort(); - /* BYTE *arena_start = BT_MAPADDR; */ - /* BYTE *map_loc = arena_start + lo; */ - /* maps new file offset with same data back into memory */ - mmap(BT_MAPADDR, + mmap(loaddr, bytelen, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, state->data_fd, - offset); /* ;;: using an offset here rather than - supplying the address directly. correct?? - check. 
*/ + offset); _bt_insert(state, lo, hi, newpg); From a8202225664c5c1ee095f0742dc9d2f61b7ffe5d Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Tue, 5 Dec 2023 19:29:35 -0500 Subject: [PATCH 036/128] pma: remove .meta_fd from BT_state --- rust/ares_pma/c-src/btree.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 28d4a99..619aab1 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -304,7 +304,6 @@ struct BT_flistnode { typedef struct BT_state BT_state; struct BT_state { int data_fd; - int meta_fd; /* ;;: confident can be removed because we're not explicitly calling write() */ char *path; void *fixaddr; BYTE *map; @@ -2399,7 +2398,7 @@ bt_state_new(BT_state **state) TRACE(); BT_state *s = calloc(1, sizeof *s); - s->meta_fd = s->data_fd = -1; + s->data_fd = -1; s->fixaddr = BT_MAPADDR; *state = s; return BT_SUCC; @@ -2429,20 +2428,12 @@ bt_state_open(BT_state *state, const char *path, ULONG flags, mode_t mode) if (!SUCC(rc = _bt_state_load(state))) goto e; - /* ;;: this may be entirely unnecessary */ - oflags |= O_DSYNC; /* see man 2 open */ - if ((state->meta_fd = open(dpath, oflags, mode)) == -1) { - rc = errno; - goto e; - } - state->path = strdup(dpath); e: /* cleanup FDs stored in state if anything failed */ if (!SUCC(rc)) { if (state->data_fd != -1) CLOSE_FD(state->data_fd); - if (state->meta_fd != -1) CLOSE_FD(state->meta_fd); } free(dpath); @@ -2454,7 +2445,6 @@ bt_state_close(BT_state *state) { int rc; if (state->data_fd != -1) CLOSE_FD(state->data_fd); - if (state->meta_fd != -1) CLOSE_FD(state->meta_fd); _mlist_delete(state); _flist_delete(state); From b957bb6c5b9c770824bff4e0e7389f4f38bcc8e3 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Tue, 5 Dec 2023 19:29:56 -0500 Subject: [PATCH 037/128] pma: misc fixes --- rust/ares_pma/c-src/btree.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 619aab1..b3b9335 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -73,9 +73,9 @@ STATIC_ASSERT(0, "debugger break instruction unimplemented"); #define PBYTES(x) ((size_t)(x) << 50) /* 4K page in bytes */ -#define P2BYTES(x) ((size_t)(x) << 14) +#define P2BYTES(x) ((size_t)(x) << BT_PAGEBITS) /* the opposite of P2BYTES */ -#define B2PAGES(x) ((size_t)(x) >> 14) +#define B2PAGES(x) ((size_t)(x) >> BT_PAGEBITS) #define __packed __attribute__((__packed__)) @@ -101,16 +101,16 @@ addr2off(void *p) /* convert a pointer into a 32-bit page offset */ { uintptr_t pu = (uintptr_t)p; - assert((pu & ((1 << 14) - 1)) == 0); /* p must be page-aligned */ + assert((pu & ((1 << BT_PAGEBITS) - 1)) == 0); /* p must be page-aligned */ uintptr_t off = pu - (uintptr_t)BT_MAPADDR; - return (vaof_t)(pu >> 14); + return (vaof_t)(pu >> BT_PAGEBITS); } static inline void * off2addr(vaof_t off) /* convert a 32-bit page offset into a pointer */ { - uintptr_t pu = (uintptr_t)off << 14; + uintptr_t pu = (uintptr_t)off << BT_PAGEBITS; pu += (uintptr_t)BT_MAPADDR; return (void *)pu; } @@ -139,6 +139,10 @@ off2addr(vaof_t off) /* FO2PA: file offset to page get a reference to a BT_page from a file offset + + /* ;;: can simplify: + + ((BT_page*)state->map)[fo] */ #define FO2PA(map, fo) \ ((BT_page *)&(map)[FO2BY(fo)]) @@ -307,7 +311,7 @@ struct BT_state { char *path; void *fixaddr; BYTE *map; - BT_page *node_freelist; + BT_page *node_freelist; /* ;;: REMOVE */ BT_meta 
*meta_pages[2]; /* double buffered */ /* ;;: note, while meta_pages[which]->root stores a pgno, we may want to just store a pointer to root in state in addition to avoid a _node_find on it @@ -550,7 +554,7 @@ _bt_datshift(BT_page *node, size_t i, size_t n) size_t siz = sizeof node->datk[0]; size_t bytelen = (BT_DAT_MAXKEYS - i - n) * siz; memmove(&node->datk[i+n], &node->datk[i], bytelen); - ZERO(&node->datk[i], n * siz); + ZERO(&node->datk[i], n * siz); /* NB: not completely necessary */ return BT_SUCC; } @@ -657,6 +661,9 @@ _bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo, vaof_t llo = parent->datk[childidx].va; vaof_t hhi = parent->datk[childidx+1].va; + /* NB: it can be assumed that llo <= lo and hi <= hhi because this routine is + called using an index found with _bt_childidx */ + /* duplicate */ if (llo == lo && hhi == hi) { parent->datk[childidx].fo = fo; @@ -1997,11 +2004,11 @@ _bt_state_meta_which(BT_state *state, int *which) BT_meta *m2 = state->meta_pages[1]; *which = -1; - if (m1->flags == 0) { + if (m1->chk == 0) { /* first is dirty */ *which = 1; } - else if (m2->flags == 0) { + else if (m2->chk == 0) { /* second is dirty */ *which = 0; } From df591e419fc9e6f32a33a961049326216812b94b Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Tue, 5 Dec 2023 21:19:03 -0500 Subject: [PATCH 038/128] pma: increment meta.txnid on sync --- rust/ares_pma/c-src/btree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index b3b9335..1c79321 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -258,12 +258,9 @@ struct BT_meta { uint32_t _pad0; uint64_t txnid; void *fix_addr; /* fixed addr of btree */ - pgno_t blk_base[8]; /* block base array for striped node partition */ - /* ;;: for the blk_base array, code may be simpler if this were an array of BT_page *. 
*/ - uint8_t blk_cnt; /* currently highest valid block base */ uint8_t depth; /* tree depth */ #define BP_META ((uint8_t)0x02) @@ -2326,6 +2323,9 @@ _bt_sync_meta(BT_state *state) uint32_t chk; int newwhich; + /* increment the txnid */ + meta->txnid += 1; + /* checksum the metapage */ chk = nonzero_crc_32(meta, BT_META_LEN); /* ;;: todo: guarantee the chk cannot be zero */ From 2b6f5fdc2614ba8ee6edcb1ab048ee49aea0aa06 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Tue, 5 Dec 2023 21:21:26 -0500 Subject: [PATCH 039/128] pma: abort if mmap calls return MAP_FAILED --- rust/ares_pma/c-src/btree.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 1c79321..2695829 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2164,6 +2164,9 @@ _bt_state_load(BT_state *state) state->data_fd, 0); + if (state->map == MAP_FAILED) + abort(); + p = (BT_page *)state->map; state->meta_pages[0] = METADATA(p); state->meta_pages[0] = METADATA(p + 1); @@ -2651,12 +2654,16 @@ _bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi, pgno_t pg) abort(); /* maps new file offset with same data back into memory */ - mmap(loaddr, - bytelen, - PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_SHARED, - state->data_fd, - offset); + void *map; + map = mmap(loaddr, + bytelen, + PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_SHARED, + state->data_fd, + offset); + + if (map == MAP_FAILED) + abort(); _bt_insert(state, lo, hi, newpg); From 87320843deb951dc56011b367070e1356c1917ce Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Tue, 5 Dec 2023 21:39:21 -0500 Subject: [PATCH 040/128] pma: remove old node freelist implm and refactor _bt_state_load --- rust/ares_pma/c-src/btree.c | 78 ++++++++----------------------------- 1 file changed, 16 insertions(+), 62 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 2695829..744bfd7 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -49,12 +49,6 @@ STATIC_ASSERT(0, "debugger break instruction unimplemented"); #define CAN_COALESCE 0 /* ;;: remove once confident in logic and delete all code dependencies on state->node_freelist */ -#define USE_NLIST 1 -#if USE_NLIST -/* ;;: obviously this should be removed once we've fully switched over to the - nlist. And calls to _node_alloc should be updated to calls to _bt_nalloc */ -#define _node_alloc(...) _bt_nalloc(__VA_ARGS__) -#endif #define ZERO(s, n) memset((s), 0, (n)) @@ -308,7 +302,6 @@ struct BT_state { char *path; void *fixaddr; BYTE *map; - BT_page *node_freelist; /* ;;: REMOVE */ BT_meta *meta_pages[2]; /* double buffered */ /* ;;: note, while meta_pages[which]->root stores a pgno, we may want to just store a pointer to root in state in addition to avoid a _node_find on it @@ -372,7 +365,7 @@ _node_get(BT_state *state, pgno_t pgno) return FO2PA(state->map, pgno); } -/* ;;: I don't think we should need this if _node_alloc also returns a disc offset */ +/* ;;: I don't think we should need this if _bt_nalloc also returns a disc offset */ static pgno_t _fo_get(BT_state *state, BT_page *node) { @@ -381,26 +374,13 @@ _fo_get(BT_state *state, BT_page *node) return BY2FO(vaddr - start); } -#ifndef USE_NLIST -static BT_page * /* ;;: change to return both a file and node offset as params to function. actual return value is error code */ -_node_alloc(BT_state *state) -{ - /* TODO: will eventually need to walk a node freelist that allocs space for - the striped node partitions. 
Since this is unimplemented, just allocating - space from first 2M */ - - size_t width = (BYTE *)state->node_freelist - state->map; - assert(width < MBYTES(2)); - /* ;;: todo confirm data sections are zeroed */ - /* ZERO(state->node_freelist, BT_PAGESIZE); */ - return ++state->node_freelist; -} -#endif - static BT_page * _bt_nalloc(BT_state *state) /* allocate a node in the node freelist */ { + /* TODO: maybe change _bt_nalloc to return both a file and a node offset as + params to the function and make actual return value an error code. This is + to avoid forcing some callers to immediately use _fo_get */ BT_nlistnode **n = &state->nlist; for (; *n; n = &(*n)->next) { @@ -409,6 +389,7 @@ _bt_nalloc(BT_state *state) end of the current stripe. If so, allocate a new region and append that to the freelist. */ size_t width = (BYTE *)state->nlist - state->map; + /* ;;: asserting 2M for now since partition striping is unimplemented */ assert(width < MBYTES(2)); /* perfect fit */ if ((*n)->sz == 1) { @@ -431,7 +412,7 @@ _bt_nalloc(BT_state *state) static int _node_cow(BT_state *state, BT_page *node, pgno_t *pgno) { - BT_page *ret = _node_alloc(state); + BT_page *ret = _bt_nalloc(state); memcpy(ret->datk, node->datk, sizeof node->datk[0] * BT_DAT_MAXENTRIES); *pgno = _fo_get(state, ret); return BT_SUCC; @@ -612,7 +593,7 @@ _bt_split_child(BT_state *state, BT_page *parent, size_t i, pgno_t *newchild) int rc = BT_SUCC; size_t N; BT_page *left = _node_get(state, parent->datk[i].fo); - BT_page *right = _node_alloc(state); + BT_page *right = _bt_nalloc(state); if (right == 0) return ENOMEM; if (!SUCC(rc = _bt_split_datcopy(left, right))) @@ -1362,8 +1343,8 @@ _bt_insert(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo) /* the old root is now the left child of the new root */ BT_page *left = root; - BT_page *right = _node_alloc(state); - BT_page *rootnew = _node_alloc(state); + BT_page *right = _bt_nalloc(state); + BT_page *rootnew = _bt_nalloc(state); /* split root's data across left and right nodes */ _bt_split_datcopy(left, right); @@ -1516,7 +1497,6 @@ _flist_new(BT_state *state) return BT_SUCC; } -#if USE_NLIST static int _nlist_new(BT_state *state) #define NLIST_PG_START 2 /* the third page */ @@ -1671,7 +1651,6 @@ _nlist_read(BT_state *state) return rc; } -#endif static BT_mlistnode * _mlist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth) @@ -2106,8 +2085,7 @@ _bt_state_meta_new(BT_state *state) TRACE(); - /* ;;: HERE HERE HERE: call node_alloc */ - root = _node_alloc(state); + root = _bt_nalloc(state); _bt_root_new(root); pagesize = sizeof *p1; @@ -2171,19 +2149,6 @@ _bt_state_load(BT_state *state) state->meta_pages[0] = METADATA(p); state->meta_pages[0] = METADATA(p + 1); -#ifndef USE_NLIST - state->node_freelist = &((BT_page *)state->map)[3]; /* begin allocating nodes - on third page (first two - are for metadata) -- this - was quite dumb. This is - the fourth page of - course. 
But it worked, - because in _bt_root_new - we use the third page - without calling the - allocation function */ -#endif - /* new db, so populate metadata */ if (new) { /* ;;: move this logic to _flist_new */ @@ -2194,10 +2159,9 @@ _bt_state_load(BT_state *state) state->file_size = PMA_GROW_SIZE; -#if USE_NLIST - /* ;;: necessary to call this before _bt_state_meta_new */ assert(SUCC(_nlist_new(state))); -#endif + assert(SUCC(_mlist_new(state))); + assert(SUCC(_flist_new(state))); if (!SUCC(rc = _bt_state_meta_new(state))) { munmap(state->map, BT_ADDRSIZE); @@ -2205,26 +2169,16 @@ _bt_state_load(BT_state *state) } } else { + assert(SUCC(_nlist_read(state))); + assert(SUCC(_mlist_read(state))); + assert(SUCC(_flist_read(state))); + if (fstat(state->data_fd, &stat) != 0) return errno; state->file_size = stat.st_size; } - if (new) { - assert(SUCC(_mlist_new(state))); - assert(SUCC(_flist_new(state))); - } - else { - assert(SUCC(_mlist_read(state))); - assert(SUCC(_flist_read(state))); -#if USE_NLIST - /* ;;: this might need to be re-ordered given that _nlist_new needs to be - called before _bt_state_meta_new. Haven't thought about it yet. */ - assert(SUCC(_nlist_read(state))); -#endif - } - return BT_SUCC; } From 1f68d65bb86139cc21864e08b7eda8f2ea649c83 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Tue, 5 Dec 2023 23:32:39 -0600 Subject: [PATCH 041/128] hamt: allocate outermost stem on NockStack --- rust/ares/src/hamt.rs | 179 +++++++++++++++++++------------------ rust/ares/src/jets.rs | 4 +- rust/ares/src/jets/cold.rs | 6 +- rust/ares/src/jets/nock.rs | 2 +- rust/ares/src/jets/warm.rs | 6 +- rust/ares/src/main.rs | 4 +- rust/ares/src/serf.rs | 6 +- 7 files changed, 108 insertions(+), 99 deletions(-) diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index e940b36..df1e626 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -161,6 +161,7 @@ impl MutHamt { } #[repr(packed)] +#[repr(C)] struct Stem { bitmap: u32, typemap: u32, @@ -218,6 +219,7 @@ impl Stem { } #[repr(packed)] +#[repr(C)] struct Leaf { len: usize, buffer: *mut (Noun, T), // mutable for unifying equality @@ -238,6 +240,8 @@ impl Leaf { } #[derive(Copy, Clone)] +#[repr(packed)] +#[repr(C)] union Entry { stem: Stem, leaf: Leaf, @@ -256,19 +260,21 @@ assert_eq_size!(&[(Noun, ())], Leaf<()>); assert_eq_size!(&[Entry<()>], Stem<()>); #[derive(Copy, Clone)] -pub struct Hamt(Stem); +pub struct Hamt(*mut Stem); impl Hamt { pub fn is_null(&self) -> bool { - self.0.bitmap == 0 + unsafe { (*self.0).bitmap == 0 } } // Make a new, empty HAMT - pub fn new() -> Self { - Hamt(Stem { - bitmap: 0, - typemap: 0, - buffer: null(), - }) + pub fn new(stack: &mut NockStack) -> Self { + unsafe { + let stem_ptr = stack.struct_alloc::>(1); + (*stem_ptr).bitmap = 0; + (*stem_ptr).typemap = 0; + (*stem_ptr).buffer = null(); + Hamt(stem_ptr) + } } /** @@ -278,7 +284,7 @@ impl Hamt { * in the HAMT */ pub fn lookup(&self, stack: &mut NockStack, n: &mut Noun) -> Option { - let mut stem = self.0; + let mut stem = unsafe { *self.0 }; let mut mug = mug_u32(stack, *n); 'lookup: loop { let chunk = mug & 0x1F; // 5 bits @@ -309,9 +315,9 @@ impl Hamt { pub fn insert(&self, stack: &mut NockStack, n: &mut Noun, t: T) -> Hamt { let mut mug = mug_u32(stack, *n); let mut depth = 0u8; - let mut stem = self.0; - let mut stem_ret = self.0; - let mut dest = &mut stem_ret as *mut Stem; + let mut stem = unsafe { *self.0 }; + let stem_ret = unsafe { stack.struct_alloc::>(1) }; + let mut dest = stem_ret; unsafe { 'insert: loop { let chunk = mug & 0x1F; // 
5 bits @@ -429,17 +435,12 @@ impl Hamt { } } -impl Default for Hamt { - fn default() -> Self { - Self::new() - } -} - impl Preserve for Hamt { unsafe fn assert_in_stack(&self, stack: &NockStack) { - stack.assert_struct_is_in(self.0.buffer, self.0.size()); + stack.assert_struct_is_in(self.0, 1); + stack.assert_struct_is_in((*self.0).buffer, (*self.0).size()); let mut traversal_stack: [Option<(Stem, u32)>; 6] = [None; 6]; - traversal_stack[0] = Some((self.0, 0)); + traversal_stack[0] = Some(((*self.0), 0)); let mut traversal_depth = 1; 'check: loop { if traversal_depth == 0 { @@ -481,78 +482,86 @@ impl Preserve for Hamt { } unsafe fn preserve(&mut self, stack: &mut NockStack) { - if stack.is_in_frame(self.0.buffer) { - let dest_buffer = stack.struct_alloc_in_previous_frame(self.0.size()); - copy_nonoverlapping(self.0.buffer, dest_buffer, self.0.size()); - self.0.buffer = dest_buffer; - // Here we're using the Rust stack since the array is a fixed - // size. Thus it will be cleaned up if the Rust thread running - // this is killed, and is therefore not an issue vs. if it were allocated - // on the heap. - // - // In the past, this traversal stack was allocated in NockStack, but - // exactly the right way to do this is less clear with the split stack. - let mut traversal_stack: [Option<(Stem, u32)>; 6] = [None; 6]; - traversal_stack[0] = Some((self.0, 0)); - let mut traversal_depth = 1; - 'preserve: loop { - if traversal_depth == 0 { - break; - } - let (stem, mut position) = traversal_stack[traversal_depth - 1] - .expect("Attempted to access uninitialized array element"); - // can we loop over the size and count leading 0s remaining in the bitmap? - 'preserve_stem: loop { - if position >= 32 { - traversal_depth -= 1; - continue 'preserve; + if stack.is_in_frame(self.0) { + let dest_stem = stack.struct_alloc_in_previous_frame(1); + copy_nonoverlapping(self.0, dest_stem, 1); + if stack.is_in_frame((*dest_stem).buffer) { + let dest_buffer = stack.struct_alloc_in_previous_frame((*dest_stem).size()); + copy_nonoverlapping((*dest_stem).buffer, dest_buffer, (*dest_stem).size()); + (*dest_stem).buffer = dest_buffer; + // Here we're using the Rust stack since the array is a fixed + // size. Thus it will be cleaned up if the Rust thread running + // this is killed, and is therefore not an issue vs. if it were allocated + // on the heap. + // + // In the past, this traversal stack was allocated in NockStack, but + // exactly the right way to do this is less clear with the split stack. + let mut traversal_stack: [Option<(Stem, u32)>; 6] = [None; 6]; + traversal_stack[0] = Some(((*dest_stem), 0)); + let mut traversal_depth = 1; + 'preserve: loop { + if traversal_depth == 0 { + break; } - match stem.entry(position) { - None => { - position += 1; - continue 'preserve_stem; + let (stem, mut position) = traversal_stack[traversal_depth - 1] + .expect("Attempted to access uninitialized array element"); + // can we loop over the size and count leading 0s remaining in the bitmap? 
+ 'preserve_stem: loop { + if position >= 32 { + traversal_depth -= 1; + continue 'preserve; } - Some((Left(next_stem), idx)) => { - if stack.is_in_frame(next_stem.buffer) { - let dest_buffer = - stack.struct_alloc_in_previous_frame(next_stem.size()); - copy_nonoverlapping( - next_stem.buffer, - dest_buffer, - next_stem.size(), - ); - let new_stem = Stem { - bitmap: next_stem.bitmap, - typemap: next_stem.typemap, - buffer: dest_buffer, - }; - *(stem.buffer.add(idx) as *mut Entry) = Entry { stem: new_stem }; - assert!(traversal_depth <= 5); // will increment - traversal_stack[traversal_depth - 1] = Some((stem, position + 1)); - traversal_stack[traversal_depth] = Some((new_stem, 0)); - traversal_depth += 1; - continue 'preserve; - } else { + match stem.entry(position) { + None => { position += 1; continue 'preserve_stem; } - } - Some((Right(leaf), idx)) => { - if stack.is_in_frame(leaf.buffer) { - let dest_buffer = stack.struct_alloc_in_previous_frame(leaf.len); - copy_nonoverlapping(leaf.buffer, dest_buffer, leaf.len); - let new_leaf = Leaf { - len: leaf.len, - buffer: dest_buffer, - }; - for pair in new_leaf.to_mut_slice().iter_mut() { - pair.0.preserve(stack); - pair.1.preserve(stack); + Some((Left(next_stem), idx)) => { + if stack.is_in_frame(next_stem.buffer) { + let dest_buffer = + stack.struct_alloc_in_previous_frame(next_stem.size()); + copy_nonoverlapping( + next_stem.buffer, + dest_buffer, + next_stem.size(), + ); + let new_stem = Stem { + bitmap: next_stem.bitmap, + typemap: next_stem.typemap, + buffer: dest_buffer, + }; + *(stem.buffer.add(idx) as *mut Entry) = + Entry { stem: new_stem }; + assert!(traversal_depth <= 5); // will increment + traversal_stack[traversal_depth - 1] = + Some((stem, position + 1)); + traversal_stack[traversal_depth] = Some((new_stem, 0)); + traversal_depth += 1; + continue 'preserve; + } else { + position += 1; + continue 'preserve_stem; } - *(stem.buffer.add(idx) as *mut Entry) = Entry { leaf: new_leaf }; } - position += 1; - continue 'preserve_stem; + Some((Right(leaf), idx)) => { + if stack.is_in_frame(leaf.buffer) { + let dest_buffer = + stack.struct_alloc_in_previous_frame(leaf.len); + copy_nonoverlapping(leaf.buffer, dest_buffer, leaf.len); + let new_leaf = Leaf { + len: leaf.len, + buffer: dest_buffer, + }; + for pair in new_leaf.to_mut_slice().iter_mut() { + pair.0.preserve(stack); + pair.1.preserve(stack); + } + *(stem.buffer.add(idx) as *mut Entry) = + Entry { leaf: new_leaf }; + } + position += 1; + continue 'preserve_stem; + } } } } diff --git a/rust/ares/src/jets.rs b/rust/ares/src/jets.rs index f2bc319..5a78b9b 100644 --- a/rust/ares/src/jets.rs +++ b/rust/ares/src/jets.rs @@ -279,9 +279,9 @@ pub mod util { let mut stack = NockStack::new(8 << 10 << 10, 0); let newt = Newt::new_mock(); let cold = Cold::new(&mut stack); - let warm = Warm::new(); + let warm = Warm::new(&mut stack); let hot = Hot::init(&mut stack); - let cache = Hamt::::new(); + let cache = Hamt::::new(&mut stack); Context { stack, diff --git a/rust/ares/src/jets/cold.rs b/rust/ares/src/jets/cold.rs index 08c427d..21a5bdf 100644 --- a/rust/ares/src/jets/cold.rs +++ b/rust/ares/src/jets/cold.rs @@ -339,9 +339,9 @@ impl Cold { } pub fn new(stack: &mut NockStack) -> Self { - let battery_to_paths = Hamt::new(); - let root_to_paths = Hamt::new(); - let path_to_batteries = Hamt::new(); + let battery_to_paths = Hamt::new(stack); + let root_to_paths = Hamt::new(stack); + let path_to_batteries = Hamt::new(stack); unsafe { let cold_mem_ptr: *mut ColdMem = stack.struct_alloc(1); 
*cold_mem_ptr = ColdMem { diff --git a/rust/ares/src/jets/nock.rs b/rust/ares/src/jets/nock.rs index 993f57d..c89eaf7 100644 --- a/rust/ares/src/jets/nock.rs +++ b/rust/ares/src/jets/nock.rs @@ -39,7 +39,7 @@ pub fn jet_mink(context: &mut Context, subject: Noun) -> Result { let old_cache = context.cache; let old_scry_stack = context.scry_stack; - context.cache = Hamt::::new(); + context.cache = Hamt::::new(&mut context.stack); context.scry_stack = T(&mut context.stack, &[scry_handler, old_scry_stack]); match util::mink(context, v_subject, v_formula) { diff --git a/rust/ares/src/jets/warm.rs b/rust/ares/src/jets/warm.rs index 3b7e4c9..b7b8264 100644 --- a/rust/ares/src/jets/warm.rs +++ b/rust/ares/src/jets/warm.rs @@ -85,8 +85,8 @@ impl Iterator for WarmEntry { impl Warm { #[allow(clippy::new_without_default)] - pub fn new() -> Self { - Warm(Hamt::new()) + pub fn new(stack: &mut NockStack) -> Self { + Warm(Hamt::new(stack)) } fn insert( @@ -111,7 +111,7 @@ impl Warm { } pub fn init(stack: &mut NockStack, cold: &mut Cold, hot: &Hot) -> Self { - let mut warm = Self::new(); + let mut warm = Self::new(stack); for (mut path, axis, jet) in *hot { let batteries_list = cold.find(stack, &mut path); for batteries in batteries_list { diff --git a/rust/ares/src/main.rs b/rust/ares/src/main.rs index 6175ad2..fa2fe55 100644 --- a/rust/ares/src/main.rs +++ b/rust/ares/src/main.rs @@ -64,9 +64,9 @@ fn main() -> io::Result<()> { .as_cell() .expect("Input must be jam of subject/formula pair"); let newt = Newt::new_mock(); - let cache = Hamt::::new(); + let cache = Hamt::::new(&mut stack); let cold = Cold::new(&mut stack); - let warm = Warm::new(); + let warm = Warm::new(&mut stack); let hot = Hot::init(&mut stack); let mut context = Context { stack, diff --git a/rust/ares/src/serf.rs b/rust/ares/src/serf.rs index 12f071b..361b145 100644 --- a/rust/ares/src/serf.rs +++ b/rust/ares/src/serf.rs @@ -42,7 +42,7 @@ impl Context { // let snap = &mut snapshot::pma::Pma::new(snap_path); let mut stack = NockStack::new(1024 << 10 << 10, 0); let newt = Newt::new(); - let cache = Hamt::::new(); + let cache = Hamt::::new(&mut stack); let pma = PMA::open(snap_path).unwrap(); let (epoch, event_num, arvo, mut cold) = unsafe { @@ -117,7 +117,7 @@ impl Context { &mut self.nock_context.cold, &mut self.nock_context.hot, ); - self.nock_context.cache = Hamt::new(); + self.nock_context.cache = Hamt::new(&mut self.nock_context.stack); self.nock_context.scry_stack = D(0); // XX save to PMA @@ -256,7 +256,7 @@ pub fn serf() -> io::Result<()> { // Can't use for loop because it borrows newt while let Some(writ) = context.next() { // Reset the local cache and scry handler stack - context.nock_context.cache = Hamt::::new(); + context.nock_context.cache = Hamt::::new(&mut context.nock_context.stack); context.nock_context.scry_stack = D(0); context.nock_context.stack.frame_push(0); From 1cdb55ce7ff086a2c3784c9497f350c09dfaed4c Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Wed, 6 Dec 2023 14:05:54 -0500 Subject: [PATCH 042/128] pma: _bt_state_restore_maps2 --- rust/ares_pma/c-src/btree.c | 80 ++++++++++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 10 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 744bfd7..c0b3dce 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -1973,6 +1973,60 @@ nonzero_crc_32(void *dat, size_t len) return chk; } +static void +_bt_state_restore_maps2(BT_state *state, BT_page *node, + uint8_t depth, uint8_t maxdepth) +{ + size_t N = 
_bt_numkeys(node); + + /* leaf */ + if (depth == maxdepth) { + for (size_t i = 0; i < N-1; i++) { + vaof_t lo = node->datk[i].va; + vaof_t hi = node->datk[i+1].va; + pgno_t pg = node->datk[i].fo; + + BYTE *loaddr = off2addr(lo); + BYTE *hiaddr = off2addr(hi); + size_t bytelen = hiaddr - loaddr; + off_t offset = P2BYTES(pg); + + if (loaddr != + mmap(loaddr, + bytelen, + PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_SHARED, + state->data_fd, + offset)) { + DPRINTF("mmap: failed to map at addr %p", loaddr); + abort(); + } + } + return; + } + + /* branch - bfs all subtrees */ + for (size_t i = 0; i < N-1; i++) { + /* ;;: assuming node stripes when partition striping is implemented will be + 1:1 mapped to disk for simplicity. If that is not the case, they should + be handled here. */ + pgno_t pg = node->datk[i].fo; + BT_page *child = _node_get(state, pg); + return _bt_state_restore_maps2(state, child, depth+1, maxdepth); + } +} + +static void +_bt_state_restore_maps(BT_state *state) +/* restores the memory map of the btree since data can be arbitrarily located */ +{ + /* TODO: add checks to ensure data isn't mapped into an invalid location + (e.g. a node stripe) */ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + _bt_state_restore_maps2(state, root, 1, meta->depth); +} + static int _bt_state_meta_which(BT_state *state, int *which) { @@ -2142,8 +2196,10 @@ _bt_state_load(BT_state *state) state->data_fd, 0); - if (state->map == MAP_FAILED) + if (state->map != BT_MAPADDR) { + DPRINTF("mmap: failed to map at addr %p", BT_MAPADDR); abort(); + } p = (BT_page *)state->map; state->meta_pages[0] = METADATA(p); @@ -2169,6 +2225,7 @@ _bt_state_load(BT_state *state) } } else { + /* restore ephemeral freelists */ assert(SUCC(_nlist_read(state))); assert(SUCC(_mlist_read(state))); assert(SUCC(_flist_read(state))); @@ -2177,6 +2234,9 @@ _bt_state_load(BT_state *state) return errno; state->file_size = stat.st_size; + + /* restore data memory maps */ + _bt_state_restore_maps(state); } return BT_SUCC; @@ -2608,16 +2668,16 @@ _bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi, pgno_t pg) abort(); /* maps new file offset with same data back into memory */ - void *map; - map = mmap(loaddr, - bytelen, - PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_SHARED, - state->data_fd, - offset); - - if (map == MAP_FAILED) + if (loaddr != + mmap(loaddr, + bytelen, + PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_SHARED, + state->data_fd, + offset)) { + DPRINTF("mmap: failed to map at addr %p", loaddr); abort(); + } _bt_insert(state, lo, hi, newpg); From f07bc63e01ce49657eeb9ee47114c5ef7e70f3dc Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Wed, 6 Dec 2023 18:17:51 -0500 Subject: [PATCH 043/128] pma: test restructuring --- rust/ares_pma/c-src/btest.c | 47 +++++++++ rust/ares_pma/c-src/btree.c | 204 +----------------------------------- 2 files changed, 49 insertions(+), 202 deletions(-) diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c index e69de29..ee4c728 100644 --- a/rust/ares_pma/c-src/btest.c +++ b/rust/ares_pma/c-src/btest.c @@ -0,0 +1,47 @@ +#include "btree.h" +#include "btree.c" + + +static void +_test_nodeinteg(BT_state *state, BT_findpath *path, + vaof_t lo, vaof_t hi, pgno_t pg) +{ + size_t childidx = 0; + BT_page *parent = 0; + + assert(SUCC(_bt_find(state, path, lo, hi))); + parent = path->path[path->depth]; + /* _bt_printnode(parent); */ + childidx = path->idx[path->depth]; + assert(parent->datk[childidx].fo == pg); + assert(parent->datk[childidx].va 
== lo); + assert(parent->datk[childidx+1].va == hi); +} + +int main(int argc, char *argv[]) +{ + DPUTS("PMA Tests"); + + BT_state *state; + BT_findpath path = {0}; + int rc = 0; + + bt_state_new(&state); + + + DPUTS("== test 1: insert"); + assert(SUCC(bt_state_open(state, "./pmatest", 0, 0644))); + + vaof_t lo = 10; + vaof_t hi = 0xDEADBEEF; + pgno_t pg = 1; /* dummy value */ + for (size_t i = 0; i < BT_DAT_MAXKEYS * 4; ++i) { + /* if (i % (BT_DAT_MAXKEYS - 2) == 0) */ + /* bp(0); /\* breakpoint on split case *\/ */ + _bt_insert(state, lo, hi, pg); + _test_nodeinteg(state, &path, lo, hi, pg); + lo++; pg++; + } + + return 0; +} diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index c0b3dce..437c3f7 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -632,7 +632,7 @@ _bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo, BT_page *parent, size_t childidx) { DPRINTF("BEFORE INSERT lo %" PRIu32 " hi %" PRIu32 " fo %" PRIu32, lo, hi, fo); - /* _bt_printnode(parent); */ + _bt_printnode(parent); /* ;;: TODO confirm this logic is appropriate for branch nodes. (It /should/ be correct for leaf nodes) */ @@ -673,7 +673,7 @@ _bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo, } DPUTS("AFTER INSERT"); - /* _bt_printnode(parent); */ + _bt_printnode(parent); return BT_SUCC; } @@ -2867,203 +2867,3 @@ _bt_printnode(BT_page *node) printf("[%5zu] %10x %10x\n", i, node->datk[i].va, node->datk[i].fo); } } - -static void -_test_nodeinteg(BT_state *state, BT_findpath *path, - vaof_t lo, vaof_t hi, pgno_t pg) -{ - size_t childidx = 0; - BT_page *parent = 0; - - assert(SUCC(_bt_find(state, path, lo, hi))); - parent = path->path[path->depth]; - /* _bt_printnode(parent); */ - childidx = path->idx[path->depth]; - assert(parent->datk[childidx].fo == pg); - assert(parent->datk[childidx].va == lo); - assert(parent->datk[childidx+1].va == hi); -} - -int main(int argc, char *argv[]) -{ - BT_state *state; - BT_findpath path = {0}; - int rc = 0; - - -//// =========================================================================== -//// test0 wip - - /* deletion coalescing */ - bt_state_new(&state); - assert(SUCC(bt_state_open(state, "./pmatest", 0, 0644))); - - /* enable coalescing of the memory freelist */ -#undef CAN_COALESCE -#define CAN_COALESCE 1 - - /* ;;: disabling for now as I don't have an answer to the "how to find the hi - address on a bt_free call so that _bt_delete can be called" question */ -#if 0 - void *t0a = bt_malloc(state, 10); - void *t0b = bt_malloc(state, 10); - bt_free(state, t0a); - bt_free(state, t0b); - /* memory freelist got coallesced. next malloc call should find the same range - and result in attempting to insert a range that overlaps a non-coallesced - region */ - void *t0ab = bt_malloc(state, 20); - /* t0a should have the same address as t0ab */ - assert(t0a == t0ab); -#endif - - /* ;;: can still suitably test by calling insert and delete routines directly */ - _bt_insert(state, 0x1000, 0x4000, 4); - _bt_insert(state, 0x4000, 0x8000, 4); - _bt_delete(state, 0x1000, 0x4000); - _bt_delete(state, 0x4000, 0x8000); - _bt_insert(state, 0x1000, 0x7000, 7); - - - //// =========================================================================== - //// test1 - - bt_state_new(&state); - assert(SUCC(bt_state_open(state, "./pmatest", 0, 0644))); - void * xxx = bt_malloc(state, 10); /* tmp - testing malloc logic */ - - - /* splitting tests. Insert sufficient data to force splitting. 
breakpoint before - that split is performed */ - - /* the hhi == hi case for more predictable splitting math */ - vaof_t lo = 10; - /* vaof_t hi = BT_DAT_MAXKEYS * 4; */ - vaof_t hi = 0xDEADBEEF; - pgno_t pg = 1; /* dummy value */ - for (size_t i = 0; i < BT_DAT_MAXKEYS * 4; ++i) { - /* if (i % (BT_DAT_MAXKEYS - 2) == 0) */ - /* bp(0); /\* breakpoint on split case *\/ */ - _bt_insert(state, lo, hi, pg); - _test_nodeinteg(state, &path, lo, hi, pg); - lo++; pg++; - } - - int which = state->which; - /* sham sync and re-run insertions */ - _sham_sync(state); - for (size_t i = 0; i < BT_DAT_MAXKEYS * 4; ++i) { - _bt_insert(state, lo, hi, pg); - _test_nodeinteg(state, &path, lo++, hi, pg++); - } - assert(which != state->which); - - assert(SUCC(bt_state_close(state))); - - - -//// =========================================================================== -//// test2 - - assert(SUCC(bt_state_open(state, "./pmatest", 0, 644))); - _mlist_read(state); - _flist_read(state); - - /* varieties of insert */ - - /* 2.1 exact match */ - lo = 0x10; - hi = 0x20; - pg = 0xFFFFFFFF; - - bp(0); - _bt_insert(state, lo, hi, pg); - _bt_insert(state, lo, hi, pg); - - /* ;;: you should also probably assert the data is laid out in datk at you expect */ - _test_nodeinteg(state, &path, lo, hi, pg); - - _bt_delete(state, lo, hi); - - /* 2.2 neither bounds match */ - bp(0); - _bt_insert(state, lo, hi, pg); - _bt_insert(state, lo+2, hi-2, pg-1); - - _test_nodeinteg(state, &path, lo, hi, pg); - _test_nodeinteg(state, &path, lo+2, hi-2, pg-1); - - _bt_delete(state, lo, hi); - _bt_delete(state, lo+2, hi-2); - - /* 2.3 space to right */ - bp(0); - _bt_insert(state, lo, hi, pg); - _bt_insert(state, lo, hi-2, pg-1); - - _test_nodeinteg(state, &path, lo, hi, pg); - _test_nodeinteg(state, &path, lo, hi-2, pg-1); - - _bt_delete(state, lo, hi); - _bt_delete(state, lo, hi-2); - - /* 2.4 space to left */ - bp(0); - - _bt_insert(state, lo, hi, pg); - _bt_insert(state, lo+2, hi, pg-1); - - _test_nodeinteg(state, &path, lo, hi, pg); - _test_nodeinteg(state, &path, lo+2, hi, pg-1); - - _bt_delete(state, lo, hi); - _bt_delete(state, lo+2, hi); - - assert(SUCC(bt_state_close(state))); - - return 0; -} - - -/* ;;: - - 1) checksum m1 - 2) sync m1 - 3) zero m2 - 4) copy all of m1 to m2 excluding m1 - - The current dirty metapage should have a zero checksum so that it happens to - be synced by the OS, it won't be valid. - -*/ - -/* ;;: - - Check if root page is dirty from metapage. if not, exit sync - - Create a queue of dirty pages. - - BFS the tree. Add root page. Add all pages in dirty bit set. Advance read - head to next page (index 1) and do the same until read head and write head - are equal. - - queue consists of pairs of memory address and length. - - if length field is zero, we'll msync length 1 page. -- which means this is a - node. if when iterating over queue, we find a zero length entry, then add - that node's dirty page. - - --- - - this /was/ the initial plan after some discussion. But after further - discussion, we can actually do a depth first search. To make implementation - even more simple, we can do an iterative dfs where we start from the root - each time. Why? Because the bulk of time to execute is going to be disc - io. - - after each msync of a page, descend to the deepest dirty page. msync that - page. set that page's dirty bit in the parent to non-dirty. repeat. once - you're at the root page and there are no dirty bits set, sync the - root. Finally, sync the metapage (with checksumming). 
- - */ From e4695f641c342789e006d89b2dcdf5d7bb4f2020 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Wed, 6 Dec 2023 18:23:33 -0500 Subject: [PATCH 044/128] pma: misc bug fixes in _bt_state_load --- rust/ares_pma/c-src/btree.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 437c3f7..a321998 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -388,7 +388,7 @@ _bt_nalloc(BT_state *state) implemented. Rather than assert, conditionally check if we're at the end of the current stripe. If so, allocate a new region and append that to the freelist. */ - size_t width = (BYTE *)state->nlist - state->map; + size_t width = (BYTE *)state->nlist->va - state->map; /* ;;: asserting 2M for now since partition striping is unimplemented */ assert(width < MBYTES(2)); /* perfect fit */ @@ -2216,13 +2216,14 @@ _bt_state_load(BT_state *state) state->file_size = PMA_GROW_SIZE; assert(SUCC(_nlist_new(state))); - assert(SUCC(_mlist_new(state))); - assert(SUCC(_flist_new(state))); if (!SUCC(rc = _bt_state_meta_new(state))) { munmap(state->map, BT_ADDRSIZE); return rc; } + + assert(SUCC(_mlist_new(state))); + assert(SUCC(_flist_new(state))); } else { /* restore ephemeral freelists */ From 76fa10e1ea037d5b46ce0700089bf1a3d57940fa Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Wed, 6 Dec 2023 23:08:05 -0600 Subject: [PATCH 045/128] pma: persist instance for Hamt --- rust/ares/src/hamt.rs | 185 ++++++++++++++++++++++++++++++++++++- rust/ares/src/jets/cold.rs | 8 +- rust/ares/src/persist.rs | 51 ++++++---- 3 files changed, 220 insertions(+), 24 deletions(-) diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index df1e626..7e8ad25 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -2,8 +2,10 @@ use crate::mem::{unifying_equality, NockStack, Preserve}; use crate::mug::mug_u32; use crate::noun::Noun; use either::Either::{self, *}; -use std::ptr::{copy_nonoverlapping, null}; +use std::ptr::{copy_nonoverlapping, null, null_mut}; use std::slice; +use crate::persist::{Persist, PMA}; +use std::mem::size_of; type MutStemEntry = Either<*mut MutStem, Leaf>; @@ -165,7 +167,7 @@ impl MutHamt { struct Stem { bitmap: u32, typemap: u32, - buffer: *const Entry, + buffer: *mut Entry, } impl Copy for Stem {} @@ -270,9 +272,11 @@ impl Hamt { pub fn new(stack: &mut NockStack) -> Self { unsafe { let stem_ptr = stack.struct_alloc::>(1); - (*stem_ptr).bitmap = 0; - (*stem_ptr).typemap = 0; - (*stem_ptr).buffer = null(); + *stem_ptr = Stem { + bitmap: 0, + typemap: 0, + buffer: null_mut(), + }; Hamt(stem_ptr) } } @@ -569,3 +573,174 @@ impl Preserve for Hamt { } } } + +#[derive(Copy,Clone)] +struct StemTraversalEntry { + bitmap_remaining: u32, + typemap_remaining: u32, + stem_ptr: *mut Stem, +} + +impl Persist for Hamt { + unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { + if pma.contains(self.0, 1) { return 0; } + let mut bytes: usize = size_of::>(); + if pma.contains((*self.0).buffer, (*self.0).size()) { return bytes }; + + let mut depth: usize = 0; + let mut traversal = [ + Stem { + bitmap: 0, + typemap: 0, + buffer: null_mut(), + }; + 6 + ]; + traversal[0] = (*self.0); + + loop { + assert!(depth < 6); + if traversal[depth].bitmap == 0 { + if depth == 0 { break bytes; } + depth -= 1; + } + + let next_chunk = traversal[depth].bitmap.trailing_zeros(); + let next_type = traversal[depth].typemap & (1 << next_chunk) != 0; + let next_entry = *traversal[depth].buffer; + 
traversal[depth].bitmap = traversal[depth].bitmap >> (next_chunk + 1); + traversal[depth].typemap = traversal[depth].typemap >> (next_chunk + 1); + traversal[depth].buffer = traversal[depth].buffer.add(1); + + if next_type { // true->stem false->leaf + // found another stem + traversal[depth + 1] = next_entry.stem; + + if pma.contains(traversal[depth + 1].buffer, traversal[depth + 1].size()) { + continue; + } + + // count the buffer for the next stem + bytes += traversal[depth + 1].size() * size_of::>(); + depth += 1; + } else { + let mut leaf = next_entry.leaf; + + if leaf.len == 0 { + continue; + } + + if pma.contains(leaf.buffer, leaf.len) { + continue; + } + + bytes += size_of::<(Noun, T)>() * leaf.len; + + while leaf.len > 0 { + bytes += (*leaf.buffer).0.space_needed(stack, pma); + bytes += (*leaf.buffer).1.space_needed(stack, pma); + leaf.buffer = leaf.buffer.add(1); + leaf.len -= 1; + } + } + } + } + + // XX this is subtly wrong, we need to track destination pointers somehow and not just write + // into the traversal stack + unsafe fn copy_to_buffer( + &mut self, + stack: &mut NockStack, + pma: &PMA, + buffer: &mut *mut u8, + ) { + if pma.contains(self.0, 1) { return; } + let stem_ptr = *buffer as *mut Stem; + copy_nonoverlapping(self.0, stem_ptr, 1); + *buffer = stem_ptr.add(1) as *mut u8; + (*self).0 = stem_ptr; + + let stem_buffer_size = (*stem_ptr).size(); + if pma.contains((*stem_ptr).buffer, stem_buffer_size) { return; } + let stem_buffer_ptr = *buffer as *mut Entry; + copy_nonoverlapping((*stem_ptr).buffer, stem_buffer_ptr, stem_buffer_size); + *buffer = stem_buffer_ptr.add(stem_buffer_size) as *mut u8; + (*stem_ptr).buffer = stem_buffer_ptr; + + let mut depth: usize = 0; + let mut traversal = [ + Stem { + bitmap: 0, + typemap: 0, + buffer: null_mut(), + }; + 6 + ]; + + traversal[0] = *stem_ptr; + + loop { + if traversal[depth].bitmap == 0 { + if depth == 0 { + break; + } + depth -= 1; + continue; + } + + let next_chunk = traversal[depth].bitmap.trailing_zeros(); + let next_type = traversal[depth].typemap & (1 << next_chunk) != 0; + let next_entry_ptr = traversal[depth].buffer; + + traversal[depth].bitmap = traversal[depth].bitmap >> (next_chunk + 1); + traversal[depth].typemap = traversal[depth].typemap >> (next_chunk + 1); + traversal[depth].buffer = traversal[depth].buffer.add(1); + + if next_type { + // Stem case + assert!(depth < 5); + + let stem_ptr: *mut Stem = &mut (*next_entry_ptr).stem; + let stem_size = (*stem_ptr).size(); + let stem_buffer_ptr = *buffer as *mut Entry; + + copy_nonoverlapping((*stem_ptr).buffer, stem_buffer_ptr, stem_size); + *buffer = stem_buffer_ptr.add(stem_size) as *mut u8; + + (*stem_ptr).buffer = stem_buffer_ptr; + + traversal[depth + 1] = *stem_ptr; + depth += 1; + continue; + } else { + // Leaf case + let leaf_ptr: *mut Leaf = &mut (*next_entry_ptr).leaf; + let leaf_buffer_ptr = *buffer as *mut (Noun, T); + + copy_nonoverlapping((*leaf_ptr).buffer, leaf_buffer_ptr, (*leaf_ptr).len); + *buffer = leaf_buffer_ptr.add((*leaf_ptr).len) as *mut u8; + + (*leaf_ptr).buffer = leaf_buffer_ptr; + + let mut leaf_idx = 0; + + while leaf_idx < (*leaf_ptr).len { + (*(*leaf_ptr).buffer.add(leaf_idx)).0.copy_to_buffer(stack, pma, buffer); + (*(*leaf_ptr).buffer.add(leaf_idx)).1.copy_to_buffer(stack, pma, buffer); + + leaf_idx += 1; + } + + continue; + } + } + } + + unsafe fn handle_to_u64(&self) -> u64 { + todo!() + } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + todo!() + } +} diff --git a/rust/ares/src/jets/cold.rs 
b/rust/ares/src/jets/cold.rs index 21a5bdf..da9205c 100644 --- a/rust/ares/src/jets/cold.rs +++ b/rust/ares/src/jets/cold.rs @@ -302,11 +302,15 @@ impl Persist for Cold { &mut self, stack: &mut NockStack, pma: &PMA, - buffer: *mut u8, - ) -> (u64, *mut u8) { + buffer: &mut *mut u8, + ) { todo!() } + unsafe fn handle_to_u64(&self) -> u64 { + self.0 as u64 + } + unsafe fn handle_from_u64(meta_handle: u64) -> Self { Cold(meta_handle as *mut ColdMem) } diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index b1dfe7d..16c3354 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -7,6 +7,7 @@ use std::ffi::{c_void, CString}; use std::mem::size_of; use std::path::PathBuf; use std::ptr::copy_nonoverlapping; +use std::convert::TryInto; const PMA_MODE: mode_t = 0o600; // RW for user only const PMA_FLAGS: ULONG = 0; // ignored for now @@ -119,20 +120,31 @@ pub trait Persist { &mut self, stack: &mut NockStack, pma: &PMA, - buffer: *mut u8, - ) -> (u64, *mut u8); + buffer: &mut *mut u8, + ); /// Persist an object into the PMA using [space_needed] and [copy_to_buffer], returning /// a [u64] (probably a pointer or tagged pointer) that can be saved into fn save_to_pma(&mut self, stack: &mut NockStack, pma: &PMA) -> u64 { unsafe { - let space_as_pages = - self.space_needed(stack, pma) + (BT_PAGESIZE as usize - 1) >> BT_PAGEBITS; - let buffer = bt_malloc(pma.0, space_as_pages) as *mut u8; - self.copy_to_buffer(stack, pma, buffer).0 + let space = self.space_needed(stack, pma); + + if space == 0 { + return self.handle_to_u64(); + } + + let space_as_pages = (space + (BT_PAGESIZE as usize - 1)) >> BT_PAGEBITS; + + let mut buffer = bt_malloc(pma.0, space_as_pages) as *mut u8; + let orig_buffer = buffer; + self.copy_to_buffer(stack, pma, &mut buffer); + assert!(orig_buffer.offset_from(buffer) > 0); + assert!(orig_buffer.offset_from(buffer) <= space.try_into().unwrap()); + self.handle_to_u64() } } + unsafe fn handle_to_u64(&self) -> u64; unsafe fn handle_from_u64(meta_handle: u64) -> Self; } @@ -149,22 +161,24 @@ impl Persist for Snapshot { &mut self, stack: &mut NockStack, pma: &PMA, - buffer: *mut u8, - ) -> (u64, *mut u8) { - let snapshot_buffer = buffer as *mut SnapshotMem; - let arvo_buffer = buffer.add(((size_of::() + 7) >> 3) << 3); + buffer: &mut *mut u8, + ) { + let snapshot_buffer = *buffer as *mut SnapshotMem; std::ptr::copy_nonoverlapping(self.0, snapshot_buffer, 1); *self = Snapshot(snapshot_buffer); + *buffer = snapshot_buffer.add(1) as *mut u8; let mut arvo = (*snapshot_buffer).arvo; - let (_, cold_buffer) = arvo.copy_to_buffer(stack, pma, arvo_buffer); + arvo.copy_to_buffer(stack, pma, buffer); (*snapshot_buffer).arvo = arvo; let mut cold = (*snapshot_buffer).cold; - let (_, rest_buffer) = cold.copy_to_buffer(stack, pma, cold_buffer); + cold.copy_to_buffer(stack, pma, buffer); (*snapshot_buffer).cold = cold; + } - (snapshot_buffer as u64, rest_buffer) + unsafe fn handle_to_u64(&self) -> u64 { + self.0 as u64 } unsafe fn handle_from_u64(meta_handle: u64) -> Self { @@ -226,9 +240,9 @@ impl Persist for Noun { &mut self, stack: &mut NockStack, pma: &PMA, - buffer: *mut u8, - ) -> (u64, *mut u8) { - let mut buffer_u64 = buffer as *mut u64; + buffer: &mut *mut u8, + ) { + let mut buffer_u64 = (*buffer) as *mut u64; stack.frame_push(0); *(stack.push::<(Noun, *mut Noun)>()) = (*self, self as *mut Noun); @@ -288,8 +302,11 @@ impl Persist for Noun { } } } + *buffer = buffer_u64 as *mut u8; + } - (self.as_raw(), buffer_u64 as *mut u8) + unsafe fn handle_to_u64(&self) -> u64 { + 
self.as_raw() } unsafe fn handle_from_u64(meta_handle: u64) -> Self { From 07c541e87ea4461b2250d9653a2c7d381d662e2a Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Wed, 6 Dec 2023 23:08:33 -0600 Subject: [PATCH 046/128] pma: fix warning in build script --- rust/ares_pma/build.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/rust/ares_pma/build.rs b/rust/ares_pma/build.rs index c4270bf..cb9b8a1 100644 --- a/rust/ares_pma/build.rs +++ b/rust/ares_pma/build.rs @@ -1,7 +1,6 @@ extern crate bindgen; use std::env; -use std::fs::create_dir_all; use std::path::PathBuf; use bindgen::CargoCallbacks; From 6d7a8a1283d6c87bbb2dd8c76959668fd046bb27 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Wed, 6 Dec 2023 23:08:47 -0600 Subject: [PATCH 047/128] devshell: add bacon --- rust/flake.nix | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/flake.nix b/rust/flake.nix index 3f2587a..9ecead5 100644 --- a/rust/flake.nix +++ b/rust/flake.nix @@ -31,6 +31,7 @@ ]) pkgs.autoconf-archive pkgs.cargo-watch + pkgs.bacon pkgs.iconv pkgs.llvmPackages.clang pkgs.pkg-config From 06da6c67578a350ebda87635377e408310dbacac Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 7 Dec 2023 00:10:19 -0600 Subject: [PATCH 048/128] pma: persist instance for Batteries --- rust/ares/src/jets/cold.rs | 50 ++++++++++++++++++++++++++++++++++++++ rust/ares/src/noun.rs | 15 ++++++++++++ rust/ares/src/persist.rs | 43 +++++++++++++++++++++++++++++++- 3 files changed, 107 insertions(+), 1 deletion(-) diff --git a/rust/ares/src/jets/cold.rs b/rust/ares/src/jets/cold.rs index da9205c..4aa392f 100644 --- a/rust/ares/src/jets/cold.rs +++ b/rust/ares/src/jets/cold.rs @@ -5,6 +5,7 @@ use crate::noun::{Atom, DirectAtom, Noun, Slots, D, T}; use crate::persist::{Persist, PMA}; use std::ptr::copy_nonoverlapping; use std::ptr::null_mut; +use std::mem::size_of; pub enum Error { NoParent, @@ -32,6 +33,55 @@ struct BatteriesMem { parent_batteries: Batteries, } +impl Persist for Batteries { + unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { + let mut bytes = 0; + let mut batteries = *self; + + loop { + if batteries.0.is_null() { break; } + if pma.contains(batteries.0, 1) { break; } + bytes += size_of::(); + bytes += (*batteries.0).battery.space_needed(stack, pma); + bytes += (*batteries.0).parent_axis.space_needed(stack, pma); + batteries = (*batteries.0).parent_batteries; + } + bytes + } + + unsafe fn copy_to_buffer( + &mut self, + stack: &mut NockStack, + pma: &PMA, + buffer: &mut *mut u8, + ) { + let mut dest = self; + loop { + if (*dest).0.is_null() { break; } + if pma.contains((*dest).0, 1) { break; } + + let batteries_mem_ptr = *buffer as *mut BatteriesMem; + copy_nonoverlapping((*dest).0, batteries_mem_ptr, 1); + *buffer = batteries_mem_ptr.add(1) as *mut u8; + + (*batteries_mem_ptr).battery.copy_to_buffer(stack, pma, buffer); + (*batteries_mem_ptr).parent_axis.copy_to_buffer(stack, pma, buffer); + + (*dest).0 = batteries_mem_ptr; + dest = &mut (*(*dest).0).parent_batteries; + } + } + + unsafe fn handle_to_u64(&self) -> u64 { + self.0 as u64 + } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + Batteries(meta_handle as *mut BatteriesMem) + } + +} + impl Preserve for Batteries { unsafe fn assert_in_stack(&self, stack: &NockStack) { if self.0.is_null() { diff --git a/rust/ares/src/noun.rs b/rust/ares/src/noun.rs index 0e4b5af..9936de0 100644 --- a/rust/ares/src/noun.rs +++ b/rust/ares/src/noun.rs @@ -876,6 +876,21 @@ impl Atom { *self } } + + /** Make an atom from a raw u64 + * + * # Safety + * 
+ * Note that the [u64] parameter is *not*, in general, the value of the atom! + * + * In particular, anything with the high bit set will be treated as a tagged pointer. + * This method is only to be used to restore an atom from the raw [u64] representation + * returned by [Noun::as_raw], and should only be used if we are sure the restored noun is in + * fact an atom. + */ + pub unsafe fn from_raw(raw: u64) -> Atom { + Atom { raw } + } } impl fmt::Display for Atom { diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index 16c3354..42c0420 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -1,6 +1,6 @@ use crate::jets::cold::Cold; use crate::mem::NockStack; -use crate::noun::{Allocated, Cell, CellMemory, IndirectAtom, Noun}; +use crate::noun::{Allocated, Cell, CellMemory, IndirectAtom, Noun, Atom}; use ares_pma::*; use either::Either::{Left, Right}; use std::ffi::{c_void, CString}; @@ -199,6 +199,47 @@ unsafe fn unmark(a: Allocated) { a.set_metadata(metadata & !NOUN_MARKED); } +impl Persist for Atom { + unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { + if let Ok(indirect) = self.as_indirect() { + let count = indirect.raw_size(); + if !pma.contains(indirect.to_raw_pointer(), count) { + if !mark(indirect.as_allocated()) { + return count * size_of::(); + } + } + } + 0 + } + + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { + if let Ok(mut indirect) = self.as_indirect() { + let count = indirect.raw_size(); + if !pma.contains(indirect.to_raw_pointer(), count) { + if let Some(forward) = indirect.forwarding_pointer() { + *self = forward.as_atom(); + } else { + let indirect_buffer_ptr = *buffer as *mut u64; + copy_nonoverlapping(indirect.to_raw_pointer(), indirect_buffer_ptr, count); + *buffer = indirect_buffer_ptr.add(count) as *mut u8; + + indirect.set_forwarding_pointer(indirect_buffer_ptr); + + *self = IndirectAtom::from_raw_pointer(indirect_buffer_ptr).as_atom(); + } + } + } + } + + unsafe fn handle_to_u64(&self) -> u64 { + self.as_noun().as_raw() + } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + Atom::from_raw(meta_handle) + } +} + impl Persist for Noun { unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { let mut space = 0usize; From 295da34cf851f6035c317525cf831718acac58ef Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 7 Dec 2023 01:07:26 -0600 Subject: [PATCH 049/128] pma: preserve instance for BatteriesList --- rust/ares/src/jets/cold.rs | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/rust/ares/src/jets/cold.rs b/rust/ares/src/jets/cold.rs index 4aa392f..3449757 100644 --- a/rust/ares/src/jets/cold.rs +++ b/rust/ares/src/jets/cold.rs @@ -194,6 +194,47 @@ struct BatteriesListMem { next: BatteriesList, } +impl Persist for BatteriesList { + unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { + let mut bytes = 0; + let mut list = *self; + loop { + if list.0.is_null() { break; } + if pma.contains(list.0, 1) { break; } + bytes += size_of::(); + bytes += (*list.0).batteries.space_needed(stack, pma); + + list = (*list.0).next; + } + bytes + } + + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { + let mut dest = self; + + loop { + if (*dest).0.is_null() { break; } + if pma.contains((*dest).0, 1) { break; } + + let list_mem_ptr = *buffer as *mut BatteriesListMem; + copy_nonoverlapping((*dest).0, list_mem_ptr, 1); + *buffer = 
list_mem_ptr.add(1) as *mut u8; + (*dest).0 = list_mem_ptr; + + (*(*dest).0).batteries.copy_to_buffer(stack, pma, buffer); + dest = &mut (*(*dest).0).next; + } + } + + unsafe fn handle_to_u64(&self) -> u64 { + self.0 as u64 + } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + BatteriesList(meta_handle as *mut BatteriesListMem) + } +} + impl Preserve for BatteriesList { unsafe fn assert_in_stack(&self, stack: &NockStack) { if self.0.is_null() { From bd31c02baaf14d27fd7e81d63ce08b9744ed9161 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 7 Dec 2023 01:19:27 -0600 Subject: [PATCH 050/128] pma: Persist instance for NounList --- rust/ares/src/jets/cold.rs | 44 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/rust/ares/src/jets/cold.rs b/rust/ares/src/jets/cold.rs index 3449757..fa2eccc 100644 --- a/rust/ares/src/jets/cold.rs +++ b/rust/ares/src/jets/cold.rs @@ -307,6 +307,50 @@ struct NounListMem { next: NounList, } +impl Persist for NounList { + unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { + let mut bytes: usize = 0; + let mut list = *self; + + loop { + if list.0.is_null() { break; } + if pma.contains(list.0, 1) { break; } + + bytes += size_of::(); + bytes += (*list.0).element.space_needed(stack, pma); + + list = (*list.0).next; + } + bytes + } + + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { + let mut dest = self; + + loop { + if (*dest).0.is_null() { break; } + if pma.contains((*dest).0, 1) { break; } + + let noun_list_mem_ptr = *buffer as *mut NounListMem; + copy_nonoverlapping((*dest).0, noun_list_mem_ptr, 1); + *buffer = noun_list_mem_ptr.add(1) as *mut u8; + + (*dest).0 = noun_list_mem_ptr; + (*(*dest).0).element.copy_to_buffer(stack, pma, buffer); + + dest = &mut (*(*dest).0).next; + } + } + + unsafe fn handle_to_u64(&self) -> u64 { + self.0 as u64 + } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + NounList(meta_handle as *mut NounListMem) + } +} + impl Preserve for NounList { unsafe fn assert_in_stack(&self, stack: &NockStack) { if self.0.is_null() { From d321a13da8d69f044e9d590ff2854897494f891a Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 7 Dec 2023 01:28:07 -0600 Subject: [PATCH 051/128] pma: Preserve instance for Cold --- rust/ares/src/jets/cold.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/rust/ares/src/jets/cold.rs b/rust/ares/src/jets/cold.rs index fa2eccc..3b6dee5 100644 --- a/rust/ares/src/jets/cold.rs +++ b/rust/ares/src/jets/cold.rs @@ -430,7 +430,13 @@ struct ColdMem { impl Persist for Cold { unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { - todo!() + if pma.contains(self.0, 1) { return 0; } + + let mut bytes = size_of::(); + bytes += (*(*self).0).battery_to_paths.space_needed(stack, pma); + bytes += (*(*self).0).root_to_paths.space_needed(stack, pma); + bytes += (*(*self).0).path_to_batteries.space_needed(stack, pma); + bytes } unsafe fn copy_to_buffer( @@ -439,7 +445,17 @@ impl Persist for Cold { pma: &PMA, buffer: &mut *mut u8, ) { - todo!() + if pma.contains(self.0, 1) { return; } + + let cold_mem_ptr = *buffer as *mut ColdMem; + copy_nonoverlapping(self.0, cold_mem_ptr, 1); + *buffer = cold_mem_ptr.add(1) as *mut u8; + + (*self).0 = cold_mem_ptr; + + (*(*self).0).battery_to_paths.copy_to_buffer(stack, pma, buffer); + (*(*self).0).root_to_paths.copy_to_buffer(stack, pma, buffer); + (*(*self).0).path_to_batteries.copy_to_buffer(stack, 
pma, buffer); } unsafe fn handle_to_u64(&self) -> u64 { From 70e0f6abd50c2d11ac65f03ba7b84ac3145a9947 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 7 Dec 2023 01:28:21 -0600 Subject: [PATCH 052/128] pma: re-initialize hot state after stack reset --- rust/ares/src/serf.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rust/ares/src/serf.rs b/rust/ares/src/serf.rs index 361b145..405bde2 100644 --- a/rust/ares/src/serf.rs +++ b/rust/ares/src/serf.rs @@ -111,6 +111,9 @@ impl Context { // Reset the nock stack, freeing all memory used to compute the event self.nock_context.stack.reset(0); + // Since we reset the nockstack the stack-allocated hot state isn't valid anymore + self.nock_context.hot = Hot::init(&mut self.nock_context.stack); + // XX some things were invalidated when we reset the stack self.nock_context.warm = Warm::init( &mut self.nock_context.stack, From 863afcd7ef235c70732093a7f5331fed4ba2042c Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 7 Dec 2023 01:34:47 -0600 Subject: [PATCH 053/128] pma: format --- rust/ares/src/hamt.rs | 76 ++++++++++++++++--------------- rust/ares/src/jets/cold.rs | 93 ++++++++++++++++++++++++-------------- rust/ares/src/noun.rs | 2 +- rust/ares/src/persist.rs | 29 +++--------- 4 files changed, 108 insertions(+), 92 deletions(-) diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index 7e8ad25..fe32842 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -1,11 +1,11 @@ use crate::mem::{unifying_equality, NockStack, Preserve}; use crate::mug::mug_u32; use crate::noun::Noun; +use crate::persist::{Persist, PMA}; use either::Either::{self, *}; +use std::mem::size_of; use std::ptr::{copy_nonoverlapping, null, null_mut}; use std::slice; -use crate::persist::{Persist, PMA}; -use std::mem::size_of; type MutStemEntry = Either<*mut MutStem, Leaf>; @@ -574,7 +574,7 @@ impl Preserve for Hamt { } } -#[derive(Copy,Clone)] +#[derive(Copy, Clone)] struct StemTraversalEntry { bitmap_remaining: u32, typemap_remaining: u32, @@ -583,25 +583,28 @@ struct StemTraversalEntry { impl Persist for Hamt { unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { - if pma.contains(self.0, 1) { return 0; } + if pma.contains(self.0, 1) { + return 0; + } let mut bytes: usize = size_of::>(); - if pma.contains((*self.0).buffer, (*self.0).size()) { return bytes }; + if pma.contains((*self.0).buffer, (*self.0).size()) { + return bytes; + }; let mut depth: usize = 0; - let mut traversal = [ - Stem { - bitmap: 0, - typemap: 0, - buffer: null_mut(), - }; - 6 - ]; + let mut traversal = [Stem { + bitmap: 0, + typemap: 0, + buffer: null_mut(), + }; 6]; traversal[0] = (*self.0); loop { assert!(depth < 6); if traversal[depth].bitmap == 0 { - if depth == 0 { break bytes; } + if depth == 0 { + break bytes; + } depth -= 1; } @@ -612,7 +615,8 @@ impl Persist for Hamt { traversal[depth].typemap = traversal[depth].typemap >> (next_chunk + 1); traversal[depth].buffer = traversal[depth].buffer.add(1); - if next_type { // true->stem false->leaf + if next_type { + // true->stem false->leaf // found another stem traversal[depth + 1] = next_entry.stem; @@ -635,7 +639,7 @@ impl Persist for Hamt { } bytes += size_of::<(Noun, T)>() * leaf.len; - + while leaf.len > 0 { bytes += (*leaf.buffer).0.space_needed(stack, pma); bytes += (*leaf.buffer).1.space_needed(stack, pma); @@ -648,34 +652,30 @@ impl Persist for Hamt { // XX this is subtly wrong, we need to track destination pointers somehow and not just write // into the traversal stack - unsafe fn 
copy_to_buffer( - &mut self, - stack: &mut NockStack, - pma: &PMA, - buffer: &mut *mut u8, - ) { - if pma.contains(self.0, 1) { return; } + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { + if pma.contains(self.0, 1) { + return; + } let stem_ptr = *buffer as *mut Stem; copy_nonoverlapping(self.0, stem_ptr, 1); *buffer = stem_ptr.add(1) as *mut u8; (*self).0 = stem_ptr; let stem_buffer_size = (*stem_ptr).size(); - if pma.contains((*stem_ptr).buffer, stem_buffer_size) { return; } + if pma.contains((*stem_ptr).buffer, stem_buffer_size) { + return; + } let stem_buffer_ptr = *buffer as *mut Entry; copy_nonoverlapping((*stem_ptr).buffer, stem_buffer_ptr, stem_buffer_size); *buffer = stem_buffer_ptr.add(stem_buffer_size) as *mut u8; (*stem_ptr).buffer = stem_buffer_ptr; let mut depth: usize = 0; - let mut traversal = [ - Stem { - bitmap: 0, - typemap: 0, - buffer: null_mut(), - }; - 6 - ]; + let mut traversal = [Stem { + bitmap: 0, + typemap: 0, + buffer: null_mut(), + }; 6]; traversal[0] = *stem_ptr; @@ -708,7 +708,7 @@ impl Persist for Hamt { *buffer = stem_buffer_ptr.add(stem_size) as *mut u8; (*stem_ptr).buffer = stem_buffer_ptr; - + traversal[depth + 1] = *stem_ptr; depth += 1; continue; @@ -719,14 +719,18 @@ impl Persist for Hamt { copy_nonoverlapping((*leaf_ptr).buffer, leaf_buffer_ptr, (*leaf_ptr).len); *buffer = leaf_buffer_ptr.add((*leaf_ptr).len) as *mut u8; - + (*leaf_ptr).buffer = leaf_buffer_ptr; let mut leaf_idx = 0; while leaf_idx < (*leaf_ptr).len { - (*(*leaf_ptr).buffer.add(leaf_idx)).0.copy_to_buffer(stack, pma, buffer); - (*(*leaf_ptr).buffer.add(leaf_idx)).1.copy_to_buffer(stack, pma, buffer); + (*(*leaf_ptr).buffer.add(leaf_idx)) + .0 + .copy_to_buffer(stack, pma, buffer); + (*(*leaf_ptr).buffer.add(leaf_idx)) + .1 + .copy_to_buffer(stack, pma, buffer); leaf_idx += 1; } diff --git a/rust/ares/src/jets/cold.rs b/rust/ares/src/jets/cold.rs index 3b6dee5..9db0ef0 100644 --- a/rust/ares/src/jets/cold.rs +++ b/rust/ares/src/jets/cold.rs @@ -3,9 +3,9 @@ use crate::mem::{unifying_equality, NockStack, Preserve}; use crate::noun; use crate::noun::{Atom, DirectAtom, Noun, Slots, D, T}; use crate::persist::{Persist, PMA}; +use std::mem::size_of; use std::ptr::copy_nonoverlapping; use std::ptr::null_mut; -use std::mem::size_of; pub enum Error { NoParent, @@ -39,8 +39,12 @@ impl Persist for Batteries { let mut batteries = *self; loop { - if batteries.0.is_null() { break; } - if pma.contains(batteries.0, 1) { break; } + if batteries.0.is_null() { + break; + } + if pma.contains(batteries.0, 1) { + break; + } bytes += size_of::(); bytes += (*batteries.0).battery.space_needed(stack, pma); bytes += (*batteries.0).parent_axis.space_needed(stack, pma); @@ -49,23 +53,26 @@ impl Persist for Batteries { bytes } - unsafe fn copy_to_buffer( - &mut self, - stack: &mut NockStack, - pma: &PMA, - buffer: &mut *mut u8, - ) { + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { let mut dest = self; loop { - if (*dest).0.is_null() { break; } - if pma.contains((*dest).0, 1) { break; } + if (*dest).0.is_null() { + break; + } + if pma.contains((*dest).0, 1) { + break; + } let batteries_mem_ptr = *buffer as *mut BatteriesMem; copy_nonoverlapping((*dest).0, batteries_mem_ptr, 1); *buffer = batteries_mem_ptr.add(1) as *mut u8; - (*batteries_mem_ptr).battery.copy_to_buffer(stack, pma, buffer); - (*batteries_mem_ptr).parent_axis.copy_to_buffer(stack, pma, buffer); + (*batteries_mem_ptr) + .battery + .copy_to_buffer(stack, 
pma, buffer); + (*batteries_mem_ptr) + .parent_axis + .copy_to_buffer(stack, pma, buffer); (*dest).0 = batteries_mem_ptr; dest = &mut (*(*dest).0).parent_batteries; @@ -79,7 +86,6 @@ impl Persist for Batteries { unsafe fn handle_from_u64(meta_handle: u64) -> Self { Batteries(meta_handle as *mut BatteriesMem) } - } impl Preserve for Batteries { @@ -199,8 +205,12 @@ impl Persist for BatteriesList { let mut bytes = 0; let mut list = *self; loop { - if list.0.is_null() { break; } - if pma.contains(list.0, 1) { break; } + if list.0.is_null() { + break; + } + if pma.contains(list.0, 1) { + break; + } bytes += size_of::(); bytes += (*list.0).batteries.space_needed(stack, pma); @@ -213,8 +223,12 @@ impl Persist for BatteriesList { let mut dest = self; loop { - if (*dest).0.is_null() { break; } - if pma.contains((*dest).0, 1) { break; } + if (*dest).0.is_null() { + break; + } + if pma.contains((*dest).0, 1) { + break; + } let list_mem_ptr = *buffer as *mut BatteriesListMem; copy_nonoverlapping((*dest).0, list_mem_ptr, 1); @@ -313,8 +327,12 @@ impl Persist for NounList { let mut list = *self; loop { - if list.0.is_null() { break; } - if pma.contains(list.0, 1) { break; } + if list.0.is_null() { + break; + } + if pma.contains(list.0, 1) { + break; + } bytes += size_of::(); bytes += (*list.0).element.space_needed(stack, pma); @@ -328,8 +346,12 @@ impl Persist for NounList { let mut dest = self; loop { - if (*dest).0.is_null() { break; } - if pma.contains((*dest).0, 1) { break; } + if (*dest).0.is_null() { + break; + } + if pma.contains((*dest).0, 1) { + break; + } let noun_list_mem_ptr = *buffer as *mut NounListMem; copy_nonoverlapping((*dest).0, noun_list_mem_ptr, 1); @@ -430,7 +452,9 @@ struct ColdMem { impl Persist for Cold { unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { - if pma.contains(self.0, 1) { return 0; } + if pma.contains(self.0, 1) { + return 0; + } let mut bytes = size_of::(); bytes += (*(*self).0).battery_to_paths.space_needed(stack, pma); @@ -439,13 +463,10 @@ impl Persist for Cold { bytes } - unsafe fn copy_to_buffer( - &mut self, - stack: &mut NockStack, - pma: &PMA, - buffer: &mut *mut u8, - ) { - if pma.contains(self.0, 1) { return; } + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { + if pma.contains(self.0, 1) { + return; + } let cold_mem_ptr = *buffer as *mut ColdMem; copy_nonoverlapping(self.0, cold_mem_ptr, 1); @@ -453,9 +474,15 @@ impl Persist for Cold { (*self).0 = cold_mem_ptr; - (*(*self).0).battery_to_paths.copy_to_buffer(stack, pma, buffer); - (*(*self).0).root_to_paths.copy_to_buffer(stack, pma, buffer); - (*(*self).0).path_to_batteries.copy_to_buffer(stack, pma, buffer); + (*(*self).0) + .battery_to_paths + .copy_to_buffer(stack, pma, buffer); + (*(*self).0) + .root_to_paths + .copy_to_buffer(stack, pma, buffer); + (*(*self).0) + .path_to_batteries + .copy_to_buffer(stack, pma, buffer); } unsafe fn handle_to_u64(&self) -> u64 { diff --git a/rust/ares/src/noun.rs b/rust/ares/src/noun.rs index 9936de0..4fc886e 100644 --- a/rust/ares/src/noun.rs +++ b/rust/ares/src/noun.rs @@ -879,7 +879,7 @@ impl Atom { /** Make an atom from a raw u64 * - * # Safety + * # Safety * * Note that the [u64] parameter is *not*, in general, the value of the atom! 
* diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index 42c0420..514bb7c 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -1,13 +1,13 @@ use crate::jets::cold::Cold; use crate::mem::NockStack; -use crate::noun::{Allocated, Cell, CellMemory, IndirectAtom, Noun, Atom}; +use crate::noun::{Allocated, Atom, Cell, CellMemory, IndirectAtom, Noun}; use ares_pma::*; use either::Either::{Left, Right}; +use std::convert::TryInto; use std::ffi::{c_void, CString}; use std::mem::size_of; use std::path::PathBuf; use std::ptr::copy_nonoverlapping; -use std::convert::TryInto; const PMA_MODE: mode_t = 0o600; // RW for user only const PMA_FLAGS: ULONG = 0; // ignored for now @@ -116,19 +116,14 @@ pub trait Persist { /// Copy into the provided buffer, which may be assumed to be at least as large as the size /// returned by [space_needed] on the same structure. Return a u64 handle that could be saved /// in metadata - unsafe fn copy_to_buffer( - &mut self, - stack: &mut NockStack, - pma: &PMA, - buffer: &mut *mut u8, - ); + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8); /// Persist an object into the PMA using [space_needed] and [copy_to_buffer], returning /// a [u64] (probably a pointer or tagged pointer) that can be saved into fn save_to_pma(&mut self, stack: &mut NockStack, pma: &PMA) -> u64 { unsafe { - let space = self.space_needed(stack, pma); - + let space = self.space_needed(stack, pma); + if space == 0 { return self.handle_to_u64(); } @@ -157,12 +152,7 @@ impl Persist for Snapshot { (((size_of::() + 7) >> 3) << 3) + arvo_space_needed + cold_space_needed } - unsafe fn copy_to_buffer( - &mut self, - stack: &mut NockStack, - pma: &PMA, - buffer: &mut *mut u8, - ) { + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { let snapshot_buffer = *buffer as *mut SnapshotMem; std::ptr::copy_nonoverlapping(self.0, snapshot_buffer, 1); *self = Snapshot(snapshot_buffer); @@ -277,12 +267,7 @@ impl Persist for Noun { space } - unsafe fn copy_to_buffer( - &mut self, - stack: &mut NockStack, - pma: &PMA, - buffer: &mut *mut u8, - ) { + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { let mut buffer_u64 = (*buffer) as *mut u64; stack.frame_push(0); *(stack.push::<(Noun, *mut Noun)>()) = (*self, self as *mut Noun); From 03ad01d23d62915b2c4a38e30b7c9c8cb86f0600 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 7 Dec 2023 01:36:53 -0600 Subject: [PATCH 054/128] pma: fix up last rust todos --- rust/ares/src/hamt.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index fe32842..7e9ef50 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -741,10 +741,10 @@ impl Persist for Hamt { } unsafe fn handle_to_u64(&self) -> u64 { - todo!() + self.0 as u64 } unsafe fn handle_from_u64(meta_handle: u64) -> Self { - todo!() + Hamt(meta_handle as *mut Stem) } } From b082af0859c8b617734054a61ef6dc16b46dd21e Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 7 Dec 2023 09:41:16 -0600 Subject: [PATCH 055/128] PMA: throw unimplemented when trying to open on windows --- rust/ares/src/persist.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index 514bb7c..c8fd105 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -57,7 +57,7 @@ impl PMA { #[cfg(windows)] pub fn open(path: PathBuf) -> Result { 
- todo!() + unimplemented!() } #[inline] From a3f23f1f9d01dfb79041b1a6b35caf978ddcab5b Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 7 Dec 2023 09:43:26 -0600 Subject: [PATCH 056/128] ci: Run workflows on any changes under rust/ --- .github/workflows/ares-feature.yml | 4 +--- .github/workflows/ares-status.yml | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ares-feature.yml b/.github/workflows/ares-feature.yml index 4020531..f9facd2 100644 --- a/.github/workflows/ares-feature.yml +++ b/.github/workflows/ares-feature.yml @@ -5,9 +5,7 @@ on: paths: - '.github/workflows/ares-feature.yml' - '.github/workflows/ares-shared.yml' - - 'rust/ares/**' - - 'rust/ares_macros/**' - - 'rust/ibig-rs/**' + - 'rust/**' jobs: urbit: diff --git a/.github/workflows/ares-status.yml b/.github/workflows/ares-status.yml index 815113b..3523954 100644 --- a/.github/workflows/ares-status.yml +++ b/.github/workflows/ares-status.yml @@ -7,9 +7,7 @@ on: paths: - '.github/workflows/ares-shared.yml' - '.github/workflows/ares-status.yml' - - 'rust/ares/**' - - 'rust/ares_macros/**' - - 'rust/ibig-rs/**' + - 'rust/**' jobs: urbit: From 3eb0b3a5d5a6f7a7ec14db36e6d32f59be47a2d4 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Thu, 7 Dec 2023 18:40:27 -0500 Subject: [PATCH 057/128] pma: implement bt_state_close to a sufficient degree --- rust/ares_pma/c-src/btree.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index a321998..5e8f490 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -1514,6 +1514,20 @@ _nlist_new(BT_state *state) return BT_SUCC; } +static int +_nlist_delete(BT_state *state) +{ + BT_nlistnode *head, *prev; + head = prev = state->nlist; + while (head->next) { + prev = head; + head = head->next; + free(prev); + } + state->nlist = 0; + return BT_SUCC; +} + static BT_nlistnode * _nlist_read_prev(BT_nlistnode *head, BT_nlistnode *curr) { @@ -2469,14 +2483,19 @@ int bt_state_close(BT_state *state) { int rc; - if (state->data_fd != -1) CLOSE_FD(state->data_fd); + bt_sync(state); _mlist_delete(state); _flist_delete(state); + _nlist_delete(state); - /* ;;: wip delete the file because we haven't implemented persistence yet */ - if (!SUCC(rc = remove(state->path))) + if ((rc = munmap(state->map, BT_ADDRSIZE)) != 0) { + rc = errno; return rc; + } + if (state->data_fd != -1) CLOSE_FD(state->data_fd); + + ZERO(state, sizeof *state); return BT_SUCC; } From d1139abf9c5372a068eae8a6c1ee4c4e7ee4147b Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 8 Dec 2023 17:41:18 -0500 Subject: [PATCH 058/128] pma: fix bt_sync segfault --- rust/ares_pma/c-src/btree.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 5e8f490..562a2ff 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -1020,10 +1020,11 @@ _bt_delco_trim_rsubtree_lhs2(BT_state *state, vaof_t lo, vaof_t hi, { BT_page *node = _node_get(state, nodepg); size_t hiidx = 0; + size_t N = _bt_numkeys(node); /* find hi idx of range */ size_t i; - for (i = 0; i < BT_DAT_MAXKEYS-1; i++) { + for (i = 0; i < N-1; i++) { vaof_t hhi = node->datk[i].va; if (hhi >= hi) { hiidx = i; @@ -1097,7 +1098,7 @@ _bt_delco_trim_lsubtree_rhs2(BT_state *state, vaof_t lo, vaof_t hi, /* drop the subtrees right of the range */ if (depth != maxdepth) { /* recur and droptree for branches */ - for (i = 
loidx+1; i < BT_DAT_MAXKEYS-1; i++) { + for (i = loidx+1; i < N-1; i++) { pgno_t childpg = node->datk[i].fo; if (childpg == 0) break; @@ -1154,7 +1155,7 @@ _bt_delco(BT_state *state, vaof_t lo, vaof_t hi, } /* find high idx of range */ - for (size_t i = loidx; i < BT_DAT_MAXKEYS-1; i++) { + for (size_t i = loidx; i < N-1; i++) { vaof_t hhi = node->datk[i].va; if (hhi >= hi) { assert(i > 0); @@ -2312,8 +2313,9 @@ _bt_sync_leaf(BT_state *state, BT_page *node) the node itself and mark it as clean in the parent. */ pgno_t pg; size_t i = 0; + size_t N = _bt_numkeys(node); - for (size_t i = 0; i < BT_DAT_MAXKEYS-1; i++) { + for (size_t i = 0; i < N-1; i++) { if (!_bt_ischilddirty(node, i)) continue; /* not dirty. nothing to do */ @@ -2398,6 +2400,7 @@ _bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth) itself and mark it clean. */ { int rc = 0; + size_t N = _bt_numkeys(node); /* leaf */ if (depth == maxdepth) { @@ -2406,7 +2409,7 @@ _bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth) } /* do dfs */ - for (size_t i = 0; i < BT_DAT_MAXKEYS-1; i++) { + for (size_t i = 0; i < N-1; i++) { if (!_bt_ischilddirty(node, i)) continue; /* not dirty. nothing to do */ From d75aed7bd2f17540495a1bd23f812d9adfeb0d09 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 8 Dec 2023 18:15:31 -0500 Subject: [PATCH 059/128] pma: various bug fixes - initialize root to page offset beyond first node stripe (not zero). Is this improper? We initialize the mlist at the first va in the root. Maybe that part should handle the node stripes instead? Don't know if it actually matters. - fix insert tests - fix state->meta_pages initialization - other minor fixes --- rust/ares_pma/c-src/btest.c | 33 +++++++++++++++++++++++---------- rust/ares_pma/c-src/btree.c | 15 ++++++++------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c index ee4c728..7dc7080 100644 --- a/rust/ares_pma/c-src/btest.c +++ b/rust/ares_pma/c-src/btest.c @@ -22,26 +22,39 @@ int main(int argc, char *argv[]) { DPUTS("PMA Tests"); - BT_state *state; + BT_state *state1; BT_findpath path = {0}; int rc = 0; - bt_state_new(&state); - - + DPUTS("== test 1: insert"); - assert(SUCC(bt_state_open(state, "./pmatest", 0, 0644))); - vaof_t lo = 10; + bt_state_new(&state1); + + assert(SUCC(bt_state_open(state1, "./pmatest1", 0, 0644))); + +#define LOWEST_ADDR 0x200000; + vaof_t lo = LOWEST_ADDR; vaof_t hi = 0xDEADBEEF; pgno_t pg = 1; /* dummy value */ for (size_t i = 0; i < BT_DAT_MAXKEYS * 4; ++i) { - /* if (i % (BT_DAT_MAXKEYS - 2) == 0) */ - /* bp(0); /\* breakpoint on split case *\/ */ - _bt_insert(state, lo, hi, pg); - _test_nodeinteg(state, &path, lo, hi, pg); + DPRINTF("== i: %zu", i); + _bt_insert(state1, lo, hi, pg); + _test_nodeinteg(state1, &path, lo, hi, pg); lo++; pg++; } + bt_state_close(state1); + + + DPUTS("== test 2: malloc"); + BT_state *state2; + + bt_state_new(&state2); + assert(SUCC(bt_state_open(state2, "./pmatest2", 0, 0644))); + + void *t2a = bt_malloc(state2, 10); + bt_free(state2, t2a, (BT_page*)t2a + 10); + return 0; } diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 562a2ff..ad2f01c 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -487,9 +487,10 @@ _bt_find2(BT_state *state, } static void -_bt_root_new(BT_page *root) +_bt_root_new(BT_meta *meta, BT_page *root) { - root->datk[0].va = 0; + /* The first usable address in the PMA is just beyond the first node stripe */ + 
root->datk[0].va = meta->blk_base[0] + BLK_BASE_LEN0; root->datk[0].fo = 0; root->datk[1].va = UINT32_MAX; root->datk[1].fo = 0; @@ -2154,8 +2155,11 @@ _bt_state_meta_new(BT_state *state) TRACE(); + /* initialize the block base array */ + meta.blk_base[0] = BT_PAGESIZE * BT_NUMMETAS; + root = _bt_nalloc(state); - _bt_root_new(root); + _bt_root_new(&meta, root); pagesize = sizeof *p1; @@ -2171,9 +2175,6 @@ _bt_state_meta_new(BT_state *state) meta.root = _fo_get(state, root); assert(meta.root == INITIAL_ROOTPG); /* ;;: remove?? */ - /* initialize the block base array */ - meta.blk_base[0] = BT_NUMMETAS + 1; - /* initialize the metapages */ p1 = &((BT_page *)state->map)[0]; p2 = &((BT_page *)state->map)[1]; @@ -2218,7 +2219,7 @@ _bt_state_load(BT_state *state) p = (BT_page *)state->map; state->meta_pages[0] = METADATA(p); - state->meta_pages[0] = METADATA(p + 1); + state->meta_pages[1] = METADATA(p + 1); /* new db, so populate metadata */ if (new) { From e7bc9f34ac35cf767f79ee4aa36ab69cfcac3629 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 8 Dec 2023 18:40:15 -0500 Subject: [PATCH 060/128] pma: malloc test update debugging mlist coalescing and deletion coalescing issues --- rust/ares_pma/c-src/btest.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c index 7dc7080..7ce2b24 100644 --- a/rust/ares_pma/c-src/btest.c +++ b/rust/ares_pma/c-src/btest.c @@ -11,7 +11,6 @@ _test_nodeinteg(BT_state *state, BT_findpath *path, assert(SUCC(_bt_find(state, path, lo, hi))); parent = path->path[path->depth]; - /* _bt_printnode(parent); */ childidx = path->idx[path->depth]; assert(parent->datk[childidx].fo == pg); assert(parent->datk[childidx].va == lo); @@ -55,6 +54,16 @@ int main(int argc, char *argv[]) void *t2a = bt_malloc(state2, 10); bt_free(state2, t2a, (BT_page*)t2a + 10); + void *t2b = bt_malloc(state2, 10); + /* should have pulled the same pointer due to eager mlist coalescing */ + /* assert(t2a == t2b); /\* ;;: not working. fix whatever is wrong with mlist coalescing *\/ */ + ZERO(&path, sizeof path); + _bt_find(state2, &path, addr2off(t2b), addr2off((BT_page *)t2b + 10)); + bt_free(state2, t2b, (BT_page*)t2b + 10); + ZERO(&path, sizeof path); + _bt_find(state2, &path, addr2off(t2b), addr2off((BT_page *)t2b + 10)); + /* should invoke deletion coalescing - 10 page free range in btree */ + void *t2c = bt_malloc(state2, 20); return 0; } From 81f58631e131da3701675084ba0de5d9184b0766 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 8 Dec 2023 18:58:24 -0500 Subject: [PATCH 061/128] pma: fix _pending_nlist_insert and _mlist_insert --- rust/ares_pma/c-src/btest.c | 2 +- rust/ares_pma/c-src/btree.c | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c index 7ce2b24..23a838f 100644 --- a/rust/ares_pma/c-src/btest.c +++ b/rust/ares_pma/c-src/btest.c @@ -56,7 +56,7 @@ int main(int argc, char *argv[]) bt_free(state2, t2a, (BT_page*)t2a + 10); void *t2b = bt_malloc(state2, 10); /* should have pulled the same pointer due to eager mlist coalescing */ - /* assert(t2a == t2b); /\* ;;: not working. 
fix whatever is wrong with mlist coalescing *\/ */ + assert(t2a == t2b); ZERO(&path, sizeof path); _bt_find(state2, &path, addr2off(t2b), addr2off((BT_page *)t2b + 10)); bt_free(state2, t2b, (BT_page*)t2b + 10); diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index ad2f01c..5ec5c2c 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -759,10 +759,10 @@ _mlist_insert(BT_state *state, void *lo, void *hi) assert(head); - while (head->next) { + while (head) { BYTE *vob = head->va; size_t siz = head->sz; - BYTE *nob = head->next->va; + BYTE *nob = head->next ? head->next->va : 0; /* freed chunk immediately precedes head */ if (hi == vob) { @@ -810,14 +810,27 @@ _pending_nlist_insert(BT_state *state, pgno_t nodepg) return; } + if (!head->next) { + if (head->va < va) + goto append; + /* otherwise prepend and update mlist head reference */ + BT_nlistnode *new = calloc(1, sizeof *new); + new->sz = 1; + new->va = va; + new->next = head; + state->nlist = new; + } + /* we don't need to account for a freelist node's size because we aren't coalescing the pending freelists */ - while (head->next) { - if (head->next->va > va) + while (head) { + BT_page *nva = head->next ? head->next->va : (void*)-1; + if (nva > va) break; head = head->next; } + append: /* head->next is either null or has a higher address than va */ BT_nlistnode *new = calloc(1, sizeof *new); new->sz = 1; From 10ed711da59cf104ed5164f4bb8c89b5aaff99bf Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 8 Dec 2023 19:27:34 -0500 Subject: [PATCH 062/128] pma: fix _bt_delco hiidx iteration --- rust/ares_pma/c-src/btree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 5ec5c2c..102bea5 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -705,7 +705,7 @@ _bt_delco_1pass_0(BT_state *state, vaof_t lo, vaof_t hi, } /* and then the entry that matches hi */ - for (; i < N-1; i++) { + for (; i < N; i++) { vaof_t hhi = node->datk[i].va; if (hhi >= hi) { hiidx = hi; @@ -1038,7 +1038,7 @@ _bt_delco_trim_rsubtree_lhs2(BT_state *state, vaof_t lo, vaof_t hi, /* find hi idx of range */ size_t i; - for (i = 0; i < N-1; i++) { + for (i = 0; i < N; i++) { vaof_t hhi = node->datk[i].va; if (hhi >= hi) { hiidx = i; @@ -1169,7 +1169,7 @@ _bt_delco(BT_state *state, vaof_t lo, vaof_t hi, } /* find high idx of range */ - for (size_t i = loidx; i < N-1; i++) { + for (size_t i = loidx; i < N; i++) { vaof_t hhi = node->datk[i].va; if (hhi >= hi) { assert(i > 0); @@ -2745,7 +2745,7 @@ _bt_dirty(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg, assert(loidx != 0); /* find hiidx of range */ - for (size_t i = loidx; i < N-1; i++) { + for (size_t i = loidx; i < N; i++) { vaof_t hhi = node->datk[i+1].va; if (hhi >= hi) { hiidx = i; From 17d74aed4974d0810d26f91fb7ba58f33e6be0da Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 8 Dec 2023 19:40:37 -0500 Subject: [PATCH 063/128] pma: fix deletion coalescing first pass. 
update malloc tests --- rust/ares_pma/c-src/btest.c | 9 +++++++++ rust/ares_pma/c-src/btree.c | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c index 23a838f..612e410 100644 --- a/rust/ares_pma/c-src/btest.c +++ b/rust/ares_pma/c-src/btest.c @@ -59,9 +59,18 @@ int main(int argc, char *argv[]) assert(t2a == t2b); ZERO(&path, sizeof path); _bt_find(state2, &path, addr2off(t2b), addr2off((BT_page *)t2b + 10)); +#define T2P1_PRNT0 (path.path[path.depth]) +#define T2P1_CIDX0 (path.idx[path.depth]) +#define T2P1_CIDX1 (path.idx[path.depth] + 1) + /* check length as represented in btree */ + assert(T2P1_PRNT0->datk[T2P1_CIDX1].va + - T2P1_PRNT0->datk[T2P1_CIDX0].va + == 10); bt_free(state2, t2b, (BT_page*)t2b + 10); ZERO(&path, sizeof path); _bt_find(state2, &path, addr2off(t2b), addr2off((BT_page *)t2b + 10)); + /* fo should be zero (free) */ + assert(path.path[path.depth]->datk[path.idx[path.depth]].fo == 0); /* should invoke deletion coalescing - 10 page free range in btree */ void *t2c = bt_malloc(state2, 20); diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 102bea5..c293f58 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -708,7 +708,7 @@ _bt_delco_1pass_0(BT_state *state, vaof_t lo, vaof_t hi, for (; i < N; i++) { vaof_t hhi = node->datk[i].va; if (hhi >= hi) { - hiidx = hi; + hiidx = i; break; } } @@ -717,7 +717,7 @@ _bt_delco_1pass_0(BT_state *state, vaof_t lo, vaof_t hi, the dfs */ for (i = loidx; i < hiidx; i++) { vaof_t llo = node->datk[i].va; - pgno_t pg = node->datk[i].va; + pgno_t pg = node->datk[i].fo; /* if at the leaf level, terminate with failure if pg is not free */ if (depth == maxdepth) { From 82cd52acb865319626396ffb023b581e6535d03f Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 8 Dec 2023 19:51:44 -0500 Subject: [PATCH 064/128] pma: _bt_delco: don't trim subtrees in leaves --- rust/ares_pma/c-src/btree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index c293f58..1bdb642 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -1242,9 +1242,11 @@ _bt_delco(BT_state *state, vaof_t lo, vaof_t hi, memmove(dst, src, len); ZERO(dst+len, end-(dst+len)); - /* trim left subtree then trim right subtree */ - _bt_delco_trim_lsubtree_rhs(state, lo, hi, lsubtree, depth+1); - _bt_delco_trim_rsubtree_lhs(state, lo, hi, rsubtree, depth+1); + /* unless at leaf trim left subtree then trim right subtree */ + if (depth < maxdepth) { + _bt_delco_trim_lsubtree_rhs(state, lo, hi, lsubtree, depth+1); + _bt_delco_trim_rsubtree_lhs(state, lo, hi, rsubtree, depth+1); + } /* done */ return; From 7a81e40829c1ed2ebda20517c54596c8706fff73 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Fri, 8 Dec 2023 19:50:20 -0600 Subject: [PATCH 065/128] pma: move most snapshot loading definition and logic into serf --- rust/ares/src/persist.rs | 76 ++------------------- rust/ares/src/serf.rs | 141 ++++++++++++++++++++++++++++++--------- 2 files changed, 115 insertions(+), 102 deletions(-) diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index c8fd105..f4f30b3 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -12,30 +12,11 @@ use std::ptr::copy_nonoverlapping; const PMA_MODE: mode_t = 0o600; // RW for user only const PMA_FLAGS: ULONG = 0; // ignored for now -const PMA_CURRENT_SNAPSHOT_VERSION: u64 = 1; - const NOUN_MARKED: u64 = 
1 << 63; /// Handle to a PMA pub struct PMA(*mut BT_state); -pub struct Snapshot(pub *mut SnapshotMem); - -#[repr(C)] -#[repr(packed)] -pub struct SnapshotMem { - pub epoch: u64, - pub event_num: u64, - pub arvo: Noun, - pub cold: Cold, -} - -#[repr(usize)] -enum BTMetaField { - SnapshotVersion = 0, - Snapshot = 1, -} - impl PMA { #[cfg(unix)] pub fn open(path: PathBuf) -> Result { @@ -61,13 +42,13 @@ impl PMA { } #[inline] - fn meta_get(&self, field: BTMetaField) -> u64 { - unsafe { bt_meta_get(self.0, field as usize) } + pub fn meta_get(&self, field: usize) -> u64 { + unsafe { bt_meta_get(self.0, field) } } #[inline] - fn meta_set(&self, field: BTMetaField, val: u64) { - unsafe { bt_meta_set(self.0, field as usize, val) }; + pub fn meta_set(&self, field: usize, val: u64) { + unsafe { bt_meta_set(self.0, field, val) }; } pub unsafe fn contains(&self, ptr: *const T, count: usize) -> bool { @@ -75,20 +56,6 @@ impl PMA { && bt_inbounds(self.0, ptr.add(count) as *mut c_void) != 0 } - pub fn load(&self) -> Snapshot { - let snapshot_version = self.meta_get(BTMetaField::SnapshotVersion); - - match snapshot_version { - 1 => Snapshot(self.meta_get(BTMetaField::Snapshot) as *mut SnapshotMem), - _ => panic!("Unsupported snapshot version"), - } - } - - pub fn save(&self, stack: &mut NockStack, snapshot: &mut Snapshot) { - self.meta_set(BTMetaField::SnapshotVersion, PMA_CURRENT_SNAPSHOT_VERSION); - self.meta_set(BTMetaField::Snapshot, snapshot.save_to_pma(stack, self)); - } - pub fn sync(&self) { unsafe { if bt_sync(self.0) != 0 { @@ -120,7 +87,7 @@ pub trait Persist { /// Persist an object into the PMA using [space_needed] and [copy_to_buffer], returning /// a [u64] (probably a pointer or tagged pointer) that can be saved into - fn save_to_pma(&mut self, stack: &mut NockStack, pma: &PMA) -> u64 { + unsafe fn save_to_pma(&mut self, stack: &mut NockStack, pma: &PMA) -> u64 { unsafe { let space = self.space_needed(stack, pma); @@ -143,39 +110,6 @@ pub trait Persist { unsafe fn handle_from_u64(meta_handle: u64) -> Self; } -impl Persist for Snapshot { - unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { - let mut arvo = (*(self.0)).arvo; - let mut cold = (*(self.0)).cold; - let arvo_space_needed = arvo.space_needed(stack, pma); - let cold_space_needed = cold.space_needed(stack, pma); - (((size_of::() + 7) >> 3) << 3) + arvo_space_needed + cold_space_needed - } - - unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { - let snapshot_buffer = *buffer as *mut SnapshotMem; - std::ptr::copy_nonoverlapping(self.0, snapshot_buffer, 1); - *self = Snapshot(snapshot_buffer); - *buffer = snapshot_buffer.add(1) as *mut u8; - - let mut arvo = (*snapshot_buffer).arvo; - arvo.copy_to_buffer(stack, pma, buffer); - (*snapshot_buffer).arvo = arvo; - - let mut cold = (*snapshot_buffer).cold; - cold.copy_to_buffer(stack, pma, buffer); - (*snapshot_buffer).cold = cold; - } - - unsafe fn handle_to_u64(&self) -> u64 { - self.0 as u64 - } - - unsafe fn handle_from_u64(meta_handle: u64) -> Self { - Snapshot(meta_handle as *mut SnapshotMem) - } -} - /// Ensure an allocated noun is marked and return if it was already marked unsafe fn mark(a: Allocated) -> bool { let metadata = a.get_metadata(); diff --git a/rust/ares/src/serf.rs b/rust/ares/src/serf.rs index 405bde2..288e893 100644 --- a/rust/ares/src/serf.rs +++ b/rust/ares/src/serf.rs @@ -10,13 +10,14 @@ use crate::mem::NockStack; use crate::mug::*; use crate::newt::Newt; use crate::noun::{Atom, Cell, DirectAtom, 
Noun, Slots, D, T}; -use crate::persist::{Snapshot, SnapshotMem, PMA}; +use crate::persist::{Persist, PMA}; use crate::trace::*; use ares_macros::tas; use signal_hook; use signal_hook::consts::SIGINT; use std::fs::create_dir_all; use std::io; +use std::mem::size_of; use std::path::PathBuf; use std::result::Result; use std::sync::atomic::{AtomicBool, Ordering}; @@ -27,6 +28,57 @@ crate::gdb!(); const FLAG_TRACE: u32 = 1 << 8; +#[repr(usize)] +enum BTMetaField { + SnapshotVersion = 0, + Snapshot = 1, +} +struct Snapshot(pub *mut SnapshotMem); + +impl Persist for Snapshot { + unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { + let mut arvo = (*(self.0)).arvo; + let mut cold = (*(self.0)).cold; + let arvo_space_needed = arvo.space_needed(stack, pma); + let cold_space_needed = cold.space_needed(stack, pma); + (((size_of::() + 7) >> 3) << 3) + arvo_space_needed + cold_space_needed + } + + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { + let snapshot_buffer = *buffer as *mut SnapshotMem; + std::ptr::copy_nonoverlapping(self.0, snapshot_buffer, 1); + *self = Snapshot(snapshot_buffer); + *buffer = snapshot_buffer.add(1) as *mut u8; + + let mut arvo = (*snapshot_buffer).arvo; + arvo.copy_to_buffer(stack, pma, buffer); + (*snapshot_buffer).arvo = arvo; + + let mut cold = (*snapshot_buffer).cold; + cold.copy_to_buffer(stack, pma, buffer); + (*snapshot_buffer).cold = cold; + } + + unsafe fn handle_to_u64(&self) -> u64 { + self.0 as u64 + } + + unsafe fn handle_from_u64(meta_handle: u64) -> Self { + Snapshot(meta_handle as *mut SnapshotMem) + } +} + +#[repr(C)] +#[repr(packed)] +struct SnapshotMem { + pub epoch: u64, + pub event_num: u64, + pub arvo: Noun, + pub cold: Cold, +} + +const PMA_CURRENT_SNAPSHOT_VERSION: u64 = 1; + struct Context { epoch: u64, event_num: u64, @@ -37,22 +89,66 @@ struct Context { } impl Context { - pub fn new(snap_path: PathBuf, trace_info: Option) -> Self { - // TODO: switch to Pma when ready - // let snap = &mut snapshot::pma::Pma::new(snap_path); + pub fn load(snap_path: PathBuf, trace_info: Option) -> Context { + let mut pma = PMA::open(snap_path).expect("serf: pma open failed"); + + let snapshot_version = pma.meta_get(BTMetaField::SnapshotVersion as usize); + + let snapshot = match snapshot_version { + 0 => None, + 1 => Some(unsafe { + Snapshot::handle_from_u64(pma.meta_get(BTMetaField::Snapshot as usize)) + }), + _ => panic!("Unsupported snapshot version"), + }; + + Context::new(trace_info, pma, snapshot) + } + + pub fn save(&mut self) { + let handle = unsafe { + let mut snapshot = Snapshot({ + let snapshot_mem_ptr: *mut SnapshotMem = self.nock_context.stack.struct_alloc(1); + + // Save into PMA (does not sync) + (*snapshot_mem_ptr).epoch = self.epoch; + (*snapshot_mem_ptr).event_num = self.event_num; + (*snapshot_mem_ptr).arvo = self.arvo; + (*snapshot_mem_ptr).cold = self.nock_context.cold; + snapshot_mem_ptr + }); + + let handle = snapshot.save_to_pma(&mut self.nock_context.stack, &mut self.pma); + + self.epoch = (*snapshot.0).epoch; + self.arvo = (*snapshot.0).arvo; + self.event_num = (*snapshot.0).event_num; + self.nock_context.cold = (*snapshot.0).cold; + + handle + }; + self.pma.meta_set( + BTMetaField::SnapshotVersion as usize, + PMA_CURRENT_SNAPSHOT_VERSION, + ); + self.pma.meta_set(BTMetaField::Snapshot as usize, handle); + } + + fn new(trace_info: Option, pma: PMA, snapshot: Option) -> Self { let mut stack = NockStack::new(1024 << 10 << 10, 0); let newt = Newt::new(); let cache = 
Hamt::::new(&mut stack); - let pma = PMA::open(snap_path).unwrap(); let (epoch, event_num, arvo, mut cold) = unsafe { - let snapshot = pma.load(); - ( - (*(snapshot.0)).epoch, - (*(snapshot.0)).event_num, - (*(snapshot.0)).arvo, - (*(snapshot.0)).cold, - ) + match snapshot { + Some(snapshot) => ( + (*(snapshot.0)).epoch, + (*(snapshot.0)).event_num, + (*(snapshot.0)).arvo, + (*(snapshot.0)).cold, + ), + None => (0, 0, D(0), Cold::new(&mut stack)), + } }; let mut hot = Hot::init(&mut stack); @@ -89,24 +185,7 @@ impl Context { // XX: assert event numbers are continuous self.arvo = new_arvo; self.event_num = new_event_num; - let snapshot = unsafe { - let snapshot_mem_ptr: *mut SnapshotMem = self.nock_context.stack.struct_alloc(1); - - // Save into PMA (does not sync) - (*snapshot_mem_ptr).epoch = self.epoch; - (*snapshot_mem_ptr).event_num = self.event_num; - (*snapshot_mem_ptr).arvo = self.arvo; - (*snapshot_mem_ptr).cold = self.nock_context.cold; - let mut snapshot = Snapshot(snapshot_mem_ptr); - self.pma.save(&mut self.nock_context.stack, &mut snapshot); - snapshot - }; - - // reset pointers in context to PMA - unsafe { - self.arvo = (*(snapshot.0)).arvo; - self.nock_context.cold = (*(snapshot.0)).cold; - } + self.save(); // Reset the nock stack, freeing all memory used to compute the event self.nock_context.stack.reset(0); @@ -253,7 +332,7 @@ pub fn serf() -> io::Result<()> { } } - let mut context = Context::new(snap_path, trace_info); + let mut context = Context::load(snap_path, trace_info); context.ripe(); // Can't use for loop because it borrows newt From 28d9032acd379c107048e977f54d0385b5873668 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Fri, 8 Dec 2023 23:16:17 -0600 Subject: [PATCH 066/128] serf: comments about where to put warm and hot states --- rust/ares/src/serf.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/rust/ares/src/serf.rs b/rust/ares/src/serf.rs index 288e893..20da304 100644 --- a/rust/ares/src/serf.rs +++ b/rust/ares/src/serf.rs @@ -191,9 +191,18 @@ impl Context { self.nock_context.stack.reset(0); // Since we reset the nockstack the stack-allocated hot state isn't valid anymore + // + // XX What should be done instead is, at serf initialization: push a frame and + // initialize the hot stack within it, then preserve() the hot stack to the outer frame. + // Then, save the stack and reset to the saved stack for each event, thus avoiding the need + // to recreate the hot state each event, since it does not change over the execution of the + // interpreter. self.nock_context.hot = Hot::init(&mut self.nock_context.stack); - // XX some things were invalidated when we reset the stack + // XX the above trick won't work for the warm state, since it changes whenever the cold + // state does. One possibility is to just save the warm and hot states in the snapshot + // anyway, but throw them away in load() since function pointers are invalidated by the + // restart. 
self.nock_context.warm = Warm::init( &mut self.nock_context.stack, &mut self.nock_context.cold, From ef5954b3b1934895616fc32de98e11ae6aa25e16 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Fri, 8 Dec 2023 23:16:50 -0600 Subject: [PATCH 067/128] serf: format --- rust/ares/src/serf.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/ares/src/serf.rs b/rust/ares/src/serf.rs index 20da304..6ed55e1 100644 --- a/rust/ares/src/serf.rs +++ b/rust/ares/src/serf.rs @@ -192,7 +192,7 @@ impl Context { // Since we reset the nockstack the stack-allocated hot state isn't valid anymore // - // XX What should be done instead is, at serf initialization: push a frame and + // XX What should be done instead is, at serf initialization: push a frame and // initialize the hot stack within it, then preserve() the hot stack to the outer frame. // Then, save the stack and reset to the saved stack for each event, thus avoiding the need // to recreate the hot state each event, since it does not change over the execution of the @@ -202,7 +202,7 @@ impl Context { // XX the above trick won't work for the warm state, since it changes whenever the cold // state does. One possibility is to just save the warm and hot states in the snapshot // anyway, but throw them away in load() since function pointers are invalidated by the - // restart. + // restart. self.nock_context.warm = Warm::init( &mut self.nock_context.stack, &mut self.nock_context.cold, From 48817bb694b549dc3e41e8e8ddc5c0cdc3dffc50 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Sat, 9 Dec 2023 11:18:42 -0600 Subject: [PATCH 068/128] hamt: remove unused struct StemTraversalEntry --- rust/ares/src/hamt.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index 7e9ef50..5621906 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -574,13 +574,6 @@ impl Preserve for Hamt { } } -#[derive(Copy, Clone)] -struct StemTraversalEntry { - bitmap_remaining: u32, - typemap_remaining: u32, - stem_ptr: *mut Stem, -} - impl Persist for Hamt { unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { if pma.contains(self.0, 1) { From 95e05b2dad732728544d6aef127f69add17e9cd0 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Sat, 9 Dec 2023 11:32:48 -0600 Subject: [PATCH 069/128] pma: reorganization and doc comments --- rust/ares/src/persist.rs | 45 +++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index f4f30b3..0b10825 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -75,18 +75,31 @@ impl PMA { } } +/** + * This trait defines operations for copying a structure into the PMA. + * + * This is done in two phases. The [space_needed] phase counts how much space the structure needs in + * the PMA, not counting referenced structures already in the PMA. Then a buffer is allocated in + * the PMA of at least the computed size, and the [copy_to_buffer] phase copies the structure into + * this buffer. + * + * The phases are separated so that instances of the trait may compose, while still allocating a + * single buffer. Thus, in the instance for a HAMT, the [space_needed] method for the HAMT will + * call the [space_needed] method on each noun key, and on each value, as well as computing the + * size of the HAMT's own structures. 
Similarly, the [copy_to_buffer] method for the HAMT will call + * the [copy_to_buffer] method for the keys and values as it copies its own structures in. + */ pub trait Persist { /// Count how much space is needed, in bytes. May set marks so long as marks are cleaned up by /// [copy_into_buffer] unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize; /// Copy into the provided buffer, which may be assumed to be at least as large as the size - /// returned by [space_needed] on the same structure. Return a u64 handle that could be saved - /// in metadata + /// returned by [space_needed] on the same structure. unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8); /// Persist an object into the PMA using [space_needed] and [copy_to_buffer], returning - /// a [u64] (probably a pointer or tagged pointer) that can be saved into + /// a [u64] (probably a pointer or tagged pointer) that can be saved into metadata. unsafe fn save_to_pma(&mut self, stack: &mut NockStack, pma: &PMA) -> u64 { unsafe { let space = self.space_needed(stack, pma); @@ -176,22 +189,16 @@ impl Persist for Noun { let noun = *(stack.top::()); stack.pop::(); - if let Ok(allocated) = noun.as_allocated() { - // not counting direct atoms, they go in - match allocated.as_either() { - Left(indirect) => { - let count = indirect.raw_size(); - if !pma.contains(indirect.to_raw_pointer(), count) { - if !mark(allocated) { - space += count * size_of::(); - } - } - } - Right(cell) => { - if !pma.contains(cell.to_raw_pointer(), 1) { - if !mark(allocated) { - space += size_of::(); - } + match noun.as_either_atom_cell() { + Left(mut atom) => { + space += atom.space_needed(stack, pma); + } + Right(cell) => { + if !pma.contains(cell.to_raw_pointer(), 1) { + if !mark(cell.as_allocated()) { + space += size_of::(); + (*stack.push::()) = cell.tail(); + (*stack.push::()) = cell.head(); } } } From baebb515e2dad209a5d771e9d53363fdc169f659 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Sat, 9 Dec 2023 11:46:47 -0600 Subject: [PATCH 070/128] hamt: doc comments and cleanup --- rust/ares/src/hamt.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index 5621906..4028e82 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -162,6 +162,17 @@ impl MutHamt { } } +/** + * This is the core memory structure of an immutable HAMT. + * + * The root Stem lives in its own memory allocation, addressed by the pointer wrapped by [Hamt]. + * All other Stems and Leaves live in memory blocks pointed to by [buffer]. The memory pointed to + * by this field may be zero to 32 entries, depending on the *number of bits set* in bitmap. + * + * Addressing a chunk of the key's hash is done by counting the number of set bits in the bitmap + * before the chunk'th bit. The typemap is a parallel bitmap in which bits are set if the + * corresponding entry is a stem, and cleared if it is a leaf. 
+ */ #[repr(packed)] #[repr(C)] struct Stem { @@ -643,8 +654,6 @@ impl Persist for Hamt { } } - // XX this is subtly wrong, we need to track destination pointers somehow and not just write - // into the traversal stack unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { if pma.contains(self.0, 1) { return; @@ -704,7 +713,6 @@ impl Persist for Hamt { traversal[depth + 1] = *stem_ptr; depth += 1; - continue; } else { // Leaf case let leaf_ptr: *mut Leaf = &mut (*next_entry_ptr).leaf; @@ -727,8 +735,6 @@ impl Persist for Hamt { leaf_idx += 1; } - - continue; } } } From e3d4f5797993cf3bfe7c56bb64b2a3d2875d2dd6 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 11 Dec 2023 08:59:15 -0600 Subject: [PATCH 071/128] pma: fully integrate top-stack GC --- rust/ares/src/serf.rs | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/rust/ares/src/serf.rs b/rust/ares/src/serf.rs index 6fff0be..5ca2939 100644 --- a/rust/ares/src/serf.rs +++ b/rust/ares/src/serf.rs @@ -23,6 +23,7 @@ use std::result::Result; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::Instant; +use crate::mem::Preserve; crate::gdb!(); @@ -79,17 +80,16 @@ struct SnapshotMem { const PMA_CURRENT_SNAPSHOT_VERSION: u64 = 1; -struct Context<'a> { +struct Context { epoch: u64, event_num: u64, pma: PMA, arvo: Noun, mug: u32, - constant_hot_state: &'a [HotEntry], nock_context: interpreter::Context, } -impl<'a> Context<'a> { +impl Context { pub fn load(snap_path: PathBuf, trace_info: Option, constant_hot_state: &[HotEntry], ) -> Context { @@ -138,7 +138,7 @@ impl<'a> Context<'a> { } fn new(trace_info: Option, pma: PMA, snapshot: Option, - constant_hot_state: &'a [HotEntry], + constant_hot_state: &[HotEntry], ) -> Self { let mut stack = NockStack::new(1024 << 10 << 10, 0); let newt = Newt::new(); @@ -178,7 +178,6 @@ impl<'a> Context<'a> { arvo, mug, nock_context, - constant_hot_state, } } @@ -192,27 +191,14 @@ impl<'a> Context<'a> { self.event_num = new_event_num; self.save(); - // Reset the nock stack, freeing all memory used to compute the event - self.nock_context.stack.reset(0); + unsafe { + self.nock_context.hot.preserve(&mut self.nock_context.stack); + self.nock_context.warm.preserve(&mut self.nock_context.stack); - // Since we reset the nockstack the stack-allocated hot state isn't valid anymore - // - // XX What should be done instead is, at serf initialization: push a frame and - // initialize the hot stack within it, then preserve() the hot stack to the outer frame. - // Then, save the stack and reset to the saved stack for each event, thus avoiding the need - // to recreate the hot state each event, since it does not change over the execution of the - // interpreter. - self.nock_context.hot = Hot::init(&mut self.nock_context.stack, self.constant_hot_state); + // Reset the nock stack, freeing all memory used to compute the event + self.nock_context.stack.flip_top_frame(0); + } - // XX the above trick won't work for the warm state, since it changes whenever the cold - // state does. One possibility is to just save the warm and hot states in the snapshot - // anyway, but throw them away in load() since function pointers are invalidated by the - // restart. 
- self.nock_context.warm = Warm::init( - &mut self.nock_context.stack, - &mut self.nock_context.cold, - &mut self.nock_context.hot, - ); self.nock_context.cache = Hamt::new(&mut self.nock_context.stack); self.nock_context.scry_stack = D(0); From f7e6cc2770c44e1bb14661dc6e78d07bed51505e Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 11 Dec 2023 08:59:38 -0600 Subject: [PATCH 072/128] serf: format --- rust/ares/src/serf.rs | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/rust/ares/src/serf.rs b/rust/ares/src/serf.rs index 5ca2939..48c075b 100644 --- a/rust/ares/src/serf.rs +++ b/rust/ares/src/serf.rs @@ -7,6 +7,7 @@ use crate::jets::list::util::{lent, zing}; use crate::jets::nock::util::mook; use crate::jets::warm::Warm; use crate::mem::NockStack; +use crate::mem::Preserve; use crate::mug::*; use crate::newt::Newt; use crate::noun::{Atom, Cell, DirectAtom, Noun, Slots, D, T}; @@ -23,7 +24,6 @@ use std::result::Result; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::Instant; -use crate::mem::Preserve; crate::gdb!(); @@ -90,9 +90,11 @@ struct Context { } impl Context { - pub fn load(snap_path: PathBuf, trace_info: Option, - constant_hot_state: &[HotEntry], - ) -> Context { + pub fn load( + snap_path: PathBuf, + trace_info: Option, + constant_hot_state: &[HotEntry], + ) -> Context { let mut pma = PMA::open(snap_path).expect("serf: pma open failed"); let snapshot_version = pma.meta_get(BTMetaField::SnapshotVersion as usize); @@ -137,9 +139,12 @@ impl Context { self.pma.meta_set(BTMetaField::Snapshot as usize, handle); } - fn new(trace_info: Option, pma: PMA, snapshot: Option, - constant_hot_state: &[HotEntry], - ) -> Self { + fn new( + trace_info: Option, + pma: PMA, + snapshot: Option, + constant_hot_state: &[HotEntry], + ) -> Self { let mut stack = NockStack::new(1024 << 10 << 10, 0); let newt = Newt::new(); let cache = Hamt::::new(&mut stack); @@ -193,7 +198,9 @@ impl Context { unsafe { self.nock_context.hot.preserve(&mut self.nock_context.stack); - self.nock_context.warm.preserve(&mut self.nock_context.stack); + self.nock_context + .warm + .preserve(&mut self.nock_context.stack); // Reset the nock stack, freeing all memory used to compute the event self.nock_context.stack.flip_top_frame(0); From fc3be96f0007550148c1ea307fce9d00f72f1d87 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 11 Dec 2023 09:11:16 -0600 Subject: [PATCH 073/128] serf: no need to GC stack in event-update as it happens at end of event loop --- rust/ares/src/serf.rs | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/rust/ares/src/serf.rs b/rust/ares/src/serf.rs index 48c075b..30effee 100644 --- a/rust/ares/src/serf.rs +++ b/rust/ares/src/serf.rs @@ -196,16 +196,6 @@ impl Context { self.event_num = new_event_num; self.save(); - unsafe { - self.nock_context.hot.preserve(&mut self.nock_context.stack); - self.nock_context - .warm - .preserve(&mut self.nock_context.stack); - - // Reset the nock stack, freeing all memory used to compute the event - self.nock_context.stack.flip_top_frame(0); - } - self.nock_context.cache = Hamt::new(&mut self.nock_context.stack); self.nock_context.scry_stack = D(0); @@ -391,9 +381,7 @@ pub fn serf(constant_hot_state: &[HotEntry]) -> io::Result<()> { clear_interrupt(); - // Persist data that should survive between events - // XX: Such data should go in the PMA once that's available, except - // the warm and hot state which should survive between events but not interpreter runs 
+ // unsafe { let stack = &mut context.nock_context.stack; stack.preserve(&mut context.arvo); From 1ee7ce5c32a6e05932e958ee58f35902f0edefe7 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 11 Dec 2023 14:16:17 -0600 Subject: [PATCH 074/128] 2stackz: clarify doc comment on NockStack::reset Co-authored-by: Jonathan Paprocki <51337059+drbeefsupreme@users.noreply.github.com> --- rust/ares/src/mem.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/ares/src/mem.rs b/rust/ares/src/mem.rs index 9a6352a..873dedd 100644 --- a/rust/ares/src/mem.rs +++ b/rust/ares/src/mem.rs @@ -123,7 +123,7 @@ impl NockStack { } } - /** Resets the NockStack. */ + /** Resets the NockStack. The top frame is west as in the initial creation of the NockStack. */ pub fn reset(&mut self, top_slots: usize) { self.frame_pointer = unsafe { self.start.add(RESERVED + top_slots) } as *mut u64; self.stack_pointer = self.frame_pointer; From 7e110aa6c5b416f3edf73f75d65d2281ad83d9e6 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 11 Dec 2023 14:18:34 -0600 Subject: [PATCH 075/128] 2stackz: assert is_west after reset Co-authored-by: Jonathan Paprocki <51337059+drbeefsupreme@users.noreply.github.com> --- rust/ares/src/mem.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/ares/src/mem.rs b/rust/ares/src/mem.rs index 873dedd..2387ad5 100644 --- a/rust/ares/src/mem.rs +++ b/rust/ares/src/mem.rs @@ -133,6 +133,7 @@ impl NockStack { *self.frame_pointer.sub(FRAME + 1) = ptr::null::() as u64; // "frame pointer" from "previous" frame *self.frame_pointer.sub(STACK + 1) = ptr::null::() as u64; // "stack pointer" from "previous" frame *self.frame_pointer.sub(ALLOC + 1) = self.start as u64; // "alloc pointer" from "previous" frame + assert!(self.is_west()); }; } From fd1af932c632bf0a7f39bcc57c8d4a273ef1f0ba Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 11 Dec 2023 14:46:05 -0600 Subject: [PATCH 076/128] 2stackz: fix wrong-end previous allocation pointer in flip_top_frame() --- rust/ares/src/mem.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/ares/src/mem.rs b/rust/ares/src/mem.rs index 2387ad5..2242943 100644 --- a/rust/ares/src/mem.rs +++ b/rust/ares/src/mem.rs @@ -103,7 +103,7 @@ impl NockStack { let new_frame_pointer = self.start.add(self.size).sub(RESERVED + top_slots) as *mut u64; *new_frame_pointer.add(FRAME) = ptr::null::() as u64; *new_frame_pointer.add(STACK) = ptr::null::() as u64; - *new_frame_pointer.add(ALLOC) = self.start as u64; + *new_frame_pointer.add(ALLOC) = self.start.add(self.size) as u64; self.frame_pointer = new_frame_pointer; self.stack_pointer = new_frame_pointer; self.alloc_pointer = new_alloc_pointer; From ede9918c03f562ef7cf812e7898d9cd673794ea7 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 11 Dec 2023 18:34:06 -0600 Subject: [PATCH 077/128] pma: some quick fixes --- rust/ares_pma/c-src/btree.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 1bdb642..7b823a1 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2477,9 +2477,6 @@ bt_state_open(BT_state *state, const char *path, ULONG flags, mode_t mode) if (!dpath) return ENOMEM; sprintf(dpath, "%s" DATANAME, path); - if (mkdir(path, 0774) == -1) - return errno; - if ((state->data_fd = open(dpath, oflags, mode)) == -1) return errno; @@ -2592,7 +2589,7 @@ uint64_t bt_meta_get(BT_state *state, size_t idx) { BT_meta *meta = state->meta_pages[state->which]; - 
assert((uintptr_t)&meta->roots[idx] - (uintptr_t)&meta <= sizeof *meta); + assert((uintptr_t)&(meta->roots[idx]) - (uintptr_t)meta <= sizeof *meta); return meta->roots[idx]; } @@ -2600,7 +2597,7 @@ void bt_meta_set(BT_state *state, size_t idx, uint64_t val) { BT_meta *meta = state->meta_pages[state->which]; - assert((uintptr_t)&meta->roots[idx] - (uintptr_t)&meta <= sizeof *meta); + assert((uintptr_t)&(meta->roots[idx]) - (uintptr_t)meta <= sizeof *meta); meta->roots[idx] = val; } From 548dc3f3c42c3930730cdab8fc2fcd6c3a52c58d Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 11 Dec 2023 19:06:45 -0600 Subject: [PATCH 078/128] hamt: when preserving, actually write outermost stem pointer back to mutable self --- rust/ares/src/hamt.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index 4028e82..d166d17 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -500,6 +500,7 @@ impl Preserve for Hamt { if stack.is_in_frame(self.0) { let dest_stem = stack.struct_alloc_in_previous_frame(1); copy_nonoverlapping(self.0, dest_stem, 1); + self.0 = dest_stem; if stack.is_in_frame((*dest_stem).buffer) { let dest_buffer = stack.struct_alloc_in_previous_frame((*dest_stem).size()); copy_nonoverlapping((*dest_stem).buffer, dest_buffer, (*dest_stem).size()); From bbaa758c95b3d4d69d644abc72fd1113434bcb9e Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 11 Dec 2023 23:15:18 -0600 Subject: [PATCH 079/128] hamt: fix persist instance --- rust/ares/src/hamt.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index d166d17..6a8055b 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -611,6 +611,7 @@ impl Persist for Hamt { break bytes; } depth -= 1; + continue; } let next_chunk = traversal[depth].bitmap.trailing_zeros(); From 94d92e4a3e1b0374d6176364daf20d2d2fee11d9 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 11 Dec 2023 23:15:53 -0600 Subject: [PATCH 080/128] pma: add a missing ftruncate and mmap call, and lots of notes --- rust/ares_pma/c-src/btree.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 7b823a1..fe7ebb2 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -112,7 +112,7 @@ off2addr(vaof_t off) #define BT_PAGEWORD 32ULL #define BT_NUMMETAS 2 /* 2 metapages */ #define BT_ADDRSIZE (BT_PAGESIZE << BT_PAGEWORD) -#define PMA_GROW_SIZE (BT_PAGESIZE * 1024) +#define PMA_GROW_SIZE (BT_PAGESIZE * 1024 * 64) #define BT_NOPAGE 0 @@ -2218,6 +2218,9 @@ _bt_state_load(BT_state *state) DPUTS("creating new db"); state->file_size = PMA_GROW_SIZE; new = 1; + if(ftruncate(state->data_fd, PMA_GROW_SIZE)) { + return errno; + } } state->map = mmap(BT_MAPADDR, @@ -2536,6 +2539,7 @@ bt_malloc(BT_state *state, size_t pages) (*n)->va = (BT_page *)(*n)->va + pages; break; } + // XX return early if nothing suitable found in freelist } pgno_t pgno = _bt_falloc(state, pages); @@ -2545,10 +2549,22 @@ bt_malloc(BT_state *state, size_t pages) addr2off(ret) + pages, pgno); + DPRINTF("map %p to offset %lld bytes (%lld pages)\n", ret, P2BYTES(pgno), pgno); + if (ret != + mmap(ret, + P2BYTES(pages), + PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_SHARED, + state->data_fd, + P2BYTES(pgno))) { + DPRINTF("mmap: failed to map at addr %p", ret); + abort(); + } bp(ret != 0); return ret; } +// XX need to mmap fixed/anon/no_reserve and prot_none void bt_free(BT_state *state, void *lo, 
void *hi) { @@ -2558,6 +2574,7 @@ bt_free(BT_state *state, void *lo, void *hi) _mlist_insert(state, lo, hi); } +// XX need to mprotect PROT_READ all ranges synced including root/meta int bt_sync(BT_state *state) { From 9271deb3d62387468b5a8167996783ca96f6ccb7 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Mon, 11 Dec 2023 23:16:32 -0600 Subject: [PATCH 081/128] pma: set -DDEBUG for default profile --- rust/ares_pma/build.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/rust/ares_pma/build.rs b/rust/ares_pma/build.rs index cb9b8a1..f79c4ed 100644 --- a/rust/ares_pma/build.rs +++ b/rust/ares_pma/build.rs @@ -7,10 +7,11 @@ use bindgen::CargoCallbacks; fn main() { let profile = env::var("PROFILE").unwrap(); - let opt_level = match profile.as_ref() { - "debug" => 0, - "release" => 3, - _ => panic!("Unknown profile: {}", profile), + let opt_level = env::var("OPT_LEVEL").unwrap(); + let define_debug = if profile == "debug" { + "-DDEBUG" + } else { + "-UDEBUG" }; // This is the directory where the `c` library is located. @@ -41,6 +42,8 @@ fn main() { .to_str() .expect("Path is not a valid string"), ) + .flag(format!("-O{}", opt_level).as_ref()) + .flag(define_debug) .flag("-g3") .flag("-Wall") .flag("-Wextra") From e96d7ecb4c93f5ef1bfabac0b93e51178bac949e Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Tue, 12 Dec 2023 18:23:37 -0500 Subject: [PATCH 082/128] pma: mmap and mprotect revisions wip --- rust/ares_pma/c-src/btree.c | 143 +++++++++++++++++++++++++++++++----- 1 file changed, 125 insertions(+), 18 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index fe7ebb2..0f079db 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -382,6 +382,7 @@ _bt_nalloc(BT_state *state) params to the function and make actual return value an error code. This is to avoid forcing some callers to immediately use _fo_get */ BT_nlistnode **n = &state->nlist; + BT_page *ret = 0; for (; *n; n = &(*n)->next) { /* ;;: this assert is temporary. When partition striping is @@ -396,7 +397,7 @@ _bt_nalloc(BT_state *state) BT_page *ret; ret = (*n)->va; *n = (*n)->next; - return ret; + break; } /* larger than necessary: shrink the node */ if ((*n)->sz > 1) { @@ -404,9 +405,12 @@ _bt_nalloc(BT_state *state) ret = (*n)->va; (*n)->sz -= 1; (*n)->va = (*n)->va + 1; - return ret; + break; } } + + /* make node writeable */ + mprotect(ret, sizeof(BT_page), PROT_READ | PROT_WRITE); } static int @@ -1494,6 +1498,7 @@ _flist_grow(BT_state *state, BT_flistnode *space) static int _flist_new(BT_state *state) +#define FLIST_PG_START (((BT_PAGESIZE * BT_NUMMETAS) + BLK_BASE_LEN0) / BT_PAGESIZE) { BT_meta *meta = state->meta_pages[state->which]; BT_page *root = _node_get(state, meta->root); @@ -1507,8 +1512,7 @@ _flist_new(BT_state *state) head->next = 0; head->sz = len; - head->pg = PMA_GROW_SIZE; /* ;;: should we invoke logic to expand the backing file - here? probably. 
implement it */ /* */ + head->pg = FLIST_PG_START; state->flist = head; return BT_SUCC; @@ -2022,15 +2026,30 @@ _bt_state_restore_maps2(BT_state *state, BT_page *node, size_t bytelen = hiaddr - loaddr; off_t offset = P2BYTES(pg); - if (loaddr != - mmap(loaddr, - bytelen, - PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_SHARED, - state->data_fd, - offset)) { - DPRINTF("mmap: failed to map at addr %p", loaddr); - abort(); + if (pg != 0) { + /* not freespace, map readonly data on disk */ + if (loaddr != + mmap(loaddr, + bytelen, + PROT_READ, + MAP_FIXED | MAP_SHARED, + state->data_fd, + offset)) { + DPRINTF("mmap: failed to map at addr %p", loaddr); + abort(); + } + } + else { + /* freespace, map no access */ + if (loaddr != + mmap(loaddr, + bytelen, + PROT_NONE, + MAP_FIXED | MAP_ANONYMOUS | MAP_NORESERVE, + -1, 0)) { + DPRINTF("mmap: failed to map at addr %p", loaddr); + abort(); + } } } return; @@ -2224,8 +2243,10 @@ _bt_state_load(BT_state *state) } state->map = mmap(BT_MAPADDR, - BT_ADDRSIZE, - PROT_READ | PROT_WRITE, + BT_ADDRSIZE, /* should actually just be first 2M + stripe. and then from there + should map like it's freespace. */ + PROT_READ | PROT_WRITE, /* ;;: PROT_READ */ MAP_FIXED | MAP_SHARED, state->data_fd, 0); @@ -2392,6 +2413,13 @@ _bt_sync_meta(BT_state *state) /* zero the new metapage's checksum */ newwhich = state->which ? 0 : 1; newmeta = state->meta_pages[newwhich]; + + /* make new metapage writeable */ + if (mprotect(newmeta, sizeof(BT_page), PROT_READ | PROT_WRITE) != 0) { + DPRINTF("mprotect of new metapage failed with %s", strerror(errno)); + abort(); + } + newmeta->chk = 0; /* copy over metapage to new metapage excluding the checksum */ @@ -2407,9 +2435,15 @@ _bt_sync_meta(BT_state *state) newmeta->root = newrootpg; - /* finally, switch the metapage we're referring to */ + /* switch the metapage we're referring to */ state->which = newwhich; + /* finally, make old metapage read-only */ + if (mprotect(meta, sizeof(BT_page), PROT_READ) != 0) { + DPRINTF("mprotect of old metapage failed with %s", strerror(errno)); + abort(); + } + return BT_SUCC; } @@ -2424,7 +2458,7 @@ _bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth) /* leaf */ if (depth == maxdepth) { _bt_sync_leaf(state, node); - return BT_SUCC; + goto e; } /* do dfs */ @@ -2442,10 +2476,17 @@ _bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth) if (msync(child, sizeof(BT_page), MS_SYNC)) return errno; - /* clean the child */ + /* unset child dirty bit */ _bt_cleanchild(node, i); } + e: + /* all modifications done in node, mark it read-only */ + if (mprotect(node, sizeof(BT_page), PROT_READ) != 0) { + DPRINTF("mprotect of node failed with %s", strerror(errno)); + abort(); + } + return BT_SUCC; } @@ -2572,6 +2613,20 @@ bt_free(BT_state *state, void *lo, void *hi) vaof_t hioff = addr2off(hi); _bt_insert(state, looff, hioff, 0); _mlist_insert(state, lo, hi); + + /* ;;: is this correct? Shouldn't this actually happen when we merge the + pending_mlist on sync? 
*/ + size_t bytelen = (BYTE *)hi - (BYTE *)lo; + + if (lo != + mmap(lo, + bytelen, + PROT_NONE, + MAP_ANONYMOUS | MAP_FIXED | MAP_NORESERVE, + -1, 0)) { + DPRINTF("mmap: failed to map at addr %p", lo); + abort(); + } } // XX need to mprotect PROT_READ all ranges synced including root/meta @@ -2595,6 +2650,12 @@ bt_sync(BT_state *state) if (msync(root, sizeof(BT_page), MS_SYNC)) return errno; + /* make root read-only */ + if (mprotect(root, sizeof(BT_page), PROT_READ) != 0) { + DPRINTF("mprotect of root failed with %s", strerror(errno)); + abort(); + } + /* then sync the metapage */ if (rc = _bt_sync_meta(state)) return rc; @@ -2920,3 +2981,49 @@ _bt_printnode(BT_page *node) printf("[%5zu] %10x %10x\n", i, node->datk[i].va, node->datk[i].fo); } } + +/* + _bt_state_restore_maps2 + if pg 0: + mmap MAP_ANONYMOUS | MAP_FIXED | MAP_NO_RESERVE + PROT_NONE + + if pg !0: + mmap MAP_SHARED | MAP_FIXED + PROT_READ + + + ------------------ + + the three routines that make modification to the data maps are: + + bt_malloc: + + MAP_SHARED | MAP_FIXED + PROT_READ | PROT_WRITE + + _bt_data_cow: + + MAP_SHARED | MAP_FIXED + PROT_READ | PROT_WRITE + + bt_sync: + + (mprotect) + PROT_READ + + bt_free: + + MAP_ANONYMOUS | MAP_FIXED | MAP_NO_RESERVE + PROT_NONE + + ----------------- + + 8 linear mappings (striping) + + when we _bt_nalloc, mprotect(PROT_READ | PROT_WRITE) + + when we free a node: mprotect(PROT_NONE) + + additionally, when we sync, all allocated nodes: mprotect(PROT_READ) +*/ From 13b5f6bee6e13045ac907261eeb992dea1494368 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Tue, 12 Dec 2023 20:34:12 -0500 Subject: [PATCH 083/128] pma: set protection of leaf data when syncing --- rust/ares_pma/c-src/btree.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 0f079db..b1878ec 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2370,6 +2370,12 @@ _bt_sync_leaf(BT_state *state, BT_page *node) if (msync(addr, bytelen, MS_SYNC)) return errno; + /* mprotect the data */ + if (mprotect(addr, bytelen, PROT_READ) != 0) { + DPRINTF("mprotect of leaf data failed with %s", strerror(errno)); + abort(); + } + /* and clean the dirty bit */ _bt_cleanchild(node, i); } From 7dfc32681affa6f0696bec78bd3b9e61ddc19562 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Tue, 12 Dec 2023 22:22:48 -0500 Subject: [PATCH 084/128] pma: more mmap changes. 
WIP --- rust/ares_pma/c-src/btree.c | 45 ++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index b1878ec..9641082 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -88,7 +88,7 @@ STATIC_ASSERT(0, "debugger break instruction unimplemented"); #define SUCC(x) ((x) == BT_SUCC) -#define BT_MAPADDR ((void *) S(0x1000,0000,0000)) +#define BT_MAPADDR ((BYTE *) S(0x1000,0000,0000)) static inline vaof_t addr2off(void *p) @@ -111,6 +111,7 @@ off2addr(vaof_t off) #define BT_PAGEWORD 32ULL #define BT_NUMMETAS 2 /* 2 metapages */ +#define BT_META_SECTION_WIDTH (BT_NUMMETAS * BT_PAGESIZE) #define BT_ADDRSIZE (BT_PAGESIZE << BT_PAGEWORD) #define PMA_GROW_SIZE (BT_PAGESIZE * 1024 * 64) @@ -235,7 +236,7 @@ static_assert(BT_DAT_MAXBYTES % sizeof(BT_dat) == 0); a meta page is like any other page, but the data section is used to store additional information */ -#define BLK_BASE_LEN0 (MBYTES(2) - (BT_PAGESIZE * BT_NUMMETAS)) +#define BLK_BASE_LEN0 (MBYTES(2) - BT_META_SECTION_WIDTH) #define BLK_BASE_LEN1 (BLK_BASE_LEN0 * 4) #define BLK_BASE_LEN2 (BLK_BASE_LEN1 * 4) #define BLK_BASE_LEN3 (BLK_BASE_LEN2 * 4) @@ -409,8 +410,10 @@ _bt_nalloc(BT_state *state) } } - /* make node writeable */ + /* make node writable */ mprotect(ret, sizeof(BT_page), PROT_READ | PROT_WRITE); + + return ret; } static int @@ -1498,7 +1501,7 @@ _flist_grow(BT_state *state, BT_flistnode *space) static int _flist_new(BT_state *state) -#define FLIST_PG_START (((BT_PAGESIZE * BT_NUMMETAS) + BLK_BASE_LEN0) / BT_PAGESIZE) +#define FLIST_PG_START ((BT_META_SECTION_WIDTH + BLK_BASE_LEN0) / BT_PAGESIZE) { BT_meta *meta = state->meta_pages[state->which]; BT_page *root = _node_get(state, meta->root); @@ -2189,6 +2192,13 @@ _bt_state_meta_new(BT_state *state) TRACE(); + /* open the metapage region for writing */ + if (mprotect(BT_MAPADDR, BT_META_SECTION_WIDTH, + PROT_READ | PROT_WRITE) != 0) { + DPRINTF("mprotect of metapage section failed with %s", strerror(errno)); + abort(); + } + /* initialize the block base array */ meta.blk_base[0] = BT_PAGESIZE * BT_NUMMETAS; @@ -2219,6 +2229,17 @@ _bt_state_meta_new(BT_state *state) first?? */ memcpy(METADATA(p2), &meta, sizeof meta); + /* only the active metapage should be writable (first page) */ + if (mprotect(BT_MAPADDR, BT_META_SECTION_WIDTH, PROT_READ) != 0) { + DPRINTF("mprotect of metapage section failed with %s", strerror(errno)); + abort(); + } + if (mprotect(BT_MAPADDR, BT_PAGESIZE, + PROT_READ | PROT_WRITE) != 0) { + DPRINTF("mprotect of current metapage failed with %s", strerror(errno)); + abort(); + } + return BT_SUCC; } @@ -2242,11 +2263,10 @@ _bt_state_load(BT_state *state) } } + /* map first node stripe (along with metapages) as read only */ state->map = mmap(BT_MAPADDR, - BT_ADDRSIZE, /* should actually just be first 2M - stripe. and then from there - should map like it's freespace. */ - PROT_READ | PROT_WRITE, /* ;;: PROT_READ */ + BT_META_SECTION_WIDTH + BLK_BASE_LEN0, + PROT_READ, MAP_FIXED | MAP_SHARED, state->data_fd, 0); @@ -2420,7 +2440,7 @@ _bt_sync_meta(BT_state *state) newwhich = state->which ? 
0 : 1; newmeta = state->meta_pages[newwhich]; - /* make new metapage writeable */ + /* make new metapage writable */ if (mprotect(newmeta, sizeof(BT_page), PROT_READ | PROT_WRITE) != 0) { DPRINTF("mprotect of new metapage failed with %s", strerror(errno)); abort(); @@ -2527,6 +2547,9 @@ bt_state_open(BT_state *state, const char *path, ULONG flags, mode_t mode) if (!dpath) return ENOMEM; sprintf(dpath, "%s" DATANAME, path); + if (mkdir(path, 0774) == -1) + return errno; + if ((state->data_fd = open(dpath, oflags, mode)) == -1) return errno; @@ -2596,7 +2619,7 @@ bt_malloc(BT_state *state, size_t pages) addr2off(ret) + pages, pgno); - DPRINTF("map %p to offset %lld bytes (%lld pages)\n", ret, P2BYTES(pgno), pgno); + DPRINTF("map %p to offset 0x%zx bytes (0x%x pages)\n", ret, P2BYTES(pgno), pgno); if (ret != mmap(ret, P2BYTES(pages), @@ -2929,7 +2952,7 @@ int bt_inbounds(BT_state *state, void *p) /* 1: if in the bounds of the PMA, 0 otherwise */ { - return p >= BT_MAPADDR + return p >= (void *)BT_MAPADDR && p < (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE); } From 88fdfe0755f470a2f287d4d39fb501dbf1696ff3 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Wed, 13 Dec 2023 07:17:05 -0500 Subject: [PATCH 085/128] pma: misc bug fixes currently debugging an "invalid argument" error in the mmap call in bt_free --- rust/ares_pma/c-src/btree.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 9641082..e4d16ba 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -395,14 +395,12 @@ _bt_nalloc(BT_state *state) assert(width < MBYTES(2)); /* perfect fit */ if ((*n)->sz == 1) { - BT_page *ret; ret = (*n)->va; *n = (*n)->next; break; } /* larger than necessary: shrink the node */ if ((*n)->sz > 1) { - BT_page *ret; ret = (*n)->va; (*n)->sz -= 1; (*n)->va = (*n)->va + 1; @@ -419,8 +417,8 @@ _bt_nalloc(BT_state *state) static int _node_cow(BT_state *state, BT_page *node, pgno_t *pgno) { - BT_page *ret = _bt_nalloc(state); - memcpy(ret->datk, node->datk, sizeof node->datk[0] * BT_DAT_MAXENTRIES); + BT_page *ret = _bt_nalloc(state); /* ;;: todo: assert node has no dirty entries */ + memcpy(ret->datk, node->datk, sizeof node->datk[0] * BT_DAT_MAXKEYS); *pgno = _fo_get(state, ret); return BT_SUCC; } @@ -2049,7 +2047,7 @@ _bt_state_restore_maps2(BT_state *state, BT_page *node, bytelen, PROT_NONE, MAP_FIXED | MAP_ANONYMOUS | MAP_NORESERVE, - -1, 0)) { + 0, 0)) { DPRINTF("mmap: failed to map at addr %p", loaddr); abort(); } @@ -2652,8 +2650,8 @@ bt_free(BT_state *state, void *lo, void *hi) bytelen, PROT_NONE, MAP_ANONYMOUS | MAP_FIXED | MAP_NORESERVE, - -1, 0)) { - DPRINTF("mmap: failed to map at addr %p", lo); + 0, 0)) { + DPRINTF("mmap: failed to map at addr %p :: %s", lo, strerror(errno)); abort(); } } From e6300165844b0abd980c268f61e8a9de16666966 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Wed, 13 Dec 2023 07:45:44 -0500 Subject: [PATCH 086/128] pma: mmap changes appear to be working --- rust/ares_pma/c-src/btree.c | 43 +++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index e4d16ba..2caa394 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -117,6 +117,13 @@ off2addr(vaof_t off) #define BT_NOPAGE 0 +#define BT_PROT_CLEAN (PROT_READ) +#define BT_FLAG_CLEAN (MAP_FIXED | MAP_SHARED) +#define BT_PROT_FREE (PROT_NONE) +#define BT_FLAG_FREE (MAP_ANONYMOUS | 
MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE) +#define BT_PROT_DIRTY (PROT_READ | PROT_WRITE) +#define BT_FLAG_DIRTY (MAP_FIXED | MAP_SHARED) + /* FO2BY: file offset to byte get byte INDEX into pma map from file offset @@ -2032,11 +2039,11 @@ _bt_state_restore_maps2(BT_state *state, BT_page *node, if (loaddr != mmap(loaddr, bytelen, - PROT_READ, - MAP_FIXED | MAP_SHARED, + BT_PROT_CLEAN, + BT_FLAG_CLEAN, state->data_fd, offset)) { - DPRINTF("mmap: failed to map at addr %p", loaddr); + DPRINTF("mmap: failed to map at addr %p, errno: %s", loaddr, strerror(errno)); abort(); } } @@ -2045,10 +2052,10 @@ _bt_state_restore_maps2(BT_state *state, BT_page *node, if (loaddr != mmap(loaddr, bytelen, - PROT_NONE, - MAP_FIXED | MAP_ANONYMOUS | MAP_NORESERVE, + BT_PROT_FREE, + BT_FLAG_FREE, 0, 0)) { - DPRINTF("mmap: failed to map at addr %p", loaddr); + DPRINTF("mmap: failed to map at addr %p, errno: %s", loaddr, strerror(errno)); abort(); } } @@ -2264,13 +2271,13 @@ _bt_state_load(BT_state *state) /* map first node stripe (along with metapages) as read only */ state->map = mmap(BT_MAPADDR, BT_META_SECTION_WIDTH + BLK_BASE_LEN0, - PROT_READ, - MAP_FIXED | MAP_SHARED, + BT_PROT_CLEAN, + BT_FLAG_CLEAN, state->data_fd, 0); if (state->map != BT_MAPADDR) { - DPRINTF("mmap: failed to map at addr %p", BT_MAPADDR); + DPRINTF("mmap: failed to map at addr %p, errno: %s", BT_MAPADDR, strerror(errno)); abort(); } @@ -2621,11 +2628,11 @@ bt_malloc(BT_state *state, size_t pages) if (ret != mmap(ret, P2BYTES(pages), - PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_SHARED, + BT_PROT_DIRTY, + BT_FLAG_DIRTY, state->data_fd, P2BYTES(pgno))) { - DPRINTF("mmap: failed to map at addr %p", ret); + DPRINTF("mmap: failed to map at addr %p, errno: %s", ret, strerror(errno)); abort(); } bp(ret != 0); @@ -2648,10 +2655,10 @@ bt_free(BT_state *state, void *lo, void *hi) if (lo != mmap(lo, bytelen, - PROT_NONE, - MAP_ANONYMOUS | MAP_FIXED | MAP_NORESERVE, + BT_PROT_FREE, + BT_FLAG_FREE, 0, 0)) { - DPRINTF("mmap: failed to map at addr %p :: %s", lo, strerror(errno)); + DPRINTF("mmap: failed to map at addr %p, errno: %s", lo, strerror(errno)); abort(); } } @@ -2812,11 +2819,11 @@ _bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi, pgno_t pg) if (loaddr != mmap(loaddr, bytelen, - PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_SHARED, + BT_PROT_DIRTY, + BT_FLAG_DIRTY, state->data_fd, offset)) { - DPRINTF("mmap: failed to map at addr %p", loaddr); + DPRINTF("mmap: failed to map at addr %p, errno: %s", loaddr, strerror(errno)); abort(); } From c4b188802076285b9cf70af66b874681a91db90f Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Wed, 13 Dec 2023 07:52:36 -0500 Subject: [PATCH 087/128] pma: update mprotect calls to use BT_PROT_* macros --- rust/ares_pma/c-src/btree.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 2caa394..7d6befd 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -416,7 +416,7 @@ _bt_nalloc(BT_state *state) } /* make node writable */ - mprotect(ret, sizeof(BT_page), PROT_READ | PROT_WRITE); + mprotect(ret, sizeof(BT_page), BT_PROT_DIRTY); return ret; } @@ -2199,7 +2199,7 @@ _bt_state_meta_new(BT_state *state) /* open the metapage region for writing */ if (mprotect(BT_MAPADDR, BT_META_SECTION_WIDTH, - PROT_READ | PROT_WRITE) != 0) { + BT_PROT_DIRTY) != 0) { DPRINTF("mprotect of metapage section failed with %s", strerror(errno)); abort(); } @@ -2235,12 +2235,12 @@ _bt_state_meta_new(BT_state *state) 
memcpy(METADATA(p2), &meta, sizeof meta); /* only the active metapage should be writable (first page) */ - if (mprotect(BT_MAPADDR, BT_META_SECTION_WIDTH, PROT_READ) != 0) { + if (mprotect(BT_MAPADDR, BT_META_SECTION_WIDTH, BT_PROT_CLEAN) != 0) { DPRINTF("mprotect of metapage section failed with %s", strerror(errno)); abort(); } if (mprotect(BT_MAPADDR, BT_PAGESIZE, - PROT_READ | PROT_WRITE) != 0) { + BT_PROT_DIRTY) != 0) { DPRINTF("mprotect of current metapage failed with %s", strerror(errno)); abort(); } @@ -2396,7 +2396,7 @@ _bt_sync_leaf(BT_state *state, BT_page *node) return errno; /* mprotect the data */ - if (mprotect(addr, bytelen, PROT_READ) != 0) { + if (mprotect(addr, bytelen, BT_PROT_CLEAN) != 0) { DPRINTF("mprotect of leaf data failed with %s", strerror(errno)); abort(); } @@ -2445,8 +2445,8 @@ _bt_sync_meta(BT_state *state) newwhich = state->which ? 0 : 1; newmeta = state->meta_pages[newwhich]; - /* make new metapage writable */ - if (mprotect(newmeta, sizeof(BT_page), PROT_READ | PROT_WRITE) != 0) { + /* mprotect dirty new metapage */ + if (mprotect(newmeta, sizeof(BT_page), BT_PROT_DIRTY) != 0) { DPRINTF("mprotect of new metapage failed with %s", strerror(errno)); abort(); } @@ -2469,8 +2469,8 @@ _bt_sync_meta(BT_state *state) /* switch the metapage we're referring to */ state->which = newwhich; - /* finally, make old metapage read-only */ - if (mprotect(meta, sizeof(BT_page), PROT_READ) != 0) { + /* finally, make old metapage clean */ + if (mprotect(meta, sizeof(BT_page), BT_PROT_CLEAN) != 0) { DPRINTF("mprotect of old metapage failed with %s", strerror(errno)); abort(); } @@ -2513,7 +2513,7 @@ _bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth) e: /* all modifications done in node, mark it read-only */ - if (mprotect(node, sizeof(BT_page), PROT_READ) != 0) { + if (mprotect(node, sizeof(BT_page), BT_PROT_CLEAN) != 0) { DPRINTF("mprotect of node failed with %s", strerror(errno)); abort(); } @@ -2685,7 +2685,7 @@ bt_sync(BT_state *state) return errno; /* make root read-only */ - if (mprotect(root, sizeof(BT_page), PROT_READ) != 0) { + if (mprotect(root, sizeof(BT_page), BT_PROT_CLEAN) != 0) { DPRINTF("mprotect of root failed with %s", strerror(errno)); abort(); } From 3786277230ee6611a38623773817e0452a19c3f1 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Wed, 13 Dec 2023 18:25:35 -0500 Subject: [PATCH 088/128] pma: remove mkdir call from bt_state_open --- rust/ares_pma/c-src/btest.c | 5 ++++- rust/ares_pma/c-src/btree.c | 3 --- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c index 612e410..99ad0a0 100644 --- a/rust/ares_pma/c-src/btest.c +++ b/rust/ares_pma/c-src/btest.c @@ -29,7 +29,8 @@ int main(int argc, char *argv[]) DPUTS("== test 1: insert"); bt_state_new(&state1); - + if (mkdir("./pmatest1", 0774) == -1) + return errno; assert(SUCC(bt_state_open(state1, "./pmatest1", 0, 0644))); #define LOWEST_ADDR 0x200000; @@ -50,6 +51,8 @@ int main(int argc, char *argv[]) BT_state *state2; bt_state_new(&state2); + if (mkdir("./pmatest2", 0774) == -1) + return errno; assert(SUCC(bt_state_open(state2, "./pmatest2", 0, 0644))); void *t2a = bt_malloc(state2, 10); diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 7d6befd..3de8719 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2552,9 +2552,6 @@ bt_state_open(BT_state *state, const char *path, ULONG flags, mode_t mode) if (!dpath) return ENOMEM; sprintf(dpath, "%s" DATANAME, path); - 
if (mkdir(path, 0774) == -1) - return errno; - if ((state->data_fd = open(dpath, oflags, mode)) == -1) return errno; From 20693a32ceed5a72339cf64c3eeb3bd31dc0e3e8 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Wed, 13 Dec 2023 21:32:45 -0500 Subject: [PATCH 089/128] pma: tests --- rust/ares_pma/c-src/btest.c | 40 +++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c index 99ad0a0..1c5239a 100644 --- a/rust/ares_pma/c-src/btest.c +++ b/rust/ares_pma/c-src/btest.c @@ -1,6 +1,8 @@ #include "btree.h" #include "btree.c" +#include +#include static void _test_nodeinteg(BT_state *state, BT_findpath *path, @@ -77,5 +79,43 @@ int main(int argc, char *argv[]) /* should invoke deletion coalescing - 10 page free range in btree */ void *t2c = bt_malloc(state2, 20); + bt_state_close(state2); + + + DPUTS("== test 3: ephemeral structure restoration"); + BT_state *state3; + + bt_state_new(&state3); + if (mkdir("./pmatest3", 0774) == -1) + return errno; + assert(SUCC(bt_state_open(state3, "./pmatest3", 0, 0644))); + + typedef struct lohi_pair lohi_pair; + struct lohi_pair + { + BT_page *lo; + BT_page *hi; + }; + + /* ;;: getting strange abort in bt_malloc precisely when i = 131 + + bt_malloc:2632 mmap: failed to map at addr 0x100811538000, errno: Invalid argument + + * obviously the addr is arbitrary + */ +#define ITERATIONS 1000 +#define MAXALLOCPG 0xFF + lohi_pair allocs[ITERATIONS] = {0}; + for (size_t i = 0; i < ITERATIONS; i++) { + /* malloc a random number of pages < 256 and store in the allocs array */ + int pages = rand(); + pages &= MAXALLOCPG; + allocs[i].lo = bt_malloc(state3, pages); + allocs[i].hi = allocs[i].lo + pages; + } + + /* sync the state */ + bt_sync(state3); + return 0; } From 92bb42683d0d27482040a7b9b83a02755e3e41e5 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Wed, 13 Dec 2023 21:36:34 -0500 Subject: [PATCH 090/128] pma: fix c test3 --- rust/ares_pma/c-src/btest.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c index 1c5239a..69b4f7c 100644 --- a/rust/ares_pma/c-src/btest.c +++ b/rust/ares_pma/c-src/btest.c @@ -97,19 +97,14 @@ int main(int argc, char *argv[]) BT_page *hi; }; - /* ;;: getting strange abort in bt_malloc precisely when i = 131 - - bt_malloc:2632 mmap: failed to map at addr 0x100811538000, errno: Invalid argument - - * obviously the addr is arbitrary - */ #define ITERATIONS 1000 #define MAXALLOCPG 0xFF lohi_pair allocs[ITERATIONS] = {0}; for (size_t i = 0; i < ITERATIONS; i++) { - /* malloc a random number of pages < 256 and store in the allocs array */ + /* malloc a random number of pages <= 256 and store in the allocs array */ int pages = rand(); pages &= MAXALLOCPG; + pages += 1; allocs[i].lo = bt_malloc(state3, pages); allocs[i].hi = allocs[i].lo + pages; } From 559844230474156c788cecf64bd4f4f9a9a1d8bc Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 14 Dec 2023 00:45:09 -0600 Subject: [PATCH 091/128] pma: fix save_to_pma buffer size assertions --- rust/ares/src/persist.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index 0b10825..c66d3e3 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -113,8 +113,8 @@ pub trait Persist { let mut buffer = bt_malloc(pma.0, space_as_pages) as *mut u8; let orig_buffer = buffer; self.copy_to_buffer(stack, pma, &mut buffer); - 
assert!(orig_buffer.offset_from(buffer) > 0); - assert!(orig_buffer.offset_from(buffer) <= space.try_into().unwrap()); + assert!(buffer.offset_from(orig_buffer) > 0); + assert!(buffer.offset_from(orig_buffer) == space.try_into().unwrap()); self.handle_to_u64() } } From bfdb1b9eb7b6de00aaa70cbbb6cb38eba046ba6c Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 14 Dec 2023 00:56:39 -0600 Subject: [PATCH 092/128] pma: fix and simplify copy_to_buffer for Noun --- rust/ares/src/persist.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index c66d3e3..b713b3a 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -211,19 +211,18 @@ impl Persist for Noun { unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { let mut buffer_u64 = (*buffer) as *mut u64; stack.frame_push(0); - *(stack.push::<(Noun, *mut Noun)>()) = (*self, self as *mut Noun); + *(stack.push::<*mut Noun>()) = (self as *mut Noun); loop { if stack.stack_is_empty() { break; } - let (noun, dest) = *(stack.top::<(Noun, *mut Noun)>()); + let dest = *(stack.top::<*mut Noun>()); + stack.pop::<*mut Noun>(); - match noun.as_either_direct_allocated() { - Left(direct) => { - *dest = noun; - } + match (*dest).as_either_direct_allocated() { + Left(direct) => {} Right(allocated) => { if let Some(a) = allocated.forwarding_pointer() { *dest = a.as_noun(); @@ -234,7 +233,6 @@ impl Persist for Noun { Left(mut indirect) => { let count = indirect.raw_size(); if pma.contains(indirect.to_raw_pointer(), count) { - *dest = noun; continue; } @@ -246,7 +244,6 @@ impl Persist for Noun { } Right(mut cell) => { if pma.contains(cell.to_raw_pointer(), 1) { - *dest = noun; continue; } @@ -258,10 +255,8 @@ impl Persist for Noun { *dest = Cell::from_raw_pointer(new_cell_mem).as_noun(); - *(stack.push::<(Noun, *mut Noun)>()) = - (cell.tail(), &mut (*new_cell_mem).tail); - *(stack.push::<(Noun, *mut Noun)>()) = - (cell.head(), &mut (*new_cell_mem).head); + *(stack.push::<*mut Noun>()) = &mut (*new_cell_mem).tail; + *(stack.push::<*mut Noun>()) = &mut (*new_cell_mem).head; buffer_u64 = new_cell_mem.add(1) as *mut u64; } From adfc9ddb1067db1b95f6225335a1e31c0980f9e9 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 14 Dec 2023 14:28:27 -0600 Subject: [PATCH 093/128] pma: fix assert in save_to_pma --- rust/ares/src/persist.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index b713b3a..56e683c 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -113,8 +113,8 @@ pub trait Persist { let mut buffer = bt_malloc(pma.0, space_as_pages) as *mut u8; let orig_buffer = buffer; self.copy_to_buffer(stack, pma, &mut buffer); - assert!(buffer.offset_from(orig_buffer) > 0); - assert!(buffer.offset_from(orig_buffer) == space.try_into().unwrap()); + let space_isize: isize = space.try_into().unwrap(); + assert!(buffer.offset_from(orig_buffer) == space_isize); self.handle_to_u64() } } From 92c1a23b7dad9ffa7d5fff8d326caf996cc4ec82 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Thu, 14 Dec 2023 16:30:29 -0500 Subject: [PATCH 094/128] pma: abort on failed msync/mprotect --- rust/ares_pma/c-src/btree.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 3de8719..98a0e3a 100644 --- a/rust/ares_pma/c-src/btree.c +++ 
b/rust/ares_pma/c-src/btree.c @@ -87,6 +87,9 @@ STATIC_ASSERT(0, "debugger break instruction unimplemented"); #define BT_SUCC 0 #define SUCC(x) ((x) == BT_SUCC) +/* given a pointer p returns the low page-aligned addr */ +#define LO_ALIGN_PAGE(p) ((BT_page *)(((uintptr_t)p) & ~(BT_PAGESIZE - 1))) + #define BT_MAPADDR ((BYTE *) S(0x1000,0000,0000)) @@ -2392,8 +2395,10 @@ _bt_sync_leaf(BT_state *state, BT_page *node) void *addr = off2addr(lo); /* sync the page */ - if (msync(addr, bytelen, MS_SYNC)) - return errno; + if (msync(addr, bytelen, MS_SYNC)) { + DPRINTF("msync of leaf: %p failed with %s", addr, strerror(errno)); + abort(); + } /* mprotect the data */ if (mprotect(addr, bytelen, BT_PROT_CLEAN) != 0) { @@ -2438,15 +2443,17 @@ _bt_sync_meta(BT_state *state) meta->chk = chk; /* sync the metapage */ - if (msync(meta, sizeof(BT_page), MS_SYNC)) - return errno; + if (msync(LO_ALIGN_PAGE(meta), sizeof(BT_page), MS_SYNC)) { + DPRINTF("msync of metapage: %p failed with %s", meta, strerror(errno)); + abort(); + } /* zero the new metapage's checksum */ newwhich = state->which ? 0 : 1; newmeta = state->meta_pages[newwhich]; /* mprotect dirty new metapage */ - if (mprotect(newmeta, sizeof(BT_page), BT_PROT_DIRTY) != 0) { + if (mprotect(LO_ALIGN_PAGE(newmeta), sizeof(BT_page), BT_PROT_DIRTY) != 0) { DPRINTF("mprotect of new metapage failed with %s", strerror(errno)); abort(); } @@ -2470,7 +2477,7 @@ _bt_sync_meta(BT_state *state) state->which = newwhich; /* finally, make old metapage clean */ - if (mprotect(meta, sizeof(BT_page), BT_PROT_CLEAN) != 0) { + if (mprotect(LO_ALIGN_PAGE(meta), sizeof(BT_page), BT_PROT_CLEAN) != 0) { DPRINTF("mprotect of old metapage failed with %s", strerror(errno)); abort(); } @@ -2504,8 +2511,10 @@ _bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth) return rc; /* sync the child node */ - if (msync(child, sizeof(BT_page), MS_SYNC)) - return errno; + if (msync(child, sizeof(BT_page), MS_SYNC)) { + DPRINTF("msync of child node: %p failed with %s", child, strerror(errno)); + abort(); + } /* unset child dirty bit */ _bt_cleanchild(node, i); From fa30180b684445a361435e9156c31d3ac7a02609 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 14 Dec 2023 15:24:35 -0600 Subject: [PATCH 095/128] pma: noun and hamt persist fixes --- rust/ares/src/hamt.rs | 2 ++ rust/ares/src/persist.rs | 1 + 2 files changed, 3 insertions(+) diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index d73946c..a687aba 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -606,6 +606,8 @@ impl Persist for Hamt { return bytes; }; + bytes += (*self.0).size() * size_of::>(); + let mut depth: usize = 0; let mut traversal = [Stem { bitmap: 0, diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index 56e683c..730bb72 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -265,6 +265,7 @@ impl Persist for Noun { } } *buffer = buffer_u64 as *mut u8; + stack.frame_pop(); } unsafe fn handle_to_u64(&self) -> u64 { From a8b3619e9f03e2e21d24c6aabe528f5f010aefc6 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Thu, 14 Dec 2023 16:35:33 -0500 Subject: [PATCH 096/128] pma: more msync/mprotect abort --- rust/ares_pma/c-src/btree.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 98a0e3a..3590e4c 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -419,7 +419,10 @@ _bt_nalloc(BT_state *state) } /* make node writable */ - 
mprotect(ret, sizeof(BT_page), BT_PROT_DIRTY); + mprotect(ret, sizeof(BT_page), BT_PROT_DIRTY) { + DPRINTF("mprotect of node: %p failed with %s", ret, strerror(errno)); + abort(); + } return ret; } @@ -2687,8 +2690,10 @@ bt_sync(BT_state *state) _pending_flist_merge(state); /* sync the root page */ - if (msync(root, sizeof(BT_page), MS_SYNC)) - return errno; + if (msync(root, sizeof(BT_page), MS_SYNC)) { + DPRINTF("msync of root: %p failed with %s", root, strerror(errno)); + abort(); + } /* make root read-only */ if (mprotect(root, sizeof(BT_page), BT_PROT_CLEAN) != 0) { From a509c2fabe8cd458ad5799aba945442fadc45b79 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Thu, 14 Dec 2023 16:40:42 -0500 Subject: [PATCH 097/128] pma: even more msync/mprotect changes --- rust/ares_pma/c-src/btree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 3590e4c..1b9dda3 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -419,7 +419,7 @@ _bt_nalloc(BT_state *state) } /* make node writable */ - mprotect(ret, sizeof(BT_page), BT_PROT_DIRTY) { + if (mprotect(ret, sizeof(BT_page), BT_PROT_DIRTY) != 0) { DPRINTF("mprotect of node: %p failed with %s", ret, strerror(errno)); abort(); } @@ -2398,7 +2398,7 @@ _bt_sync_leaf(BT_state *state, BT_page *node) void *addr = off2addr(lo); /* sync the page */ - if (msync(addr, bytelen, MS_SYNC)) { + if (msync(addr, bytelen, MS_SYNC) != 0) { DPRINTF("msync of leaf: %p failed with %s", addr, strerror(errno)); abort(); } @@ -2446,7 +2446,7 @@ _bt_sync_meta(BT_state *state) meta->chk = chk; /* sync the metapage */ - if (msync(LO_ALIGN_PAGE(meta), sizeof(BT_page), MS_SYNC)) { + if (msync(LO_ALIGN_PAGE(meta), sizeof(BT_page), MS_SYNC) != 0) { DPRINTF("msync of metapage: %p failed with %s", meta, strerror(errno)); abort(); } @@ -2514,7 +2514,7 @@ _bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth) return rc; /* sync the child node */ - if (msync(child, sizeof(BT_page), MS_SYNC)) { + if (msync(child, sizeof(BT_page), MS_SYNC) != 0) { DPRINTF("msync of child node: %p failed with %s", child, strerror(errno)); abort(); } @@ -2690,7 +2690,7 @@ bt_sync(BT_state *state) _pending_flist_merge(state); /* sync the root page */ - if (msync(root, sizeof(BT_page), MS_SYNC)) { + if (msync(root, sizeof(BT_page), MS_SYNC) != 0) { DPRINTF("msync of root: %p failed with %s", root, strerror(errno)); abort(); } From caeea6ec2de5221fbee94a2aac7959493c1f39bf Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 14 Dec 2023 18:08:55 -0600 Subject: [PATCH 098/128] unifying equality: move to own module --- rust/ares/src/hamt.rs | 3 +- rust/ares/src/interpreter.rs | 4 +- rust/ares/src/jets.rs | 3 +- rust/ares/src/jets/cold.rs | 3 +- rust/ares/src/lib.rs | 1 + rust/ares/src/mem.rs | 256 +++-------------------------- rust/ares/src/unifying_equality.rs | 243 +++++++++++++++++++++++++++ 7 files changed, 273 insertions(+), 240 deletions(-) create mode 100644 rust/ares/src/unifying_equality.rs diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index a687aba..c381373 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -1,4 +1,5 @@ -use crate::mem::{unifying_equality, NockStack, Preserve}; +use crate::mem::{NockStack, Preserve}; +use crate::unifying_equality::unifying_equality; use crate::mug::mug_u32; use crate::noun::Noun; use crate::persist::{Persist, PMA}; diff --git a/rust/ares/src/interpreter.rs b/rust/ares/src/interpreter.rs index 
bf406af..ba342e6 100644 --- a/rust/ares/src/interpreter.rs +++ b/rust/ares/src/interpreter.rs @@ -1,13 +1,13 @@ use crate::assert_acyclic; use crate::assert_no_forwarding_pointers; use crate::assert_no_junior_pointers; +use crate::unifying_equality::unifying_equality; use crate::hamt::Hamt; use crate::jets::cold; use crate::jets::cold::Cold; use crate::jets::hot::Hot; use crate::jets::warm::Warm; use crate::jets::JetErr; -use crate::mem::unifying_equality; use crate::mem::NockStack; use crate::mem::Preserve; use crate::newt::Newt; @@ -1304,7 +1304,7 @@ mod hint { use crate::jets; use crate::jets::cold; use crate::jets::nock::util::{mook, LEAF}; - use crate::mem::unifying_equality; + use crate::unifying_equality::unifying_equality; use crate::noun::{tape, Atom, Cell, Noun, D, T}; use crate::serf::TERMINATOR; use ares_macros::tas; diff --git a/rust/ares/src/jets.rs b/rust/ares/src/jets.rs index 7c76b3e..470c258 100644 --- a/rust/ares/src/jets.rs +++ b/rust/ares/src/jets.rs @@ -307,7 +307,8 @@ pub mod util { pub mod test { use super::*; use crate::hamt::Hamt; - use crate::mem::{unifying_equality, NockStack}; + use crate::mem::NockStack; + use crate::unifying_equality::unifying_equality; use crate::noun::{Atom, Noun, D, T}; use assert_no_alloc::assert_no_alloc; use ibig::UBig; diff --git a/rust/ares/src/jets/cold.rs b/rust/ares/src/jets/cold.rs index 9db0ef0..8f0689a 100644 --- a/rust/ares/src/jets/cold.rs +++ b/rust/ares/src/jets/cold.rs @@ -1,5 +1,6 @@ use crate::hamt::Hamt; -use crate::mem::{unifying_equality, NockStack, Preserve}; +use crate::mem::{NockStack, Preserve}; +use crate::unifying_equality::unifying_equality; use crate::noun; use crate::noun::{Atom, DirectAtom, Noun, Slots, D, T}; use crate::persist::{Persist, PMA}; diff --git a/rust/ares/src/lib.rs b/rust/ares/src/lib.rs index c90cb87..17b7223 100644 --- a/rust/ares/src/lib.rs +++ b/rust/ares/src/lib.rs @@ -15,6 +15,7 @@ pub mod serf; pub mod persist; pub mod serialization; pub mod trace; +pub mod unifying_equality; /** Introduce useful functions for debugging * diff --git a/rust/ares/src/mem.rs b/rust/ares/src/mem.rs index 22909f5..b8259a4 100644 --- a/rust/ares/src/mem.rs +++ b/rust/ares/src/mem.rs @@ -142,6 +142,26 @@ impl NockStack { self.frame_pointer } + /** Current stack pointer of this NockStack */ + pub fn get_stack_pointer(&self) -> *const u64 { + self.stack_pointer + } + + /** Current alloc pointer of this NockStack */ + pub fn get_alloc_pointer(&self) -> *const u64 { + self.alloc_pointer + } + + /** Start of the memory range for this NockStack */ + pub fn get_start(&self) -> *const u64 { + self.start + } + + /** End of the memory range for this NockStack */ + pub fn get_size(&self) -> usize { + self.size + } + /** Checks if the current stack frame has West polarity */ #[inline] pub fn is_west(&self) -> bool { @@ -227,7 +247,7 @@ impl NockStack { } /** Pointer to where the previous stack pointer is saved in a frame */ - unsafe fn prev_stack_pointer_pointer(&self) -> *mut *mut u64 { + pub unsafe fn prev_stack_pointer_pointer(&self) -> *mut *mut u64 { if !self.pc { self.slot_pointer(STACK) as *mut *mut u64 } else { @@ -816,240 +836,6 @@ impl NockStack { } } -#[cfg(feature = "check_junior")] -#[macro_export] -macro_rules! assert_no_junior_pointers { - ( $x:expr, $y:expr ) => { - assert_no_alloc::permit_alloc(|| { - assert!($x.no_junior_pointers($y)); - }) - }; -} - -#[cfg(not(feature = "check_junior"))] -#[macro_export] -macro_rules! 
assert_no_junior_pointers { - ( $x:expr, $y:expr ) => {}; -} - -pub unsafe fn unifying_equality(stack: &mut NockStack, a: *mut Noun, b: *mut Noun) -> bool { - /* This version of unifying equality is not like that of vere. - * Vere does a tree comparison (accelerated by pointer equality and short-circuited by mug - * equality) and then unifies the nouns at the top level if they are equal. - * - * Here we recursively attempt to unify nouns. Pointer-equal nouns are already unified. - * Disequal mugs again short-circuit the unification and equality check. - * - * Since we expect atoms to be normalized, direct and indirect atoms do not unify with each - * other. For direct atoms, no unification is possible as there is no pointer involved in their - * representation. Equality is simply direct equality on the word representation. Indirect - * atoms require equality first of the size and then of the memory buffers' contents. - * - * Cell equality is tested (after mug and pointer equality) by attempting to unify the heads and tails, - * respectively, of cells, and then re-testing. If unification succeeds then the heads and - * tails will be pointer-wise equal and the cell itself can be unified. A failed unification of - * the head or the tail will already short-circuit the unification/equality test, so we will - * not return to re-test the pointer equality. - * - * When actually mutating references for unification, we must be careful to respect seniority. - * A reference to a more junior noun should always be replaced with a reference to a more - * senior noun, *never vice versa*, to avoid introducing references from more senior frames - * into more junior frames, which would result in incorrect operation of the copier. - */ - assert_acyclic!(*a); - assert_acyclic!(*b); - assert_no_forwarding_pointers!(*a); - assert_no_forwarding_pointers!(*b); - assert_no_junior_pointers!(stack, *a); - assert_no_junior_pointers!(stack, *b); - - // If the nouns are already word-equal we have nothing to do - if (*a).raw_equals(*b) { - return true; - }; - // If the nouns have cached mugs which are disequal we have nothing to do - if let (Ok(a_alloc), Ok(b_alloc)) = ((*a).as_allocated(), (*b).as_allocated()) { - if let (Some(a_mug), Some(b_mug)) = (a_alloc.get_cached_mug(), b_alloc.get_cached_mug()) { - if a_mug != b_mug { - return false; - }; - }; - }; - stack.frame_push(0); - *(stack.push::<(*mut Noun, *mut Noun)>()) = (a, b); - loop { - if stack.stack_is_empty() { - break; - }; - let (x, y): (*mut Noun, *mut Noun) = *(stack.top()); - if (*x).raw_equals(*y) { - stack.pop::<(*mut Noun, *mut Noun)>(); - continue; - }; - if let (Ok(x_alloc), Ok(y_alloc)) = ( - // equal direct atoms return true for raw_equals() - (*x).as_allocated(), - (*y).as_allocated(), - ) { - if let (Some(x_mug), Some(y_mug)) = (x_alloc.get_cached_mug(), y_alloc.get_cached_mug()) - { - if x_mug != y_mug { - break; // short-circuit, the mugs differ therefore the nouns must differ - } - }; - match (x_alloc.as_either(), y_alloc.as_either()) { - (Left(x_indirect), Left(y_indirect)) => { - let x_as_ptr = x_indirect.to_raw_pointer(); - let y_as_ptr = y_indirect.to_raw_pointer(); - if x_indirect.size() == y_indirect.size() - && memcmp( - x_indirect.data_pointer() as *const c_void, - y_indirect.data_pointer() as *const c_void, - x_indirect.size() << 3, - ) == 0 - { - let (_senior, junior) = senior_pointer_first(stack, x_as_ptr, y_as_ptr); - if x_as_ptr == junior { - *x = *y; - } else { - *y = *x; - } - stack.pop::<(*mut Noun, *mut Noun)>(); - continue; - } 
else { - break; - } - } - (Right(x_cell), Right(y_cell)) => { - let x_as_ptr = x_cell.to_raw_pointer() as *const u64; - let y_as_ptr = y_cell.to_raw_pointer() as *const u64; - if x_cell.head().raw_equals(y_cell.head()) - && x_cell.tail().raw_equals(y_cell.tail()) - { - let (_senior, junior) = senior_pointer_first(stack, x_as_ptr, y_as_ptr); - if x_as_ptr == junior { - *x = *y; - } else { - *y = *x; - } - stack.pop::<(*mut Noun, *mut Noun)>(); - continue; - } else { - /* THIS ISN'T AN INFINITE LOOP - * If we discover a disequality in either side, we will - * short-circuit the entire loop and reset the work stack. - * - * If both sides are equal, then we will discover pointer - * equality when we return and unify the cell. - */ - *(stack.push::<(*mut Noun, *mut Noun)>()) = - (x_cell.tail_as_mut(), y_cell.tail_as_mut()); - *(stack.push::<(*mut Noun, *mut Noun)>()) = - (x_cell.head_as_mut(), y_cell.head_as_mut()); - continue; - } - } - (_, _) => { - break; // cells don't unify with atoms - } - } - } else { - break; // direct atom not raw equal, so short circuit - } - } - stack.frame_pop(); - - assert_acyclic!(*a); - assert_acyclic!(*b); - assert_no_forwarding_pointers!(*a); - assert_no_forwarding_pointers!(*b); - assert_no_junior_pointers!(stack, *a); - assert_no_junior_pointers!(stack, *b); - - (*a).raw_equals(*b) -} - -unsafe fn senior_pointer_first( - stack: &NockStack, - a: *const u64, - b: *const u64, -) -> (*const u64, *const u64) { - let mut frame_pointer: *const u64 = stack.frame_pointer; - let mut stack_pointer: *const u64 = stack.stack_pointer; - let mut alloc_pointer: *const u64 = stack.alloc_pointer; - let prev_stack_pointer = *(stack.prev_stack_pointer_pointer()); - - let (mut high_pointer, mut low_pointer): (*const u64, *const u64) = if stack.is_west() { - (prev_stack_pointer, alloc_pointer) - } else { - (alloc_pointer, prev_stack_pointer) - }; - - loop { - if low_pointer.is_null() || high_pointer.is_null() { - // we found the bottom of the stack; check entirety of the stack - low_pointer = stack.start; - high_pointer = stack.start.add(stack.size); - } - - match ( - a < high_pointer && a >= low_pointer, - b < high_pointer && b >= low_pointer, - ) { - (true, true) => { - // both pointers are in the same frame, pick arbitrarily (lower in mem) - break lower_pointer_first(a, b); - } - (true, false) => break (b, a), // a is in the frame, b is not, so b is senior - (false, true) => break (a, b), // b is in the frame, a is not, so a is senior - (false, false) => { - // chase up the stack - #[allow(clippy::comparison_chain)] - // test to see if the frame under consideration is a west frame - if stack_pointer < alloc_pointer { - stack_pointer = *(frame_pointer.sub(STACK + 1)) as *const u64; - alloc_pointer = *(frame_pointer.sub(ALLOC + 1)) as *const u64; - frame_pointer = *(frame_pointer.sub(FRAME + 1)) as *const u64; - - // both pointers are in the PMA, pick arbitrarily (lower in mem) - if frame_pointer.is_null() { - break lower_pointer_first(a, b); - }; - - // previous allocation pointer - high_pointer = alloc_pointer; - // "previous previous" stack pointer. 
this is the other boundary of the previous allocation arena - low_pointer = *(frame_pointer.add(STACK)) as *const u64; - } else if stack_pointer > alloc_pointer { - stack_pointer = *(frame_pointer.add(STACK)) as *const u64; - alloc_pointer = *(frame_pointer.add(ALLOC)) as *const u64; - frame_pointer = *(frame_pointer.add(FRAME)) as *const u64; - - // both pointers are in the PMA, pick arbitrarily (lower in mem) - if frame_pointer.is_null() { - break lower_pointer_first(a, b); - }; - - // previous allocation pointer - low_pointer = alloc_pointer; - // "previous previous" stack pointer. this is the other boundary of the previous allocation arena - high_pointer = *(frame_pointer.sub(STACK + 1)) as *const u64; - } else { - panic!("senior_pointer_first: stack_pointer == alloc_pointer"); - } - } - } - } -} - -fn lower_pointer_first(a: *const u64, b: *const u64) -> (*const u64, *const u64) { - if a < b { - (a, b) - } else { - (b, a) - } -} - impl NounAllocator for NockStack { unsafe fn alloc_indirect(&mut self, words: usize) -> *mut u64 { self.indirect_alloc(words) diff --git a/rust/ares/src/unifying_equality.rs b/rust/ares/src/unifying_equality.rs new file mode 100644 index 0000000..e133018 --- /dev/null +++ b/rust/ares/src/unifying_equality.rs @@ -0,0 +1,243 @@ +use crate::assert_acyclic; +use crate::assert_no_forwarding_pointers; +use crate::assert_no_junior_pointers; +use crate::mem::{NockStack, FRAME, STACK, ALLOC}; +use crate::noun::Noun; +use either::Either::*; +use libc::{c_void, memcmp}; + +#[cfg(feature = "check_junior")] +#[macro_export] +macro_rules! assert_no_junior_pointers { + ( $x:expr, $y:expr ) => { + assert_no_alloc::permit_alloc(|| { + assert!($x.no_junior_pointers($y)); + }) + }; +} + +#[cfg(not(feature = "check_junior"))] +#[macro_export] +macro_rules! assert_no_junior_pointers { + ( $x:expr, $y:expr ) => {}; +} + + +pub unsafe fn unifying_equality(stack: &mut NockStack, a: *mut Noun, b: *mut Noun) -> bool { + /* This version of unifying equality is not like that of vere. + * Vere does a tree comparison (accelerated by pointer equality and short-circuited by mug + * equality) and then unifies the nouns at the top level if they are equal. + * + * Here we recursively attempt to unify nouns. Pointer-equal nouns are already unified. + * Disequal mugs again short-circuit the unification and equality check. + * + * Since we expect atoms to be normalized, direct and indirect atoms do not unify with each + * other. For direct atoms, no unification is possible as there is no pointer involved in their + * representation. Equality is simply direct equality on the word representation. Indirect + * atoms require equality first of the size and then of the memory buffers' contents. + * + * Cell equality is tested (after mug and pointer equality) by attempting to unify the heads and tails, + * respectively, of cells, and then re-testing. If unification succeeds then the heads and + * tails will be pointer-wise equal and the cell itself can be unified. A failed unification of + * the head or the tail will already short-circuit the unification/equality test, so we will + * not return to re-test the pointer equality. + * + * When actually mutating references for unification, we must be careful to respect seniority. + * A reference to a more junior noun should always be replaced with a reference to a more + * senior noun, *never vice versa*, to avoid introducing references from more senior frames + * into more junior frames, which would result in incorrect operation of the copier. 
+ */ + assert_acyclic!(*a); + assert_acyclic!(*b); + assert_no_forwarding_pointers!(*a); + assert_no_forwarding_pointers!(*b); + assert_no_junior_pointers!(stack, *a); + assert_no_junior_pointers!(stack, *b); + + // If the nouns are already word-equal we have nothing to do + if (*a).raw_equals(*b) { + return true; + }; + // If the nouns have cached mugs which are disequal we have nothing to do + if let (Ok(a_alloc), Ok(b_alloc)) = ((*a).as_allocated(), (*b).as_allocated()) { + if let (Some(a_mug), Some(b_mug)) = (a_alloc.get_cached_mug(), b_alloc.get_cached_mug()) { + if a_mug != b_mug { + return false; + }; + }; + }; + stack.frame_push(0); + *(stack.push::<(*mut Noun, *mut Noun)>()) = (a, b); + loop { + if stack.stack_is_empty() { + break; + }; + let (x, y): (*mut Noun, *mut Noun) = *(stack.top()); + if (*x).raw_equals(*y) { + stack.pop::<(*mut Noun, *mut Noun)>(); + continue; + }; + if let (Ok(x_alloc), Ok(y_alloc)) = ( + // equal direct atoms return true for raw_equals() + (*x).as_allocated(), + (*y).as_allocated(), + ) { + if let (Some(x_mug), Some(y_mug)) = (x_alloc.get_cached_mug(), y_alloc.get_cached_mug()) + { + if x_mug != y_mug { + break; // short-circuit, the mugs differ therefore the nouns must differ + } + }; + match (x_alloc.as_either(), y_alloc.as_either()) { + (Left(x_indirect), Left(y_indirect)) => { + let x_as_ptr = x_indirect.to_raw_pointer(); + let y_as_ptr = y_indirect.to_raw_pointer(); + if x_indirect.size() == y_indirect.size() + && memcmp( + x_indirect.data_pointer() as *const c_void, + y_indirect.data_pointer() as *const c_void, + x_indirect.size() << 3, + ) == 0 + { + let (_senior, junior) = senior_pointer_first(stack, x_as_ptr, y_as_ptr); + if x_as_ptr == junior { + *x = *y; + } else { + *y = *x; + } + stack.pop::<(*mut Noun, *mut Noun)>(); + continue; + } else { + break; + } + } + (Right(x_cell), Right(y_cell)) => { + let x_as_ptr = x_cell.to_raw_pointer() as *const u64; + let y_as_ptr = y_cell.to_raw_pointer() as *const u64; + if x_cell.head().raw_equals(y_cell.head()) + && x_cell.tail().raw_equals(y_cell.tail()) + { + let (_senior, junior) = senior_pointer_first(stack, x_as_ptr, y_as_ptr); + if x_as_ptr == junior { + *x = *y; + } else { + *y = *x; + } + stack.pop::<(*mut Noun, *mut Noun)>(); + continue; + } else { + /* THIS ISN'T AN INFINITE LOOP + * If we discover a disequality in either side, we will + * short-circuit the entire loop and reset the work stack. + * + * If both sides are equal, then we will discover pointer + * equality when we return and unify the cell. 
+ */ + *(stack.push::<(*mut Noun, *mut Noun)>()) = + (x_cell.tail_as_mut(), y_cell.tail_as_mut()); + *(stack.push::<(*mut Noun, *mut Noun)>()) = + (x_cell.head_as_mut(), y_cell.head_as_mut()); + continue; + } + } + (_, _) => { + break; // cells don't unify with atoms + } + } + } else { + break; // direct atom not raw equal, so short circuit + } + } + stack.frame_pop(); + + assert_acyclic!(*a); + assert_acyclic!(*b); + assert_no_forwarding_pointers!(*a); + assert_no_forwarding_pointers!(*b); + assert_no_junior_pointers!(stack, *a); + assert_no_junior_pointers!(stack, *b); + + (*a).raw_equals(*b) +} + +unsafe fn senior_pointer_first( + stack: &NockStack, + a: *const u64, + b: *const u64, +) -> (*const u64, *const u64) { + let mut frame_pointer: *const u64 = stack.get_frame_pointer(); + let mut stack_pointer: *const u64 = stack.get_stack_pointer(); + let mut alloc_pointer: *const u64 = stack.get_alloc_pointer(); + let prev_stack_pointer = *(stack.prev_stack_pointer_pointer()); + + let (mut high_pointer, mut low_pointer): (*const u64, *const u64) = if stack.is_west() { + (prev_stack_pointer, alloc_pointer) + } else { + (alloc_pointer, prev_stack_pointer) + }; + + loop { + if low_pointer.is_null() || high_pointer.is_null() { + // we found the bottom of the stack; check entirety of the stack + low_pointer = stack.get_start(); + high_pointer = stack.get_start().add(stack.get_size()); + } + + match ( + a < high_pointer && a >= low_pointer, + b < high_pointer && b >= low_pointer, + ) { + (true, true) => { + // both pointers are in the same frame, pick arbitrarily (lower in mem) + break lower_pointer_first(a, b); + } + (true, false) => break (b, a), // a is in the frame, b is not, so b is senior + (false, true) => break (a, b), // b is in the frame, a is not, so a is senior + (false, false) => { + // chase up the stack + #[allow(clippy::comparison_chain)] + // test to see if the frame under consideration is a west frame + if stack_pointer < alloc_pointer { + stack_pointer = *(frame_pointer.sub(STACK + 1)) as *const u64; + alloc_pointer = *(frame_pointer.sub(ALLOC + 1)) as *const u64; + frame_pointer = *(frame_pointer.sub(FRAME + 1)) as *const u64; + + // both pointers are in the PMA, pick arbitrarily (lower in mem) + if frame_pointer.is_null() { + break lower_pointer_first(a, b); + }; + + // previous allocation pointer + high_pointer = alloc_pointer; + // "previous previous" stack pointer. this is the other boundary of the previous allocation arena + low_pointer = *(frame_pointer.add(STACK)) as *const u64; + } else if stack_pointer > alloc_pointer { + stack_pointer = *(frame_pointer.add(STACK)) as *const u64; + alloc_pointer = *(frame_pointer.add(ALLOC)) as *const u64; + frame_pointer = *(frame_pointer.add(FRAME)) as *const u64; + + // both pointers are in the PMA, pick arbitrarily (lower in mem) + if frame_pointer.is_null() { + break lower_pointer_first(a, b); + }; + + // previous allocation pointer + low_pointer = alloc_pointer; + // "previous previous" stack pointer. 
this is the other boundary of the previous allocation arena + high_pointer = *(frame_pointer.sub(STACK + 1)) as *const u64; + } else { + panic!("senior_pointer_first: stack_pointer == alloc_pointer"); + } + } + } + } +} + +fn lower_pointer_first(a: *const u64, b: *const u64) -> (*const u64, *const u64) { + if a < b { + (a, b) + } else { + (b, a) + } +} + From 8fb0953247ea7e83915a4a6206ef8599c7e1aad3 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 14 Dec 2023 21:42:31 -0600 Subject: [PATCH 099/128] pma: dirty in unifying equality This changes the PMA to be a static global instead of a threaded-through handle. We use a OnceLock to enforce one-time-only opening of the PMA. There is no public PMA type any more, just a collection of module-level methods which act through the global reference. This is consistent with the assumptions of the underlying C code, that there will be only one PMA open in a process, and avoids the need to thread the PMA to unifying equality calls. --- rust/ares/src/hamt.rs | 26 ++--- rust/ares/src/interpreter.rs | 1 + rust/ares/src/jets/cold.rs | 62 +++++------ rust/ares/src/mem.rs | 2 + rust/ares/src/persist.rs | 162 ++++++++++++++++++----------- rust/ares/src/serf.rs | 43 +++----- rust/ares/src/unifying_equality.rs | 7 ++ 7 files changed, 169 insertions(+), 134 deletions(-) diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index c381373..845d773 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -2,7 +2,7 @@ use crate::mem::{NockStack, Preserve}; use crate::unifying_equality::unifying_equality; use crate::mug::mug_u32; use crate::noun::Noun; -use crate::persist::{Persist, PMA}; +use crate::persist::{Persist, pma_contains}; use either::Either::{self, *}; use std::mem::size_of; use std::ptr::{copy_nonoverlapping, null, null_mut}; @@ -598,12 +598,12 @@ impl Preserve for Hamt { } impl Persist for Hamt { - unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { - if pma.contains(self.0, 1) { + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { + if pma_contains(self.0, 1) { return 0; } let mut bytes: usize = size_of::>(); - if pma.contains((*self.0).buffer, (*self.0).size()) { + if pma_contains((*self.0).buffer, (*self.0).size()) { return bytes; }; @@ -639,7 +639,7 @@ impl Persist for Hamt { // found another stem traversal[depth + 1] = next_entry.stem; - if pma.contains(traversal[depth + 1].buffer, traversal[depth + 1].size()) { + if pma_contains(traversal[depth + 1].buffer, traversal[depth + 1].size()) { continue; } @@ -653,15 +653,15 @@ impl Persist for Hamt { continue; } - if pma.contains(leaf.buffer, leaf.len) { + if pma_contains(leaf.buffer, leaf.len) { continue; } bytes += size_of::<(Noun, T)>() * leaf.len; while leaf.len > 0 { - bytes += (*leaf.buffer).0.space_needed(stack, pma); - bytes += (*leaf.buffer).1.space_needed(stack, pma); + bytes += (*leaf.buffer).0.space_needed(stack); + bytes += (*leaf.buffer).1.space_needed(stack); leaf.buffer = leaf.buffer.add(1); leaf.len -= 1; } @@ -669,8 +669,8 @@ impl Persist for Hamt { } } - unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { - if pma.contains(self.0, 1) { + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { + if pma_contains(self.0, 1) { return; } let stem_ptr = *buffer as *mut Stem; @@ -679,7 +679,7 @@ impl Persist for Hamt { (*self).0 = stem_ptr; let stem_buffer_size = (*stem_ptr).size(); - if pma.contains((*stem_ptr).buffer, stem_buffer_size) { + if 
pma_contains((*stem_ptr).buffer, stem_buffer_size) { return; } let stem_buffer_ptr = *buffer as *mut Entry; @@ -743,10 +743,10 @@ impl Persist for Hamt { while leaf_idx < (*leaf_ptr).len { (*(*leaf_ptr).buffer.add(leaf_idx)) .0 - .copy_to_buffer(stack, pma, buffer); + .copy_to_buffer(stack, buffer); (*(*leaf_ptr).buffer.add(leaf_idx)) .1 - .copy_to_buffer(stack, pma, buffer); + .copy_to_buffer(stack, buffer); leaf_idx += 1; } diff --git a/rust/ares/src/interpreter.rs b/rust/ares/src/interpreter.rs index ba342e6..6fddbbe 100644 --- a/rust/ares/src/interpreter.rs +++ b/rust/ares/src/interpreter.rs @@ -23,6 +23,7 @@ use std::result; use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Instant; +use crate::persist::PMA; crate::gdb!(); diff --git a/rust/ares/src/jets/cold.rs b/rust/ares/src/jets/cold.rs index 8f0689a..98bfaf3 100644 --- a/rust/ares/src/jets/cold.rs +++ b/rust/ares/src/jets/cold.rs @@ -3,7 +3,7 @@ use crate::mem::{NockStack, Preserve}; use crate::unifying_equality::unifying_equality; use crate::noun; use crate::noun::{Atom, DirectAtom, Noun, Slots, D, T}; -use crate::persist::{Persist, PMA}; +use crate::persist::{Persist, pma_contains}; use std::mem::size_of; use std::ptr::copy_nonoverlapping; use std::ptr::null_mut; @@ -35,7 +35,7 @@ struct BatteriesMem { } impl Persist for Batteries { - unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { let mut bytes = 0; let mut batteries = *self; @@ -43,24 +43,24 @@ impl Persist for Batteries { if batteries.0.is_null() { break; } - if pma.contains(batteries.0, 1) { + if pma_contains(batteries.0, 1) { break; } bytes += size_of::(); - bytes += (*batteries.0).battery.space_needed(stack, pma); - bytes += (*batteries.0).parent_axis.space_needed(stack, pma); + bytes += (*batteries.0).battery.space_needed(stack); + bytes += (*batteries.0).parent_axis.space_needed(stack); batteries = (*batteries.0).parent_batteries; } bytes } - unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { let mut dest = self; loop { if (*dest).0.is_null() { break; } - if pma.contains((*dest).0, 1) { + if pma_contains((*dest).0, 1) { break; } @@ -70,10 +70,10 @@ impl Persist for Batteries { (*batteries_mem_ptr) .battery - .copy_to_buffer(stack, pma, buffer); + .copy_to_buffer(stack, buffer); (*batteries_mem_ptr) .parent_axis - .copy_to_buffer(stack, pma, buffer); + .copy_to_buffer(stack, buffer); (*dest).0 = batteries_mem_ptr; dest = &mut (*(*dest).0).parent_batteries; @@ -202,32 +202,32 @@ struct BatteriesListMem { } impl Persist for BatteriesList { - unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { let mut bytes = 0; let mut list = *self; loop { if list.0.is_null() { break; } - if pma.contains(list.0, 1) { + if pma_contains(list.0, 1) { break; } bytes += size_of::(); - bytes += (*list.0).batteries.space_needed(stack, pma); + bytes += (*list.0).batteries.space_needed(stack); list = (*list.0).next; } bytes } - unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { let mut dest = self; loop { if (*dest).0.is_null() { break; } - if pma.contains((*dest).0, 1) { + if pma_contains((*dest).0, 1) { break; } @@ -236,7 +236,7 @@ 
impl Persist for BatteriesList { *buffer = list_mem_ptr.add(1) as *mut u8; (*dest).0 = list_mem_ptr; - (*(*dest).0).batteries.copy_to_buffer(stack, pma, buffer); + (*(*dest).0).batteries.copy_to_buffer(stack, buffer); dest = &mut (*(*dest).0).next; } } @@ -323,7 +323,7 @@ struct NounListMem { } impl Persist for NounList { - unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { let mut bytes: usize = 0; let mut list = *self; @@ -331,26 +331,26 @@ impl Persist for NounList { if list.0.is_null() { break; } - if pma.contains(list.0, 1) { + if pma_contains(list.0, 1) { break; } bytes += size_of::(); - bytes += (*list.0).element.space_needed(stack, pma); + bytes += (*list.0).element.space_needed(stack); list = (*list.0).next; } bytes } - unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { let mut dest = self; loop { if (*dest).0.is_null() { break; } - if pma.contains((*dest).0, 1) { + if pma_contains((*dest).0, 1) { break; } @@ -359,7 +359,7 @@ impl Persist for NounList { *buffer = noun_list_mem_ptr.add(1) as *mut u8; (*dest).0 = noun_list_mem_ptr; - (*(*dest).0).element.copy_to_buffer(stack, pma, buffer); + (*(*dest).0).element.copy_to_buffer(stack, buffer); dest = &mut (*(*dest).0).next; } @@ -452,20 +452,20 @@ struct ColdMem { } impl Persist for Cold { - unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { - if pma.contains(self.0, 1) { + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { + if pma_contains(self.0, 1) { return 0; } let mut bytes = size_of::(); - bytes += (*(*self).0).battery_to_paths.space_needed(stack, pma); - bytes += (*(*self).0).root_to_paths.space_needed(stack, pma); - bytes += (*(*self).0).path_to_batteries.space_needed(stack, pma); + bytes += (*(*self).0).battery_to_paths.space_needed(stack); + bytes += (*(*self).0).root_to_paths.space_needed(stack); + bytes += (*(*self).0).path_to_batteries.space_needed(stack); bytes } - unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { - if pma.contains(self.0, 1) { + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { + if pma_contains(self.0, 1) { return; } @@ -477,13 +477,13 @@ impl Persist for Cold { (*(*self).0) .battery_to_paths - .copy_to_buffer(stack, pma, buffer); + .copy_to_buffer(stack, buffer); (*(*self).0) .root_to_paths - .copy_to_buffer(stack, pma, buffer); + .copy_to_buffer(stack, buffer); (*(*self).0) .path_to_batteries - .copy_to_buffer(stack, pma, buffer); + .copy_to_buffer(stack, buffer); } unsafe fn handle_to_u64(&self) -> u64 { diff --git a/rust/ares/src/mem.rs b/rust/ares/src/mem.rs index b8259a4..83fe4b4 100644 --- a/rust/ares/src/mem.rs +++ b/rust/ares/src/mem.rs @@ -2,6 +2,7 @@ use crate::assert_acyclic; use crate::assert_no_forwarding_pointers; use crate::assert_no_junior_pointers; use crate::noun::{Atom, Cell, CellMemory, IndirectAtom, Noun, NounAllocator}; +use crate::persist::PMA; use assert_no_alloc::permit_alloc; use either::Either::{self, Left, Right}; use ibig::Stack; @@ -50,6 +51,7 @@ pub struct NockStack { alloc_pointer: *mut u64, /** MMap which must be kept alive as long as this NockStack is */ memory: MmapMut, + /** PMA from which we will copy into the NockStack */ /** Whether or not pre_copy() has been called on the current stack frame. 
*/ pc: bool, } diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index 730bb72..eae490c 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -8,6 +8,7 @@ use std::ffi::{c_void, CString}; use std::mem::size_of; use std::path::PathBuf; use std::ptr::copy_nonoverlapping; +use std::sync::OnceLock; const PMA_MODE: mode_t = 0o600; // RW for user only const PMA_FLAGS: ULONG = 0; // ignored for now @@ -15,66 +16,88 @@ const PMA_FLAGS: ULONG = 0; // ignored for now const NOUN_MARKED: u64 = 1 << 63; /// Handle to a PMA -pub struct PMA(*mut BT_state); +#[derive(Copy,Clone)] +struct PMAState(*mut BT_state); -impl PMA { - #[cfg(unix)] - pub fn open(path: PathBuf) -> Result { - let mut state: *mut BT_state = std::ptr::null_mut(); +pub const PMA: OnceLock = OnceLock::new(); - // correct for Unix thus cfg gated - let path_cstring = CString::new(path.into_os_string().as_encoded_bytes())?; - unsafe { - bt_state_new(&mut state); - let err = bt_state_open(state, path_cstring.as_ptr(), PMA_FLAGS, PMA_MODE); - if err == 0 { - Ok(PMA(state)) - } else { - // XX need to free the state - Err(std::io::Error::from_raw_os_error(err)) - } - } - } +fn get_pma_state() -> Option { + PMA.get().map(|r| { *r }) +} - #[cfg(windows)] - pub fn open(path: PathBuf) -> Result { - unimplemented!() - } +fn pma_state_err() -> std::io::Error { + std::io::Error::new(std::io::ErrorKind::AlreadyExists, "PMA") +} - #[inline] - pub fn meta_get(&self, field: usize) -> u64 { - unsafe { bt_meta_get(self.0, field) } - } +#[cfg(unix)] +pub fn pma_open(path: PathBuf) -> Result<(), std::io::Error> { + let mut state: *mut BT_state = std::ptr::null_mut(); - #[inline] - pub fn meta_set(&self, field: usize, val: u64) { - unsafe { bt_meta_set(self.0, field, val) }; - } - - pub unsafe fn contains(&self, ptr: *const T, count: usize) -> bool { - bt_inbounds(self.0, ptr as *mut c_void) != 0 - && bt_inbounds(self.0, ptr.add(count) as *mut c_void) != 0 - } - - pub fn sync(&self) { - unsafe { - if bt_sync(self.0) != 0 { - panic!("PMA sync failed but did not abort: this should never happen."); - } - } - } - - pub fn close(self) -> Result<(), std::io::Error> { - // XX need a way to free the state after - let err = unsafe { bt_state_close(self.0) }; + // correct for Unix thus cfg gated + let path_cstring = CString::new(path.into_os_string().as_encoded_bytes())?; + unsafe { + bt_state_new(&mut state); + let err = bt_state_open(state, path_cstring.as_ptr(), PMA_FLAGS, PMA_MODE); if err == 0 { + PMA.set(PMAState(state)); //.or(Err(std::io::Error::new(std::io::ErrorKind::AlreadyExists, "PMA")))? 
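// [editorial aside, not part of the patch] The line above stashes the freshly opened
// BT_state handle in the process-global OnceLock that this commit's message describes.
// Below is a minimal, self-contained sketch of that open-once pattern; `GlobalHandle`,
// `HANDLE`, `open_once`, and `get_handle` are illustrative names, not Ares APIs, and
// the raw pointer is stood in for by a u64 (the later fixup commit does the same so
// the state can live in a `static`).
use std::sync::OnceLock;

#[derive(Copy, Clone)]
struct GlobalHandle(u64); // stand-in for a *mut BT_state stored as an integer

static HANDLE: OnceLock<GlobalHandle> = OnceLock::new();

fn open_once(raw: u64) -> Result<(), &'static str> {
    // set() fails if the cell is already initialized, enforcing one-time-only opening
    HANDLE.set(GlobalHandle(raw)).map_err(|_| "PMA already opened")
}

fn get_handle() -> Option<u64> {
    HANDLE.get().map(|h| h.0)
}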
Ok(()) } else { + // XX need to free the state Err(std::io::Error::from_raw_os_error(err)) } } } +#[cfg(windows)] +pub fn pma_open(path: PathBuf) -> Result { + unimplemented!() +} + +pub fn pma_close() -> Result<(), std::io::Error> { + // XX need a way to free the state after + let err = unsafe { bt_state_close(get_pma_state().ok_or_else(pma_state_err)?.0) }; + if err == 0 { + Ok(()) + } else { + Err(std::io::Error::from_raw_os_error(err)) + } +} + +#[inline] +pub fn pma_meta_get(field: usize) -> u64 { + unsafe { bt_meta_get(get_pma_state().unwrap().0, field) } +} + +#[inline] +pub fn pma_meta_set(field: usize, val: u64) { + unsafe { bt_meta_set(get_pma_state().unwrap().0, field, val) }; +} + +pub unsafe fn pma_contains(ptr: *const T, count: usize) -> bool { + if let Some(pma_state) = get_pma_state() { + bt_inbounds(pma_state.0, ptr as *mut c_void) != 0 + && bt_inbounds(pma_state.0, ptr.add(count) as *mut c_void) != 0 + } else { + false + } +} + +pub fn pma_sync() { + unsafe { + if bt_sync(get_pma_state().unwrap().0) != 0 { + panic!("PMA sync failed but did not abort: this should never happen."); + } + } +} + +pub unsafe fn pma_dirty(ptr: *mut T, count: usize) { + let lo = bt_page_round_down(ptr); + let hi = bt_page_round_up(ptr.add(count)); + let e = bt_dirty(get_pma_state().unwrap().0, lo, hi); + assert!(e == 0); +} + + /** * This trait defines operations for copying a structure into the PMA. * @@ -92,17 +115,17 @@ impl PMA { pub trait Persist { /// Count how much space is needed, in bytes. May set marks so long as marks are cleaned up by /// [copy_into_buffer] - unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize; + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize; /// Copy into the provided buffer, which may be assumed to be at least as large as the size /// returned by [space_needed] on the same structure. - unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8); + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8); /// Persist an object into the PMA using [space_needed] and [copy_to_buffer], returning /// a [u64] (probably a pointer or tagged pointer) that can be saved into metadata. 
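// [editorial aside, not part of the patch] The default save_to_pma shown just below
// sizes its bt_malloc request by rounding the byte count from space_needed up to whole
// PMA pages via (space + BT_PAGESIZE - 1) >> BT_PAGEBITS. A tiny standalone sketch of
// that arithmetic, using placeholder constants (the real BT_PAGEBITS / BT_PAGESIZE
// values come from the ares_pma bindings):
const PAGE_BITS: u32 = 14; // assumed value, for illustration only
const PAGE_SIZE: usize = 1 << PAGE_BITS;

fn pages_needed(space: usize) -> usize {
    // add (PAGE_SIZE - 1) so any partial page rounds up, then divide by shifting
    (space + (PAGE_SIZE - 1)) >> PAGE_BITS
}
// e.g. pages_needed(1) == 1, pages_needed(PAGE_SIZE) == 1, pages_needed(PAGE_SIZE + 1) == 2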
- unsafe fn save_to_pma(&mut self, stack: &mut NockStack, pma: &PMA) -> u64 { + unsafe fn save_to_pma(&mut self, stack: &mut NockStack) -> u64 { unsafe { - let space = self.space_needed(stack, pma); + let space = self.space_needed(stack); if space == 0 { return self.handle_to_u64(); @@ -110,9 +133,9 @@ pub trait Persist { let space_as_pages = (space + (BT_PAGESIZE as usize - 1)) >> BT_PAGEBITS; - let mut buffer = bt_malloc(pma.0, space_as_pages) as *mut u8; + let mut buffer = bt_malloc(get_pma_state().unwrap().0, space_as_pages) as *mut u8; let orig_buffer = buffer; - self.copy_to_buffer(stack, pma, &mut buffer); + self.copy_to_buffer(stack, &mut buffer); let space_isize: isize = space.try_into().unwrap(); assert!(buffer.offset_from(orig_buffer) == space_isize); self.handle_to_u64() @@ -137,10 +160,10 @@ unsafe fn unmark(a: Allocated) { } impl Persist for Atom { - unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { if let Ok(indirect) = self.as_indirect() { let count = indirect.raw_size(); - if !pma.contains(indirect.to_raw_pointer(), count) { + if !pma_contains(indirect.to_raw_pointer(), count) { if !mark(indirect.as_allocated()) { return count * size_of::(); } @@ -149,10 +172,10 @@ impl Persist for Atom { 0 } - unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { if let Ok(mut indirect) = self.as_indirect() { let count = indirect.raw_size(); - if !pma.contains(indirect.to_raw_pointer(), count) { + if !pma_contains(indirect.to_raw_pointer(), count) { if let Some(forward) = indirect.forwarding_pointer() { *self = forward.as_atom(); } else { @@ -178,7 +201,7 @@ impl Persist for Atom { } impl Persist for Noun { - unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { let mut space = 0usize; stack.frame_push(0); *(stack.push::()) = *self; @@ -191,10 +214,10 @@ impl Persist for Noun { match noun.as_either_atom_cell() { Left(mut atom) => { - space += atom.space_needed(stack, pma); + space += atom.space_needed(stack); } Right(cell) => { - if !pma.contains(cell.to_raw_pointer(), 1) { + if !pma_contains(cell.to_raw_pointer(), 1) { if !mark(cell.as_allocated()) { space += size_of::(); (*stack.push::()) = cell.tail(); @@ -208,7 +231,7 @@ impl Persist for Noun { space } - unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { let mut buffer_u64 = (*buffer) as *mut u64; stack.frame_push(0); *(stack.push::<*mut Noun>()) = (self as *mut Noun); @@ -232,7 +255,7 @@ impl Persist for Noun { match allocated.as_either() { Left(mut indirect) => { let count = indirect.raw_size(); - if pma.contains(indirect.to_raw_pointer(), count) { + if pma_contains(indirect.to_raw_pointer(), count) { continue; } @@ -243,7 +266,7 @@ impl Persist for Noun { buffer_u64 = buffer_u64.add(count); } Right(mut cell) => { - if pma.contains(cell.to_raw_pointer(), 1) { + if pma_contains(cell.to_raw_pointer(), 1) { continue; } @@ -276,3 +299,16 @@ impl Persist for Noun { Noun::from_raw(meta_handle) } } + +/** Mask to mask out pointer bits not aligned with a BT_PAGESIZE page */ +const BT_PAGEBITS_MASK_OUT: u64 = !((1 << BT_PAGEBITS) - 1); + +// round an address down to a page boundary +fn 
bt_page_round_down(ptr: *mut T) -> *mut c_void { + ((ptr as u64) & BT_PAGEBITS_MASK_OUT) as *mut c_void +} + +// round an address up to a page boundary +fn bt_page_round_up(ptr: *mut T) -> *mut c_void { + (((ptr as u64) + (BT_PAGESIZE as u64) - 1) & BT_PAGEBITS_MASK_OUT) as *mut c_void +} diff --git a/rust/ares/src/serf.rs b/rust/ares/src/serf.rs index 30effee..698241e 100644 --- a/rust/ares/src/serf.rs +++ b/rust/ares/src/serf.rs @@ -1,3 +1,4 @@ +use crate::persist::pma_meta_set; use crate::hamt::Hamt; use crate::interpreter; use crate::interpreter::{inc, interpret, Error}; @@ -11,7 +12,7 @@ use crate::mem::Preserve; use crate::mug::*; use crate::newt::Newt; use crate::noun::{Atom, Cell, DirectAtom, Noun, Slots, D, T}; -use crate::persist::{Persist, PMA}; +use crate::persist::{Persist, pma_open, pma_meta_get, pma_sync}; use crate::trace::*; use ares_macros::tas; use signal_hook; @@ -37,26 +38,26 @@ enum BTMetaField { struct Snapshot(pub *mut SnapshotMem); impl Persist for Snapshot { - unsafe fn space_needed(&mut self, stack: &mut NockStack, pma: &PMA) -> usize { + unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { let mut arvo = (*(self.0)).arvo; let mut cold = (*(self.0)).cold; - let arvo_space_needed = arvo.space_needed(stack, pma); - let cold_space_needed = cold.space_needed(stack, pma); + let arvo_space_needed = arvo.space_needed(stack); + let cold_space_needed = cold.space_needed(stack); (((size_of::() + 7) >> 3) << 3) + arvo_space_needed + cold_space_needed } - unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, pma: &PMA, buffer: &mut *mut u8) { + unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { let snapshot_buffer = *buffer as *mut SnapshotMem; std::ptr::copy_nonoverlapping(self.0, snapshot_buffer, 1); *self = Snapshot(snapshot_buffer); *buffer = snapshot_buffer.add(1) as *mut u8; let mut arvo = (*snapshot_buffer).arvo; - arvo.copy_to_buffer(stack, pma, buffer); + arvo.copy_to_buffer(stack, buffer); (*snapshot_buffer).arvo = arvo; let mut cold = (*snapshot_buffer).cold; - cold.copy_to_buffer(stack, pma, buffer); + cold.copy_to_buffer(stack, buffer); (*snapshot_buffer).cold = cold; } @@ -83,7 +84,6 @@ const PMA_CURRENT_SNAPSHOT_VERSION: u64 = 1; struct Context { epoch: u64, event_num: u64, - pma: PMA, arvo: Noun, mug: u32, nock_context: interpreter::Context, @@ -95,19 +95,19 @@ impl Context { trace_info: Option, constant_hot_state: &[HotEntry], ) -> Context { - let mut pma = PMA::open(snap_path).expect("serf: pma open failed"); + pma_open(snap_path).expect("serf: pma open failed"); - let snapshot_version = pma.meta_get(BTMetaField::SnapshotVersion as usize); + let snapshot_version = pma_meta_get(BTMetaField::SnapshotVersion as usize); let snapshot = match snapshot_version { 0 => None, 1 => Some(unsafe { - Snapshot::handle_from_u64(pma.meta_get(BTMetaField::Snapshot as usize)) + Snapshot::handle_from_u64(pma_meta_get(BTMetaField::Snapshot as usize)) }), _ => panic!("Unsupported snapshot version"), }; - Context::new(trace_info, pma, snapshot, constant_hot_state) + Context::new(trace_info, snapshot, constant_hot_state) } pub fn save(&mut self) { @@ -123,7 +123,7 @@ impl Context { snapshot_mem_ptr }); - let handle = snapshot.save_to_pma(&mut self.nock_context.stack, &mut self.pma); + let handle = snapshot.save_to_pma(&mut self.nock_context.stack); self.epoch = (*snapshot.0).epoch; self.arvo = (*snapshot.0).arvo; @@ -132,16 +132,15 @@ impl Context { handle }; - self.pma.meta_set( + pma_meta_set( BTMetaField::SnapshotVersion as 
usize, PMA_CURRENT_SNAPSHOT_VERSION, ); - self.pma.meta_set(BTMetaField::Snapshot as usize, handle); + pma_meta_set(BTMetaField::Snapshot as usize, handle); } fn new( trace_info: Option, - pma: PMA, snapshot: Option, constant_hot_state: &[HotEntry], ) -> Self { @@ -179,7 +178,6 @@ impl Context { Context { epoch, event_num, - pma, arvo, mug, nock_context, @@ -203,14 +201,6 @@ impl Context { self.mug = mug_u32(&mut self.nock_context.stack, self.arvo); } - // - // Snapshot functions - // - - pub fn sync(&mut self) { - self.pma.sync() - } - // // Newt functions // @@ -348,8 +338,7 @@ pub fn serf(constant_hot_state: &[HotEntry]) -> io::Result<()> { tas!(b"exit") => eprintln!("exit"), tas!(b"save") => { // XX what is eve for? - eprintln!("save"); - context.sync(); + pma_sync(); } tas!(b"meld") => eprintln!("meld"), tas!(b"pack") => eprintln!("pack"), diff --git a/rust/ares/src/unifying_equality.rs b/rust/ares/src/unifying_equality.rs index e133018..3ca3675 100644 --- a/rust/ares/src/unifying_equality.rs +++ b/rust/ares/src/unifying_equality.rs @@ -2,6 +2,7 @@ use crate::assert_acyclic; use crate::assert_no_forwarding_pointers; use crate::assert_no_junior_pointers; use crate::mem::{NockStack, FRAME, STACK, ALLOC}; +use crate::persist::{pma_contains, pma_dirty}; use crate::noun::Noun; use either::Either::*; use libc::{c_void, memcmp}; @@ -101,8 +102,14 @@ pub unsafe fn unifying_equality(stack: &mut NockStack, a: *mut Noun, b: *mut Nou { let (_senior, junior) = senior_pointer_first(stack, x_as_ptr, y_as_ptr); if x_as_ptr == junior { + if pma_contains(x, 1) { + pma_dirty(x, 1); + } *x = *y; } else { + if pma_contains(y, 1) { + pma_dirty(y, 1); + } *y = *x; } stack.pop::<(*mut Noun, *mut Noun)>(); From bdccac7ff29b53538e0926e804395d0905e58686 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Thu, 14 Dec 2023 22:17:25 -0600 Subject: [PATCH 100/128] pma: fixup of global static PMA --- rust/ares/src/interpreter.rs | 1 - rust/ares/src/mem.rs | 1 - rust/ares/src/persist.rs | 27 ++++++++++++++------------- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/rust/ares/src/interpreter.rs b/rust/ares/src/interpreter.rs index 6fddbbe..ba342e6 100644 --- a/rust/ares/src/interpreter.rs +++ b/rust/ares/src/interpreter.rs @@ -23,7 +23,6 @@ use std::result; use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Instant; -use crate::persist::PMA; crate::gdb!(); diff --git a/rust/ares/src/mem.rs b/rust/ares/src/mem.rs index 83fe4b4..0412b9a 100644 --- a/rust/ares/src/mem.rs +++ b/rust/ares/src/mem.rs @@ -2,7 +2,6 @@ use crate::assert_acyclic; use crate::assert_no_forwarding_pointers; use crate::assert_no_junior_pointers; use crate::noun::{Atom, Cell, CellMemory, IndirectAtom, Noun, NounAllocator}; -use crate::persist::PMA; use assert_no_alloc::permit_alloc; use either::Either::{self, Left, Right}; use ibig::Stack; diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index eae490c..898c6ed 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -17,12 +17,12 @@ const NOUN_MARKED: u64 = 1 << 63; /// Handle to a PMA #[derive(Copy,Clone)] -struct PMAState(*mut BT_state); +struct PMAState(u64); // this is idiotic but necessary for Rust to let us put this in a oncelock -pub const PMA: OnceLock = OnceLock::new(); +static PMA: OnceLock = OnceLock::new(); -fn get_pma_state() -> Option { - PMA.get().map(|r| { *r }) +fn get_pma_state() -> Option<*mut BT_state> { + PMA.get().map(|r| { r.0 as *mut BT_state }) } fn pma_state_err() -> std::io::Error { @@ -39,7 +39,8 @@ pub fn 
pma_open(path: PathBuf) -> Result<(), std::io::Error> { bt_state_new(&mut state); let err = bt_state_open(state, path_cstring.as_ptr(), PMA_FLAGS, PMA_MODE); if err == 0 { - PMA.set(PMAState(state)); //.or(Err(std::io::Error::new(std::io::ErrorKind::AlreadyExists, "PMA")))? + PMA.set(PMAState(state as u64)).or_else(|state| { Err(state.0 as *mut BT_state) }).expect("PMA state already initialized to:"); + assert!(get_pma_state().is_some()); Ok(()) } else { // XX need to free the state @@ -55,7 +56,7 @@ pub fn pma_open(path: PathBuf) -> Result { pub fn pma_close() -> Result<(), std::io::Error> { // XX need a way to free the state after - let err = unsafe { bt_state_close(get_pma_state().ok_or_else(pma_state_err)?.0) }; + let err = unsafe { bt_state_close(get_pma_state().ok_or_else(pma_state_err)?) }; if err == 0 { Ok(()) } else { @@ -65,18 +66,18 @@ pub fn pma_close() -> Result<(), std::io::Error> { #[inline] pub fn pma_meta_get(field: usize) -> u64 { - unsafe { bt_meta_get(get_pma_state().unwrap().0, field) } + unsafe { bt_meta_get(get_pma_state().unwrap(), field) } } #[inline] pub fn pma_meta_set(field: usize, val: u64) { - unsafe { bt_meta_set(get_pma_state().unwrap().0, field, val) }; + unsafe { bt_meta_set(get_pma_state().unwrap(), field, val) }; } pub unsafe fn pma_contains(ptr: *const T, count: usize) -> bool { if let Some(pma_state) = get_pma_state() { - bt_inbounds(pma_state.0, ptr as *mut c_void) != 0 - && bt_inbounds(pma_state.0, ptr.add(count) as *mut c_void) != 0 + bt_inbounds(pma_state, ptr as *mut c_void) != 0 + && bt_inbounds(pma_state, ptr.add(count) as *mut c_void) != 0 } else { false } @@ -84,7 +85,7 @@ pub unsafe fn pma_contains(ptr: *const T, count: usize) -> bool { pub fn pma_sync() { unsafe { - if bt_sync(get_pma_state().unwrap().0) != 0 { + if bt_sync(get_pma_state().unwrap()) != 0 { panic!("PMA sync failed but did not abort: this should never happen."); } } @@ -93,7 +94,7 @@ pub fn pma_sync() { pub unsafe fn pma_dirty(ptr: *mut T, count: usize) { let lo = bt_page_round_down(ptr); let hi = bt_page_round_up(ptr.add(count)); - let e = bt_dirty(get_pma_state().unwrap().0, lo, hi); + let e = bt_dirty(get_pma_state().unwrap(), lo, hi); assert!(e == 0); } @@ -133,7 +134,7 @@ pub trait Persist { let space_as_pages = (space + (BT_PAGESIZE as usize - 1)) >> BT_PAGEBITS; - let mut buffer = bt_malloc(get_pma_state().unwrap().0, space_as_pages) as *mut u8; + let mut buffer = bt_malloc(get_pma_state().unwrap(), space_as_pages) as *mut u8; let orig_buffer = buffer; self.copy_to_buffer(stack, &mut buffer); let space_isize: isize = space.try_into().unwrap(); From ff204263ff2cf81cda5e9d3c740d8f235da2fdd9 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 15 Dec 2023 00:50:26 -0500 Subject: [PATCH 101/128] pma: misc fixes - primarily fixes a bug in _mlist_insert and _pending_flist_insert - also updates some tests - disables node data printing on _bt_insertdat with DEBUG_PRINTNODE macro --- rust/ares_pma/c-src/btest.c | 14 ++++++++++++- rust/ares_pma/c-src/btree.c | 42 ++++++++++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c index 69b4f7c..c707a79 100644 --- a/rust/ares_pma/c-src/btest.c +++ b/rust/ares_pma/c-src/btest.c @@ -102,7 +102,7 @@ int main(int argc, char *argv[]) lohi_pair allocs[ITERATIONS] = {0}; for (size_t i = 0; i < ITERATIONS; i++) { /* malloc a random number of pages <= 256 and store in the allocs array */ - int pages = rand(); + int pages = random(); pages &= 
MAXALLOCPG; pages += 1; allocs[i].lo = bt_malloc(state3, pages); @@ -110,7 +110,19 @@ int main(int argc, char *argv[]) } /* sync the state */ + /* bt_sync(state3); */ + + /* TODO: close and reopen state. validate ephemeral structures */ + + for (size_t i = 0; i < ITERATIONS / 2; i++) { + /* free half of the allocations */ + bt_free(state3, allocs[i].lo, allocs[i].hi); + } + + /* resync the state */ bt_sync(state3); + /* TODO: close and reopen state. validate ephemeral structures */ + return 0; } diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 1b9dda3..6ba3d2a 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -50,6 +50,9 @@ STATIC_ASSERT(0, "debugger break instruction unimplemented"); /* ;;: remove once confident in logic and delete all code dependencies on state->node_freelist */ +/* prints a node before and after a call to _bt_insertdat */ +#define DEBUG_PRINTNODE 0 + #define ZERO(s, n) memset((s), 0, (n)) #define S7(A, B, C, D, E, F, G) A##B##C##D##E##F##G @@ -650,8 +653,10 @@ static int _bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo, BT_page *parent, size_t childidx) { +#if DEBUG_PRINTNODE DPRINTF("BEFORE INSERT lo %" PRIu32 " hi %" PRIu32 " fo %" PRIu32, lo, hi, fo); _bt_printnode(parent); +#endif /* ;;: TODO confirm this logic is appropriate for branch nodes. (It /should/ be correct for leaf nodes) */ @@ -691,8 +696,10 @@ _bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo, : lfo + (hi - lva); } +#if DEBUG_PRINTNODE DPUTS("AFTER INSERT"); _bt_printnode(parent); +#endif return BT_SUCC; } @@ -770,20 +777,30 @@ _bt_delco_1pass(BT_state *state, vaof_t lo, vaof_t hi) static void _mlist_insert(BT_state *state, void *lo, void *hi) -{ +{ /* ;;: this logic could be simplified with indirect pointers */ BT_mlistnode *head = state->mlist; BYTE *lob = lo; BYTE *hib = hi; assert(head); + /* special case: freed chunk precedes but is not contiguous with head */ + if (hi < head->va) { + BT_mlistnode *new = calloc(1, sizeof *new); + new->sz = (hib - lob); + new->va = lob; + new->next = head; + state->mlist = new; + return; + } + while (head) { BYTE *vob = head->va; size_t siz = head->sz; BYTE *nob = head->next ? head->next->va : 0; /* freed chunk immediately precedes head */ - if (hi == vob) { + if (hib == vob) { head->va = lo; head->sz += (hib - lob); return; @@ -923,7 +940,7 @@ _pending_nlist_merge(BT_state *state) static void _pending_flist_insert(BT_state *state, pgno_t pg, size_t sz) -{ +{ /* ;;: again, this logic could probably be simplified with an indirect pointer */ BT_flistnode *head = state->pending_flist; /* freelist may be empty. 
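A side note on the _mlist_insert/_pending_flist_insert fixes in this patch: the sorted-freelist insertion they repair can be written without the empty-list and before-head special cases by walking the list through an indirect pointer, as the ;;: comments themselves suggest. Below is a minimal, self-contained sketch of that shape; fnode is a simplified stand-in for BT_flistnode (pg/sz counted in pages), it assumes freed runs never overlap, and it is not the code used in btree.c.

#include <stdint.h>
#include <stdlib.h>

typedef uint32_t pgno_t;

typedef struct fnode fnode;
struct fnode {
  pgno_t pg;     /* first page of the free run */
  size_t sz;     /* length of the run in pages */
  fnode *next;
};

/* Insert a freed run [pg, pg + sz) keeping the list sorted by page number
 * and merging runs that touch. Walking via an indirect pointer covers the
 * "list is empty" and "chunk sorts before head" cases without treating
 * them specially. */
static void
flist_insert(fnode **head, pgno_t pg, size_t sz)
{
  fnode **p = head;

  /* stop at the first node whose run ends at or after the freed run */
  while (*p && (*p)->pg + (*p)->sz < pg)
    p = &(*p)->next;

  /* freed run extends the node we stopped at on its right */
  if (*p && (*p)->pg + (*p)->sz == pg) {
    (*p)->sz += sz;
    fnode *n = (*p)->next;
    if (n && (*p)->pg + (*p)->sz == n->pg) {   /* now touches its successor */
      (*p)->sz += n->sz;
      (*p)->next = n->next;
      free(n);
    }
    return;
  }

  /* freed run ends exactly where the node we stopped at begins */
  if (*p && pg + sz == (*p)->pg) {
    (*p)->pg = pg;
    (*p)->sz += sz;
    return;
  }

  /* otherwise link in a fresh node at the insertion point */
  fnode *new = calloc(1, sizeof *new);
  new->pg = pg;
  new->sz = sz;
  new->next = *p;
  *p = new;
}
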
create head */ @@ -955,12 +972,21 @@ _pending_flist_insert(BT_state *state, pgno_t pg, size_t sz) return; } - /* otherwise, insert a new node following head */ + /* otherwise, insert a new node either preceding or following head */ BT_flistnode *new = calloc(1, sizeof *new); new->pg = pg; new->sz = sz; - new->next = head->next; - head->next = new; + + if (head->pg < pg + sz) { + /* should only happen if head is the first node in the freelist */ + assert(head == state->pending_flist); + new->next = head; + state->pending_flist = new; + } + else { + new->next = head->next; + head->next = new; + } } static void @@ -3018,8 +3044,8 @@ _sham_sync(BT_state *state) static void _bt_printnode(BT_page *node) { - printf("node: %p\n", node); - printf("data: \n"); + fprintf(stderr, "node: %p\n", node); + fprintf(stderr, "data: \n"); for (size_t i = 0; i < BT_DAT_MAXKEYS; ++i) { if (i && node->datk[i].va == 0) break; From a37cf3e779397fcfb4de7ace87d2c9c00d3d6a3b Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Fri, 15 Dec 2023 00:15:04 -0600 Subject: [PATCH 102/128] pma: when persisting hamt check if buffer already in PMA --- rust/ares/src/hamt.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index 845d773..dcccb5d 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -719,18 +719,31 @@ impl Persist for Hamt { let stem_ptr: *mut Stem = &mut (*next_entry_ptr).stem; let stem_size = (*stem_ptr).size(); + + if pma_contains((*stem_ptr).buffer, stem_size) { + continue; + } + let stem_buffer_ptr = *buffer as *mut Entry; copy_nonoverlapping((*stem_ptr).buffer, stem_buffer_ptr, stem_size); *buffer = stem_buffer_ptr.add(stem_size) as *mut u8; (*stem_ptr).buffer = stem_buffer_ptr; - traversal[depth + 1] = *stem_ptr; depth += 1; } else { // Leaf case let leaf_ptr: *mut Leaf = &mut (*next_entry_ptr).leaf; + + if (*leaf_ptr).len == 0 { + continue; + } + + if pma_contains((*leaf_ptr).buffer, (*leaf_ptr).len) { + continue; + } + let leaf_buffer_ptr = *buffer as *mut (Noun, T); copy_nonoverlapping((*leaf_ptr).buffer, leaf_buffer_ptr, (*leaf_ptr).len); From 40a04ec8ab30324f6998dad2a32415f0f9d647d6 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 15 Dec 2023 01:19:28 -0500 Subject: [PATCH 103/128] pma: _bt_printnode print to stderr --- rust/ares_pma/c-src/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 6ba3d2a..ef30dc2 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -3049,7 +3049,7 @@ _bt_printnode(BT_page *node) for (size_t i = 0; i < BT_DAT_MAXKEYS; ++i) { if (i && node->datk[i].va == 0) break; - printf("[%5zu] %10x %10x\n", i, node->datk[i].va, node->datk[i].fo); + fprintf(stderr, "[%5zu] %10x %10x\n", i, node->datk[i].va, node->datk[i].fo); } } From 965865da139e5470315e8fd52d1512a91f149219 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 15 Dec 2023 03:24:42 -0500 Subject: [PATCH 104/128] pma: mmap freespace ahead of first node partition --- rust/ares_pma/c-src/btree.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index ef30dc2..6ffbd53 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2313,6 +2313,17 @@ _bt_state_load(BT_state *state) abort(); } + BYTE *nullspace_addr = BT_MAPADDR + (BT_META_SECTION_WIDTH + BLK_BASE_LEN0); + size_t nullspace_len = BT_ADDRSIZE - (BT_META_SECTION_WIDTH + 
BLK_BASE_LEN0); + if (nullspace_addr != mmap(nullspace_addr, + nullspace_len, + BT_PROT_FREE, + BT_FLAG_FREE, + 0, 0)) { + DPRINTF("mmap: failed to map at addr %p, errno: %s", nullspace_addr, strerror(errno)); + abort(); + } + p = (BT_page *)state->map; state->meta_pages[0] = METADATA(p); state->meta_pages[1] = METADATA(p + 1); From 031e583c3c1def2d59753c54397d86238e6f8229 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 15 Dec 2023 05:38:35 -0500 Subject: [PATCH 105/128] pma: fix DPRINTF in bt_malloc --- rust/ares_pma/c-src/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 6ffbd53..a836465 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2670,7 +2670,7 @@ bt_malloc(BT_state *state, size_t pages) addr2off(ret) + pages, pgno); - DPRINTF("map %p to offset 0x%zx bytes (0x%x pages)\n", ret, P2BYTES(pgno), pgno); + DPRINTF("map %p to offset 0x%zx bytes (0x%zx pages)\n", ret, P2BYTES(pgno), pages); if (ret != mmap(ret, P2BYTES(pages), From 66aaa255f71673fa5d1232c42e4d95c1da2c60cb Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 15 Dec 2023 06:36:30 -0500 Subject: [PATCH 106/128] pma: fix freelist node size calculation in _mlist_insert --- rust/ares_pma/c-src/btree.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index a836465..bd7eeee 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -787,7 +787,7 @@ _mlist_insert(BT_state *state, void *lo, void *hi) /* special case: freed chunk precedes but is not contiguous with head */ if (hi < head->va) { BT_mlistnode *new = calloc(1, sizeof *new); - new->sz = (hib - lob); + new->sz = B2PAGES(hib - lob); new->va = lob; new->next = head; state->mlist = new; @@ -802,19 +802,19 @@ _mlist_insert(BT_state *state, void *lo, void *hi) /* freed chunk immediately precedes head */ if (hib == vob) { head->va = lo; - head->sz += (hib - lob); + head->sz += B2PAGES(hib - lob); return; } /* freed chunk immediately follows termination of head */ if (vob + siz == lo) { - head->sz += (hib - lob); + head->sz += B2PAGES(hib - lob); return; } /* freed chunk between head and next but not contiguous */ if (lob > vob + siz && hib < nob) { BT_mlistnode *new = calloc(1, sizeof *new); - new->sz = (hib - lob); + new->sz = B2PAGES(hib - lob); new->va = lob; new->next = head->next; head->next = new; @@ -824,7 +824,7 @@ _mlist_insert(BT_state *state, void *lo, void *hi) } /* freelist completely searched. 
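The substance of the fix above is a units error: mlist node sizes are counted in pages, while subtracting the lo/hi byte pointers yields bytes. A small sketch of the conversion follows, under the assumption that BT_PAGEBITS is the log2 of the page size; the value 14 below is an assumption for illustration, not taken from btree.h, and the real B2PAGES/P2BYTES macros may round differently.

#include <assert.h>
#include <stddef.h>

#define BT_PAGEBITS 14u                     /* assumption: 16K pages */
#define BT_PAGESIZE (1u << BT_PAGEBITS)

#define B2PAGES(bytes) ((size_t)(bytes) >> BT_PAGEBITS)
#define P2BYTES(pages) ((size_t)(pages) << BT_PAGEBITS)

int main(void)
{
  /* a freed chunk [lo, hi) measured as a byte-offset span... */
  size_t lo = P2BYTES(3);
  size_t hi = P2BYTES(7);

  /* ...must be recorded in the mlist in pages: storing (hib - lob)
   * directly, as before this fix, overstates the free space by a factor
   * of the page size and breaks the size accounting checked in btest.c */
  assert(B2PAGES(hi - lo) == 4);
  return 0;
}
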
Chunk must be at tail and not contiguous */ BT_mlistnode *new = calloc(1, sizeof *new); - new->sz = (hib - lob); + new->sz = B2PAGES(hib - lob); new->va = lob; new->next = head->next; head->next = new; @@ -1475,7 +1475,6 @@ _mlist_new(BT_state *state) head->next = 0; head->sz = len; head->va = off2addr(lo); - state->mlist = head; return BT_SUCC; From 02b1ae3dd20bc7fbc3cfd8b172c3e9f3ec2b8fb5 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 15 Dec 2023 06:36:53 -0500 Subject: [PATCH 107/128] pma: btest.c changes --- rust/ares_pma/c-src/btest.c | 41 ++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c index c707a79..a779813 100644 --- a/rust/ares_pma/c-src/btest.c +++ b/rust/ares_pma/c-src/btest.c @@ -19,6 +19,30 @@ _test_nodeinteg(BT_state *state, BT_findpath *path, assert(parent->datk[childidx+1].va == hi); } +static size_t +_mlist_sizep(BT_mlistnode *head) +/* calculate the size of the mlist in pages */ +{ + size_t sz = 0; + while (head) { + sz += head->sz; + head = head->next; + } + return sz; +} + +static size_t +_flist_sizep(BT_flistnode *head) +/* calculate the size of the flist in pages */ +{ + size_t sz = 0; + while (head) { + sz += head->sz; + head = head->next; + } + return sz; +} + int main(int argc, char *argv[]) { DPUTS("PMA Tests"); @@ -40,7 +64,6 @@ int main(int argc, char *argv[]) vaof_t hi = 0xDEADBEEF; pgno_t pg = 1; /* dummy value */ for (size_t i = 0; i < BT_DAT_MAXKEYS * 4; ++i) { - DPRINTF("== i: %zu", i); _bt_insert(state1, lo, hi, pg); _test_nodeinteg(state1, &path, lo, hi, pg); lo++; pg++; @@ -100,6 +123,9 @@ int main(int argc, char *argv[]) #define ITERATIONS 1000 #define MAXALLOCPG 0xFF lohi_pair allocs[ITERATIONS] = {0}; + size_t alloc_sizp = 0; + size_t flist_sizp = _flist_sizep(state3->flist); + size_t mlist_sizp = _mlist_sizep(state3->mlist); for (size_t i = 0; i < ITERATIONS; i++) { /* malloc a random number of pages <= 256 and store in the allocs array */ int pages = random(); @@ -107,6 +133,12 @@ int main(int argc, char *argv[]) pages += 1; allocs[i].lo = bt_malloc(state3, pages); allocs[i].hi = allocs[i].lo + pages; + alloc_sizp += pages; + /* validate size changes to mlist and flist */ + assert(_flist_sizep(state3->flist) + == (flist_sizp - alloc_sizp)); + assert(_mlist_sizep(state3->mlist) + == (mlist_sizp - alloc_sizp)); } /* sync the state */ @@ -114,9 +146,16 @@ int main(int argc, char *argv[]) /* TODO: close and reopen state. 
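The per-iteration assertions added above only hold if the freelists stay sorted, non-overlapping, and fully coalesced; otherwise the same span can be counted twice or a malloc can split the wrong node. A hedged sketch of such a well-formedness check, using a simplified stand-in for BT_mlistnode rather than the real type and an assumed 16K page size:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define PAGESIZE 16384u   /* assumption; btree.c derives this from BT_PAGEBITS */

typedef struct mnode mnode;
struct mnode {
  uint8_t *va;    /* start of the free address range */
  size_t   sz;    /* length in pages (after the previous fix) */
  mnode   *next;
};

/* Every node must end strictly before the next one begins: sorted,
 * non-overlapping, and with adjacent runs already merged on free. */
static void
mlist_check(const mnode *head)
{
  for (const mnode *n = head; n && n->next; n = n->next) {
    const uint8_t *end = n->va + (size_t)n->sz * PAGESIZE;
    assert(end < n->next->va);
  }
}
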
validate ephemeral structures */ + flist_sizp = _flist_sizep(state3->flist); + mlist_sizp = _mlist_sizep(state3->mlist); + alloc_sizp = 0; for (size_t i = 0; i < ITERATIONS / 2; i++) { /* free half of the allocations */ bt_free(state3, allocs[i].lo, allocs[i].hi); + alloc_sizp += allocs[i].hi - allocs[i].lo; + /* validate size changes to mlist */ + assert(_mlist_sizep(state3->mlist) + == (mlist_sizp + alloc_sizp)); } /* resync the state */ From 3acf8e74c1518bcb329d521082411ca5be70103e Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Fri, 15 Dec 2023 10:00:38 -0600 Subject: [PATCH 108/128] serf: ensure locals are preserved and top frame flipped after PMA save --- rust/ares/src/serf.rs | 64 ++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/rust/ares/src/serf.rs b/rust/ares/src/serf.rs index 698241e..ea13c1f 100644 --- a/rust/ares/src/serf.rs +++ b/rust/ares/src/serf.rs @@ -110,8 +110,8 @@ impl Context { Context::new(trace_info, snapshot, constant_hot_state) } - pub fn save(&mut self) { - let handle = unsafe { + pub unsafe fn save(&mut self) { + let handle = { let mut snapshot = Snapshot({ let snapshot_mem_ptr: *mut SnapshotMem = self.nock_context.stack.struct_alloc(1); @@ -188,7 +188,12 @@ impl Context { // Setters // - pub fn event_update(&mut self, new_event_num: u64, new_arvo: Noun) { + /// + /// ## Safety + /// + /// calls save(), which invalidates all nouns not in the context + /// until [preserve_event_update_leftovers] is called to resolve forwarding pointers. + pub unsafe fn event_update(&mut self, new_event_num: u64, new_arvo: Noun) { // XX: assert event numbers are continuous self.arvo = new_arvo; self.event_num = new_event_num; @@ -201,6 +206,21 @@ impl Context { self.mug = mug_u32(&mut self.nock_context.stack, self.arvo); } + /// + /// ## Safety + /// + /// Preserves nouns and jet states in context and then calls [flip_top_frame]. 
+ /// Other stack-allocated objects needing preservation should be preserved between + /// [event_update] and invocation of this function + pub unsafe fn preserve_event_update_leftovers(&mut self) { + let stack = &mut self.nock_context.stack; + stack.preserve(&mut self.arvo); + stack.preserve(&mut self.nock_context.cold); + stack.preserve(&mut self.nock_context.warm); + stack.preserve(&mut self.nock_context.hot); + stack.flip_top_frame(0); + } + // // Newt functions // @@ -370,15 +390,6 @@ pub fn serf(constant_hot_state: &[HotEntry]) -> io::Result<()> { clear_interrupt(); - // - unsafe { - let stack = &mut context.nock_context.stack; - stack.preserve(&mut context.arvo); - stack.preserve(&mut context.nock_context.cold); - stack.preserve(&mut context.nock_context.warm); - stack.preserve(&mut context.nock_context.hot); - stack.flip_top_frame(0); - } } Ok(()) @@ -471,7 +482,10 @@ fn play_life(context: &mut Context, eve: Noun) { let eved = lent(eve).expect("serf: play: boot event number failure") as u64; let arvo = slot(gat, 7).expect("serf: play: lifecycle didn't return initial Arvo"); - context.event_update(eved, arvo); + unsafe { + context.event_update(eved, arvo); + context.preserve_event_update_leftovers(); + } context.play_done(); } Err(error) => match error { @@ -504,7 +518,10 @@ fn play_list(context: &mut Context, mut lit: Noun) { .tail(); eve += 1; - context.event_update(eve, arvo); + unsafe { + context.event_update(eve, arvo); + context.preserve_event_update_leftovers(); + } } Err(goof) => { return context.play_bail(goof); @@ -533,10 +550,14 @@ fn work(context: &mut Context, job: Noun) { match soft(context, job, trace_name) { Ok(res) => { let cell = res.as_cell().expect("serf: work: +slam returned atom"); - let fec = cell.head(); + let mut fec = cell.head(); let eve = context.event_num; - context.event_update(eve + 1, cell.tail()); + unsafe { + context.event_update(eve + 1, cell.tail()); + context.nock_context.stack.preserve(&mut fec); + context.preserve_event_update_leftovers(); + } context.work_done(fec); } Err(goof) => { @@ -560,7 +581,7 @@ fn work_swap(context: &mut Context, job: Noun, goof: Noun) { let now = inc(stack, job_now).as_noun(); let wire = T(stack, &[D(0), D(tas!(b"arvo")), D(0)]); let crud = DirectAtom::new_panic(tas!(b"crud")); - let ovo = T(stack, &[now, wire, crud.as_noun(), goof, job_cell.tail()]); + let mut ovo = T(stack, &[now, wire, crud.as_noun(), goof, job_cell.tail()]); let trace_name = if context.nock_context.trace_info.is_some() { Some(work_trace_name( &mut context.nock_context.stack, @@ -574,10 +595,15 @@ fn work_swap(context: &mut Context, job: Noun, goof: Noun) { match soft(context, ovo, trace_name) { Ok(res) => { let cell = res.as_cell().expect("serf: work: crud +slam returned atom"); - let fec = cell.head(); + let mut fec = cell.head(); let eve = context.event_num; - context.event_update(eve + 1, cell.tail()); + unsafe { + context.event_update(eve + 1, cell.tail()); + context.nock_context.stack.preserve(&mut ovo); + context.nock_context.stack.preserve(&mut fec); + context.preserve_event_update_leftovers(); + } context.work_swap(ovo, fec); } Err(goof_crud) => { From dd814b3e1fc4f93c3b3201c6465fd9ddf9d67c8f Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Fri, 15 Dec 2023 11:16:38 -0600 Subject: [PATCH 109/128] pma: add crate feature to control debugging printfs --- rust/ares/Cargo.toml | 2 ++ rust/ares_pma/Cargo.toml | 2 ++ rust/ares_pma/build.rs | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/rust/ares/Cargo.toml 
b/rust/ares/Cargo.toml index 23316f7..d07e754 100644 --- a/rust/ares/Cargo.toml +++ b/rust/ares/Cargo.toml @@ -12,6 +12,8 @@ edition = "2018" # Please keep these alphabetized [dependencies] ares_macros = { path = "../ares_macros" } +# Use this when debugging requires the debug printfs in the PMA +# ares_pma = { path = "../ares_pma", features=["debug_prints"] } ares_pma = { path = "../ares_pma" } assert_no_alloc = "1.1.2" # use this when debugging requires allocation (e.g. eprintln) diff --git a/rust/ares_pma/Cargo.toml b/rust/ares_pma/Cargo.toml index b7ccdb4..94612e4 100644 --- a/rust/ares_pma/Cargo.toml +++ b/rust/ares_pma/Cargo.toml @@ -11,3 +11,5 @@ edition = "2018" bindgen = "0.69.1" cc = "1.0" +[features] +debug_prints = [] diff --git a/rust/ares_pma/build.rs b/rust/ares_pma/build.rs index f79c4ed..eb2ca17 100644 --- a/rust/ares_pma/build.rs +++ b/rust/ares_pma/build.rs @@ -8,7 +8,7 @@ use bindgen::CargoCallbacks; fn main() { let profile = env::var("PROFILE").unwrap(); let opt_level = env::var("OPT_LEVEL").unwrap(); - let define_debug = if profile == "debug" { + let define_debug = if env::var("CARGO_FEATURE_DEBUG_PRINTS").is_ok() { "-DDEBUG" } else { "-UDEBUG" From f092c482ee3b4859afa14da86d6769235db671bd Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Fri, 15 Dec 2023 11:17:08 -0600 Subject: [PATCH 110/128] pma: format --- rust/ares/src/hamt.rs | 6 +++--- rust/ares/src/interpreter.rs | 4 ++-- rust/ares/src/jets.rs | 2 +- rust/ares/src/jets/cold.rs | 20 ++++++-------------- rust/ares/src/persist.rs | 9 +++++---- rust/ares/src/serf.rs | 5 ++--- rust/ares/src/unifying_equality.rs | 6 ++---- 7 files changed, 21 insertions(+), 31 deletions(-) diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index dcccb5d..27c79ea 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -1,8 +1,8 @@ use crate::mem::{NockStack, Preserve}; -use crate::unifying_equality::unifying_equality; use crate::mug::mug_u32; use crate::noun::Noun; -use crate::persist::{Persist, pma_contains}; +use crate::persist::{pma_contains, Persist}; +use crate::unifying_equality::unifying_equality; use either::Either::{self, *}; use std::mem::size_of; use std::ptr::{copy_nonoverlapping, null, null_mut}; @@ -719,7 +719,7 @@ impl Persist for Hamt { let stem_ptr: *mut Stem = &mut (*next_entry_ptr).stem; let stem_size = (*stem_ptr).size(); - + if pma_contains((*stem_ptr).buffer, stem_size) { continue; } diff --git a/rust/ares/src/interpreter.rs b/rust/ares/src/interpreter.rs index ba342e6..358f396 100644 --- a/rust/ares/src/interpreter.rs +++ b/rust/ares/src/interpreter.rs @@ -1,7 +1,6 @@ use crate::assert_acyclic; use crate::assert_no_forwarding_pointers; use crate::assert_no_junior_pointers; -use crate::unifying_equality::unifying_equality; use crate::hamt::Hamt; use crate::jets::cold; use crate::jets::cold::Cold; @@ -15,6 +14,7 @@ use crate::noun; use crate::noun::{Atom, Cell, IndirectAtom, Noun, Slots, D, T}; use crate::serf::TERMINATOR; use crate::trace::{write_nock_trace, TraceInfo, TraceStack}; +use crate::unifying_equality::unifying_equality; use ares_macros::tas; use assert_no_alloc::assert_no_alloc; use bitvec::prelude::{BitSlice, Lsb0}; @@ -1304,9 +1304,9 @@ mod hint { use crate::jets; use crate::jets::cold; use crate::jets::nock::util::{mook, LEAF}; - use crate::unifying_equality::unifying_equality; use crate::noun::{tape, Atom, Cell, Noun, D, T}; use crate::serf::TERMINATOR; + use crate::unifying_equality::unifying_equality; use ares_macros::tas; use std::sync::atomic::Ordering; use std::sync::Arc; 
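For context on the debug_prints feature introduced in the previous patch: it only toggles -DDEBUG for the C build, and the btree.c logging macros compile away without it. A sketch of that pattern follows; the real DPRINTF/DPUTS definitions in btree.c are not reproduced here and may differ in detail.

#include <stdio.h>

#ifdef DEBUG
#  define DPRINTF(fmt, ...) \
     fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, __VA_ARGS__)
#else
#  define DPRINTF(fmt, ...) ((void)0)   /* no-op when built with -UDEBUG */
#endif

int main(void)
{
  DPRINTF("mapped %d pages", 42);       /* prints only when built with -DDEBUG */
  return 0;
}
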
diff --git a/rust/ares/src/jets.rs b/rust/ares/src/jets.rs index 470c258..00fc1a0 100644 --- a/rust/ares/src/jets.rs +++ b/rust/ares/src/jets.rs @@ -308,8 +308,8 @@ pub mod util { use super::*; use crate::hamt::Hamt; use crate::mem::NockStack; - use crate::unifying_equality::unifying_equality; use crate::noun::{Atom, Noun, D, T}; + use crate::unifying_equality::unifying_equality; use assert_no_alloc::assert_no_alloc; use ibig::UBig; diff --git a/rust/ares/src/jets/cold.rs b/rust/ares/src/jets/cold.rs index 98bfaf3..6bd8b50 100644 --- a/rust/ares/src/jets/cold.rs +++ b/rust/ares/src/jets/cold.rs @@ -1,9 +1,9 @@ use crate::hamt::Hamt; use crate::mem::{NockStack, Preserve}; -use crate::unifying_equality::unifying_equality; use crate::noun; use crate::noun::{Atom, DirectAtom, Noun, Slots, D, T}; -use crate::persist::{Persist, pma_contains}; +use crate::persist::{pma_contains, Persist}; +use crate::unifying_equality::unifying_equality; use std::mem::size_of; use std::ptr::copy_nonoverlapping; use std::ptr::null_mut; @@ -68,9 +68,7 @@ impl Persist for Batteries { copy_nonoverlapping((*dest).0, batteries_mem_ptr, 1); *buffer = batteries_mem_ptr.add(1) as *mut u8; - (*batteries_mem_ptr) - .battery - .copy_to_buffer(stack, buffer); + (*batteries_mem_ptr).battery.copy_to_buffer(stack, buffer); (*batteries_mem_ptr) .parent_axis .copy_to_buffer(stack, buffer); @@ -475,15 +473,9 @@ impl Persist for Cold { (*self).0 = cold_mem_ptr; - (*(*self).0) - .battery_to_paths - .copy_to_buffer(stack, buffer); - (*(*self).0) - .root_to_paths - .copy_to_buffer(stack, buffer); - (*(*self).0) - .path_to_batteries - .copy_to_buffer(stack, buffer); + (*(*self).0).battery_to_paths.copy_to_buffer(stack, buffer); + (*(*self).0).root_to_paths.copy_to_buffer(stack, buffer); + (*(*self).0).path_to_batteries.copy_to_buffer(stack, buffer); } unsafe fn handle_to_u64(&self) -> u64 { diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index 898c6ed..f0ddc1b 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -16,13 +16,13 @@ const PMA_FLAGS: ULONG = 0; // ignored for now const NOUN_MARKED: u64 = 1 << 63; /// Handle to a PMA -#[derive(Copy,Clone)] +#[derive(Copy, Clone)] struct PMAState(u64); // this is idiotic but necessary for Rust to let us put this in a oncelock static PMA: OnceLock = OnceLock::new(); fn get_pma_state() -> Option<*mut BT_state> { - PMA.get().map(|r| { r.0 as *mut BT_state }) + PMA.get().map(|r| r.0 as *mut BT_state) } fn pma_state_err() -> std::io::Error { @@ -39,7 +39,9 @@ pub fn pma_open(path: PathBuf) -> Result<(), std::io::Error> { bt_state_new(&mut state); let err = bt_state_open(state, path_cstring.as_ptr(), PMA_FLAGS, PMA_MODE); if err == 0 { - PMA.set(PMAState(state as u64)).or_else(|state| { Err(state.0 as *mut BT_state) }).expect("PMA state already initialized to:"); + PMA.set(PMAState(state as u64)) + .or_else(|state| Err(state.0 as *mut BT_state)) + .expect("PMA state already initialized to:"); assert!(get_pma_state().is_some()); Ok(()) } else { @@ -98,7 +100,6 @@ pub unsafe fn pma_dirty(ptr: *mut T, count: usize) { assert!(e == 0); } - /** * This trait defines operations for copying a structure into the PMA. 
* diff --git a/rust/ares/src/serf.rs b/rust/ares/src/serf.rs index ea13c1f..474a277 100644 --- a/rust/ares/src/serf.rs +++ b/rust/ares/src/serf.rs @@ -1,4 +1,3 @@ -use crate::persist::pma_meta_set; use crate::hamt::Hamt; use crate::interpreter; use crate::interpreter::{inc, interpret, Error}; @@ -12,7 +11,8 @@ use crate::mem::Preserve; use crate::mug::*; use crate::newt::Newt; use crate::noun::{Atom, Cell, DirectAtom, Noun, Slots, D, T}; -use crate::persist::{Persist, pma_open, pma_meta_get, pma_sync}; +use crate::persist::pma_meta_set; +use crate::persist::{pma_meta_get, pma_open, pma_sync, Persist}; use crate::trace::*; use ares_macros::tas; use signal_hook; @@ -389,7 +389,6 @@ pub fn serf(constant_hot_state: &[HotEntry]) -> io::Result<()> { }; clear_interrupt(); - } Ok(()) diff --git a/rust/ares/src/unifying_equality.rs b/rust/ares/src/unifying_equality.rs index 3ca3675..83b90fc 100644 --- a/rust/ares/src/unifying_equality.rs +++ b/rust/ares/src/unifying_equality.rs @@ -1,9 +1,9 @@ use crate::assert_acyclic; use crate::assert_no_forwarding_pointers; use crate::assert_no_junior_pointers; -use crate::mem::{NockStack, FRAME, STACK, ALLOC}; -use crate::persist::{pma_contains, pma_dirty}; +use crate::mem::{NockStack, ALLOC, FRAME, STACK}; use crate::noun::Noun; +use crate::persist::{pma_contains, pma_dirty}; use either::Either::*; use libc::{c_void, memcmp}; @@ -23,7 +23,6 @@ macro_rules! assert_no_junior_pointers { ( $x:expr, $y:expr ) => {}; } - pub unsafe fn unifying_equality(stack: &mut NockStack, a: *mut Noun, b: *mut Noun) -> bool { /* This version of unifying equality is not like that of vere. * Vere does a tree comparison (accelerated by pointer equality and short-circuited by mug @@ -247,4 +246,3 @@ fn lower_pointer_first(a: *const u64, b: *const u64) -> (*const u64, *const u64) (b, a) } } - From 820b1ac57ace251c0d473593fdc648ce1ab53cfa Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Fri, 15 Dec 2023 12:27:17 -0600 Subject: [PATCH 111/128] main: add stop_for_debug feature to stop Ares when it starts for a debugger to attach --- rust/ares/Cargo.toml | 1 + rust/ares/src/main.rs | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/rust/ares/Cargo.toml b/rust/ares/Cargo.toml index d07e754..8bb6bcb 100644 --- a/rust/ares/Cargo.toml +++ b/rust/ares/Cargo.toml @@ -55,3 +55,4 @@ check_acyclic = [] check_forwarding = [] check_junior = [] sham_hints = [] +stop_for_debug = [] diff --git a/rust/ares/src/main.rs b/rust/ares/src/main.rs index 28cea59..483f08f 100644 --- a/rust/ares/src/main.rs +++ b/rust/ares/src/main.rs @@ -5,10 +5,13 @@ use std::io; fn main() -> io::Result<()> { // debug - // eprintln!("serf: pid {}", std::process::id()); - // if unsafe { libc::kill(std::process::id() as i32, libc::SIGSTOP) } != 0 { - // panic!("Could not stop ourselves."); - // }; + #[cfg(feature = "stop_for_debug")] + { + eprintln!("serf: pid {}", std::process::id()); + if unsafe { libc::kill(std::process::id() as i32, libc::SIGSTOP) } != 0 { + panic!("Could not stop ourselves."); + }; + } let filename = env::args().nth(1).expect("Must provide input filename"); From 0496de5f4c2e4cb5b4921ce309cb7613ef7b2909 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 15 Dec 2023 16:00:46 -0500 Subject: [PATCH 112/128] pma: testing out a mmap call /before/ reading header --- rust/ares_pma/c-src/btree.c | 72 ++++++++++++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index bd7eeee..ada1478 
100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2154,6 +2154,60 @@ _bt_state_meta_which(BT_state *state, int *which) return BT_SUCC; } +static int +_bt_state_read_header(BT_state *state) +{ + BT_meta *m1, *m2; + int which = 0; + int rc = 1; + m1 = state->meta_pages[0]; + m2 = state->meta_pages[1]; + + TRACE(); + + /* validate magic */ + if (m1->magic != BT_MAGIC) { + DPRINTF("metapage 0x%pX inconsistent magic: 0x%" PRIX32, m1, m1->magic); + return EINVAL; + } + if (m2->magic != BT_MAGIC) { + DPRINTF("metapage 0x%pX inconsistent magic: 0x%" PRIX32, m2, m2->magic); + return EINVAL; + } + + /* validate flags */ + if (m1->flags & BP_META != BP_META) { + DPRINTF("metapage 0x%pX missing meta page flag", m1); + return EINVAL; + } + if (m2->flags & BP_META != BP_META) { + DPRINTF("metapage 0x%pX missing meta page flag", m2); + return EINVAL; + } + + /* validate binary version */ + if (m1->version != BT_VERSION) { + DPRINTF("version mismatch on metapage: 0x%pX, metapage version: %" PRIu32 ", binary version %u", + m1, m1->version, BT_VERSION); + return EINVAL; + } + + /* validate binary version */ + if (m2->version != BT_VERSION) { + DPRINTF("version mismatch on metapage: 0x%pX, metapage version: %" PRIu32 ", binary version %u", + m2, m2->version, BT_VERSION); + return EINVAL; + } + + if (!SUCC(rc = _bt_state_meta_which(state, &which))) + return rc; + + state->which = which; + + return BT_SUCC; +} + +#if 0 static int _bt_state_read_header(BT_state *state) { @@ -2217,6 +2271,7 @@ _bt_state_read_header(BT_state *state) return BT_SUCC; } +#endif static int _bt_state_meta_new(BT_state *state) @@ -2289,6 +2344,15 @@ _bt_state_load(BT_state *state) TRACE(); + /* map first node stripe (along with metapages) as read only */ + state->map = mmap(BT_MAPADDR, + BT_META_SECTION_WIDTH + BLK_BASE_LEN0, + BT_PROT_CLEAN, + BT_FLAG_CLEAN, + state->data_fd, + 0); + + if (!SUCC(rc = _bt_state_read_header(state))) { if (rc != ENOENT) return rc; DPUTS("creating new db"); @@ -2299,14 +2363,6 @@ _bt_state_load(BT_state *state) } } - /* map first node stripe (along with metapages) as read only */ - state->map = mmap(BT_MAPADDR, - BT_META_SECTION_WIDTH + BLK_BASE_LEN0, - BT_PROT_CLEAN, - BT_FLAG_CLEAN, - state->data_fd, - 0); - if (state->map != BT_MAPADDR) { DPRINTF("mmap: failed to map at addr %p, errno: %s", BT_MAPADDR, strerror(errno)); abort(); From 0ec984663cd4e4d58e8a79daf10fc3ce7bc10bfd Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 15 Dec 2023 18:38:52 -0500 Subject: [PATCH 113/128] pma: revisions to codepath handling opening of existing pma --- rust/ares_pma/c-src/btest.c | 5 ++ rust/ares_pma/c-src/btree.c | 91 +++++++------------------------------ 2 files changed, 21 insertions(+), 75 deletions(-) diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c index a779813..8bf3e98 100644 --- a/rust/ares_pma/c-src/btest.c +++ b/rust/ares_pma/c-src/btest.c @@ -161,6 +161,11 @@ int main(int argc, char *argv[]) /* resync the state */ bt_sync(state3); + bt_state_close(state3); + + bt_state_new(&state3); + + assert(SUCC(bt_state_open(state3, "./pmatest3", 0, 0644))); /* TODO: close and reopen state. 
validate ephemeral structures */ return 0; diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index ada1478..b2cd564 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2117,27 +2117,27 @@ _bt_state_restore_maps(BT_state *state) } static int -_bt_state_meta_which(BT_state *state, int *which) +_bt_state_meta_which(BT_state *state) { BT_meta *m1 = state->meta_pages[0]; BT_meta *m2 = state->meta_pages[1]; - *which = -1; + int which = -1; if (m1->chk == 0) { /* first is dirty */ - *which = 1; + which = 1; } else if (m2->chk == 0) { /* second is dirty */ - *which = 0; + which = 0; } else if (m1->txnid > m2->txnid) { /* first is most recent */ - *which = 0; + which = 0; } else if (m1->txnid < m2->txnid) { /* second is most recent */ - *which = 1; + which = 1; } else { /* invalid state */ @@ -2145,86 +2145,30 @@ _bt_state_meta_which(BT_state *state, int *which) } /* checksum the metapage found and abort if checksum doesn't match */ - BT_meta *meta = state->meta_pages[*which]; + BT_meta *meta = state->meta_pages[which]; uint32_t chk = nonzero_crc_32(meta, BT_META_LEN); if (chk != meta->chk) { abort(); } - return BT_SUCC; -} - -static int -_bt_state_read_header(BT_state *state) -{ - BT_meta *m1, *m2; - int which = 0; - int rc = 1; - m1 = state->meta_pages[0]; - m2 = state->meta_pages[1]; - - TRACE(); - - /* validate magic */ - if (m1->magic != BT_MAGIC) { - DPRINTF("metapage 0x%pX inconsistent magic: 0x%" PRIX32, m1, m1->magic); - return EINVAL; - } - if (m2->magic != BT_MAGIC) { - DPRINTF("metapage 0x%pX inconsistent magic: 0x%" PRIX32, m2, m2->magic); - return EINVAL; - } - - /* validate flags */ - if (m1->flags & BP_META != BP_META) { - DPRINTF("metapage 0x%pX missing meta page flag", m1); - return EINVAL; - } - if (m2->flags & BP_META != BP_META) { - DPRINTF("metapage 0x%pX missing meta page flag", m2); - return EINVAL; - } - - /* validate binary version */ - if (m1->version != BT_VERSION) { - DPRINTF("version mismatch on metapage: 0x%pX, metapage version: %" PRIu32 ", binary version %u", - m1, m1->version, BT_VERSION); - return EINVAL; - } - - /* validate binary version */ - if (m2->version != BT_VERSION) { - DPRINTF("version mismatch on metapage: 0x%pX, metapage version: %" PRIu32 ", binary version %u", - m2, m2->version, BT_VERSION); - return EINVAL; - } - - if (!SUCC(rc = _bt_state_meta_which(state, &which))) - return rc; - + /* set which in state */ state->which = which; return BT_SUCC; } -#if 0 static int _bt_state_read_header(BT_state *state) { - /* TODO: actually read the header and copy the data to meta when we implement - persistence */ - BT_page metas[2]; - int rc, len, which; BT_meta *m1, *m2; - - /* pma already exists, parse metadata file */ + int rc = 1; + BYTE metas[BT_PAGESIZE*2] = {0}; m1 = state->meta_pages[0]; m2 = state->meta_pages[1]; - /* ;;: TODO, need to store last page in use by pma in both metadata pages. 
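The rewritten _bt_state_meta_which above encodes the double-metapage arbitration rule: a zero checksum marks a page a writer was still updating, so the other page is chosen; two clean pages are ordered by txnid; and the chosen page must then checksum correctly or the open aborts. A self-contained sketch of that rule, with a simplified meta_t and an FNV-1a stand-in for nonzero_crc_32 (illustration only, not the checksum in lib/checksum.c):

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct {
  uint64_t txnid;   /* monotonically increasing commit counter */
  /* ... rest of the metapage ... */
  uint32_t chk;     /* checksum of everything above; 0 while a txn is in flight */
} meta_t;

/* trivial stand-in for nonzero_crc_32 (FNV-1a); illustration only */
static uint32_t
checksum(const void *buf, size_t len)
{
  const uint8_t *p = buf;
  uint32_t h = 2166136261u;
  while (len--) { h ^= *p++; h *= 16777619u; }
  return h ? h : 1;                    /* never return 0: 0 means "dirty" */
}

/* Pick which of the two metapages to trust on open. */
static int
meta_which(const meta_t *m0, const meta_t *m1)
{
  int which;
  if      (m0->chk == 0)          which = 1;   /* first is dirty, trust second */
  else if (m1->chk == 0)          which = 0;   /* second is dirty, trust first */
  else if (m0->txnid > m1->txnid) which = 0;   /* both clean: newer txn wins */
  else if (m0->txnid < m1->txnid) which = 1;
  else                            abort();     /* equal txnids: invalid state */

  const meta_t *m = which ? m1 : m0;
  if (checksum(m, offsetof(meta_t, chk)) != m->chk)
    abort();                                   /* chosen page must verify */
  return which;
}
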
choose the frontier after _bt_state_meta_which and store it in state */ TRACE(); - if ((len = pread(state->data_fd, metas, BT_PAGESIZE*2, 0)) + if (pread(state->data_fd, metas, BT_PAGESIZE*2, 0) != BT_PAGESIZE*2) { /* new pma */ return ENOENT; @@ -2264,14 +2208,11 @@ _bt_state_read_header(BT_state *state) return EINVAL; } - if (!SUCC(rc = _bt_state_meta_which(state, &which))) + if (!SUCC(rc = _bt_state_meta_which(state))) return rc; - state->which = which; - return BT_SUCC; } -#endif static int _bt_state_meta_new(BT_state *state) @@ -2352,6 +2293,9 @@ _bt_state_load(BT_state *state) state->data_fd, 0); + p = (BT_page *)state->map; + state->meta_pages[0] = METADATA(p); + state->meta_pages[1] = METADATA(p + 1); if (!SUCC(rc = _bt_state_read_header(state))) { if (rc != ENOENT) return rc; @@ -2379,10 +2323,6 @@ _bt_state_load(BT_state *state) abort(); } - p = (BT_page *)state->map; - state->meta_pages[0] = METADATA(p); - state->meta_pages[1] = METADATA(p + 1); - /* new db, so populate metadata */ if (new) { /* ;;: move this logic to _flist_new */ @@ -2637,6 +2577,7 @@ bt_state_new(BT_state **state) BT_state *s = calloc(1, sizeof *s); s->data_fd = -1; s->fixaddr = BT_MAPADDR; + s->which = -1; *state = s; return BT_SUCC; } From 9a8229837da482ee9d32acf660ebfdb4bffd0df1 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 15 Dec 2023 18:41:50 -0500 Subject: [PATCH 114/128] pma: initialize state->which to 0 in bt_state_open --- rust/ares_pma/c-src/btree.c | 1 - 1 file changed, 1 deletion(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index b2cd564..901b59a 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2577,7 +2577,6 @@ bt_state_new(BT_state **state) BT_state *s = calloc(1, sizeof *s); s->data_fd = -1; s->fixaddr = BT_MAPADDR; - s->which = -1; *state = s; return BT_SUCC; } From 2c016500ac88de6e66564b5651c44ba8f00cb210 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 15 Dec 2023 19:24:37 -0500 Subject: [PATCH 115/128] pma: restore mmaps before regenerating ephemeral state --- rust/ares_pma/c-src/btree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 901b59a..8fc87db 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -2344,6 +2344,9 @@ _bt_state_load(BT_state *state) assert(SUCC(_flist_new(state))); } else { + /* restore data memory maps */ + _bt_state_restore_maps(state); + /* restore ephemeral freelists */ assert(SUCC(_nlist_read(state))); assert(SUCC(_mlist_read(state))); @@ -2353,9 +2356,6 @@ _bt_state_load(BT_state *state) return errno; state->file_size = stat.st_size; - - /* restore data memory maps */ - _bt_state_restore_maps(state); } return BT_SUCC; From 3a6ed42054fadea46efa3237afce5182ec8781ce Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Fri, 15 Dec 2023 19:49:35 -0500 Subject: [PATCH 116/128] pma: depth passed to _flist_read2 should be 1 not 0 --- rust/ares_pma/c-src/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 8fc87db..485f782 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -1982,7 +1982,7 @@ _flist_read(BT_state *state) BT_meta *meta = state->meta_pages[state->which]; BT_page *root = _node_get(state, meta->root); uint8_t maxdepth = meta->depth; - BT_flistnode *head = _flist_read2(state, root, maxdepth, 0); + BT_flistnode *head = _flist_read2(state, root, maxdepth, 1); /* ;;: 
infinite loop with proper starting depth of 1. -- fix that! */ /* BT_flistnode *head = _flist_read2(state, root, maxdepth, 1); */ From c0e68a27f69f5eaa73ef56be1230421cab447f8d Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Sun, 17 Dec 2023 20:05:03 -0500 Subject: [PATCH 117/128] pma: btest ephemeral structure equality tests --- rust/ares_pma/c-src/btest.c | 134 +++++++++++++++++++++++++++++++++--- 1 file changed, 124 insertions(+), 10 deletions(-) diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c index 8bf3e98..9bc6713 100644 --- a/rust/ares_pma/c-src/btest.c +++ b/rust/ares_pma/c-src/btest.c @@ -43,6 +43,108 @@ _flist_sizep(BT_flistnode *head) return sz; } +static BT_mlistnode * +_mlist_copy(BT_state *state) +{ + BT_mlistnode *head = state->mlist; + BT_mlistnode *ret, *prev; + ret = prev = calloc(1, sizeof *ret); + memcpy(ret, head, sizeof *head); + ret->next = 0; + head = head->next; + while (head) { + BT_mlistnode *copy = calloc(1, sizeof *copy); + memcpy(copy, head, sizeof *head); + prev->next = copy; + prev = copy; + head = head->next; + } + return ret; +} + +static BT_nlistnode * +_nlist_copy(BT_state *state) +{ + BT_nlistnode *head = state->nlist; + BT_nlistnode *ret, *prev; + ret = prev = calloc(1, sizeof *ret); + memcpy(ret, head, sizeof *head); + ret->next = 0; + head = head->next; + while (head) { + BT_nlistnode *copy = calloc(1, sizeof *copy); + memcpy(copy, head, sizeof *head); + prev->next = copy; + prev = copy; + head = head->next; + } + return ret; +} + +static BT_flistnode * +_flist_copy(BT_state *state) +{ + BT_flistnode *head = state->flist; + BT_flistnode *ret, *prev; + ret = prev = calloc(1, sizeof *ret); + memcpy(ret, head, sizeof *head); + ret->next = 0; + head = head->next; + while (head) { + BT_flistnode *copy = calloc(1, sizeof *copy); + memcpy(copy, head, sizeof *head); + prev->next = copy; + prev = copy; + head = head->next; + } + return ret; +} + +static int +_mlist_eq(BT_mlistnode *l, BT_mlistnode *r) +{ + while (l && r) { + if (l->sz != r->sz) + bp(0); + if (l->va != r->va) + bp(0); + l = l->next; r = r->next; + } + if (l == 0 && r == 0) + return 1; + bp(0); +} + +static int +_nlist_eq(BT_nlistnode *l, BT_nlistnode *r) +{ + while (l && r) { + if (l->sz != r->sz) + bp(0); + if (l->va != r->va) + bp(0); + l = l->next; r = r->next; + } + if (l == 0 && r == 0) + return 1; + bp(0); +} + +static int +_flist_eq(BT_flistnode *l, BT_flistnode *r) +{ + while (l && r) { + if (l->sz != r->sz) + bp(0); + if (l->pg != r->pg) + bp(0); + l = l->next; r = r->next; + } + if (l == 0 && r == 0) + return 1; + bp(0); +} + int main(int argc, char *argv[]) { DPUTS("PMA Tests"); @@ -149,24 +251,36 @@ int main(int argc, char *argv[]) flist_sizp = _flist_sizep(state3->flist); mlist_sizp = _mlist_sizep(state3->mlist); alloc_sizp = 0; - for (size_t i = 0; i < ITERATIONS / 2; i++) { - /* free half of the allocations */ - bt_free(state3, allocs[i].lo, allocs[i].hi); - alloc_sizp += allocs[i].hi - allocs[i].lo; - /* validate size changes to mlist */ - assert(_mlist_sizep(state3->mlist) - == (mlist_sizp + alloc_sizp)); - } + /* for (size_t i = 0; i < ITERATIONS / 2; i++) { */ + /* /\* free half of the allocations *\/ */ + /* bt_free(state3, allocs[i].lo, allocs[i].hi); */ + /* alloc_sizp += allocs[i].hi - allocs[i].lo; */ + /* /\* validate size changes to mlist *\/ */ + /* assert(_mlist_sizep(state3->mlist) */ + /* == (mlist_sizp + alloc_sizp)); */ + /* } */ - /* resync the state */ bt_sync(state3); + /* copy ephemeral structures */ + BT_mlistnode *mlist_copy = 
_mlist_copy(state3); + BT_nlistnode *nlist_copy = _nlist_copy(state3); + BT_flistnode *flist_copy = _flist_copy(state3); + assert(_mlist_eq(mlist_copy, state3->mlist)); + assert(_nlist_eq(nlist_copy, state3->nlist)); + assert(_flist_eq(flist_copy, state3->flist)); + bt_state_close(state3); bt_state_new(&state3); assert(SUCC(bt_state_open(state3, "./pmatest3", 0, 0644))); - /* TODO: close and reopen state. validate ephemeral structures */ + + /* compare for equality copies of ephemeral structures with restored ephemeral + structures */ + assert(_mlist_eq(mlist_copy, state3->mlist)); + assert(_nlist_eq(nlist_copy, state3->nlist)); + assert(_flist_eq(flist_copy, state3->flist)); return 0; } From 9a74aff3e892a8c95d41a5b67413c797712c5edc Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Mon, 18 Dec 2023 22:02:58 -0500 Subject: [PATCH 118/128] pma: significant revisions to freelist logic --- rust/ares_pma/c-src/btest.c | 36 +- rust/ares_pma/c-src/btree.c | 684 ++++++++++++++++++------------------ 2 files changed, 367 insertions(+), 353 deletions(-) diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c index 9bc6713..9a97d7f 100644 --- a/rust/ares_pma/c-src/btest.c +++ b/rust/ares_pma/c-src/btest.c @@ -25,7 +25,8 @@ _mlist_sizep(BT_mlistnode *head) { size_t sz = 0; while (head) { - sz += head->sz; + size_t sz_p = addr2off(head->hi) - addr2off(head->lo); + sz += sz_p; head = head->next; } return sz; @@ -37,7 +38,8 @@ _flist_sizep(BT_flistnode *head) { size_t sz = 0; while (head) { - sz += head->sz; + size_t sz_p = head->hi - head->lo; + sz += sz_p; head = head->next; } return sz; @@ -104,9 +106,9 @@ static int _mlist_eq(BT_mlistnode *l, BT_mlistnode *r) { while (l && r) { - if (l->sz != r->sz) + if (l->lo != r->lo) bp(0); - if (l->va != r->va) + if (l->hi != r->hi) bp(0); l = l->next; r = r->next; } @@ -119,9 +121,9 @@ static int _nlist_eq(BT_nlistnode *l, BT_nlistnode *r) { while (l && r) { - if (l->sz != r->sz) + if (l->lo != r->lo) bp(0); - if (l->va != r->va) + if (l->hi != r->hi) bp(0); l = l->next; r = r->next; } @@ -134,9 +136,9 @@ static int _flist_eq(BT_flistnode *l, BT_flistnode *r) { while (l && r) { - if (l->sz != r->sz) + if (l->lo != r->lo) bp(0); - if (l->pg != r->pg) + if (l->hi != r->hi) bp(0); l = l->next; r = r->next; } @@ -161,7 +163,7 @@ int main(int argc, char *argv[]) return errno; assert(SUCC(bt_state_open(state1, "./pmatest1", 0, 0644))); -#define LOWEST_ADDR 0x200000; +#define LOWEST_ADDR 0x2aaa80; vaof_t lo = LOWEST_ADDR; vaof_t hi = 0xDEADBEEF; pgno_t pg = 1; /* dummy value */ @@ -251,14 +253,14 @@ int main(int argc, char *argv[]) flist_sizp = _flist_sizep(state3->flist); mlist_sizp = _mlist_sizep(state3->mlist); alloc_sizp = 0; - /* for (size_t i = 0; i < ITERATIONS / 2; i++) { */ - /* /\* free half of the allocations *\/ */ - /* bt_free(state3, allocs[i].lo, allocs[i].hi); */ - /* alloc_sizp += allocs[i].hi - allocs[i].lo; */ - /* /\* validate size changes to mlist *\/ */ - /* assert(_mlist_sizep(state3->mlist) */ - /* == (mlist_sizp + alloc_sizp)); */ - /* } */ + for (size_t i = 0; i < ITERATIONS / 2; i++) { + /* free half of the allocations */ + bt_free(state3, allocs[i].lo, allocs[i].hi); + alloc_sizp += allocs[i].hi - allocs[i].lo; + /* validate size changes to mlist */ + assert(_mlist_sizep(state3->mlist) + == (mlist_sizp + alloc_sizp)); + } bt_sync(state3); diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 485f782..2796810 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -126,7 +126,7 @@ 
off2addr(vaof_t off) #define BT_PROT_CLEAN (PROT_READ) #define BT_FLAG_CLEAN (MAP_FIXED | MAP_SHARED) #define BT_PROT_FREE (PROT_NONE) -#define BT_FLAG_FREE (MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE) +#define BT_FLAG_FREE (MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED | MAP_NORESERVE) #define BT_PROT_DIRTY (PROT_READ | PROT_WRITE) #define BT_FLAG_DIRTY (MAP_FIXED | MAP_SHARED) @@ -250,13 +250,23 @@ static_assert(BT_DAT_MAXBYTES % sizeof(BT_dat) == 0); additional information */ #define BLK_BASE_LEN0 (MBYTES(2) - BT_META_SECTION_WIDTH) -#define BLK_BASE_LEN1 (BLK_BASE_LEN0 * 4) +#define BLK_BASE_LEN1 (MBYTES(8)) #define BLK_BASE_LEN2 (BLK_BASE_LEN1 * 4) #define BLK_BASE_LEN3 (BLK_BASE_LEN2 * 4) #define BLK_BASE_LEN4 (BLK_BASE_LEN3 * 4) #define BLK_BASE_LEN5 (BLK_BASE_LEN4 * 4) #define BLK_BASE_LEN6 (BLK_BASE_LEN5 * 4) #define BLK_BASE_LEN7 (BLK_BASE_LEN6 * 4) +#define BLK_BASE_LEN_TOTAL ( \ + BT_META_SECTION_WIDTH + \ + BLK_BASE_LEN0 + \ + BLK_BASE_LEN1 + \ + BLK_BASE_LEN2 + \ + BLK_BASE_LEN3 + \ + BLK_BASE_LEN4 + \ + BLK_BASE_LEN5 + \ + BLK_BASE_LEN6 + \ + BLK_BASE_LEN7) typedef struct BT_meta BT_meta; struct BT_meta { #define BT_NUMROOTS 32 @@ -288,22 +298,24 @@ static_assert(sizeof(BT_meta) <= BT_DAT_MAXBYTES); typedef struct BT_mlistnode BT_mlistnode; struct BT_mlistnode { - void *va; /* virtual address */ - size_t sz; /* size in pages */ + /* ;;: lo and hi might as well by (BT_page *) because we don't have any reason + to have finer granularity */ + BYTE *lo; /* low virtual address */ + BYTE *hi; /* high virtual address */ BT_mlistnode *next; /* next freelist node */ }; typedef struct BT_nlistnode BT_nlistnode; struct BT_nlistnode { - BT_page *va; /* virtual address */ - size_t sz; /* size in pages */ + BT_page *lo; /* low virtual address */ + BT_page *hi; /* high virtual address */ BT_nlistnode *next; /* next freelist node */ }; typedef struct BT_flistnode BT_flistnode; struct BT_flistnode { - pgno_t pg; /* pgno - an offset in the persistent file */ - size_t sz; /* size in pages */ + pgno_t lo; /* low pgno in persistent file */ + pgno_t hi; /* high pgno in persistent file */ BT_flistnode *next; /* next freelist node */ }; @@ -399,28 +411,28 @@ _bt_nalloc(BT_state *state) BT_page *ret = 0; for (; *n; n = &(*n)->next) { - /* ;;: this assert is temporary. When partition striping is - implemented. Rather than assert, conditionally check if we're at the - end of the current stripe. If so, allocate a new region and append that - to the freelist. 
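With the freelist nodes now carrying [lo, hi) bounds instead of a base and size, _bt_nalloc becomes a first-fit scan over intervals: an exact fit unlinks the node, a larger interval is shrunk from the left. A condensed, self-contained sketch of that loop (simplified types, 16K pages assumed; the real function also mprotects the returned page and, for now, aborts when the list is exhausted):

#include <stdlib.h>

typedef struct page { unsigned char bytes[16384]; } page;   /* assumed 16K pages */

typedef struct nlnode nlnode;
struct nlnode {
  page   *lo;    /* first free node page */
  page   *hi;    /* one past the last free node page */
  nlnode *next;
};

static page *
nalloc(nlnode **list)
{
  for (nlnode **n = list; *n; n = &(*n)->next) {
    size_t sz_p = (*n)->hi - (*n)->lo;
    if (sz_p == 1) {               /* perfect fit: unlink and free the node */
      page *ret = (*n)->lo;
      nlnode *prev = *n;
      *n = (*n)->next;
      free(prev);
      return ret;
    }
    if (sz_p > 1) {                /* larger than needed: shrink from the left */
      page *ret = (*n)->lo;
      (*n)->lo += 1;
      return ret;
    }
  }
  return NULL;                     /* exhausted; the caller must grow the stripe */
}
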
*/ - size_t width = (BYTE *)state->nlist->va - state->map; - /* ;;: asserting 2M for now since partition striping is unimplemented */ - assert(width < MBYTES(2)); + size_t sz_p = (*n)->hi - (*n)->lo; /* perfect fit */ - if ((*n)->sz == 1) { - ret = (*n)->va; + if (sz_p == 1) { + ret = (*n)->lo; + BT_nlistnode *prev = *n; *n = (*n)->next; + free(prev); break; } /* larger than necessary: shrink the node */ - if ((*n)->sz > 1) { - ret = (*n)->va; - (*n)->sz -= 1; - (*n)->va = (*n)->va + 1; + if (sz_p > 1) { + ret = (*n)->lo; + (*n)->lo += 1; break; } } + if (ret == 0) { + DPUTS("nlist out of mem!"); + abort(); + } + /* make node writable */ if (mprotect(ret, sizeof(BT_page), BT_PROT_DIRTY) != 0) { DPRINTF("mprotect of node: %p failed with %s", ret, strerror(errno)); @@ -511,7 +523,7 @@ static void _bt_root_new(BT_meta *meta, BT_page *root) { /* The first usable address in the PMA is just beyond the first node stripe */ - root->datk[0].va = meta->blk_base[0] + BLK_BASE_LEN0; + root->datk[0].va = B2PAGES(BLK_BASE_LEN_TOTAL); root->datk[0].fo = 0; root->datk[1].va = UINT32_MAX; root->datk[1].fo = 0; @@ -777,108 +789,118 @@ _bt_delco_1pass(BT_state *state, vaof_t lo, vaof_t hi) static void _mlist_insert(BT_state *state, void *lo, void *hi) -{ /* ;;: this logic could be simplified with indirect pointers */ - BT_mlistnode *head = state->mlist; +{ + BT_mlistnode **dst = &state->mlist; + BT_mlistnode **prev_dst = 0; BYTE *lob = lo; BYTE *hib = hi; - assert(head); + while(*dst) { + if (hib == (*dst)->lo) { + (*dst)->lo = lob; + /* check if we can coalesce with left neighbor */ + if (prev_dst != 0) { + bp(0); /* ;;: note, this case should not hit. keeping for debugging. */ + /* dst equals &(*prev_dst)->next */ + assert(*prev_dst != 0); + if ((*prev_dst)->hi == lob) { + (*prev_dst)->hi = (*dst)->hi; + (*prev_dst)->next = (*dst)->next; + free(*dst); + } + } + return; + } + if (lob == (*dst)->hi) { + (*dst)->hi = hi; + /* check if we can coalesce with right neighbor */ + if ((*dst)->next != 0) { + if (hib == (*dst)->next->lo) { + (*dst)->hi = (*dst)->next->hi; + BT_mlistnode *dst_next = (*dst)->next; + (*dst)->next = (*dst)->next->next; + free(dst_next); + } + } + return; + } + if (hib > (*dst)->lo) { + assert(lob > (*dst)->hi); + assert(hib > (*dst)->hi); + prev_dst = dst; + dst = &(*dst)->next; + continue; + } - /* special case: freed chunk precedes but is not contiguous with head */ - if (hi < head->va) { + /* otherwise, insert discontinuous node */ BT_mlistnode *new = calloc(1, sizeof *new); - new->sz = B2PAGES(hib - lob); - new->va = lob; - new->next = head; - state->mlist = new; + new->lo = lob; + new->hi = hib; + new->next = *dst; + *dst = new; return; } - while (head) { - BYTE *vob = head->va; - size_t siz = head->sz; - BYTE *nob = head->next ? head->next->va : 0; - - /* freed chunk immediately precedes head */ - if (hib == vob) { - head->va = lo; - head->sz += B2PAGES(hib - lob); - return; - } - /* freed chunk immediately follows termination of head */ - if (vob + siz == lo) { - head->sz += B2PAGES(hib - lob); - return; - } - /* freed chunk between head and next but not contiguous */ - if (lob > vob + siz - && hib < nob) { - BT_mlistnode *new = calloc(1, sizeof *new); - new->sz = B2PAGES(hib - lob); - new->va = lob; - new->next = head->next; - head->next = new; - return; - } - head = head->next; - } - /* freelist completely searched. 
Chunk must be at tail and not contiguous */ + /* found end of list */ BT_mlistnode *new = calloc(1, sizeof *new); - new->sz = B2PAGES(hib - lob); - new->va = lob; - new->next = head->next; - head->next = new; + new->lo = lob; + new->hi = hib; + new->next = 0; + (*dst) = new; } static void -_pending_nlist_insert(BT_state *state, pgno_t nodepg) +_nlist_insert(BT_state *state, BT_nlistnode **dst, pgno_t nodepg) { - /* ;;: todo: need to account for a null head */ - BT_nlistnode *head = state->pending_nlist; - BT_page *va = _node_get(state, nodepg); + BT_nlistnode **prev_dst = 0; + BT_page *lo = _node_get(state, nodepg); + BT_page *hi = lo+1; - /* freelist may be empty. create head */ - if (head == 0) { - state->pending_nlist = calloc(1, sizeof *state->pending_nlist); - state->pending_nlist->sz = 1; - state->pending_nlist->va = va; + while(*dst) { + if (hi == (*dst)->lo) { + (*dst)->lo = lo; + /* check if we can coalesce with left neighbor */ + if (prev_dst != 0) { + bp(0); /* ;;: note, this case should not hit. keeping for debugging. */ + /* dst equals &(*prev_dst)->next */ + assert(*prev_dst != 0); + if ((*prev_dst)->hi == lo) { + (*prev_dst)->hi = (*dst)->hi; + (*prev_dst)->next = (*dst)->next; + free(*dst); + } + } + return; + } + if (lo == (*dst)->hi) { + (*dst)->hi = hi; + /* check if we can coalesce with right neighbor */ + if ((*dst)->next != 0) { + if (hi == (*dst)->next->lo) { + (*dst)->hi = (*dst)->next->hi; + BT_nlistnode *dst_next = (*dst)->next; + (*dst)->next = (*dst)->next->next; + free(dst_next); + } + } + return; + } + if (hi > (*dst)->lo) { + assert(lo > (*dst)->hi); + assert(hi > (*dst)->hi); + prev_dst = dst; + dst = &(*dst)->next; + continue; + } + + /* otherwise, insert discontinuous node */ + BT_nlistnode *new = calloc(1, sizeof *new); + new->lo = lo; + new->hi = hi; + new->next = *dst; + *dst = new; return; } - - if (!head->next) { - if (head->va < va) - goto append; - /* otherwise prepend and update mlist head reference */ - BT_nlistnode *new = calloc(1, sizeof *new); - new->sz = 1; - new->va = va; - new->next = head; - state->nlist = new; - } - - /* we don't need to account for a freelist node's size because we aren't - coalescing the pending freelists */ - while (head) { - BT_page *nva = head->next ? head->next->va : (void*)-1; - if (nva > va) - break; - head = head->next; - } - - append: - /* head->next is either null or has a higher address than va */ - BT_nlistnode *new = calloc(1, sizeof *new); - new->sz = 1; - new->va = va; - new->next = head->next; - head->next = new; -} - -static BT_nlistnode * -_nlist_find(BT_nlistnode *head, BT_page *va) -/* find a node */ -{ - } static void @@ -893,36 +915,33 @@ _pending_nlist_merge(BT_state *state) return; } + /* ;;: TODO: you still need to coalesce neighbor nodes in dst if we widen + them */ + /* check if src node should be merged with dst **************************/ - BT_page *dst_va = (*dst_head)->va; - size_t dst_sz = (*dst_head)->sz; - BT_page *src_va = (*src_head)->va; - /* NB: while we don't currently coalesce the pending nlist, it's not that - hard to account for if we did, so might as well generalize the merge - algorithm */ - size_t src_sz = (*src_head)->sz; - BT_page *dst_next_va = *dst_head ? (*dst_head)->next->va : 0; + BT_page *dst_nlo = (*dst_head)->next ? 
(*dst_head)->next->lo : 0; /* source node immediately follows dst node's termination */ - if (dst_va + dst_sz == src_va) { - (*dst_head)->sz += src_sz; /* widen dst node */ + if ((*dst_head)->hi == (*src_head)->lo) { + /* expand dst node */ + (*dst_head)->hi = (*src_head)->hi; /* advance src node and free previous */ BT_nlistnode *prev = *src_head; src_head = &(*src_head)->next; free(prev); } /* source node's termination immediately precedes dst node */ - else if (dst_next_va == src_va + src_sz) { - (*dst_head)->va = src_va; /* pull va back */ - (*dst_head)->sz += src_sz; /* widen node */ + else if ((*src_head)->hi == (*dst_head)->lo) { + /* expand dst node */ + (*src_head)->lo = (*dst_head)->lo; /* advance src node and free previous */ BT_nlistnode *prev = *src_head; src_head = &(*src_head)->next; free(prev); } - /* src node lies between but isn't contiguous with dst */ - else if (src_va > dst_va + dst_sz - && src_va + src_sz < dst_next_va) { + /* src node is discontiguously between dst head and next */ + else if ((*src_head)->lo > (*dst_head)->hi + && (*src_head)->hi < dst_nlo) { /* link src node in */ (*src_head)->next = (*dst_head)->next; (*dst_head)->next = *src_head; @@ -939,54 +958,55 @@ _pending_nlist_merge(BT_state *state) } static void -_pending_flist_insert(BT_state *state, pgno_t pg, size_t sz) -{ /* ;;: again, this logic could probably be simplified with an indirect pointer */ - BT_flistnode *head = state->pending_flist; +_flist_insert(BT_flistnode **dst, pgno_t lo, pgno_t hi) +{ + BT_flistnode **prev_dst = 0; - /* freelist may be empty. create head */ - if (head == 0) { - state->pending_flist = calloc(1, sizeof *state->pending_flist); - state->pending_flist->pg = pg; - state->pending_flist->sz = sz; - return; - } - - while (head->next) { - /* next node starts at pg higher than this freechunk's termination */ - if (head->next->pg >= pg + sz) { - break; + while(*dst) { + if (hi == (*dst)->lo) { + (*dst)->lo = lo; + /* check if we can coalesce with left neighbor */ + if (prev_dst != 0) { + bp(0); /* ;;: note, this case should not hit. keeping for debugging. 
*/ + /* dst equals &(*prev_dst)->next */ + assert(*prev_dst != 0); + if ((*prev_dst)->hi == lo) { + (*prev_dst)->hi = (*dst)->hi; + (*prev_dst)->next = (*dst)->next; + free(*dst); + } + } + return; + } + if (lo == (*dst)->hi) { + (*dst)->hi = hi; + /* check if we can coalesce with right neighbor */ + if ((*dst)->next != 0) { + if (hi == (*dst)->next->lo) { + (*dst)->hi = (*dst)->next->hi; + BT_flistnode *dst_next = (*dst)->next; + (*dst)->next = (*dst)->next->next; + free(dst_next); + } + } + return; + } + if (hi > (*dst)->lo) { + assert(lo > (*dst)->hi); + assert(hi > (*dst)->hi); + prev_dst = dst; + dst = &(*dst)->next; + continue; } - head = head->next; - } - /* if freed chunk follows head, expand head */ - if (head->pg + head->sz == pg) { - head->sz += sz; + /* otherwise, insert discontinuous node */ + BT_flistnode *new = calloc(1, sizeof *new); + new->lo = lo; + new->hi = hi; + new->next = *dst; + *dst = new; return; } - - /* if the freed chunk precedes next, expand next and pull pg back */ - if (head->next->pg == pg + sz) { - head->next->pg = pg; - head->next->sz += sz; - return; - } - - /* otherwise, insert a new node either preceding or following head */ - BT_flistnode *new = calloc(1, sizeof *new); - new->pg = pg; - new->sz = sz; - - if (head->pg < pg + sz) { - /* should only happen if head is the first node in the freelist */ - assert(head == state->pending_flist); - new->next = head; - state->pending_flist = new; - } - else { - new->next = head->next; - head->next = new; - } } static void @@ -1001,33 +1021,33 @@ _pending_flist_merge(BT_state *state) return; } + /* ;;: TODO: you still need to coalesce neighbor nodes in dst if we widen + them */ + /* check if src node should be merged with dst **************************/ - pgno_t dst_pg = (*dst_head)->pg; - size_t dst_sz = (*dst_head)->sz; - pgno_t src_pg = (*src_head)->pg; - size_t src_sz = (*src_head)->sz; - pgno_t dst_next_pg = *dst_head ? (*dst_head)->next->pg : 0; + pgno_t dst_nlo = (*dst_head)->next ? 
(*dst_head)->next->lo : 0; /* source node immediately follows dst node's termination */ - if (dst_pg + dst_sz == src_pg) { - (*dst_head)->sz += src_sz; /* widen dst node */ + if ((*dst_head)->hi == (*src_head)->lo) { + /* expand dst node */ + (*dst_head)->hi = (*src_head)->hi; /* advance src node and free previous */ BT_flistnode *prev = *src_head; src_head = &(*src_head)->next; free(prev); } /* source node's termination immediately precedes dst node */ - else if (src_pg + src_sz == dst_pg) { - (*dst_head)->pg = src_pg; /* pull page back */ - (*dst_head)->sz += src_sz; /* widen node */ + else if ((*src_head)->hi == (*dst_head)->lo) { + /* expand dst node */ + (*src_head)->lo = (*dst_head)->lo; /* advance src node and free previous */ BT_flistnode *prev = *src_head; src_head = &(*src_head)->next; free(prev); } - /* src node lies between but isn't contiguous with dst */ - else if (dst_next_pg > src_pg + src_sz - && dst_pg + dst_sz < src_pg) { + /* src node is discontiguously between dst head and next */ + else if ((*src_head)->lo > (*dst_head)->hi + && (*src_head)->hi < dst_nlo) { /* link src node in */ (*src_head)->next = (*dst_head)->next; (*dst_head)->next = *src_head; @@ -1046,30 +1066,40 @@ _pending_flist_merge(BT_state *state) /* ;;: todo move shit around */ static void -_bt_delco_droptree2(BT_state *state, pgno_t nodepg, uint8_t depth, uint8_t maxdepth) +_bt_delco_droptree2(BT_state *state, pgno_t nodepg, + uint8_t depth, uint8_t maxdepth, int isdirty) { + int ischilddirty = 0; + /* branch */ if (depth != maxdepth) { BT_page *node = _node_get(state, nodepg); - for (size_t i = 0; i < BT_DAT_MAXKEYS; i++) { + for (size_t i = 0; i < BT_DAT_MAXKEYS; i++) { BT_kv entry = node->datk[i]; if (entry.fo == 0) break; /* done */ - _bt_delco_droptree2(state, entry.fo, depth+1, maxdepth); + ischilddirty = _bt_ischilddirty(node, i); + _bt_delco_droptree2(state, entry.fo, depth+1, maxdepth, ischilddirty); } } - _pending_nlist_insert(state, nodepg); + /* branch and leaf */ + if (isdirty) { + _nlist_insert(state, &state->nlist, nodepg); + } + else { + _nlist_insert(state, &state->pending_nlist, nodepg); + } } static void -_bt_delco_droptree(BT_state *state, pgno_t nodepg, uint8_t depth) +_bt_delco_droptree(BT_state *state, pgno_t nodepg, uint8_t depth, int isdirty) { /* completely drop a tree. 
Assume that all leaves under the tree are free (pgno = 0) */ assert(nodepg >= 2); BT_meta *meta = state->meta_pages[state->which]; - return _bt_delco_droptree2(state, nodepg, depth, meta->depth); + return _bt_delco_droptree2(state, nodepg, depth, meta->depth, isdirty); } static void @@ -1099,7 +1129,8 @@ _bt_delco_trim_rsubtree_lhs2(BT_state *state, vaof_t lo, vaof_t hi, pgno_t childpg = node->datk[i].fo; if (childpg == 0) break; - _bt_delco_droptree(state, childpg, depth+1); + int ischilddirty = _bt_ischilddirty(node, i); + _bt_delco_droptree(state, childpg, depth+1, ischilddirty); } } @@ -1160,7 +1191,8 @@ _bt_delco_trim_lsubtree_rhs2(BT_state *state, vaof_t lo, vaof_t hi, pgno_t childpg = node->datk[i].fo; if (childpg == 0) break; - _bt_delco_droptree(state, childpg, depth+1); + int ischilddirty = _bt_ischilddirty(node, i); + _bt_delco_droptree(state, childpg, depth+1, ischilddirty); } } @@ -1275,7 +1307,8 @@ _bt_delco(BT_state *state, vaof_t lo, vaof_t hi, /* drop all trees between the two subtrees */ for (size_t i = loidx+1; i < hiidx; i++) { pgno_t childpg = node->datk[i].fo; - _bt_delco_droptree(state, childpg, depth); + int ischilddirty = _bt_ischilddirty(node, i); + _bt_delco_droptree(state, childpg, depth+1, ischilddirty); } /* move buffer */ @@ -1458,28 +1491,25 @@ _bt_delete(BT_state *state, vaof_t lo, vaof_t hi) static int _mlist_new(BT_state *state) { - /* implemented separate from _mlist_read since _mlist_read uses lo va == 0 to - stop parsing node's data. This, however, is a valid starting condition when - freshly creating the btree */ - BT_meta *meta = state->meta_pages[state->which]; BT_page *root = _node_get(state, meta->root); - assert(root->datk[0].fo == 0); + /* assert(root->datk[0].fo == 0); */ + size_t N = _bt_numkeys(root); vaof_t lo = root->datk[0].va; - vaof_t hi = root->datk[1].va; - size_t len = hi - lo; + vaof_t hi = root->datk[N-1].va; BT_mlistnode *head = calloc(1, sizeof *head); head->next = 0; - head->sz = len; - head->va = off2addr(lo); + head->lo = off2addr(lo); + head->hi = off2addr(hi); state->mlist = head; return BT_SUCC; } +#if 0 static int _flist_grow(BT_state *state, BT_flistnode *space) /* growing the flist consists of expanding the backing persistent file, pushing @@ -1500,7 +1530,7 @@ _flist_grow(BT_state *state, BT_flistnode *space) for (; tail->next; tail = tail->next) ; - pgno_t lastpgfree = tail->pg + tail->sz; + pgno_t lastpgfree = tail->hi; /* ;;: TODO, make sure you are certain of this logic. Further, add assertions regarding relative positions of state->file_size, state->frontier, and @@ -1517,7 +1547,7 @@ _flist_grow(BT_state *state, BT_flistnode *space) /* if the frontier (last pg in use) is less than the last page free, we should coalesce the new node with the tail. 
*/ if (state->frontier <= lastpgfree) { - tail->sz += PMA_GROW_SIZE; + tail->hi += PMA_GROW_SIZE; /* ;;: THIS IS INCORRECT */ } /* otherwise, a new node needs to be allocated */ else { @@ -1525,7 +1555,7 @@ _flist_grow(BT_state *state, BT_flistnode *space) /* since the frontier exceeds the last pg free, new freespace should naturally be allocated at the frontier */ new->pg = state->frontier; - new->sz = PMA_GROW_SIZE; + new->hi = PMA_GROW_SIZE; tail->next = new; } @@ -1534,6 +1564,7 @@ _flist_grow(BT_state *state, BT_flistnode *space) return BT_SUCC; } +#endif static int _flist_new(BT_state *state) @@ -1541,17 +1572,17 @@ _flist_new(BT_state *state) { BT_meta *meta = state->meta_pages[state->which]; BT_page *root = _node_get(state, meta->root); - assert(root->datk[0].fo == 0); + /* assert(root->datk[0].fo == 0); */ + size_t N = _bt_numkeys(root); vaof_t lo = root->datk[0].va; - vaof_t hi = root->datk[1].va; + vaof_t hi = root->datk[N-1].va; size_t len = hi - lo; BT_flistnode *head = calloc(1, sizeof *head); - head->next = 0; - head->sz = len; - head->pg = FLIST_PG_START; + head->lo = FLIST_PG_START; + head->hi = FLIST_PG_START + len; state->flist = head; return BT_SUCC; @@ -1559,14 +1590,13 @@ _flist_new(BT_state *state) static int _nlist_new(BT_state *state) -#define NLIST_PG_START 2 /* the third page */ { BT_meta *meta = state->meta_pages[state->which]; BT_nlistnode *head = calloc(1, sizeof *head); /* the size of a new node freelist is just the first stripe length */ - head->sz = BLK_BASE_LEN0; - head->va = &((BT_page *)state->map)[BT_NUMMETAS]; + head->lo = &((BT_page *)state->map)[BT_NUMMETAS]; + head->hi = head->lo + BLK_BASE_LEN0; head->next = 0; state->nlist = head; @@ -1588,6 +1618,7 @@ _nlist_delete(BT_state *state) return BT_SUCC; } +#if 0 static BT_nlistnode * _nlist_read_prev(BT_nlistnode *head, BT_nlistnode *curr) { @@ -1818,6 +1849,7 @@ _mlist_read(BT_state *state) state->mlist = head; return BT_SUCC; } +#endif static int _mlist_delete(BT_state *state) @@ -1833,103 +1865,11 @@ _mlist_delete(BT_state *state) return BT_SUCC; } -static void -_flist_split(BT_flistnode *head, BT_flistnode **left, BT_flistnode **right) -/* split flist starting at head into two lists, left and right at the midpoint - of head */ -{ - assert(head != 0); - BT_flistnode *slow, *fast; - slow = head; fast = head->next; - - while (fast) { - fast = fast->next; - if (fast) { - slow = slow->next; - fast = fast->next; - } - } - - *left = head; - *right = slow->next; - slow->next = 0; -} - -static BT_flistnode * -_flist_merge2(BT_flistnode *l, BT_flistnode *r) -/* returns the furthest node in l that has a pg less than the first node in r */ -{ - assert(l); - assert(r); - - BT_flistnode *curr, *prev; - prev = l; - curr = l->next; - - while (curr) { - if (curr->pg < r->pg) { - prev = curr; - curr = curr->next; - } - } - - if (prev->pg < r->pg) - return prev; - - return 0; -} - -static BT_flistnode * -_flist_merge(BT_flistnode *l, BT_flistnode *r) -/* merge two sorted flists, l and r and return the sorted result */ -{ - BT_flistnode *head; - - if (!l) return r; - if (!r) return l; - - while (l && r) { - if (l->next == 0) { - l->next = r; - break; - } - if (r->next == 0) { - break; - } - - BT_flistnode *ll = _flist_merge2(l, r); - BT_flistnode *rnext = r->next; - /* insert head of r into appropriate spot in l */ - r->next = ll->next; - ll->next = r; - /* adjust l and r heads */ - l = ll->next; - r = rnext; - } - - return head; -} - -BT_flistnode * -_flist_mergesort(BT_flistnode *head) -{ - if (head == 0 || 
head->next == 0) - return head; - - BT_flistnode *l, *r; - _flist_split(head, &l, &r); - - /* ;;: todo, make it non-recursive. Though, shouldn't matter as much here - since O(log n). merge already non-recursive */ - _flist_mergesort(l); - _flist_mergesort(r); - - return _flist_merge(l, r); -} - +#if 0 BT_flistnode * _flist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth) { + size_t N = _bt_numkeys(node); /* leaf */ if (depth == maxdepth) { BT_flistnode *head, *prev; @@ -1938,7 +1878,7 @@ _flist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth) /* ;;: fixme the head won't get populated in this logic */ size_t i = 0; BT_kv *kv = &node->datk[i]; - while (i < BT_DAT_MAXKEYS - 1) { + while (i < N-1) { /* Just blindly append nodes since they aren't guaranteed sorted */ BT_flistnode *new = calloc(1, sizeof *new); vaof_t hi = node->datk[i+1].va; @@ -1952,6 +1892,15 @@ _flist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth) kv = &node->datk[++i]; } + for (size_t i = 0; i < N-1; i++) { + vaof_t hi = node->datk[i+1].va; + vaof_t lo = node->datk[i].va; + size_t len = hi - lo; + pgno_t fo = node->datk[i].fo; + /* not free */ + if (fo != 0) + continue; + } return head; } @@ -1959,7 +1908,7 @@ _flist_read2(BT_state *state, BT_page *node, uint8_t maxdepth, uint8_t depth) size_t i = 0; BT_flistnode *head, *prev; head = prev = 0; - for (; i < BT_DAT_MAXKEYS; ++i) { + for (; i < N; ++i) { BT_kv kv = node->datk[i]; if (kv.fo == BT_NOPAGE) continue; @@ -2010,6 +1959,7 @@ _flist_read(BT_state *state) state->flist = head; return BT_SUCC; } +#endif static int _flist_delete(BT_state *state) @@ -2094,7 +2044,7 @@ _bt_state_restore_maps2(BT_state *state, BT_page *node, return; } - /* branch - bfs all subtrees */ + /* branch - dfs all subtrees */ for (size_t i = 0; i < N-1; i++) { /* ;;: assuming node stripes when partition striping is implemented will be 1:1 mapped to disk for simplicity. 
If that is not the case, they should @@ -2275,6 +2225,39 @@ _bt_state_meta_new(BT_state *state) return BT_SUCC; } +static void +_mlist_record_alloc(BT_state *state, BYTE *lo, BYTE *hi) +/* record an allocation in the mlist */ +{ + +} + +static void +_freelist_restore2(BT_state *state, BT_page *node, + uint8_t depth, uint8_t maxdepth) +{ + size_t N = _bt_numkeys(node); + + /* leaf */ + if (depth == maxdepth) { + + } + /* branch */ + +} + +static void +_freelist_restore(BT_state *state) +/* restores the mlist, nlist, and mlist */ +{ + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); + assert(SUCC(_nlist_new(state))); + assert(SUCC(_mlist_new(state))); + assert(SUCC(_flist_new(state))); + _freelist_restore2(state, root, 1, meta->depth); +} + static int _bt_state_load(BT_state *state) { @@ -2313,7 +2296,7 @@ _bt_state_load(BT_state *state) } BYTE *nullspace_addr = BT_MAPADDR + (BT_META_SECTION_WIDTH + BLK_BASE_LEN0); - size_t nullspace_len = BT_ADDRSIZE - (BT_META_SECTION_WIDTH + BLK_BASE_LEN0); + size_t nullspace_len = BLK_BASE_LEN_TOTAL - (BT_META_SECTION_WIDTH + BLK_BASE_LEN0); if (nullspace_addr != mmap(nullspace_addr, nullspace_len, BT_PROT_FREE, @@ -2348,9 +2331,7 @@ _bt_state_load(BT_state *state) _bt_state_restore_maps(state); /* restore ephemeral freelists */ - assert(SUCC(_nlist_read(state))); - assert(SUCC(_mlist_read(state))); - assert(SUCC(_flist_read(state))); + _freelist_restore(state); if (fstat(state->data_fd, &stat) != 0) return errno; @@ -2375,26 +2356,27 @@ _bt_falloc(BT_state *state, size_t pages) pgno_t ret = 0; /* first fit */ - /* ;;: is there any reason to use a different allocation strategy for disk? */ for (; *n; n = &(*n)->next) { + size_t sz_p = (*n)->hi - (*n)->lo; /* perfect fit */ - if ((*n)->sz == pages) { - pgno_t ret; - ret = (*n)->pg; + if (sz_p == pages) { + pgno_t ret = (*n)->lo; + BT_flistnode *prev = *n; *n = (*n)->next; + free(prev); return ret; } /* larger than necessary: shrink the node */ - if ((*n)->sz > pages) { + if (sz_p > pages) { pgno_t ret; - ret = (*n)->pg; - (*n)->sz -= pages; - (*n)->pg = (*n)->pg + pages; + ret = (*n)->lo; + (*n)->lo += pages; return ret; } } - return 0; + DPUTS("flist out of mem!"); + abort(); } static int @@ -2642,17 +2624,20 @@ bt_malloc(BT_state *state, size_t pages) void *ret = 0; /* first fit */ for (; *n; n = &(*n)->next) { + size_t sz_p = addr2off((*n)->hi) - addr2off((*n)->lo); + /* perfect fit */ - if ((*n)->sz == pages) { - ret = (*n)->va; + if (sz_p == pages) { + ret = (*n)->lo; + BT_mlistnode *prev = *n; *n = (*n)->next; + free(prev); break; } /* larger than necessary: shrink the node */ - if ((*n)->sz > pages) { - ret = (*n)->va; - (*n)->sz -= pages; - (*n)->va = (BT_page *)(*n)->va + pages; + if (sz_p > pages) { + ret = (*n)->lo; + (*n)->lo = (void *)((BT_page *)(*n)->lo + pages); break; } // XX return early if nothing suitable found in freelist @@ -2684,10 +2669,36 @@ bt_malloc(BT_state *state, size_t pages) void bt_free(BT_state *state, void *lo, void *hi) { + BT_meta *meta = state->meta_pages[state->which]; + BT_page *root = _node_get(state, meta->root); vaof_t looff = addr2off(lo); vaof_t hioff = addr2off(hi); + pgno_t lopg, hipg; + BT_findpath path = {0}; + + if (!SUCC(_bt_find(state, &path, looff, hioff))) { + DPRINTF("Failed to find range: (%p, %p)", lo, hi); + abort(); + } + + /* insert null into btree */ _bt_insert(state, looff, hioff, 0); + /* insert freed range into mlist */ _mlist_insert(state, lo, hi); + /* insert freed range into flist */ + 
BT_page *leaf = path.path[path.depth]; + size_t childidx = path.idx[path.depth]; + int isdirty = _bt_ischilddirty(leaf, childidx); + BT_kv kv = leaf->datk[childidx]; + vaof_t offset = looff - kv.va; + lopg = kv.fo + offset; + hipg = lopg + (looff - hioff); + if (isdirty) { + _flist_insert(&state->flist, lopg, hipg); + } + else { + _flist_insert(&state->pending_flist, lopg, hipg); + } /* ;;: is this correct? Shouldn't this actually happen when we merge the pending_mlist on sync? */ @@ -2872,7 +2883,7 @@ _bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi, pgno_t pg) _bt_insert(state, lo, hi, newpg); - _pending_flist_insert(state, pg, len); + _flist_insert(&state->pending_flist, pg, pg + len); return newpg; } @@ -2945,20 +2956,21 @@ bt_next_alloc(BT_state *state, void *p, void **lo, void **hi) it falls in. */ { BT_mlistnode *head = state->mlist; + BYTE *pb = p; while (head) { /* at last free block, different logic applies */ if (head->next == 0) goto end; /* p is in a free range, return the allocated hole after it */ - if (head->va <= p - && head->va + head->sz > p) { + if (head->lo <= pb + && head->hi > pb) { goto found; } /* p is alloced, return this hole */ - if (head->next->va > p - && head->va + head->sz <= p) { + if (head->next->lo > pb + && head->hi <= pb) { goto found; } @@ -2970,21 +2982,21 @@ bt_next_alloc(BT_state *state, void *p, void **lo, void **hi) found: /* the alloced space begins at the end of the free block */ - *lo = head->va + head->sz; + *lo = head->hi; /* ... and ends at the start of the next free block */ - *hi = head->next->va; + *hi = head->next->lo; return BT_SUCC; end: - void *pma_end = (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE); - assert(head->va + head->sz <= pma_end); + BYTE *pma_end = (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE); + assert(head->hi <= pma_end); /* no alloced region between tail of freelist and end of pma memory space */ - if (head->va + head->sz == pma_end) + if (head->hi == pma_end) return 1; /* otherwise, return the alloced region between the tail of the freelist and the end of the memory arena */ - *lo = head->va + head->sz; + *lo = head->hi; *hi = pma_end; return BT_SUCC; } From f601a61827b4c17d222836c5ad6e0ab8eb07b7be Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Tue, 19 Dec 2023 09:56:34 -0600 Subject: [PATCH 119/128] pma: clean up a bunch of warnings (not all though) --- rust/ares_pma/c-src/btree.c | 52 +++++++++++++++---------------------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 2796810..9992dcb 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -102,7 +102,6 @@ addr2off(void *p) { uintptr_t pu = (uintptr_t)p; assert((pu & ((1 << BT_PAGEBITS) - 1)) == 0); /* p must be page-aligned */ - uintptr_t off = pu - (uintptr_t)BT_MAPADDR; return (vaof_t)(pu >> BT_PAGEBITS); } @@ -148,7 +147,7 @@ off2addr(vaof_t off) FO2PA: file offset to page get a reference to a BT_page from a file offset - /* ;;: can simplify: + ;;: can simplify: ((BT_page*)state->map)[fo] */ @@ -178,7 +177,6 @@ struct BT_pageheader { /* btree key/value data format -/* BT_dat is used to provide a view of the data section in a BT_page where data is stored like: va fo va fo @@ -353,7 +351,7 @@ struct BT_state { //// =========================================================================== //// btree internal routines -static void _bt_printnode(BT_page *node); /* ;;: tmp */ +static void _bt_printnode(BT_page *node) __attribute__((unused)); /* ;;: tmp */ 
static int _bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo, BT_page *parent, size_t childidx); /* ;;: tmp */ @@ -456,7 +454,7 @@ static void * _bt_bsearch(BT_page *page, vaof_t va) { /* ;;: todo: actually bsearch rather than linear */ - for (BT_kv *kv = &page->datk[0]; kv <= BT_dat_maxva(page); kv++) { + for (BT_kv *kv = &page->datk[0]; kv <= (BT_kv *)BT_dat_maxva(page); kv++) { if (kv->va == va) return kv; } @@ -753,7 +751,6 @@ _bt_delco_1pass_0(BT_state *state, vaof_t lo, vaof_t hi, /* node->datk[loidx] - node->datk[hiidx] are the bounds on which to perform the dfs */ for (i = loidx; i < hiidx; i++) { - vaof_t llo = node->datk[i].va; pgno_t pg = node->datk[i].fo; /* if at the leaf level, terminate with failure if pg is not free */ @@ -1099,7 +1096,7 @@ _bt_delco_droptree(BT_state *state, pgno_t nodepg, uint8_t depth, int isdirty) (pgno = 0) */ assert(nodepg >= 2); BT_meta *meta = state->meta_pages[state->which]; - return _bt_delco_droptree2(state, nodepg, depth, meta->depth, isdirty); + _bt_delco_droptree2(state, nodepg, depth, meta->depth, isdirty); } static void @@ -1152,7 +1149,7 @@ _bt_delco_trim_rsubtree_lhs2(BT_state *state, vaof_t lo, vaof_t hi, return; /* otherwise, recur on subtree */ pgno_t rsubtree = node->datk[hiidx].fo; - return _bt_delco_trim_rsubtree_lhs2(state, lo, hi, rsubtree, depth+1, maxdepth); + _bt_delco_trim_rsubtree_lhs2(state, lo, hi, rsubtree, depth+1, maxdepth); } static void @@ -1160,7 +1157,7 @@ _bt_delco_trim_rsubtree_lhs(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg, uint8_t depth) { BT_meta *meta = state->meta_pages[state->which]; - return _bt_delco_trim_rsubtree_lhs2(state, lo, hi, nodepg, depth, meta->depth); + _bt_delco_trim_rsubtree_lhs2(state, lo, hi, nodepg, depth, meta->depth); } static void @@ -1211,7 +1208,7 @@ _bt_delco_trim_lsubtree_rhs2(BT_state *state, vaof_t lo, vaof_t hi, return; /* otherwise, recur on the left subtree */ pgno_t lsubtree = node->datk[loidx].fo; - return _bt_delco_trim_lsubtree_rhs2(state, lo, hi, lsubtree, depth+1, maxdepth); + _bt_delco_trim_lsubtree_rhs2(state, lo, hi, lsubtree, depth+1, maxdepth); } static void @@ -1219,7 +1216,7 @@ _bt_delco_trim_lsubtree_rhs(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg, uint8_t depth) { BT_meta *meta = state->meta_pages[state->which]; - return _bt_delco_trim_lsubtree_rhs2(state, lo, hi, nodepg, depth, meta->depth); + _bt_delco_trim_lsubtree_rhs2(state, lo, hi, nodepg, depth, meta->depth); } static void @@ -1399,7 +1396,6 @@ _bt_insert(BT_state *state, vaof_t lo, vaof_t hi, pgno_t fo) /* handles CoWing/splitting of the root page since it's special cased. Then passes the child matching hi/lo to _bt_insert2 */ { - int rc; BT_meta *meta = state->meta_pages[state->which]; BT_page *root = _node_get(state, meta->root); @@ -1591,7 +1587,6 @@ _flist_new(BT_state *state) static int _nlist_new(BT_state *state) { - BT_meta *meta = state->meta_pages[state->which]; BT_nlistnode *head = calloc(1, sizeof *head); /* the size of a new node freelist is just the first stripe length */ @@ -2051,7 +2046,7 @@ _bt_state_restore_maps2(BT_state *state, BT_page *node, be handled here. 
*/ pgno_t pg = node->datk[i].fo; BT_page *child = _node_get(state, pg); - return _bt_state_restore_maps2(state, child, depth+1, maxdepth); + _bt_state_restore_maps2(state, child, depth+1, maxdepth); } } @@ -2135,11 +2130,11 @@ _bt_state_read_header(BT_state *state) } /* validate flags */ - if (m1->flags & BP_META != BP_META) { + if ((m1->flags & BP_META) != BP_META) { DPRINTF("metapage 0x%pX missing meta page flag", m1); return EINVAL; } - if (m2->flags & BP_META != BP_META) { + if ((m2->flags & BP_META) != BP_META) { DPRINTF("metapage 0x%pX missing meta page flag", m2); return EINVAL; } @@ -2170,7 +2165,6 @@ _bt_state_meta_new(BT_state *state) { BT_page *p1, *p2, *root; BT_meta meta = {0}; - int rc, pagesize; TRACE(); @@ -2187,8 +2181,6 @@ _bt_state_meta_new(BT_state *state) root = _bt_nalloc(state); _bt_root_new(&meta, root); - pagesize = sizeof *p1; - /* initialize meta struct */ meta.magic = BT_MAGIC; meta.version = BT_VERSION; @@ -2360,7 +2352,7 @@ _bt_falloc(BT_state *state, size_t pages) size_t sz_p = (*n)->hi - (*n)->lo; /* perfect fit */ if (sz_p == pages) { - pgno_t ret = (*n)->lo; + ret = (*n)->lo; BT_flistnode *prev = *n; *n = (*n)->next; free(prev); @@ -2368,7 +2360,6 @@ _bt_falloc(BT_state *state, size_t pages) } /* larger than necessary: shrink the node */ if (sz_p > pages) { - pgno_t ret; ret = (*n)->lo; (*n)->lo += pages; return ret; @@ -2396,11 +2387,10 @@ _bt_sync_leaf(BT_state *state, BT_page *node) { /* msync all of a leaf's data that is dirty. The caller is expected to sync the node itself and mark it as clean in the parent. */ - pgno_t pg; size_t i = 0; size_t N = _bt_numkeys(node); - for (size_t i = 0; i < N-1; i++) { + for (i = 0; i < N-1; i++) { if (!_bt_ischilddirty(node, i)) continue; /* not dirty. nothing to do */ @@ -2524,7 +2514,7 @@ _bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth) BT_page *child = _node_get(state, node->datk[i].fo); /* recursively sync the child's data */ - if (rc = _bt_sync(state, child, depth+1, maxdepth)) + if ((rc = _bt_sync(state, child, depth+1, maxdepth))) return rc; /* sync the child node */ @@ -2669,8 +2659,6 @@ bt_malloc(BT_state *state, size_t pages) void bt_free(BT_state *state, void *lo, void *hi) { - BT_meta *meta = state->meta_pages[state->which]; - BT_page *root = _node_get(state, meta->root); vaof_t looff = addr2off(lo); vaof_t hioff = addr2off(hi); pgno_t lopg, hipg; @@ -2725,7 +2713,7 @@ bt_sync(BT_state *state) BT_page *root = _node_get(state, meta->root); int rc = 0; - if (rc = _bt_sync(state, root, 1, meta->depth)) + if ((rc = _bt_sync(state, root, 1, meta->depth))) return rc; /* merge the pending freelists */ @@ -2745,7 +2733,7 @@ bt_sync(BT_state *state) } /* then sync the metapage */ - if (rc = _bt_sync_meta(state)) + if ((rc = _bt_sync_meta(state))) return rc; return BT_SUCC; @@ -2866,7 +2854,7 @@ _bt_data_cow(BT_state *state, vaof_t lo, vaof_t hi, pgno_t pg) /* write call puts data in the unified buffer cache without having to map virtual memory */ - if (pwrite(state->data_fd, loaddr, bytelen, offset) != bytelen) + if (pwrite(state->data_fd, loaddr, bytelen, offset) != (ssize_t)bytelen) abort(); /* maps new file offset with same data back into memory */ @@ -2935,6 +2923,7 @@ _bt_dirty(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg, /* iteratively recurse on all entries */ _bt_dirty(state, lo, hi, childpg, depth+1, maxdepth); } + return BT_SUCC; } int @@ -2957,6 +2946,7 @@ bt_next_alloc(BT_state *state, void *p, void **lo, void **hi) { BT_mlistnode *head = state->mlist; BYTE *pb = p; + 
BYTE* pma_end; while (head) { /* at last free block, different logic applies */ if (head->next == 0) @@ -2988,7 +2978,7 @@ bt_next_alloc(BT_state *state, void *p, void **lo, void **hi) return BT_SUCC; end: - BYTE *pma_end = (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE); + pma_end = (void *)((uintptr_t)BT_MAPADDR + BT_ADDRSIZE); assert(head->hi <= pma_end); /* no alloced region between tail of freelist and end of pma memory space */ if (head->hi == pma_end) @@ -3062,7 +3052,7 @@ _sham_sync(BT_state *state) static void _bt_printnode(BT_page *node) { - fprintf(stderr, "node: %p\n", node); + fprintf(stderr, "node: %p\n", (void*)node); fprintf(stderr, "data: \n"); for (size_t i = 0; i < BT_DAT_MAXKEYS; ++i) { if (i && node->datk[i].va == 0) From 232a5bdff5c215100d57ebf988d43b3d8cdaf3f7 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Tue, 19 Dec 2023 16:29:03 -0500 Subject: [PATCH 120/128] pma: freelist restoration using *_record_alloc and insertdat bugfix --- rust/ares_pma/c-src/btree.c | 223 ++++++++++++++++++++++++++++-------- 1 file changed, 175 insertions(+), 48 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 9992dcb..6e12192 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -398,6 +398,126 @@ _fo_get(BT_state *state, BT_page *node) return BY2FO(vaddr - start); } +static void +_mlist_record_alloc(BT_state *state, void *lo, void *hi) +{ + BT_mlistnode **head = &state->mlist; + BYTE *lob = lo; + BYTE *hib = hi; + while (*head) { + /* found chunk */ + if ((*head)->lo <= lob && (*head)->hi >= hib) + break; + assert((*head)->next); + head = &(*head)->next; + } + + if (hib < (*head)->hi) { + if (lob > (*head)->lo) { + BT_mlistnode *left = *head; + BT_mlistnode *right = calloc(1, sizeof *right); + right->hi = left->hi; + right->lo = hib; + right->next = left->next; + left->hi = lob; + left->next = right; + } + else { + /* lob equal */ + (*head)->lo = hib; + } + } + else if (lob > (*head)->lo) { + /* hib equal */ + (*head)->hi = lob; + } + else { + /* equals */ + BT_mlistnode *next = (*head)->next; + free(*head); + *head = next; + } +} + +static void +_nlist_record_alloc(BT_state *state, BT_page *lo) +{ + BT_nlistnode **head = &state->nlist; + BT_page *hi = lo + 1; + while (*head) { + /* found chunk */ + if ((*head)->lo <= lo && (*head)->hi >= hi) + break; + assert((*head)->next); + head = &(*head)->next; + } + + if (hi < (*head)->hi) { + if (lo > (*head)->lo) { + BT_nlistnode *left = *head; + BT_nlistnode *right = calloc(1, sizeof *right); + right->hi = left->hi; + right->lo = hi; + right->next = left->next; + left->hi = lo; + left->next = right; + } + else { + /* lo equal */ + (*head)->lo = hi; + } + } + else if (lo > (*head)->lo) { + /* hi equal */ + (*head)->hi = lo; + } + else { + /* equals */ + BT_nlistnode *next = (*head)->next; + free(*head); + *head = next; + } +} + +static void +_flist_record_alloc(BT_state *state, pgno_t lo, pgno_t hi) +{ + BT_flistnode **head = &state->flist; + while (*head) { + /* found chunk */ + if ((*head)->lo <= lo && (*head)->hi >= hi) + break; + assert((*head)->next); + head = &(*head)->next; + } + + if (hi < (*head)->hi) { + if (lo > (*head)->lo) { + BT_flistnode *left = *head; + BT_flistnode *right = calloc(1, sizeof *right); + right->hi = left->hi; + right->lo = hi; + right->next = left->next; + left->hi = lo; + left->next = right; + } + else { + /* lo equal */ + (*head)->lo = hi; + } + } + else if (lo > (*head)->lo) { + /* hi equal */ + (*head)->hi = lo; + } + else { + /* equals */ + 
BT_flistnode *next = (*head)->next; + free(*head); + *head = next; + } +} + static BT_page * _bt_nalloc(BT_state *state) /* allocate a node in the node freelist */ @@ -410,25 +530,18 @@ _bt_nalloc(BT_state *state) for (; *n; n = &(*n)->next) { size_t sz_p = (*n)->hi - (*n)->lo; - /* perfect fit */ - if (sz_p == 1) { + + /* ;;: refactor? this is ridiculous */ + if (sz_p >= 1) { ret = (*n)->lo; - BT_nlistnode *prev = *n; - *n = (*n)->next; - free(prev); - break; - } - /* larger than necessary: shrink the node */ - if (sz_p > 1) { - ret = (*n)->lo; - (*n)->lo += 1; + _nlist_record_alloc(state, ret); break; } } if (ret == 0) { DPUTS("nlist out of mem!"); - abort(); + return 0; } /* make node writable */ @@ -687,7 +800,9 @@ _bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo, vaof_t oldfo = parent->datk[childidx].fo; parent->datk[childidx].fo = fo; parent->datk[childidx+1].va = hi; - parent->datk[childidx+1].fo = oldfo + (hi - llo); + parent->datk[childidx+1].fo = (oldfo == 0) + ? 0 + : oldfo + (hi - llo); } else if (hhi == hi) { _bt_datshift(parent, childidx + 1, 1); @@ -2217,13 +2332,6 @@ _bt_state_meta_new(BT_state *state) return BT_SUCC; } -static void -_mlist_record_alloc(BT_state *state, BYTE *lo, BYTE *hi) -/* record an allocation in the mlist */ -{ - -} - static void _freelist_restore2(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth) @@ -2232,10 +2340,34 @@ _freelist_restore2(BT_state *state, BT_page *node, /* leaf */ if (depth == maxdepth) { - + for (size_t i = 0; i < N-1; i++) { + /* if allocated */ + if (node->datk[i].fo != 0) { + /* record allocated memory range */ + BT_page *lo = off2addr(node->datk[i].va); + BT_page *hi = off2addr(node->datk[i+1].va); + _mlist_record_alloc(state, lo, hi); + /* record allocated file range */ + ssize_t siz_p = hi - lo; + assert(siz_p > 0); + assert(siz_p < UINT32_MAX); + pgno_t lofo = node->datk[i].fo; + pgno_t hifo = lofo + (pgno_t)siz_p; + _flist_record_alloc(state, lofo, hifo); + } + } + return; } /* branch */ - + for (size_t i = 0; i < N-1; i++) { + pgno_t fo = node->datk[i].fo; + if (fo != 0) { + /* record allocated node */ + BT_page *child = _node_get(state, fo); + _nlist_record_alloc(state, child); + _freelist_restore2(state, child, depth+1, maxdepth); + } + } } static void @@ -2247,6 +2379,8 @@ _freelist_restore(BT_state *state) assert(SUCC(_nlist_new(state))); assert(SUCC(_mlist_new(state))); assert(SUCC(_flist_new(state))); + /* first record root's allocation */ + _nlist_record_alloc(state, root); _freelist_restore2(state, root, 1, meta->depth); } @@ -2350,24 +2484,21 @@ _bt_falloc(BT_state *state, size_t pages) /* first fit */ for (; *n; n = &(*n)->next) { size_t sz_p = (*n)->hi - (*n)->lo; - /* perfect fit */ - if (sz_p == pages) { + + if (sz_p >= pages) { ret = (*n)->lo; - BT_flistnode *prev = *n; - *n = (*n)->next; - free(prev); - return ret; - } - /* larger than necessary: shrink the node */ - if (sz_p > pages) { - ret = (*n)->lo; - (*n)->lo += pages; - return ret; + pgno_t hi = ret + pages; + _flist_record_alloc(state, ret, hi); + break; } } - DPUTS("flist out of mem!"); - abort(); + if (ret == 0) { + DPUTS("flist out of mem!"); + return UINT32_MAX; + } + + return ret; } static int @@ -2616,21 +2747,17 @@ bt_malloc(BT_state *state, size_t pages) for (; *n; n = &(*n)->next) { size_t sz_p = addr2off((*n)->hi) - addr2off((*n)->lo); - /* perfect fit */ - if (sz_p == pages) { + if (sz_p >= pages) { ret = (*n)->lo; - BT_mlistnode *prev = *n; - *n = (*n)->next; - free(prev); + BT_page *hi = (BT_page *)ret + pages; + 
_mlist_record_alloc(state, ret, hi); break; } - /* larger than necessary: shrink the node */ - if (sz_p > pages) { - ret = (*n)->lo; - (*n)->lo = (void *)((BT_page *)(*n)->lo + pages); - break; - } - // XX return early if nothing suitable found in freelist + // XX return early if nothing suitable found in freelist + } + if (ret == 0) { + DPUTS("mlist out of mem!"); + return 0; } pgno_t pgno = _bt_falloc(state, pages); From 0dac274a63d32c4696533ea3d92d21915a738033 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Tue, 19 Dec 2023 17:47:57 -0500 Subject: [PATCH 121/128] pma: minor bug fixes --- rust/ares_pma/c-src/btree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 6e12192..cd4b456 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -101,6 +101,8 @@ addr2off(void *p) /* convert a pointer into a 32-bit page offset */ { uintptr_t pu = (uintptr_t)p; + assert(pu >= (uintptr_t)BT_MAPADDR); + pu -= (uintptr_t)BT_MAPADDR; assert((pu & ((1 << BT_PAGEBITS) - 1)) == 0); /* p must be page-aligned */ return (vaof_t)(pu >> BT_PAGEBITS); } @@ -633,7 +635,7 @@ _bt_find2(BT_state *state, static void _bt_root_new(BT_meta *meta, BT_page *root) { - /* The first usable address in the PMA is just beyond the first node stripe */ + /* The first usable address in the PMA is just beyond the btree segment */ root->datk[0].va = B2PAGES(BLK_BASE_LEN_TOTAL); root->datk[0].fo = 0; root->datk[1].va = UINT32_MAX; @@ -1706,7 +1708,7 @@ _nlist_new(BT_state *state) /* the size of a new node freelist is just the first stripe length */ head->lo = &((BT_page *)state->map)[BT_NUMMETAS]; - head->hi = head->lo + BLK_BASE_LEN0; + head->hi = head->lo + B2PAGES(BLK_BASE_LEN0); head->next = 0; state->nlist = head; @@ -2749,7 +2751,7 @@ bt_malloc(BT_state *state, size_t pages) if (sz_p >= pages) { ret = (*n)->lo; - BT_page *hi = (BT_page *)ret + pages; + BT_page *hi = ((BT_page *)ret) + pages; _mlist_record_alloc(state, ret, hi); break; } From 2762872a9449d028d06d1985681f16552baa7ac0 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Tue, 19 Dec 2023 17:48:04 -0500 Subject: [PATCH 122/128] pma: wip test revisions --- rust/ares_pma/c-src/btest.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c index 9a97d7f..2bad28e 100644 --- a/rust/ares_pma/c-src/btest.c +++ b/rust/ares_pma/c-src/btest.c @@ -230,6 +230,9 @@ int main(int argc, char *argv[]) size_t alloc_sizp = 0; size_t flist_sizp = _flist_sizep(state3->flist); size_t mlist_sizp = _mlist_sizep(state3->mlist); + BT_meta *meta = state3->meta_pages[state3->which]; + BT_page *root = _node_get(state3, meta->root); + size_t N; for (size_t i = 0; i < ITERATIONS; i++) { /* malloc a random number of pages <= 256 and store in the allocs array */ int pages = random(); @@ -243,6 +246,8 @@ int main(int argc, char *argv[]) == (flist_sizp - alloc_sizp)); assert(_mlist_sizep(state3->mlist) == (mlist_sizp - alloc_sizp)); + N = _bt_numkeys(root); + assert(root->datk[N-2].fo == 0); } /* sync the state */ @@ -253,16 +258,14 @@ int main(int argc, char *argv[]) flist_sizp = _flist_sizep(state3->flist); mlist_sizp = _mlist_sizep(state3->mlist); alloc_sizp = 0; - for (size_t i = 0; i < ITERATIONS / 2; i++) { - /* free half of the allocations */ - bt_free(state3, allocs[i].lo, allocs[i].hi); - alloc_sizp += allocs[i].hi - allocs[i].lo; - /* validate size changes to mlist */ - 
assert(_mlist_sizep(state3->mlist) - == (mlist_sizp + alloc_sizp)); - } - - bt_sync(state3); + /* for (size_t i = 0; i < ITERATIONS / 2; i++) { */ + /* /\* free half of the allocations *\/ */ + /* bt_free(state3, allocs[i].lo, allocs[i].hi); */ + /* alloc_sizp += allocs[i].hi - allocs[i].lo; */ + /* /\* validate size changes to mlist *\/ */ + /* assert(_mlist_sizep(state3->mlist) */ + /* == (mlist_sizp + alloc_sizp)); */ + /* } */ /* copy ephemeral structures */ BT_mlistnode *mlist_copy = _mlist_copy(state3); @@ -272,6 +275,12 @@ int main(int argc, char *argv[]) assert(_nlist_eq(nlist_copy, state3->nlist)); assert(_flist_eq(flist_copy, state3->flist)); + meta = state3->meta_pages[state3->which]; + BT_meta metacopy = {0}; + memcpy(&metacopy, meta, sizeof metacopy); + + bt_sync(state3); + bt_state_close(state3); bt_state_new(&state3); @@ -280,6 +289,8 @@ int main(int argc, char *argv[]) /* compare for equality copies of ephemeral structures with restored ephemeral structures */ + meta = state3->meta_pages[state3->which]; + assert(meta->root == metacopy.root); assert(_mlist_eq(mlist_copy, state3->mlist)); assert(_nlist_eq(nlist_copy, state3->nlist)); assert(_flist_eq(flist_copy, state3->flist)); From feeb27879481e9083d80630e4f14de581ab97300 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Tue, 19 Dec 2023 20:55:57 -0600 Subject: [PATCH 123/128] pma: lots of c-side bugfixes --- rust/ares_pma/c-src/btest.c | 3 +- rust/ares_pma/c-src/btree.c | 58 +++++++++++++++++++++++++------------ 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/rust/ares_pma/c-src/btest.c b/rust/ares_pma/c-src/btest.c index 2bad28e..0191df2 100644 --- a/rust/ares_pma/c-src/btest.c +++ b/rust/ares_pma/c-src/btest.c @@ -149,6 +149,7 @@ _flist_eq(BT_flistnode *l, BT_flistnode *r) int main(int argc, char *argv[]) { + DPRINTF("PMA Max Storage: %lld", ((uint64_t)UINT32_MAX * BT_PAGESIZE) - BLK_BASE_LEN_TOTAL); DPUTS("PMA Tests"); BT_state *state1; @@ -279,8 +280,6 @@ int main(int argc, char *argv[]) BT_meta metacopy = {0}; memcpy(&metacopy, meta, sizeof metacopy); - bt_sync(state3); - bt_state_close(state3); bt_state_new(&state3); diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index cd4b456..5400f72 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -358,6 +358,9 @@ static int _bt_insertdat(vaof_t lo, vaof_t hi, pgno_t fo, BT_page *parent, size_t childidx); /* ;;: tmp */ +static int _bt_flip_meta(BT_state *); + + #define BT_MAXDEPTH 4 /* ;;: todo derive it */ typedef struct BT_findpath BT_findpath; struct BT_findpath { @@ -2461,6 +2464,11 @@ _bt_state_load(BT_state *state) /* restore ephemeral freelists */ _freelist_restore(state); + /* Dirty the metapage and root page */ + assert(SUCC(_bt_flip_meta(state))); + + /* Set the file length */ + // XX make sure the flist is updated with this! 
if (fstat(state->data_fd, &stat) != 0) return errno; @@ -2569,9 +2577,8 @@ _bt_sync_meta(BT_state *state) the which */ { BT_meta *meta = state->meta_pages[state->which]; - BT_meta *newmeta; uint32_t chk; - int newwhich; + int rc; /* increment the txnid */ meta->txnid += 1; @@ -2588,6 +2595,23 @@ _bt_sync_meta(BT_state *state) abort(); } + // ensure we have a new dirty metapage and root node + /* finally, make old metapage clean */ + rc = _bt_flip_meta(state); + + if (mprotect(LO_ALIGN_PAGE(meta), sizeof(BT_page), BT_PROT_CLEAN) != 0) { + DPRINTF("mprotect of old metapage failed with %s", strerror(errno)); + abort(); + } + + return rc; +} + +static int _bt_flip_meta(BT_state *state) { + BT_meta *meta = state->meta_pages[state->which]; + BT_meta *newmeta; + int newwhich; + /* zero the new metapage's checksum */ newwhich = state->which ? 0 : 1; newmeta = state->meta_pages[newwhich]; @@ -2616,12 +2640,6 @@ _bt_sync_meta(BT_state *state) /* switch the metapage we're referring to */ state->which = newwhich; - /* finally, make old metapage clean */ - if (mprotect(LO_ALIGN_PAGE(meta), sizeof(BT_page), BT_PROT_CLEAN) != 0) { - DPRINTF("mprotect of old metapage failed with %s", strerror(errno)); - abort(); - } - return BT_SUCC; } @@ -2677,7 +2695,7 @@ _bt_sync(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth) int bt_state_new(BT_state **state) { - TRACE(); + // TRACE(); BT_state *s = calloc(1, sizeof *s); s->data_fd = -1; @@ -3026,9 +3044,9 @@ _bt_dirty(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg, } assert(loidx != 0); - /* find hiidx of range */ - for (size_t i = loidx; i < N; i++) { - vaof_t hhi = node->datk[i+1].va; + /* find hiidx (exclusive) of range */ + for (size_t i = loidx+1; i < N; i++) { + vaof_t hhi = node->datk[i].va; if (hhi >= hi) { hiidx = i; break; @@ -3037,20 +3055,22 @@ _bt_dirty(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg, assert(hiidx != 0); /* found a range in node that contains (lo-hi). May span multiple entries */ - for (size_t i = loidx; i < hiidx; i++) { /* leaf: base case. 
cow the data */ - if (depth == maxdepth) { + if (depth == maxdepth) { + for (size_t i = loidx; i < hiidx; i++) { vaof_t llo = node->datk[i].va; vaof_t hhi = MIN(node->datk[i+1].va, hi); pgno_t pg = node->datk[i].fo; pgno_t newpg = _bt_data_cow(state, llo, hhi, pg); _bt_insert(state, llo, hhi, newpg); + } + } else { + for (size_t i = loidx; i < hiidx; i++) { + /* branch: recursive case */ + pgno_t childpg = node->datk[i].fo; + /* iteratively recurse on all entries */ + _bt_dirty(state, lo, hi, childpg, depth+1, maxdepth); } - - /* branch: recursive case */ - pgno_t childpg = node->datk[i].fo; - /* iteratively recurse on all entries */ - _bt_dirty(state, lo, hi, childpg, depth+1, maxdepth); } return BT_SUCC; } From 86a8922e0fe7eec97509268595b34d18b4f22e46 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Tue, 19 Dec 2023 20:56:33 -0600 Subject: [PATCH 124/128] pma: make sure unifying equality dirties for unifying a cell in the PMA, not just an indirect atom --- rust/ares/src/unifying_equality.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/rust/ares/src/unifying_equality.rs b/rust/ares/src/unifying_equality.rs index 83b90fc..267a2f0 100644 --- a/rust/ares/src/unifying_equality.rs +++ b/rust/ares/src/unifying_equality.rs @@ -125,8 +125,14 @@ pub unsafe fn unifying_equality(stack: &mut NockStack, a: *mut Noun, b: *mut Nou { let (_senior, junior) = senior_pointer_first(stack, x_as_ptr, y_as_ptr); if x_as_ptr == junior { + if pma_contains(x, 1) { + pma_dirty(x, 1); + } *x = *y; } else { + if pma_contains(y, 1) { + pma_dirty(y, 1); + } *y = *x; } stack.pop::<(*mut Noun, *mut Noun)>(); From 58c49bc5e1bd0787927187dd50d58c18dff1723c Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Tue, 19 Dec 2023 21:36:27 -0600 Subject: [PATCH 125/128] pma: dont assert loidx != in _bt_dirty, it could well be! --- rust/ares_pma/c-src/btree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 5400f72..6055f10 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -3031,7 +3031,7 @@ _bt_dirty(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg, { BT_page *node = _node_get(state, nodepg); size_t N = _bt_numkeys(node); - size_t loidx = 0; + size_t loidx = BT_DAT_MAXKEYS; // 0 is a valid loidx! 
size_t hiidx = 0; /* find loidx of range */ @@ -3042,7 +3042,7 @@ _bt_dirty(BT_state *state, vaof_t lo, vaof_t hi, pgno_t nodepg, break; } } - assert(loidx != 0); + assert(loidx < BT_DAT_MAXKEYS); /* find hiidx (exclusive) of range */ for (size_t i = loidx+1; i < N; i++) { From 3ae877f6552c578c011da40d67ee05ac032049a5 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Wed, 20 Dec 2023 00:16:55 -0600 Subject: [PATCH 126/128] pma: lint and c warnings cleanup --- rust/ares/src/hamt.rs | 18 +++++++------- rust/ares/src/jets/cold.rs | 48 ++++++++++++++++++------------------- rust/ares/src/mem.rs | 1 - rust/ares/src/persist.rs | 28 +++++++++------------- rust/ares/src/serf.rs | 5 ++-- rust/ares_pma/build.rs | 5 +--- rust/ares_pma/c-src/btree.c | 24 ++++++++++++++++--- 7 files changed, 68 insertions(+), 61 deletions(-) diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index 27c79ea..1b9fa04 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -5,7 +5,7 @@ use crate::persist::{pma_contains, Persist}; use crate::unifying_equality::unifying_equality; use either::Either::{self, *}; use std::mem::size_of; -use std::ptr::{copy_nonoverlapping, null, null_mut}; +use std::ptr::{copy_nonoverlapping, null_mut}; use std::slice; type MutStemEntry = Either<*mut MutStem, Leaf>; @@ -557,7 +557,7 @@ impl Preserve for Hamt { typemap: next_stem.typemap, buffer: dest_buffer, }; - *(stem.buffer.add(idx) as *mut Entry) = + *stem.buffer.add(idx) = Entry { stem: new_stem }; assert!(traversal_depth <= 5); // will increment traversal_stack[traversal_depth - 1] = @@ -583,7 +583,7 @@ impl Preserve for Hamt { pair.0.preserve(stack); pair.1.preserve(stack); } - *(stem.buffer.add(idx) as *mut Entry) = + *stem.buffer.add(idx) = Entry { leaf: new_leaf }; } position += 1; @@ -615,7 +615,7 @@ impl Persist for Hamt { typemap: 0, buffer: null_mut(), }; 6]; - traversal[0] = (*self.0); + traversal[0] = *self.0; loop { assert!(depth < 6); @@ -630,8 +630,8 @@ impl Persist for Hamt { let next_chunk = traversal[depth].bitmap.trailing_zeros(); let next_type = traversal[depth].typemap & (1 << next_chunk) != 0; let next_entry = *traversal[depth].buffer; - traversal[depth].bitmap = traversal[depth].bitmap >> (next_chunk + 1); - traversal[depth].typemap = traversal[depth].typemap >> (next_chunk + 1); + traversal[depth].bitmap >>= next_chunk + 1; + traversal[depth].typemap >>= next_chunk + 1; traversal[depth].buffer = traversal[depth].buffer.add(1); if next_type { @@ -676,7 +676,7 @@ impl Persist for Hamt { let stem_ptr = *buffer as *mut Stem; copy_nonoverlapping(self.0, stem_ptr, 1); *buffer = stem_ptr.add(1) as *mut u8; - (*self).0 = stem_ptr; + self.0 = stem_ptr; let stem_buffer_size = (*stem_ptr).size(); if pma_contains((*stem_ptr).buffer, stem_buffer_size) { @@ -709,8 +709,8 @@ impl Persist for Hamt { let next_type = traversal[depth].typemap & (1 << next_chunk) != 0; let next_entry_ptr = traversal[depth].buffer; - traversal[depth].bitmap = traversal[depth].bitmap >> (next_chunk + 1); - traversal[depth].typemap = traversal[depth].typemap >> (next_chunk + 1); + traversal[depth].bitmap >>= next_chunk + 1; + traversal[depth].typemap >>= next_chunk + 1; traversal[depth].buffer = traversal[depth].buffer.add(1); if next_type { diff --git a/rust/ares/src/jets/cold.rs b/rust/ares/src/jets/cold.rs index 6bd8b50..3be45ee 100644 --- a/rust/ares/src/jets/cold.rs +++ b/rust/ares/src/jets/cold.rs @@ -57,15 +57,15 @@ impl Persist for Batteries { unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { let 
mut dest = self; loop { - if (*dest).0.is_null() { + if dest.0.is_null() { break; } - if pma_contains((*dest).0, 1) { + if pma_contains(dest.0, 1) { break; } let batteries_mem_ptr = *buffer as *mut BatteriesMem; - copy_nonoverlapping((*dest).0, batteries_mem_ptr, 1); + copy_nonoverlapping(dest.0, batteries_mem_ptr, 1); *buffer = batteries_mem_ptr.add(1) as *mut u8; (*batteries_mem_ptr).battery.copy_to_buffer(stack, buffer); @@ -73,8 +73,8 @@ impl Persist for Batteries { .parent_axis .copy_to_buffer(stack, buffer); - (*dest).0 = batteries_mem_ptr; - dest = &mut (*(*dest).0).parent_batteries; + dest.0 = batteries_mem_ptr; + dest = &mut (*dest.0).parent_batteries; } } @@ -222,20 +222,20 @@ impl Persist for BatteriesList { let mut dest = self; loop { - if (*dest).0.is_null() { + if dest.0.is_null() { break; } - if pma_contains((*dest).0, 1) { + if pma_contains(dest.0, 1) { break; } let list_mem_ptr = *buffer as *mut BatteriesListMem; - copy_nonoverlapping((*dest).0, list_mem_ptr, 1); + copy_nonoverlapping(dest.0, list_mem_ptr, 1); *buffer = list_mem_ptr.add(1) as *mut u8; - (*dest).0 = list_mem_ptr; + dest.0 = list_mem_ptr; - (*(*dest).0).batteries.copy_to_buffer(stack, buffer); - dest = &mut (*(*dest).0).next; + (*dest.0).batteries.copy_to_buffer(stack, buffer); + dest = &mut (*dest.0).next; } } @@ -345,21 +345,21 @@ impl Persist for NounList { let mut dest = self; loop { - if (*dest).0.is_null() { + if dest.0.is_null() { break; } - if pma_contains((*dest).0, 1) { + if pma_contains(dest.0, 1) { break; } let noun_list_mem_ptr = *buffer as *mut NounListMem; - copy_nonoverlapping((*dest).0, noun_list_mem_ptr, 1); + copy_nonoverlapping(dest.0, noun_list_mem_ptr, 1); *buffer = noun_list_mem_ptr.add(1) as *mut u8; - (*dest).0 = noun_list_mem_ptr; - (*(*dest).0).element.copy_to_buffer(stack, buffer); + dest.0 = noun_list_mem_ptr; + (*dest.0).element.copy_to_buffer(stack, buffer); - dest = &mut (*(*dest).0).next; + dest = &mut (*dest.0).next; } } @@ -456,9 +456,9 @@ impl Persist for Cold { } let mut bytes = size_of::(); - bytes += (*(*self).0).battery_to_paths.space_needed(stack); - bytes += (*(*self).0).root_to_paths.space_needed(stack); - bytes += (*(*self).0).path_to_batteries.space_needed(stack); + bytes += (*self.0).battery_to_paths.space_needed(stack); + bytes += (*self.0).root_to_paths.space_needed(stack); + bytes += (*self.0).path_to_batteries.space_needed(stack); bytes } @@ -471,11 +471,11 @@ impl Persist for Cold { copy_nonoverlapping(self.0, cold_mem_ptr, 1); *buffer = cold_mem_ptr.add(1) as *mut u8; - (*self).0 = cold_mem_ptr; + self.0 = cold_mem_ptr; - (*(*self).0).battery_to_paths.copy_to_buffer(stack, buffer); - (*(*self).0).root_to_paths.copy_to_buffer(stack, buffer); - (*(*self).0).path_to_batteries.copy_to_buffer(stack, buffer); + (*self.0).battery_to_paths.copy_to_buffer(stack, buffer); + (*self.0).root_to_paths.copy_to_buffer(stack, buffer); + (*self.0).path_to_batteries.copy_to_buffer(stack, buffer); } unsafe fn handle_to_u64(&self) -> u64 { diff --git a/rust/ares/src/mem.rs b/rust/ares/src/mem.rs index 0412b9a..31f81c5 100644 --- a/rust/ares/src/mem.rs +++ b/rust/ares/src/mem.rs @@ -5,7 +5,6 @@ use crate::noun::{Atom, Cell, CellMemory, IndirectAtom, Noun, NounAllocator}; use assert_no_alloc::permit_alloc; use either::Either::{self, Left, Right}; use ibig::Stack; -use libc::{c_void, memcmp}; use memmap::MmapMut; use std::alloc::Layout; use std::mem; diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index f0ddc1b..1c94de3 100644 --- a/rust/ares/src/persist.rs +++ 
b/rust/ares/src/persist.rs @@ -1,4 +1,3 @@ -use crate::jets::cold::Cold; use crate::mem::NockStack; use crate::noun::{Allocated, Atom, Cell, CellMemory, IndirectAtom, Noun}; use ares_pma::*; @@ -39,8 +38,7 @@ pub fn pma_open(path: PathBuf) -> Result<(), std::io::Error> { bt_state_new(&mut state); let err = bt_state_open(state, path_cstring.as_ptr(), PMA_FLAGS, PMA_MODE); if err == 0 { - PMA.set(PMAState(state as u64)) - .or_else(|state| Err(state.0 as *mut BT_state)) + PMA.set(PMAState(state as u64)).map_err(|state| state.0 as *mut BT_state) .expect("PMA state already initialized to:"); assert!(get_pma_state().is_some()); Ok(()) @@ -162,19 +160,17 @@ unsafe fn unmark(a: Allocated) { } impl Persist for Atom { - unsafe fn space_needed(&mut self, stack: &mut NockStack) -> usize { + unsafe fn space_needed(&mut self, _stack: &mut NockStack) -> usize { if let Ok(indirect) = self.as_indirect() { let count = indirect.raw_size(); - if !pma_contains(indirect.to_raw_pointer(), count) { - if !mark(indirect.as_allocated()) { - return count * size_of::(); - } + if !pma_contains(indirect.to_raw_pointer(), count) && !mark(indirect.as_allocated()) { + return count * size_of::(); } } 0 } - unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { + unsafe fn copy_to_buffer(&mut self, _stack: &mut NockStack, buffer: &mut *mut u8) { if let Ok(mut indirect) = self.as_indirect() { let count = indirect.raw_size(); if !pma_contains(indirect.to_raw_pointer(), count) { @@ -219,12 +215,10 @@ impl Persist for Noun { space += atom.space_needed(stack); } Right(cell) => { - if !pma_contains(cell.to_raw_pointer(), 1) { - if !mark(cell.as_allocated()) { - space += size_of::(); - (*stack.push::()) = cell.tail(); - (*stack.push::()) = cell.head(); - } + if !pma_contains(cell.to_raw_pointer(), 1) && !mark(cell.as_allocated()) { + space += size_of::(); + (*stack.push::()) = cell.tail(); + (*stack.push::()) = cell.head(); } } } @@ -236,7 +230,7 @@ impl Persist for Noun { unsafe fn copy_to_buffer(&mut self, stack: &mut NockStack, buffer: &mut *mut u8) { let mut buffer_u64 = (*buffer) as *mut u64; stack.frame_push(0); - *(stack.push::<*mut Noun>()) = (self as *mut Noun); + *(stack.push::<*mut Noun>()) = self as *mut Noun; loop { if stack.stack_is_empty() { @@ -247,7 +241,7 @@ impl Persist for Noun { stack.pop::<*mut Noun>(); match (*dest).as_either_direct_allocated() { - Left(direct) => {} + Left(_direct) => {} Right(allocated) => { if let Some(a) = allocated.forwarding_pointer() { *dest = a.as_noun(); diff --git a/rust/ares/src/serf.rs b/rust/ares/src/serf.rs index 206c777..b553771 100644 --- a/rust/ares/src/serf.rs +++ b/rust/ares/src/serf.rs @@ -7,7 +7,6 @@ use crate::jets::list::util::{lent, zing}; use crate::jets::nock::util::mook; use crate::jets::warm::Warm; use crate::mem::NockStack; -use crate::mem::Preserve; use crate::mug::*; use crate::newt::Newt; use crate::noun::{Atom, Cell, DirectAtom, Noun, Slots, D, T}; @@ -160,8 +159,8 @@ impl Context { } }; - let mut hot = Hot::init(&mut stack, constant_hot_state); - let warm = Warm::init(&mut stack, &mut cold, &mut hot); + let hot = Hot::init(&mut stack, constant_hot_state); + let warm = Warm::init(&mut stack, &mut cold, &hot); let mug = mug_u32(&mut stack, arvo); let nock_context = interpreter::Context { diff --git a/rust/ares_pma/build.rs b/rust/ares_pma/build.rs index eb2ca17..22ec4be 100644 --- a/rust/ares_pma/build.rs +++ b/rust/ares_pma/build.rs @@ -3,10 +3,7 @@ extern crate bindgen; use std::env; use std::path::PathBuf; -use 
bindgen::CargoCallbacks; - fn main() { - let profile = env::var("PROFILE").unwrap(); let opt_level = env::var("OPT_LEVEL").unwrap(); let define_debug = if env::var("CARGO_FEATURE_DEBUG_PRINTS").is_ok() { "-DDEBUG" @@ -72,7 +69,7 @@ fn main() { .header(headers_path_str) // Tell cargo to invalidate the built crate whenever any of the // included header files changed. - .parse_callbacks(Box::new(CargoCallbacks)) + .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) // Finish the builder and generate the bindings. .generate() // Unwrap the Result and panic on failure. diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 6055f10..90edddb 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -158,7 +158,7 @@ off2addr(vaof_t off) /* NMEMB: number of members in array, a */ #define NMEMB(a) \ - (sizeof(a[0]) / sizeof(a)) + (sizeof(a) / sizeof(a[0])) #define offsetof(st, m) \ __builtin_offsetof(st, m) @@ -236,8 +236,8 @@ struct BT_page { BT_pageheader head; /* header */ union { /* data section */ BT_dat datd[BT_DAT_MAXENTRIES]; /* union view */ - BT_kv datk[0]; /* struct view */ - BYTE datc[0]; /* byte-level view */ + BT_kv datk[BT_DAT_MAXKEYS]; /* struct view */ + BYTE datc[BT_DAT_MAXBYTES]; /* byte-level view */ }; }; static_assert(sizeof(BT_page) == BT_PAGESIZE); @@ -567,6 +567,9 @@ _node_cow(BT_state *state, BT_page *node, pgno_t *pgno) return BT_SUCC; } +static void * +_bt_bsearch(BT_page *page, vaof_t va) __attribute((unused)); + /* binary search a page's data section for a va. Returns a pointer to the found BT_dat */ static void * _bt_bsearch(BT_page *page, vaof_t va) @@ -655,6 +658,9 @@ _bt_find(BT_state *state, BT_findpath *path, vaof_t lo, vaof_t hi) return _bt_find2(state, root, path, maxdepth, lo, hi); } +static int +_bt_findpath_is_root(BT_findpath *path) __attribute((unused)); + static int _bt_findpath_is_root(BT_findpath *path) { @@ -770,6 +776,9 @@ _bt_split_child(BT_state *state, BT_page *parent, size_t i, pgno_t *newchild) return BT_SUCC; } +static int +_bt_rebalance(BT_state *state, BT_page *node) __attribute((unused)); + static int _bt_rebalance(BT_state *state, BT_page *node) { @@ -1596,6 +1605,9 @@ struct BT_ppage { BT_page *parent; }; +static int +_bt_delete(BT_state *state, vaof_t lo, vaof_t hi) __attribute((unused)); + static int _bt_delete(BT_state *state, vaof_t lo, vaof_t hi) { @@ -2511,6 +2523,9 @@ _bt_falloc(BT_state *state, size_t pages) return ret; } +static int +_bt_sync_hasdirtypage(BT_state *state, BT_page *node) __attribute((unused)); + static int _bt_sync_hasdirtypage(BT_state *state, BT_page *node) /* ;;: could be more efficiently replaced by a gcc vectorized builtin */ @@ -3188,6 +3203,9 @@ _sham_sync2(BT_state *state, BT_page *node, uint8_t depth, uint8_t maxdepth) } } +static void +_sham_sync(BT_state *state) __attribute((unused)); + static void _sham_sync(BT_state *state) { From 4080f5014559a71a1671fbc43cd25116048957d2 Mon Sep 17 00:00:00 2001 From: Edward Amsden Date: Wed, 20 Dec 2023 00:20:10 -0600 Subject: [PATCH 127/128] pma: format --- rust/ares/src/hamt.rs | 6 ++---- rust/ares/src/persist.rs | 3 ++- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/rust/ares/src/hamt.rs b/rust/ares/src/hamt.rs index 1b9fa04..d7f08a3 100644 --- a/rust/ares/src/hamt.rs +++ b/rust/ares/src/hamt.rs @@ -557,8 +557,7 @@ impl Preserve for Hamt { typemap: next_stem.typemap, buffer: dest_buffer, }; - *stem.buffer.add(idx) = - Entry { stem: new_stem }; + *stem.buffer.add(idx) = Entry { stem: new_stem }; 
assert!(traversal_depth <= 5); // will increment traversal_stack[traversal_depth - 1] = Some((stem, position + 1)); @@ -583,8 +582,7 @@ impl Preserve for Hamt { pair.0.preserve(stack); pair.1.preserve(stack); } - *stem.buffer.add(idx) = - Entry { leaf: new_leaf }; + *stem.buffer.add(idx) = Entry { leaf: new_leaf }; } position += 1; continue 'preserve_stem; diff --git a/rust/ares/src/persist.rs b/rust/ares/src/persist.rs index 1c94de3..f13abc5 100644 --- a/rust/ares/src/persist.rs +++ b/rust/ares/src/persist.rs @@ -38,7 +38,8 @@ pub fn pma_open(path: PathBuf) -> Result<(), std::io::Error> { bt_state_new(&mut state); let err = bt_state_open(state, path_cstring.as_ptr(), PMA_FLAGS, PMA_MODE); if err == 0 { - PMA.set(PMAState(state as u64)).map_err(|state| state.0 as *mut BT_state) + PMA.set(PMAState(state as u64)) + .map_err(|state| state.0 as *mut BT_state) .expect("PMA state already initialized to:"); assert!(get_pma_state().is_some()); Ok(()) From 4c8851c385b52e42da86854dbdc9cf618ff6b2c4 Mon Sep 17 00:00:00 2001 From: barter-simsum Date: Wed, 20 Dec 2023 09:37:04 -0500 Subject: [PATCH 128/128] pma: simpler pending freelist merge. fixes use after free --- rust/ares_pma/c-src/btree.c | 122 +++++++----------------------------- 1 file changed, 23 insertions(+), 99 deletions(-) diff --git a/rust/ares_pma/c-src/btree.c b/rust/ares_pma/c-src/btree.c index 90edddb..e5c9b0e 100644 --- a/rust/ares_pma/c-src/btree.c +++ b/rust/ares_pma/c-src/btree.c @@ -976,11 +976,9 @@ _mlist_insert(BT_state *state, void *lo, void *hi) } static void -_nlist_insert(BT_state *state, BT_nlistnode **dst, pgno_t nodepg) +_nlist_insert2(BT_state *state, BT_nlistnode **dst, BT_page *lo, BT_page *hi) { BT_nlistnode **prev_dst = 0; - BT_page *lo = _node_get(state, nodepg); - BT_page *hi = lo+1; while(*dst) { if (hi == (*dst)->lo) { @@ -1029,58 +1027,25 @@ _nlist_insert(BT_state *state, BT_nlistnode **dst, pgno_t nodepg) } } +static void +_nlist_insert(BT_state *state, BT_nlistnode **dst, pgno_t nodepg) +{ + BT_page *lo = _node_get(state, nodepg); + BT_page *hi = _node_get(state, nodepg+1); + _nlist_insert2(state, dst, lo, hi); +} + static void _pending_nlist_merge(BT_state *state) { - BT_nlistnode **src_head = &state->pending_nlist; - BT_nlistnode **dst_head = &state->nlist; - - while (*dst_head) { - /* src cleared. done */ - if (!*src_head) { - return; - } - - /* ;;: TODO: you still need to coalesce neighbor nodes in dst if we widen - them */ - - /* check if src node should be merged with dst **************************/ - BT_page *dst_nlo = (*dst_head)->next ? 
(*dst_head)->next->lo : 0; - - /* source node immediately follows dst node's termination */ - if ((*dst_head)->hi == (*src_head)->lo) { - /* expand dst node */ - (*dst_head)->hi = (*src_head)->hi; - /* advance src node and free previous */ - BT_nlistnode *prev = *src_head; - src_head = &(*src_head)->next; - free(prev); - } - /* source node's termination immediately precedes dst node */ - else if ((*src_head)->hi == (*dst_head)->lo) { - /* expand dst node */ - (*src_head)->lo = (*dst_head)->lo; - /* advance src node and free previous */ - BT_nlistnode *prev = *src_head; - src_head = &(*src_head)->next; - free(prev); - } - /* src node is discontiguously between dst head and next */ - else if ((*src_head)->lo > (*dst_head)->hi - && (*src_head)->hi < dst_nlo) { - /* link src node in */ - (*src_head)->next = (*dst_head)->next; - (*dst_head)->next = *src_head; - /* and advance src node */ - src_head = &(*src_head)->next; - } - /* otherwise, advance dst node */ - else { - dst_head = &(*dst_head)->next; - } + BT_nlistnode *src_head = state->pending_nlist; + BT_nlistnode *prev = 0; + while (src_head) { + _nlist_insert2(state, &state->nlist, src_head->lo, src_head->hi); + prev = src_head; + src_head = src_head->next; + free(prev); } - /* merge what remains of src if anything */ - *dst_head = *src_head; } static void @@ -1138,55 +1103,14 @@ _flist_insert(BT_flistnode **dst, pgno_t lo, pgno_t hi) static void _pending_flist_merge(BT_state *state) { - BT_flistnode **src_head = &state->pending_flist; - BT_flistnode **dst_head = &state->flist; - - while (*dst_head) { - /* src cleared. done */ - if (!*src_head) { - return; - } - - /* ;;: TODO: you still need to coalesce neighbor nodes in dst if we widen - them */ - - /* check if src node should be merged with dst **************************/ - pgno_t dst_nlo = (*dst_head)->next ? (*dst_head)->next->lo : 0; - - /* source node immediately follows dst node's termination */ - if ((*dst_head)->hi == (*src_head)->lo) { - /* expand dst node */ - (*dst_head)->hi = (*src_head)->hi; - /* advance src node and free previous */ - BT_flistnode *prev = *src_head; - src_head = &(*src_head)->next; - free(prev); - } - /* source node's termination immediately precedes dst node */ - else if ((*src_head)->hi == (*dst_head)->lo) { - /* expand dst node */ - (*src_head)->lo = (*dst_head)->lo; - /* advance src node and free previous */ - BT_flistnode *prev = *src_head; - src_head = &(*src_head)->next; - free(prev); - } - /* src node is discontiguously between dst head and next */ - else if ((*src_head)->lo > (*dst_head)->hi - && (*src_head)->hi < dst_nlo) { - /* link src node in */ - (*src_head)->next = (*dst_head)->next; - (*dst_head)->next = *src_head; - /* and advance src node */ - src_head = &(*src_head)->next; - } - /* otherwise, advance dst node */ - else { - dst_head = &(*dst_head)->next; - } + BT_flistnode *src_head = state->pending_flist; + BT_flistnode *prev = 0; + while (src_head) { + _flist_insert(&state->flist, src_head->lo, src_head->hi); + prev = src_head; + src_head = src_head->next; + free(prev); } - /* merge what remains of src if anything */ - *dst_head = *src_head; }