From fea159ab2dbee45fc1f76da208c6b8628cf52ee6 Mon Sep 17 00:00:00 2001 From: hosted-fornet Date: Wed, 25 Sep 2024 22:07:41 -0700 Subject: [PATCH 01/37] fd_manager: add initial runtime module --- Cargo.lock | 1 + kinode/Cargo.toml | 1 + kinode/src/fd_manager.rs | 192 +++++++++++++++++++++++++++++++++++++++ kinode/src/main.rs | 17 ++++ lib/src/core.rs | 26 ++++++ 5 files changed, 237 insertions(+) create mode 100644 kinode/src/fd_manager.rs diff --git a/Cargo.lock b/Cargo.lock index c1af03ec..b9f99d0c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3665,6 +3665,7 @@ dependencies = [ "kit", "lazy_static", "lib", + "libc", "nohash-hasher", "open", "public-ip", diff --git a/kinode/Cargo.toml b/kinode/Cargo.toml index b6834c61..31c17650 100644 --- a/kinode/Cargo.toml +++ b/kinode/Cargo.toml @@ -62,6 +62,7 @@ indexmap = "2.4" jwt = "0.16" lib = { path = "../lib" } lazy_static = "1.4.0" +libc = "0.2" nohash-hasher = "0.2.0" open = "5.1.4" public-ip = "0.2.2" diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs new file mode 100644 index 00000000..8cbf3693 --- /dev/null +++ b/kinode/src/fd_manager.rs @@ -0,0 +1,192 @@ +use lib::types::core::{ + KernelMessage, Message, MessageReceiver, MessageSender, PrintSender, + Printout, ProcessId, Request, FdManagerRequest, FdManagerError, FD_MANAGER_PROCESS_ID, +}; +use std::{ + collections::HashMap, + sync::Arc, +}; + +const DEFAULT_MAX_OPEN_FDS: u64 = 180; +const DEFAULT_FDS_AS_FRACTION_OF_ULIMIT_PERCENTAGE: u64 = 60; +const DEFAULT_UPDATE_ULIMIT_SECS: u64 = 3600; +const DEFAULT_CULL_FRACTION_DENOMINATOR: u64 = 2; + +struct State { + fds: HashMap, + mode: Mode, + total_fds: u64, + max_fds: u64, + cull_fraction_denominator: u64, +} + +enum Mode { + /// don't update the max_fds except by user input + StaticMax, + /// check the system's ulimit periodically and update max_fds accordingly + DynamicMax { + max_fds_as_fraction_of_ulimit_percentage: u64, + update_ulimit_secs: u64, + } +} + +impl State { + fn new() -> Self { + Self::default() + } + + fn default() -> Self { + Self { + fds: HashMap::new(), + mode: Mode::default(), + total_fds: 0, + max_fds: DEFAULT_MAX_OPEN_FDS, + cull_fraction_denominator: DEFAULT_CULL_FRACTION_DENOMINATOR, + } + } + + fn update_max_fds_from_ulimit(&mut self, ulimit_max_fds: u64) { + let Mode::DynamicMax { ref max_fds_as_fraction_of_ulimit_percentage, .. } = self.mode else { + return; + }; + self.max_fds = ulimit_max_fds * max_fds_as_fraction_of_ulimit_percentage / 100; + } +} + +impl Mode { + fn default() -> Self { + Self::DynamicMax { + max_fds_as_fraction_of_ulimit_percentage: DEFAULT_FDS_AS_FRACTION_OF_ULIMIT_PERCENTAGE, + update_ulimit_secs: DEFAULT_UPDATE_ULIMIT_SECS, + } + } +} + +/// The fd_manager entrypoint. +pub async fn fd_manager( + our_node: Arc, + send_to_loop: MessageSender, + send_to_terminal: PrintSender, + mut recv_from_loop: MessageReceiver, +) -> anyhow::Result<()> { + let mut state = State::new(); + let mut interval = { + // in code block to release the reference into state + let Mode::DynamicMax { ref update_ulimit_secs, .. } = state.mode else { + return Ok(()) + }; + tokio::time::interval(tokio::time::Duration::from_secs( + update_ulimit_secs.clone() + )) + }; + let our_node = our_node.as_str(); + loop { + tokio::select! 
{ + Some(message) = recv_from_loop.recv() => { + handle_message(message, &mut interval, &mut state)?; + } + _ = interval.tick() => { + update_max_fds(&send_to_terminal, &mut state).await?; + } + } + + if state.total_fds >= state.max_fds { + send_cull(our_node, &send_to_loop, &state).await?; + } + } +} + +fn handle_message(km: KernelMessage, _interval: &mut tokio::time::Interval, state: &mut State) -> anyhow::Result<()> { + let Message::Request(Request { + body, + .. + }) = km.message else { + return Err(FdManagerError::NotARequest.into()); + }; + let request: FdManagerRequest = serde_json::from_slice(&body) + .map_err(|_e| FdManagerError::BadRequest)?; + match request { + FdManagerRequest::OpenFds { number_opened } => { + state.total_fds += number_opened; + state.fds + .entry(km.source.process) + .and_modify(|e| *e += number_opened) + .or_insert(number_opened); + } + FdManagerRequest::CloseFds { mut number_closed } => { + assert!(state.total_fds >= number_closed); + state.total_fds -= number_closed; + state.fds + .entry(km.source.process) + .and_modify(|e| { + assert!(e >= &mut number_closed); + *e -= number_closed + }) + .or_insert(number_closed); + } + FdManagerRequest::Cull { .. } => { + return Err(FdManagerError::FdManagerWasSentCull.into()); + } + FdManagerRequest::UpdateMaxFdsAsFractionOfUlimitPercentage(_new) => { + unimplemented!(); + } + FdManagerRequest::UpdateUpdateUlimitSecs(_new) => { + unimplemented!(); + } + FdManagerRequest::UpdateCullFractionDenominator(_new) => { + unimplemented!(); + } + } + Ok(()) +} + +async fn update_max_fds(send_to_terminal: &PrintSender, state: &mut State) -> anyhow::Result<()> { + let ulimit_max_fds = match get_max_fd_limit() { + Ok(ulimit_max_fds) => ulimit_max_fds, + Err(_) => { + Printout::new(1, "Couldn't update max fd limit: ulimit failed") + .send(send_to_terminal).await; + return Ok(()); + } + }; + state.update_max_fds_from_ulimit(ulimit_max_fds); + Ok(()) +} + +async fn send_cull(our_node: &str, send_to_loop: &MessageSender, state: &State) -> anyhow::Result<()> { + let message = Message::Request(Request { + inherit: false, + expects_response: None, + body: serde_json::to_vec(&FdManagerRequest::Cull { + cull_fraction_denominator: state.cull_fraction_denominator.clone(), + }).unwrap(), + metadata: None, + capabilities: vec![], + }); + for process_id in state.fds.keys() { + KernelMessage::builder() + .id(rand::random()) + .source((our_node.clone(), FD_MANAGER_PROCESS_ID.clone())) + .target((our_node.clone(), process_id.clone())) + .message(message.clone()) + .build() + .unwrap() + .send(send_to_loop) + .await; + } + Ok(()) +} + +fn get_max_fd_limit() -> anyhow::Result { + let mut rlim = libc::rlimit { + rlim_cur: 0, // Current limit + rlim_max: 0, // Maximum limit value + }; + + // RLIMIT_NOFILE is the resource indicating the maximum file descriptor number. 
+ if unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rlim) } == 0 { + Ok(rlim.rlim_cur as u64) + } else { + Err(anyhow::anyhow!("Failed to get the resource limit.")) + } +} diff --git a/kinode/src/main.rs b/kinode/src/main.rs index 61619479..50c4fe7e 100644 --- a/kinode/src/main.rs +++ b/kinode/src/main.rs @@ -17,6 +17,7 @@ use tokio::sync::mpsc; mod eth; #[cfg(feature = "simulation-mode")] mod fakenet; +mod fd_manager; mod http; mod kernel; mod keygen; @@ -42,6 +43,7 @@ const VFS_CHANNEL_CAPACITY: usize = 1_000; const CAP_CHANNEL_CAPACITY: usize = 1_000; const KV_CHANNEL_CAPACITY: usize = 1_000; const SQLITE_CHANNEL_CAPACITY: usize = 1_000; +const FD_MANAGER_CHANNEL_CAPACITY: usize = 1_000; const VERSION: &str = env!("CARGO_PKG_VERSION"); const WS_MIN_PORT: u16 = 9_000; const TCP_MIN_PORT: u16 = 10_000; @@ -175,6 +177,9 @@ async fn main() { // vfs maintains metadata about files in fs for processes let (vfs_message_sender, vfs_message_receiver): (MessageSender, MessageReceiver) = mpsc::channel(VFS_CHANNEL_CAPACITY); + // fd_manager makes sure we don't overrun the `ulimit -n`: max number of file descriptors + let (fd_manager_sender, fd_manager_receiver): (MessageSender, MessageReceiver) = + mpsc::channel(FD_MANAGER_CHANNEL_CAPACITY); // terminal receives prints via this channel, all other modules send prints let (print_sender, print_receiver): (PrintSender, PrintReceiver) = mpsc::channel(TERMINAL_CHANNEL_CAPACITY); @@ -282,6 +287,12 @@ async fn main() { None, false, ), + ( + ProcessId::new(Some("fd_manager"), "distro", "sys"), + fd_manager_sender, + None, + false, + ), ]; /* @@ -351,6 +362,12 @@ async fn main() { db, home_directory_path.clone(), )); + tasks.spawn(fd_manager::fd_manager( + our_name_arc.clone(), + kernel_message_sender.clone(), + print_sender.clone(), + fd_manager_receiver, + )); tasks.spawn(kv::kv( our_name_arc.clone(), kernel_message_sender.clone(), diff --git a/lib/src/core.rs b/lib/src/core.rs index b8a28676..a626fc86 100644 --- a/lib/src/core.rs +++ b/lib/src/core.rs @@ -17,6 +17,7 @@ lazy_static::lazy_static! 
{ pub static ref STATE_PROCESS_ID: ProcessId = ProcessId::new(Some("state"), "distro", "sys"); pub static ref KV_PROCESS_ID: ProcessId = ProcessId::new(Some("kv"), "distro", "sys"); pub static ref SQLITE_PROCESS_ID: ProcessId = ProcessId::new(Some("sqlite"), "distro", "sys"); + pub static ref FD_MANAGER_PROCESS_ID: ProcessId = ProcessId::new(Some("fd_manager"), "distro", "sys"); } // @@ -2068,3 +2069,28 @@ impl KnsUpdate { self.ports.get(protocol) } } + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub enum FdManagerRequest { + /// other process -> fd_manager + OpenFds { number_opened: u64 }, + CloseFds { number_closed: u64 }, + + /// fd_manager -> other process + Cull { cull_fraction_denominator: u64 }, + + /// administrative + UpdateMaxFdsAsFractionOfUlimitPercentage(u64), + UpdateUpdateUlimitSecs(u64), + UpdateCullFractionDenominator(u64), +} + +#[derive(Debug, Error)] +pub enum FdManagerError { + #[error("fd_manager: received a non-Request message")] + NotARequest, + #[error("fd_manager: received a non-FdManangerRequest")] + BadRequest, + #[error("fd_manager: received a FdManagerRequest::Cull, but I am the one who culls")] + FdManagerWasSentCull, +} From a4ad477fe93fd63f8034abe1a90ff5c2882ceb4f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 26 Sep 2024 05:08:14 +0000 Subject: [PATCH 02/37] Format Rust code using rustfmt --- kinode/src/fd_manager.rs | 62 ++++++++++++++++++++++++---------------- lib/src/core.rs | 12 ++++++-- 2 files changed, 46 insertions(+), 28 deletions(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index 8cbf3693..78f190db 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -1,11 +1,8 @@ use lib::types::core::{ - KernelMessage, Message, MessageReceiver, MessageSender, PrintSender, - Printout, ProcessId, Request, FdManagerRequest, FdManagerError, FD_MANAGER_PROCESS_ID, -}; -use std::{ - collections::HashMap, - sync::Arc, + FdManagerError, FdManagerRequest, KernelMessage, Message, MessageReceiver, MessageSender, + PrintSender, Printout, ProcessId, Request, FD_MANAGER_PROCESS_ID, }; +use std::{collections::HashMap, sync::Arc}; const DEFAULT_MAX_OPEN_FDS: u64 = 180; const DEFAULT_FDS_AS_FRACTION_OF_ULIMIT_PERCENTAGE: u64 = 60; @@ -27,7 +24,7 @@ enum Mode { DynamicMax { max_fds_as_fraction_of_ulimit_percentage: u64, update_ulimit_secs: u64, - } + }, } impl State { @@ -46,7 +43,11 @@ impl State { } fn update_max_fds_from_ulimit(&mut self, ulimit_max_fds: u64) { - let Mode::DynamicMax { ref max_fds_as_fraction_of_ulimit_percentage, .. } = self.mode else { + let Mode::DynamicMax { + ref max_fds_as_fraction_of_ulimit_percentage, + .. + } = self.mode + else { return; }; self.max_fds = ulimit_max_fds * max_fds_as_fraction_of_ulimit_percentage / 100; @@ -72,12 +73,14 @@ pub async fn fd_manager( let mut state = State::new(); let mut interval = { // in code block to release the reference into state - let Mode::DynamicMax { ref update_ulimit_secs, .. } = state.mode else { - return Ok(()) + let Mode::DynamicMax { + ref update_ulimit_secs, + .. 
+ } = state.mode + else { + return Ok(()); }; - tokio::time::interval(tokio::time::Duration::from_secs( - update_ulimit_secs.clone() - )) + tokio::time::interval(tokio::time::Duration::from_secs(update_ulimit_secs.clone())) }; let our_node = our_node.as_str(); loop { @@ -96,19 +99,21 @@ pub async fn fd_manager( } } -fn handle_message(km: KernelMessage, _interval: &mut tokio::time::Interval, state: &mut State) -> anyhow::Result<()> { - let Message::Request(Request { - body, - .. - }) = km.message else { +fn handle_message( + km: KernelMessage, + _interval: &mut tokio::time::Interval, + state: &mut State, +) -> anyhow::Result<()> { + let Message::Request(Request { body, .. }) = km.message else { return Err(FdManagerError::NotARequest.into()); }; - let request: FdManagerRequest = serde_json::from_slice(&body) - .map_err(|_e| FdManagerError::BadRequest)?; + let request: FdManagerRequest = + serde_json::from_slice(&body).map_err(|_e| FdManagerError::BadRequest)?; match request { FdManagerRequest::OpenFds { number_opened } => { state.total_fds += number_opened; - state.fds + state + .fds .entry(km.source.process) .and_modify(|e| *e += number_opened) .or_insert(number_opened); @@ -116,7 +121,8 @@ fn handle_message(km: KernelMessage, _interval: &mut tokio::time::Interval, stat FdManagerRequest::CloseFds { mut number_closed } => { assert!(state.total_fds >= number_closed); state.total_fds -= number_closed; - state.fds + state + .fds .entry(km.source.process) .and_modify(|e| { assert!(e >= &mut number_closed); @@ -145,7 +151,8 @@ async fn update_max_fds(send_to_terminal: &PrintSender, state: &mut State) -> an Ok(ulimit_max_fds) => ulimit_max_fds, Err(_) => { Printout::new(1, "Couldn't update max fd limit: ulimit failed") - .send(send_to_terminal).await; + .send(send_to_terminal) + .await; return Ok(()); } }; @@ -153,13 +160,18 @@ async fn update_max_fds(send_to_terminal: &PrintSender, state: &mut State) -> an Ok(()) } -async fn send_cull(our_node: &str, send_to_loop: &MessageSender, state: &State) -> anyhow::Result<()> { +async fn send_cull( + our_node: &str, + send_to_loop: &MessageSender, + state: &State, +) -> anyhow::Result<()> { let message = Message::Request(Request { inherit: false, expects_response: None, body: serde_json::to_vec(&FdManagerRequest::Cull { cull_fraction_denominator: state.cull_fraction_denominator.clone(), - }).unwrap(), + }) + .unwrap(), metadata: None, capabilities: vec![], }); diff --git a/lib/src/core.rs b/lib/src/core.rs index a626fc86..3cb65d82 100644 --- a/lib/src/core.rs +++ b/lib/src/core.rs @@ -2073,11 +2073,17 @@ impl KnsUpdate { #[derive(Clone, Debug, Serialize, Deserialize)] pub enum FdManagerRequest { /// other process -> fd_manager - OpenFds { number_opened: u64 }, - CloseFds { number_closed: u64 }, + OpenFds { + number_opened: u64, + }, + CloseFds { + number_closed: u64, + }, /// fd_manager -> other process - Cull { cull_fraction_denominator: u64 }, + Cull { + cull_fraction_denominator: u64, + }, /// administrative UpdateMaxFdsAsFractionOfUlimitPercentage(u64), From c3e01d2800df7ec7472d9eef3e5cc70b93850201 Mon Sep 17 00:00:00 2001 From: hosted-fornet Date: Thu, 26 Sep 2024 12:14:09 -0700 Subject: [PATCH 03/37] fd_manager: add helpers --- kinode/src/fd_manager.rs | 43 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index 78f190db..a4831c08 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -1,5 +1,5 @@ use lib::types::core::{ - 
FdManagerError, FdManagerRequest, KernelMessage, Message, MessageReceiver, MessageSender, + Address, FdManagerError, FdManagerRequest, KernelMessage, Message, MessageReceiver, MessageSender, PrintSender, Printout, ProcessId, Request, FD_MANAGER_PROCESS_ID, }; use std::{collections::HashMap, sync::Arc}; @@ -178,8 +178,8 @@ async fn send_cull( for process_id in state.fds.keys() { KernelMessage::builder() .id(rand::random()) - .source((our_node.clone(), FD_MANAGER_PROCESS_ID.clone())) - .target((our_node.clone(), process_id.clone())) + .source((our_node, FD_MANAGER_PROCESS_ID.clone())) + .target((our_node, process_id.clone())) .message(message.clone()) .build() .unwrap() @@ -202,3 +202,40 @@ fn get_max_fd_limit() -> anyhow::Result { Err(anyhow::anyhow!("Failed to get the resource limit.")) } } + +pub async fn send_fd_manager_open(our: &Address, number_opened: u64, send_to_loop: &MessageSender) -> anyhow::Result<()> { + let message = Message::Request(Request { + inherit: false, + expects_response: None, + body: serde_json::to_vec(&FdManagerRequest::OpenFds { number_opened }).unwrap(), + metadata: None, + capabilities: vec![], + }); + send_to_fd_manager(our, message, send_to_loop).await?; + Ok(()) +} + +pub async fn send_fd_manager_close(our: &Address, number_closed: u64, send_to_loop: &MessageSender) -> anyhow::Result<()> { + let message = Message::Request(Request { + inherit: false, + expects_response: None, + body: serde_json::to_vec(&FdManagerRequest::CloseFds { number_closed }).unwrap(), + metadata: None, + capabilities: vec![], + }); + send_to_fd_manager(our, message, send_to_loop).await?; + Ok(()) +} + +async fn send_to_fd_manager(our: &Address, message: Message, send_to_loop: &MessageSender) -> anyhow::Result<()> { + KernelMessage::builder() + .id(rand::random()) + .source(our.clone()) + .target((our.node.clone(), FD_MANAGER_PROCESS_ID.clone())) + .message(message) + .build() + .unwrap() + .send(send_to_loop) + .await; + Ok(()) +} From bfaf63120fb7b1f51b25af44d4f710f251b4da63 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 26 Sep 2024 19:18:14 +0000 Subject: [PATCH 04/37] Format Rust code using rustfmt --- kinode/src/fd_manager.rs | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index a4831c08..76cc9371 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -1,6 +1,6 @@ use lib::types::core::{ - Address, FdManagerError, FdManagerRequest, KernelMessage, Message, MessageReceiver, MessageSender, - PrintSender, Printout, ProcessId, Request, FD_MANAGER_PROCESS_ID, + Address, FdManagerError, FdManagerRequest, KernelMessage, Message, MessageReceiver, + MessageSender, PrintSender, Printout, ProcessId, Request, FD_MANAGER_PROCESS_ID, }; use std::{collections::HashMap, sync::Arc}; @@ -203,7 +203,11 @@ fn get_max_fd_limit() -> anyhow::Result { } } -pub async fn send_fd_manager_open(our: &Address, number_opened: u64, send_to_loop: &MessageSender) -> anyhow::Result<()> { +pub async fn send_fd_manager_open( + our: &Address, + number_opened: u64, + send_to_loop: &MessageSender, +) -> anyhow::Result<()> { let message = Message::Request(Request { inherit: false, expects_response: None, @@ -215,7 +219,11 @@ pub async fn send_fd_manager_open(our: &Address, number_opened: u64, send_to_loo Ok(()) } -pub async fn send_fd_manager_close(our: &Address, number_closed: u64, send_to_loop: &MessageSender) -> anyhow::Result<()> { +pub 
async fn send_fd_manager_close( + our: &Address, + number_closed: u64, + send_to_loop: &MessageSender, +) -> anyhow::Result<()> { let message = Message::Request(Request { inherit: false, expects_response: None, @@ -227,7 +235,11 @@ pub async fn send_fd_manager_close(our: &Address, number_closed: u64, send_to_lo Ok(()) } -async fn send_to_fd_manager(our: &Address, message: Message, send_to_loop: &MessageSender) -> anyhow::Result<()> { +async fn send_to_fd_manager( + our: &Address, + message: Message, + send_to_loop: &MessageSender, +) -> anyhow::Result<()> { KernelMessage::builder() .id(rand::random()) .source(our.clone()) From de003dca0d0df44a4b4dfa3022975bab8e705a00 Mon Sep 17 00:00:00 2001 From: hosted-fornet Date: Thu, 26 Sep 2024 16:09:19 -0700 Subject: [PATCH 05/37] vfs: hook into fd_manager --- kinode/src/fd_manager.rs | 34 +++++++++++++--- kinode/src/main.rs | 2 +- kinode/src/vfs.rs | 83 +++++++++++++++++++++++++++++++++------- lib/src/core.rs | 3 ++ 4 files changed, 103 insertions(+), 19 deletions(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index 76cc9371..6de71d9a 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -86,7 +86,13 @@ pub async fn fd_manager( loop { tokio::select! { Some(message) = recv_from_loop.recv() => { - handle_message(message, &mut interval, &mut state)?; + if let Some(to_print) = handle_message( + message, + &mut interval, + &mut state, + )? { + Printout::new(2, to_print).send(&send_to_terminal).await; + } } _ = interval.tick() => { update_max_fds(&send_to_terminal, &mut state).await?; @@ -94,6 +100,16 @@ pub async fn fd_manager( } if state.total_fds >= state.max_fds { + Printout::new( + 2, + format!( + "Have {} open >= {} max fds; sending Cull Request...", + state.total_fds, + state.max_fds, + ) + ) + .send(&send_to_terminal) + .await; send_cull(our_node, &send_to_loop, &state).await?; } } @@ -103,13 +119,13 @@ fn handle_message( km: KernelMessage, _interval: &mut tokio::time::Interval, state: &mut State, -) -> anyhow::Result<()> { +) -> anyhow::Result> { let Message::Request(Request { body, .. }) = km.message else { return Err(FdManagerError::NotARequest.into()); }; let request: FdManagerRequest = serde_json::from_slice(&body).map_err(|_e| FdManagerError::BadRequest)?; - match request { + let return_value = match request { FdManagerRequest::OpenFds { number_opened } => { state.total_fds += number_opened; state @@ -117,9 +133,16 @@ fn handle_message( .entry(km.source.process) .and_modify(|e| *e += number_opened) .or_insert(number_opened); + None } FdManagerRequest::CloseFds { mut number_closed } => { assert!(state.total_fds >= number_closed); + let return_value = Some(format!( + "{} closed {} of {}", + km.source.process, + number_closed, + state.total_fds, + )); state.total_fds -= number_closed; state .fds @@ -129,6 +152,7 @@ fn handle_message( *e -= number_closed }) .or_insert(number_closed); + return_value } FdManagerRequest::Cull { .. 
} => { return Err(FdManagerError::FdManagerWasSentCull.into()); @@ -142,8 +166,8 @@ fn handle_message( FdManagerRequest::UpdateCullFractionDenominator(_new) => { unimplemented!(); } - } - Ok(()) + }; + Ok(return_value) } async fn update_max_fds(send_to_terminal: &PrintSender, state: &mut State) -> anyhow::Result<()> { diff --git a/kinode/src/main.rs b/kinode/src/main.rs index 50c4fe7e..9b4cee0b 100644 --- a/kinode/src/main.rs +++ b/kinode/src/main.rs @@ -17,7 +17,7 @@ use tokio::sync::mpsc; mod eth; #[cfg(feature = "simulation-mode")] mod fakenet; -mod fd_manager; +pub mod fd_manager; mod http; mod kernel; mod keygen; diff --git a/kinode/src/vfs.rs b/kinode/src/vfs.rs index 9baad6be..960027c2 100644 --- a/kinode/src/vfs.rs +++ b/kinode/src/vfs.rs @@ -1,9 +1,9 @@ use dashmap::DashMap; use lib::types::core::{ - Address, CapMessage, CapMessageSender, Capability, DirEntry, FileMetadata, FileType, + Address, CapMessage, CapMessageSender, Capability, DirEntry, FdManagerRequest, FileMetadata, FileType, KernelMessage, LazyLoadBlob, Message, MessageReceiver, MessageSender, PackageId, PrintSender, Printout, ProcessId, Request, Response, VfsAction, VfsError, VfsRequest, VfsResponse, - KERNEL_PROCESS_ID, VFS_PROCESS_ID, + FD_MANAGER_PROCESS_ID, KERNEL_PROCESS_ID, VFS_PROCESS_ID, }; use std::{ collections::{HashMap, HashSet, VecDeque}, @@ -52,7 +52,10 @@ pub async fn vfs( .map_err(|e| anyhow::anyhow!("failed creating vfs dir! {e:?}"))?; let vfs_path = Arc::new(fs::canonicalize(&vfs_path).await?); - let files = Files::new(); + let files = Files::new( + Address::new(our_node.as_str(), VFS_PROCESS_ID.clone()), + send_to_loop.clone(), + ); let process_queues: HashMap>>> = HashMap::default(); @@ -71,6 +74,22 @@ pub async fn vfs( continue; } + if km.source.process == *FD_MANAGER_PROCESS_ID { + let files = files.clone(); + let send_to_terminal = send_to_terminal.clone(); + tokio::spawn(async move { + if let Err(e) = handle_fd_request(km, files).await { + Printout::new( + 1, + format!("vfs: got request from fd_manager that errored: {e:?}"), + ) + .send(&send_to_terminal) + .await; + }; + }); + continue; + } + let queue = process_queues .get(&km.source.process) .cloned() @@ -137,6 +156,8 @@ struct Files { cursor_positions: Arc>, /// access order of files access_order: Arc>>, + our: Address, + send_to_loop: MessageSender, } struct FileEntry { @@ -145,11 +166,13 @@ struct FileEntry { } impl Files { - pub fn new() -> Self { + pub fn new(our: Address, send_to_loop: MessageSender) -> Self { Self { open_files: Arc::new(DashMap::new()), cursor_positions: Arc::new(DashMap::new()), access_order: Arc::new(Mutex::new(UniqueQueue::new())), + our, + send_to_loop, } } @@ -167,10 +190,6 @@ impl Files { return Ok(entry.value().file.clone()); } - if self.open_files.len() >= MAX_OPEN_FILES { - self.close_least_recently_used_files().await?; - } - let mut file = self.try_open_file(&path, create, truncate).await?; if let Some(position) = self.cursor_positions.get(&path) { file.seek(SeekFrom::Start(*position)).await?; @@ -184,18 +203,29 @@ impl Files { }, ); self.update_access_order(&path).await; + crate::fd_manager::send_fd_manager_open(&self.our, 1, &self.send_to_loop) + .await + .map_err(|e| VfsError::Other { error: e.to_string() })?; Ok(file) } + async fn remove_file(&self, path: &Path) -> Result<(), VfsError> { + if self.open_files.remove(path).is_some() { + crate::fd_manager::send_fd_manager_close(&self.our, 1, &self.send_to_loop) + .await + .map_err(|e| VfsError::Other { error: e.to_string() })?; + } + Ok(()) + } + async fn 
update_access_order(&self, path: &Path) { let mut access_order = self.access_order.lock().await; access_order.push_back(path.to_path_buf()); } - async fn close_least_recently_used_files(&self) -> Result<(), VfsError> { + async fn close_least_recently_used_files(&self, to_close: u64) -> Result<(), VfsError> { let mut access_order = self.access_order.lock().await; let mut closed = 0; - let to_close = MAX_OPEN_FILES / 3; // close 33% of max open files while closed < to_close { if let Some(path) = access_order.pop_front() { @@ -218,6 +248,9 @@ impl Files { break; // no more files to close } } + crate::fd_manager::send_fd_manager_close(&self.our, closed, &self.send_to_loop) + .await + .map_err(|e| VfsError::Other { error: e.to_string() })?; Ok(()) } @@ -361,7 +394,7 @@ async fn handle_request( } VfsAction::CreateFile => { // create truncates any file that might've existed before - files.open_files.remove(&path); + files.remove_file(&path).await?; let _file = files.open_file(&path, true, true).await?; (VfsResponse::Ok, None) } @@ -373,7 +406,7 @@ async fn handle_request( } VfsAction::CloseFile => { // removes file from scope, resets file_handle and cursor. - files.open_files.remove(&path); + files.remove_file(&path).await?; (VfsResponse::Ok, None) } VfsAction::WriteAll => { @@ -470,7 +503,7 @@ async fn handle_request( } VfsAction::RemoveFile => { fs::remove_file(&path).await?; - files.open_files.remove(&path); + files.remove_file(&path).await?; (VfsResponse::Ok, None) } VfsAction::RemoveDir => { @@ -993,3 +1026,27 @@ fn join_paths_safely(base: &PathBuf, extension: &str) -> PathBuf { let extension_path = Path::new(extension_str); base.join(extension_path) } + +async fn handle_fd_request(km: KernelMessage, files: Files) -> anyhow::Result<()> { + let Message::Request(Request { + body, + .. + }) = km.message + else { + return Err(anyhow::anyhow!("not a request")); + }; + + let request: FdManagerRequest = serde_json::from_slice(&body)?; + + match request { + FdManagerRequest::Cull { cull_fraction_denominator } => { + let fraction_to_close = files.open_files.len() as u64 / cull_fraction_denominator; + files.close_least_recently_used_files(fraction_to_close).await?; + } + _ => { + return Err(anyhow::anyhow!("non-Cull FdManagerRequest")); + } + } + + Ok(()) +} diff --git a/lib/src/core.rs b/lib/src/core.rs index 3cb65d82..d2579cc0 100644 --- a/lib/src/core.rs +++ b/lib/src/core.rs @@ -1720,6 +1720,8 @@ pub enum VfsError { NotFound { path: String }, #[error("Creating directory failed at path: {path}: {error}")] CreateDirError { path: String, error: String }, + #[error("Other error: {error}")] + Other { error: String }, } impl VfsError { @@ -1734,6 +1736,7 @@ impl VfsError { VfsError::BadJson { .. } => "NoJson", VfsError::NotFound { .. } => "NotFound", VfsError::CreateDirError { .. } => "CreateDirError", + VfsError::Other { .. 
} => "Other", } } } From cce79118ae9b78b65718a48eaf79029a0d730274 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 26 Sep 2024 23:09:42 +0000 Subject: [PATCH 06/37] Format Rust code using rustfmt --- kinode/src/fd_manager.rs | 9 +++------ kinode/src/vfs.rs | 34 ++++++++++++++++++++-------------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index 6de71d9a..55fb7453 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -104,9 +104,8 @@ pub async fn fd_manager( 2, format!( "Have {} open >= {} max fds; sending Cull Request...", - state.total_fds, - state.max_fds, - ) + state.total_fds, state.max_fds, + ), ) .send(&send_to_terminal) .await; @@ -139,9 +138,7 @@ fn handle_message( assert!(state.total_fds >= number_closed); let return_value = Some(format!( "{} closed {} of {}", - km.source.process, - number_closed, - state.total_fds, + km.source.process, number_closed, state.total_fds, )); state.total_fds -= number_closed; state diff --git a/kinode/src/vfs.rs b/kinode/src/vfs.rs index 960027c2..0764d960 100644 --- a/kinode/src/vfs.rs +++ b/kinode/src/vfs.rs @@ -1,9 +1,9 @@ use dashmap::DashMap; use lib::types::core::{ - Address, CapMessage, CapMessageSender, Capability, DirEntry, FdManagerRequest, FileMetadata, FileType, - KernelMessage, LazyLoadBlob, Message, MessageReceiver, MessageSender, PackageId, PrintSender, - Printout, ProcessId, Request, Response, VfsAction, VfsError, VfsRequest, VfsResponse, - FD_MANAGER_PROCESS_ID, KERNEL_PROCESS_ID, VFS_PROCESS_ID, + Address, CapMessage, CapMessageSender, Capability, DirEntry, FdManagerRequest, FileMetadata, + FileType, KernelMessage, LazyLoadBlob, Message, MessageReceiver, MessageSender, PackageId, + PrintSender, Printout, ProcessId, Request, Response, VfsAction, VfsError, VfsRequest, + VfsResponse, FD_MANAGER_PROCESS_ID, KERNEL_PROCESS_ID, VFS_PROCESS_ID, }; use std::{ collections::{HashMap, HashSet, VecDeque}, @@ -205,7 +205,9 @@ impl Files { self.update_access_order(&path).await; crate::fd_manager::send_fd_manager_open(&self.our, 1, &self.send_to_loop) .await - .map_err(|e| VfsError::Other { error: e.to_string() })?; + .map_err(|e| VfsError::Other { + error: e.to_string(), + })?; Ok(file) } @@ -213,7 +215,9 @@ impl Files { if self.open_files.remove(path).is_some() { crate::fd_manager::send_fd_manager_close(&self.our, 1, &self.send_to_loop) .await - .map_err(|e| VfsError::Other { error: e.to_string() })?; + .map_err(|e| VfsError::Other { + error: e.to_string(), + })?; } Ok(()) } @@ -250,7 +254,9 @@ impl Files { } crate::fd_manager::send_fd_manager_close(&self.our, closed, &self.send_to_loop) .await - .map_err(|e| VfsError::Other { error: e.to_string() })?; + .map_err(|e| VfsError::Other { + error: e.to_string(), + })?; Ok(()) } @@ -1028,20 +1034,20 @@ fn join_paths_safely(base: &PathBuf, extension: &str) -> PathBuf { } async fn handle_fd_request(km: KernelMessage, files: Files) -> anyhow::Result<()> { - let Message::Request(Request { - body, - .. - }) = km.message - else { + let Message::Request(Request { body, .. 
}) = km.message else { return Err(anyhow::anyhow!("not a request")); }; let request: FdManagerRequest = serde_json::from_slice(&body)?; match request { - FdManagerRequest::Cull { cull_fraction_denominator } => { + FdManagerRequest::Cull { + cull_fraction_denominator, + } => { let fraction_to_close = files.open_files.len() as u64 / cull_fraction_denominator; - files.close_least_recently_used_files(fraction_to_close).await?; + files + .close_least_recently_used_files(fraction_to_close) + .await?; } _ => { return Err(anyhow::anyhow!("non-Cull FdManagerRequest")); From 9b91f3cd966468c95b594ff5823665212b56f5f8 Mon Sep 17 00:00:00 2001 From: dr-frmr Date: Fri, 27 Sep 2024 16:30:10 -0400 Subject: [PATCH 07/37] net: add MAX_PEERS, add idle timeouts --- kinode/src/net/connect.rs | 4 +++- kinode/src/net/indirect.rs | 1 + kinode/src/net/mod.rs | 10 ++++---- kinode/src/net/tcp/mod.rs | 2 ++ kinode/src/net/tcp/utils.rs | 7 +++++- kinode/src/net/types.rs | 47 +++++++++++++++++++++++++++++++++++-- kinode/src/net/utils.rs | 6 +++++ kinode/src/net/ws/mod.rs | 2 ++ kinode/src/net/ws/utils.rs | 7 +++++- 9 files changed, 77 insertions(+), 9 deletions(-) diff --git a/kinode/src/net/connect.rs b/kinode/src/net/connect.rs index 9b8b8554..5ff8e621 100644 --- a/kinode/src/net/connect.rs +++ b/kinode/src/net/connect.rs @@ -7,8 +7,9 @@ use tokio::sync::mpsc; /// if target is a peer, queue to be routed /// otherwise, create peer and initiate routing pub async fn send_to_peer(ext: &IdentityExt, data: &NetData, km: KernelMessage) { - if let Some(peer) = data.peers.get_mut(&km.target.node) { + if let Some(mut peer) = data.peers.get_mut(&km.target.node) { peer.sender.send(km).expect("net: peer sender was dropped"); + peer.set_last_message(); } else { let Some(peer_id) = data.pki.get(&km.target.node) else { return utils::error_offline(km, &ext.network_error_tx).await; @@ -22,6 +23,7 @@ pub async fn send_to_peer(ext: &IdentityExt, data: &NetData, km: KernelMessage) identity: peer_id.clone(), routing_for: false, sender: peer_tx.clone(), + last_message: 0, }, ); tokio::spawn(connect_to_peer( diff --git a/kinode/src/net/indirect.rs b/kinode/src/net/indirect.rs index 1fc91120..ba47ea48 100644 --- a/kinode/src/net/indirect.rs +++ b/kinode/src/net/indirect.rs @@ -36,6 +36,7 @@ pub async fn connect_to_router(router_id: &Identity, ext: &IdentityExt, data: &N identity: router_id.clone(), routing_for: false, sender: peer_tx.clone(), + last_message: 0, }, ); if let Some((_ip, port)) = router_id.tcp_routing() { diff --git a/kinode/src/net/mod.rs b/kinode/src/net/mod.rs index 0433bd5f..25e30cae 100644 --- a/kinode/src/net/mod.rs +++ b/kinode/src/net/mod.rs @@ -45,7 +45,7 @@ pub async fn networking( // start by initializing the structs where we'll store PKI in memory // and store a mapping of peers we have an active route for let pki: OnchainPKI = Arc::new(DashMap::new()); - let peers: Peers = Arc::new(DashMap::new()); + let peers: Peers = Peers(Arc::new(DashMap::new())); // only used by routers let pending_passthroughs: PendingPassthroughs = Arc::new(DashMap::new()); @@ -171,6 +171,7 @@ async fn handle_local_request( NetAction::GetPeers => ( NetResponse::Peers( data.peers + .0 .iter() .map(|p| p.identity.clone()) .collect::>(), @@ -189,10 +190,11 @@ async fn handle_local_request( )); printout.push_str(&format!("our Identity: {:#?}\r\n", ext.our)); printout.push_str(&format!( - "we have connections with {} peers:\r\n", - data.peers.len() + "we have connections with {} peers ({} max):\r\n", + data.peers.0.len(), + utils::MAX_PEERS, )); - 
for peer in data.peers.iter() { + for peer in data.peers.0.iter() { printout.push_str(&format!( " {}, routing_for={}\r\n", peer.identity.name, peer.routing_for, diff --git a/kinode/src/net/tcp/mod.rs b/kinode/src/net/tcp/mod.rs index 02dde518..37d6e42a 100644 --- a/kinode/src/net/tcp/mod.rs +++ b/kinode/src/net/tcp/mod.rs @@ -222,6 +222,7 @@ async fn recv_connection( identity: their_id.clone(), routing_for: their_handshake.proxy_request, sender: peer_tx, + last_message: 0, }, ); tokio::spawn(utils::maintain_connection( @@ -343,6 +344,7 @@ pub async fn recv_via_router( identity: peer_id.clone(), routing_for: false, sender: peer_tx, + last_message: 0, }, ); // maintain direct connection diff --git a/kinode/src/net/tcp/utils.rs b/kinode/src/net/tcp/utils.rs index 9a3eba22..40383f05 100644 --- a/kinode/src/net/tcp/utils.rs +++ b/kinode/src/net/tcp/utils.rs @@ -1,7 +1,7 @@ use crate::net::{ tcp::PeerConnection, types::{HandshakePayload, IdentityExt, Peers}, - utils::{print_debug, print_loud, MESSAGE_MAX_SIZE}, + utils::{print_debug, print_loud, IDLE_TIMEOUT, MESSAGE_MAX_SIZE}, }; use lib::types::core::{KernelMessage, MessageSender, NodeId, PrintSender}; use { @@ -82,9 +82,14 @@ pub async fn maintain_connection( } }; + let timeout = tokio::time::sleep(IDLE_TIMEOUT); + tokio::select! { _ = write => (), _ = read => (), + _ = timeout => { + print_debug(&print_tx, &format!("net: closing idle connection with {peer_name}")).await; + } } print_debug(&print_tx, &format!("net: connection lost with {peer_name}")).await; diff --git a/kinode/src/net/types.rs b/kinode/src/net/types.rs index 08a5cbd3..160cc931 100644 --- a/kinode/src/net/types.rs +++ b/kinode/src/net/types.rs @@ -1,3 +1,4 @@ +use crate::net::utils; use lib::types::core::{ Identity, KernelMessage, MessageSender, NetworkErrorSender, NodeId, PrintSender, }; @@ -54,7 +55,40 @@ pub struct RoutingRequest { pub target: NodeId, } -pub type Peers = Arc>; +#[derive(Clone)] +pub struct Peers(pub Arc>); + +impl Peers { + pub fn get(&self, name: &str) -> Option> { + self.0.get(name) + } + + pub fn get_mut( + &self, + name: &str, + ) -> std::option::Option> { + self.0.get_mut(name) + } + + pub fn contains_key(&self, name: &str) -> bool { + self.0.contains_key(name) + } + + /// when a peer is inserted, if the total number of peers exceeds the limit, + /// remove the one with the oldest last_message. + pub fn insert(&self, name: String, peer: Peer) { + self.0.insert(name, peer); + if self.0.len() > utils::MAX_PEERS { + let oldest = self.0.iter().min_by_key(|p| p.last_message).unwrap(); + self.0.remove(oldest.key()); + } + } + + pub fn remove(&self, name: &str) -> Option<(String, Peer)> { + self.0.remove(name) + } +} + pub type OnchainPKI = Arc>; /// (from, target) -> from's socket @@ -73,15 +107,24 @@ impl PendingStream { } } -#[derive(Clone)] pub struct Peer { pub identity: Identity, /// If true, we are routing for them and have a RoutingClientConnection /// associated with them. We can send them prompts to establish Passthroughs. pub routing_for: bool, pub sender: UnboundedSender, + /// unix timestamp of last message sent *or* received + pub last_message: u64, } +impl Peer { + pub fn set_last_message(&mut self) { + self.last_message = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs() + } +} /// [`Identity`], with additional fields for networking. 
#[derive(Clone)] pub struct IdentityExt { diff --git a/kinode/src/net/utils.rs b/kinode/src/net/utils.rs index 293aea95..52b9e924 100644 --- a/kinode/src/net/utils.rs +++ b/kinode/src/net/utils.rs @@ -27,6 +27,12 @@ pub const MESSAGE_MAX_SIZE: u32 = 10_485_800; pub const TIMEOUT: std::time::Duration = std::time::Duration::from_secs(5); +/// 30 minute idle timeout for connections +pub const IDLE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(1800); + +/// maximum number of peers (open connections, but does not include passthroughs we provide!) +pub const MAX_PEERS: usize = 100; + pub async fn create_passthrough( our: &Identity, our_ip: &str, diff --git a/kinode/src/net/ws/mod.rs b/kinode/src/net/ws/mod.rs index 8df1d40f..1b5c5379 100644 --- a/kinode/src/net/ws/mod.rs +++ b/kinode/src/net/ws/mod.rs @@ -194,6 +194,7 @@ pub async fn recv_via_router( identity: peer_id.clone(), routing_for: false, sender: peer_tx, + last_message: 0, }, ); // maintain direct connection @@ -279,6 +280,7 @@ async fn recv_connection( identity: their_id.clone(), routing_for: their_handshake.proxy_request, sender: peer_tx, + last_message: 0, }, ); tokio::spawn(utils::maintain_connection( diff --git a/kinode/src/net/ws/utils.rs b/kinode/src/net/ws/utils.rs index 15bbc3da..ab3acdb8 100644 --- a/kinode/src/net/ws/utils.rs +++ b/kinode/src/net/ws/utils.rs @@ -1,6 +1,6 @@ use crate::net::{ types::{HandshakePayload, IdentityExt, Peers}, - utils::{print_debug, print_loud, MESSAGE_MAX_SIZE}, + utils::{print_debug, print_loud, IDLE_TIMEOUT, MESSAGE_MAX_SIZE}, ws::{PeerConnection, WebSocket}, }; use lib::core::{KernelMessage, MessageSender, NodeId, PrintSender}; @@ -103,9 +103,14 @@ pub async fn maintain_connection( } }; + let timeout = tokio::time::sleep(IDLE_TIMEOUT); + tokio::select! 
{ _ = write => (), _ = read => (), + _ = timeout => { + print_debug(&print_tx, &format!("net: closing idle connection with {peer_name}")).await; + } } print_debug(&print_tx, &format!("net: connection lost with {peer_name}")).await; From f69c3d616e8200127b65ac12fc5687f5cdfd086c Mon Sep 17 00:00:00 2001 From: dr-frmr Date: Mon, 30 Sep 2024 14:23:03 -0400 Subject: [PATCH 08/37] restructure max_peers, add max_passthroughs -- both now boot flags, plus better diagnostics --- kinode/src/main.rs | 10 ++++++ kinode/src/net/connect.rs | 14 ++------ kinode/src/net/indirect.rs | 14 ++------ kinode/src/net/mod.rs | 62 +++++++++++++++++++++++++------- kinode/src/net/tcp/mod.rs | 28 ++++----------- kinode/src/net/types.rs | 74 +++++++++++++++++++++++++++++++------- kinode/src/net/utils.rs | 56 +++++++++++++++++++++-------- kinode/src/net/ws/mod.rs | 28 ++++----------- 8 files changed, 181 insertions(+), 105 deletions(-) diff --git a/kinode/src/main.rs b/kinode/src/main.rs index 9b4cee0b..9e3dbf0a 100644 --- a/kinode/src/main.rs +++ b/kinode/src/main.rs @@ -353,6 +353,8 @@ async fn main() { print_sender.clone(), net_message_receiver, *matches.get_one::("reveal-ip").unwrap_or(&true), + *matches.get_one::("max-peers").unwrap_or(&100), + *matches.get_one::("max-passthroughs").unwrap_or(&0), )); tasks.spawn(state::state_sender( our_name_arc.clone(), @@ -695,6 +697,14 @@ fn build_command() -> Command { .arg( arg!(--"number-log-files" "Number of logs to rotate (default 4)") .value_parser(value_parser!(u64)), + ) + .arg( + arg!(--"max-peers" "Maximum number of peers to hold active connections with (default 100)") + .value_parser(value_parser!(u32)), + ) + .arg( + arg!(--"max-passthroughs" "Maximum number of passthroughs serve as a router (default 0)") + .value_parser(value_parser!(u32)), ); #[cfg(feature = "simulation-mode")] diff --git a/kinode/src/net/connect.rs b/kinode/src/net/connect.rs index 5ff8e621..4d817106 100644 --- a/kinode/src/net/connect.rs +++ b/kinode/src/net/connect.rs @@ -14,18 +14,10 @@ pub async fn send_to_peer(ext: &IdentityExt, data: &NetData, km: KernelMessage) let Some(peer_id) = data.pki.get(&km.target.node) else { return utils::error_offline(km, &ext.network_error_tx).await; }; - let (peer_tx, peer_rx) = mpsc::unbounded_channel(); + let (mut peer, peer_rx) = Peer::new(peer_id.clone(), false); // send message to be routed - peer_tx.send(km).unwrap(); - data.peers.insert( - peer_id.name.clone(), - Peer { - identity: peer_id.clone(), - routing_for: false, - sender: peer_tx.clone(), - last_message: 0, - }, - ); + peer.send(km); + data.peers.insert(peer_id.name.clone(), peer); tokio::spawn(connect_to_peer( ext.clone(), data.clone(), diff --git a/kinode/src/net/indirect.rs b/kinode/src/net/indirect.rs index ba47ea48..4328f25b 100644 --- a/kinode/src/net/indirect.rs +++ b/kinode/src/net/indirect.rs @@ -1,7 +1,7 @@ use crate::net::types::{IdentityExt, NetData, Peer}; use crate::net::{connect, tcp, utils, ws}; use lib::types::core::{Identity, NodeRouting}; -use tokio::{sync::mpsc, time}; +use tokio::time; pub async fn maintain_routers(ext: IdentityExt, data: NetData) -> anyhow::Result<()> { let NodeRouting::Routers(ref routers) = ext.our.routing else { @@ -29,16 +29,8 @@ pub async fn connect_to_router(router_id: &Identity, ext: &IdentityExt, data: &N &format!("net: attempting to connect to router {}", router_id.name), ) .await; - let (peer_tx, peer_rx) = mpsc::unbounded_channel(); - data.peers.insert( - router_id.name.clone(), - Peer { - identity: router_id.clone(), - routing_for: false, - 
sender: peer_tx.clone(), - last_message: 0, - }, - ); + let (peer, peer_rx) = Peer::new(router_id.clone(), false); + data.peers.insert(router_id.name.clone(), peer); if let Some((_ip, port)) = router_id.tcp_routing() { match tcp::init_direct(ext, data, &router_id, *port, true, peer_rx).await { Ok(()) => { diff --git a/kinode/src/net/mod.rs b/kinode/src/net/mod.rs index 25e30cae..c2d72ff6 100644 --- a/kinode/src/net/mod.rs +++ b/kinode/src/net/mod.rs @@ -3,9 +3,15 @@ use lib::types::core::{ NetworkErrorSender, NodeRouting, PrintSender, }; use types::{ - IdentityExt, NetData, OnchainPKI, Peers, PendingPassthroughs, TCP_PROTOCOL, WS_PROTOCOL, + ActivePassthroughs, IdentityExt, NetData, OnchainPKI, Peers, PendingPassthroughs, TCP_PROTOCOL, + WS_PROTOCOL, +}; +use { + dashmap::{DashMap, DashSet}, + ring::signature::Ed25519KeyPair, + std::sync::Arc, + tokio::task::JoinSet, }; -use {dashmap::DashMap, ring::signature::Ed25519KeyPair, std::sync::Arc, tokio::task::JoinSet}; mod connect; mod indirect; @@ -31,7 +37,11 @@ pub async fn networking( network_error_tx: NetworkErrorSender, print_tx: PrintSender, kernel_message_rx: MessageReceiver, - _reveal_ip: bool, // only used if indirect + // only used if indirect -- TODO use + _reveal_ip: bool, + max_peers: u32, + // only used by routers + max_passthroughs: u32, ) -> anyhow::Result<()> { let ext = IdentityExt { our: Arc::new(our), @@ -45,14 +55,18 @@ pub async fn networking( // start by initializing the structs where we'll store PKI in memory // and store a mapping of peers we have an active route for let pki: OnchainPKI = Arc::new(DashMap::new()); - let peers: Peers = Peers(Arc::new(DashMap::new())); + let peers: Peers = Peers::new(max_peers); // only used by routers let pending_passthroughs: PendingPassthroughs = Arc::new(DashMap::new()); + let active_passthroughs: ActivePassthroughs = Arc::new(DashSet::new()); let net_data = NetData { pki, peers, pending_passthroughs, + active_passthroughs, + max_peers, + max_passthroughs, }; let mut tasks = JoinSet::>::new(); @@ -171,7 +185,7 @@ async fn handle_local_request( NetAction::GetPeers => ( NetResponse::Peers( data.peers - .0 + .peers() .iter() .map(|p| p.identity.clone()) .collect::>(), @@ -191,19 +205,28 @@ async fn handle_local_request( printout.push_str(&format!("our Identity: {:#?}\r\n", ext.our)); printout.push_str(&format!( "we have connections with {} peers ({} max):\r\n", - data.peers.0.len(), - utils::MAX_PEERS, + data.peers.peers().len(), + data.max_peers, )); - for peer in data.peers.0.iter() { + + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs(); + for peer in data.peers.peers().iter() { printout.push_str(&format!( - " {}, routing_for={}\r\n", - peer.identity.name, peer.routing_for, + " {},{} last message {}s ago\r\n", + peer.identity.name, + if peer.routing_for { " (routing)" } else { "" }, + now.saturating_sub(peer.last_message) )); } + printout.push_str(&format!( - "we have {} entries in the PKI\r\n", - data.pki.len() + "we allow {} max passthroughs\r\n", + data.max_passthroughs )); + if !data.pending_passthroughs.is_empty() { printout.push_str(&format!( "we have {} pending passthroughs:\r\n", @@ -214,6 +237,21 @@ async fn handle_local_request( } } + if !data.active_passthroughs.is_empty() { + printout.push_str(&format!( + "we have {} active passthroughs:\r\n", + data.active_passthroughs.len() + )); + for p in data.active_passthroughs.iter() { + printout.push_str(&format!(" {} -> {}\r\n", p.0, p.1)); + } + } + + 
printout.push_str(&format!( + "we have {} entries in the PKI\r\n", + data.pki.len() + )); + (NetResponse::Diagnostics(printout), None) } NetAction::Sign => ( diff --git a/kinode/src/net/tcp/mod.rs b/kinode/src/net/tcp/mod.rs index 37d6e42a..b039a49b 100644 --- a/kinode/src/net/tcp/mod.rs +++ b/kinode/src/net/tcp/mod.rs @@ -175,8 +175,7 @@ async fn recv_connection( &ext.our_ip, from_id, target_id, - &data.peers, - &data.pending_passthroughs, + &data, PendingStream::Tcp(stream), ) .await; @@ -215,16 +214,9 @@ async fn recv_connection( &their_id, )?; - let (peer_tx, peer_rx) = mpsc::unbounded_channel(); - data.peers.insert( - their_id.name.clone(), - Peer { - identity: their_id.clone(), - routing_for: their_handshake.proxy_request, - sender: peer_tx, - last_message: 0, - }, - ); + let (peer, peer_rx) = Peer::new(their_id.clone(), their_handshake.proxy_request); + data.peers.insert(their_id.name.clone(), peer); + tokio::spawn(utils::maintain_connection( their_handshake.name, data.peers, @@ -337,16 +329,8 @@ pub async fn recv_via_router( }; match connect_with_handshake_via_router(&ext, &peer_id, &router_id, stream).await { Ok(connection) => { - let (peer_tx, peer_rx) = mpsc::unbounded_channel(); - data.peers.insert( - peer_id.name.clone(), - Peer { - identity: peer_id.clone(), - routing_for: false, - sender: peer_tx, - last_message: 0, - }, - ); + let (peer, peer_rx) = Peer::new(peer_id.clone(), false); + data.peers.insert(peer_id.name.clone(), peer); // maintain direct connection tokio::spawn(utils::maintain_connection( peer_id.name, diff --git a/kinode/src/net/types.rs b/kinode/src/net/types.rs index 160cc931..37d448a0 100644 --- a/kinode/src/net/types.rs +++ b/kinode/src/net/types.rs @@ -1,14 +1,13 @@ -use crate::net::utils; use lib::types::core::{ Identity, KernelMessage, MessageSender, NetworkErrorSender, NodeId, PrintSender, }; use { - dashmap::DashMap, + dashmap::{DashMap, DashSet}, ring::signature::Ed25519KeyPair, serde::{Deserialize, Serialize}, std::sync::Arc, tokio::net::TcpStream, - tokio::sync::mpsc::UnboundedSender, + tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender}, tokio_tungstenite::{MaybeTlsStream, WebSocketStream}, }; @@ -56,48 +55,69 @@ pub struct RoutingRequest { } #[derive(Clone)] -pub struct Peers(pub Arc>); +pub struct Peers { + max_peers: u32, + peers: Arc>, +} impl Peers { + pub fn new(max_peers: u32) -> Self { + Self { + max_peers, + peers: Arc::new(DashMap::new()), + } + } + + pub fn peers(&self) -> &DashMap { + &self.peers + } + pub fn get(&self, name: &str) -> Option> { - self.0.get(name) + self.peers.get(name) } pub fn get_mut( &self, name: &str, ) -> std::option::Option> { - self.0.get_mut(name) + self.peers.get_mut(name) } pub fn contains_key(&self, name: &str) -> bool { - self.0.contains_key(name) + self.peers.contains_key(name) } /// when a peer is inserted, if the total number of peers exceeds the limit, /// remove the one with the oldest last_message. 
pub fn insert(&self, name: String, peer: Peer) { - self.0.insert(name, peer); - if self.0.len() > utils::MAX_PEERS { - let oldest = self.0.iter().min_by_key(|p| p.last_message).unwrap(); - self.0.remove(oldest.key()); + self.peers.insert(name, peer); + if self.peers.len() > self.max_peers as usize { + let oldest = self.peers.iter().min_by_key(|p| p.last_message).unwrap(); + self.peers.remove(oldest.key()); } } pub fn remove(&self, name: &str) -> Option<(String, Peer)> { - self.0.remove(name) + self.peers.remove(name) } } pub type OnchainPKI = Arc>; /// (from, target) -> from's socket +/// +/// only used by routers pub type PendingPassthroughs = Arc>; pub enum PendingStream { WebSocket(WebSocketStream>), Tcp(TcpStream), } +/// (from, target) +/// +/// only used by routers +pub type ActivePassthroughs = Arc>; + impl PendingStream { pub fn is_ws(&self) -> bool { matches!(self, PendingStream::WebSocket(_)) @@ -118,6 +138,31 @@ pub struct Peer { } impl Peer { + /// Create a new Peer. + /// If `routing_for` is true, we are routing for them. + pub fn new(identity: Identity, routing_for: bool) -> (Self, UnboundedReceiver) { + let (peer_tx, peer_rx) = tokio::sync::mpsc::unbounded_channel(); + ( + Self { + identity, + routing_for, + sender: peer_tx, + last_message: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs(), + }, + peer_rx, + ) + } + + /// Send a message to the peer. + pub fn send(&mut self, km: KernelMessage) { + self.sender.send(km).expect("net: peer sender was dropped"); + self.set_last_message(); + } + + /// Update the last message time to now. pub fn set_last_message(&mut self) { self.last_message = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) @@ -141,5 +186,10 @@ pub struct IdentityExt { pub struct NetData { pub pki: OnchainPKI, pub peers: Peers, + /// only used by routers pub pending_passthroughs: PendingPassthroughs, + /// only used by routers + pub active_passthroughs: ActivePassthroughs, + pub max_peers: u32, + pub max_passthroughs: u32, } diff --git a/kinode/src/net/utils.rs b/kinode/src/net/utils.rs index 52b9e924..da05a5ff 100644 --- a/kinode/src/net/utils.rs +++ b/kinode/src/net/utils.rs @@ -1,10 +1,10 @@ use crate::net::types::{ - HandshakePayload, OnchainPKI, Peers, PendingPassthroughs, PendingStream, RoutingRequest, + ActivePassthroughs, HandshakePayload, NetData, OnchainPKI, PendingStream, RoutingRequest, TCP_PROTOCOL, WS_PROTOCOL, }; use lib::types::core::{ Identity, KernelMessage, KnsUpdate, Message, MessageSender, NetAction, NetworkErrorSender, - NodeRouting, PrintSender, Printout, Request, Response, SendError, SendErrorKind, + NodeId, NodeRouting, PrintSender, Printout, Request, Response, SendError, SendErrorKind, WrappedSendError, }; use { @@ -30,24 +30,33 @@ pub const TIMEOUT: std::time::Duration = std::time::Duration::from_secs(5); /// 30 minute idle timeout for connections pub const IDLE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(1800); -/// maximum number of peers (open connections, but does not include passthroughs we provide!) 
-pub const MAX_PEERS: usize = 100; - pub async fn create_passthrough( our: &Identity, our_ip: &str, from_id: Identity, target_id: Identity, - peers: &Peers, - pending_passthroughs: &PendingPassthroughs, + data: &NetData, socket_1: PendingStream, ) -> anyhow::Result<()> { + // if we already are at the max number of passthroughs, reject + if data.active_passthroughs.len() + data.pending_passthroughs.len() + >= data.max_passthroughs as usize + { + return Err(anyhow::anyhow!("max passthroughs reached")); + } // if the target has already generated a pending passthrough for this source, // immediately match them - if let Some(((_target, _from), pending_stream)) = - pending_passthroughs.remove(&(target_id.name.clone(), from_id.name.clone())) + if let Some(((from, target), pending_stream)) = data + .pending_passthroughs + .remove(&(target_id.name.clone(), from_id.name.clone())) { - tokio::spawn(maintain_passthrough(socket_1, pending_stream)); + tokio::spawn(maintain_passthrough( + from, + target, + socket_1, + pending_stream, + data.active_passthroughs.clone(), + )); return Ok(()); } if socket_1.is_tcp() { @@ -63,7 +72,13 @@ pub async fn create_passthrough( from_id.name )); }; - tokio::spawn(maintain_passthrough(socket_1, PendingStream::Tcp(stream_2))); + tokio::spawn(maintain_passthrough( + from_id.name, + target_id.name, + socket_1, + PendingStream::Tcp(stream_2), + data.active_passthroughs.clone(), + )); return Ok(()); } } else if socket_1.is_ws() { @@ -79,14 +94,17 @@ pub async fn create_passthrough( )); }; tokio::spawn(maintain_passthrough( + from_id.name, + target_id.name, socket_1, PendingStream::WebSocket(socket_2), + data.active_passthroughs.clone(), )); return Ok(()); } } // create passthrough to indirect node that we do routing for - let target_peer = peers.get(&target_id.name).ok_or(anyhow::anyhow!( + let target_peer = data.peers.get(&target_id.name).ok_or(anyhow::anyhow!( "can't route to {}, not a peer, for passthrough requested by {}", target_id.name, from_id.name @@ -119,12 +137,20 @@ pub async fn create_passthrough( // or if the target node connects to us with a matching passthrough. // TODO it is currently possible to have dangling passthroughs in the map // if the target is "connected" to us but nonresponsive. 
- pending_passthroughs.insert((from_id.name, target_id.name), socket_1); + data.pending_passthroughs + .insert((from_id.name, target_id.name), socket_1); Ok(()) } /// cross the streams -- spawn on own task -pub async fn maintain_passthrough(socket_1: PendingStream, socket_2: PendingStream) { +pub async fn maintain_passthrough( + from: NodeId, + target: NodeId, + socket_1: PendingStream, + socket_2: PendingStream, + active_passthroughs: ActivePassthroughs, +) { + active_passthroughs.insert((from.clone(), target.clone())); match (socket_1, socket_2) { (PendingStream::Tcp(socket_1), PendingStream::Tcp(socket_2)) => { // do not use bidirectional because if one side closes, @@ -176,9 +202,9 @@ pub async fn maintain_passthrough(socket_1: PendingStream, socket_2: PendingStre } _ => { // these foolish combinations must never occur - return; } } + active_passthroughs.remove(&(from, target)); } pub fn ingest_log(log: KnsUpdate, pki: &OnchainPKI) { diff --git a/kinode/src/net/ws/mod.rs b/kinode/src/net/ws/mod.rs index 1b5c5379..c800cdcb 100644 --- a/kinode/src/net/ws/mod.rs +++ b/kinode/src/net/ws/mod.rs @@ -187,16 +187,8 @@ pub async fn recv_via_router( }; match connect_with_handshake_via_router(&ext, &peer_id, &router_id, socket).await { Ok(connection) => { - let (peer_tx, peer_rx) = mpsc::unbounded_channel(); - data.peers.insert( - peer_id.name.clone(), - Peer { - identity: peer_id.clone(), - routing_for: false, - sender: peer_tx, - last_message: 0, - }, - ); + let (peer, peer_rx) = Peer::new(peer_id.clone(), false); + data.peers.insert(peer_id.name.clone(), peer); // maintain direct connection tokio::spawn(utils::maintain_connection( peer_id.name, @@ -233,8 +225,7 @@ async fn recv_connection( &ext.our_ip, from_id, target_id, - &data.peers, - &data.pending_passthroughs, + &data, PendingStream::WebSocket(socket), ) .await; @@ -273,16 +264,9 @@ async fn recv_connection( &their_id, )?; - let (peer_tx, peer_rx) = mpsc::unbounded_channel(); - data.peers.insert( - their_id.name.clone(), - Peer { - identity: their_id.clone(), - routing_for: their_handshake.proxy_request, - sender: peer_tx, - last_message: 0, - }, - ); + let (peer, peer_rx) = Peer::new(their_id.clone(), their_handshake.proxy_request); + data.peers.insert(their_id.name.clone(), peer); + tokio::spawn(utils::maintain_connection( their_handshake.name, data.peers, From f81af4246121c21d88404a9ebd6a342ae3ca6cf8 Mon Sep 17 00:00:00 2001 From: dr-frmr Date: Mon, 30 Sep 2024 14:57:13 -0400 Subject: [PATCH 09/37] wire fdmanager cull up to net --- kinode/src/main.rs | 14 +++++++++++--- kinode/src/net/mod.rs | 32 +++++++++++++++++++++++++++----- kinode/src/net/types.rs | 12 ++++++++++++ 3 files changed, 50 insertions(+), 8 deletions(-) diff --git a/kinode/src/main.rs b/kinode/src/main.rs index 9e3dbf0a..b3911338 100644 --- a/kinode/src/main.rs +++ b/kinode/src/main.rs @@ -48,6 +48,10 @@ const VERSION: &str = env!("CARGO_PKG_VERSION"); const WS_MIN_PORT: u16 = 9_000; const TCP_MIN_PORT: u16 = 10_000; const MAX_PORT: u16 = 65_535; + +const DEFAULT_MAX_PEERS: u32 = 32; +const DEFAULT_MAX_PASSTHROUGHS: u32 = 0; + /// default routers as a eth-provider fallback const DEFAULT_ETH_PROVIDERS: &str = include_str!("eth/default_providers_mainnet.json"); #[cfg(not(feature = "simulation-mode"))] @@ -353,8 +357,12 @@ async fn main() { print_sender.clone(), net_message_receiver, *matches.get_one::("reveal-ip").unwrap_or(&true), - *matches.get_one::("max-peers").unwrap_or(&100), - *matches.get_one::("max-passthroughs").unwrap_or(&0), + *matches + 
.get_one::("max-peers") + .unwrap_or(&DEFAULT_MAX_PEERS), + *matches + .get_one::("max-passthroughs") + .unwrap_or(&DEFAULT_MAX_PASSTHROUGHS), )); tasks.spawn(state::state_sender( our_name_arc.clone(), @@ -699,7 +707,7 @@ fn build_command() -> Command { .value_parser(value_parser!(u64)), ) .arg( - arg!(--"max-peers" "Maximum number of peers to hold active connections with (default 100)") + arg!(--"max-peers" "Maximum number of peers to hold active connections with (default 32)") .value_parser(value_parser!(u32)), ) .arg( diff --git a/kinode/src/net/mod.rs b/kinode/src/net/mod.rs index c2d72ff6..d6d733d9 100644 --- a/kinode/src/net/mod.rs +++ b/kinode/src/net/mod.rs @@ -167,7 +167,8 @@ async fn handle_local_request( ) { match rmp_serde::from_slice::(request_body) { Err(_e) => { - // ignore + // only other possible message is from fd_manager -- handle here + handle_fdman(km, request_body, data).await; } Ok(NetAction::ConnectionRequest(_)) => { // we shouldn't get these locally, ignore @@ -222,10 +223,12 @@ async fn handle_local_request( )); } - printout.push_str(&format!( - "we allow {} max passthroughs\r\n", - data.max_passthroughs - )); + if data.max_passthroughs > 0 { + printout.push_str(&format!( + "we allow {} max passthroughs\r\n", + data.max_passthroughs + )); + } if !data.pending_passthroughs.is_empty() { printout.push_str(&format!( @@ -324,6 +327,25 @@ async fn handle_local_request( } } +async fn handle_fdman(km: &KernelMessage, request_body: &[u8], data: &NetData) { + if km.source.process != *lib::core::FD_MANAGER_PROCESS_ID { + return; + } + let Ok(req) = rmp_serde::from_slice::(request_body) else { + return; + }; + match req { + lib::core::FdManagerRequest::Cull { + cull_fraction_denominator, + } => { + // we are requested to cull a fraction of our peers! + // TODO cull passthroughs too? 
+ data.peers.cull(cull_fraction_denominator); + } + _ => return, + } +} + async fn handle_remote_request( ext: &IdentityExt, km: &KernelMessage, diff --git a/kinode/src/net/types.rs b/kinode/src/net/types.rs index 37d448a0..f87fc45d 100644 --- a/kinode/src/net/types.rs +++ b/kinode/src/net/types.rs @@ -100,6 +100,18 @@ impl Peers { pub fn remove(&self, name: &str) -> Option<(String, Peer)> { self.peers.remove(name) } + + /// close the (peer count / fraction) oldest connections + pub fn cull(&self, fraction: u64) { + let num_to_remove = (self.peers.len() as f64 / fraction as f64).ceil() as usize; + let mut to_remove = Vec::with_capacity(num_to_remove); + let mut sorted_peers: Vec<_> = self.peers.iter().collect(); + sorted_peers.sort_by_key(|p| p.last_message); + to_remove.extend(sorted_peers.iter().take(num_to_remove)); + for peer in to_remove { + self.peers.remove(&peer.identity.name); + } + } } pub type OnchainPKI = Arc>; From 3df85701e587d338035644da2a85fb2b7f1c531d Mon Sep 17 00:00:00 2001 From: dr-frmr Date: Mon, 30 Sep 2024 16:44:54 -0400 Subject: [PATCH 10/37] add fd manager to net, add diagnostics requests/responses to fd manager, fix capability risk in terminal userspace --- kinode/packages/terminal/pkg/manifest.json | 21 ++-- kinode/src/fd_manager.rs | 125 +++++++++++++++++---- kinode/src/main.rs | 5 + kinode/src/net/connect.rs | 2 +- kinode/src/net/indirect.rs | 2 +- kinode/src/net/mod.rs | 4 +- kinode/src/net/tcp/mod.rs | 15 +-- kinode/src/net/tcp/utils.rs | 2 +- kinode/src/net/types.rs | 14 ++- kinode/src/net/utils.rs | 31 +++-- kinode/src/net/ws/mod.rs | 7 +- kinode/src/net/ws/utils.rs | 2 +- lib/src/core.rs | 37 +++--- 13 files changed, 187 insertions(+), 80 deletions(-) diff --git a/kinode/packages/terminal/pkg/manifest.json b/kinode/packages/terminal/pkg/manifest.json index 72003cee..918314c4 100644 --- a/kinode/packages/terminal/pkg/manifest.json +++ b/kinode/packages/terminal/pkg/manifest.json @@ -5,12 +5,7 @@ "on_exit": "Restart", "request_networking": true, "request_capabilities": [ - "net:distro:sys", - "filesystem:distro:sys", - "http_server:distro:sys", - "http_client:distro:sys", - "kernel:distro:sys", - "vfs:distro:sys", + "chess:chess:sys", "eth:distro:sys", { "process": "eth:distro:sys", @@ -18,10 +13,16 @@ "root": true } }, - "sqlite:distro:sys", - "kv:distro:sys", - "chess:chess:sys", + "fd_manager:distro:sys", + "filesystem:distro:sys", + "http_server:distro:sys", + "http_client:distro:sys", + "kernel:distro:sys", "kns_indexer:kns_indexer:sys", + "kv:distro:sys", + "net:distro:sys", + "sqlite:distro:sys", + "vfs:distro:sys", { "process": "vfs:distro:sys", "params": { @@ -30,6 +31,6 @@ } ], "grant_capabilities": [], - "public": true + "public": false } ] \ No newline at end of file diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index 55fb7453..e1f83fc2 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -1,6 +1,7 @@ use lib::types::core::{ - Address, FdManagerError, FdManagerRequest, KernelMessage, Message, MessageReceiver, - MessageSender, PrintSender, Printout, ProcessId, Request, FD_MANAGER_PROCESS_ID, + Address, FdManagerError, FdManagerRequest, FdManagerResponse, KernelMessage, Message, + MessageReceiver, MessageSender, PrintSender, Printout, ProcessId, Request, + FD_MANAGER_PROCESS_ID, }; use std::{collections::HashMap, sync::Arc}; @@ -28,16 +29,19 @@ enum Mode { } impl State { - fn new() -> Self { - Self::default() + fn new(static_max_fds: Option) -> Self { + Self::default(static_max_fds) } - fn default() -> Self { + 
fn default(static_max_fds: Option) -> Self { Self { fds: HashMap::new(), - mode: Mode::default(), + mode: Mode::default(static_max_fds), total_fds: 0, - max_fds: DEFAULT_MAX_OPEN_FDS, + max_fds: match static_max_fds { + Some(max) => max, + None => DEFAULT_MAX_OPEN_FDS, + }, cull_fraction_denominator: DEFAULT_CULL_FRACTION_DENOMINATOR, } } @@ -55,10 +59,14 @@ impl State { } impl Mode { - fn default() -> Self { - Self::DynamicMax { - max_fds_as_fraction_of_ulimit_percentage: DEFAULT_FDS_AS_FRACTION_OF_ULIMIT_PERCENTAGE, - update_ulimit_secs: DEFAULT_UPDATE_ULIMIT_SECS, + fn default(static_max_fds: Option) -> Self { + match static_max_fds { + Some(_) => Self::StaticMax, + None => Self::DynamicMax { + max_fds_as_fraction_of_ulimit_percentage: + DEFAULT_FDS_AS_FRACTION_OF_ULIMIT_PERCENTAGE, + update_ulimit_secs: DEFAULT_UPDATE_ULIMIT_SECS, + }, } } } @@ -69,8 +77,9 @@ pub async fn fd_manager( send_to_loop: MessageSender, send_to_terminal: PrintSender, mut recv_from_loop: MessageReceiver, + static_max_fds: Option, ) -> anyhow::Result<()> { - let mut state = State::new(); + let mut state = State::new(static_max_fds); let mut interval = { // in code block to release the reference into state let Mode::DynamicMax { @@ -80,7 +89,7 @@ pub async fn fd_manager( else { return Ok(()); }; - tokio::time::interval(tokio::time::Duration::from_secs(update_ulimit_secs.clone())) + tokio::time::interval(tokio::time::Duration::from_secs(*update_ulimit_secs)) }; let our_node = our_node.as_str(); loop { @@ -90,7 +99,8 @@ pub async fn fd_manager( message, &mut interval, &mut state, - )? { + &send_to_loop, + ).await? { Printout::new(2, to_print).send(&send_to_terminal).await; } } @@ -114,12 +124,18 @@ pub async fn fd_manager( } } -fn handle_message( +async fn handle_message( km: KernelMessage, _interval: &mut tokio::time::Interval, state: &mut State, + send_to_loop: &MessageSender, ) -> anyhow::Result> { - let Message::Request(Request { body, .. }) = km.message else { + let Message::Request(Request { + body, + expects_response, + .. + }) = km.message + else { return Err(FdManagerError::NotARequest.into()); }; let request: FdManagerRequest = @@ -154,14 +170,79 @@ fn handle_message( FdManagerRequest::Cull { .. } => { return Err(FdManagerError::FdManagerWasSentCull.into()); } - FdManagerRequest::UpdateMaxFdsAsFractionOfUlimitPercentage(_new) => { - unimplemented!(); + FdManagerRequest::UpdateMaxFdsAsFractionOfUlimitPercentage(new) => { + match state.mode { + Mode::DynamicMax { + ref mut max_fds_as_fraction_of_ulimit_percentage, + .. + } => *max_fds_as_fraction_of_ulimit_percentage = new, + _ => return Err(FdManagerError::BadRequest.into()), + } + None } - FdManagerRequest::UpdateUpdateUlimitSecs(_new) => { - unimplemented!(); + FdManagerRequest::UpdateUpdateUlimitSecs(new) => { + match state.mode { + Mode::DynamicMax { + ref mut update_ulimit_secs, + .. 
+ } => *update_ulimit_secs = new, + _ => return Err(FdManagerError::BadRequest.into()), + } + None } - FdManagerRequest::UpdateCullFractionDenominator(_new) => { - unimplemented!(); + FdManagerRequest::UpdateCullFractionDenominator(new) => { + state.cull_fraction_denominator = new; + None + } + FdManagerRequest::GetState => { + if expects_response.is_some() { + KernelMessage::builder() + .id(km.id) + .source(km.target) + .target(km.rsvp.unwrap_or(km.source)) + .message(Message::Response(( + lib::core::Response { + body: serde_json::to_vec(&FdManagerResponse::GetState( + state.fds.clone(), + )) + .unwrap(), + inherit: false, + metadata: None, + capabilities: vec![], + }, + None, + ))) + .build() + .unwrap() + .send(send_to_loop) + .await; + } + None + } + FdManagerRequest::GetProcessFdCount(process) => { + if expects_response.is_some() { + KernelMessage::builder() + .id(km.id) + .source(km.target) + .target(km.rsvp.unwrap_or(km.source)) + .message(Message::Response(( + lib::core::Response { + body: serde_json::to_vec(&FdManagerResponse::GetProcessFdCount( + *state.fds.get(&process).unwrap_or(&0), + )) + .unwrap(), + inherit: false, + metadata: None, + capabilities: vec![], + }, + None, + ))) + .build() + .unwrap() + .send(send_to_loop) + .await; + } + None } }; Ok(return_value) diff --git a/kinode/src/main.rs b/kinode/src/main.rs index b3911338..100b2a99 100644 --- a/kinode/src/main.rs +++ b/kinode/src/main.rs @@ -377,6 +377,7 @@ async fn main() { kernel_message_sender.clone(), print_sender.clone(), fd_manager_receiver, + matches.get_one::("soft-ulimit").copied(), )); tasks.spawn(kv::kv( our_name_arc.clone(), @@ -713,6 +714,10 @@ fn build_command() -> Command { .arg( arg!(--"max-passthroughs" "Maximum number of passthroughs serve as a router (default 0)") .value_parser(value_parser!(u32)), + ) + .arg( + arg!(--"soft-ulimit" "Enforce a static maximum number of file descriptors (default fetched from system)") + .value_parser(value_parser!(u64)), ); #[cfg(feature = "simulation-mode")] diff --git a/kinode/src/net/connect.rs b/kinode/src/net/connect.rs index 4d817106..8d038626 100644 --- a/kinode/src/net/connect.rs +++ b/kinode/src/net/connect.rs @@ -17,7 +17,7 @@ pub async fn send_to_peer(ext: &IdentityExt, data: &NetData, km: KernelMessage) let (mut peer, peer_rx) = Peer::new(peer_id.clone(), false); // send message to be routed peer.send(km); - data.peers.insert(peer_id.name.clone(), peer); + data.peers.insert(peer_id.name.clone(), peer).await; tokio::spawn(connect_to_peer( ext.clone(), data.clone(), diff --git a/kinode/src/net/indirect.rs b/kinode/src/net/indirect.rs index 4328f25b..e7c1d141 100644 --- a/kinode/src/net/indirect.rs +++ b/kinode/src/net/indirect.rs @@ -30,7 +30,7 @@ pub async fn connect_to_router(router_id: &Identity, ext: &IdentityExt, data: &N ) .await; let (peer, peer_rx) = Peer::new(router_id.clone(), false); - data.peers.insert(router_id.name.clone(), peer); + data.peers.insert(router_id.name.clone(), peer).await; if let Some((_ip, port)) = router_id.tcp_routing() { match tcp::init_direct(ext, data, &router_id, *port, true, peer_rx).await { Ok(()) => { diff --git a/kinode/src/net/mod.rs b/kinode/src/net/mod.rs index d6d733d9..7951a203 100644 --- a/kinode/src/net/mod.rs +++ b/kinode/src/net/mod.rs @@ -55,7 +55,7 @@ pub async fn networking( // start by initializing the structs where we'll store PKI in memory // and store a mapping of peers we have an active route for let pki: OnchainPKI = Arc::new(DashMap::new()); - let peers: Peers = Peers::new(max_peers); + let peers: 
Peers = Peers::new(max_peers, ext.kernel_message_tx.clone()); // only used by routers let pending_passthroughs: PendingPassthroughs = Arc::new(DashMap::new()); let active_passthroughs: ActivePassthroughs = Arc::new(DashSet::new()); @@ -340,7 +340,7 @@ async fn handle_fdman(km: &KernelMessage, request_body: &[u8], data: &NetData) { } => { // we are requested to cull a fraction of our peers! // TODO cull passthroughs too? - data.peers.cull(cull_fraction_denominator); + data.peers.cull(cull_fraction_denominator).await; } _ => return, } diff --git a/kinode/src/net/tcp/mod.rs b/kinode/src/net/tcp/mod.rs index b039a49b..faeaf9b3 100644 --- a/kinode/src/net/tcp/mod.rs +++ b/kinode/src/net/tcp/mod.rs @@ -170,15 +170,8 @@ async fn recv_connection( if len != 32 { let (from_id, target_id) = validate_routing_request(&ext.our.name, &first_message, &data.pki)?; - return create_passthrough( - &ext.our, - &ext.our_ip, - from_id, - target_id, - &data, - PendingStream::Tcp(stream), - ) - .await; + return create_passthrough(&ext, from_id, target_id, &data, PendingStream::Tcp(stream)) + .await; } let mut buf = [0u8; 65535]; @@ -215,7 +208,7 @@ async fn recv_connection( )?; let (peer, peer_rx) = Peer::new(their_id.clone(), their_handshake.proxy_request); - data.peers.insert(their_id.name.clone(), peer); + data.peers.insert(their_id.name.clone(), peer).await; tokio::spawn(utils::maintain_connection( their_handshake.name, @@ -330,7 +323,7 @@ pub async fn recv_via_router( match connect_with_handshake_via_router(&ext, &peer_id, &router_id, stream).await { Ok(connection) => { let (peer, peer_rx) = Peer::new(peer_id.clone(), false); - data.peers.insert(peer_id.name.clone(), peer); + data.peers.insert(peer_id.name.clone(), peer).await; // maintain direct connection tokio::spawn(utils::maintain_connection( peer_id.name, diff --git a/kinode/src/net/tcp/utils.rs b/kinode/src/net/tcp/utils.rs index 40383f05..e10ad97c 100644 --- a/kinode/src/net/tcp/utils.rs +++ b/kinode/src/net/tcp/utils.rs @@ -93,7 +93,7 @@ pub async fn maintain_connection( } print_debug(&print_tx, &format!("net: connection lost with {peer_name}")).await; - peers.remove(&peer_name); + peers.remove(&peer_name).await; } async fn send_protocol_message( diff --git a/kinode/src/net/types.rs b/kinode/src/net/types.rs index f87fc45d..99f50351 100644 --- a/kinode/src/net/types.rs +++ b/kinode/src/net/types.rs @@ -1,3 +1,4 @@ +use crate::net::utils; use lib::types::core::{ Identity, KernelMessage, MessageSender, NetworkErrorSender, NodeId, PrintSender, }; @@ -57,13 +58,15 @@ pub struct RoutingRequest { #[derive(Clone)] pub struct Peers { max_peers: u32, + send_to_loop: MessageSender, peers: Arc>, } impl Peers { - pub fn new(max_peers: u32) -> Self { + pub fn new(max_peers: u32, send_to_loop: MessageSender) -> Self { Self { max_peers, + send_to_loop, peers: Arc::new(DashMap::new()), } } @@ -89,20 +92,22 @@ impl Peers { /// when a peer is inserted, if the total number of peers exceeds the limit, /// remove the one with the oldest last_message. 
- pub fn insert(&self, name: String, peer: Peer) { + pub async fn insert(&self, name: String, peer: Peer) { self.peers.insert(name, peer); + utils::send_fd_manager_open(1, &self.send_to_loop).await; if self.peers.len() > self.max_peers as usize { let oldest = self.peers.iter().min_by_key(|p| p.last_message).unwrap(); self.peers.remove(oldest.key()); } } - pub fn remove(&self, name: &str) -> Option<(String, Peer)> { + pub async fn remove(&self, name: &str) -> Option<(String, Peer)> { + utils::send_fd_manager_close(1, &self.send_to_loop).await; self.peers.remove(name) } /// close the (peer count / fraction) oldest connections - pub fn cull(&self, fraction: u64) { + pub async fn cull(&self, fraction: u64) { let num_to_remove = (self.peers.len() as f64 / fraction as f64).ceil() as usize; let mut to_remove = Vec::with_capacity(num_to_remove); let mut sorted_peers: Vec<_> = self.peers.iter().collect(); @@ -111,6 +116,7 @@ impl Peers { for peer in to_remove { self.peers.remove(&peer.identity.name); } + utils::send_fd_manager_close(num_to_remove as u64, &self.send_to_loop).await; } } diff --git a/kinode/src/net/utils.rs b/kinode/src/net/utils.rs index da05a5ff..7ea8cba9 100644 --- a/kinode/src/net/utils.rs +++ b/kinode/src/net/utils.rs @@ -1,11 +1,11 @@ use crate::net::types::{ - ActivePassthroughs, HandshakePayload, NetData, OnchainPKI, PendingStream, RoutingRequest, - TCP_PROTOCOL, WS_PROTOCOL, + ActivePassthroughs, HandshakePayload, IdentityExt, NetData, OnchainPKI, PendingStream, + RoutingRequest, TCP_PROTOCOL, WS_PROTOCOL, }; use lib::types::core::{ - Identity, KernelMessage, KnsUpdate, Message, MessageSender, NetAction, NetworkErrorSender, - NodeId, NodeRouting, PrintSender, Printout, Request, Response, SendError, SendErrorKind, - WrappedSendError, + Address, Identity, KernelMessage, KnsUpdate, Message, MessageSender, NetAction, + NetworkErrorSender, NodeId, NodeRouting, PrintSender, Printout, Request, Response, SendError, + SendErrorKind, WrappedSendError, NET_PROCESS_ID, }; use { futures::{SinkExt, StreamExt}, @@ -31,8 +31,7 @@ pub const TIMEOUT: std::time::Duration = std::time::Duration::from_secs(5); pub const IDLE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(1800); pub async fn create_passthrough( - our: &Identity, - our_ip: &str, + ext: &IdentityExt, from_id: Identity, target_id: Identity, data: &NetData, @@ -62,7 +61,7 @@ pub async fn create_passthrough( if socket_1.is_tcp() { if let Some((ip, tcp_port)) = target_id.tcp_routing() { // create passthrough to direct node over tcp - let tcp_url = make_conn_url(our_ip, ip, tcp_port, TCP_PROTOCOL)?; + let tcp_url = make_conn_url(&ext.our_ip, ip, tcp_port, TCP_PROTOCOL)?; let Ok(Ok(stream_2)) = time::timeout(TIMEOUT, tokio::net::TcpStream::connect(tcp_url.to_string())).await else { @@ -84,7 +83,7 @@ pub async fn create_passthrough( } else if socket_1.is_ws() { if let Some((ip, ws_port)) = target_id.ws_routing() { // create passthrough to direct node over websocket - let ws_url = make_conn_url(our_ip, ip, ws_port, WS_PROTOCOL)?; + let ws_url = make_conn_url(&ext.our_ip, ip, ws_port, WS_PROTOCOL)?; let Ok(Ok((socket_2, _response))) = time::timeout(TIMEOUT, connect_async(ws_url)).await else { return Err(anyhow::anyhow!( @@ -121,7 +120,7 @@ pub async fn create_passthrough( target_peer.sender.send( KernelMessage::builder() .id(rand::random()) - .source((our.name.as_str(), "net", "distro", "sys")) + .source((ext.our.name.as_str(), "net", "distro", "sys")) .target((target_id.name.as_str(), "net", "distro", "sys")) 
.message(Message::Request(Request { inherit: false, @@ -382,6 +381,18 @@ pub async fn parse_hello_message( .await; } +/// Send an OpenFds message to the fd_manager. +pub async fn send_fd_manager_open(num_opened: u64, kernel_message_tx: &MessageSender) { + let our: Address = Address::new("our", NET_PROCESS_ID.clone()); + let _ = crate::fd_manager::send_fd_manager_open(&our, num_opened, kernel_message_tx).await; +} + +/// Send a CloseFds message to the fd_manager. +pub async fn send_fd_manager_close(num_closed: u64, kernel_message_tx: &MessageSender) { + let our: Address = Address::new("our", NET_PROCESS_ID.clone()); + let _ = crate::fd_manager::send_fd_manager_close(&our, num_closed, kernel_message_tx).await; +} + /// Create a terminal printout at verbosity level 0. pub async fn print_loud(print_tx: &PrintSender, content: &str) { Printout::new(0, content).send(print_tx).await; diff --git a/kinode/src/net/ws/mod.rs b/kinode/src/net/ws/mod.rs index c800cdcb..0e2b9714 100644 --- a/kinode/src/net/ws/mod.rs +++ b/kinode/src/net/ws/mod.rs @@ -188,7 +188,7 @@ pub async fn recv_via_router( match connect_with_handshake_via_router(&ext, &peer_id, &router_id, socket).await { Ok(connection) => { let (peer, peer_rx) = Peer::new(peer_id.clone(), false); - data.peers.insert(peer_id.name.clone(), peer); + data.peers.insert(peer_id.name.clone(), peer).await; // maintain direct connection tokio::spawn(utils::maintain_connection( peer_id.name, @@ -221,8 +221,7 @@ async fn recv_connection( let (from_id, target_id) = validate_routing_request(&ext.our.name, first_message, &data.pki)?; return create_passthrough( - &ext.our, - &ext.our_ip, + &ext, from_id, target_id, &data, @@ -265,7 +264,7 @@ async fn recv_connection( )?; let (peer, peer_rx) = Peer::new(their_id.clone(), their_handshake.proxy_request); - data.peers.insert(their_id.name.clone(), peer); + data.peers.insert(their_id.name.clone(), peer).await; tokio::spawn(utils::maintain_connection( their_handshake.name, diff --git a/kinode/src/net/ws/utils.rs b/kinode/src/net/ws/utils.rs index ab3acdb8..dbfb9f4b 100644 --- a/kinode/src/net/ws/utils.rs +++ b/kinode/src/net/ws/utils.rs @@ -114,7 +114,7 @@ pub async fn maintain_connection( } print_debug(&print_tx, &format!("net: connection lost with {peer_name}")).await; - peers.remove(&peer_name); + peers.remove(&peer_name).await; } async fn send_protocol_message( diff --git a/lib/src/core.rs b/lib/src/core.rs index d2579cc0..27472fa7 100644 --- a/lib/src/core.rs +++ b/lib/src/core.rs @@ -8,16 +8,17 @@ use thiserror::Error; lazy_static::lazy_static! 
{ pub static ref ETH_PROCESS_ID: ProcessId = ProcessId::new(Some("eth"), "distro", "sys"); + pub static ref FD_MANAGER_PROCESS_ID: ProcessId = ProcessId::new(Some("fd_manager"), "distro", "sys"); pub static ref HTTP_CLIENT_PROCESS_ID: ProcessId = ProcessId::new(Some("http_client"), "distro", "sys"); pub static ref HTTP_SERVER_PROCESS_ID: ProcessId = ProcessId::new(Some("http_server"), "distro", "sys"); pub static ref KERNEL_PROCESS_ID: ProcessId = ProcessId::new(Some("kernel"), "distro", "sys"); + pub static ref KV_PROCESS_ID: ProcessId = ProcessId::new(Some("kv"), "distro", "sys"); + pub static ref NET_PROCESS_ID: ProcessId = ProcessId::new(Some("net"), "distro", "sys"); + pub static ref STATE_PROCESS_ID: ProcessId = ProcessId::new(Some("state"), "distro", "sys"); + pub static ref SQLITE_PROCESS_ID: ProcessId = ProcessId::new(Some("sqlite"), "distro", "sys"); pub static ref TERMINAL_PROCESS_ID: ProcessId = ProcessId::new(Some("terminal"), "terminal", "sys"); pub static ref TIMER_PROCESS_ID: ProcessId = ProcessId::new(Some("timer"), "distro", "sys"); pub static ref VFS_PROCESS_ID: ProcessId = ProcessId::new(Some("vfs"), "distro", "sys"); - pub static ref STATE_PROCESS_ID: ProcessId = ProcessId::new(Some("state"), "distro", "sys"); - pub static ref KV_PROCESS_ID: ProcessId = ProcessId::new(Some("kv"), "distro", "sys"); - pub static ref SQLITE_PROCESS_ID: ProcessId = ProcessId::new(Some("sqlite"), "distro", "sys"); - pub static ref FD_MANAGER_PROCESS_ID: ProcessId = ProcessId::new(Some("fd_manager"), "distro", "sys"); } // @@ -2076,22 +2077,32 @@ impl KnsUpdate { #[derive(Clone, Debug, Serialize, Deserialize)] pub enum FdManagerRequest { /// other process -> fd_manager - OpenFds { - number_opened: u64, - }, - CloseFds { - number_closed: u64, - }, + OpenFds { number_opened: u64 }, + /// other process -> fd_manager + CloseFds { number_closed: u64 }, /// fd_manager -> other process - Cull { - cull_fraction_denominator: u64, - }, + Cull { cull_fraction_denominator: u64 }, /// administrative UpdateMaxFdsAsFractionOfUlimitPercentage(u64), + /// administrative UpdateUpdateUlimitSecs(u64), + /// administrative UpdateCullFractionDenominator(u64), + + /// get a `HashMap` of all `ProcessId`s to their known number of open file descriptors. + GetState, + /// get the `u64` known number of file descriptors used by `ProcessId`. + GetProcessFdCount(ProcessId), +} + +#[derive(Debug, Serialize, Deserialize)] +pub enum FdManagerResponse { + /// response to [`FdManagerRequest::GetState`] + GetState(HashMap), + /// response to [`FdManagerRequest::GetProcessFdCount`] + GetProcessFdCount(u64), } #[derive(Debug, Error)] From 952282d8b12c85e5e3a0427b7aaab32dff1a10ca Mon Sep 17 00:00:00 2001 From: hosted-fornet Date: Tue, 1 Oct 2024 09:50:25 -0700 Subject: [PATCH 11/37] vfs: remove now-unused const --- kinode/src/vfs.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/kinode/src/vfs.rs b/kinode/src/vfs.rs index 0764d960..f9362aff 100644 --- a/kinode/src/vfs.rs +++ b/kinode/src/vfs.rs @@ -19,9 +19,6 @@ use tokio::{ sync::Mutex, }; -// Constants for file cleanup -const MAX_OPEN_FILES: usize = 180; - /// The main VFS service function. /// /// This function sets up the VFS, handles incoming requests, and manages file operations. 
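With the static MAX_OPEN_FILES cap gone from vfs, descriptor accounting is meant to flow through fd_manager instead, via the OpenFds/CloseFds requests above and the send_fd_manager_open / send_fd_manager_close helpers in fd_manager. A minimal sketch of what that reporting could look like from a runtime module; the wrapper functions and variable names here are illustrative only, not part of any patch:

    use lib::types::core::{Address, MessageSender};

    /// Open a file, then tell fd_manager this process now holds one more descriptor.
    /// Assumes the caller has its own `Address` and the kernel `MessageSender` in scope.
    async fn open_file_reporting(
        our: &Address,
        send_to_loop: &MessageSender,
        path: &std::path::Path,
    ) -> anyhow::Result<tokio::fs::File> {
        let file = tokio::fs::File::open(path).await?;
        let _ = crate::fd_manager::send_fd_manager_open(our, 1, send_to_loop).await;
        Ok(file)
    }

    /// Mirror image on close: drop the handle, then report the freed descriptor.
    async fn close_file_reporting(our: &Address, send_to_loop: &MessageSender, file: tokio::fs::File) {
        drop(file);
        let _ = crate::fd_manager::send_fd_manager_close(our, 1, send_to_loop).await;
    }

Later patches in this series replace this open/close counting with per-process limits (RequestFdsLimit / FdsLimit / FdsLimitHit), after which a module only reports when it hits the limit it was assigned.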
From aca798588812b9d063b2f572ca2e4bd1ba9e9a6a Mon Sep 17 00:00:00 2001 From: hosted-fornet Date: Tue, 1 Oct 2024 18:52:16 -0700 Subject: [PATCH 12/37] net: fix crash by correctly removing peer (bug: didn't await) --- kinode/src/net/connect.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kinode/src/net/connect.rs b/kinode/src/net/connect.rs index 8d038626..12931a2a 100644 --- a/kinode/src/net/connect.rs +++ b/kinode/src/net/connect.rs @@ -151,7 +151,7 @@ pub async fn handle_failed_connection( &format!("net: failed to connect to {}", peer_id.name), ) .await; - drop(data.peers.remove(&peer_id.name)); + data.peers.remove(&peer_id.name).await; peer_rx.close(); while let Some(km) = peer_rx.recv().await { utils::error_offline(km, &ext.network_error_tx).await; From a2fa27610cbbb77f16f48e9ad9883411191693d6 Mon Sep 17 00:00:00 2001 From: hosted-fornet Date: Tue, 1 Oct 2024 18:52:53 -0700 Subject: [PATCH 13/37] fd_manager: handle errors to avoid crashing kernel --- kinode/src/fd_manager.rs | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff 
--git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index ac800d89..02e326b0 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -267,11 +267,7 @@ async fn update_max_fds(state: &mut State) -> anyhow::Result<()> { Ok(()) } -async fn send_cull( - our_node: &str, - send_to_loop: &MessageSender, - state: &State, -) { +async fn send_cull(our_node: &str, send_to_loop: &MessageSender, state: &State) { let message = Message::Request(Request { inherit: false, expects_response: None, From 2d2d9464ca9177fc4699d6aca747afa7bf941d76 Mon Sep 17 00:00:00 2001 From: hosted-fornet Date: Wed, 2 Oct 2024 16:30:08 -0700 Subject: [PATCH 15/37] net: fix deadlock when bumping old peer (DashMap multiply refd) --- kinode/src/net/types.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kinode/src/net/types.rs b/kinode/src/net/types.rs index 99f50351..b4758aaa 100644 --- a/kinode/src/net/types.rs +++ b/kinode/src/net/types.rs @@ -94,10 +94,11 @@ impl Peers { /// remove the one with the oldest last_message. pub async fn insert(&self, name: String, peer: Peer) { self.peers.insert(name, peer); - utils::send_fd_manager_open(1, &self.send_to_loop).await; if self.peers.len() > self.max_peers as usize { - let oldest = self.peers.iter().min_by_key(|p| p.last_message).unwrap(); - self.peers.remove(oldest.key()); + let oldest = self.peers.iter().min_by_key(|p| p.last_message).unwrap().key().clone(); + self.peers.remove(&oldest); + } else { + utils::send_fd_manager_open(1, &self.send_to_loop).await; } } From 9471d7a2cd855b791ad57c0296e07f6065a83ba2 Mon Sep 17 00:00:00 2001 From: hosted-fornet Date: Wed, 2 Oct 2024 16:30:28 -0700 Subject: [PATCH 16/37] net: deserialize FdManagerRequest using serde_json --- kinode/src/net/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kinode/src/net/mod.rs b/kinode/src/net/mod.rs index 7951a203..4531926d 100644 --- a/kinode/src/net/mod.rs +++ b/kinode/src/net/mod.rs @@ -331,7 +331,7 @@ async fn handle_fdman(km: &KernelMessage, request_body: &[u8], data: &NetData) { if km.source.process != *lib::core::FD_MANAGER_PROCESS_ID { return; } - let Ok(req) = rmp_serde::from_slice::(request_body) else { + let Ok(req) = serde_json::from_slice::(request_body) else { return; }; match req { From 045b7a1c20c4cc034a0ffdcd1a3a0f958725c088 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 2 Oct 2024 23:39:24 +0000 Subject: [PATCH 17/37] Format Rust code using rustfmt --- kinode/src/net/types.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kinode/src/net/types.rs b/kinode/src/net/types.rs index b4758aaa..565c7ba4 100644 --- a/kinode/src/net/types.rs +++ b/kinode/src/net/types.rs @@ -95,7 +95,13 @@ impl Peers { pub async fn insert(&self, name: String, peer: Peer) { self.peers.insert(name, peer); if self.peers.len() > self.max_peers as usize { - let oldest = self.peers.iter().min_by_key(|p| p.last_message).unwrap().key().clone(); + let oldest = self + .peers + .iter() + .min_by_key(|p| p.last_message) + .unwrap() + .key() + .clone(); self.peers.remove(&oldest); } else { utils::send_fd_manager_open(1, &self.send_to_loop).await; From 569403b62c35e4f386747d8e64f866a4d196a989 Mon Sep 17 00:00:00 2001 From: hosted-fornet Date: Wed, 2 Oct 2024 18:54:11 -0700 Subject: [PATCH 18/37] net: if run out of passthroughs, remove oldest TODO: update the time used to judge to least recently sent? 
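The time used to judge is currently the creation time recorded when a passthrough is set up: active passthroughs map (from, target) to (created_at, kill_sender), and pending ones map (from, target) to (socket, created_at). When a new passthrough request arrives while the router is already at max_passthroughs, the oldest entry across both maps is evicted, and an active passthrough is additionally told to shut its forwarding task down. A condensed sketch of that decision, assuming the DashMaps from net/types.rs and the get_now() helper (seconds since the Unix epoch); keys and values are cloned out first so no map guard is held across remove:

    // active_passthroughs:  (from, target) -> (created_at, kill_sender)
    // pending_passthroughs: (from, target) -> (pending_socket, created_at)
    let oldest_active = data
        .active_passthroughs
        .iter()
        .min_by_key(|entry| entry.value().0)
        .map(|entry| (entry.key().clone(), entry.value().0, entry.value().1.clone()));
    let oldest_pending = data
        .pending_passthroughs
        .iter()
        .min_by_key(|entry| entry.value().1)
        .map(|entry| (entry.key().clone(), entry.value().1));

    match (oldest_active, oldest_pending) {
        // the oldest entry overall is an active passthrough:
        // signal its forwarding task to stop, then free the slot
        (Some((key, active_time, kill_sender)), Some((_, pending_time)))
            if active_time < pending_time =>
        {
            let _ = kill_sender.send(()).await;
            data.active_passthroughs.remove(&key);
        }
        (Some((key, _, kill_sender)), None) => {
            let _ = kill_sender.send(()).await;
            data.active_passthroughs.remove(&key);
        }
        // otherwise the oldest entry is still pending: dropping it closes its socket
        (_, Some((key, _))) => {
            data.pending_passthroughs.remove(&key);
        }
        (None, None) => {}
    }

Moving to a least-recently-used judgment, as the TODO suggests, would mean refreshing these timestamps whenever traffic crosses a passthrough rather than only at creation.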
--- kinode/src/net/mod.rs | 6 ++-- kinode/src/net/types.rs | 8 +++-- kinode/src/net/utils.rs | 68 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 69 insertions(+), 13 deletions(-) diff --git a/kinode/src/net/mod.rs b/kinode/src/net/mod.rs index 4531926d..2b2c1e15 100644 --- a/kinode/src/net/mod.rs +++ b/kinode/src/net/mod.rs @@ -7,7 +7,7 @@ use types::{ WS_PROTOCOL, }; use { - dashmap::{DashMap, DashSet}, + dashmap::DashMap, ring::signature::Ed25519KeyPair, std::sync::Arc, tokio::task::JoinSet, @@ -58,7 +58,7 @@ pub async fn networking( let peers: Peers = Peers::new(max_peers, ext.kernel_message_tx.clone()); // only used by routers let pending_passthroughs: PendingPassthroughs = Arc::new(DashMap::new()); - let active_passthroughs: ActivePassthroughs = Arc::new(DashSet::new()); + let active_passthroughs: ActivePassthroughs = Arc::new(DashMap::new()); let net_data = NetData { pki, @@ -246,7 +246,7 @@ async fn handle_local_request( data.active_passthroughs.len() )); for p in data.active_passthroughs.iter() { - printout.push_str(&format!(" {} -> {}\r\n", p.0, p.1)); + printout.push_str(&format!(" {} -> {}\r\n", p.key().0, p.key().1)); } } diff --git a/kinode/src/net/types.rs b/kinode/src/net/types.rs index 565c7ba4..1212ad30 100644 --- a/kinode/src/net/types.rs +++ b/kinode/src/net/types.rs @@ -3,7 +3,7 @@ use lib::types::core::{ Identity, KernelMessage, MessageSender, NetworkErrorSender, NodeId, PrintSender, }; use { - dashmap::{DashMap, DashSet}, + dashmap::DashMap, ring::signature::Ed25519KeyPair, serde::{Deserialize, Serialize}, std::sync::Arc, @@ -132,7 +132,7 @@ pub type OnchainPKI = Arc>; /// (from, target) -> from's socket /// /// only used by routers -pub type PendingPassthroughs = Arc>; +pub type PendingPassthroughs = Arc>; pub enum PendingStream { WebSocket(WebSocketStream>), Tcp(TcpStream), @@ -141,7 +141,7 @@ pub enum PendingStream { /// (from, target) /// /// only used by routers -pub type ActivePassthroughs = Arc>; +pub type ActivePassthroughs = Arc>; impl PendingStream { pub fn is_ws(&self) -> bool { @@ -152,6 +152,8 @@ impl PendingStream { } } +type KillSender = tokio::sync::mpsc::Sender<()>; + pub struct Peer { pub identity: Identity, /// If true, we are routing for them and have a RoutingClientConnection diff --git a/kinode/src/net/utils.rs b/kinode/src/net/utils.rs index 7ea8cba9..4907fa03 100644 --- a/kinode/src/net/utils.rs +++ b/kinode/src/net/utils.rs @@ -38,17 +38,58 @@ pub async fn create_passthrough( socket_1: PendingStream, ) -> anyhow::Result<()> { // if we already are at the max number of passthroughs, reject + if data.max_passthroughs == 0 { + return Err(anyhow::anyhow!( + "passthrough denied: this node has disallowed passthroughs. 
Start node with `--max-passthroughs ` to allow passthroughs" + )); + } + // remove pending before checking bound because otherwise we stop + // ourselves from matching pending if this connection will be + // the max_passthroughs passthrough + let maybe_pending = data + .pending_passthroughs + .remove(&(target_id.name.clone(), from_id.name.clone())); if data.active_passthroughs.len() + data.pending_passthroughs.len() >= data.max_passthroughs as usize { - return Err(anyhow::anyhow!("max passthroughs reached")); + let oldest_active = data + .active_passthroughs + .iter() + .min_by_key(|p| p.0); + let (oldest_active_key, oldest_active_time, oldest_active_kill_sender) = match oldest_active { + None => (None, get_now(), None), + Some(oldest_active) => { + let (oldest_active_key, oldest_active_val) = oldest_active.pair(); + let oldest_active_key = oldest_active_key.clone(); + let (oldest_active_time, oldest_active_kill_sender) = oldest_active_val.clone(); + (Some(oldest_active_key), oldest_active_time, Some(oldest_active_kill_sender)) + } + }; + let oldest_pending = data + .pending_passthroughs + .iter() + .min_by_key(|p| p.1); + let (oldest_pending_key, oldest_pending_time) = match oldest_pending { + None => (None, get_now()), + Some(oldest_pending) => { + let (oldest_pending_key, oldest_pending_val) = oldest_pending.pair(); + let oldest_pending_key = oldest_pending_key.clone(); + let (_, oldest_pending_time) = oldest_pending_val; + (Some(oldest_pending_key), oldest_pending_time.clone()) + } + }; + if oldest_active_time < oldest_pending_time { + // active key is oldest + oldest_active_kill_sender.unwrap().send(()).await.unwrap(); + data.active_passthroughs.remove(&oldest_active_key.unwrap()); + } else { + // pending key is oldest + data.pending_passthroughs.remove(&oldest_pending_key.unwrap()); + } } // if the target has already generated a pending passthrough for this source, // immediately match them - if let Some(((from, target), pending_stream)) = data - .pending_passthroughs - .remove(&(target_id.name.clone(), from_id.name.clone())) - { + if let Some(((from, target), (pending_stream, _))) = maybe_pending { tokio::spawn(maintain_passthrough( from, target, @@ -136,8 +177,9 @@ pub async fn create_passthrough( // or if the target node connects to us with a matching passthrough. // TODO it is currently possible to have dangling passthroughs in the map // if the target is "connected" to us but nonresponsive. + let now = get_now(); data.pending_passthroughs - .insert((from_id.name, target_id.name), socket_1); + .insert((from_id.name, target_id.name), (socket_1, now)); Ok(()) } @@ -149,7 +191,9 @@ pub async fn maintain_passthrough( socket_2: PendingStream, active_passthroughs: ActivePassthroughs, ) { - active_passthroughs.insert((from.clone(), target.clone())); + let now = get_now(); + let (kill_sender, mut kill_receiver) = tokio::sync::mpsc::channel(1); + active_passthroughs.insert((from.clone(), target.clone()), (now, kill_sender)); match (socket_1, socket_2) { (PendingStream::Tcp(socket_1), PendingStream::Tcp(socket_2)) => { // do not use bidirectional because if one side closes, @@ -160,6 +204,7 @@ pub async fn maintain_passthrough( tokio::select! 
{ _ = copy(&mut r1, &mut w2) => {}, _ = copy(&mut r2, &mut w1) => {}, + _ = kill_receiver.recv() => {}, } } (PendingStream::WebSocket(mut socket_1), PendingStream::WebSocket(mut socket_2)) => { @@ -194,6 +239,7 @@ pub async fn maintain_passthrough( break } } + _ = kill_receiver.recv() => break, } } let _ = socket_1.close(None).await; @@ -402,3 +448,11 @@ pub async fn print_loud(print_tx: &PrintSender, content: &str) { pub async fn print_debug(print_tx: &PrintSender, content: &str) { Printout::new(2, content).send(print_tx).await; } + +pub fn get_now() -> u64 { + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs(); + now +} From e3a19f41f1ef36381a871d1764eddec078a6a441 Mon Sep 17 00:00:00 2001 From: hosted-fornet Date: Wed, 2 Oct 2024 18:56:08 -0700 Subject: [PATCH 19/37] fd_manager: dont crash --- kinode/src/fd_manager.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index 02e326b0..be86e0e0 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -164,7 +164,7 @@ async fn handle_message( } FdManagerRequest::CloseFds { mut number_closed } => { assert!(state.total_fds >= number_closed); - let return_value = Some(format!( + let mut return_value = Some(format!( "{} closed {} of {}", km.source.process, number_closed, state.total_fds, )); @@ -173,8 +173,16 @@ async fn handle_message( .fds .entry(km.source.process) .and_modify(|e| { - assert!(e >= &mut number_closed); - *e -= number_closed + if e < &mut number_closed { + return_value.as_mut().unwrap().push_str(&format!( + "\n!!process claims to have closed more fds ({}) than it had open: {}!!", + number_closed, + e, + )); + *e = 0; + } else { + *e -= number_closed; + } }) .or_insert(number_closed); return_value From 059efbb52a41aa74719d50b074688f8e55b78260 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 3 Oct 2024 01:56:31 +0000 Subject: [PATCH 20/37] Format Rust code using rustfmt --- kinode/src/net/mod.rs | 7 +------ kinode/src/net/utils.rs | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/kinode/src/net/mod.rs b/kinode/src/net/mod.rs index 2b2c1e15..8a1aab10 100644 --- a/kinode/src/net/mod.rs +++ b/kinode/src/net/mod.rs @@ -6,12 +6,7 @@ use types::{ ActivePassthroughs, IdentityExt, NetData, OnchainPKI, Peers, PendingPassthroughs, TCP_PROTOCOL, WS_PROTOCOL, }; -use { - dashmap::DashMap, - ring::signature::Ed25519KeyPair, - std::sync::Arc, - tokio::task::JoinSet, -}; +use {dashmap::DashMap, ring::signature::Ed25519KeyPair, std::sync::Arc, tokio::task::JoinSet}; mod connect; mod indirect; diff --git a/kinode/src/net/utils.rs b/kinode/src/net/utils.rs index 4907fa03..592cf125 100644 --- a/kinode/src/net/utils.rs +++ b/kinode/src/net/utils.rs @@ -52,23 +52,22 @@ pub async fn create_passthrough( if data.active_passthroughs.len() + data.pending_passthroughs.len() >= data.max_passthroughs as usize { - let oldest_active = data - .active_passthroughs - .iter() - .min_by_key(|p| p.0); - let (oldest_active_key, oldest_active_time, oldest_active_kill_sender) = match oldest_active { + let oldest_active = data.active_passthroughs.iter().min_by_key(|p| p.0); + let (oldest_active_key, oldest_active_time, oldest_active_kill_sender) = match oldest_active + { None => (None, get_now(), None), Some(oldest_active) => { let (oldest_active_key, oldest_active_val) = oldest_active.pair(); let 
oldest_active_key = oldest_active_key.clone(); let (oldest_active_time, oldest_active_kill_sender) = oldest_active_val.clone(); - (Some(oldest_active_key), oldest_active_time, Some(oldest_active_kill_sender)) + ( + Some(oldest_active_key), + oldest_active_time, + Some(oldest_active_kill_sender), + ) } }; - let oldest_pending = data - .pending_passthroughs - .iter() - .min_by_key(|p| p.1); + let oldest_pending = data.pending_passthroughs.iter().min_by_key(|p| p.1); let (oldest_pending_key, oldest_pending_time) = match oldest_pending { None => (None, get_now()), Some(oldest_pending) => { @@ -84,7 +83,8 @@ pub async fn create_passthrough( data.active_passthroughs.remove(&oldest_active_key.unwrap()); } else { // pending key is oldest - data.pending_passthroughs.remove(&oldest_pending_key.unwrap()); + data.pending_passthroughs + .remove(&oldest_pending_key.unwrap()); } } // if the target has already generated a pending passthrough for this source, From 60f0bb8eac0d3ad1cb4753427838bc1aa7f1a469 Mon Sep 17 00:00:00 2001 From: hosted-fornet Date: Thu, 3 Oct 2024 10:08:36 -0700 Subject: [PATCH 21/37] fd_manager: dont crash --- kinode/src/fd_manager.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index be86e0e0..694cf108 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -163,12 +163,20 @@ async fn handle_message( None } FdManagerRequest::CloseFds { mut number_closed } => { - assert!(state.total_fds >= number_closed); let mut return_value = Some(format!( "{} closed {} of {}", km.source.process, number_closed, state.total_fds, )); - state.total_fds -= number_closed; + if state.total_fds < number_closed { + return_value.as_mut().unwrap().push_str(&format!( + "\n!!process claims to have closed more fds ({}) than we have open for all processes ({})!!", + number_closed, + state.total_fds, + )); + state.total_fds = 0; + } else { + state.total_fds -= number_closed; + } state .fds .entry(km.source.process) From d892bf10e9894cbfb8a924e0860a2e84d1b2d96b Mon Sep 17 00:00:00 2001 From: hosted-fornet Date: Thu, 3 Oct 2024 13:55:17 -0700 Subject: [PATCH 22/37] fd_manager: fix CloseFds bug: if no entry, error --- kinode/src/fd_manager.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index 694cf108..aa6ef6bd 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -163,6 +163,13 @@ async fn handle_message( None } FdManagerRequest::CloseFds { mut number_closed } => { + if !state.fds.contains_key(&km.source.process) { + return Err(anyhow::anyhow!( + "{} attempted to CloseFds {} but does not have any open!", + km.source.process, + number_closed, + )); + } let mut return_value = Some(format!( "{} closed {} of {}", km.source.process, number_closed, state.total_fds, @@ -191,8 +198,7 @@ async fn handle_message( } else { *e -= number_closed; } - }) - .or_insert(number_closed); + }); return_value } FdManagerRequest::Cull { .. 
} => { From 7288735bc9ed24126066c751c05327f64a368b59 Mon Sep 17 00:00:00 2001 From: hosted-fornet Date: Thu, 3 Oct 2024 14:07:04 -0700 Subject: [PATCH 23/37] fd_manager: provide a bit more detail on CloseFd --- kinode/src/fd_manager.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index aa6ef6bd..2a4056ec 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -171,12 +171,12 @@ async fn handle_message( )); } let mut return_value = Some(format!( - "{} closed {} of {}", + "{} closed {} of {} total;", km.source.process, number_closed, state.total_fds, )); if state.total_fds < number_closed { return_value.as_mut().unwrap().push_str(&format!( - "\n!!process claims to have closed more fds ({}) than we have open for all processes ({})!!", + " !!process claims to have closed more fds ({}) than we have open for all processes ({})!!", number_closed, state.total_fds, )); @@ -190,7 +190,7 @@ async fn handle_message( .and_modify(|e| { if e < &mut number_closed { return_value.as_mut().unwrap().push_str(&format!( - "\n!!process claims to have closed more fds ({}) than it had open: {}!!", + " !!process claims to have closed more fds ({}) than it had open: {}!!", number_closed, e, )); @@ -198,6 +198,7 @@ async fn handle_message( } else { *e -= number_closed; } + return_value.as_mut().unwrap().push_str(&format!(" {e} left")); }); return_value } From 0c34b7f5b238007c7124f06e4d5f0535dffe9223 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 3 Oct 2024 21:07:36 +0000 Subject: [PATCH 24/37] Format Rust code using rustfmt --- kinode/src/fd_manager.rs | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index 2a4056ec..863df137 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -184,22 +184,21 @@ async fn handle_message( } else { state.total_fds -= number_closed; } - state - .fds - .entry(km.source.process) - .and_modify(|e| { - if e < &mut number_closed { - return_value.as_mut().unwrap().push_str(&format!( - " !!process claims to have closed more fds ({}) than it had open: {}!!", - number_closed, - e, - )); - *e = 0; - } else { - *e -= number_closed; - } - return_value.as_mut().unwrap().push_str(&format!(" {e} left")); - }); + state.fds.entry(km.source.process).and_modify(|e| { + if e < &mut number_closed { + return_value.as_mut().unwrap().push_str(&format!( + " !!process claims to have closed more fds ({}) than it had open: {}!!", + number_closed, e, + )); + *e = 0; + } else { + *e -= number_closed; + } + return_value + .as_mut() + .unwrap() + .push_str(&format!(" {e} left")); + }); return_value } FdManagerRequest::Cull { .. 
} => { From b661e7b59342fef62f1926398fcf1d20b0d85680 Mon Sep 17 00:00:00 2001 From: hosted-fornet Date: Thu, 3 Oct 2024 14:32:30 -0700 Subject: [PATCH 25/37] fd_manager: improve print --- kinode/src/fd_manager.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index 863df137..ad8590f0 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -197,7 +197,7 @@ async fn handle_message( return_value .as_mut() .unwrap() - .push_str(&format!(" {e} left")); + .push_str(&format!(" ({e} left to process after close)")); }); return_value } From 1da3132392e4db48950cbfbb671b84cc3884f710 Mon Sep 17 00:00:00 2001 From: dr-frmr Date: Fri, 4 Oct 2024 16:37:39 -0400 Subject: [PATCH 26/37] MVP of limit-based fd_manager --- .../app_store/app_store/src/http_api.rs | 5 +- .../packages/app_store/app_store/src/lib.rs | 4 +- kinode/src/fd_manager.rs | 170 +++++++----------- kinode/src/main.rs | 8 +- kinode/src/net/mod.rs | 49 +++-- kinode/src/net/types.rs | 27 +-- kinode/src/net/utils.rs | 18 +- kinode/src/vfs.rs | 83 ++++----- lib/src/core.rs | 31 ++-- 9 files changed, 180 insertions(+), 215 deletions(-) diff --git a/kinode/packages/app_store/app_store/src/http_api.rs b/kinode/packages/app_store/app_store/src/http_api.rs index edc088d4..3788d737 100644 --- a/kinode/packages/app_store/app_store/src/http_api.rs +++ b/kinode/packages/app_store/app_store/src/http_api.rs @@ -474,7 +474,10 @@ fn serve_paths( &our.node().to_string(), ) { Ok(_) => { - println!("successfully installed package: {:?}", process_package_id); + println!( + "successfully installed {}:{}", + process_package_id.package_name, process_package_id.publisher_node + ); Ok((StatusCode::CREATED, None, vec![])) } Err(e) => Ok(( diff --git a/kinode/packages/app_store/app_store/src/lib.rs b/kinode/packages/app_store/app_store/src/lib.rs index bcfeb7a1..f3b5731b 100644 --- a/kinode/packages/app_store/app_store/src/lib.rs +++ b/kinode/packages/app_store/app_store/src/lib.rs @@ -261,8 +261,8 @@ fn handle_local_request( match utils::install(&package_id, metadata, &version_hash, state, &our.node) { Ok(()) => { println!( - "successfully installed package: {:?}", - &package_id.to_process_lib() + "successfully installed {}:{}", + package_id.package_name, package_id.publisher_node ); LocalResponse::InstallResponse(InstallResponse::Success) } diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index ad8590f0..4b597038 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -1,5 +1,5 @@ use lib::types::core::{ - Address, FdManagerError, FdManagerRequest, FdManagerResponse, KernelMessage, Message, + Address, FdManagerError, FdManagerRequest, FdManagerResponse, FdsLimit, KernelMessage, Message, MessageReceiver, MessageSender, PrintSender, Printout, ProcessId, Request, FD_MANAGER_PROCESS_ID, }; @@ -8,14 +8,12 @@ use std::{collections::HashMap, sync::Arc}; const DEFAULT_MAX_OPEN_FDS: u64 = 180; const DEFAULT_FDS_AS_FRACTION_OF_ULIMIT_PERCENTAGE: u64 = 60; const DEFAULT_UPDATE_ULIMIT_SECS: u64 = 3600; -const DEFAULT_CULL_FRACTION_DENOMINATOR: u64 = 2; +const _DEFAULT_CULL_FRACTION_DENOMINATOR: u64 = 2; struct State { - fds: HashMap, + fds_limits: HashMap, mode: Mode, - total_fds: u64, max_fds: u64, - cull_fraction_denominator: u64, } enum Mode { @@ -35,14 +33,12 @@ impl State { fn default(static_max_fds: Option) -> Self { Self { - fds: HashMap::new(), + fds_limits: HashMap::new(), mode: Mode::default(static_max_fds), - total_fds: 0, max_fds: match static_max_fds 
{ Some(max) => max, None => DEFAULT_MAX_OPEN_FDS, }, - cull_fraction_denominator: DEFAULT_CULL_FRACTION_DENOMINATOR, } } @@ -91,11 +87,11 @@ pub async fn fd_manager( }; tokio::time::interval(tokio::time::Duration::from_secs(*update_ulimit_secs)) }; - let our_node = our_node.as_str(); loop { tokio::select! { Some(message) = recv_from_loop.recv() => { match handle_message( + &our_node, message, &mut interval, &mut state, @@ -120,23 +116,11 @@ pub async fn fd_manager( } } } - - if state.total_fds >= state.max_fds { - Printout::new( - 2, - format!( - "Have {} open >= {} max fds; sending Cull Request...", - state.total_fds, state.max_fds, - ), - ) - .send(&send_to_terminal) - .await; - send_cull(our_node, &send_to_loop, &state).await; - } } } async fn handle_message( + our_node: &str, km: KernelMessage, _interval: &mut tokio::time::Interval, state: &mut State, @@ -153,56 +137,37 @@ async fn handle_message( let request: FdManagerRequest = serde_json::from_slice(&body).map_err(|_e| FdManagerError::BadRequest)?; let return_value = match request { - FdManagerRequest::OpenFds { number_opened } => { - state.total_fds += number_opened; - state - .fds - .entry(km.source.process) - .and_modify(|e| *e += number_opened) - .or_insert(number_opened); + FdManagerRequest::RequestFdsLimit => { + // divide max_fds by number of processes requesting fds limits, + // then send each process its new limit + // TODO can weight different processes differently + let per_process_limit = state.max_fds / (state.fds_limits.len() + 1) as u64; + state.fds_limits.insert( + km.source.process, + FdsLimit { + limit: per_process_limit, + hit_count: 0, + }, + ); + state.fds_limits.iter_mut().for_each(|(_process, limit)| { + limit.limit = per_process_limit; + limit.hit_count = 0; + }); + send_all_fds_limits(our_node, send_to_loop, state).await; + None } - FdManagerRequest::CloseFds { mut number_closed } => { - if !state.fds.contains_key(&km.source.process) { - return Err(anyhow::anyhow!( - "{} attempted to CloseFds {} but does not have any open!", - km.source.process, - number_closed, - )); - } - let mut return_value = Some(format!( - "{} closed {} of {} total;", - km.source.process, number_closed, state.total_fds, - )); - if state.total_fds < number_closed { - return_value.as_mut().unwrap().push_str(&format!( - " !!process claims to have closed more fds ({}) than we have open for all processes ({})!!", - number_closed, - state.total_fds, - )); - state.total_fds = 0; - } else { - state.total_fds -= number_closed; - } - state.fds.entry(km.source.process).and_modify(|e| { - if e < &mut number_closed { - return_value.as_mut().unwrap().push_str(&format!( - " !!process claims to have closed more fds ({}) than it had open: {}!!", - number_closed, e, - )); - *e = 0; - } else { - *e -= number_closed; - } - return_value - .as_mut() - .unwrap() - .push_str(&format!(" ({e} left to process after close)")); + FdManagerRequest::FdsLimitHit => { + // sender process hit its fd limit + // TODO react to this + state.fds_limits.get_mut(&km.source.process).map(|limit| { + limit.hit_count += 1; }); - return_value + Some(format!("{} hit its fd limit", km.source.process)) } - FdManagerRequest::Cull { .. 
} => { - return Err(FdManagerError::FdManagerWasSentCull.into()); + FdManagerRequest::FdsLimit(_) => { + // should only send this, never receive it + return Err(FdManagerError::FdManagerWasSentLimit.into()); } FdManagerRequest::UpdateMaxFdsAsFractionOfUlimitPercentage(new) => { match state.mode { @@ -224,8 +189,8 @@ async fn handle_message( } None } - FdManagerRequest::UpdateCullFractionDenominator(new) => { - state.cull_fraction_denominator = new; + FdManagerRequest::UpdateCullFractionDenominator(_new) => { + // state.cull_fraction_denominator = new; None } FdManagerRequest::GetState => { @@ -237,7 +202,7 @@ async fn handle_message( .message(Message::Response(( lib::core::Response { body: serde_json::to_vec(&FdManagerResponse::GetState( - state.fds.clone(), + state.fds_limits.clone(), )) .unwrap(), inherit: false, @@ -253,7 +218,7 @@ async fn handle_message( } None } - FdManagerRequest::GetProcessFdCount(process) => { + FdManagerRequest::GetProcessFdLimit(process) => { if expects_response.is_some() { KernelMessage::builder() .id(km.id) @@ -261,8 +226,12 @@ async fn handle_message( .target(km.rsvp.unwrap_or(km.source)) .message(Message::Response(( lib::core::Response { - body: serde_json::to_vec(&FdManagerResponse::GetProcessFdCount( - *state.fds.get(&process).unwrap_or(&0), + body: serde_json::to_vec(&FdManagerResponse::GetProcessFdLimit( + state + .fds_limits + .get(&process) + .map(|limit| limit.limit) + .unwrap_or(0), )) .unwrap(), inherit: false, @@ -289,23 +258,19 @@ async fn update_max_fds(state: &mut State) -> anyhow::Result<()> { Ok(()) } -async fn send_cull(our_node: &str, send_to_loop: &MessageSender, state: &State) { - let message = Message::Request(Request { - inherit: false, - expects_response: None, - body: serde_json::to_vec(&FdManagerRequest::Cull { - cull_fraction_denominator: state.cull_fraction_denominator.clone(), - }) - .unwrap(), - metadata: None, - capabilities: vec![], - }); - for process_id in state.fds.keys() { +async fn send_all_fds_limits(our_node: &str, send_to_loop: &MessageSender, state: &State) { + for (process_id, limit) in &state.fds_limits { KernelMessage::builder() .id(rand::random()) .source((our_node, FD_MANAGER_PROCESS_ID.clone())) .target((our_node, process_id.clone())) - .message(message.clone()) + .message(Message::Request(Request { + inherit: false, + expects_response: None, + body: serde_json::to_vec(&FdManagerRequest::FdsLimit(limit.limit)).unwrap(), + metadata: None, + capabilities: vec![], + })) .build() .unwrap() .send(send_to_loop) @@ -327,43 +292,29 @@ fn get_max_fd_limit() -> anyhow::Result { } } -pub async fn send_fd_manager_open( - our: &Address, - number_opened: u64, - send_to_loop: &MessageSender, -) -> anyhow::Result<()> { +pub async fn send_fd_manager_request_fds_limit(our: &Address, send_to_loop: &MessageSender) { let message = Message::Request(Request { inherit: false, expects_response: None, - body: serde_json::to_vec(&FdManagerRequest::OpenFds { number_opened }).unwrap(), + body: serde_json::to_vec(&FdManagerRequest::RequestFdsLimit).unwrap(), metadata: None, capabilities: vec![], }); - send_to_fd_manager(our, message, send_to_loop).await?; - Ok(()) + send_to_fd_manager(our, message, send_to_loop).await } -pub async fn send_fd_manager_close( - our: &Address, - number_closed: u64, - send_to_loop: &MessageSender, -) -> anyhow::Result<()> { +pub async fn send_fd_manager_hit_fds_limit(our: &Address, send_to_loop: &MessageSender) { let message = Message::Request(Request { inherit: false, expects_response: None, - body: 
serde_json::to_vec(&FdManagerRequest::CloseFds { number_closed }).unwrap(), + body: serde_json::to_vec(&FdManagerRequest::FdsLimitHit).unwrap(), metadata: None, capabilities: vec![], }); - send_to_fd_manager(our, message, send_to_loop).await?; - Ok(()) + send_to_fd_manager(our, message, send_to_loop).await } -async fn send_to_fd_manager( - our: &Address, - message: Message, - send_to_loop: &MessageSender, -) -> anyhow::Result<()> { +async fn send_to_fd_manager(our: &Address, message: Message, send_to_loop: &MessageSender) { KernelMessage::builder() .id(rand::random()) .source(our.clone()) @@ -372,6 +323,5 @@ async fn send_to_fd_manager( .build() .unwrap() .send(send_to_loop) - .await; - Ok(()) + .await } diff --git a/kinode/src/main.rs b/kinode/src/main.rs index 100b2a99..1b97f9fe 100644 --- a/kinode/src/main.rs +++ b/kinode/src/main.rs @@ -49,8 +49,8 @@ const WS_MIN_PORT: u16 = 9_000; const TCP_MIN_PORT: u16 = 10_000; const MAX_PORT: u16 = 65_535; -const DEFAULT_MAX_PEERS: u32 = 32; -const DEFAULT_MAX_PASSTHROUGHS: u32 = 0; +const DEFAULT_MAX_PEERS: u64 = 32; +const DEFAULT_MAX_PASSTHROUGHS: u64 = 0; /// default routers as a eth-provider fallback const DEFAULT_ETH_PROVIDERS: &str = include_str!("eth/default_providers_mainnet.json"); @@ -358,10 +358,10 @@ async fn main() { net_message_receiver, *matches.get_one::("reveal-ip").unwrap_or(&true), *matches - .get_one::("max-peers") + .get_one::("max-peers") .unwrap_or(&DEFAULT_MAX_PEERS), *matches - .get_one::("max-passthroughs") + .get_one::("max-passthroughs") .unwrap_or(&DEFAULT_MAX_PASSTHROUGHS), )); tasks.spawn(state::state_sender( diff --git a/kinode/src/net/mod.rs b/kinode/src/net/mod.rs index 8a1aab10..52bc2216 100644 --- a/kinode/src/net/mod.rs +++ b/kinode/src/net/mod.rs @@ -1,6 +1,9 @@ -use lib::types::core::{ - Identity, KernelMessage, MessageReceiver, MessageSender, NetAction, NetResponse, - NetworkErrorSender, NodeRouting, PrintSender, +use lib::{ + core::Address, + types::core::{ + Identity, KernelMessage, MessageReceiver, MessageSender, NetAction, NetResponse, + NetworkErrorSender, NodeRouting, PrintSender, NET_PROCESS_ID, + }, }; use types::{ ActivePassthroughs, IdentityExt, NetData, OnchainPKI, Peers, PendingPassthroughs, TCP_PROTOCOL, @@ -34,10 +37,16 @@ pub async fn networking( kernel_message_rx: MessageReceiver, // only used if indirect -- TODO use _reveal_ip: bool, - max_peers: u32, + max_peers: u64, // only used by routers - max_passthroughs: u32, + max_passthroughs: u64, ) -> anyhow::Result<()> { + crate::fd_manager::send_fd_manager_request_fds_limit( + &Address::new(&our.name, NET_PROCESS_ID.clone()), + &kernel_message_tx, + ) + .await; + let ext = IdentityExt { our: Arc::new(our), our_ip: Arc::new(our_ip), @@ -62,6 +71,7 @@ pub async fn networking( active_passthroughs, max_peers, max_passthroughs, + fds_limit: 100, // TODO blocking request to fd_manager to get max num of fds at boot }; let mut tasks = JoinSet::>::new(); @@ -116,12 +126,12 @@ pub async fn networking( async fn local_recv( ext: IdentityExt, mut kernel_message_rx: MessageReceiver, - data: NetData, + mut data: NetData, ) -> anyhow::Result<()> { while let Some(km) = kernel_message_rx.recv().await { if km.target.node == ext.our.name { // handle messages sent to us - handle_message(&ext, km, &data).await; + handle_message(&ext, km, &mut data).await; } else { connect::send_to_peer(&ext, &data, km).await; } @@ -129,7 +139,7 @@ async fn local_recv( Err(anyhow::anyhow!("net: kernel message channel was dropped")) } -async fn handle_message(ext: &IdentityExt, km: 
KernelMessage, data: &NetData) { +async fn handle_message(ext: &IdentityExt, km: KernelMessage, data: &mut NetData) { match &km.message { lib::core::Message::Request(request) => handle_request(ext, &km, &request.body, data).await, lib::core::Message::Response((response, _context)) => { @@ -142,7 +152,7 @@ async fn handle_request( ext: &IdentityExt, km: &KernelMessage, request_body: &[u8], - data: &NetData, + data: &mut NetData, ) { if km.source.node == ext.our.name { handle_local_request(ext, km, request_body, data).await; @@ -158,7 +168,7 @@ async fn handle_local_request( ext: &IdentityExt, km: &KernelMessage, request_body: &[u8], - data: &NetData, + data: &mut NetData, ) { match rmp_serde::from_slice::(request_body) { Err(_e) => { @@ -322,7 +332,7 @@ async fn handle_local_request( } } -async fn handle_fdman(km: &KernelMessage, request_body: &[u8], data: &NetData) { +async fn handle_fdman(km: &KernelMessage, request_body: &[u8], data: &mut NetData) { if km.source.process != *lib::core::FD_MANAGER_PROCESS_ID { return; } @@ -330,12 +340,19 @@ async fn handle_fdman(km: &KernelMessage, request_body: &[u8], data: &NetData) { return; }; match req { - lib::core::FdManagerRequest::Cull { - cull_fraction_denominator, - } => { - // we are requested to cull a fraction of our peers! + lib::core::FdManagerRequest::FdsLimit(fds_limit) => { + data.fds_limit = fds_limit; + if data.max_peers > fds_limit { + data.max_peers = fds_limit; + } + // TODO combine with max_peers check + if data.max_passthroughs > fds_limit { + data.max_passthroughs = fds_limit; + } // TODO cull passthroughs too? - data.peers.cull(cull_fraction_denominator).await; + if data.peers.peers().len() >= data.fds_limit as usize { + data.peers.cull(2).await; + } } _ => return, } diff --git a/kinode/src/net/types.rs b/kinode/src/net/types.rs index 1212ad30..67cb1285 100644 --- a/kinode/src/net/types.rs +++ b/kinode/src/net/types.rs @@ -1,6 +1,6 @@ -use crate::net::utils; use lib::types::core::{ - Identity, KernelMessage, MessageSender, NetworkErrorSender, NodeId, PrintSender, + Address, Identity, KernelMessage, MessageSender, NetworkErrorSender, NodeId, PrintSender, + NET_PROCESS_ID, }; use { dashmap::DashMap, @@ -57,13 +57,13 @@ pub struct RoutingRequest { #[derive(Clone)] pub struct Peers { - max_peers: u32, + max_peers: u64, send_to_loop: MessageSender, peers: Arc>, } impl Peers { - pub fn new(max_peers: u32, send_to_loop: MessageSender) -> Self { + pub fn new(max_peers: u64, send_to_loop: MessageSender) -> Self { Self { max_peers, send_to_loop, @@ -103,13 +103,15 @@ impl Peers { .key() .clone(); self.peers.remove(&oldest); - } else { - utils::send_fd_manager_open(1, &self.send_to_loop).await; + crate::fd_manager::send_fd_manager_hit_fds_limit( + &Address::new("our", NET_PROCESS_ID.clone()), + &self.send_to_loop, + ) + .await; } } pub async fn remove(&self, name: &str) -> Option<(String, Peer)> { - utils::send_fd_manager_close(1, &self.send_to_loop).await; self.peers.remove(name) } @@ -123,7 +125,11 @@ impl Peers { for peer in to_remove { self.peers.remove(&peer.identity.name); } - utils::send_fd_manager_close(num_to_remove as u64, &self.send_to_loop).await; + crate::fd_manager::send_fd_manager_hit_fds_limit( + &Address::new("our", NET_PROCESS_ID.clone()), + &self.send_to_loop, + ) + .await; } } @@ -217,6 +223,7 @@ pub struct NetData { pub pending_passthroughs: PendingPassthroughs, /// only used by routers pub active_passthroughs: ActivePassthroughs, - pub max_peers: u32, - pub max_passthroughs: u32, + pub max_peers: u64, + pub 
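
The networking module's reaction to a new limit boils down to clamping its peer budget and dropping the longest-idle connections. Here is a simplified, standalone sketch of that reaction with toy types (not the real Peers/DashMap machinery); note the cull at this point in the series is still fraction-based and is reworked into an exact count later in the patch set.

    use std::collections::HashMap;

    struct PeerStub {
        last_message: u64, // unix seconds of last activity
    }

    fn apply_fds_limit(peers: &mut HashMap<String, PeerStub>, max_peers: &mut u64, fds_limit: u64) {
        // never keep a peer budget larger than the fd allocation
        if *max_peers > fds_limit {
            *max_peers = fds_limit;
        }
        if peers.len() as u64 > fds_limit {
            let excess = peers.len() as u64 - fds_limit;
            // sort by last_message so the longest-idle peers go first
            let mut by_age: Vec<(String, u64)> = peers
                .iter()
                .map(|(name, p)| (name.clone(), p.last_message))
                .collect();
            by_age.sort_by_key(|(_, last)| *last);
            for (name, _) in by_age.into_iter().take(excess as usize) {
                peers.remove(&name);
            }
        }
    }

    fn main() {
        let mut peers: HashMap<String, PeerStub> = (0..5)
            .map(|i| (format!("peer-{i}"), PeerStub { last_message: i }))
            .collect();
        let mut max_peers = 32;
        apply_fds_limit(&mut peers, &mut max_peers, 3);
        assert_eq!(max_peers, 3);
        assert_eq!(peers.len(), 3);
        assert!(!peers.contains_key("peer-0")); // the two oldest were culled
        assert!(!peers.contains_key("peer-1"));
    }
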
max_passthroughs: u64, + pub fds_limit: u64, } diff --git a/kinode/src/net/utils.rs b/kinode/src/net/utils.rs index 592cf125..8c9d108e 100644 --- a/kinode/src/net/utils.rs +++ b/kinode/src/net/utils.rs @@ -3,9 +3,9 @@ use crate::net::types::{ RoutingRequest, TCP_PROTOCOL, WS_PROTOCOL, }; use lib::types::core::{ - Address, Identity, KernelMessage, KnsUpdate, Message, MessageSender, NetAction, - NetworkErrorSender, NodeId, NodeRouting, PrintSender, Printout, Request, Response, SendError, - SendErrorKind, WrappedSendError, NET_PROCESS_ID, + Identity, KernelMessage, KnsUpdate, Message, MessageSender, NetAction, NetworkErrorSender, + NodeId, NodeRouting, PrintSender, Printout, Request, Response, SendError, SendErrorKind, + WrappedSendError, }; use { futures::{SinkExt, StreamExt}, @@ -427,18 +427,6 @@ pub async fn parse_hello_message( .await; } -/// Send an OpenFds message to the fd_manager. -pub async fn send_fd_manager_open(num_opened: u64, kernel_message_tx: &MessageSender) { - let our: Address = Address::new("our", NET_PROCESS_ID.clone()); - let _ = crate::fd_manager::send_fd_manager_open(&our, num_opened, kernel_message_tx).await; -} - -/// Send a CloseFds message to the fd_manager. -pub async fn send_fd_manager_close(num_closed: u64, kernel_message_tx: &MessageSender) { - let our: Address = Address::new("our", NET_PROCESS_ID.clone()); - let _ = crate::fd_manager::send_fd_manager_close(&our, num_closed, kernel_message_tx).await; -} - /// Create a terminal printout at verbosity level 0. pub async fn print_loud(print_tx: &PrintSender, content: &str) { Printout::new(0, content).send(print_tx).await; diff --git a/kinode/src/vfs.rs b/kinode/src/vfs.rs index f9362aff..87f65d30 100644 --- a/kinode/src/vfs.rs +++ b/kinode/src/vfs.rs @@ -51,12 +51,14 @@ pub async fn vfs( let files = Files::new( Address::new(our_node.as_str(), VFS_PROCESS_ID.clone()), - send_to_loop.clone(), + send_to_loop, ); let process_queues: HashMap>>> = HashMap::default(); + crate::fd_manager::send_fd_manager_request_fds_limit(&files.our, &files.send_to_loop).await; + while let Some(km) = recv_from_loop.recv().await { if *our_node != km.source.node { Printout::new( @@ -72,10 +74,10 @@ pub async fn vfs( } if km.source.process == *FD_MANAGER_PROCESS_ID { - let files = files.clone(); + let mut files = files.clone(); let send_to_terminal = send_to_terminal.clone(); tokio::spawn(async move { - if let Err(e) = handle_fd_request(km, files).await { + if let Err(e) = handle_fd_request(km, &mut files).await { Printout::new( 1, format!("vfs: got request from fd_manager that errored: {e:?}"), @@ -99,9 +101,8 @@ pub async fn vfs( // Clone Arcs for the new task let our_node = our_node.clone(); - let send_to_loop = send_to_loop.clone(); let send_to_caps_oracle = send_to_caps_oracle.clone(); - let files = files.clone(); + let mut files = files.clone(); let vfs_path = vfs_path.clone(); tokio::spawn(async move { @@ -110,15 +111,8 @@ pub async fn vfs( let (km_id, km_rsvp) = (km.id.clone(), km.rsvp.clone().unwrap_or(km.source.clone())); - if let Err(e) = handle_request( - &our_node, - km, - files, - &send_to_loop, - &send_to_caps_oracle, - &vfs_path, - ) - .await + if let Err(e) = + handle_request(&our_node, km, &mut files, &send_to_caps_oracle, &vfs_path).await { KernelMessage::builder() .id(km_id) @@ -135,7 +129,7 @@ pub async fn vfs( ))) .build() .unwrap() - .send(&send_to_loop) + .send(&files.send_to_loop) .await; } } @@ -153,8 +147,9 @@ struct Files { cursor_positions: Arc>, /// access order of files access_order: Arc>>, - our: Address, - 
send_to_loop: MessageSender, + pub our: Address, + pub send_to_loop: MessageSender, + pub fds_limit: u64, } struct FileEntry { @@ -170,6 +165,7 @@ impl Files { access_order: Arc::new(Mutex::new(UniqueQueue::new())), our, send_to_loop, + fds_limit: 100, // TODO blocking request to fd_manager to get max num of fds at boot } } @@ -200,22 +196,19 @@ impl Files { }, ); self.update_access_order(&path).await; - crate::fd_manager::send_fd_manager_open(&self.our, 1, &self.send_to_loop) - .await - .map_err(|e| VfsError::Other { - error: e.to_string(), - })?; + + // if open files >= fds_limit, close the (limit/2) least recently used files + if self.open_files.len() as u64 >= self.fds_limit { + crate::fd_manager::send_fd_manager_hit_fds_limit(&self.our, &self.send_to_loop).await; + self.close_least_recently_used_files(self.fds_limit / 2) + .await?; + } + Ok(file) } async fn remove_file(&self, path: &Path) -> Result<(), VfsError> { - if self.open_files.remove(path).is_some() { - crate::fd_manager::send_fd_manager_close(&self.our, 1, &self.send_to_loop) - .await - .map_err(|e| VfsError::Other { - error: e.to_string(), - })?; - } + self.open_files.remove(path); Ok(()) } @@ -249,11 +242,6 @@ impl Files { break; // no more files to close } } - crate::fd_manager::send_fd_manager_close(&self.our, closed, &self.send_to_loop) - .await - .map_err(|e| VfsError::Other { - error: e.to_string(), - })?; Ok(()) } @@ -290,8 +278,7 @@ impl Files { async fn handle_request( our_node: &str, km: KernelMessage, - files: Files, - send_to_loop: &MessageSender, + files: &mut Files, send_to_caps_oracle: &CapMessageSender, vfs_path: &PathBuf, ) -> Result<(), VfsError> { @@ -347,7 +334,7 @@ async fn handle_request( ))) .build() .unwrap() - .send(send_to_loop) + .send(&files.send_to_loop) .await; return Ok(()); } else { @@ -661,7 +648,7 @@ async fn handle_request( })) .build() .unwrap() - .send(send_to_loop) + .send(&files.send_to_loop) .await; } @@ -1030,7 +1017,7 @@ fn join_paths_safely(base: &PathBuf, extension: &str) -> PathBuf { base.join(extension_path) } -async fn handle_fd_request(km: KernelMessage, files: Files) -> anyhow::Result<()> { +async fn handle_fd_request(km: KernelMessage, files: &mut Files) -> anyhow::Result<()> { let Message::Request(Request { body, .. 
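
The open-file cap above relies on access-order bookkeeping: every open touches a queue, and hitting the limit closes the least recently used half. A minimal standalone sketch of the pattern, using a VecDeque plus HashSet as a stand-in for the real UniqueQueue and DashMap:

    use std::collections::{HashSet, VecDeque};

    struct OpenFiles {
        open: HashSet<String>,
        access_order: VecDeque<String>, // front = least recently used
        fds_limit: u64,
    }

    impl OpenFiles {
        fn touch(&mut self, path: &str) {
            // move path to the back of the queue (most recently used)
            self.access_order.retain(|p| p != path);
            self.access_order.push_back(path.to_string());
        }

        fn open(&mut self, path: &str) {
            self.open.insert(path.to_string());
            self.touch(path);
            // on hitting the limit, close the least recently used half
            if self.open.len() as u64 >= self.fds_limit {
                self.close_least_recently_used(self.fds_limit / 2);
            }
        }

        fn close_least_recently_used(&mut self, n: u64) {
            for _ in 0..n {
                let Some(path) = self.access_order.pop_front() else { break };
                self.open.remove(&path);
            }
        }
    }

    fn main() {
        let mut files = OpenFiles {
            open: HashSet::new(),
            access_order: VecDeque::new(),
            fds_limit: 4,
        };
        for p in ["a", "b", "c", "d"] {
            files.open(p);
        }
        // opening "d" hit the limit of 4, so the two oldest ("a", "b") were closed
        assert_eq!(files.open.len(), 2);
        assert!(files.open.contains("c") && files.open.contains("d"));
    }
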
}) = km.message else { return Err(anyhow::anyhow!("not a request")); }; @@ -1038,13 +1025,17 @@ async fn handle_fd_request(km: KernelMessage, files: Files) -> anyhow::Result<() let request: FdManagerRequest = serde_json::from_slice(&body)?; match request { - FdManagerRequest::Cull { - cull_fraction_denominator, - } => { - let fraction_to_close = files.open_files.len() as u64 / cull_fraction_denominator; - files - .close_least_recently_used_files(fraction_to_close) - .await?; + FdManagerRequest::FdsLimit(fds_limit) => { + files.fds_limit = fds_limit; + if files.open_files.len() as u64 >= fds_limit { + crate::fd_manager::send_fd_manager_hit_fds_limit(&files.our, &files.send_to_loop) + .await; + files + .close_least_recently_used_files( + (files.open_files.len() as u64 - fds_limit) / 2, + ) + .await?; + } } _ => { return Err(anyhow::anyhow!("non-Cull FdManagerRequest")); diff --git a/lib/src/core.rs b/lib/src/core.rs index 27472fa7..f3fb16ce 100644 --- a/lib/src/core.rs +++ b/lib/src/core.rs @@ -2077,12 +2077,15 @@ impl KnsUpdate { #[derive(Clone, Debug, Serialize, Deserialize)] pub enum FdManagerRequest { /// other process -> fd_manager - OpenFds { number_opened: u64 }, + /// must send this to fd_manager to get an initial fds_limit + RequestFdsLimit, /// other process -> fd_manager - CloseFds { number_closed: u64 }, + /// send this to notify fd_manager that limit was hit, + /// which may or may not be reacted to + FdsLimitHit, /// fd_manager -> other process - Cull { cull_fraction_denominator: u64 }, + FdsLimit(u64), /// administrative UpdateMaxFdsAsFractionOfUlimitPercentage(u64), @@ -2091,18 +2094,24 @@ pub enum FdManagerRequest { /// administrative UpdateCullFractionDenominator(u64), - /// get a `HashMap` of all `ProcessId`s to their known number of open file descriptors. + /// get a `HashMap` of all `ProcessId`s to their number of allocated file descriptors. GetState, - /// get the `u64` known number of file descriptors used by `ProcessId`. - GetProcessFdCount(ProcessId), + /// get the `u64` number of file descriptors allocated to `ProcessId`. 
+ GetProcessFdLimit(ProcessId), } #[derive(Debug, Serialize, Deserialize)] pub enum FdManagerResponse { /// response to [`FdManagerRequest::GetState`] - GetState(HashMap), - /// response to [`FdManagerRequest::GetProcessFdCount`] - GetProcessFdCount(u64), + GetState(HashMap), + /// response to [`FdManagerRequest::GetProcessFdLimit`] + GetProcessFdLimit(u64), +} + +#[derive(Copy, Clone, Debug, Serialize, Deserialize)] +pub struct FdsLimit { + pub limit: u64, + pub hit_count: u64, } #[derive(Debug, Error)] @@ -2111,6 +2120,6 @@ pub enum FdManagerError { NotARequest, #[error("fd_manager: received a non-FdManangerRequest")] BadRequest, - #[error("fd_manager: received a FdManagerRequest::Cull, but I am the one who culls")] - FdManagerWasSentCull, + #[error("fd_manager: received a FdManagerRequest::FdsLimit, but I am the one who sets limits")] + FdManagerWasSentLimit, } From 3efd85c6ac689f9c33526a2e41f787b235feccdb Mon Sep 17 00:00:00 2001 From: dr-frmr Date: Fri, 4 Oct 2024 17:28:27 -0400 Subject: [PATCH 27/37] fix: update users when fd limit changes from system --- kinode/src/fd_manager.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index 4b597038..85fe449f 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -109,10 +109,16 @@ pub async fn fd_manager( } } _ = interval.tick() => { - if let Err(e) = update_max_fds(&mut state).await { - Printout::new(1, &format!("update_max_fds error: {e:?}")) + let old_max_fds = state.max_fds; + match update_max_fds(&mut state).await { + Ok(new) => { + if new != old_max_fds { + send_all_fds_limits(&our_node, &send_to_loop, &state).await; + } + } + Err(e) => Printout::new(1, &format!("update_max_fds error: {e:?}")) .send(&send_to_terminal) - .await; + .await, } } } @@ -251,11 +257,11 @@ async fn handle_message( Ok(return_value) } -async fn update_max_fds(state: &mut State) -> anyhow::Result<()> { +async fn update_max_fds(state: &mut State) -> anyhow::Result { let ulimit_max_fds = get_max_fd_limit() .map_err(|_| anyhow::anyhow!("Couldn't update max fd limit: ulimit failed"))?; state.update_max_fds_from_ulimit(ulimit_max_fds); - Ok(()) + Ok(ulimit_max_fds) } async fn send_all_fds_limits(our_node: &str, send_to_loop: &MessageSender, state: &State) { From 4ab54032083c7899b59326e822ceb673871980cf Mon Sep 17 00:00:00 2001 From: dr-frmr Date: Fri, 4 Oct 2024 17:31:22 -0400 Subject: [PATCH 28/37] fix: update limits on system ulimit change --- kinode/src/fd_manager.rs | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index 85fe449f..d8d0b3af 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -52,6 +52,16 @@ impl State { }; self.max_fds = ulimit_max_fds * max_fds_as_fraction_of_ulimit_percentage / 100; } + + fn update_all_fds_limits(&mut self) { + let len = self.fds_limits.len() as u64; + let per_process_limit = self.max_fds / len; + for limit in self.fds_limits.values_mut() { + limit.limit = per_process_limit; + // reset hit count when updating limits + limit.hit_count = 0; + } + } } impl Mode { @@ -113,6 +123,7 @@ pub async fn fd_manager( match update_max_fds(&mut state).await { Ok(new) => { if new != old_max_fds { + state.update_all_fds_limits(); send_all_fds_limits(&our_node, &send_to_loop, &state).await; } } @@ -147,20 +158,15 @@ async fn handle_message( // divide max_fds by number of processes requesting fds limits, // then send each process its 
new limit // TODO can weight different processes differently - let per_process_limit = state.max_fds / (state.fds_limits.len() + 1) as u64; state.fds_limits.insert( km.source.process, FdsLimit { - limit: per_process_limit, + limit: 0, hit_count: 0, }, ); - state.fds_limits.iter_mut().for_each(|(_process, limit)| { - limit.limit = per_process_limit; - limit.hit_count = 0; - }); + state.update_all_fds_limits(); send_all_fds_limits(our_node, send_to_loop, state).await; - None } FdManagerRequest::FdsLimitHit => { From 8f28f5486c1620ef093eb501a45e444ddca67eb4 Mon Sep 17 00:00:00 2001 From: dr-frmr Date: Fri, 4 Oct 2024 17:33:22 -0400 Subject: [PATCH 29/37] fix --- kinode/src/fd_manager.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index d8d0b3af..29b40edd 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -55,7 +55,7 @@ impl State { fn update_all_fds_limits(&mut self) { let len = self.fds_limits.len() as u64; - let per_process_limit = self.max_fds / len; + let per_process_limit = self.max_fds / std::cmp::max(len, 1); for limit in self.fds_limits.values_mut() { limit.limit = per_process_limit; // reset hit count when updating limits From 7128350746f2070ebddcbdd21782c56b1bd7375a Mon Sep 17 00:00:00 2001 From: dr-frmr Date: Fri, 4 Oct 2024 18:35:23 -0400 Subject: [PATCH 30/37] fds limit fixes --- kinode/src/net/mod.rs | 7 ++++--- kinode/src/net/types.rs | 9 ++++----- kinode/src/vfs.rs | 6 ++---- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/kinode/src/net/mod.rs b/kinode/src/net/mod.rs index 52bc2216..11226dc2 100644 --- a/kinode/src/net/mod.rs +++ b/kinode/src/net/mod.rs @@ -71,7 +71,7 @@ pub async fn networking( active_passthroughs, max_peers, max_passthroughs, - fds_limit: 100, // TODO blocking request to fd_manager to get max num of fds at boot + fds_limit: 10, // small hardcoded limit that gets replaced by fd_manager soon after boot }; let mut tasks = JoinSet::>::new(); @@ -349,9 +349,10 @@ async fn handle_fdman(km: &KernelMessage, request_body: &[u8], data: &mut NetDat if data.max_passthroughs > fds_limit { data.max_passthroughs = fds_limit; } - // TODO cull passthroughs too? 
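
The allocation rule at this point in the series is a plain even split of the fd budget across whichever processes have registered, recomputed whenever the set of processes or the ulimit-derived maximum changes; the max(len, 1) guard covers the empty-map case. A standalone sketch with example numbers:

    use std::collections::HashMap;

    fn split_evenly(max_fds: u64, processes: &[&str]) -> HashMap<String, u64> {
        // guard against dividing by zero before any process has registered
        let n = std::cmp::max(processes.len() as u64, 1);
        let per_process = max_fds / n;
        processes.iter().map(|p| (p.to_string(), per_process)).collect()
    }

    fn main() {
        // e.g. a budget of 891 fds split across three runtime modules -> 297 each
        let limits = split_evenly(891, &["vfs:distro:sys", "kv:distro:sys", "sqlite:distro:sys"]);
        assert!(limits.values().all(|&limit| limit == 297));
        // nothing registered yet: the guard avoids a divide-by-zero
        assert!(split_evenly(891, &[]).is_empty());
    }
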
+ // TODO cull passthroughs too if data.peers.peers().len() >= data.fds_limit as usize { - data.peers.cull(2).await; + let diff = data.peers.peers().len() - data.fds_limit as usize; + data.peers.cull(diff).await; } } _ => return, diff --git a/kinode/src/net/types.rs b/kinode/src/net/types.rs index 67cb1285..f5119283 100644 --- a/kinode/src/net/types.rs +++ b/kinode/src/net/types.rs @@ -115,13 +115,12 @@ impl Peers { self.peers.remove(name) } - /// close the (peer count / fraction) oldest connections - pub async fn cull(&self, fraction: u64) { - let num_to_remove = (self.peers.len() as f64 / fraction as f64).ceil() as usize; - let mut to_remove = Vec::with_capacity(num_to_remove); + /// close the n oldest connections + pub async fn cull(&self, n: usize) { + let mut to_remove = Vec::with_capacity(n); let mut sorted_peers: Vec<_> = self.peers.iter().collect(); sorted_peers.sort_by_key(|p| p.last_message); - to_remove.extend(sorted_peers.iter().take(num_to_remove)); + to_remove.extend(sorted_peers.iter().take(n)); for peer in to_remove { self.peers.remove(&peer.identity.name); } diff --git a/kinode/src/vfs.rs b/kinode/src/vfs.rs index 87f65d30..2c8f43a2 100644 --- a/kinode/src/vfs.rs +++ b/kinode/src/vfs.rs @@ -165,7 +165,7 @@ impl Files { access_order: Arc::new(Mutex::new(UniqueQueue::new())), our, send_to_loop, - fds_limit: 100, // TODO blocking request to fd_manager to get max num of fds at boot + fds_limit: 10, // small hardcoded limit that gets replaced by fd_manager soon after boot } } @@ -1031,9 +1031,7 @@ async fn handle_fd_request(km: KernelMessage, files: &mut Files) -> anyhow::Resu crate::fd_manager::send_fd_manager_hit_fds_limit(&files.our, &files.send_to_loop) .await; files - .close_least_recently_used_files( - (files.open_files.len() as u64 - fds_limit) / 2, - ) + .close_least_recently_used_files(files.open_files.len() as u64 - fds_limit) .await?; } } From 3e4d9d85f58ba911e37e4c629fc234a54c9c1bc0 Mon Sep 17 00:00:00 2001 From: dr-frmr Date: Sun, 6 Oct 2024 23:30:16 -0400 Subject: [PATCH 31/37] fix: terminal: don't crash when win_cols < prompt_len --- kinode/src/terminal/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kinode/src/terminal/mod.rs b/kinode/src/terminal/mod.rs index 382888be..b89df83b 100644 --- a/kinode/src/terminal/mod.rs +++ b/kinode/src/terminal/mod.rs @@ -392,7 +392,9 @@ async fn handle_event( cursor::MoveTo(0, height), terminal::Clear(ClearType::CurrentLine) )?; - *win_cols = width - 1; + // since we subtract prompt_len from win_cols, win_cols must always + // be >= prompt_len + *win_cols = std::cmp::max(width - 1, current_line.prompt_len as u16); *win_rows = height; if current_line.cursor_col + current_line.prompt_len as u16 > *win_cols { current_line.cursor_col = *win_cols - current_line.prompt_len as u16; From 6174d3e9a7599798891fd5917e089b99aa5711f1 Mon Sep 17 00:00:00 2001 From: dr-frmr Date: Sun, 6 Oct 2024 23:48:49 -0400 Subject: [PATCH 32/37] bring up default max % to 90 and add fixed padding for kernel --- kinode/src/fd_manager.rs | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index 29b40edd..856849fe 100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -3,19 +3,23 @@ use lib::types::core::{ MessageReceiver, MessageSender, PrintSender, Printout, ProcessId, Request, FD_MANAGER_PROCESS_ID, }; +use serde::{Deserialize, Serialize}; use std::{collections::HashMap, sync::Arc}; const DEFAULT_MAX_OPEN_FDS: u64 = 180; -const 
DEFAULT_FDS_AS_FRACTION_OF_ULIMIT_PERCENTAGE: u64 = 60; +const DEFAULT_FDS_AS_FRACTION_OF_ULIMIT_PERCENTAGE: u64 = 90; +const SYS_RESERVED_FDS: u64 = 30; const DEFAULT_UPDATE_ULIMIT_SECS: u64 = 3600; const _DEFAULT_CULL_FRACTION_DENOMINATOR: u64 = 2; +#[derive(Debug, Serialize, Deserialize)] struct State { fds_limits: HashMap, mode: Mode, max_fds: u64, } +#[derive(Debug, Serialize, Deserialize)] enum Mode { /// don't update the max_fds except by user input StaticMax, @@ -50,7 +54,15 @@ impl State { else { return; }; - self.max_fds = ulimit_max_fds * max_fds_as_fraction_of_ulimit_percentage / 100; + let min_ulimit = SYS_RESERVED_FDS + 10; + if ulimit_max_fds <= min_ulimit { + panic!( + "fatal: ulimit from system ({ulimit_max_fds}) is too small to operate Kinode. Please run Kinode with a larger ulimit (at least {min_ulimit}).", + ); + } + + self.max_fds = + ulimit_max_fds * max_fds_as_fraction_of_ulimit_percentage / 100 - SYS_RESERVED_FDS; } fn update_all_fds_limits(&mut self) { @@ -228,7 +240,7 @@ async fn handle_message( .send(send_to_loop) .await; } - None + Some(format!("fd_manager: {:?}", state)) } FdManagerRequest::GetProcessFdLimit(process) => { if expects_response.is_some() { From 54b078cae751d9207fa5e571ea46f6c9d475f70d Mon Sep 17 00:00:00 2001 From: dr-frmr Date: Mon, 7 Oct 2024 20:17:03 -0400 Subject: [PATCH 33/37] kv <> fd_manager --- kinode/src/kv.rs | 254 ++++++++++++++++++++++++++++--------------- kinode/src/sqlite.rs | 67 +++++++++++- kinode/src/vfs.rs | 23 ++-- 3 files changed, 241 insertions(+), 103 deletions(-) diff --git a/kinode/src/kv.rs b/kinode/src/kv.rs index 7c02ffd2..ff793ae5 100644 --- a/kinode/src/kv.rs +++ b/kinode/src/kv.rs @@ -1,8 +1,10 @@ +use crate::vfs::UniqueQueue; use dashmap::DashMap; use lib::types::core::{ - Address, CapMessage, CapMessageSender, Capability, KernelMessage, KvAction, KvError, KvRequest, - KvResponse, LazyLoadBlob, Message, MessageReceiver, MessageSender, PackageId, PrintSender, - Printout, ProcessId, Request, Response, KV_PROCESS_ID, + Address, CapMessage, CapMessageSender, Capability, FdManagerRequest, KernelMessage, KvAction, + KvError, KvRequest, KvResponse, LazyLoadBlob, Message, MessageReceiver, MessageSender, + PackageId, PrintSender, Printout, ProcessId, Request, Response, FD_MANAGER_PROCESS_ID, + KV_PROCESS_ID, }; use rocksdb::OptimisticTransactionDB; use std::{ @@ -11,6 +13,80 @@ use std::{ }; use tokio::{fs, sync::Mutex}; +#[derive(Clone)] +struct KvState { + our: Arc
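
The budget arithmetic introduced in this patch takes 90% of the system ulimit and then subtracts a fixed reserve for the runtime itself, refusing to start when the ulimit is too small to leave anything usable. A standalone sketch with example numbers (891 is what a common default ulimit of 1024 works out to):

    const FDS_AS_FRACTION_OF_ULIMIT_PERCENTAGE: u64 = 90;
    const SYS_RESERVED_FDS: u64 = 30;

    fn budget_from_ulimit(ulimit_max_fds: u64) -> u64 {
        let min_ulimit = SYS_RESERVED_FDS + 10;
        assert!(
            ulimit_max_fds > min_ulimit,
            "ulimit {ulimit_max_fds} too small, need more than {min_ulimit}"
        );
        // take the configured fraction of the ulimit, then hold back the reserve
        ulimit_max_fds * FDS_AS_FRACTION_OF_ULIMIT_PERCENTAGE / 100 - SYS_RESERVED_FDS
    }

    fn main() {
        // a typical default ulimit of 1024 leaves 1024 * 90 / 100 - 30 = 891 fds
        assert_eq!(budget_from_ulimit(1024), 891);
        // a generous ulimit of 65536 leaves 58952
        assert_eq!(budget_from_ulimit(65536), 58952);
    }
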
, + kv_path: Arc, + send_to_loop: MessageSender, + send_to_terminal: PrintSender, + open_kvs: Arc>, + /// access order of dbs, used to cull if we hit the fds limit + access_order: Arc>>, + txs: Arc>)>>>, + fds_limit: u64, +} + +impl KvState { + pub fn new( + our: Address, + send_to_terminal: PrintSender, + send_to_loop: MessageSender, + home_directory_path: String, + ) -> Self { + Self { + our: Arc::new(our), + kv_path: Arc::new(format!("{home_directory_path}/kv")), + send_to_loop, + send_to_terminal, + open_kvs: Arc::new(DashMap::new()), + access_order: Arc::new(Mutex::new(UniqueQueue::new())), + txs: Arc::new(DashMap::new()), + fds_limit: 10, + } + } + + pub async fn open_db(&mut self, package_id: PackageId, db: String) -> Result<(), KvError> { + let key = (package_id.clone(), db.clone()); + if self.open_kvs.contains_key(&key) { + return Ok(()); + } + + if self.open_kvs.len() as u64 >= self.fds_limit { + // close least recently used db + let key = self.access_order.lock().await.pop_front().unwrap(); + self.remove_db(key.0, key.1).await; + } + + let db_path = format!("{}/{}/{}", self.kv_path.as_str(), package_id, db); + fs::create_dir_all(&db_path).await?; + + self.open_kvs.insert( + key, + OptimisticTransactionDB::open_default(&db_path).map_err(rocks_to_kv_err)?, + ); + let mut access_order = self.access_order.lock().await; + access_order.push_back((package_id, db)); + Ok(()) + } + + pub async fn remove_db(&mut self, package_id: PackageId, db: String) { + let db_path = format!("{}/{}/{}", self.kv_path.as_str(), package_id, db); + self.open_kvs.remove(&(package_id.clone(), db.to_string())); + let mut access_order = self.access_order.lock().await; + access_order.remove(&(package_id, db)); + let _ = fs::remove_dir_all(&db_path).await; + } + + pub async fn remove_least_recently_used_dbs(&mut self, n: u64) { + for _ in 0..n { + let mut lock = self.access_order.lock().await; + let key = lock.pop_front().unwrap(); + drop(lock); + self.remove_db(key.0, key.1).await; + } + } +} + pub async fn kv( our_node: Arc, send_to_loop: MessageSender, @@ -19,31 +95,44 @@ pub async fn kv( send_to_caps_oracle: CapMessageSender, home_directory_path: String, ) -> anyhow::Result<()> { - let kv_path = Arc::new(format!("{home_directory_path}/kv")); - if let Err(e) = fs::create_dir_all(&*kv_path).await { + let our = Address::new(our_node.as_str(), KV_PROCESS_ID.clone()); + + crate::fd_manager::send_fd_manager_request_fds_limit(&our, &send_to_loop).await; + + let mut state = KvState::new(our, send_to_terminal, send_to_loop, home_directory_path); + + if let Err(e) = fs::create_dir_all(state.kv_path.as_str()).await { panic!("failed creating kv dir! 
{e:?}"); } - let open_kvs: Arc> = - Arc::new(DashMap::new()); - let txs: Arc>)>>> = Arc::new(DashMap::new()); - let process_queues: HashMap>>> = HashMap::new(); while let Some(km) = recv_from_loop.recv().await { - if *our_node != km.source.node { + if state.our.node != km.source.node { Printout::new( 1, format!( - "kv: got request from {}, but requests must come from our node {our_node}", - km.source.node + "kv: got request from {}, but requests must come from our node {}", + km.source.node, state.our.node, ), ) - .send(&send_to_terminal) + .send(&state.send_to_terminal) .await; continue; } + if km.source.process == *FD_MANAGER_PROCESS_ID { + if let Err(e) = handle_fd_request(km, &mut state).await { + Printout::new( + 1, + format!("kv: got request from fd_manager that errored: {e:?}"), + ) + .send(&state.send_to_terminal) + .await; + }; + continue; + } + let queue = process_queues .get(&km.source.process) .cloned() @@ -55,13 +144,8 @@ pub async fn kv( } // clone Arcs - let our_node = our_node.clone(); - let send_to_loop = send_to_loop.clone(); - let send_to_terminal = send_to_terminal.clone(); + let mut state = state.clone(); let send_to_caps_oracle = send_to_caps_oracle.clone(); - let open_kvs = open_kvs.clone(); - let txs = txs.clone(); - let kv_path = kv_path.clone(); tokio::spawn(async move { let mut queue_lock = queue.lock().await; @@ -69,23 +153,13 @@ pub async fn kv( let (km_id, km_rsvp) = (km.id.clone(), km.rsvp.clone().unwrap_or(km.source.clone())); - if let Err(e) = handle_request( - &our_node, - km, - open_kvs, - txs, - &send_to_loop, - &send_to_caps_oracle, - &kv_path, - ) - .await - { + if let Err(e) = handle_request(km, &mut state, &send_to_caps_oracle).await { Printout::new(1, format!("kv: {e}")) - .send(&send_to_terminal) + .send(&state.send_to_terminal) .await; KernelMessage::builder() .id(km_id) - .source((our_node.as_str(), KV_PROCESS_ID.clone())) + .source(state.our.as_ref().clone()) .target(km_rsvp) .message(Message::Response(( Response { @@ -98,7 +172,7 @@ pub async fn kv( ))) .build() .unwrap() - .send(&send_to_loop) + .send(&state.send_to_loop) .await; } } @@ -108,13 +182,9 @@ pub async fn kv( } async fn handle_request( - our_node: &str, km: KernelMessage, - open_kvs: Arc>, - txs: Arc>)>>>, - send_to_loop: &MessageSender, + state: &mut KvState, send_to_caps_oracle: &CapMessageSender, - kv_path: &str, ) -> Result<(), KvError> { let KernelMessage { id, @@ -145,15 +215,12 @@ async fn handle_request( } }; - check_caps( - our_node, - &source, - &open_kvs, - send_to_caps_oracle, - &request, - kv_path, - ) - .await?; + check_caps(&source, state, send_to_caps_oracle, &request).await?; + + // always open to ensure db exists + state + .open_db(request.package_id.clone(), request.db.clone()) + .await?; let (body, bytes) = match &request.action { KvAction::Open => { @@ -165,7 +232,7 @@ async fn handle_request( (serde_json::to_vec(&KvResponse::Ok).unwrap(), None) } KvAction::Get { key } => { - let db = match open_kvs.get(&(request.package_id, request.db)) { + let db = match state.open_kvs.get(&(request.package_id, request.db)) { None => { return Err(KvError::NoDb); } @@ -190,14 +257,14 @@ async fn handle_request( } KvAction::BeginTx => { let tx_id = rand::random::(); - txs.insert(tx_id, Vec::new()); + state.txs.insert(tx_id, Vec::new()); ( serde_json::to_vec(&KvResponse::BeginTx { tx_id }).unwrap(), None, ) } KvAction::Set { key, tx_id } => { - let db = match open_kvs.get(&(request.package_id, request.db)) { + let db = match state.open_kvs.get(&(request.package_id, request.db)) 
{ None => { return Err(KvError::NoDb); } @@ -214,7 +281,7 @@ async fn handle_request( db.put(key, blob.bytes).map_err(rocks_to_kv_err)?; } Some(tx_id) => { - let mut tx = match txs.get_mut(tx_id) { + let mut tx = match state.txs.get_mut(tx_id) { None => { return Err(KvError::NoTx); } @@ -227,7 +294,7 @@ async fn handle_request( (serde_json::to_vec(&KvResponse::Ok).unwrap(), None) } KvAction::Delete { key, tx_id } => { - let db = match open_kvs.get(&(request.package_id, request.db)) { + let db = match state.open_kvs.get(&(request.package_id, request.db)) { None => { return Err(KvError::NoDb); } @@ -238,7 +305,7 @@ async fn handle_request( db.delete(key).map_err(rocks_to_kv_err)?; } Some(tx_id) => { - let mut tx = match txs.get_mut(tx_id) { + let mut tx = match state.txs.get_mut(tx_id) { None => { return Err(KvError::NoTx); } @@ -250,14 +317,14 @@ async fn handle_request( (serde_json::to_vec(&KvResponse::Ok).unwrap(), None) } KvAction::Commit { tx_id } => { - let db = match open_kvs.get(&(request.package_id, request.db)) { + let db = match state.open_kvs.get(&(request.package_id, request.db)) { None => { return Err(KvError::NoDb); } Some(db) => db, }; - let txs = match txs.remove(tx_id).map(|(_, tx)| tx) { + let txs = match state.txs.remove(tx_id).map(|(_, tx)| tx) { None => { return Err(KvError::NoTx); } @@ -291,7 +358,7 @@ async fn handle_request( } KvAction::Backup => { // looping through open dbs and flushing their memtables - for db_ref in open_kvs.iter() { + for db_ref in state.open_kvs.iter() { let db = db_ref.value(); db.flush().map_err(rocks_to_kv_err)?; } @@ -302,7 +369,7 @@ async fn handle_request( if let Some(target) = km.rsvp.or_else(|| expects_response.map(|_| source)) { KernelMessage::builder() .id(id) - .source((our_node, KV_PROCESS_ID.clone())) + .source(state.our.as_ref().clone()) .target(target) .message(Message::Response(( Response { @@ -319,7 +386,7 @@ async fn handle_request( })) .build() .unwrap() - .send(send_to_loop) + .send(&state.send_to_loop) .await; } @@ -327,12 +394,10 @@ async fn handle_request( } async fn check_caps( - our_node: &str, source: &Address, - open_kvs: &Arc>, + state: &mut KvState, send_to_caps_oracle: &CapMessageSender, request: &KvRequest, - kv_path: &str, ) -> Result<(), KvError> { let (send_cap_bool, recv_cap_bool) = tokio::sync::oneshot::channel(); let src_package_id = PackageId::new(source.process.package(), source.process.publisher()); @@ -346,10 +411,7 @@ async fn check_caps( .send(CapMessage::Has { on: source.process.clone(), cap: Capability { - issuer: Address { - node: our_node.to_string(), - process: KV_PROCESS_ID.clone(), - }, + issuer: state.our.as_ref().clone(), params: serde_json::json!({ "kind": "write", "db": request.db.to_string(), @@ -372,10 +434,7 @@ async fn check_caps( .send(CapMessage::Has { on: source.process.clone(), cap: Capability { - issuer: Address { - node: our_node.to_string(), - process: KV_PROCESS_ID.clone(), - }, + issuer: state.our.as_ref().clone(), params: serde_json::json!({ "kind": "read", "db": request.db.to_string(), @@ -403,7 +462,7 @@ async fn check_caps( add_capability( "read", &request.db.to_string(), - &our_node, + &state.our, &source, send_to_caps_oracle, ) @@ -411,22 +470,22 @@ async fn check_caps( add_capability( "write", &request.db.to_string(), - &our_node, + &state.our, &source, send_to_caps_oracle, ) .await?; - if open_kvs.contains_key(&(request.package_id.clone(), request.db.clone())) { + if state + .open_kvs + .contains_key(&(request.package_id.clone(), request.db.clone())) + { return Ok(()); } 
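
The capability checks above follow a common tokio pattern: send a query over an mpsc channel together with a oneshot sender, then wait for the oracle task to answer on that oneshot. A self-contained sketch of the pattern, with toy stand-ins for CapMessage and the caps oracle:

    use tokio::sync::{mpsc, oneshot};

    // toy query type standing in for CapMessage::Has
    struct HasCap {
        process: String,
        responder: oneshot::Sender<bool>,
    }

    #[tokio::main]
    async fn main() {
        let (tx, mut rx) = mpsc::channel::<HasCap>(8);

        // toy "caps oracle": only kv:distro:sys holds the capability
        tokio::spawn(async move {
            while let Some(msg) = rx.recv().await {
                let _ = msg.responder.send(msg.process == "kv:distro:sys");
            }
        });

        // the check_caps side: send the query, then await the oneshot answer
        let (send_cap_bool, recv_cap_bool) = oneshot::channel();
        tx.send(HasCap {
            process: "kv:distro:sys".into(),
            responder: send_cap_bool,
        })
        .await
        .expect("caps oracle is gone");
        let has_cap = recv_cap_bool.await.expect("caps oracle dropped the responder");
        assert!(has_cap);
    }
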
- let db_path = format!("{}/{}/{}", kv_path, request.package_id, request.db); - fs::create_dir_all(&db_path).await?; - - let db = OptimisticTransactionDB::open_default(&db_path).map_err(rocks_to_kv_err)?; - - open_kvs.insert((request.package_id.clone(), request.db.clone()), db); + state + .open_db(request.package_id.clone(), request.db.clone()) + .await?; Ok(()) } KvAction::RemoveDb { .. } => { @@ -436,28 +495,51 @@ async fn check_caps( }); } - let db_path = format!("{}/{}/{}", kv_path, request.package_id, request.db); - open_kvs.remove(&(request.package_id.clone(), request.db.clone())); + state + .remove_db(request.package_id.clone(), request.db.clone()) + .await; - fs::remove_dir_all(&db_path).await?; Ok(()) } KvAction::Backup { .. } => Ok(()), } } +async fn handle_fd_request(km: KernelMessage, state: &mut KvState) -> anyhow::Result<()> { + let Message::Request(Request { body, .. }) = km.message else { + return Err(anyhow::anyhow!("not a request")); + }; + + let request: FdManagerRequest = serde_json::from_slice(&body)?; + + match request { + FdManagerRequest::FdsLimit(new_fds_limit) => { + state.fds_limit = new_fds_limit; + if state.open_kvs.len() as u64 >= state.fds_limit { + crate::fd_manager::send_fd_manager_hit_fds_limit(&state.our, &state.send_to_loop) + .await; + state + .remove_least_recently_used_dbs(state.open_kvs.len() as u64 - state.fds_limit) + .await; + } + } + _ => { + return Err(anyhow::anyhow!("non-Cull FdManagerRequest")); + } + } + + Ok(()) +} + async fn add_capability( kind: &str, db: &str, - our_node: &str, + our: &Address, source: &Address, send_to_caps_oracle: &CapMessageSender, ) -> Result<(), KvError> { let cap = Capability { - issuer: Address { - node: our_node.to_string(), - process: KV_PROCESS_ID.clone(), - }, + issuer: our.clone(), params: serde_json::json!({ "kind": kind, "db": db }).to_string(), }; let (send_cap_bool, recv_cap_bool) = tokio::sync::oneshot::channel(); diff --git a/kinode/src/sqlite.rs b/kinode/src/sqlite.rs index 6608723c..d9af4ee0 100644 --- a/kinode/src/sqlite.rs +++ b/kinode/src/sqlite.rs @@ -1,9 +1,10 @@ use base64::{engine::general_purpose::STANDARD as base64_standard, Engine}; use dashmap::DashMap; use lib::types::core::{ - Address, CapMessage, CapMessageSender, Capability, KernelMessage, LazyLoadBlob, Message, - MessageReceiver, MessageSender, PackageId, PrintSender, Printout, ProcessId, Request, Response, - SqlValue, SqliteAction, SqliteError, SqliteRequest, SqliteResponse, SQLITE_PROCESS_ID, + Address, CapMessage, CapMessageSender, Capability, FdManagerRequest, KernelMessage, + LazyLoadBlob, Message, MessageReceiver, MessageSender, PackageId, PrintSender, Printout, + ProcessId, Request, Response, SqlValue, SqliteAction, SqliteError, SqliteRequest, + SqliteResponse, FD_MANAGER_PROCESS_ID, SQLITE_PROCESS_ID, }; use rusqlite::Connection; use std::{ @@ -35,9 +36,16 @@ pub async fn sqlite( let open_dbs: Arc>> = Arc::new(DashMap::new()); let txs: Arc)>>> = Arc::new(DashMap::new()); + let mut fds_limit = 10; let process_queues: HashMap>>> = HashMap::new(); + crate::fd_manager::send_fd_manager_request_fds_limit( + &Address::new(our_node.as_str(), SQLITE_PROCESS_ID.clone()), + &send_to_loop, + ) + .await; + while let Some(km) = recv_from_loop.recv().await { if *our_node != km.source.node { Printout::new( @@ -52,6 +60,26 @@ pub async fn sqlite( continue; } + if km.source.process == *FD_MANAGER_PROCESS_ID { + if let Err(e) = handle_fd_request( + our_node.as_str(), + km, + &open_dbs, + &mut fds_limit, + &send_to_loop, + ) + .await + { + 
Printout::new( + 1, + format!("kv: got request from fd_manager that errored: {e:?}"), + ) + .send(&send_to_terminal) + .await; + }; + continue; + } + let queue = process_queues .get(&km.source.process) .cloned() @@ -459,6 +487,39 @@ async fn check_caps( } } +async fn handle_fd_request( + our_node: &str, + km: KernelMessage, + open_dbs: &Arc>>, + fds_limit: &mut u64, + send_to_loop: &MessageSender, +) -> anyhow::Result<()> { + let Message::Request(Request { body, .. }) = km.message else { + return Err(anyhow::anyhow!("not a request")); + }; + + let request: FdManagerRequest = serde_json::from_slice(&body)?; + + match request { + FdManagerRequest::FdsLimit(new_fds_limit) => { + *fds_limit = new_fds_limit; + if open_dbs.len() as u64 >= *fds_limit { + crate::fd_manager::send_fd_manager_hit_fds_limit( + &Address::new(our_node, SQLITE_PROCESS_ID.clone()), + &send_to_loop, + ) + .await; + // TODO close least recently used dbs! + } + } + _ => { + return Err(anyhow::anyhow!("non-Cull FdManagerRequest")); + } + } + + Ok(()) +} + async fn add_capability( kind: &str, db: &str, diff --git a/kinode/src/vfs.rs b/kinode/src/vfs.rs index 2c8f43a2..00f7f566 100644 --- a/kinode/src/vfs.rs +++ b/kinode/src/vfs.rs @@ -49,7 +49,7 @@ pub async fn vfs( .map_err(|e| anyhow::anyhow!("failed creating vfs dir! {e:?}"))?; let vfs_path = Arc::new(fs::canonicalize(&vfs_path).await?); - let files = Files::new( + let mut files = Files::new( Address::new(our_node.as_str(), VFS_PROCESS_ID.clone()), send_to_loop, ); @@ -74,18 +74,14 @@ pub async fn vfs( } if km.source.process == *FD_MANAGER_PROCESS_ID { - let mut files = files.clone(); - let send_to_terminal = send_to_terminal.clone(); - tokio::spawn(async move { - if let Err(e) = handle_fd_request(km, &mut files).await { - Printout::new( - 1, - format!("vfs: got request from fd_manager that errored: {e:?}"), - ) - .send(&send_to_terminal) - .await; - }; - }); + if let Err(e) = handle_fd_request(km, &mut files).await { + Printout::new( + 1, + format!("vfs: got request from fd_manager that errored: {e:?}"), + ) + .send(&send_to_terminal) + .await; + }; continue; } @@ -925,7 +921,6 @@ fn get_file_type(metadata: &std::fs::Metadata) -> FileType { } /// helper cache for most recently used paths - pub struct UniqueQueue where T: Eq + Hash, From ae902f65cad1f9a1e5b36234f68173a458e1fad1 Mon Sep 17 00:00:00 2001 From: dr-frmr Date: Mon, 7 Oct 2024 22:02:30 -0400 Subject: [PATCH 34/37] sqlite <> fd_manager --- kinode/src/kv.rs | 16 +-- kinode/src/sqlite.rs | 253 ++++++++++++++++++++++++------------------- 2 files changed, 147 insertions(+), 122 deletions(-) diff --git a/kinode/src/kv.rs b/kinode/src/kv.rs index ff793ae5..977fb58b 100644 --- a/kinode/src/kv.rs +++ b/kinode/src/kv.rs @@ -410,14 +410,14 @@ async fn check_caps( send_to_caps_oracle .send(CapMessage::Has { on: source.process.clone(), - cap: Capability { - issuer: state.our.as_ref().clone(), - params: serde_json::json!({ + cap: Capability::new( + state.our.as_ref().clone(), + serde_json::json!({ "kind": "write", "db": request.db.to_string(), }) .to_string(), - }, + ), responder: send_cap_bool, }) .await?; @@ -433,14 +433,14 @@ async fn check_caps( send_to_caps_oracle .send(CapMessage::Has { on: source.process.clone(), - cap: Capability { - issuer: state.our.as_ref().clone(), - params: serde_json::json!({ + cap: Capability::new( + state.our.as_ref().clone(), + serde_json::json!({ "kind": "read", "db": request.db.to_string(), }) .to_string(), - }, + ), responder: send_cap_bool, }) .await?; diff --git a/kinode/src/sqlite.rs 
b/kinode/src/sqlite.rs index d9af4ee0..91f52b3f 100644 --- a/kinode/src/sqlite.rs +++ b/kinode/src/sqlite.rs @@ -1,3 +1,4 @@ +use crate::vfs::UniqueQueue; use base64::{engine::general_purpose::STANDARD as base64_standard, Engine}; use dashmap::DashMap; use lib::types::core::{ @@ -21,6 +22,82 @@ lazy_static::lazy_static! { HashSet::from(["ALTER", "ANALYZE", "COMMIT", "CREATE", "DELETE", "DETACH", "DROP", "END", "INSERT", "REINDEX", "RELEASE", "RENAME", "REPLACE", "ROLLBACK", "SAVEPOINT", "UPDATE", "VACUUM"]); } +#[derive(Clone)] +struct SqliteState { + our: Arc
, + sqlite_path: Arc, + send_to_loop: MessageSender, + send_to_terminal: PrintSender, + open_dbs: Arc>>, + access_order: Arc>>, + txs: Arc)>>>, + fds_limit: u64, +} + +impl SqliteState { + pub fn new( + our: Address, + send_to_terminal: PrintSender, + send_to_loop: MessageSender, + home_directory_path: String, + ) -> Self { + Self { + our: Arc::new(our), + sqlite_path: Arc::new(format!("{home_directory_path}/sqlite")), + send_to_loop, + send_to_terminal, + open_dbs: Arc::new(DashMap::new()), + access_order: Arc::new(Mutex::new(UniqueQueue::new())), + txs: Arc::new(DashMap::new()), + fds_limit: 10, + } + } + + pub async fn open_db(&mut self, package_id: PackageId, db: String) -> Result<(), SqliteError> { + let key = (package_id.clone(), db.clone()); + if self.open_dbs.contains_key(&key) { + return Ok(()); + } + + if self.open_dbs.len() as u64 >= self.fds_limit { + // close least recently used db + let key = self.access_order.lock().await.pop_front().unwrap(); + self.remove_db(key.0, key.1).await; + } + + let db_path = format!("{}/{}/{}", self.sqlite_path.as_str(), package_id, db); + fs::create_dir_all(&db_path).await?; + + let db_file_path = format!("{}/{}.db", db_path, db); + + let db_conn = Connection::open(db_file_path)?; + let _ = db_conn.execute("PRAGMA journal_mode=WAL", []); + + self.open_dbs.insert(key, Mutex::new(db_conn)); + + let mut access_order = self.access_order.lock().await; + access_order.push_back((package_id, db)); + Ok(()) + } + + pub async fn remove_db(&mut self, package_id: PackageId, db: String) { + let db_path = format!("{}/{}/{}", self.sqlite_path.as_str(), package_id, db); + self.open_dbs.remove(&(package_id.clone(), db.to_string())); + let mut access_order = self.access_order.lock().await; + access_order.remove(&(package_id, db)); + let _ = fs::remove_dir_all(&db_path).await; + } + + pub async fn remove_least_recently_used_dbs(&mut self, n: u64) { + for _ in 0..n { + let mut lock = self.access_order.lock().await; + let key = lock.pop_front().unwrap(); + drop(lock); + self.remove_db(key.0, key.1).await; + } + } +} + pub async fn sqlite( our_node: Arc, send_to_loop: MessageSender, @@ -29,52 +106,39 @@ pub async fn sqlite( send_to_caps_oracle: CapMessageSender, home_directory_path: String, ) -> anyhow::Result<()> { - let sqlite_path = Arc::new(format!("{home_directory_path}/sqlite")); - if let Err(e) = fs::create_dir_all(&*sqlite_path).await { + let our = Address::new(our_node.as_str(), SQLITE_PROCESS_ID.clone()); + + crate::fd_manager::send_fd_manager_request_fds_limit(&our, &send_to_loop).await; + + let mut state = SqliteState::new(our, send_to_terminal, send_to_loop, home_directory_path); + + if let Err(e) = fs::create_dir_all(state.sqlite_path.as_str()).await { panic!("failed creating sqlite dir! 
{e:?}"); } - let open_dbs: Arc>> = Arc::new(DashMap::new()); - let txs: Arc)>>> = Arc::new(DashMap::new()); - let mut fds_limit = 10; - let process_queues: HashMap>>> = HashMap::new(); - crate::fd_manager::send_fd_manager_request_fds_limit( - &Address::new(our_node.as_str(), SQLITE_PROCESS_ID.clone()), - &send_to_loop, - ) - .await; - while let Some(km) = recv_from_loop.recv().await { - if *our_node != km.source.node { + if state.our.node != km.source.node { Printout::new( 1, format!( - "sqlite: got request from {}, but requests must come from our node {our_node}", - km.source.node + "sqlite: got request from {}, but requests must come from our node {}", + km.source.node, state.our.node ), ) - .send(&send_to_terminal) + .send(&state.send_to_terminal) .await; continue; } if km.source.process == *FD_MANAGER_PROCESS_ID { - if let Err(e) = handle_fd_request( - our_node.as_str(), - km, - &open_dbs, - &mut fds_limit, - &send_to_loop, - ) - .await - { + if let Err(e) = handle_fd_request(km, &mut state).await { Printout::new( 1, - format!("kv: got request from fd_manager that errored: {e:?}"), + format!("sqlite: got request from fd_manager that errored: {e:?}"), ) - .send(&send_to_terminal) + .send(&state.send_to_terminal) .await; }; continue; @@ -91,13 +155,8 @@ pub async fn sqlite( } // clone Arcs - let our_node = our_node.clone(); - let send_to_loop = send_to_loop.clone(); - let send_to_terminal = send_to_terminal.clone(); + let mut state = state.clone(); let send_to_caps_oracle = send_to_caps_oracle.clone(); - let open_dbs = open_dbs.clone(); - let txs = txs.clone(); - let sqlite_path = sqlite_path.clone(); tokio::spawn(async move { let mut queue_lock = queue.lock().await; @@ -105,23 +164,13 @@ pub async fn sqlite( let (km_id, km_rsvp) = (km.id.clone(), km.rsvp.clone().unwrap_or(km.source.clone())); - if let Err(e) = handle_request( - &our_node, - km, - open_dbs, - txs, - &send_to_loop, - &send_to_caps_oracle, - &sqlite_path, - ) - .await - { + if let Err(e) = handle_request(km, &mut state, &send_to_caps_oracle).await { Printout::new(1, format!("sqlite: {e}")) - .send(&send_to_terminal) + .send(&state.send_to_terminal) .await; KernelMessage::builder() .id(km_id) - .source((our_node.as_str(), SQLITE_PROCESS_ID.clone())) + .source(state.our.as_ref().clone()) .target(km_rsvp) .message(Message::Response(( Response { @@ -135,7 +184,7 @@ pub async fn sqlite( ))) .build() .unwrap() - .send(&send_to_loop) + .send(&state.send_to_loop) .await; } } @@ -145,13 +194,9 @@ pub async fn sqlite( } async fn handle_request( - our_node: &str, km: KernelMessage, - open_dbs: Arc>>, - txs: Arc)>>>, - send_to_loop: &MessageSender, + state: &mut SqliteState, send_to_caps_oracle: &CapMessageSender, - sqlite_path: &str, ) -> Result<(), SqliteError> { let KernelMessage { id, @@ -182,15 +227,12 @@ async fn handle_request( } }; - check_caps( - our_node, - &source, - &open_dbs, - send_to_caps_oracle, - &request, - sqlite_path, - ) - .await?; + check_caps(&source, state, send_to_caps_oracle, &request).await?; + + // always open to ensure db exists + state + .open_db(request.package_id.clone(), request.db.clone()) + .await?; let (body, bytes) = match request.action { SqliteAction::Open => { @@ -202,7 +244,7 @@ async fn handle_request( (serde_json::to_vec(&SqliteResponse::Ok).unwrap(), None) } SqliteAction::Read { query } => { - let db = match open_dbs.get(&(request.package_id, request.db)) { + let db = match state.open_dbs.get(&(request.package_id, request.db)) { Some(db) => db, None => { return Err(SqliteError::NoDb); @@ 
-258,7 +300,7 @@ async fn handle_request( ) } SqliteAction::Write { statement, tx_id } => { - let db = match open_dbs.get(&(request.package_id, request.db)) { + let db = match state.open_dbs.get(&(request.package_id, request.db)) { Some(db) => db, None => { return Err(SqliteError::NoDb); @@ -280,7 +322,9 @@ async fn handle_request( match tx_id { Some(tx_id) => { - txs.entry(tx_id) + state + .txs + .entry(tx_id) .or_default() .push((statement.clone(), parameters)); } @@ -293,7 +337,7 @@ async fn handle_request( } SqliteAction::BeginTx => { let tx_id = rand::random::(); - txs.insert(tx_id, Vec::new()); + state.txs.insert(tx_id, Vec::new()); ( serde_json::to_vec(&SqliteResponse::BeginTx { tx_id }).unwrap(), @@ -301,7 +345,7 @@ async fn handle_request( ) } SqliteAction::Commit { tx_id } => { - let db = match open_dbs.get(&(request.package_id, request.db)) { + let db = match state.open_dbs.get(&(request.package_id, request.db)) { Some(db) => db, None => { return Err(SqliteError::NoDb); @@ -309,7 +353,7 @@ async fn handle_request( }; let mut db = db.lock().await; - let txs = match txs.remove(&tx_id).map(|(_, tx)| tx) { + let txs = match state.txs.remove(&tx_id).map(|(_, tx)| tx) { None => { return Err(SqliteError::NoTx); } @@ -325,7 +369,7 @@ async fn handle_request( (serde_json::to_vec(&SqliteResponse::Ok).unwrap(), None) } SqliteAction::Backup => { - for db_ref in open_dbs.iter() { + for db_ref in state.open_dbs.iter() { let db = db_ref.value().lock().await; let result: rusqlite::Result<()> = db .query_row("PRAGMA wal_checkpoint(TRUNCATE)", [], |_| Ok(())) @@ -343,7 +387,7 @@ async fn handle_request( if let Some(target) = km.rsvp.or_else(|| expects_response.map(|_| source)) { KernelMessage::builder() .id(id) - .source((our_node, SQLITE_PROCESS_ID.clone())) + .source(state.our.as_ref().clone()) .target(target) .message(Message::Response(( Response { @@ -360,7 +404,7 @@ async fn handle_request( })) .build() .unwrap() - .send(send_to_loop) + .send(&state.send_to_loop) .await; } @@ -368,12 +412,10 @@ async fn handle_request( } async fn check_caps( - our_node: &str, source: &Address, - open_dbs: &Arc>>, + state: &mut SqliteState, send_to_caps_oracle: &CapMessageSender, request: &SqliteRequest, - sqlite_path: &str, ) -> Result<(), SqliteError> { let (send_cap_bool, recv_cap_bool) = tokio::sync::oneshot::channel(); let src_package_id = PackageId::new(source.process.package(), source.process.publisher()); @@ -384,7 +426,7 @@ async fn check_caps( .send(CapMessage::Has { on: source.process.clone(), cap: Capability::new( - (our_node, SQLITE_PROCESS_ID.clone()), + state.our.as_ref().clone(), serde_json::json!({ "kind": "write", "db": request.db.to_string(), @@ -407,7 +449,7 @@ async fn check_caps( .send(CapMessage::Has { on: source.process.clone(), cap: Capability::new( - (our_node, SQLITE_PROCESS_ID.clone()), + state.our.as_ref().clone(), serde_json::json!({ "kind": "read", "db": request.db.to_string(), @@ -435,7 +477,7 @@ async fn check_caps( add_capability( "read", &request.db.to_string(), - &our_node, + &state.our, &source, send_to_caps_oracle, ) @@ -443,28 +485,22 @@ async fn check_caps( add_capability( "write", &request.db.to_string(), - &our_node, + &state.our, &source, send_to_caps_oracle, ) .await?; - if open_dbs.contains_key(&(request.package_id.clone(), request.db.clone())) { + if state + .open_dbs + .contains_key(&(request.package_id.clone(), request.db.clone())) + { return Ok(()); } - let db_path = format!("{}/{}/{}", sqlite_path, request.package_id, request.db); - 
fs::create_dir_all(&db_path).await?; - - let db_file_path = format!("{}/{}.db", db_path, request.db); - - let db = Connection::open(db_file_path)?; - let _ = db.execute("PRAGMA journal_mode=WAL", []); - - open_dbs.insert( - (request.package_id.clone(), request.db.clone()), - Mutex::new(db), - ); + state + .open_db(request.package_id.clone(), request.db.clone()) + .await?; Ok(()) } SqliteAction::RemoveDb => { @@ -474,10 +510,9 @@ async fn check_caps( }); } - let db_path = format!("{}/{}/{}", sqlite_path, request.package_id, request.db); - open_dbs.remove(&(request.package_id.clone(), request.db.clone())); - - fs::remove_dir_all(&db_path).await?; + state + .remove_db(request.package_id.clone(), request.db.clone()) + .await; Ok(()) } SqliteAction::Backup => { @@ -487,13 +522,7 @@ async fn check_caps( } } -async fn handle_fd_request( - our_node: &str, - km: KernelMessage, - open_dbs: &Arc>>, - fds_limit: &mut u64, - send_to_loop: &MessageSender, -) -> anyhow::Result<()> { +async fn handle_fd_request(km: KernelMessage, state: &mut SqliteState) -> anyhow::Result<()> { let Message::Request(Request { body, .. }) = km.message else { return Err(anyhow::anyhow!("not a request")); }; @@ -502,14 +531,13 @@ async fn handle_fd_request( match request { FdManagerRequest::FdsLimit(new_fds_limit) => { - *fds_limit = new_fds_limit; - if open_dbs.len() as u64 >= *fds_limit { - crate::fd_manager::send_fd_manager_hit_fds_limit( - &Address::new(our_node, SQLITE_PROCESS_ID.clone()), - &send_to_loop, - ) - .await; - // TODO close least recently used dbs! + state.fds_limit = new_fds_limit; + if state.open_dbs.len() as u64 >= state.fds_limit { + crate::fd_manager::send_fd_manager_hit_fds_limit(&state.our, &state.send_to_loop) + .await; + state + .remove_least_recently_used_dbs(state.open_dbs.len() as u64 - state.fds_limit) + .await; } } _ => { @@ -523,15 +551,12 @@ async fn handle_fd_request( async fn add_capability( kind: &str, db: &str, - our_node: &str, + our: &Address, source: &Address, send_to_caps_oracle: &CapMessageSender, ) -> Result<(), SqliteError> { let cap = Capability { - issuer: Address { - node: our_node.to_string(), - process: SQLITE_PROCESS_ID.clone(), - }, + issuer: our.clone(), params: serde_json::json!({ "kind": kind, "db": db }).to_string(), }; let (send_cap_bool, recv_cap_bool) = tokio::sync::oneshot::channel(); From b87cfbc6741833aad7ea020b0f86451ed146134c Mon Sep 17 00:00:00 2001 From: bitful-pannul Date: Tue, 8 Oct 2024 23:25:25 +0200 Subject: [PATCH 35/37] hotfix kv & sqlite: do not remove dbs upon fd cull --- kinode/src/kv.rs | 7 +++++-- kinode/src/sqlite.rs | 11 +++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/kinode/src/kv.rs b/kinode/src/kv.rs index 977fb58b..00aaa04e 100644 --- a/kinode/src/kv.rs +++ b/kinode/src/kv.rs @@ -70,11 +70,9 @@ impl KvState { } pub async fn remove_db(&mut self, package_id: PackageId, db: String) { - let db_path = format!("{}/{}/{}", self.kv_path.as_str(), package_id, db); self.open_kvs.remove(&(package_id.clone(), db.to_string())); let mut access_order = self.access_order.lock().await; access_order.remove(&(package_id, db)); - let _ = fs::remove_dir_all(&db_path).await; } pub async fn remove_least_recently_used_dbs(&mut self, n: u64) { @@ -499,6 +497,11 @@ async fn check_caps( .remove_db(request.package_id.clone(), request.db.clone()) .await; + let _ = fs::remove_dir_all(format!( + "{}/{}/{}", + state.kv_path, request.package_id, request.db + )); + Ok(()) } KvAction::Backup { .. 
} => Ok(()), diff --git a/kinode/src/sqlite.rs b/kinode/src/sqlite.rs index 91f52b3f..8a90f783 100644 --- a/kinode/src/sqlite.rs +++ b/kinode/src/sqlite.rs @@ -56,6 +56,9 @@ impl SqliteState { pub async fn open_db(&mut self, package_id: PackageId, db: String) -> Result<(), SqliteError> { let key = (package_id.clone(), db.clone()); if self.open_dbs.contains_key(&key) { + // let mut access_order = self.access_order.lock().await; + // access_order.remove(&key); + // access_order.push_back(key); return Ok(()); } @@ -81,11 +84,9 @@ impl SqliteState { } pub async fn remove_db(&mut self, package_id: PackageId, db: String) { - let db_path = format!("{}/{}/{}", self.sqlite_path.as_str(), package_id, db); self.open_dbs.remove(&(package_id.clone(), db.to_string())); let mut access_order = self.access_order.lock().await; access_order.remove(&(package_id, db)); - let _ = fs::remove_dir_all(&db_path).await; } pub async fn remove_least_recently_used_dbs(&mut self, n: u64) { @@ -513,6 +514,12 @@ async fn check_caps( state .remove_db(request.package_id.clone(), request.db.clone()) .await; + + let _ = fs::remove_dir_all(format!( + "{}/{}/{}", + state.sqlite_path, request.package_id, request.db + )); + Ok(()) } SqliteAction::Backup => { From 03b531ef64ed1f818ddfbcf10994999f2fd2ccbd Mon Sep 17 00:00:00 2001 From: dr-frmr Date: Tue, 8 Oct 2024 18:14:21 -0400 Subject: [PATCH 36/37] fixes --- kinode/src/kv.rs | 8 ++++++-- kinode/src/sqlite.rs | 11 ++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/kinode/src/kv.rs b/kinode/src/kv.rs index 00aaa04e..99c315cd 100644 --- a/kinode/src/kv.rs +++ b/kinode/src/kv.rs @@ -48,6 +48,9 @@ impl KvState { pub async fn open_db(&mut self, package_id: PackageId, db: String) -> Result<(), KvError> { let key = (package_id.clone(), db.clone()); if self.open_kvs.contains_key(&key) { + let mut access_order = self.access_order.lock().await; + access_order.remove(&key); + access_order.push_back(key); return Ok(()); } @@ -497,10 +500,11 @@ async fn check_caps( .remove_db(request.package_id.clone(), request.db.clone()) .await; - let _ = fs::remove_dir_all(format!( + fs::remove_dir_all(format!( "{}/{}/{}", state.kv_path, request.package_id, request.db - )); + )) + .await?; Ok(()) } diff --git a/kinode/src/sqlite.rs b/kinode/src/sqlite.rs index 8a90f783..baaa5e18 100644 --- a/kinode/src/sqlite.rs +++ b/kinode/src/sqlite.rs @@ -56,9 +56,9 @@ impl SqliteState { pub async fn open_db(&mut self, package_id: PackageId, db: String) -> Result<(), SqliteError> { let key = (package_id.clone(), db.clone()); if self.open_dbs.contains_key(&key) { - // let mut access_order = self.access_order.lock().await; - // access_order.remove(&key); - // access_order.push_back(key); + let mut access_order = self.access_order.lock().await; + access_order.remove(&key); + access_order.push_back(key); return Ok(()); } @@ -515,10 +515,11 @@ async fn check_caps( .remove_db(request.package_id.clone(), request.db.clone()) .await; - let _ = fs::remove_dir_all(format!( + fs::remove_dir_all(format!( "{}/{}/{}", state.sqlite_path, request.package_id, request.db - )); + )) + .await?; Ok(()) } From 0cdc2b6b74a5d2aa72722c1d5e8b196858fce057 Mon Sep 17 00:00:00 2001 From: dr-frmr Date: Tue, 8 Oct 2024 18:57:50 -0400 Subject: [PATCH 37/37] fd_man: weight fds allocated by number of times modules hit limit --- kinode/src/fd_manager.rs | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/kinode/src/fd_manager.rs b/kinode/src/fd_manager.rs index 856849fe..ee81fdfe 
100644 --- a/kinode/src/fd_manager.rs +++ b/kinode/src/fd_manager.rs @@ -65,14 +65,21 @@ impl State { ulimit_max_fds * max_fds_as_fraction_of_ulimit_percentage / 100 - SYS_RESERVED_FDS; } - fn update_all_fds_limits(&mut self) { - let len = self.fds_limits.len() as u64; - let per_process_limit = self.max_fds / std::cmp::max(len, 1); + async fn update_all_fds_limits(&mut self, our_node: &str, send_to_loop: &MessageSender) { + let weights = self + .fds_limits + .values() + .map(|limit| limit.hit_count) + .sum::(); + let statically_allocated = self.max_fds as f64 / 2.0; + let per_process_unweighted = + statically_allocated / std::cmp::max(self.fds_limits.len() as u64, 1) as f64; + let per_process_weighted = statically_allocated / std::cmp::max(weights, 1) as f64; for limit in self.fds_limits.values_mut() { - limit.limit = per_process_limit; - // reset hit count when updating limits - limit.hit_count = 0; + limit.limit = (per_process_unweighted + per_process_weighted * limit.hit_count as f64) + .floor() as u64; } + send_all_fds_limits(our_node, send_to_loop, self).await; } } @@ -135,8 +142,7 @@ pub async fn fd_manager( match update_max_fds(&mut state).await { Ok(new) => { if new != old_max_fds { - state.update_all_fds_limits(); - send_all_fds_limits(&our_node, &send_to_loop, &state).await; + state.update_all_fds_limits(our_node.as_str(), &send_to_loop).await; } } Err(e) => Printout::new(1, &format!("update_max_fds error: {e:?}")) @@ -174,19 +180,20 @@ async fn handle_message( km.source.process, FdsLimit { limit: 0, - hit_count: 0, + hit_count: 1, // starts with 1 to give initial weight }, ); - state.update_all_fds_limits(); - send_all_fds_limits(our_node, send_to_loop, state).await; + state.update_all_fds_limits(our_node, &send_to_loop).await; None } FdManagerRequest::FdsLimitHit => { // sender process hit its fd limit - // TODO react to this + // react to this by incrementing hit count and + // re-weighting all processes' limits state.fds_limits.get_mut(&km.source.process).map(|limit| { limit.hit_count += 1; }); + state.update_all_fds_limits(our_node, &send_to_loop).await; Some(format!("{} hit its fd limit", km.source.process)) } FdManagerRequest::FdsLimit(_) => {
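
Note on the fd_manager change above: update_all_fds_limits now splits the fd budget into two halves, one divided evenly across every registered process and one divided in proportion to each process's hit_count (which starts at 1 when a process registers and grows each time it reports FdsLimitHit). The following is a minimal standalone sketch of that arithmetic, not part of the patch; the process names and the max_fds value are made up for illustration.

use std::collections::HashMap;

// Illustrative only: the allocation arithmetic from update_all_fds_limits,
// reproduced standalone so the weighting is easy to check by hand.
fn allocate(max_fds: u64, hit_counts: &HashMap<&str, u64>) -> HashMap<String, u64> {
    // total weight across all registered processes
    let weights: u64 = hit_counts.values().sum();
    // half of the budget is split evenly, the other half in proportion to hit_count
    let statically_allocated = max_fds as f64 / 2.0;
    let per_process_unweighted =
        statically_allocated / std::cmp::max(hit_counts.len() as u64, 1) as f64;
    let per_process_weighted = statically_allocated / std::cmp::max(weights, 1) as f64;
    hit_counts
        .iter()
        .map(|(name, hits)| {
            let limit =
                (per_process_unweighted + per_process_weighted * *hits as f64).floor() as u64;
            (name.to_string(), limit)
        })
        .collect()
}

fn main() {
    // hypothetical processes and budget, for illustration only
    let mut hits: HashMap<&str, u64> = HashMap::new();
    hits.insert("vfs:distro:sys", 1);    // registered, never reported FdsLimitHit
    hits.insert("sqlite:distro:sys", 3); // reported FdsLimitHit twice after registering
    // with max_fds = 1000: 500 split evenly (250 each) plus 500 split by weight
    // (125 per hit) gives 375 for vfs and 625 for sqlite
    for (name, limit) in allocate(1000, &hits) {
        println!("{name}: {limit}");
    }
}

Because the weighted half is divided by the total hit_count, the per-process limits still sum to roughly max_fds, so a module that repeatedly hits its limit gains budget at the expense of quieter modules rather than of the overall ceiling.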
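
The kv.rs and sqlite.rs hunks earlier in this series pair the same fd-pressure handling with an access_order structure and a remove_least_recently_used_dbs helper: opening a database refreshes its recency (even on a cache hit, after PATCH 36/37), and a cull closes the least recently used handles without deleting anything on disk (PATCH 35/37). Below is a minimal sketch of that bookkeeping, not taken from the patches; it assumes a plain Vec as the ordering structure and simple String keys in place of the (PackageId, db) pairs and tokio Mutex used in the real code.

use std::collections::HashMap;

struct Db; // stand-in for the real connection type held behind a Mutex

struct OpenDbs {
    open: HashMap<String, Db>,
    access_order: Vec<String>, // front = least recently used
}

impl OpenDbs {
    fn touch(&mut self, key: &str) {
        // move the key to the back: it is now the most recently used
        self.access_order.retain(|k| k.as_str() != key);
        self.access_order.push(key.to_string());
    }

    fn open(&mut self, key: &str) {
        if self.open.contains_key(key) {
            self.touch(key); // refresh recency even when the handle is already open
            return;
        }
        self.open.insert(key.to_string(), Db);
        self.touch(key);
    }

    fn remove_least_recently_used(&mut self, n: u64) {
        // close handles only; the on-disk files are left alone
        for _ in 0..n {
            if self.access_order.is_empty() {
                break;
            }
            let key = self.access_order.remove(0);
            self.open.remove(&key);
        }
    }
}

fn main() {
    let mut dbs = OpenDbs { open: HashMap::new(), access_order: Vec::new() };
    dbs.open("app:pkg:db1");
    dbs.open("app:pkg:db2");
    dbs.open("app:pkg:db1"); // touch: db2 is now the least recently used
    dbs.remove_least_recently_used(1);
    assert!(dbs.open.contains_key("app:pkg:db1"));
    assert!(!dbs.open.contains_key("app:pkg:db2"));
}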