From 0697b417a0a15fc2414397b639330b97bb3ec069 Mon Sep 17 00:00:00 2001 From: Conrad Irwin Date: Mon, 29 Apr 2024 18:13:28 -0600 Subject: [PATCH] Hang diagnostics (#11190) Release Notes: - Added diagnostics for main-thread hangs on macOS. These are only enabled if you've opted into diagnostics. --------- Co-authored-by: Mikayla --- Cargo.lock | 2 + Cargo.toml | 1 + crates/client/src/telemetry.rs | 32 +- crates/collab/src/api/events.rs | 128 ++++- .../telemetry_events/src/telemetry_events.rs | 18 + crates/zed/Cargo.toml | 2 + crates/zed/src/main.rs | 358 +----------- crates/zed/src/reliability.rs | 536 ++++++++++++++++++ 8 files changed, 721 insertions(+), 356 deletions(-) create mode 100644 crates/zed/src/reliability.rs diff --git a/Cargo.lock b/Cargo.lock index f4ff1ee348..a27363a584 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12701,6 +12701,7 @@ dependencies = [ "markdown_preview", "menu", "mimalloc", + "nix 0.28.0", "node_runtime", "notifications", "outline", @@ -12723,6 +12724,7 @@ dependencies = [ "tab_switcher", "task", "tasks_ui", + "telemetry_events", "terminal_view", "theme", "theme_selector", diff --git a/Cargo.toml b/Cargo.toml index 5207322249..b2ecb65f67 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -284,6 +284,7 @@ lazy_static = "1.4.0" linkify = "0.10.0" log = { version = "0.4.16", features = ["kv_unstable_serde"] } nanoid = "0.4" +nix = "0.28" ordered-float = "2.1.1" palette = { version = "0.7.5", default-features = false, features = ["std"] } parking_lot = "0.12.1" diff --git a/crates/client/src/telemetry.rs b/crates/client/src/telemetry.rs index 0ee25eb602..fb386e379d 100644 --- a/crates/client/src/telemetry.rs +++ b/crates/client/src/telemetry.rs @@ -421,7 +421,7 @@ impl Telemetry { return; } - let Some(checksum_seed) = &*ZED_CLIENT_CHECKSUM_SEED else { + if ZED_CLIENT_CHECKSUM_SEED.is_none() { return; }; @@ -466,15 +466,9 @@ impl Telemetry { serde_json::to_writer(&mut json_bytes, &request_body)?; } - let mut summer = Sha256::new(); - summer.update(checksum_seed); - summer.update(&json_bytes); - summer.update(checksum_seed); - let mut checksum = String::new(); - for byte in summer.finalize().as_slice() { - use std::fmt::Write; - write!(&mut checksum, "{:02x}", byte).unwrap(); - } + let Some(checksum) = calculate_json_checksum(&json_bytes) else { + return Ok(()); + }; let request = http::Request::builder() .method(Method::POST) @@ -657,3 +651,21 @@ mod tests { && telemetry.state.lock().first_event_date_time.is_none() } } + +pub fn calculate_json_checksum(json: &impl AsRef<[u8]>) -> Option { + let Some(checksum_seed) = &*ZED_CLIENT_CHECKSUM_SEED else { + return None; + }; + + let mut summer = Sha256::new(); + summer.update(checksum_seed); + summer.update(&json); + summer.update(checksum_seed); + let mut checksum = String::new(); + for byte in summer.finalize().as_slice() { + use std::fmt::Write; + write!(&mut checksum, "{:02x}", byte).unwrap(); + } + + Some(checksum) +} diff --git a/crates/collab/src/api/events.rs b/crates/collab/src/api/events.rs index 3c954d6014..9c9361dc7d 100644 --- a/crates/collab/src/api/events.rs +++ b/crates/collab/src/api/events.rs @@ -18,11 +18,15 @@ use telemetry_events::{ ActionEvent, AppEvent, AssistantEvent, CallEvent, CopilotEvent, CpuEvent, EditEvent, EditorEvent, Event, EventRequestBody, EventWrapper, ExtensionEvent, MemoryEvent, SettingEvent, }; +use uuid::Uuid; + +static CRASH_REPORTS_BUCKET: &str = "zed-crash-reports"; pub fn router() -> Router { Router::new() .route("/telemetry/events", post(post_events)) .route("/telemetry/crashes", post(post_crash)) + .route("/telemetry/hangs", post(post_hang)) } pub struct ZedChecksumHeader(Vec); @@ -85,8 +89,6 @@ pub async fn post_crash( headers: HeaderMap, body: Bytes, ) -> Result<()> { - static CRASH_REPORTS_BUCKET: &str = "zed-crash-reports"; - let report = IpsFile::parse(&body)?; let version_threshold = SemanticVersion::new(0, 123, 0); @@ -222,6 +224,107 @@ pub async fn post_crash( Ok(()) } +pub async fn post_hang( + Extension(app): Extension>, + TypedHeader(ZedChecksumHeader(checksum)): TypedHeader, + body: Bytes, +) -> Result<()> { + let Some(expected) = calculate_json_checksum(app.clone(), &body) else { + return Err(Error::Http( + StatusCode::INTERNAL_SERVER_ERROR, + "events not enabled".into(), + ))?; + }; + + if checksum != expected { + return Err(Error::Http( + StatusCode::BAD_REQUEST, + "invalid checksum".into(), + ))?; + } + + let incident_id = Uuid::new_v4().to_string(); + + // dump JSON into S3 so we can get frame offsets if we need to. + if let Some(blob_store_client) = app.blob_store_client.as_ref() { + blob_store_client + .put_object() + .bucket(CRASH_REPORTS_BUCKET) + .key(incident_id.clone() + ".hang.json") + .acl(aws_sdk_s3::types::ObjectCannedAcl::PublicRead) + .body(ByteStream::from(body.to_vec())) + .send() + .await + .map_err(|e| log::error!("Failed to upload crash: {}", e)) + .ok(); + } + + let report: telemetry_events::HangReport = serde_json::from_slice(&body).map_err(|err| { + log::error!("can't parse report json: {err}"); + Error::Internal(anyhow!(err)) + })?; + + let mut backtrace = "Possible hang detected on main threadL".to_string(); + let unknown = "".to_string(); + for frame in report.backtrace.iter() { + backtrace.push_str(&format!("\n{}", frame.symbols.first().unwrap_or(&unknown))); + } + + tracing::error!( + service = "client", + version = %report.app_version.unwrap_or_default().to_string(), + os_name = %report.os_name, + os_version = report.os_version.unwrap_or_default().to_string(), + incident_id = %incident_id, + installation_id = %report.installation_id.unwrap_or_default(), + backtrace = %backtrace, + "hang report"); + + if let Some(slack_panics_webhook) = app.config.slack_panics_webhook.clone() { + let payload = slack::WebhookBody::new(|w| { + w.add_section(|s| s.text(slack::Text::markdown("Possible Hang".to_string()))) + .add_section(|s| { + s.add_field(slack::Text::markdown(format!( + "*Version:*\n {} ", + report.app_version.unwrap_or_default() + ))) + .add_field({ + let hostname = app.config.blob_store_url.clone().unwrap_or_default(); + let hostname = hostname.strip_prefix("https://").unwrap_or_else(|| { + hostname.strip_prefix("http://").unwrap_or_default() + }); + + slack::Text::markdown(format!( + "*Incident:*\n", + CRASH_REPORTS_BUCKET, + hostname, + incident_id, + incident_id.chars().take(8).collect::(), + )) + }) + }) + .add_rich_text(|r| r.add_preformatted(|p| p.add_text(backtrace))) + }); + let payload_json = serde_json::to_string(&payload).map_err(|err| { + log::error!("Failed to serialize payload to JSON: {err}"); + Error::Internal(anyhow!(err)) + })?; + + reqwest::Client::new() + .post(slack_panics_webhook) + .header("Content-Type", "application/json") + .body(payload_json) + .send() + .await + .map_err(|err| { + log::error!("Failed to send payload to Slack: {err}"); + Error::Internal(anyhow!(err)) + })?; + } + + Ok(()) +} + pub async fn post_events( Extension(app): Extension>, TypedHeader(ZedChecksumHeader(checksum)): TypedHeader, @@ -235,19 +338,14 @@ pub async fn post_events( ))? }; - let Some(checksum_seed) = app.config.zed_client_checksum_seed.as_ref() else { + let Some(expected) = calculate_json_checksum(app.clone(), &body) else { return Err(Error::Http( StatusCode::INTERNAL_SERVER_ERROR, "events not enabled".into(), ))?; }; - let mut summer = Sha256::new(); - summer.update(checksum_seed); - summer.update(&body); - summer.update(checksum_seed); - - if &checksum != &summer.finalize()[..] { + if checksum != expected { return Err(Error::Http( StatusCode::BAD_REQUEST, "invalid checksum".into(), @@ -1061,3 +1159,15 @@ impl ActionEventRow { } } } + +pub fn calculate_json_checksum(app: Arc, json: &impl AsRef<[u8]>) -> Option> { + let Some(checksum_seed) = app.config.zed_client_checksum_seed.as_ref() else { + return None; + }; + + let mut summer = Sha256::new(); + summer.update(checksum_seed); + summer.update(&json); + summer.update(checksum_seed); + Some(summer.finalize().into_iter().collect()) +} diff --git a/crates/telemetry_events/src/telemetry_events.rs b/crates/telemetry_events/src/telemetry_events.rs index 3eb86ab4dd..e49676c41f 100644 --- a/crates/telemetry_events/src/telemetry_events.rs +++ b/crates/telemetry_events/src/telemetry_events.rs @@ -135,3 +135,21 @@ pub struct ExtensionEvent { pub struct AppEvent { pub operation: String, } + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct BacktraceFrame { + pub ip: usize, + pub symbol_addr: usize, + pub base: Option, + pub symbols: Vec, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct HangReport { + pub backtrace: Vec, + pub app_version: Option, + pub os_name: String, + pub os_version: Option, + pub architecture: String, + pub installation_id: Option, +} diff --git a/crates/zed/Cargo.toml b/crates/zed/Cargo.toml index 248b9488d4..a67f2cdf08 100644 --- a/crates/zed/Cargo.toml +++ b/crates/zed/Cargo.toml @@ -62,6 +62,7 @@ log.workspace = true markdown_preview.workspace = true menu.workspace = true mimalloc = "0.1" +nix = {workspace = true, features = ["pthread"] } node_runtime.workspace = true notifications.workspace = true outline.workspace = true @@ -84,6 +85,7 @@ smol.workspace = true tab_switcher.workspace = true task.workspace = true tasks_ui.workspace = true +telemetry_events.workspace = true terminal_view.workspace = true theme.workspace = true theme_selector.workspace = true diff --git a/crates/zed/src/main.rs b/crates/zed/src/main.rs index de8c95dbf5..6b13ae6bb7 100644 --- a/crates/zed/src/main.rs +++ b/crates/zed/src/main.rs @@ -3,11 +3,10 @@ // Disable command line from opening on release mode #![cfg_attr(not(debug_assertions), windows_subsystem = "windows")] +mod reliability; mod zed; use anyhow::{anyhow, Context as _, Result}; -use backtrace::Backtrace; -use chrono::Utc; use clap::{command, Parser}; use cli::FORCE_CLI_MODE_ENV_VAR_NAME; use client::{parse_zed_link, telemetry::Telemetry, Client, DevServerToken, UserStore}; @@ -19,11 +18,8 @@ use editor::{Editor, EditorMode}; use env_logger::Builder; use fs::RealFs; use futures::{future, StreamExt}; -use gpui::{ - App, AppContext, AsyncAppContext, Context, SemanticVersion, Task, ViewContext, VisualContext, -}; +use gpui::{App, AppContext, AsyncAppContext, Context, Task, ViewContext, VisualContext}; use image_viewer; -use isahc::{prelude::Configurable, Request}; use language::LanguageRegistry; use log::LevelFilter; @@ -31,8 +27,7 @@ use assets::Assets; use mimalloc::MiMalloc; use node_runtime::RealNodeRuntime; use parking_lot::Mutex; -use release_channel::{AppCommitSha, ReleaseChannel, RELEASE_CHANNEL}; -use serde::{Deserialize, Serialize}; +use release_channel::AppCommitSha; use settings::{ default_settings, handle_settings_file_changes, watch_config_file, Settings, SettingsStore, }; @@ -40,22 +35,16 @@ use simplelog::ConfigBuilder; use smol::process::Command; use std::{ env, - ffi::OsStr, fs::OpenOptions, io::{IsTerminal, Write}, - panic, path::Path, - sync::{ - atomic::{AtomicU32, Ordering}, - Arc, - }, - thread, + sync::Arc, }; use theme::{ActiveTheme, SystemAppearance, ThemeRegistry, ThemeSettings}; use util::{ - http::{HttpClient, HttpClientWithUrl}, + http::HttpClientWithUrl, maybe, parse_env_output, - paths::{self, CRASHES_DIR, CRASHES_RETIRED_DIR}, + paths::{self}, ResultExt, TryFutureExt, }; use uuid::Uuid; @@ -93,7 +82,18 @@ fn init_headless(dev_server_token: DevServerToken) { } init_logger(); - App::new().run(|cx| { + let app = App::new(); + + let session_id = Uuid::new_v4().to_string(); + let (installation_id, _) = app + .background_executor() + .block(installation_id()) + .ok() + .unzip(); + + reliability::init_panic_hook(&app, installation_id.clone(), session_id.clone()); + + app.run(|cx| { release_channel::init(env!("CARGO_PKG_VERSION"), cx); if let Some(build_sha) = option_env!("ZED_COMMIT_SHA") { AppCommitSha::set_global(AppCommitSha(build_sha.into()), cx); @@ -145,12 +145,7 @@ fn init_headless(dev_server_token: DevServerToken) { ); handle_settings_file_changes(user_settings_file_rx, cx); - let (installation_id, _) = cx - .background_executor() - .block(installation_id()) - .ok() - .unzip(); - upload_panics_and_crashes(client.http_client(), installation_id, cx); + reliability::init(client.http_client(), installation_id, cx); headless::init( client.clone(), @@ -189,7 +184,7 @@ fn init_ui(args: Args) { .ok() .unzip(); let session_id = Uuid::new_v4().to_string(); - init_panic_hook(&app, installation_id.clone(), session_id.clone()); + reliability::init_panic_hook(&app, installation_id.clone(), session_id.clone()); let git_binary_path = if option_env!("ZED_BUNDLE").as_deref() == Some("true") { app.path_for_auxiliary_executable("git") @@ -386,7 +381,7 @@ fn init_ui(args: Args) { cx.set_menus(app_menus()); initialize_workspace(app_state.clone(), cx); - upload_panics_and_crashes(client.http_client(), installation_id, cx); + reliability::init(client.http_client(), installation_id, cx); cx.activate(true); @@ -688,317 +683,6 @@ fn init_stdout_logger() { }) .init(); } - -#[derive(Serialize, Deserialize)] -struct LocationData { - file: String, - line: u32, -} - -#[derive(Serialize, Deserialize)] -struct Panic { - thread: String, - payload: String, - #[serde(skip_serializing_if = "Option::is_none")] - location_data: Option, - backtrace: Vec, - app_version: String, - release_channel: String, - os_name: String, - os_version: Option, - architecture: String, - panicked_on: i64, - #[serde(skip_serializing_if = "Option::is_none")] - installation_id: Option, - session_id: String, -} - -#[derive(Serialize)] -struct PanicRequest { - panic: Panic, -} - -static PANIC_COUNT: AtomicU32 = AtomicU32::new(0); - -fn init_panic_hook(app: &App, installation_id: Option, session_id: String) { - let is_pty = stdout_is_a_pty(); - let app_metadata = app.metadata(); - - panic::set_hook(Box::new(move |info| { - let prior_panic_count = PANIC_COUNT.fetch_add(1, Ordering::SeqCst); - if prior_panic_count > 0 { - // Give the panic-ing thread time to write the panic file - loop { - std::thread::yield_now(); - } - } - - let thread = thread::current(); - let thread_name = thread.name().unwrap_or(""); - - let payload = info - .payload() - .downcast_ref::<&str>() - .map(|s| s.to_string()) - .or_else(|| info.payload().downcast_ref::().map(|s| s.clone())) - .unwrap_or_else(|| "Box".to_string()); - - if *release_channel::RELEASE_CHANNEL == ReleaseChannel::Dev { - let location = info.location().unwrap(); - let backtrace = Backtrace::new(); - eprintln!( - "Thread {:?} panicked with {:?} at {}:{}:{}\n{:?}", - thread_name, - payload, - location.file(), - location.line(), - location.column(), - backtrace, - ); - std::process::exit(-1); - } - - let app_version = if let Some(version) = app_metadata.app_version { - version.to_string() - } else { - option_env!("CARGO_PKG_VERSION") - .unwrap_or("dev") - .to_string() - }; - - let backtrace = Backtrace::new(); - let mut backtrace = backtrace - .frames() - .iter() - .flat_map(|frame| { - frame - .symbols() - .iter() - .filter_map(|frame| Some(format!("{:#}", frame.name()?))) - }) - .collect::>(); - - // Strip out leading stack frames for rust panic-handling. - if let Some(ix) = backtrace - .iter() - .position(|name| name == "rust_begin_unwind") - { - backtrace.drain(0..=ix); - } - - let panic_data = Panic { - thread: thread_name.into(), - payload, - location_data: info.location().map(|location| LocationData { - file: location.file().into(), - line: location.line(), - }), - app_version: app_version.to_string(), - release_channel: RELEASE_CHANNEL.display_name().into(), - os_name: app_metadata.os_name.into(), - os_version: app_metadata - .os_version - .as_ref() - .map(SemanticVersion::to_string), - architecture: env::consts::ARCH.into(), - panicked_on: Utc::now().timestamp_millis(), - backtrace, - installation_id: installation_id.clone(), - session_id: session_id.clone(), - }; - - if let Some(panic_data_json) = serde_json::to_string_pretty(&panic_data).log_err() { - log::error!("{}", panic_data_json); - } - - if !is_pty { - if let Some(panic_data_json) = serde_json::to_string(&panic_data).log_err() { - let timestamp = chrono::Utc::now().format("%Y_%m_%d %H_%M_%S").to_string(); - let panic_file_path = paths::LOGS_DIR.join(format!("zed-{}.panic", timestamp)); - let panic_file = std::fs::OpenOptions::new() - .append(true) - .create(true) - .open(&panic_file_path) - .log_err(); - if let Some(mut panic_file) = panic_file { - writeln!(&mut panic_file, "{}", panic_data_json).log_err(); - panic_file.flush().log_err(); - } - } - } - - std::process::abort(); - })); -} - -fn upload_panics_and_crashes( - http: Arc, - installation_id: Option, - cx: &mut AppContext, -) { - let telemetry_settings = *client::TelemetrySettings::get_global(cx); - cx.background_executor() - .spawn(async move { - let most_recent_panic = upload_previous_panics(http.clone(), telemetry_settings) - .await - .log_err() - .flatten(); - upload_previous_crashes(http, most_recent_panic, installation_id, telemetry_settings) - .await - .log_err() - }) - .detach() -} - -/// Uploads panics via `zed.dev`. -async fn upload_previous_panics( - http: Arc, - telemetry_settings: client::TelemetrySettings, -) -> Result> { - let panic_report_url = http.build_url("/api/panic"); - let mut children = smol::fs::read_dir(&*paths::LOGS_DIR).await?; - - let mut most_recent_panic = None; - - while let Some(child) = children.next().await { - let child = child?; - let child_path = child.path(); - - if child_path.extension() != Some(OsStr::new("panic")) { - continue; - } - let filename = if let Some(filename) = child_path.file_name() { - filename.to_string_lossy() - } else { - continue; - }; - - if !filename.starts_with("zed") { - continue; - } - - if telemetry_settings.diagnostics { - let panic_file_content = smol::fs::read_to_string(&child_path) - .await - .context("error reading panic file")?; - - let panic: Option = serde_json::from_str(&panic_file_content) - .ok() - .or_else(|| { - panic_file_content - .lines() - .next() - .and_then(|line| serde_json::from_str(line).ok()) - }) - .unwrap_or_else(|| { - log::error!("failed to deserialize panic file {:?}", panic_file_content); - None - }); - - if let Some(panic) = panic { - most_recent_panic = Some((panic.panicked_on, panic.payload.clone())); - - let body = serde_json::to_string(&PanicRequest { panic }).unwrap(); - - let request = Request::post(&panic_report_url) - .redirect_policy(isahc::config::RedirectPolicy::Follow) - .header("Content-Type", "application/json") - .body(body.into())?; - let response = http.send(request).await.context("error sending panic")?; - if !response.status().is_success() { - log::error!("Error uploading panic to server: {}", response.status()); - } - } - } - - // We've done what we can, delete the file - std::fs::remove_file(child_path) - .context("error removing panic") - .log_err(); - } - Ok::<_, anyhow::Error>(most_recent_panic) -} - -static LAST_CRASH_UPLOADED: &'static str = "LAST_CRASH_UPLOADED"; - -/// upload crashes from apple's diagnostic reports to our server. -/// (only if telemetry is enabled) -async fn upload_previous_crashes( - http: Arc, - most_recent_panic: Option<(i64, String)>, - installation_id: Option, - telemetry_settings: client::TelemetrySettings, -) -> Result<()> { - if !telemetry_settings.diagnostics { - return Ok(()); - } - let last_uploaded = KEY_VALUE_STORE - .read_kvp(LAST_CRASH_UPLOADED)? - .unwrap_or("zed-2024-01-17-221900.ips".to_string()); // don't upload old crash reports from before we had this. - let mut uploaded = last_uploaded.clone(); - - let crash_report_url = http.build_zed_api_url("/telemetry/crashes", &[])?; - - // crash directories are only set on MacOS - for dir in [&*CRASHES_DIR, &*CRASHES_RETIRED_DIR] - .iter() - .filter_map(|d| d.as_deref()) - { - let mut children = smol::fs::read_dir(&dir).await?; - while let Some(child) = children.next().await { - let child = child?; - let Some(filename) = child - .path() - .file_name() - .map(|f| f.to_string_lossy().to_lowercase()) - else { - continue; - }; - - if !filename.starts_with("zed-") || !filename.ends_with(".ips") { - continue; - } - - if filename <= last_uploaded { - continue; - } - - let body = smol::fs::read_to_string(&child.path()) - .await - .context("error reading crash file")?; - - let mut request = Request::post(&crash_report_url.to_string()) - .redirect_policy(isahc::config::RedirectPolicy::Follow) - .header("Content-Type", "text/plain"); - - if let Some((panicked_on, payload)) = most_recent_panic.as_ref() { - request = request - .header("x-zed-panicked-on", format!("{}", panicked_on)) - .header("x-zed-panic", payload) - } - if let Some(installation_id) = installation_id.as_ref() { - request = request.header("x-zed-installation-id", installation_id); - } - - let request = request.body(body.into())?; - - let response = http.send(request).await.context("error sending crash")?; - if !response.status().is_success() { - log::error!("Error uploading crash to server: {}", response.status()); - } - - if uploaded < filename { - uploaded = filename.clone(); - KEY_VALUE_STORE - .write_kvp(LAST_CRASH_UPLOADED.to_string(), filename) - .await?; - } - } - } - - Ok(()) -} - async fn load_login_shell_environment() -> Result<()> { let marker = "ZED_LOGIN_SHELL_START"; let shell = env::var("SHELL").context( diff --git a/crates/zed/src/reliability.rs b/crates/zed/src/reliability.rs new file mode 100644 index 0000000000..b277235e6b --- /dev/null +++ b/crates/zed/src/reliability.rs @@ -0,0 +1,536 @@ +use anyhow::{Context, Result}; +use backtrace::{self, Backtrace}; +use chrono::Utc; +use db::kvp::KEY_VALUE_STORE; +use gpui::{App, AppContext, SemanticVersion}; +use isahc::config::Configurable; + +use paths::{CRASHES_DIR, CRASHES_RETIRED_DIR}; +use release_channel::ReleaseChannel; +use release_channel::RELEASE_CHANNEL; +use serde::{Deserialize, Serialize}; +use settings::Settings; +use smol::stream::StreamExt; +use std::{ + env, + ffi::OsStr, + sync::{atomic::Ordering, Arc}, +}; +use std::{io::Write, panic, sync::atomic::AtomicU32, thread}; +use util::{ + http::{self, HttpClient, HttpClientWithUrl}, + paths, ResultExt, +}; + +use crate::stdout_is_a_pty; + +#[derive(Serialize, Deserialize)] +struct LocationData { + file: String, + line: u32, +} + +#[derive(Serialize, Deserialize)] +struct Panic { + thread: String, + payload: String, + #[serde(skip_serializing_if = "Option::is_none")] + location_data: Option, + backtrace: Vec, + app_version: String, + release_channel: String, + os_name: String, + os_version: Option, + architecture: String, + panicked_on: i64, + #[serde(skip_serializing_if = "Option::is_none")] + installation_id: Option, + session_id: String, +} + +#[derive(Serialize)] +struct PanicRequest { + panic: Panic, +} + +static PANIC_COUNT: AtomicU32 = AtomicU32::new(0); + +pub fn init_panic_hook(app: &App, installation_id: Option, session_id: String) { + let is_pty = stdout_is_a_pty(); + let app_metadata = app.metadata(); + + panic::set_hook(Box::new(move |info| { + let prior_panic_count = PANIC_COUNT.fetch_add(1, Ordering::SeqCst); + if prior_panic_count > 0 { + // Give the panic-ing thread time to write the panic file + loop { + std::thread::yield_now(); + } + } + + let thread = thread::current(); + let thread_name = thread.name().unwrap_or(""); + + let payload = info + .payload() + .downcast_ref::<&str>() + .map(|s| s.to_string()) + .or_else(|| info.payload().downcast_ref::().map(|s| s.clone())) + .unwrap_or_else(|| "Box".to_string()); + + if *release_channel::RELEASE_CHANNEL == ReleaseChannel::Dev { + let location = info.location().unwrap(); + let backtrace = Backtrace::new(); + eprintln!( + "Thread {:?} panicked with {:?} at {}:{}:{}\n{:?}", + thread_name, + payload, + location.file(), + location.line(), + location.column(), + backtrace, + ); + std::process::exit(-1); + } + + let app_version = if let Some(version) = app_metadata.app_version { + version.to_string() + } else { + option_env!("CARGO_PKG_VERSION") + .unwrap_or("dev") + .to_string() + }; + + let backtrace = Backtrace::new(); + let mut backtrace = backtrace + .frames() + .iter() + .flat_map(|frame| { + frame + .symbols() + .iter() + .filter_map(|frame| Some(format!("{:#}", frame.name()?))) + }) + .collect::>(); + + // Strip out leading stack frames for rust panic-handling. + if let Some(ix) = backtrace + .iter() + .position(|name| name == "rust_begin_unwind") + { + backtrace.drain(0..=ix); + } + + let panic_data = Panic { + thread: thread_name.into(), + payload, + location_data: info.location().map(|location| LocationData { + file: location.file().into(), + line: location.line(), + }), + app_version: app_version.to_string(), + release_channel: RELEASE_CHANNEL.display_name().into(), + os_name: app_metadata.os_name.into(), + os_version: app_metadata + .os_version + .as_ref() + .map(SemanticVersion::to_string), + architecture: env::consts::ARCH.into(), + panicked_on: Utc::now().timestamp_millis(), + backtrace, + installation_id: installation_id.clone(), + session_id: session_id.clone(), + }; + + if let Some(panic_data_json) = serde_json::to_string_pretty(&panic_data).log_err() { + log::error!("{}", panic_data_json); + } + + if !is_pty { + if let Some(panic_data_json) = serde_json::to_string(&panic_data).log_err() { + let timestamp = chrono::Utc::now().format("%Y_%m_%d %H_%M_%S").to_string(); + let panic_file_path = paths::LOGS_DIR.join(format!("zed-{}.panic", timestamp)); + let panic_file = std::fs::OpenOptions::new() + .append(true) + .create(true) + .open(&panic_file_path) + .log_err(); + if let Some(mut panic_file) = panic_file { + writeln!(&mut panic_file, "{}", panic_data_json).log_err(); + panic_file.flush().log_err(); + } + } + } + + std::process::abort(); + })); +} + +pub fn init( + http_client: Arc, + installation_id: Option, + cx: &mut AppContext, +) { + #[cfg(target_os = "macos")] + monitor_main_thread_hangs(http_client.clone(), installation_id.clone(), cx); + + upload_panics_and_crashes(http_client, installation_id, cx) +} + +#[cfg(target_os = "macos")] +pub fn monitor_main_thread_hangs( + http_client: Arc, + installation_id: Option, + cx: &AppContext, +) { + use nix::sys::signal::{ + sigaction, SaFlags, SigAction, SigHandler, SigSet, + Signal::{self, SIGUSR2}, + }; + + use parking_lot::Mutex; + + use std::{ + ffi::c_int, + sync::{mpsc, OnceLock}, + time::Duration, + }; + use telemetry_events::{BacktraceFrame, HangReport}; + use util::http::Method; + + use nix::sys::pthread; + + let foreground_executor = cx.foreground_executor(); + let background_executor = cx.background_executor(); + let telemetry_settings = *client::TelemetrySettings::get_global(cx); + let metadata = cx.app_metadata(); + + // Initialize SIGUSR2 handler to send a backrace to a channel. + let (backtrace_tx, backtrace_rx) = mpsc::channel(); + static BACKTRACE: Mutex> = Mutex::new(Vec::new()); + static BACKTRACE_SENDER: OnceLock> = OnceLock::new(); + BACKTRACE_SENDER.get_or_init(|| backtrace_tx); + BACKTRACE.lock().reserve(100); + + fn handle_backtrace_signal() { + unsafe { + extern "C" fn handle_sigusr2(_i: c_int) { + unsafe { + // ASYNC SIGNAL SAFETY: This lock is only accessed one other time, + // which can only be triggered by This signal handler. In addition, + // this signal handler is immediately removed by SA_RESETHAND, and this + // signal handler cannot be re-entrant due to to the SIGUSR2 mask defined + // below + let mut bt = BACKTRACE.lock(); + bt.clear(); + backtrace::trace_unsynchronized(|frame| { + if bt.len() < bt.capacity() { + bt.push(frame.clone()); + true + } else { + false + } + }); + } + + BACKTRACE_SENDER.get().unwrap().send(()).ok(); + } + + let mut mask = SigSet::empty(); + mask.add(SIGUSR2); + sigaction( + Signal::SIGUSR2, + &SigAction::new( + SigHandler::Handler(handle_sigusr2), + SaFlags::SA_RESTART | SaFlags::SA_RESETHAND, + mask, + ), + ) + .log_err(); + } + } + + handle_backtrace_signal(); + let main_thread = pthread::pthread_self(); + + let (mut tx, mut rx) = futures::channel::mpsc::channel(3); + foreground_executor + .spawn(async move { while let Some(_) = rx.next().await {} }) + .detach(); + + background_executor + .spawn({ + let background_executor = background_executor.clone(); + async move { + loop { + background_executor.timer(Duration::from_secs(1)).await; + match tx.try_send(()) { + Ok(_) => continue, + Err(e) => { + if e.into_send_error().is_full() { + pthread::pthread_kill(main_thread, SIGUSR2).log_err(); + } + // Only detect the first hang + break; + } + } + } + } + }) + .detach(); + + background_executor + .clone() + .spawn(async move { + loop { + while let Some(_) = backtrace_rx.recv().ok() { + if !telemetry_settings.diagnostics { + return; + } + + // ASYNC SIGNAL SAFETY: This lock is only accessed _after_ + // the backtrace transmitter has fired, which itself is only done + // by the signal handler. And due to SA_RESETHAND the signal handler + // will not run again until `handle_backtrace_signal` is called. + let raw_backtrace = BACKTRACE.lock().drain(..).collect::>(); + let backtrace: Vec<_> = raw_backtrace + .into_iter() + .map(|frame| { + let mut btf = BacktraceFrame { + ip: frame.ip() as usize, + symbol_addr: frame.symbol_address() as usize, + base: frame.module_base_address().map(|addr| addr as usize), + symbols: vec![], + }; + + backtrace::resolve_frame(&frame, |symbol| { + if let Some(name) = symbol.name() { + btf.symbols.push(name.to_string()); + } + }); + + btf + }) + .collect(); + + // IMPORTANT: Don't move this to before `BACKTRACE.lock()` + handle_backtrace_signal(); + + log::error!( + "Suspected hang on main thread:\n{}", + backtrace + .iter() + .flat_map(|bt| bt.symbols.first().as_ref().map(|s| s.as_str())) + .collect::>() + .join("\n") + ); + + let report = HangReport { + backtrace, + app_version: metadata.app_version, + os_name: metadata.os_name.to_owned(), + os_version: metadata.os_version, + architecture: env::consts::ARCH.into(), + installation_id: installation_id.clone(), + }; + + let Some(json_bytes) = serde_json::to_vec(&report).log_err() else { + continue; + }; + + let Some(checksum) = client::telemetry::calculate_json_checksum(&json_bytes) + else { + continue; + }; + + let Ok(url) = http_client.build_zed_api_url("/telemetry/hangs", &[]) else { + continue; + }; + + let Ok(request) = http::Request::builder() + .method(Method::POST) + .uri(url.as_ref()) + .header("x-zed-checksum", checksum) + .body(json_bytes.into()) + else { + continue; + }; + + if let Some(response) = http_client.send(request).await.log_err() { + if response.status() != 200 { + log::error!("Failed to send hang report: HTTP {:?}", response.status()); + } + } + } + } + }) + .detach() +} + +fn upload_panics_and_crashes( + http: Arc, + installation_id: Option, + cx: &mut AppContext, +) { + let telemetry_settings = *client::TelemetrySettings::get_global(cx); + cx.background_executor() + .spawn(async move { + let most_recent_panic = upload_previous_panics(http.clone(), telemetry_settings) + .await + .log_err() + .flatten(); + upload_previous_crashes(http, most_recent_panic, installation_id, telemetry_settings) + .await + .log_err() + }) + .detach() +} + +/// Uploads panics via `zed.dev`. +async fn upload_previous_panics( + http: Arc, + telemetry_settings: client::TelemetrySettings, +) -> Result> { + let panic_report_url = http.build_url("/api/panic"); + let mut children = smol::fs::read_dir(&*paths::LOGS_DIR).await?; + + let mut most_recent_panic = None; + + while let Some(child) = children.next().await { + let child = child?; + let child_path = child.path(); + + if child_path.extension() != Some(OsStr::new("panic")) { + continue; + } + let filename = if let Some(filename) = child_path.file_name() { + filename.to_string_lossy() + } else { + continue; + }; + + if !filename.starts_with("zed") { + continue; + } + + if telemetry_settings.diagnostics { + let panic_file_content = smol::fs::read_to_string(&child_path) + .await + .context("error reading panic file")?; + + let panic: Option = serde_json::from_str(&panic_file_content) + .ok() + .or_else(|| { + panic_file_content + .lines() + .next() + .and_then(|line| serde_json::from_str(line).ok()) + }) + .unwrap_or_else(|| { + log::error!("failed to deserialize panic file {:?}", panic_file_content); + None + }); + + if let Some(panic) = panic { + most_recent_panic = Some((panic.panicked_on, panic.payload.clone())); + + let body = serde_json::to_string(&PanicRequest { panic }).unwrap(); + + let request = http::Request::post(&panic_report_url) + .redirect_policy(isahc::config::RedirectPolicy::Follow) + .header("Content-Type", "application/json") + .body(body.into())?; + let response = http.send(request).await.context("error sending panic")?; + if !response.status().is_success() { + log::error!("Error uploading panic to server: {}", response.status()); + } + } + } + + // We've done what we can, delete the file + std::fs::remove_file(child_path) + .context("error removing panic") + .log_err(); + } + Ok::<_, anyhow::Error>(most_recent_panic) +} + +static LAST_CRASH_UPLOADED: &'static str = "LAST_CRASH_UPLOADED"; + +/// upload crashes from apple's diagnostic reports to our server. +/// (only if telemetry is enabled) +async fn upload_previous_crashes( + http: Arc, + most_recent_panic: Option<(i64, String)>, + installation_id: Option, + telemetry_settings: client::TelemetrySettings, +) -> Result<()> { + if !telemetry_settings.diagnostics { + return Ok(()); + } + let last_uploaded = KEY_VALUE_STORE + .read_kvp(LAST_CRASH_UPLOADED)? + .unwrap_or("zed-2024-01-17-221900.ips".to_string()); // don't upload old crash reports from before we had this. + let mut uploaded = last_uploaded.clone(); + + let crash_report_url = http.build_zed_api_url("/telemetry/crashes", &[])?; + + // crash directories are only set on MacOS + for dir in [&*CRASHES_DIR, &*CRASHES_RETIRED_DIR] + .iter() + .filter_map(|d| d.as_deref()) + { + let mut children = smol::fs::read_dir(&dir).await?; + while let Some(child) = children.next().await { + let child = child?; + let Some(filename) = child + .path() + .file_name() + .map(|f| f.to_string_lossy().to_lowercase()) + else { + continue; + }; + + if !filename.starts_with("zed-") || !filename.ends_with(".ips") { + continue; + } + + if filename <= last_uploaded { + continue; + } + + let body = smol::fs::read_to_string(&child.path()) + .await + .context("error reading crash file")?; + + let mut request = http::Request::post(&crash_report_url.to_string()) + .redirect_policy(isahc::config::RedirectPolicy::Follow) + .header("Content-Type", "text/plain"); + + if let Some((panicked_on, payload)) = most_recent_panic.as_ref() { + request = request + .header("x-zed-panicked-on", format!("{}", panicked_on)) + .header("x-zed-panic", payload) + } + if let Some(installation_id) = installation_id.as_ref() { + request = request.header("x-zed-installation-id", installation_id); + } + + let request = request.body(body.into())?; + + let response = http.send(request).await.context("error sending crash")?; + if !response.status().is_success() { + log::error!("Error uploading crash to server: {}", response.status()); + } + + if uploaded < filename { + uploaded = filename.clone(); + KEY_VALUE_STORE + .write_kvp(LAST_CRASH_UPLOADED.to_string(), filename) + .await?; + } + } + } + + Ok(()) +}