Skip to content

Commit

Permalink
[crashtracker] Enable client to configure which signals to trap
Browse files Browse the repository at this point in the history
  • Loading branch information
danielsn committed Feb 5, 2025
1 parent f2fb806 commit e9867ad
Show file tree
Hide file tree
Showing 10 changed files with 242 additions and 83 deletions.
1 change: 1 addition & 0 deletions bin_tests/src/bin/crashtracker_bin_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ mod unix {
create_alt_stack: true,
use_alt_stack: true,
resolve_frames: crashtracker::StacktraceCollection::WithoutSymbols,
signals: vec![libc::SIGBUS, libc::SIGSEGV],
endpoint,
timeout_ms: TEST_COLLECTOR_TIMEOUT_MS,
unix_socket_path: Some("".to_string()),
Expand Down
4 changes: 4 additions & 0 deletions crashtracker-ffi/src/collector/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ pub struct Config<'a> {
/// If None, the crashtracker will infer the agent host from env variables.
pub endpoint: Option<&'a Endpoint>,
pub resolve_frames: StacktraceCollection,
/// The set of signals we should be registered for.
pub signals: Slice<'a, i32>,
/// Timeout in milliseconds before the signal handler starts tearing things down to return.
/// This is given as a uint32_t, but the actual timeout needs to fit inside of an i32 (max
/// 2^31-1). This is a limitation of the various interfaces used to guarantee the timeout.
Expand All @@ -84,6 +86,7 @@ impl<'a> TryFrom<Config<'a>> for datadog_crashtracker::CrashtrackerConfiguration
let use_alt_stack = value.use_alt_stack;
let endpoint = value.endpoint.cloned();
let resolve_frames = value.resolve_frames;
let signals = value.signals.iter().copied().collect();
let timeout_ms = value.timeout_ms;
let unix_socket_path = value.optional_unix_socket_filename.try_to_string_option()?;
Self::new(
Expand All @@ -92,6 +95,7 @@ impl<'a> TryFrom<Config<'a>> for datadog_crashtracker::CrashtrackerConfiguration
use_alt_stack,
endpoint,
resolve_frames,
signals,
timeout_ms,
unix_socket_path,
)
Expand Down
10 changes: 9 additions & 1 deletion crashtracker-ffi/src/collector/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use anyhow::Context;
pub use counters::*;
use datadog_crashtracker::CrashtrackerReceiverConfig;
pub use datatypes::*;
use ddcommon_ffi::{wrap_with_void_ffi_result, VoidResult};
use ddcommon_ffi::{wrap_with_void_ffi_result, Slice, VoidResult};
use function_name::named;
pub use spans::*;

Expand Down Expand Up @@ -133,3 +133,11 @@ pub unsafe extern "C" fn ddog_crasht_init_without_receiver(
)?
})
}

#[no_mangle]
/// Returns a list of signals suitable for use in a crashtracker config.
pub extern "C" fn ddog_crasht_default_signals<'a>() -> Slice<'a, libc::c_int> {
static DEFAULT_SYMBOLS: [libc::c_int; 4] =
[libc::SIGBUS, libc::SIGABRT, libc::SIGSEGV, libc::SIGILL];
Slice::new(&DEFAULT_SYMBOLS)
}
6 changes: 4 additions & 2 deletions crashtracker/src/collector/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,9 @@ pub fn init(
metadata: Metadata,
) -> anyhow::Result<()> {
update_metadata(metadata)?;
update_config(config)?;
update_config(config.clone())?;
configure_receiver(receiver_config);
register_crash_handlers()?;
register_crash_handlers(&config)?;
Ok(())
}

Expand Down Expand Up @@ -128,6 +128,7 @@ fn test_crash() -> anyhow::Result<()> {
use_alt_stack,
endpoint,
resolve_frames,
vec![libc::SIGBUS, libc::SIGSEGV],
timeout_ms,
None,
)?;
Expand Down Expand Up @@ -185,6 +186,7 @@ fn test_altstack_paradox() -> anyhow::Result<()> {
use_alt_stack,
endpoint,
resolve_frames,
vec![libc::SIGBUS, libc::SIGSEGV],
timeout_ms,
None,
);
Expand Down
123 changes: 58 additions & 65 deletions crashtracker/src/collector/crash_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use super::saguard::SaGuard;
use crate::crash_info::Metadata;
use crate::shared::configuration::{CrashtrackerConfiguration, CrashtrackerReceiverConfig};
use crate::shared::constants::*;
use crate::signal_from_signum;
use anyhow::Context;
use libc::{
c_void, execve, mmap, nfds_t, sigaltstack, siginfo_t, ucontext_t, MAP_ANON, MAP_FAILED,
Expand All @@ -19,6 +20,7 @@ use nix::sys::signal::{self, SaFlags, SigAction, SigHandler, SigSet};
use nix::sys::socket;
use nix::sys::wait::{waitpid, WaitPidFlag, WaitStatus};
use nix::unistd::{close, Pid};
use std::collections::HashMap;
use std::ffi::CString;
use std::fs::{File, OpenOptions};
use std::io::Write;
Expand Down Expand Up @@ -54,12 +56,6 @@ use libc::fork as vfork;
#[cfg(target_os = "linux")]
use libc::vfork;

#[derive(Debug)]
struct OldHandlers {
pub sigbus: SigAction,
pub sigsegv: SigAction,
}

struct Receiver {
receiver_uds: RawFd,
receiver_pid: i32,
Expand Down Expand Up @@ -255,7 +251,8 @@ fn wait_for_pollhup(target_fd: RawFd, timeout_ms: i32) -> anyhow::Result<bool> {
// This means that we can always clean up the memory inside one of these using
// `Box::from_raw` to recreate the box, then dropping it.
static ALTSTACK_INIT: AtomicBool = AtomicBool::new(false);
static OLD_HANDLERS: AtomicPtr<OldHandlers> = AtomicPtr::new(ptr::null_mut());
static OLD_HANDLERS: AtomicPtr<HashMap<i32, (signal::Signal, SigAction)>> =
AtomicPtr::new(ptr::null_mut());
static METADATA: AtomicPtr<(Metadata, String)> = AtomicPtr::new(ptr::null_mut());
static CONFIG: AtomicPtr<(CrashtrackerConfiguration, String)> = AtomicPtr::new(ptr::null_mut());
static RECEIVER_CONFIG: AtomicPtr<CrashtrackerReceiverConfig> = AtomicPtr::new(ptr::null_mut());
Expand Down Expand Up @@ -397,46 +394,36 @@ extern "C" fn handle_posix_sigaction(signum: i32, sig_info: *mut siginfo_t, ucon
// instant of time between when the handlers are registered, and the
// `OLD_HANDLERS` are set. This should be very short, but is hard to fully
// eliminate given the existing POSIX APIs.
let old_handlers = unsafe { &*OLD_HANDLERS.load(SeqCst) };
let old_sigaction = if signum == libc::SIGSEGV {
old_handlers.sigsegv
} else if signum == libc::SIGBUS {
old_handlers.sigbus
let old_handlers = unsafe { &*OLD_HANDLERS.swap(std::ptr::null_mut(), SeqCst) };

if let Some((signal, sigaction)) = old_handlers.get(&signum) {
// How we chain depends on what kind of handler we're chaining to.
// https://www.gnu.org/software/libc/manual/html_node/Signal-Handling.html
// https://man7.org/linux/man-pages/man2/sigaction.2.html
// Follow the approach here:
// https://stackoverflow.com/questions/6015498/executing-default-signal-handler
match sigaction.handler() {
SigHandler::SigDfl => {
// In the case of a default handler, we want to invoke it so that
// the core-dump can be generated. Restoring the handler then
// re-raising the signal accomplishes that.
unsafe { signal::sigaction(*signal, sigaction) }
.unwrap_or_else(|_| std::process::abort());
// Signals are only delivered once.
// In the case where we were invoked because of a crash, returning
// is technically UB but in practice re-invokes the crashing instr
// and re-raises the signal. In the case where we were invoked by
// `raise(SIGSEGV)` we need to re-raise the signal, or the default
// handler will never receive it.
unsafe { libc::raise(signum) };
}
SigHandler::SigIgn => (), // Return and ignore the signal.
SigHandler::Handler(f) => f(signum),
SigHandler::SigAction(f) => f(signum, sig_info, ucontext),
};
} else {
unreachable!("The only signals we're registered for are SEGV and BUS")
};

// How we chain depends on what kind of handler we're chaining to.
// https://www.gnu.org/software/libc/manual/html_node/Signal-Handling.html
// https://man7.org/linux/man-pages/man2/sigaction.2.html
// Follow the approach here:
// https://stackoverflow.com/questions/6015498/executing-default-signal-handler
match old_sigaction.handler() {
SigHandler::SigDfl => {
// In the case of a default handler, we want to invoke it so that
// the core-dump can be generated. Restoring the handler then
// re-raising the signal accomplishes that.
let signal = if signum == libc::SIGSEGV {
signal::SIGSEGV
} else if signum == libc::SIGBUS {
signal::SIGBUS
} else {
unreachable!("The only signals we're registered for are SEGV and BUS")
};
unsafe { signal::sigaction(signal, &old_sigaction) }
.unwrap_or_else(|_| std::process::abort());
// Signals are only delivered once.
// In the case where we were invoked because of a crash, returning
// is technically UB but in practice re-invokes the crashing instr
// and re-raises the signal. In the case where we were invoked by
// `raise(SIGSEGV)` we need to re-raise the signal, or the default
// handler will never receive it.
unsafe { libc::raise(signum) };
}
SigHandler::SigIgn => (), // Return and ignore the signal.
SigHandler::Handler(f) => f(signum),
SigHandler::SigAction(f) => f(signum, sig_info, ucontext),
};
eprintln!("Unexpected signal number {signum}, can't chain the handlers");
}
}

fn receiver_from_socket(unix_socket_path: &str) -> anyhow::Result<Receiver> {
Expand Down Expand Up @@ -611,29 +598,33 @@ fn handle_posix_signal_impl(
/// will not yet be stored. This would lead to unexpected behaviour for the
/// user. This should only matter if something crashes concurrently with
/// this function executing.
pub fn register_crash_handlers() -> anyhow::Result<()> {
pub fn register_crash_handlers(config: &CrashtrackerConfiguration) -> anyhow::Result<()> {
if !OLD_HANDLERS.load(SeqCst).is_null() {
return Ok(());
}

let config_ptr = CONFIG.load(SeqCst);
anyhow::ensure!(!config_ptr.is_null(), "No crashtracking config");
let (config, _config_str) = unsafe { config_ptr.as_ref().context("config ptr")? };

unsafe {
if config.create_alt_stack {
create_alt_stack()?;
}
let sigbus = register_signal_handler(signal::SIGBUS, config)?;
let sigsegv = register_signal_handler(signal::SIGSEGV, config)?;
let boxed_ptr = Box::into_raw(Box::new(OldHandlers { sigbus, sigsegv }));

let res = OLD_HANDLERS.compare_exchange(ptr::null_mut(), boxed_ptr, SeqCst, SeqCst);
if config.create_alt_stack {
unsafe { create_alt_stack()? };
}
let mut old_handlers = HashMap::new();
// TODO: if this fails, we end in a situation where handlers have been registered with no chain
for signum in &config.signals {
let signal = signal_from_signum(*signum)?;
anyhow::ensure!(
res.is_ok(),
"TOCTTOU error in crashtracker::register_crash_handlers"
!old_handlers.contains_key(signum),
"Handler already registered for {signal}"
);
let handler = unsafe { register_signal_handler(signal, config)? };
old_handlers.insert(*signum, (signal, handler));
}
let boxed_ptr = Box::into_raw(Box::new(old_handlers));

let res = OLD_HANDLERS.compare_exchange(ptr::null_mut(), boxed_ptr, SeqCst, SeqCst);
anyhow::ensure!(
res.is_ok(),
"TOCTTOU error in crashtracker::register_crash_handlers"
);

Ok(())
}

Expand Down Expand Up @@ -674,9 +665,11 @@ pub fn restore_old_handlers(inside_signal_handler: bool) -> anyhow::Result<()> {
anyhow::ensure!(!prev.is_null(), "No crashtracking previous signal handlers");
// Safety: The only nonnull pointer stored here comes from Box::into_raw()
let prev = unsafe { Box::from_raw(prev) };
// Safety: The value restored here was returned from a previous sigaction call
unsafe { signal::sigaction(signal::SIGBUS, &prev.sigbus)? };
unsafe { signal::sigaction(signal::SIGSEGV, &prev.sigsegv)? };
for (_signum, (signal, sigaction)) in prev.iter() {
// Safety: The value restored here was returned from a previous sigaction call
unsafe { signal::sigaction(*signal, sigaction)? };
}

// We want to avoid freeing memory inside the handler, so just leak it
// This is fine since we're crashing anyway at this point
if inside_signal_handler {
Expand Down
22 changes: 10 additions & 12 deletions crashtracker/src/collector/emitters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use crate::collector::counters::emit_counters;
use crate::collector::spans::{emit_spans, emit_traces};
use crate::shared::constants::*;
use crate::CrashtrackerConfiguration;
use crate::SignalNames;
use crate::StacktraceCollection;
use anyhow::Context;
use backtrace::Frame;
Expand Down Expand Up @@ -185,19 +186,16 @@ fn emit_siginfo(w: &mut impl Write, sig_info: *const siginfo_t) -> anyhow::Resul
anyhow::ensure!(!sig_info.is_null());

let si_signo = unsafe { (*sig_info).si_signo };
let si_signo_human_readable = match si_signo {
libc::SIGABRT => "SIGABRT",
libc::SIGBUS => "SIGBUS",
libc::SIGSEGV => "SIGSEGV",
libc::SIGSYS => "SIGSYS",
_ => "UNKNOWN",
};
let si_signo_human_readable: SignalNames = si_signo.into();

// Derive the faulting address from `sig_info`
let si_addr: Option<usize> = if si_signo == libc::SIGSEGV || si_signo == libc::SIGBUS {
unsafe { Some((*sig_info).si_addr() as usize) }
} else {
None
// https://man7.org/linux/man-pages/man2/sigaction.2.html
// SIGILL, SIGFPE, SIGSEGV, SIGBUS, and SIGTRAP fill in si_addr with the address of the fault.
let si_addr: Option<usize> = match si_signo {
libc::SIGILL | libc::SIGFPE | libc::SIGSEGV | libc::SIGBUS | libc::SIGTRAP => {
Some(unsafe { (*sig_info).si_addr() as usize })
}
_ => None,
};

let si_code = unsafe { (*sig_info).si_code };
Expand All @@ -214,7 +212,7 @@ fn emit_siginfo(w: &mut impl Write, sig_info: *const siginfo_t) -> anyhow::Resul
write!(w, ", \"si_signo\": {si_signo}")?;
write!(
w,
", \"si_signo_human_readable\": \"{si_signo_human_readable}\""
", \"si_signo_human_readable\": \"{si_signo_human_readable:?}\""
)?;
if let Some(si_addr) = si_addr {
write!(w, ", \"si_addr\": \"{si_addr:#018x}\"")?;
Expand Down
Loading

0 comments on commit e9867ad

Please sign in to comment.