Skip to content

Commit a33e3fe

Browse files
committed
fix(prof): avoid dlclose if threads did not join
If the PHP engine dlclose's the handle and the shared object is unloaded while another thread is running, we've hit undefined behavior. This also probably results in a crash (on platforms that unload instead of no-op). This may be the source of a crash when php-fpm does a log rotate. When doing the rotate, php-fpm shuts down all workers. If a worker is slow to process an upload and the timeout is hit, then we could hit this issue.
1 parent 9cc3c1f commit a33e3fe

File tree

5 files changed

+132
-47
lines changed

5 files changed

+132
-47
lines changed

Diff for: Cargo.lock

+37-16
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: profiling/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ serde_json = {version = "1.0"}
3333
rand = { version = "0.8.5" }
3434
rand_distr = { version = "0.4.3" }
3535
rustc-hash = "1.1.0"
36+
thiserror = "2"
3637
uuid = { version = "1.0", features = ["v4"] }
3738

3839
[dev-dependencies]

Diff for: profiling/src/lib.rs

+23-6
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ mod exception;
2222
mod timeline;
2323

2424
use crate::config::{SystemSettings, INITIAL_SYSTEM_SETTINGS};
25+
use crate::profiling::ShutdownError;
26+
use crate::zend::datadog_sapi_globals_request_info;
2527
use bindings::{
2628
self as zend, ddog_php_prof_php_version, ddog_php_prof_php_version_id, ZendExtension,
2729
ZendResult,
@@ -46,8 +48,6 @@ use std::sync::{Arc, Once};
4648
use std::time::{Duration, Instant};
4749
use uuid::Uuid;
4850

49-
use crate::zend::datadog_sapi_globals_request_info;
50-
5151
/// Name of the profiling module and zend_extension. Must not contain any
5252
/// interior null bytes and must be null terminated.
5353
static PROFILER_NAME: &[u8] = b"datadog-profiling\0";
@@ -889,11 +889,28 @@ extern "C" fn startup(extension: *mut ZendExtension) -> ZendResult {
889889
ZendResult::Success
890890
}
891891

892-
extern "C" fn shutdown(_extension: *mut ZendExtension) {
892+
extern "C" fn shutdown(extension: *mut ZendExtension) {
893893
#[cfg(debug_assertions)]
894-
trace!("shutdown({:p})", _extension);
895-
896-
Profiler::shutdown(Duration::from_secs(2));
894+
trace!("shutdown({:p})", extension);
895+
896+
match Profiler::shutdown(Duration::from_secs(2)) {
897+
Ok(hit_timeout) => {
898+
// If a timeout was reached, then the thread is probably alive.
899+
// This means the engine cannot unload our handle, or else we'd
900+
// hit immediate undefined behavior (and likely crash).
901+
if hit_timeout {
902+
// SAFETY: during mshutdown, we have ownership of the extension
903+
// struct. Our threads (which failed to join) do not mutate
904+
// this struct at all either, providing no races.
905+
unsafe { (*extension).handle = ptr::null_mut() }
906+
}
907+
}
908+
Err(err) => match err {
909+
// todo: do we actually need to panic/unwind here? We're already
910+
// shutting down... can we just be graceful?
911+
ShutdownError::ThreadPanic { payload } => std::panic::resume_unwind(payload.0),
912+
},
913+
}
897914

898915
// SAFETY: calling in shutdown before zai config is shutdown, and after
899916
// all configuration is done being accessed. Well... in the happy-path,

Diff for: profiling/src/profiling/mod.rs

+46-15
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,9 @@ use crate::bindings::ddog_php_prof_get_active_fiber_test as ddog_php_prof_get_ac
1717

1818
use crate::bindings::{datadog_php_profiling_get_profiling_context, zend_execute_data};
1919
use crate::config::SystemSettings;
20+
use crate::profiling::thread_utils::{JoinError, ThreadPanicError};
2021
use crate::{CLOCKS, TAGS};
2122
use chrono::Utc;
22-
#[cfg(feature = "timeline")]
23-
use core::{ptr, str};
2423
use crossbeam_channel::{Receiver, Sender, TrySendError};
2524
use datadog_profiling::api::{
2625
Function, Label as ApiLabel, Location, Period, Sample, ValueType as ApiValueType,
@@ -35,11 +34,14 @@ use std::hash::Hash;
3534
use std::intrinsics::transmute;
3635
use std::mem::forget;
3736
use std::num::NonZeroI64;
37+
use std::ops::BitOrAssign;
3838
use std::sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, Ordering};
3939
use std::sync::{Arc, Barrier};
4040
use std::thread::JoinHandle;
4141
use std::time::{Duration, Instant, SystemTime};
4242

43+
#[cfg(feature = "timeline")]
44+
use core::{ptr, str};
4345
#[cfg(feature = "timeline")]
4446
use std::time::UNIX_EPOCH;
4547

@@ -689,33 +691,53 @@ impl Profiler {
689691
/// Completes the shutdown process; to start it, call [Profiler::stop]
690692
/// before calling [Profiler::shutdown].
691693
/// Note the timeout is per thread, and there may be multiple threads.
694+
/// Returns Ok(true) if any thread hit a timeout.
692695
///
693696
/// Safety: only safe to be called in `SHUTDOWN`/`MSHUTDOWN` phase
694-
pub fn shutdown(timeout: Duration) {
697+
pub fn shutdown(timeout: Duration) -> Result<bool, ShutdownError> {
695698
// SAFETY: the `take` access is a thread-safe API, and the PROFILER is
696699
// not being mutated outside single-threaded phases such as minit and
697700
// mshutdown.
698701
if let Some(profiler) = unsafe { PROFILER.take() } {
699-
profiler.join_collector_and_uploader(timeout);
702+
Ok(profiler.join_collector_and_uploader(timeout)?)
703+
} else {
704+
Ok(false)
700705
}
701706
}
702707

703-
pub fn join_collector_and_uploader(self, timeout: Duration) {
708+
fn join_collector_and_uploader(self, timeout: Duration) -> Result<bool, ThreadPanicError> {
704709
if self.should_join.load(Ordering::SeqCst) {
705-
thread_utils::join_timeout(
706-
self.time_collector_handle,
707-
timeout,
708-
"Recent samples may be lost.",
709-
);
710+
let result = thread_utils::join_timeout(self.time_collector_handle, timeout);
711+
let mut hit_timeout = if let Err(err) = result {
712+
match err {
713+
JoinError::ThreadPanic(err) => Err(err)?,
714+
timeout_err => {
715+
warn!("{}, recent samples may be lost", timeout_err);
716+
true
717+
}
718+
}
719+
} else {
720+
false
721+
};
710722

711723
// Wait for the time_collector to join, since that will drop
712724
// the sender half of the channel that the uploader is
713725
// holding, allowing it to finish.
714-
thread_utils::join_timeout(
715-
self.uploader_handle,
716-
timeout,
717-
"Recent samples are most likely lost.",
718-
);
726+
let result = thread_utils::join_timeout(self.uploader_handle, timeout);
727+
hit_timeout.bitor_assign(if let Err(err) = result {
728+
match err {
729+
JoinError::ThreadPanic(err) => Err(err)?,
730+
timeout_err => {
731+
warn!("{}, recent samples are most likely lost", timeout_err);
732+
true
733+
}
734+
}
735+
} else {
736+
false
737+
});
738+
Ok(hit_timeout)
739+
} else {
740+
Ok(false)
719741
}
720742
}
721743

@@ -1288,6 +1310,15 @@ impl Profiler {
12881310
}
12891311
}
12901312

1313+
#[derive(thiserror::Error, Debug)]
1314+
pub enum ShutdownError {
1315+
#[error(transparent)]
1316+
ThreadPanic {
1317+
#[from]
1318+
payload: ThreadPanicError,
1319+
},
1320+
}
1321+
12911322
#[cfg(test)]
12921323
mod tests {
12931324
use super::*;

0 commit comments

Comments
 (0)