Skip to content

Commit c199419

Browse files
authored
fix(prof): avoid dlclose if threads did not join (#3075)
* fix(prof): avoid dlclose if threads did not join If the PHP engine dlclose's the handle and the shared object is unloaded while another thread is running, we've hit undefined behavior. This also probably results in a crash (on platforms that unload instead of no-op). This may be the source of a crash when php-fpm does a log rotate. When doing the rotate, php-fpm shuts down all workers. If a worker is slow to process an upload and the timeout is hit, then we could hit this issue. * refactor to panic::resume_unwind immediately * style: reconnect safety comment to unsafe block
1 parent 9cc3c1f commit c199419

File tree

6 files changed

+99
-49
lines changed

6 files changed

+99
-49
lines changed

Diff for: Cargo.lock

+37-16
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: profiling/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ serde_json = {version = "1.0"}
3333
rand = { version = "0.8.5" }
3434
rand_distr = { version = "0.4.3" }
3535
rustc-hash = "1.1.0"
36+
thiserror = "2"
3637
uuid = { version = "1.0", features = ["v4"] }
3738

3839
[dev-dependencies]

Diff for: profiling/src/lib.rs

+15-6
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ mod exception;
2222
mod timeline;
2323

2424
use crate::config::{SystemSettings, INITIAL_SYSTEM_SETTINGS};
25+
use crate::zend::datadog_sapi_globals_request_info;
2526
use bindings::{
2627
self as zend, ddog_php_prof_php_version, ddog_php_prof_php_version_id, ZendExtension,
2728
ZendResult,
@@ -46,8 +47,6 @@ use std::sync::{Arc, Once};
4647
use std::time::{Duration, Instant};
4748
use uuid::Uuid;
4849

49-
use crate::zend::datadog_sapi_globals_request_info;
50-
5150
/// Name of the profiling module and zend_extension. Must not contain any
5251
/// interior null bytes and must be null terminated.
5352
static PROFILER_NAME: &[u8] = b"datadog-profiling\0";
@@ -889,11 +888,21 @@ extern "C" fn startup(extension: *mut ZendExtension) -> ZendResult {
889888
ZendResult::Success
890889
}
891890

892-
extern "C" fn shutdown(_extension: *mut ZendExtension) {
891+
extern "C" fn shutdown(extension: *mut ZendExtension) {
893892
#[cfg(debug_assertions)]
894-
trace!("shutdown({:p})", _extension);
895-
896-
Profiler::shutdown(Duration::from_secs(2));
893+
trace!("shutdown({:p})", extension);
894+
895+
// If a timeout was reached, then the thread is possibly alive.
896+
// This means the engine cannot unload our handle, or else we'd hit
897+
// immediate undefined behavior (and likely crash).
898+
if let Err(err) = Profiler::shutdown(Duration::from_secs(2)) {
899+
let num_failures = err.num_failures;
900+
error!("{num_failures} thread(s) failed to join, intentionally leaking the extension's handle to prevent unloading");
901+
// SAFETY: during mshutdown, we have ownership of the extension struct.
902+
// Our threads (which failed to join) do not mutate this struct at all
903+
// either, providing no races.
904+
unsafe { (*extension).handle = ptr::null_mut() }
905+
}
897906

898907
// SAFETY: calling in shutdown before zai config is shutdown, and after
899908
// all configuration is done being accessed. Well... in the happy-path,

Diff for: profiling/src/php_ffi.c

+1
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ void datadog_php_profiling_copy_string_view_into_zval(zval *dest, zai_str view,
265265
#ifdef CFG_TEST
266266
(void)dest;
267267
(void)view;
268+
(void)persistent;
268269
ZEND_ASSERT(0);
269270
#else
270271
if (view.len == 0) {

Diff for: profiling/src/profiling/mod.rs

+25-15
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@ use crate::bindings::{datadog_php_profiling_get_profiling_context, zend_execute_
1919
use crate::config::SystemSettings;
2020
use crate::{CLOCKS, TAGS};
2121
use chrono::Utc;
22-
#[cfg(feature = "timeline")]
23-
use core::{ptr, str};
2422
use crossbeam_channel::{Receiver, Sender, TrySendError};
2523
use datadog_profiling::api::{
2624
Function, Label as ApiLabel, Location, Period, Sample, ValueType as ApiValueType,
@@ -40,6 +38,8 @@ use std::sync::{Arc, Barrier};
4038
use std::thread::JoinHandle;
4139
use std::time::{Duration, Instant, SystemTime};
4240

41+
#[cfg(feature = "timeline")]
42+
use core::{ptr, str};
4343
#[cfg(feature = "timeline")]
4444
use std::time::UNIX_EPOCH;
4545

@@ -689,33 +689,39 @@ impl Profiler {
689689
/// Completes the shutdown process; to start it, call [Profiler::stop]
690690
/// before calling [Profiler::shutdown].
691691
/// Note the timeout is per thread, and there may be multiple threads.
692+
/// Returns Ok(true) if any thread hit a timeout.
692693
///
693694
/// Safety: only safe to be called in `SHUTDOWN`/`MSHUTDOWN` phase
694-
pub fn shutdown(timeout: Duration) {
695+
pub fn shutdown(timeout: Duration) -> Result<(), JoinError> {
695696
// SAFETY: the `take` access is a thread-safe API, and the PROFILER is
696697
// not being mutated outside single-threaded phases such as minit and
697698
// mshutdown.
698699
if let Some(profiler) = unsafe { PROFILER.take() } {
699-
profiler.join_collector_and_uploader(timeout);
700+
profiler.join_collector_and_uploader(timeout)
701+
} else {
702+
Ok(())
700703
}
701704
}
702705

703-
pub fn join_collector_and_uploader(self, timeout: Duration) {
706+
fn join_collector_and_uploader(self, timeout: Duration) -> Result<(), JoinError> {
704707
if self.should_join.load(Ordering::SeqCst) {
705-
thread_utils::join_timeout(
706-
self.time_collector_handle,
707-
timeout,
708-
"Recent samples may be lost.",
709-
);
708+
let result1 = thread_utils::join_timeout(self.time_collector_handle, timeout);
709+
if let Err(err) = &result1 {
710+
warn!("{err}, recent samples may be lost");
711+
}
710712

711713
// Wait for the time_collector to join, since that will drop
712714
// the sender half of the channel that the uploader is
713715
// holding, allowing it to finish.
714-
thread_utils::join_timeout(
715-
self.uploader_handle,
716-
timeout,
717-
"Recent samples are most likely lost.",
718-
);
716+
let result2 = thread_utils::join_timeout(self.uploader_handle, timeout);
717+
if let Err(err) = &result2 {
718+
warn!("{err}, recent samples are most likely lost");
719+
}
720+
721+
let num_failures = result1.is_err() as usize + result2.is_err() as usize;
722+
result2.or(result1).map_err(|_| JoinError { num_failures })
723+
} else {
724+
Ok(())
719725
}
720726
}
721727

@@ -1288,6 +1294,10 @@ impl Profiler {
12881294
}
12891295
}
12901296

1297+
pub struct JoinError {
1298+
pub num_failures: usize,
1299+
}
1300+
12911301
#[cfg(test)]
12921302
mod tests {
12931303
use super::*;

Diff for: profiling/src/profiling/thread_utils.rs

+20-12
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
#[cfg(php_zts)]
2-
use crate::sapi::Sapi;
31
use crate::SAPI;
4-
#[cfg(php_zts)]
5-
use libc::c_char;
62
use libc::sched_yield;
7-
use log::warn;
83
use once_cell::sync::OnceCell;
94
use std::mem::MaybeUninit;
105
use std::thread::JoinHandle;
116
use std::time::{Duration, Instant};
127

8+
#[cfg(php_zts)]
9+
use crate::sapi::Sapi;
10+
#[cfg(php_zts)]
11+
use libc::c_char;
12+
1313
/// Spawns a thread and masks off the signals that the Zend Engine uses.
1414
pub fn spawn<F, T>(name: &str, f: F) -> JoinHandle<T>
1515
where
@@ -50,11 +50,18 @@ where
5050
}
5151
}
5252

53+
#[derive(thiserror::Error, Debug)]
54+
#[error("timeout of {timeout_ms} ms reached when joining thread {thread}")]
55+
pub struct TimeoutError {
56+
thread: String,
57+
timeout_ms: u128,
58+
}
59+
5360
/// Waits for the handle to be finished. If finished, it will join the handle.
54-
/// Otherwise, it will leak the handle.
61+
/// Otherwise, it will leak the handle and return an error.
5562
/// # Panics
56-
/// Panics if the thread being joined has panic'd.
57-
pub fn join_timeout(handle: JoinHandle<()>, timeout: Duration, impact: &str) {
63+
/// If the thread being joined has panic'd, this will resume the panic.
64+
pub fn join_timeout(handle: JoinHandle<()>, timeout: Duration) -> Result<(), TimeoutError> {
5865
// After notifying the other threads, it's likely they'll need some time
5966
// to respond adequately. Joining on the JoinHandle is supposed to be the
6067
// correct way to do this, but we've observed this can panic:
@@ -65,15 +72,16 @@ pub fn join_timeout(handle: JoinHandle<()>, timeout: Duration, impact: &str) {
6572
while !handle.is_finished() {
6673
unsafe { sched_yield() };
6774
if start.elapsed() >= timeout {
68-
let name = handle.thread().name().unwrap_or("{unknown}");
69-
warn!("Timeout of {timeout:?} reached when joining thread '{name}'. {impact}");
70-
return;
75+
let thread = handle.thread().name().unwrap_or("{unknown}").to_string();
76+
let timeout_ms = timeout.as_millis();
77+
return Err(TimeoutError { thread, timeout_ms });
7178
}
7279
}
7380

7481
if let Err(err) = handle.join() {
75-
std::panic::resume_unwind(err)
82+
std::panic::resume_unwind(err);
7683
}
84+
Ok(())
7785
}
7886

7987
thread_local! {

0 commit comments

Comments
 (0)