From c6e8a8f679f40d2528008918e2d636e0b48b8107 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 24 Jun 2023 10:13:17 -0700 Subject: [PATCH] tracing: fix recursive `register_callsite` deadlock (#2634) ## Motivation A deadlock exists when a collector's `register_callsite` method calls into code that also contains tracing instrumentation and triggers a second `register_callsite` call for the same callsite. This is because the current implementation of the `MacroCallsite` type holds a `core::sync::Once` which it uses to ensure that it is only added to the callsite registry a single time. This deadlock was fixed in v0.1.x in PR #2083, but the issue still exists on v0.2.x. ## Solution This branch forward-ports the solution from #2083. Rather than using a `core::sync::Once`, we now track the callsite's registration state directly in `MacroCallsite`. If a callsite has started registering, but has not yet completed, subsequent `register` calls will just immediately receive an `Interest::sometimes` until the registration has completed, rather than waiting to attempt their own registration. I've also forward-ported the tests for this that were added in #2083. --- tracing/src/lib.rs | 59 +++++++++++++++---- .../tests/rebuild_interest_doesnt_deadlock.rs | 51 ++++++++++++++++ .../register_callsite_doesnt_deadlock.rs | 49 +++++++++++++++ 3 files changed, 146 insertions(+), 13 deletions(-) create mode 100644 tracing/tests/rebuild_interest_doesnt_deadlock.rs create mode 100644 tracing/tests/register_callsite_doesnt_deadlock.rs diff --git a/tracing/src/lib.rs b/tracing/src/lib.rs index 916cf03cdf..1494048523 100644 --- a/tracing/src/lib.rs +++ b/tracing/src/lib.rs @@ -988,8 +988,7 @@ pub mod __macro_support { pub use crate::callsite::{Callsite, Registration}; use crate::{collect::Interest, Metadata}; use core::fmt; - use core::sync::atomic::{AtomicUsize, Ordering}; - use tracing_core::Once; + use core::sync::atomic::{AtomicU8, Ordering}; /// Callsite implementation used by macro-generated code. /// @@ -1003,9 +1002,9 @@ pub mod __macro_support { where T: 'static, { - interest: AtomicUsize, + interest: AtomicU8, + register: AtomicU8, meta: &'static Metadata<'static>, - register: Once, registration: &'static Registration, } @@ -1023,12 +1022,21 @@ pub mod __macro_support { registration: &'static Registration, ) -> Self { Self { - interest: AtomicUsize::new(0xDEADFACED), + interest: AtomicU8::new(Self::INTEREST_EMPTY), + register: AtomicU8::new(Self::UNREGISTERED), meta, - register: Once::new(), registration, } } + + const UNREGISTERED: u8 = 0; + const REGISTERING: u8 = 1; + const REGISTERED: u8 = 2; + + const INTEREST_NEVER: u8 = 0; + const INTEREST_SOMETIMES: u8 = 1; + const INTEREST_ALWAYS: u8 = 2; + const INTEREST_EMPTY: u8 = 0xFF; } impl MacroCallsite<&'static dyn Callsite> { @@ -1046,11 +1054,36 @@ pub mod __macro_support { // This only happens once (or if the cached interest value was corrupted). #[cold] pub fn register(&'static self) -> Interest { - self.register - .call_once(|| crate::callsite::register(self.registration)); + // Attempt to advance the registration state to `REGISTERING`... + match self.register.compare_exchange( + Self::UNREGISTERED, + Self::REGISTERING, + Ordering::AcqRel, + Ordering::Acquire, + ) { + Ok(_) => { + // Okay, we advanced the state, try to register the callsite. + crate::callsite::register(self.registration); + self.register.store(Self::REGISTERED, Ordering::Release); + } + // Great, the callsite is already registered! Just load its + // previous cached interest. + Err(Self::REGISTERED) => {} + // Someone else is registering... + Err(_state) => { + debug_assert_eq!( + _state, + Self::REGISTERING, + "weird callsite registration state" + ); + // Just hit `enabled` this time. + return Interest::sometimes(); + } + } + match self.interest.load(Ordering::Relaxed) { - 0 => Interest::never(), - 2 => Interest::always(), + Self::INTEREST_NEVER => Interest::never(), + Self::INTEREST_ALWAYS => Interest::always(), _ => Interest::sometimes(), } } @@ -1067,9 +1100,9 @@ pub mod __macro_support { #[inline] pub fn interest(&'static self) -> Interest { match self.interest.load(Ordering::Relaxed) { - 0 => Interest::never(), - 1 => Interest::sometimes(), - 2 => Interest::always(), + Self::INTEREST_NEVER => Interest::never(), + Self::INTEREST_SOMETIMES => Interest::sometimes(), + Self::INTEREST_ALWAYS => Interest::always(), _ => self.register(), } } diff --git a/tracing/tests/rebuild_interest_doesnt_deadlock.rs b/tracing/tests/rebuild_interest_doesnt_deadlock.rs new file mode 100644 index 0000000000..cd877cbafd --- /dev/null +++ b/tracing/tests/rebuild_interest_doesnt_deadlock.rs @@ -0,0 +1,51 @@ +use std::{sync::mpsc, thread, time::Duration}; +use tracing::{ + callsite, + collect::{self, Collect, Interest}, + metadata::Metadata, + span, Event, +}; + +#[test] +fn rebuild_callsites_doesnt_deadlock() { + pub struct EvilCollector; + + impl Collect for EvilCollector { + fn register_callsite(&self, meta: &'static Metadata<'static>) -> Interest { + tracing::info!(?meta, "registered a callsite"); + Interest::always() + } + + fn enabled(&self, _: &Metadata<'_>) -> bool { + true + } + fn new_span(&self, _: &span::Attributes<'_>) -> span::Id { + span::Id::from_u64(1) + } + fn record(&self, _: &span::Id, _: &span::Record<'_>) {} + fn record_follows_from(&self, _: &span::Id, _: &span::Id) {} + fn event(&self, _: &Event<'_>) {} + fn enter(&self, _: &span::Id) {} + fn exit(&self, _: &span::Id) {} + fn current_span(&self) -> tracing_core::span::Current { + unimplemented!() + } + } + + collect::set_global_default(EvilCollector).unwrap(); + + // spawn a thread, and assert it doesn't hang... + let (tx, didnt_hang) = mpsc::channel(); + let th = thread::spawn(move || { + tracing::info!("hello world!"); + callsite::rebuild_interest_cache(); + tx.send(()).unwrap(); + }); + + didnt_hang + // Note: 60 seconds is *way* more than enough, but let's be generous in + // case of e.g. slow CI machines. + .recv_timeout(Duration::from_secs(60)) + .expect("the thread must not have hung!"); + th.join().expect("thread should join successfully"); +} diff --git a/tracing/tests/register_callsite_doesnt_deadlock.rs b/tracing/tests/register_callsite_doesnt_deadlock.rs new file mode 100644 index 0000000000..32f3e686bc --- /dev/null +++ b/tracing/tests/register_callsite_doesnt_deadlock.rs @@ -0,0 +1,49 @@ +use std::{sync::mpsc, thread, time::Duration}; +use tracing::{ + collect::{self, Collect, Interest}, + metadata::Metadata, + span, Event, +}; + +#[test] +fn register_callsite_doesnt_deadlock() { + pub struct EvilCollector; + + impl Collect for EvilCollector { + fn register_callsite(&self, meta: &'static Metadata<'static>) -> Interest { + tracing::info!(?meta, "registered a callsite"); + Interest::always() + } + + fn enabled(&self, _: &Metadata<'_>) -> bool { + true + } + fn new_span(&self, _: &span::Attributes<'_>) -> span::Id { + span::Id::from_u64(1) + } + fn record(&self, _: &span::Id, _: &span::Record<'_>) {} + fn record_follows_from(&self, _: &span::Id, _: &span::Id) {} + fn event(&self, _: &Event<'_>) {} + fn enter(&self, _: &span::Id) {} + fn exit(&self, _: &span::Id) {} + fn current_span(&self) -> tracing_core::span::Current { + unimplemented!() + } + } + + collect::set_global_default(EvilCollector).unwrap(); + + // spawn a thread, and assert it doesn't hang... + let (tx, didnt_hang) = mpsc::channel(); + let th = thread::spawn(move || { + tracing::info!("hello world!"); + tx.send(()).unwrap(); + }); + + didnt_hang + // Note: 60 seconds is *way* more than enough, but let's be generous in + // case of e.g. slow CI machines. + .recv_timeout(Duration::from_secs(60)) + .expect("the thread must not have hung!"); + th.join().expect("thread should join successfully"); +}