Skip to content

Commit

Permalink
tracing: fix recursive register_callsite deadlock (#2634)
Browse files Browse the repository at this point in the history
## Motivation

A deadlock exists when a collector's `register_callsite` method calls
into code that also contains tracing instrumentation and triggers a
second `register_callsite` call for the same callsite. This is because
the current implementation of the `MacroCallsite` type holds a
`core::sync::Once` which it uses to ensure that it is only added to the
callsite registry a single time. This deadlock was fixed in v0.1.x in PR
#2083, but the issue still exists on v0.2.x.

## Solution

This branch forward-ports the solution from #2083. Rather than using a
`core::sync::Once`, we now track the callsite's registration state
directly in `MacroCallsite`. If a callsite has started registering, but
has not yet completed, subsequent `register` calls will just immediately
receive an `Interest::sometimes` until the registration has completed,
rather than waiting to attempt their own registration.

I've also forward-ported the tests for this that were added in #2083.
  • Loading branch information
hawkw committed Jun 24, 2023
1 parent 9ad2740 commit c6e8a8f
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 13 deletions.
59 changes: 46 additions & 13 deletions tracing/src/lib.rs
Expand Up @@ -988,8 +988,7 @@ pub mod __macro_support {
pub use crate::callsite::{Callsite, Registration};
use crate::{collect::Interest, Metadata};
use core::fmt;
use core::sync::atomic::{AtomicUsize, Ordering};
use tracing_core::Once;
use core::sync::atomic::{AtomicU8, Ordering};

/// Callsite implementation used by macro-generated code.
///
Expand All @@ -1003,9 +1002,9 @@ pub mod __macro_support {
where
T: 'static,
{
interest: AtomicUsize,
interest: AtomicU8,
register: AtomicU8,
meta: &'static Metadata<'static>,
register: Once,
registration: &'static Registration<T>,
}

Expand All @@ -1023,12 +1022,21 @@ pub mod __macro_support {
registration: &'static Registration<T>,
) -> Self {
Self {
interest: AtomicUsize::new(0xDEADFACED),
interest: AtomicU8::new(Self::INTEREST_EMPTY),
register: AtomicU8::new(Self::UNREGISTERED),
meta,
register: Once::new(),
registration,
}
}

const UNREGISTERED: u8 = 0;
const REGISTERING: u8 = 1;
const REGISTERED: u8 = 2;

const INTEREST_NEVER: u8 = 0;
const INTEREST_SOMETIMES: u8 = 1;
const INTEREST_ALWAYS: u8 = 2;
const INTEREST_EMPTY: u8 = 0xFF;
}

impl MacroCallsite<&'static dyn Callsite> {
Expand All @@ -1046,11 +1054,36 @@ pub mod __macro_support {
// This only happens once (or if the cached interest value was corrupted).
#[cold]
pub fn register(&'static self) -> Interest {
self.register
.call_once(|| crate::callsite::register(self.registration));
// Attempt to advance the registration state to `REGISTERING`...
match self.register.compare_exchange(
Self::UNREGISTERED,
Self::REGISTERING,
Ordering::AcqRel,
Ordering::Acquire,
) {
Ok(_) => {
// Okay, we advanced the state, try to register the callsite.
crate::callsite::register(self.registration);
self.register.store(Self::REGISTERED, Ordering::Release);
}
// Great, the callsite is already registered! Just load its
// previous cached interest.
Err(Self::REGISTERED) => {}
// Someone else is registering...
Err(_state) => {
debug_assert_eq!(
_state,
Self::REGISTERING,
"weird callsite registration state"
);
// Just hit `enabled` this time.
return Interest::sometimes();
}
}

match self.interest.load(Ordering::Relaxed) {
0 => Interest::never(),
2 => Interest::always(),
Self::INTEREST_NEVER => Interest::never(),
Self::INTEREST_ALWAYS => Interest::always(),
_ => Interest::sometimes(),
}
}
Expand All @@ -1067,9 +1100,9 @@ pub mod __macro_support {
#[inline]
pub fn interest(&'static self) -> Interest {
match self.interest.load(Ordering::Relaxed) {
0 => Interest::never(),
1 => Interest::sometimes(),
2 => Interest::always(),
Self::INTEREST_NEVER => Interest::never(),
Self::INTEREST_SOMETIMES => Interest::sometimes(),
Self::INTEREST_ALWAYS => Interest::always(),
_ => self.register(),
}
}
Expand Down
51 changes: 51 additions & 0 deletions tracing/tests/rebuild_interest_doesnt_deadlock.rs
@@ -0,0 +1,51 @@
use std::{sync::mpsc, thread, time::Duration};
use tracing::{
callsite,
collect::{self, Collect, Interest},
metadata::Metadata,
span, Event,
};

#[test]
fn rebuild_callsites_doesnt_deadlock() {
pub struct EvilCollector;

impl Collect for EvilCollector {
fn register_callsite(&self, meta: &'static Metadata<'static>) -> Interest {
tracing::info!(?meta, "registered a callsite");
Interest::always()
}

fn enabled(&self, _: &Metadata<'_>) -> bool {
true
}
fn new_span(&self, _: &span::Attributes<'_>) -> span::Id {
span::Id::from_u64(1)
}
fn record(&self, _: &span::Id, _: &span::Record<'_>) {}
fn record_follows_from(&self, _: &span::Id, _: &span::Id) {}
fn event(&self, _: &Event<'_>) {}
fn enter(&self, _: &span::Id) {}
fn exit(&self, _: &span::Id) {}
fn current_span(&self) -> tracing_core::span::Current {
unimplemented!()
}
}

collect::set_global_default(EvilCollector).unwrap();

// spawn a thread, and assert it doesn't hang...
let (tx, didnt_hang) = mpsc::channel();
let th = thread::spawn(move || {
tracing::info!("hello world!");
callsite::rebuild_interest_cache();
tx.send(()).unwrap();
});

didnt_hang
// Note: 60 seconds is *way* more than enough, but let's be generous in
// case of e.g. slow CI machines.
.recv_timeout(Duration::from_secs(60))
.expect("the thread must not have hung!");
th.join().expect("thread should join successfully");
}
49 changes: 49 additions & 0 deletions tracing/tests/register_callsite_doesnt_deadlock.rs
@@ -0,0 +1,49 @@
use std::{sync::mpsc, thread, time::Duration};
use tracing::{
collect::{self, Collect, Interest},
metadata::Metadata,
span, Event,
};

#[test]
fn register_callsite_doesnt_deadlock() {
pub struct EvilCollector;

impl Collect for EvilCollector {
fn register_callsite(&self, meta: &'static Metadata<'static>) -> Interest {
tracing::info!(?meta, "registered a callsite");
Interest::always()
}

fn enabled(&self, _: &Metadata<'_>) -> bool {
true
}
fn new_span(&self, _: &span::Attributes<'_>) -> span::Id {
span::Id::from_u64(1)
}
fn record(&self, _: &span::Id, _: &span::Record<'_>) {}
fn record_follows_from(&self, _: &span::Id, _: &span::Id) {}
fn event(&self, _: &Event<'_>) {}
fn enter(&self, _: &span::Id) {}
fn exit(&self, _: &span::Id) {}
fn current_span(&self) -> tracing_core::span::Current {
unimplemented!()
}
}

collect::set_global_default(EvilCollector).unwrap();

// spawn a thread, and assert it doesn't hang...
let (tx, didnt_hang) = mpsc::channel();
let th = thread::spawn(move || {
tracing::info!("hello world!");
tx.send(()).unwrap();
});

didnt_hang
// Note: 60 seconds is *way* more than enough, but let's be generous in
// case of e.g. slow CI machines.
.recv_timeout(Duration::from_secs(60))
.expect("the thread must not have hung!");
th.join().expect("thread should join successfully");
}

0 comments on commit c6e8a8f

Please sign in to comment.