Skip to main content

hypercall/observability/
tracing.rs

1//! OpenTelemetry tracing and Pyroscope profiling initialization.
2//!
3//! This module provides configurable distributed tracing via OpenTelemetry
4//! and continuous profiling via Pyroscope.
5//!
6//! # Configuration
7//!
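//! Settings are read from the `ObservabilityRuntimeConfig` passed to `init_tracing`
//! (built-in defaults apply when `None` is passed). The knobs below are named after
//! environment variables, which are presumably loaded into that config elsewhere: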
9//!
10//! ## Tracing (OpenTelemetry/OTLP)
11//! - `TRACING_ENABLED`: Enable/disable tracing ("true"/"false", default: "true")
12//! - `OTEL_EXPORTER_OTLP_ENDPOINT`: OTLP collector endpoint (e.g., "http://alloy:4317")
13//! - `OTEL_SERVICE_NAME`: Service name for traces (default: "hypercall")
14//! - `TRACING_SAMPLE_RATIO`: Sampling ratio 0.0-1.0 (default: "1.0" = always sample)
15//!
16//! ## Profiling (Pyroscope)
17//! - `PYROSCOPE_ENABLED`: Enable/disable profiling ("true"/"false", default: "true")
//! - `PYROSCOPE_SERVER_ADDRESS`: Pyroscope server address (e.g., "http://alloy:4040"); if unset, a warning is logged and profiling is not started
19//! - `PYROSCOPE_APPLICATION_NAME`: Application name for profiles (default: "hypercall")
20//!
21//! # Usage
22//!
23//! ```rust,ignore
24//! use hypercall::observability::tracing::{init_tracing, shutdown_tracing};
25//!
26//! #[tokio::main]
27//! async fn main() {
28//!     // Initialize tracing (call early in main)
29//!     init_tracing(None);
30//!
31//!     // ... application code ...
32//!
33//!     // Graceful shutdown (flushes pending traces)
34//!     shutdown_tracing();
35//! }
36//! ```
37
38#[cfg(feature = "otel-tracing")]
39use opentelemetry::trace::TracerProvider;
40#[cfg(feature = "otel-tracing")]
41use opentelemetry::{KeyValue, StringValue};
42#[cfg(feature = "otel-tracing")]
43use opentelemetry_otlp::WithExportConfig;
44#[cfg(feature = "otel-tracing")]
45use opentelemetry_sdk::{
46    runtime,
47    trace::{RandomIdGenerator, Sampler, TracerProvider as SdkTracerProvider},
48    Resource,
49};
50#[cfg(feature = "otel-tracing")]
51use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter};
52
53#[cfg(feature = "otel-tracing")]
54use std::sync::OnceLock;
55
/// Tracer provider built by `init_tracing`, retained so that
/// `shutdown_tracing` can shut it down later.
#[cfg(feature = "otel-tracing")]
static TRACER_PROVIDER: OnceLock<SdkTracerProvider> = OnceLock::new();

/// Storage shape for the running Pyroscope agent.
///
/// The agent's concrete type is complex, so it is kept type-erased as a
/// `Box<dyn Any + Send>`; the `Option` lets shutdown take the agent out of
/// the slot.
#[cfg(feature = "otel-tracing")]
type ErasedAgentSlot = std::sync::Mutex<Option<Box<dyn std::any::Any + Send>>>;

/// Running Pyroscope agent, stored by `init_pyroscope` and released by
/// `shutdown_tracing`.
#[cfg(feature = "otel-tracing")]
static PYROSCOPE_AGENT: OnceLock<ErasedAgentSlot> = OnceLock::new();
63
/// Initialize OpenTelemetry tracing and Pyroscope profiling.
///
/// This function sets up:
/// 1. A global tracing subscriber with a console (fmt) layer and, when
///    tracing is enabled, an OTLP export layer
/// 2. Pyroscope continuous profiling (if enabled)
///
/// When `config` is `None`, tracing is enabled with the defaults
/// `http://localhost:4317` (OTLP endpoint), `hypercall` (service name) and
/// `1.0` (sample ratio).
///
/// Profiling has its own enable flag and is started regardless of whether
/// OTLP tracing is enabled. If the OTLP exporter cannot be built, the error
/// is logged and the fmt-only subscriber is used instead of aborting the
/// process.
///
/// Call this early in main(), before any tracing macros are used.
///
/// # Panics
///
/// Panics if a global tracing subscriber has already been installed, because
/// the subscriber is registered with `init()`.
///
/// NOTE(review): the batch exporter is bound to `runtime::Tokio`, so this
/// presumably has to be called from inside a Tokio runtime — confirm.
#[cfg(feature = "otel-tracing")]
pub fn init_tracing(config: Option<&crate::backend_config::ObservabilityRuntimeConfig>) {
    install_subscriber(config);

    // Pyroscope is controlled by its own `enabled` flag, so it must be
    // initialized no matter which subscriber path was taken above.
    init_pyroscope(config);
}

/// Install the console-only (fmt) subscriber, filtered via `EnvFilter::from_default_env()`.
#[cfg(feature = "otel-tracing")]
fn install_fmt_subscriber() {
    tracing_subscriber::fmt()
        .with_env_filter(EnvFilter::from_default_env())
        .init();
}

/// Install the global subscriber: fmt + OTLP when tracing is enabled and the
/// exporter can be built, fmt-only otherwise.
#[cfg(feature = "otel-tracing")]
fn install_subscriber(config: Option<&crate::backend_config::ObservabilityRuntimeConfig>) {
    let tracing_config = config.map(|cfg| &cfg.tracing);

    // No config at all means "enabled with defaults".
    if !tracing_config.map_or(true, |cfg| cfg.enabled) {
        install_fmt_subscriber();
        tracing::info!("OTLP tracing disabled, using fmt subscriber only");
        return;
    }

    let otlp_endpoint = tracing_config
        .map(|cfg| cfg.otlp_endpoint.clone())
        .unwrap_or_else(|| "http://localhost:4317".to_string());
    let service_name = tracing_config
        .map(|cfg| cfg.service_name.clone())
        .unwrap_or_else(|| "hypercall".to_string());
    let sample_ratio = tracing_config.map_or(1.0, |cfg| cfg.sample_ratio);

    // Build the OTLP exporter. The endpoint is operator-supplied, so a bad
    // value must not take the whole process down: degrade to console logging.
    let exporter = match opentelemetry_otlp::SpanExporter::builder()
        .with_tonic()
        .with_endpoint(&otlp_endpoint)
        .build()
    {
        Ok(exporter) => exporter,
        Err(e) => {
            install_fmt_subscriber();
            tracing::error!(
                error = %e,
                endpoint = %otlp_endpoint,
                "Failed to create OTLP exporter, using fmt subscriber only"
            );
            return;
        }
    };

    // Ratios at or beyond the [0, 1] bounds collapse to the constant samplers.
    let sampler = if sample_ratio >= 1.0 {
        Sampler::AlwaysOn
    } else if sample_ratio <= 0.0 {
        Sampler::AlwaysOff
    } else {
        Sampler::TraceIdRatioBased(sample_ratio)
    };

    // Resource carrying the service name attached to every exported span.
    let resource = Resource::new(vec![KeyValue::new(
        "service.name",
        StringValue::from(service_name.clone()),
    )]);

    let tracer_provider = SdkTracerProvider::builder()
        .with_batch_exporter(exporter, runtime::Tokio)
        .with_sampler(sampler)
        .with_id_generator(RandomIdGenerator::default())
        .with_resource(resource)
        .build();

    // Keep a handle so `shutdown_tracing` can shut the provider down. If a
    // provider was already stored, the existing one is kept.
    let _ = TRACER_PROVIDER.set(tracer_provider.clone());

    // Bridge `tracing` spans into OpenTelemetry.
    let tracer = tracer_provider.tracer("hypercall");
    let otel_layer = tracing_opentelemetry::layer().with_tracer(tracer);

    // Register the subscriber with both fmt and OTLP layers.
    tracing_subscriber::registry()
        .with(EnvFilter::from_default_env())
        .with(tracing_subscriber::fmt::layer())
        .with(otel_layer)
        .init();

    tracing::info!(
        endpoint = %otlp_endpoint,
        service = %service_name,
        sample_ratio = %sample_ratio,
        "OpenTelemetry tracing initialized"
    );
}
146
/// Initialize Pyroscope continuous profiling.
///
/// Does nothing (apart from logging) when profiling is disabled in `config`
/// or when no server address is configured. With `config == None` profiling
/// counts as enabled, but no server address is available, so it is skipped
/// with a warning. The application name defaults to `hypercall`.
///
/// On success the running agent is stored in `PYROSCOPE_AGENT` so that it can
/// be released at shutdown. Build and start failures are logged and otherwise
/// ignored.
#[cfg(feature = "otel-tracing")]
fn init_pyroscope(config: Option<&crate::backend_config::ObservabilityRuntimeConfig>) {
    let pyroscope_config = config.map(|cfg| &cfg.pyroscope);

    if !pyroscope_config.map_or(true, |cfg| cfg.enabled) {
        tracing::info!("Pyroscope profiling disabled");
        return;
    }

    let Some(server_address) = pyroscope_config.and_then(|cfg| cfg.server_address.clone()) else {
        tracing::warn!("PYROSCOPE_SERVER_ADDRESS not set, profiling disabled");
        return;
    };

    let application_name = pyroscope_config
        .map(|cfg| cfg.application_name.clone())
        .unwrap_or_else(|| "hypercall".to_string());

    // Build the agent with the pprof backend (sample rate fixed at 100).
    let agent = match pyroscope::PyroscopeAgent::builder(&server_address, &application_name)
        .backend(pyroscope_pprofrs::pprof_backend(
            pyroscope_pprofrs::PprofConfig::new().sample_rate(100),
        ))
        .build()
    {
        Ok(agent) => agent,
        Err(e) => {
            tracing::error!(error = %e, "Failed to build Pyroscope agent");
            return;
        }
    };

    let running_agent = match agent.start() {
        Ok(running_agent) => running_agent,
        Err(e) => {
            tracing::error!(error = %e, "Failed to start Pyroscope agent");
            return;
        }
    };

    tracing::info!(
        server = %server_address,
        application = %application_name,
        "Pyroscope profiling started"
    );

    // Store for shutdown. The slot is created empty and then filled, so the
    // agent is retained even if the slot already exists (putting the agent
    // inside the `get_or_init` closure would drop it in that case).
    let slot = PYROSCOPE_AGENT.get_or_init(|| std::sync::Mutex::new(None));
    // The slot holds a plain `Option`, so a poisoned lock is still usable.
    let mut guard = slot.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
    *guard = Some(Box::new(running_agent) as Box<dyn std::any::Any + Send>);
}
200
/// Gracefully shutdown tracing and profiling.
///
/// Shuts down the stored tracer provider (flushing pending spans) and then
/// stops the Pyroscope agent, if one was started. Errors are printed to
/// stderr rather than propagated. Call this before application exit.
///
/// Calling it when nothing was initialized, or calling it twice, is harmless:
/// the agent slot is emptied on first use.
#[cfg(feature = "otel-tracing")]
pub fn shutdown_tracing() {
    tracing::info!("Shutting down tracing...");

    // Shutdown the tracer provider (flushes pending spans).
    if let Some(provider) = TRACER_PROVIDER.get() {
        if let Err(e) = provider.shutdown() {
            eprintln!("Error shutting down tracer provider: {e}");
        }
    }

    // Take the agent out of its slot first so the lock is not held while the
    // agent is being stopped. A poisoned lock still yields the stored value.
    let erased_agent = PYROSCOPE_AGENT.get().and_then(|slot| {
        slot.lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
            .take()
    });

    if let Some(erased_agent) = erased_agent {
        // The agent is stored type-erased; recover the running agent so it can
        // be stopped and shut down explicitly instead of merely being dropped.
        match erased_agent
            .downcast::<pyroscope::PyroscopeAgent<pyroscope::pyroscope::PyroscopeAgentRunning>>()
        {
            Ok(running_agent) => match (*running_agent).stop() {
                Ok(ready_agent) => ready_agent.shutdown(),
                Err(e) => eprintln!("Error stopping Pyroscope agent: {e}"),
            },
            // Unexpected stored type: the value is simply dropped here.
            Err(_unknown) => {}
        }
    }

    tracing::info!("Tracing shutdown complete");
}
225
226// Fallback implementations when feature is disabled
227
228/// Initialize tracing (no-op when otel-tracing feature is disabled).
229#[cfg(not(feature = "otel-tracing"))]
230pub fn init_tracing(_config: Option<&crate::backend_config::ObservabilityRuntimeConfig>) {
231    tracing_subscriber::fmt()
232        .with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
233        .init();
234}
235
/// Shutdown tracing when the `otel-tracing` feature is disabled.
///
/// Nothing is flushed or stopped in this build; the function is a no-op.
#[cfg(not(feature = "otel-tracing"))]
pub fn shutdown_tracing() {}
241
#[cfg(test)]
mod tests {
    /// Placeholder for the fallback `init_tracing`.
    ///
    /// Intentionally empty: `init_tracing` sets the global subscriber, so it
    /// cannot actually be called from a test. The test only exists so the
    /// fallback build has a passing test case.
    #[cfg(not(feature = "otel-tracing"))]
    #[test]
    fn test_init_tracing_noop() {}
}