Skip to main content

hypercall/observability/metrics_collector/
operations.rs

1use super::*;
2
3impl MetricsCollector {
4    // ===== Operational Metrics =====
5
6    pub(super) async fn collect_operational_metrics(&self) {
7        // DB pool metrics
8        if let Some(ref handler) = self.db {
9            let pool = handler.get_pool();
10            let state = pool.state();
11
12            gauge!("ht_db_pool_connections").set(state.connections as f64);
13            gauge!("ht_db_pool_idle").set(state.idle_connections as f64);
14
15            debug!(
16                "DB pool: {} connections, {} idle",
17                state.connections, state.idle_connections
18            );
19        }
20    }
21
22    // ===== Invariant Metrics (fast, in-memory) =====
23    // These check critical system invariants that should never be violated
24
25    pub(super) async fn collect_invariant_metrics(&self) {
26        // Spot price staleness from oracles
27        if let Some(ref greeks_cache) = self.greeks_cache {
28            let staleness = greeks_cache.get_spot_price_staleness().await;
29            let unhealthy = greeks_cache.get_unhealthy_oracles().await;
30
31            // Track max staleness across all oracles
32            let mut max_staleness: f64 = 0.0;
33
34            // Large sentinel value for missing oracles (ensures alert fires)
35            const MISSING_ORACLE_STALENESS: f64 = 999999.0;
36
37            for (underlying, staleness_opt) in staleness {
38                match staleness_opt {
39                    Some(secs) => {
40                        gauge!("ht_spot_price_staleness_seconds", "underlying" => underlying.clone())
41                            .set(secs);
42
43                        if secs > max_staleness {
44                            max_staleness = secs;
45                        }
46
47                        // Warn if stale (> 30 seconds is concerning for a 2s poll interval)
48                        if secs > 30.0 {
49                            warn!(
50                                "⚠️ STALE SPOT PRICE: {} is {:.1}s old (threshold: 30s)",
51                                underlying, secs
52                            );
53                        }
54                    }
55                    None => {
56                        // Oracle hasn't fetched yet - set to a very large value
57                        // Also update max_staleness so the alert fires
58                        gauge!("ht_spot_price_staleness_seconds", "underlying" => underlying.clone())
59                            .set(MISSING_ORACLE_STALENESS);
60                        if MISSING_ORACLE_STALENESS > max_staleness {
61                            max_staleness = MISSING_ORACLE_STALENESS;
62                        }
63                        warn!("⚠️ NO SPOT PRICE: {} has never been fetched", underlying);
64                    }
65                }
66            }
67
68            // Track max staleness for alerting
69            gauge!("ht_spot_price_max_staleness_seconds").set(max_staleness);
70
71            // Track number of unhealthy oracles
72            gauge!("ht_unhealthy_oracles").set(unhealthy.len() as f64);
73
74            if !unhealthy.is_empty() {
75                error!(
76                    "🚨 UNHEALTHY ORACLES: {:?} - margin calculations may be unreliable!",
77                    unhealthy
78                );
79            }
80        }
81    }
82
83    pub(super) async fn collect_vol_surface_metrics(&self) {
84        let Some(ref oracle) = self.risk_vol_oracle else {
85            return;
86        };
87        const MISSING_VOL_STALENESS: f64 = 999_999.0;
88
89        for status in oracle.statuses() {
90            gauge!(
91                "ht_vol_oracle_connected",
92                "provider" => status.provider.as_str(),
93                "underlying" => status.underlying.clone(),
94                "route_facing" => status.route_facing.to_string()
95            )
96            .set(if status.connected { 1.0 } else { 0.0 });
97            gauge!(
98                "ht_vol_surface_ready",
99                "provider" => status.provider.as_str(),
100                "underlying" => status.underlying.clone(),
101                "route_facing" => status.route_facing.to_string()
102            )
103            .set(if status.ready { 1.0 } else { 0.0 });
104            gauge!(
105                "ht_vol_surface_points",
106                "provider" => status.provider.as_str(),
107                "underlying" => status.underlying.clone(),
108                "route_facing" => status.route_facing.to_string()
109            )
110            .set(status.surface_points as f64);
111            gauge!(
112                "ht_vol_surface_staleness_seconds",
113                "provider" => status.provider.as_str(),
114                "underlying" => status.underlying.clone(),
115                "route_facing" => status.route_facing.to_string()
116            )
117            .set(status.staleness_seconds.unwrap_or(MISSING_VOL_STALENESS));
118            if let Some(threshold_seconds) = status.staleness_threshold_seconds {
119                gauge!(
120                    "ht_vol_surface_staleness_threshold_seconds",
121                    "provider" => status.provider.as_str(),
122                    "underlying" => status.underlying,
123                    "route_facing" => status.route_facing.to_string()
124                )
125                .set(threshold_seconds);
126            }
127        }
128    }
129}