// khora_core/telemetry/monitoring.rs
//
// Copyright 2025 eraflo
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Provides traits and data structures for active resource monitoring.
//!
//! "Monitoring" is distinct from "metrics" in that it involves actively polling
//! a system resource (like VRAM or a GPU) to get a snapshot of its state, whereas
//! metrics are typically discrete, event-based measurements.

use std::borrow::Cow;
use std::fmt::Debug;

use crate::platform::{BatteryLevel, ThermalStatus};
use crate::renderer::api::core::gpu_hook::GpuHook;

27/// The core trait for a resource monitor.
28///
29/// A `ResourceMonitor` is a stateful object, typically living in the `khora-infra`
30/// crate, that knows how to query a specific system resource. The `khora-telemetry`
31/// service will hold a collection of these monitors and periodically call `update`
32/// and `get_usage_report` on them.
33pub trait ResourceMonitor: Send + Sync + Debug + 'static {
34 /// Returns a unique, human-readable identifier for this monitor instance.
35 fn monitor_id(&self) -> Cow<'static, str>;
36
37 /// Returns the general type of resource being monitored.
38 fn resource_type(&self) -> MonitoredResourceType;
39
40 /// Returns a snapshot of the current usage data for the monitored resource.
41 fn get_usage_report(&self) -> ResourceUsageReport;
42
43 /// Returns a GPU performance report, if this monitor supports it.
44 fn get_gpu_report(&self) -> Option<GpuReport> {
45 None
46 }
47
48 /// Returns a hardware health report, if this monitor supports it.
49 fn get_hardware_report(&self) -> Option<HardwareReport> {
50 None
51 }
52
53 /// Returns a list of discrete metrics collected by this monitor.
54 fn get_metrics(
55 &self,
56 ) -> Vec<(
57 crate::telemetry::metrics::MetricId,
58 crate::telemetry::metrics::MetricValue,
59 )> {
60 Vec::new()
61 }
62
63 /// Allows downcasting to a concrete `ResourceMonitor` type.
64 fn as_any(&self) -> &dyn std::any::Any;
65
66 /// Triggers the monitor to update its internal state by polling the resource.
67 /// This default implementation does nothing, for monitors that update passively.
68 fn update(&self) {
69 // Default: no-op
70 }
71}
72
/// An enumeration of the types of resources that can be monitored.
///
/// Returned by `ResourceMonitor::resource_type` so that consumers can tell
/// monitors apart by category.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MonitoredResourceType {
    /// Video RAM on a GPU.
    Vram,
    /// Main system RAM.
    SystemRam,
    /// General GPU performance (e.g., execution timing).
    Gpu,
    /// General platform hardware status (thermal, CPU load).
    Hardware,
}

/// A generic, unified report of resource usage, typically in bytes.
///
/// The derived `Default` yields zero bytes in use and `None` for both optional
/// fields.
#[derive(Debug, Clone, Copy, Default)]
pub struct ResourceUsageReport {
    /// The number of bytes currently in use.
    pub current_bytes: u64,
    /// The peak number of bytes ever in use simultaneously, if tracked.
    pub peak_bytes: Option<u64>,
    /// The total capacity of the resource in bytes, if known.
    pub total_capacity_bytes: Option<u64>,
}

97/// A report of physical hardware status (CPU, thermal, etc.).
98#[derive(Debug, Clone, Copy, Default)]
99pub struct HardwareReport {
100 /// Current thermal status.
101 pub thermal: ThermalStatus,
102 /// Current battery/power status.
103 pub battery: BatteryLevel,
104 /// Overall CPU load (0.0 to 1.0).
105 pub cpu_load: f32,
106 /// Overall GPU load (0.0 to 1.0), if reported by the hardware monitor.
107 pub gpu_load: Option<f32>,
108 /// Detailed GPU timing report for the current frame.
109 pub gpu_timings: Option<GpuReport>,
110}
111
/// A report of GPU performance timings for a single frame.
///
/// The derived `Default` yields frame 0 with no timings recorded and zeroed
/// counters.
#[derive(Debug, Clone, Copy, Default)]
pub struct GpuReport {
    /// The frame number this report corresponds to.
    pub frame_number: u64,
    /// Raw timestamp query results for each GPU hook, in microseconds.
    /// The order corresponds to the `GpuHook` enum definition.
    pub hook_timings_us: [Option<u32>; 4],
    /// The CPU time spent preparing the frame, in microseconds.
    pub cpu_preparation_time_us: Option<u32>,
    /// The CPU time spent submitting commands for the frame, in microseconds.
    pub cpu_submission_time_us: Option<u32>,
    /// The number of draw calls in this frame.
    pub draw_calls: u32,
    /// The number of triangles rendered in this frame.
    pub triangles_rendered: u32,
}

/// A detailed report of system memory (RAM) usage and allocation patterns.
///
/// The first group of fields is a basic usage snapshot; the second group holds
/// extended statistics (often from a tracking allocator).
#[derive(Debug, Clone, Copy, Default)]
pub struct MemoryReport {
    /// The number of bytes of system RAM currently in use by the application.
    pub current_usage_bytes: usize,
    /// The peak number of bytes of system RAM ever used simultaneously.
    pub peak_usage_bytes: usize,
    /// The number of bytes allocated since the last monitor update.
    pub allocation_delta_bytes: usize,
    /// The total number of times the memory usage has been sampled.
    pub sample_count: u64,

    // Extended statistics (often from a tracking allocator)
    /// The total number of allocation calls since the start.
    pub total_allocations: u64,
    /// The total number of deallocation calls since the start.
    pub total_deallocations: u64,
    /// The total number of reallocation calls since the start.
    pub total_reallocations: u64,
    /// The cumulative sum of all bytes ever allocated.
    pub bytes_allocated_lifetime: u64,
    /// The cumulative sum of all bytes ever deallocated.
    pub bytes_deallocated_lifetime: u64,
    /// The number of allocations classified as "large" (e.g., >= 1MB).
    pub large_allocations: u64,
    /// The total byte size of all "large" allocations.
    pub large_allocation_bytes: u64,
    /// The number of allocations classified as "small" (e.g., < 1KB).
    pub small_allocations: u64,
    /// The total byte size of all "small" allocations.
    pub small_allocation_bytes: u64,
    /// A calculated ratio indicating potential memory fragmentation.
    pub fragmentation_ratio: f64,
    /// A calculated ratio of memory still in use versus total ever allocated.
    pub allocation_efficiency: f64,
    /// The calculated average size of a single memory allocation in bytes.
    pub average_allocation_size: f64,
}

/// A report of Video RAM (VRAM) usage.
///
/// All values are byte counts; the derived `Default` yields zero bytes in use
/// and `None` for both optional fields.
#[derive(Debug, Clone, Copy, Default)]
pub struct VramReport {
    /// The number of bytes of VRAM currently in use.
    pub current_usage_bytes: usize,
    /// The peak number of bytes of VRAM ever in use, if tracked.
    pub peak_usage_bytes: Option<usize>,
    /// The total physical VRAM capacity in bytes, if available.
    pub total_capacity_bytes: Option<usize>,
}

/// A trait for types that can provide VRAM usage statistics.
/// This is typically implemented by a `GraphicsDevice` or a dedicated monitor in `khora-infra`.
///
/// Unlike `VramReport`, which stores byte counts, every method here reports
/// megabytes as `f32`.
pub trait VramProvider: Send + Sync {
    /// Returns the current VRAM usage in megabytes.
    fn get_vram_usage_mb(&self) -> f32;
    /// Returns the peak VRAM usage in megabytes.
    fn get_vram_peak_mb(&self) -> f32;
    /// Returns the total VRAM capacity in megabytes, if available.
    fn get_vram_capacity_mb(&self) -> Option<f32>;
}

191impl MemoryReport {
192 /// Returns the current memory usage in megabytes (MB).
193 pub fn current_usage_mb(&self) -> f64 {
194 self.current_usage_bytes as f64 / (1024.0 * 1024.0)
195 }
196
197 /// Returns the peak memory usage in megabytes (MB).
198 pub fn peak_usage_mb(&self) -> f64 {
199 self.peak_usage_bytes as f64 / (1024.0 * 1024.0)
200 }
201
202 /// Returns the change in allocated bytes since the last update, in kilobytes (KB).
203 pub fn allocation_delta_kb(&self) -> f64 {
204 self.allocation_delta_bytes as f64 / 1024.0
205 }
206
207 /// Calculates the memory turnover rate (allocations + deallocations per sample).
208 pub fn memory_turnover_rate(&self) -> f64 {
209 if self.sample_count > 0 {
210 (self.total_allocations + self.total_deallocations) as f64 / self.sample_count as f64
211 } else {
212 0.0
213 }
214 }
215
216 /// Calculates the percentage of total allocations that were classified as "large".
217 pub fn large_allocation_percentage(&self) -> f64 {
218 if self.total_allocations > 0 {
219 (self.large_allocations as f64 / self.total_allocations as f64) * 100.0
220 } else {
221 0.0
222 }
223 }
224
225 /// Returns the memory allocation efficiency as a percentage.
226 pub fn memory_utilization_efficiency(&self) -> f64 {
227 self.allocation_efficiency * 100.0
228 }
229
230 /// Returns the average allocation size in megabytes (MB).
231 pub fn average_allocation_size_mb(&self) -> f64 {
232 self.average_allocation_size / (1024.0 * 1024.0)
233 }
234
235 /// Returns a descriptive string for the current fragmentation status.
236 pub fn fragmentation_status(&self) -> &'static str {
237 match self.fragmentation_ratio {
238 r if r < 0.1 => "Low",
239 r if r < 0.3 => "Moderate",
240 r if r < 0.6 => "High",
241 _ => "Critical",
242 }
243 }
244}
245
246impl GpuReport {
247 /// Gets the timing for a specific GPU performance hook, in microseconds.
248 pub fn get_hook_timing_us(&self, hook: GpuHook) -> Option<u32> {
249 self.hook_timings_us[hook as usize]
250 }
251
252 /// Calculates the duration of the main render pass, in microseconds.
253 pub fn main_pass_duration_us(&self) -> Option<u32> {
254 match (
255 self.get_hook_timing_us(GpuHook::MainPassBegin),
256 self.get_hook_timing_us(GpuHook::MainPassEnd),
257 ) {
258 (Some(begin), Some(end)) if end >= begin => Some(end - begin),
259 _ => None,
260 }
261 }
262
263 /// Calculates the total GPU duration for the frame, in microseconds.
264 pub fn frame_total_duration_us(&self) -> Option<u32> {
265 match (
266 self.get_hook_timing_us(GpuHook::FrameStart),
267 self.get_hook_timing_us(GpuHook::FrameEnd),
268 ) {
269 (Some(start), Some(end)) if end >= start => Some(end - start),
270 _ => None,
271 }
272 }
273
274 /// Sets the timing for a specific hook, in microseconds.
275 pub fn set_hook_timing_us(&mut self, hook: GpuHook, timing_us: Option<u32>) {
276 self.hook_timings_us[hook as usize] = timing_us;
277 }
278}