// khora_core/telemetry/monitoring.rs

// Copyright 2025 eraflo
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
//! Provides traits and data structures for active resource monitoring.
//!
//! "Monitoring" is distinct from "metrics" in that it involves actively polling
//! a system resource (like VRAM or a GPU) to get a snapshot of its state, whereas
//! metrics are typically discrete, event-based measurements.
20
21use std::borrow::Cow;
22use std::fmt::Debug;
23
24use crate::platform::{BatteryLevel, ThermalStatus};
25use crate::renderer::api::core::gpu_hook::GpuHook;
26
27/// The core trait for a resource monitor.
28///
29/// A `ResourceMonitor` is a stateful object, typically living in the `khora-infra`
30/// crate, that knows how to query a specific system resource. The `khora-telemetry`
31/// service will hold a collection of these monitors and periodically call `update`
32/// and `get_usage_report` on them.
33pub trait ResourceMonitor: Send + Sync + Debug + 'static {
34    /// Returns a unique, human-readable identifier for this monitor instance.
35    fn monitor_id(&self) -> Cow<'static, str>;
36
37    /// Returns the general type of resource being monitored.
38    fn resource_type(&self) -> MonitoredResourceType;
39
40    /// Returns a snapshot of the current usage data for the monitored resource.
41    fn get_usage_report(&self) -> ResourceUsageReport;
42
43    /// Returns a GPU performance report, if this monitor supports it.
44    fn get_gpu_report(&self) -> Option<GpuReport> {
45        None
46    }
47
48    /// Returns a hardware health report, if this monitor supports it.
49    fn get_hardware_report(&self) -> Option<HardwareReport> {
50        None
51    }
52
53    /// Returns a list of discrete metrics collected by this monitor.
54    fn get_metrics(
55        &self,
56    ) -> Vec<(
57        crate::telemetry::metrics::MetricId,
58        crate::telemetry::metrics::MetricValue,
59    )> {
60        Vec::new()
61    }
62
63    /// Allows downcasting to a concrete `ResourceMonitor` type.
64    fn as_any(&self) -> &dyn std::any::Any;
65
66    /// Triggers the monitor to update its internal state by polling the resource.
67    /// This default implementation does nothing, for monitors that update passively.
68    fn update(&self) {
69        // Default: no-op
70    }
71}
72
/// The categories of resource that a monitor can observe.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum MonitoredResourceType {
    /// Video memory (VRAM) on a GPU.
    Vram,
    /// Main system memory (RAM).
    SystemRam,
    /// General GPU performance, e.g. execution timing.
    Gpu,
    /// General platform hardware status, e.g. thermal state and CPU load.
    Hardware,
}
85
/// A resource-agnostic usage snapshot, typically expressed in bytes.
#[derive(Clone, Copy, Debug, Default)]
pub struct ResourceUsageReport {
    /// Bytes in use at the time of the snapshot.
    pub current_bytes: u64,
    /// Highest number of bytes ever in use simultaneously; `None` when the
    /// peak is not tracked.
    pub peak_bytes: Option<u64>,
    /// Total capacity of the resource in bytes; `None` when it is not known.
    pub total_capacity_bytes: Option<u64>,
}
96
97/// A report of physical hardware status (CPU, thermal, etc.).
98#[derive(Debug, Clone, Copy, Default)]
99pub struct HardwareReport {
100    /// Current thermal status.
101    pub thermal: ThermalStatus,
102    /// Current battery/power status.
103    pub battery: BatteryLevel,
104    /// Overall CPU load (0.0 to 1.0).
105    pub cpu_load: f32,
106    /// Overall GPU load (0.0 to 1.0), if reported by the hardware monitor.
107    pub gpu_load: Option<f32>,
108    /// Detailed GPU timing report for the current frame.
109    pub gpu_timings: Option<GpuReport>,
110}
111
/// GPU performance timings and counters for a single frame.
#[derive(Clone, Copy, Debug, Default)]
pub struct GpuReport {
    /// Number of the frame this report describes.
    pub frame_number: u64,
    /// Raw timestamp query results, in microseconds, one slot per GPU hook.
    /// Slots follow the order of the `GpuHook` enum definition; a slot is
    /// `None` when no timing is stored for that hook.
    pub hook_timings_us: [Option<u32>; 4],
    /// CPU time spent preparing the frame, in microseconds.
    pub cpu_preparation_time_us: Option<u32>,
    /// CPU time spent submitting commands for the frame, in microseconds.
    pub cpu_submission_time_us: Option<u32>,
    /// Number of draw calls issued in this frame.
    pub draw_calls: u32,
    /// Number of triangles rendered in this frame.
    pub triangles_rendered: u32,
}
129
/// A detailed snapshot of system memory (RAM) usage and allocation patterns.
#[derive(Clone, Copy, Debug, Default)]
pub struct MemoryReport {
    /// Bytes of system RAM currently in use by the application.
    pub current_usage_bytes: usize,
    /// Highest number of bytes of system RAM ever in use simultaneously.
    pub peak_usage_bytes: usize,
    /// Bytes allocated since the previous monitor update.
    pub allocation_delta_bytes: usize,
    /// How many times memory usage has been sampled in total.
    pub sample_count: u64,

    // Extended statistics; these often come from a tracking allocator.
    /// Count of allocation calls since the start.
    pub total_allocations: u64,
    /// Count of deallocation calls since the start.
    pub total_deallocations: u64,
    /// Count of reallocation calls since the start.
    pub total_reallocations: u64,
    /// Running total of all bytes ever allocated.
    pub bytes_allocated_lifetime: u64,
    /// Running total of all bytes ever deallocated.
    pub bytes_deallocated_lifetime: u64,
    /// Count of allocations classified as "large" (e.g., >= 1MB).
    pub large_allocations: u64,
    /// Combined byte size of all "large" allocations.
    pub large_allocation_bytes: u64,
    /// Count of allocations classified as "small" (e.g., < 1KB).
    pub small_allocations: u64,
    /// Combined byte size of all "small" allocations.
    pub small_allocation_bytes: u64,
    /// Calculated ratio indicating potential memory fragmentation.
    pub fragmentation_ratio: f64,
    /// Calculated ratio of memory still in use versus total ever allocated.
    pub allocation_efficiency: f64,
    /// Calculated average size of a single allocation, in bytes.
    pub average_allocation_size: f64,
}
168
/// A snapshot of Video RAM (VRAM) usage.
#[derive(Clone, Copy, Debug, Default)]
pub struct VramReport {
    /// Bytes of VRAM currently in use.
    pub current_usage_bytes: usize,
    /// Highest number of bytes of VRAM ever in use; `None` when the peak is
    /// not tracked.
    pub peak_usage_bytes: Option<usize>,
    /// Total physical VRAM capacity in bytes; `None` when unavailable.
    pub total_capacity_bytes: Option<usize>,
}
179
/// Implemented by types able to report VRAM usage statistics.
///
/// Typical implementors are a `GraphicsDevice` or a dedicated monitor in
/// `khora-infra`.
pub trait VramProvider: Send + Sync {
    /// Current VRAM usage, in megabytes.
    fn get_vram_usage_mb(&self) -> f32;

    /// Peak VRAM usage, in megabytes.
    fn get_vram_peak_mb(&self) -> f32;

    /// Total VRAM capacity in megabytes, or `None` when it is not available.
    fn get_vram_capacity_mb(&self) -> Option<f32>;
}
190
191impl MemoryReport {
192    /// Returns the current memory usage in megabytes (MB).
193    pub fn current_usage_mb(&self) -> f64 {
194        self.current_usage_bytes as f64 / (1024.0 * 1024.0)
195    }
196
197    /// Returns the peak memory usage in megabytes (MB).
198    pub fn peak_usage_mb(&self) -> f64 {
199        self.peak_usage_bytes as f64 / (1024.0 * 1024.0)
200    }
201
202    /// Returns the change in allocated bytes since the last update, in kilobytes (KB).
203    pub fn allocation_delta_kb(&self) -> f64 {
204        self.allocation_delta_bytes as f64 / 1024.0
205    }
206
207    /// Calculates the memory turnover rate (allocations + deallocations per sample).
208    pub fn memory_turnover_rate(&self) -> f64 {
209        if self.sample_count > 0 {
210            (self.total_allocations + self.total_deallocations) as f64 / self.sample_count as f64
211        } else {
212            0.0
213        }
214    }
215
216    /// Calculates the percentage of total allocations that were classified as "large".
217    pub fn large_allocation_percentage(&self) -> f64 {
218        if self.total_allocations > 0 {
219            (self.large_allocations as f64 / self.total_allocations as f64) * 100.0
220        } else {
221            0.0
222        }
223    }
224
225    /// Returns the memory allocation efficiency as a percentage.
226    pub fn memory_utilization_efficiency(&self) -> f64 {
227        self.allocation_efficiency * 100.0
228    }
229
230    /// Returns the average allocation size in megabytes (MB).
231    pub fn average_allocation_size_mb(&self) -> f64 {
232        self.average_allocation_size / (1024.0 * 1024.0)
233    }
234
235    /// Returns a descriptive string for the current fragmentation status.
236    pub fn fragmentation_status(&self) -> &'static str {
237        match self.fragmentation_ratio {
238            r if r < 0.1 => "Low",
239            r if r < 0.3 => "Moderate",
240            r if r < 0.6 => "High",
241            _ => "Critical",
242        }
243    }
244}
245
246impl GpuReport {
247    /// Gets the timing for a specific GPU performance hook, in microseconds.
248    pub fn get_hook_timing_us(&self, hook: GpuHook) -> Option<u32> {
249        self.hook_timings_us[hook as usize]
250    }
251
252    /// Calculates the duration of the main render pass, in microseconds.
253    pub fn main_pass_duration_us(&self) -> Option<u32> {
254        match (
255            self.get_hook_timing_us(GpuHook::MainPassBegin),
256            self.get_hook_timing_us(GpuHook::MainPassEnd),
257        ) {
258            (Some(begin), Some(end)) if end >= begin => Some(end - begin),
259            _ => None,
260        }
261    }
262
263    /// Calculates the total GPU duration for the frame, in microseconds.
264    pub fn frame_total_duration_us(&self) -> Option<u32> {
265        match (
266            self.get_hook_timing_us(GpuHook::FrameStart),
267            self.get_hook_timing_us(GpuHook::FrameEnd),
268        ) {
269            (Some(start), Some(end)) if end >= start => Some(end - start),
270            _ => None,
271        }
272    }
273
274    /// Sets the timing for a specific hook, in microseconds.
275    pub fn set_hook_timing_us(&mut self, hook: GpuHook, timing_us: Option<u32>) {
276        self.hook_timings_us[hook as usize] = timing_us;
277    }
278}