Mountain/ApplicationState/Internal/Recovery/RecoverState.rs
1//! # RecoverState Module (Internal)
2//!
3//! ## RESPONSIBILITIES
4//! Provides state recovery utilities including validation, timeout handling,
5//! and exponential backoff for recovery operations.
6//!
7//! ## ARCHITECTURAL ROLE
8//! RecoverState is part of the **Internal::Recovery** module, providing
9//! recovery utilities for corrupted or invalid state.
10//!
11//! ## KEY COMPONENTS
12//! - validate_and_clean_state: Filters state by validator function
13//! - safe_state_operation_with_timeout: Executes operation with timeout
14//! - recover_state_with_backoff: Retries with exponential backoff
15//!
16//! ## ERROR HANDLING
17//! - Validates state before operations
18//! - Timeout protection for operations
19//! - Exponential backoff for retries
20//!
21//! ## LOGGING
22//! Operations are logged at appropriate levels (error, warn).
23//!
24//! ## PERFORMANCE CONSIDERATIONS
25//! - Efficient validation with retain
26//! - Timeout prevents hanging operations
27//! - Exponential backoff prevents overwhelming system
28//!
29//! ## TODO
30//! - [ ] Add state validation rules
31//! - [ ] Implement checkpoint recovery
32//! - [ ] Add recovery metrics collection
33
34use std::collections::HashMap;
35
36use CommonLibrary::Error::CommonError::CommonError;
37use log::{error, warn};
38
39/// Validates and cleans up state data by removing entries that don't pass
40/// validation.
41///
42/// # Arguments
43/// * `state_data` - The state data to validate and clean
44/// * `validator` - Function that returns true for valid entries
45///
46/// # Type Parameters
47/// * `T` - The type of values in the state map
48///
49/// # Behavior
50/// - Retains only entries where validator returns true
51/// - In-place modification of the HashMap
52pub fn validate_and_clean_state<T>(state_data:&mut HashMap<String, T>, validator:impl Fn(&T) -> bool) {
53 let original_len = state_data.len();
54 state_data.retain(|_, value| validator(value));
55 let removed_count = original_len - state_data.len();
56
57 if removed_count > 0 {
58 warn!(
59 "[RecoverState] Removed {} invalid state entries ({} remaining)",
60 removed_count,
61 state_data.len()
62 );
63 }
64}
65
66/// Safe state operation with timeout protection.
67///
68/// # Arguments
69/// * `operation` - The operation to execute
70/// * `timeout_ms` - Timeout in milliseconds
71/// * `operation_name` - Name of the operation for logging
72///
73/// # Type Parameters
74/// * `T` - The return type of the operation
75/// * `F` - The operation function type
76///
77/// # Returns
78/// Result containing the operation result or CommonError
79///
80/// # Behavior
81/// - Executes operation in a separate thread
82/// - Waits for result or timeout
83/// - Returns error if timeout occurs
84pub fn safe_state_operation_with_timeout<T, F>(
85 operation:F,
86 timeout_ms:u64,
87 operation_name:&str,
88) -> Result<T, CommonError>
89where
90 F: FnOnce() -> Result<T, CommonError> + Send + 'static,
91 T: Send + 'static, {
92 let (sender, receiver) = std::sync::mpsc::channel();
93
94 std::thread::spawn(move || {
95 let result = operation();
96 let _ = sender.send(result);
97 });
98
99 match receiver.recv_timeout(std::time::Duration::from_millis(timeout_ms)) {
100 Ok(result) => result,
101 Err(_) => {
102 error!("[RecoverState] Operation '{}' timed out after {}ms", operation_name, timeout_ms);
103 Err(CommonError::Unknown { Description:format!("Operation '{}' timed out", operation_name) })
104 },
105 }
106}
107
108/// Attempt state recovery with exponential backoff.
109///
110/// # Arguments
111/// * `operation` - The operation to retry
112/// * `max_attempts` - Maximum number of retry attempts
113/// * `operation_name` - Name of the operation for logging
114///
115/// # Type Parameters
116/// * `F` - The operation function type
117/// * `T` - The return type of the operation
118///
119/// # Returns
120/// Result containing the operation result or CommonError
121///
122/// # Behavior
123/// - Retries operation up to max_attempts times
124/// - Uses exponential backoff (doubles delay after each failure)
125/// - Starts with 100ms delay
126/// - Logs each attempt and failure
127pub async fn recover_state_with_backoff<F, T>(
128 operation:F,
129 max_attempts:u32,
130 operation_name:&str,
131) -> Result<T, CommonError>
132where
133 F: Fn() -> Result<T, CommonError> + Send, {
134 let mut attempt = 0;
135 let mut delay_ms = 100;
136
137 while attempt < max_attempts {
138 match operation() {
139 Ok(result) => return Ok(result),
140 Err(error) => {
141 attempt += 1;
142 if attempt == max_attempts {
143 return Err(error);
144 }
145
146 warn!(
147 "[RecoverState] Attempt {} failed for '{}': {}. Retrying in {}ms...",
148 attempt, operation_name, error, delay_ms
149 );
150
151 tokio::time::sleep(tokio::time::Duration::from_millis(delay_ms)).await;
152
153 // Apply exponential backoff by doubling the delay after each failure
154 // to prevent overwhelming the system during recovery attempts.
155 delay_ms *= 2;
156 },
157 }
158 }
159
160 Err(CommonError::Unknown {
161 Description:format!(
162 "Failed to recover state for '{}' after {} attempts",
163 operation_name, max_attempts
164 ),
165 })
166}