AirLibrary/Indexing/Scan/
ScanFile.rs

1//! # ScanFile
2//!
3//! ## File: Indexing/Scan/ScanFile.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides individual file scanning functionality for the File Indexer
8//! service, handling reading, metadata extraction, and categorization of files
9//! for indexing.
10//!
11//! ## Primary Responsibility
12//!
13//! Scan individual files to extract metadata, content, and prepare them for
14//! indexing operations.
15//!
16//! ## Secondary Responsibilities
17//!
18//! - File access validation and permission checking
19//! - Encoding detection for text files
20//! - Language detection for code files
21//! - File size validation
22//! - Symbolic link detection
23//!
24//! ## Dependencies
25//!
26//! **External Crates:**
27//! - `tokio` - Async file I/O operations
28//! - `sha2` - Checksum calculation for file integrity
29//!
30//! **Internal Modules:**
31//! - `crate::Result` - Error handling type
32//! - `crate::AirError` - Error types
33//! - `crate::Configuration::IndexingConfig` - Indexing configuration
34//! - `super::super::State::CreateState` - State structure definitions
35//! - `super::Process::ProcessContent` - Content processing operations
36//!
37//! ## Dependents
38//!
39//! - `Indexing::Scan::ScanDirectory` - Batch file processing
40//! - `Indexing::Watch::WatchFile` - Individual file change handling
41//! - `Indexing::mod::FileIndexer` - Main file indexer implementation
42//!
43//! ## VSCode Pattern Reference
44//!
45//! Inspired by VSCode's file scanning in
46//! `src/vs/workbench/services/files/`
47//!
48//! ## Security Considerations
49//!
50//! - Path canonicalization before access
51//! - File size limits enforced
52//! - Timeout protection for I/O operations
53//! - Permission checking before reads
54//!
55//! ## Performance Considerations
56//!
57//! - Asynchronous file reading
58//! - Batch processing operations
59//! - Memory-efficient streaming for large files
60//! - Cached metadata when available
61//!
62//! ## Error Handling Strategy
63//!
64//! File scanning returns Results with detailed error messages about
65//! why a file cannot be scanned or accessed. Errors are logged and
66//! individual file failures don't halt batch operations.
67//!
68//! ## Thread Safety
69//!
70//! File scanning operations are designed for parallel execution and
//! produce results that can be safely merged into shared state.
72use std::{
73	path::PathBuf,
74	time::{Duration, Instant},
75};
76
77use crate::{
78	AirError,
79	Configuration::IndexingConfig,
80	Indexing::{
81		Process::{
82			ProcessContent::{DetectEncoding, DetectLanguage, DetectMimeType},
83			ExtractSymbols::ExtractSymbols,
84		},
85		State::CreateState::{FileMetadata, SymbolInfo},
86	},
87	Result,
88};
89
90/// Index a single file internally with comprehensive validation
91///
92/// This function is called by parallel tasks during directory scanning
93/// and includes:
94/// - File metadata extraction
95/// - Size validation
96/// - SHA-256 checksum calculation
97/// - Encoding detection
98/// - MIME type detection
99/// - Language detection
100/// - Symbol extraction for code files
101pub async fn IndexFileInternal(
102	file_path:&PathBuf,
103	config:&IndexingConfig,
104	_patterns:&[String],
105) -> Result<(FileMetadata, Vec<SymbolInfo>)> {
106	let start_time = Instant::now();
107
108	// Get file metadata with error handling
109	let metadata = std::fs::metadata(file_path)
110		.map_err(|e| AirError::FileSystem(format!("Failed to get file metadata: {}", e)))?;
111
112	// Get modified time
113	let modified = metadata
114		.modified()
115		.map_err(|e| AirError::FileSystem(format!("Failed to get modification time: {}", e)))?;
116
117	let modified_time = chrono::DateTime::<chrono::Utc>::from(modified);
118
119	// Check if file size exceeds limit
120	let file_size = metadata.len();
121	if file_size > config.MaxFileSizeMb as u64 * 1024 * 1024 {
122		return Err(AirError::FileSystem(format!(
123			"File size {} exceeds limit {} MB",
124			file_size, config.MaxFileSizeMb
125		)));
126	}
127
128	// File read with timeout protection
129	let content = tokio::time::timeout(Duration::from_secs(30), tokio::fs::read(file_path))
130		.await
131		.map_err(|_| AirError::FileSystem(format!("Timeout reading file: {} (30s limit)", file_path.display())))?
132		.map_err(|e| AirError::FileSystem(format!("Failed to read file: {}", e)))?;
133
134	// Check for symbolic link
135	let is_symlink = std::fs::symlink_metadata(file_path)
136		.map(|m| m.file_type().is_symlink())
137		.unwrap_or(false);
138
139	// Calculate SHA-256 checksum
140	let checksum = CalculateChecksum(&content);
141
142	// Detect file encoding
143	let encoding = DetectEncoding(&content);
144
145	// Detect MIME type
146	let mime_type = DetectMimeType(file_path, &content);
147
148	// Detect programming language
149	let language = DetectLanguage(file_path);
150
151	// Count lines for text files
152	let line_count = if mime_type.starts_with("text/") {
153		Some(content.iter().filter(|&&b| b == b'\n').count() as u32 + 1)
154	} else {
155		None
156	};
157
158	// Extract symbols from code for VSCode Outline View
159	let symbols = if let Some(lang) = &language {
160		ExtractSymbols(file_path, &content, lang).await?
161	} else {
162		Vec::new()
163	};
164
165	let permissions = GetPermissionsString(&metadata);
166
167	let elapsed = start_time.elapsed();
168
169	log::trace!(
170		"[ScanFile] Indexed {} in {}ms ({} symbols)",
171		file_path.display(),
172		elapsed.as_millis(),
173		symbols.len()
174	);
175
176	Ok((
177		FileMetadata {
178			path:file_path.clone(),
179			size:file_size,
180			modified:modified_time,
181			mime_type,
182			language,
183			line_count,
184			checksum,
185			is_symlink,
186			permissions,
187			encoding,
188			indexed_at:chrono::Utc::now(),
189			symbol_count:symbols.len() as u32,
190		},
191		symbols,
192	))
193}
194
195/// Validate file access and permissions before scanning
196pub async fn ValidateFileAccess(file_path:&PathBuf) -> bool {
197	tokio::task::spawn_blocking({
198		let file_path = file_path.to_path_buf();
199		move || {
200			// Try to read file metadata
201			let can_access = std::fs::metadata(&file_path).is_ok();
202			if can_access {
203				// Try to open file for reading
204				std::fs::File::open(&file_path).is_ok()
205			} else {
206				false
207			}
208		}
209	})
210	.await
211	.unwrap_or(false)
212}
213
214/// Calculate SHA-256 checksum for file content
215pub fn CalculateChecksum(content:&[u8]) -> String {
216	use sha2::{Digest, Sha256};
217	let mut hasher = Sha256::new();
218	hasher.update(content);
219	format!("{:x}", hasher.finalize())
220}
221
/// Render the owner/group/other permission bits of `metadata` as a
/// nine-character string such as `rwxr-xr--`.
///
/// Each position holds `r`, `w` or `x` when the corresponding mode bit is set
/// and `-` otherwise. Mode bits outside the nine permission bits are ignored.
#[cfg(unix)]
pub fn GetPermissionsString(metadata:&std::fs::Metadata) -> String {
	use std::os::unix::fs::PermissionsExt;

	// (mask, symbol) pairs in display order: owner, group, other.
	const PERMISSION_BITS:[(u32, char); 9] = [
		(0o400, 'r'),
		(0o200, 'w'),
		(0o100, 'x'),
		(0o040, 'r'),
		(0o020, 'w'),
		(0o010, 'x'),
		(0o004, 'r'),
		(0o002, 'w'),
		(0o001, 'x'),
	];

	let mode_bits = metadata.permissions().mode();

	PERMISSION_BITS
		.iter()
		.map(|&(mask, symbol)| if mode_bits & mask != 0 { symbol } else { '-' })
		.collect()
}
244
/// Get file permissions as a string on non-Unix systems.
///
/// Only the read-only flag is available from the standard library on these
/// targets, so the result is one of two fixed nine-character strings, matching
/// the width of the Unix rendering:
/// - `r--r--r--` when the file is marked read-only
/// - `rw-rw-rw-` otherwise
#[cfg(not(unix))]
pub fn GetPermissionsString(metadata:&std::fs::Metadata) -> String {
	let rendered = if metadata.permissions().readonly() { "r--r--r--" } else { "rw-rw-rw-" };

	rendered.to_string()
}
248
249/// Scan file and return just the metadata (without symbols)
250pub async fn ScanFileMetadata(file_path:&PathBuf) -> Result<FileMetadata> {
251	let metadata = std::fs::metadata(file_path)
252		.map_err(|e| AirError::FileSystem(format!("Failed to get file metadata: {}", e)))?;
253
254	let modified = metadata
255		.modified()
256		.map_err(|e| AirError::FileSystem(format!("Failed to get modification time: {}", e)))?;
257
258	let modified_time = chrono::DateTime::<chrono::Utc>::from(modified);
259
260	Ok(FileMetadata {
261		path:file_path.clone(),
262		size:metadata.len(),
263		modified:modified_time,
264		mime_type:"application/octet-stream".to_string(),
265		language:None,
266		line_count:None,
267		checksum:String::new(),
268		is_symlink:metadata.file_type().is_symlink(),
269		permissions:GetPermissionsString(&metadata),
270		encoding:None,
271		indexed_at:chrono::Utc::now(),
272		symbol_count:0,
273	})
274}
275
276/// Check if file has been modified since last indexed
277pub fn FileModifiedSince(file_path:&PathBuf, last_indexed:chrono::DateTime<chrono::Utc>) -> Result<bool> {
278	let metadata = std::fs::metadata(file_path)
279		.map_err(|e| AirError::FileSystem(format!("Failed to get file metadata: {}", e)))?;
280
281	let modified = metadata
282		.modified()
283		.map_err(|e| AirError::FileSystem(format!("Failed to get modification time: {}", e)))?;
284
285	let modified_time = chrono::DateTime::<chrono::Utc>::from(modified);
286
287	Ok(modified_time > last_indexed)
288}
289
290/// Get file size with error handling
291pub async fn GetFileSize(file_path:&PathBuf) -> Result<u64> {
292	tokio::task::spawn_blocking({
293		let file_path = file_path.to_path_buf();
294		move || {
295			let metadata = std::fs::metadata(&file_path)
296				.map_err(|e| AirError::FileSystem(format!("Failed to get file metadata: {}", e)))?;
297			Ok(metadata.len())
298		}
299	})
300	.await?
301}
302
303/// Check if file is text-based (likely to be code or documentation)
304pub fn IsTextFile(metadata:&FileMetadata) -> bool {
305	metadata.mime_type.starts_with("text/")
306		|| metadata.mime_type.contains("json")
307		|| metadata.mime_type.contains("xml")
308		|| metadata.mime_type.contains("yaml")
309		|| metadata.mime_type.contains("toml")
310		|| metadata.language.is_some()
311}
312
313/// Check if file is binary (not suitable for indexing)
314pub fn IsBinaryFile(metadata:&FileMetadata) -> bool {
315	!IsTextFile(metadata)
316		|| metadata.mime_type == "application/octet-stream"
317		|| metadata.mime_type == "application/zip"
318		|| metadata.mime_type == "application/x-tar"
319		|| metadata.mime_type == "application/x-gzip"
320		|| metadata.mime_type == "application/x-bzip2"
321}