AirLibrary/Indexing/Store/
QueryIndex.rs

1//! # QueryIndex
2//!
3//! ## File: Indexing/Store/QueryIndex.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides index query functionality for the File Indexer service,
8//! handling search operations across indexed files with multiple search modes.
9//!
10//! ## Primary Responsibility
11//!
12//! Query the file index to find symbols and content matching search criteria,
13//! supporting literal, regex, fuzzy, and exact search modes.
14//!
15//! ## Secondary Responsibilities
16//!
17//! - Multi-mode search (literal, regex, fuzzy, exact)
18//! - Case sensitivity and whole word matching
19//! - Path and language filtering
20//! - Result pagination and ranking
21//! - Search query sanitization
22//!
23//! ## Dependencies
24//!
25//! **External Crates:**
26//! - `regex` - Regular expression search patterns
27//! - `tokio` - Async file I/O operations
28//!
29//! **Internal Modules:**
30//! - `crate::Result` - Error handling type
31//! - `crate::AirError` - Error types
32//! - `super::super::FileIndex` - Index structure definitions
33//!
34//! ## Dependents
35//!
36//! - `Indexing::mod::FileIndexer` - Main file indexer implementation
37//!
38//! ## VSCode Pattern Reference
39//!
40//! Inspired by VSCode's search functionality in
41//! `src/vs/workbench/services/search/common/`
42//!
43//! ## Security Considerations
44//!
45//! - Search query sanitization prevents injection
46//! - Query length limits
47//! - Control character filtering
48//!
49//! ## Performance Considerations
50//!
51//! - Content index for fast token lookup
52//! - Result pagination to limit memory usage
53//! - Efficient string matching algorithms
54//! - Fuzzy search with configurable distance
55//!
56//! ## Error Handling Strategy
57//!
58//! Query operations return detailed error messages for invalid queries
59//! or search failures, treating individual file read errors as warnings.
60//!
61//! ## Thread Safety
62//!
63//! Query operations read from shared Arc<RwLock<>> state and
64//! return safe-ownership results for the caller.
65
66use std::path::PathBuf;
67
68use regex::Regex;
69
70use crate::{AirError, Result};
71// Use the full paths to types in State::CreateState
72use crate::Indexing::State::CreateState::{FileIndex, FileMetadata};
73
74/// Maximum search results per query (pagination default)
75pub const MAX_SEARCH_RESULTS_DEFAULT:u32 = 100;
76
77/// Search query with multiple modes
78#[derive(Debug, Clone)]
79pub struct SearchQuery {
80	/// Search text
81	pub query:String,
82	/// Query mode (regex, literal, fuzzy)
83	pub mode:SearchMode,
84	/// Case sensitive search
85	pub case_sensitive:bool,
86	/// Exact word match
87	pub whole_word:bool,
88	/// Regex pattern (only for regex mode)
89	pub regex:Option<Regex>,
90	/// Maximum results per page
91	pub max_results:u32,
92	/// Page number for pagination
93	pub page:u32,
94}
95
96/// Search mode
97#[derive(Debug, Clone, PartialEq)]
98pub enum SearchMode {
99	/// Literal text search
100	Literal,
101	/// Regular expression search
102	Regex,
103	/// Fuzzy search with typo tolerance
104	Fuzzy,
105	/// Exact match
106	Exact,
107}
108
109/// Search result with relevance scoring
110#[derive(Debug, Clone)]
111pub struct SearchResult {
112	/// File path
113	pub path:String,
114	/// File name
115	pub file_name:String,
116	/// Matched lines with context
117	pub matches:Vec<SearchMatch>,
118	/// Relevance score (higher = more relevant)
119	pub relevance:f64,
120	/// Matched language (if applicable)
121	pub language:Option<String>,
122}
123
124/// Search match with full context
125#[derive(Debug, Clone)]
126pub struct SearchMatch {
127	/// Line number (1-indexed)
128	pub line_number:u32,
129	/// Line content
130	pub line_content:String,
131	/// Match start position
132	pub match_start:usize,
133	/// Match end position
134	pub match_end:usize,
135	/// Lines before match for context
136	pub context_before:Vec<String>,
137	/// Lines after match for context
138	pub context_after:Vec<String>,
139}
140
141/// Paginated search results
142#[derive(Debug, Clone)]
143pub struct PaginatedSearchResults {
144	/// Current page of results
145	pub results:Vec<SearchResult>,
146	/// Total number of results (across all pages)
147	pub total_count:u32,
148	/// Current page number (0-indexed)
149	pub page:u32,
150	/// Number of pages
151	pub total_pages:u32,
152	/// Results per page
153	pub page_size:u32,
154}
155
156impl IntoIterator for PaginatedSearchResults {
157	type Item = SearchResult;
158	type IntoIter = std::vec::IntoIter<SearchResult>;
159
160	fn into_iter(self) -> Self::IntoIter { self.results.into_iter() }
161}
162
163impl<'a> IntoIterator for &'a PaginatedSearchResults {
164	type Item = &'a SearchResult;
165	type IntoIter = std::slice::Iter<'a, SearchResult>;
166
167	fn into_iter(self) -> Self::IntoIter { self.results.iter() }
168}
169
170/// Search files with multiple modes and comprehensive query handling
171///
172/// Features:
173/// - Sanitized search query
174/// - Multiple search modes (literal, regex, fuzzy, exact)
175/// - Case sensitivity option
176/// - Whole word matching
177/// - Path filtering
178/// - Result pagination
179/// - Relevance-based ranking
180/// - Language filtering
181pub async fn QueryIndexSearch(
182	index:&FileIndex,
183	query:SearchQuery,
184	path_filter:Option<String>,
185	language_filter:Option<String>,
186) -> Result<PaginatedSearchResults> {
187	log::info!("[QueryIndex] Searching for: '{}' (mode: {:?})", query.query, query.mode);
188
189	// Sanitize search query
190	let sanitized_query = SanitizeSearchQuery(&query.query)?;
191
192	// Build search parameters
193	let case_sensitive = query.case_sensitive;
194	let whole_word = query.whole_word;
195	let max_results = if query.max_results == 0 {
196		MAX_SEARCH_RESULTS_DEFAULT
197	} else {
198		query.max_results.min(1000) // Cap at 1000 results
199	};
200
201	let mut all_results = Vec::new();
202
203	// Search based on mode
204	match query.mode {
205		SearchMode::Literal => {
206			QueryIndexLiteral(
207				&sanitized_query,
208				case_sensitive,
209				whole_word,
210				path_filter.as_deref(),
211				language_filter.as_deref(),
212				index,
213				&mut all_results,
214			)
215			.await;
216		},
217		SearchMode::Regex => {
218			if let Some(regex) = &query.regex {
219				QueryIndexRegex(
220					regex,
221					path_filter.as_deref(),
222					language_filter.as_deref(),
223					index,
224					&mut all_results,
225				)
226				.await;
227			} else {
228				// Try to compile regex from query
229				if let Ok(regex) = Regex::new(&sanitized_query) {
230					QueryIndexRegex(
231						&regex,
232						path_filter.as_deref(),
233						language_filter.as_deref(),
234						index,
235						&mut all_results,
236					)
237					.await;
238				}
239			}
240		},
241		SearchMode::Fuzzy => {
242			QueryIndexFuzzy(
243				&sanitized_query,
244				case_sensitive,
245				path_filter.as_deref(),
246				language_filter.as_deref(),
247				index,
248				&mut all_results,
249			)
250			.await;
251		},
252		SearchMode::Exact => {
253			QueryIndexExact(
254				&sanitized_query,
255				case_sensitive,
256				path_filter.as_deref(),
257				language_filter.as_deref(),
258				index,
259				&mut all_results,
260			)
261			.await;
262		},
263	}
264
265	// Rank results by relevance
266	all_results.sort_by(|a, b| b.relevance.partial_cmp(&a.relevance).unwrap());
267
268	// Calculate pagination
269	let total_count = all_results.len() as u32;
270	let total_pages = if max_results == 0 { 0 } else { total_count.div_ceil(max_results) };
271	let page = query.page.min(total_pages.saturating_sub(1));
272
273	// Extract current page
274	let start = (page * max_results) as usize;
275	let end = ((page + 1) * max_results).min(total_count) as usize;
276	let page_results = all_results[start..end].to_vec();
277
278	log::info!(
279		"[QueryIndex] Search completed: {} total results, page {} of {}",
280		total_count,
281		page + 1,
282		total_pages
283	);
284
285	Ok(PaginatedSearchResults { results:page_results, total_count, page, total_pages, page_size:max_results })
286}
287
288/// Sanitize search query to prevent injection and invalid patterns
289pub fn SanitizeSearchQuery(query:&str) -> Result<String> {
290	// Remove null bytes and control characters
291	let sanitized:String = query.chars().filter(|c| *c != '\0' && !c.is_control()).collect();
292
293	// Limit query length
294	if sanitized.len() > 1000 {
295		return Err(AirError::Validation(
296			"Search query exceeds maximum length of 1000 characters".to_string(),
297		));
298	}
299
300	Ok(sanitized)
301}
302
303/// Literal search (default mode)
304async fn QueryIndexLiteral(
305	query:&str,
306	case_sensitive:bool,
307	whole_word:bool,
308	path_filter:Option<&str>,
309	language_filter:Option<&str>,
310	index:&FileIndex,
311	results:&mut Vec<SearchResult>,
312) {
313	let search_query = if case_sensitive { query.to_string() } else { query.to_lowercase() };
314
315	// Search in content index first (faster)
316	if let Some(file_paths) = index.content_index.get(&search_query.to_lowercase()) {
317		for file_path in file_paths {
318			if let Some(metadata) = index.files.get(file_path) {
319				if MatchesFilters(file_path, metadata, path_filter, language_filter) {
320					if let Ok(search_result) =
321						FindMatchesInFile(file_path, &search_query, case_sensitive, whole_word, index).await
322					{
323						if !search_result.matches.is_empty() {
324							results.push(search_result);
325						}
326					}
327				}
328			}
329		}
330	}
331
332	// Also search in file names
333	for (file_path, metadata) in &index.files {
334		if results.len() >= 1000 {
335			break;
336		}
337
338		let file_name = file_path.file_name().unwrap_or_default().to_string_lossy().to_string();
339
340		let name_to_search = if case_sensitive { file_name.clone() } else { file_name.to_lowercase() };
341
342		if name_to_search.contains(&search_query) {
343			if MatchesFilters(file_path, metadata, path_filter, language_filter) {
344				// Filename match has lower relevance than content match
345				results.push(SearchResult {
346					path:file_path.to_string_lossy().to_string(),
347					file_name,
348					matches:Vec::new(),
349					relevance:0.3,
350					language:metadata.language.clone(),
351				});
352			}
353		}
354	}
355}
356
357/// Regex search mode
358async fn QueryIndexRegex(
359	regex:&Regex,
360	path_filter:Option<&str>,
361	language_filter:Option<&str>,
362	index:&FileIndex,
363	results:&mut Vec<SearchResult>,
364) {
365	for (file_path, metadata) in &index.files {
366		if results.len() >= 1000 {
367			break;
368		}
369
370		if !MatchesFilters(file_path, metadata, path_filter, language_filter) {
371			continue;
372		}
373
374		if let Ok(content) = tokio::fs::read_to_string(file_path).await {
375			let matches = FindRegexMatches(&content, regex);
376
377			if !matches.is_empty() {
378				let relevance = CalculateRelevance(&matches, metadata);
379
380				results.push(SearchResult {
381					path:file_path.to_string_lossy().to_string(),
382					file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
383					matches,
384					relevance,
385					language:metadata.language.clone(),
386				});
387			}
388		}
389	}
390}
391
392/// Fuzzy search with typo tolerance using Levenshtein distance
393async fn QueryIndexFuzzy(
394	query:&str,
395	case_sensitive:bool,
396	path_filter:Option<&str>,
397	language_filter:Option<&str>,
398	index:&FileIndex,
399	results:&mut Vec<SearchResult>,
400) {
401	let query_lower = query.to_lowercase();
402
403	for (file_path, metadata) in &index.files {
404		if results.len() >= 1000 {
405			break;
406		}
407
408		if !MatchesFilters(file_path, metadata, path_filter, language_filter) {
409			continue;
410		}
411
412		if let Ok(content) = tokio::fs::read_to_string(file_path).await {
413			let max_distance = 2; // TODO: Make this configurable
414			let matches = FindFuzzyMatches(&content, &query_lower, case_sensitive, max_distance);
415
416			if !matches.is_empty() {
417				let relevance = CalculateRelevance(&matches, metadata) * 0.8; // Fuzzy matches have lower relevance
418
419				results.push(SearchResult {
420					path:file_path.to_string_lossy().to_string(),
421					file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
422					matches,
423					relevance,
424					language:metadata.language.clone(),
425				});
426			}
427		}
428	}
429}
430
431/// Exact match search (whole word, case-sensitive)
432async fn QueryIndexExact(
433	query:&str,
434	_case_sensitive:bool,
435	path_filter:Option<&str>,
436	language_filter:Option<&str>,
437	index:&FileIndex,
438	results:&mut Vec<SearchResult>,
439) {
440	for (file_path, metadata) in &index.files {
441		if results.len() >= 1000 {
442			break;
443		}
444
445		if !MatchesFilters(file_path, metadata, path_filter, language_filter) {
446			continue;
447		}
448
449		if let Ok(content) = tokio::fs::read_to_string(file_path).await {
450			let matches = FindExactMatches(&content, query);
451
452			if !matches.is_empty() {
453				let relevance = CalculateRelevance(&matches, metadata) * 1.1; // Exact matches have higher relevance
454
455				results.push(SearchResult {
456					path:file_path.to_string_lossy().to_string(),
457					file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
458					matches,
459					relevance,
460					language:metadata.language.clone(),
461				});
462			}
463		}
464	}
465}
466
467/// Find matches in a single file with context
468async fn FindMatchesInFile(
469	file_path:&PathBuf,
470	query:&str,
471	case_sensitive:bool,
472	whole_word:bool,
473	index:&FileIndex,
474) -> Result<SearchResult> {
475	let content = tokio::fs::read_to_string(file_path)
476		.await
477		.map_err(|e| AirError::FileSystem(format!("Failed to read file: {}", e)))?;
478
479	let metadata = index
480		.files
481		.get(file_path)
482		.ok_or_else(|| AirError::Internal("File metadata not found in index".to_string()))?;
483
484	let matches = FindMatchesWithContext(&content, query, case_sensitive, whole_word);
485	let relevance = CalculateRelevance(&matches, metadata);
486
487	Ok(SearchResult {
488		path:file_path.to_string_lossy().to_string(),
489		file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
490		matches,
491		relevance,
492		language:metadata.language.clone(),
493	})
494}
495
496/// Find matches in content with surrounding context
497fn FindMatchesWithContext(content:&str, query:&str, case_sensitive:bool, whole_word:bool) -> Vec<SearchMatch> {
498	let mut matches = Vec::new();
499	let lines:Vec<&str> = content.lines().collect();
500
501	let search_in = |line:&str| -> Option<(usize, usize)> {
502		let line_to_search = if case_sensitive { line.to_string() } else { line.to_lowercase() };
503
504		let query_to_find = if case_sensitive { query.to_string() } else { query.to_lowercase() };
505
506		let start = if whole_word {
507			FindWholeWordMatch(&line_to_search, &query_to_find)
508		} else {
509			line_to_search.find(&query_to_find)
510		};
511
512		start.map(|s| (s, s + query.len()))
513	};
514
515	for (line_idx, line) in lines.iter().enumerate() {
516		let line_number = line_idx as u32 + 1;
517
518		if let Some((match_start, match_end)) = search_in(line) {
519			// Get context lines (2 before, 2 after)
520			let context_start = line_idx.saturating_sub(2);
521			let context_end = (line_idx + 3).min(lines.len());
522
523			let context_before = lines[context_start..line_idx].iter().map(|s| s.to_string()).collect();
524
525			let context_after = lines[line_idx + 1..context_end].iter().map(|s| s.to_string()).collect();
526
527			matches.push(SearchMatch {
528				line_number,
529				line_content:line.to_string(),
530				match_start,
531				match_end,
532				context_before,
533				context_after,
534			});
535		}
536	}
537
538	matches
539}
540
541/// Find whole word match with word boundary detection
542fn FindWholeWordMatch(line:&str, word:&str) -> Option<usize> {
543	let mut start = 0;
544
545	while let Some(pos) = line[start..].find(word) {
546		let actual_pos = start + pos;
547
548		// Check word boundary before
549		let valid_before = actual_pos == 0
550			|| line
551				.chars()
552				.nth(actual_pos - 1)
553				.map_or(true, |c| !c.is_alphanumeric() && c != '_');
554
555		// Check word boundary after
556		let match_end = actual_pos + word.len();
557		let valid_after =
558			match_end == line.len() || line.chars().nth(match_end).map_or(true, |c| !c.is_alphanumeric() && c != '_');
559
560		if valid_before && valid_after {
561			return Some(actual_pos);
562		}
563
564		start = actual_pos + 1;
565	}
566
567	None
568}
569
570/// Find regex matches in content
571fn FindRegexMatches(content:&str, regex:&Regex) -> Vec<SearchMatch> {
572	let mut matches = Vec::new();
573	let lines:Vec<&str> = content.lines().collect();
574
575	for (line_idx, line) in lines.iter().enumerate() {
576		let line_number = line_idx as u32 + 1;
577
578		for mat in regex.find_iter(line) {
579			matches.push(SearchMatch {
580				line_number,
581				line_content:line.to_string(),
582				match_start:mat.start(),
583				match_end:mat.end(),
584				context_before:Vec::new(),
585				context_after:Vec::new(),
586			});
587		}
588	}
589
590	matches
591}
592
593/// Find fuzzy matches using Levenshtein distance algorithm
594fn FindFuzzyMatches(content:&str, query:&str, case_sensitive:bool, max_distance:usize) -> Vec<SearchMatch> {
595	let mut matches = Vec::new();
596	let lines:Vec<&str> = content.lines().collect();
597
598	for (line_idx, line) in lines.iter().enumerate() {
599		let line_number = line_idx as u32 + 1;
600		let line_to_search = if case_sensitive { line.to_string() } else { line.to_lowercase() };
601
602		// Calculate Levenshtein distance for fuzzy matching
603		if let Some(pos) = line_to_search.find(query) {
604			// Check if the match is within the MaxDistance threshold
605			let distance = CalculateLevenshteinDistance(&line_to_search[pos..pos.saturating_add(query.len())], query);
606
607			if distance <= max_distance {
608				matches.push(SearchMatch {
609					line_number,
610					line_content:line.to_string(),
611					match_start:pos,
612					match_end:pos + query.len(),
613					context_before:Vec::new(),
614					context_after:Vec::new(),
615				});
616			}
617		}
618	}
619
620	matches
621}
622
623/// Find exact matches (word boundary and case-sensitive)
624fn FindExactMatches(content:&str, query:&str) -> Vec<SearchMatch> { FindMatchesWithContext(content, query, true, true) }
625
626/// Calculate Levenshtein distance between two strings
627fn CalculateLevenshteinDistance(s1:&str, s2:&str) -> usize {
628	let s1_chars:Vec<char> = s1.chars().collect();
629	let s2_chars:Vec<char> = s2.chars().collect();
630	let len1 = s1_chars.len();
631	let len2 = s2_chars.len();
632
633	// Create a 2D matrix to store distances
634	let mut dp = vec![vec![0usize; len2 + 1]; len1 + 1];
635
636	// Initialize the matrix
637	for i in 0..=len1 {
638		dp[i][0] = i;
639	}
640	for j in 0..=len2 {
641		dp[0][j] = j;
642	}
643
644	// Calculate distances
645	for i in 1..=len1 {
646		for j in 1..=len2 {
647			if s1_chars[i - 1] == s2_chars[j - 1] {
648				dp[i][j] = dp[i - 1][j - 1];
649			} else {
650				dp[i][j] = 1 + [
651					dp[i - 1][j],     // deletion
652					dp[i][j - 1],     // insertion
653					dp[i - 1][j - 1], // substitution
654				]
655				.into_iter()
656				.min()
657				.unwrap();
658			}
659		}
660	}
661
662	dp[len1][len2]
663}
664
665/// Calculate relevance score for search results
666fn CalculateRelevance(matches:&[SearchMatch], metadata:&FileMetadata) -> f64 {
667	let match_count = matches.len();
668	let line_count = metadata.line_count.unwrap_or(1) as f64;
669
670	// Base relevance: ratio of matching lines to total lines
671	let mut relevance = (match_count as f64 / line_count) * 10.0;
672
673	// Bonus for more matches
674	if match_count > 0 {
675		relevance += (match_count as f64).log10() * 0.5;
676	}
677
678	// Bonus for recently modified files
679	let days_old = (chrono::Utc::now() - metadata.modified).num_days() as f64;
680	relevance += 1.0 / (days_old + 1.0).max(1.0);
681
682	relevance.min(10.0).max(0.0)
683}
684
685/// Check if file matches filters
686pub fn MatchesFilters(
687	file_path:&PathBuf,
688	metadata:&FileMetadata,
689	path_filter:Option<&str>,
690	language_filter:Option<&str>,
691) -> bool {
692	// Check path filter
693	if let Some(search_path) = path_filter {
694		if !file_path.to_string_lossy().contains(search_path) {
695			return false;
696		}
697	}
698
699	// Check language filter
700	if let Some(lang) = language_filter {
701		if metadata.language.as_deref() != Some(lang) {
702			return false;
703		}
704	}
705
706	true
707}