Skip to main content

AirLibrary/Indexing/Store/
QueryIndex.rs

1//! # QueryIndex
2//!
3//! ## File: Indexing/Store/QueryIndex.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides index query functionality for the File Indexer service,
8//! handling search operations across indexed files with multiple search modes.
9//!
10//! ## Primary Responsibility
11//!
12//! Query the file index to find symbols and content matching search criteria,
13//! supporting literal, regex, fuzzy, and exact search modes.
14//!
15//! ## Secondary Responsibilities
16//!
17//! - Multi-mode search (literal, regex, fuzzy, exact)
18//! - Case sensitivity and whole word matching
19//! - Path and language filtering
20//! - Result pagination and ranking
21//! - Search query sanitization
22//!
23//! ## Dependencies
24//!
25//! **External Crates:**
26//! - `regex` - Regular expression search patterns
27//! - `tokio` - Async file I/O operations
28//!
29//! **Internal Modules:**
30//! - `crate::Result` - Error handling type
31//! - `crate::AirError` - Error types
32//! - `super::super::FileIndex` - Index structure definitions
33//!
34//! ## Dependents
35//!
36//! - `Indexing::mod::FileIndexer` - Main file indexer implementation
37//!
38//! ## VSCode Pattern Reference
39//!
40//! Inspired by VSCode's search functionality in
41//! `src/vs/workbench/services/search/common/`
42//!
43//! ## Security Considerations
44//!
45//! - Search query sanitization prevents injection
46//! - Query length limits
47//! - Control character filtering
48//!
49//! ## Performance Considerations
50//!
51//! - Content index for fast token lookup
52//! - Result pagination to limit memory usage
53//! - Efficient string matching algorithms
54//! - Fuzzy search with configurable distance
55//!
56//! ## Error Handling Strategy
57//!
58//! Query operations return detailed error messages for invalid queries
59//! or search failures, treating individual file read errors as warnings.
60//!
61//! ## Thread Safety
62//!
63//! Query operations read from shared Arc<RwLock<>> state and
64//! return safe-ownership results for the caller.
65
66use std::path::PathBuf;
67
68use regex::Regex;
69
70use crate::{AirError, Result, dev_log};
71// Use the full paths to types in State::CreateState
72use crate::Indexing::State::CreateState::{FileIndex, FileMetadata};
73
74/// Maximum search results per query (pagination default)
75pub const MAX_SEARCH_RESULTS_DEFAULT:u32 = 100;
76
77/// Search query with multiple modes
78#[derive(Debug, Clone)]
79pub struct SearchQuery {
80	/// Search text
81	pub query:String,
82
83	/// Query mode (regex, literal, fuzzy)
84	pub mode:SearchMode,
85
86	/// Case sensitive search
87	pub case_sensitive:bool,
88
89	/// Exact word match
90	pub whole_word:bool,
91
92	/// Regex pattern (only for regex mode)
93	pub regex:Option<Regex>,
94
95	/// Maximum results per page
96	pub max_results:u32,
97
98	/// Page number for pagination
99	pub page:u32,
100}
101
102/// Search mode
103#[derive(Debug, Clone, PartialEq)]
104pub enum SearchMode {
105	/// Literal text search
106	Literal,
107
108	/// Regular expression search
109	Regex,
110
111	/// Fuzzy search with typo tolerance
112	Fuzzy,
113
114	/// Exact match
115	Exact,
116}
117
118/// Search result with relevance scoring
119#[derive(Debug, Clone)]
120pub struct SearchResult {
121	/// File path
122	pub path:String,
123
124	/// File name
125	pub file_name:String,
126
127	/// Matched lines with context
128	pub matches:Vec<SearchMatch>,
129
130	/// Relevance score (higher = more relevant)
131	pub relevance:f64,
132
133	/// Matched language (if applicable)
134	pub language:Option<String>,
135}
136
137/// Search match with full context
138#[derive(Debug, Clone)]
139pub struct SearchMatch {
140	/// Line number (1-indexed)
141	pub line_number:u32,
142
143	/// Line content
144	pub line_content:String,
145
146	/// Match start position
147	pub match_start:usize,
148
149	/// Match end position
150	pub match_end:usize,
151
152	/// Lines before match for context
153	pub context_before:Vec<String>,
154
155	/// Lines after match for context
156	pub context_after:Vec<String>,
157}
158
159/// Paginated search results
160#[derive(Debug, Clone)]
161pub struct PaginatedSearchResults {
162	/// Current page of results
163	pub results:Vec<SearchResult>,
164
165	/// Total number of results (across all pages)
166	pub total_count:u32,
167
168	/// Current page number (0-indexed)
169	pub page:u32,
170
171	/// Number of pages
172	pub total_pages:u32,
173
174	/// Results per page
175	pub page_size:u32,
176}
177
178impl IntoIterator for PaginatedSearchResults {
179	type Item = SearchResult;
180
181	type IntoIter = std::vec::IntoIter<SearchResult>;
182
183	fn into_iter(self) -> Self::IntoIter { self.results.into_iter() }
184}
185
186impl<'a> IntoIterator for &'a PaginatedSearchResults {
187	type Item = &'a SearchResult;
188
189	type IntoIter = std::slice::Iter<'a, SearchResult>;
190
191	fn into_iter(self) -> Self::IntoIter { self.results.iter() }
192}
193
194/// Search files with multiple modes and comprehensive query handling
195///
196/// Features:
197/// - Sanitized search query
198/// - Multiple search modes (literal, regex, fuzzy, exact)
199/// - Case sensitivity option
200/// - Whole word matching
201/// - Path filtering
202/// - Result pagination
203/// - Relevance-based ranking
204/// - Language filtering
205pub async fn QueryIndexSearch(
206	index:&FileIndex,
207
208	query:SearchQuery,
209
210	path_filter:Option<String>,
211
212	language_filter:Option<String>,
213) -> Result<PaginatedSearchResults> {
214	dev_log!(
215		"indexing",
216		"[QueryIndex] Searching for: '{}' (mode: {:?})",
217		query.query,
218		query.mode
219	);
220
221	// Sanitize search query
222	let sanitized_query = SanitizeSearchQuery(&query.query)?;
223
224	// Build search parameters
225	let case_sensitive = query.case_sensitive;
226
227	let whole_word = query.whole_word;
228
229	let max_results = if query.max_results == 0 {
230		MAX_SEARCH_RESULTS_DEFAULT
231	} else {
232		query.max_results.min(1000) // Cap at 1000 results
233	};
234
235	let mut all_results = Vec::new();
236
237	// Search based on mode
238	match query.mode {
239		SearchMode::Literal => {
240			QueryIndexLiteral(
241				&sanitized_query,
242				case_sensitive,
243				whole_word,
244				path_filter.as_deref(),
245				language_filter.as_deref(),
246				index,
247				&mut all_results,
248			)
249			.await;
250		},
251
252		SearchMode::Regex => {
253			if let Some(regex) = &query.regex {
254				QueryIndexRegex(
255					regex,
256					path_filter.as_deref(),
257					language_filter.as_deref(),
258					index,
259					&mut all_results,
260				)
261				.await;
262			} else {
263				// Try to compile regex from query
264				if let Ok(regex) = Regex::new(&sanitized_query) {
265					QueryIndexRegex(
266						&regex,
267						path_filter.as_deref(),
268						language_filter.as_deref(),
269						index,
270						&mut all_results,
271					)
272					.await;
273				}
274			}
275		},
276
277		SearchMode::Fuzzy => {
278			QueryIndexFuzzy(
279				&sanitized_query,
280				case_sensitive,
281				path_filter.as_deref(),
282				language_filter.as_deref(),
283				index,
284				&mut all_results,
285			)
286			.await;
287		},
288
289		SearchMode::Exact => {
290			QueryIndexExact(
291				&sanitized_query,
292				case_sensitive,
293				path_filter.as_deref(),
294				language_filter.as_deref(),
295				index,
296				&mut all_results,
297			)
298			.await;
299		},
300	}
301
302	// Rank results by relevance
303	all_results.sort_by(|a, b| b.relevance.partial_cmp(&a.relevance).unwrap());
304
305	// Calculate pagination
306	let total_count = all_results.len() as u32;
307
308	let total_pages = if max_results == 0 { 0 } else { total_count.div_ceil(max_results) };
309
310	let page = query.page.min(total_pages.saturating_sub(1));
311
312	// Extract current page
313	let start = (page * max_results) as usize;
314
315	let end = ((page + 1) * max_results).min(total_count) as usize;
316
317	let page_results = all_results[start..end].to_vec();
318
319	dev_log!(
320		"indexing",
321		"[QueryIndex] Search completed: {} total results, page {} of {}",
322		total_count,
323		page + 1,
324		total_pages
325	);
326
327	Ok(PaginatedSearchResults { results:page_results, total_count, page, total_pages, page_size:max_results })
328}
329
330/// Sanitize search query to prevent injection and invalid patterns
331pub fn SanitizeSearchQuery(query:&str) -> Result<String> {
332	// Remove null bytes and control characters
333	let sanitized:String = query.chars().filter(|c| *c != '\0' && !c.is_control()).collect();
334
335	// Limit query length
336	if sanitized.len() > 1000 {
337		return Err(AirError::Validation(
338			"Search query exceeds maximum length of 1000 characters".to_string(),
339		));
340	}
341
342	Ok(sanitized)
343}
344
345/// Literal search (default mode)
346async fn QueryIndexLiteral(
347	query:&str,
348
349	case_sensitive:bool,
350
351	whole_word:bool,
352
353	path_filter:Option<&str>,
354
355	language_filter:Option<&str>,
356
357	index:&FileIndex,
358
359	results:&mut Vec<SearchResult>,
360) {
361	let search_query = if case_sensitive { query.to_string() } else { query.to_lowercase() };
362
363	// Search in content index first (faster)
364	if let Some(file_paths) = index.content_index.get(&search_query.to_lowercase()) {
365		for file_path in file_paths {
366			if let Some(metadata) = index.files.get(file_path) {
367				if MatchesFilters(file_path, metadata, path_filter, language_filter) {
368					if let Ok(search_result) =
369						FindMatchesInFile(file_path, &search_query, case_sensitive, whole_word, index).await
370					{
371						if !search_result.matches.is_empty() {
372							results.push(search_result);
373						}
374					}
375				}
376			}
377		}
378	}
379
380	// Also search in file names
381	for (file_path, metadata) in &index.files {
382		if results.len() >= 1000 {
383			break;
384		}
385
386		let file_name = file_path.file_name().unwrap_or_default().to_string_lossy().to_string();
387
388		let name_to_search = if case_sensitive { file_name.clone() } else { file_name.to_lowercase() };
389
390		if name_to_search.contains(&search_query) {
391			if MatchesFilters(file_path, metadata, path_filter, language_filter) {
392				// Filename match has lower relevance than content match
393				results.push(SearchResult {
394					path:file_path.to_string_lossy().to_string(),
395					file_name,
396					matches:Vec::new(),
397					relevance:0.3,
398					language:metadata.language.clone(),
399				});
400			}
401		}
402	}
403}
404
405/// Regex search mode
406async fn QueryIndexRegex(
407	regex:&Regex,
408
409	path_filter:Option<&str>,
410
411	language_filter:Option<&str>,
412
413	index:&FileIndex,
414
415	results:&mut Vec<SearchResult>,
416) {
417	for (file_path, metadata) in &index.files {
418		if results.len() >= 1000 {
419			break;
420		}
421
422		if !MatchesFilters(file_path, metadata, path_filter, language_filter) {
423			continue;
424		}
425
426		if let Ok(content) = tokio::fs::read_to_string(file_path).await {
427			let matches = FindRegexMatches(&content, regex);
428
429			if !matches.is_empty() {
430				let relevance = CalculateRelevance(&matches, metadata);
431
432				results.push(SearchResult {
433					path:file_path.to_string_lossy().to_string(),
434					file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
435					matches,
436					relevance,
437					language:metadata.language.clone(),
438				});
439			}
440		}
441	}
442}
443
444/// Fuzzy search with typo tolerance using Levenshtein distance
445async fn QueryIndexFuzzy(
446	query:&str,
447
448	case_sensitive:bool,
449
450	path_filter:Option<&str>,
451
452	language_filter:Option<&str>,
453
454	index:&FileIndex,
455
456	results:&mut Vec<SearchResult>,
457) {
458	let query_lower = query.to_lowercase();
459
460	for (file_path, metadata) in &index.files {
461		if results.len() >= 1000 {
462			break;
463		}
464
465		if !MatchesFilters(file_path, metadata, path_filter, language_filter) {
466			continue;
467		}
468
469		if let Ok(content) = tokio::fs::read_to_string(file_path).await {
470			const MAX_FUZZY_DISTANCE:usize = 2;
471
472			let matches = FindFuzzyMatches(&content, &query_lower, case_sensitive, MAX_FUZZY_DISTANCE);
473
474			if !matches.is_empty() {
475				let relevance = CalculateRelevance(&matches, metadata) * 0.8; // Fuzzy matches have lower relevance
476
477				results.push(SearchResult {
478					path:file_path.to_string_lossy().to_string(),
479					file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
480					matches,
481					relevance,
482					language:metadata.language.clone(),
483				});
484			}
485		}
486	}
487}
488
489/// Exact match search (whole word, case-sensitive)
490async fn QueryIndexExact(
491	query:&str,
492
493	_case_sensitive:bool,
494
495	path_filter:Option<&str>,
496
497	language_filter:Option<&str>,
498
499	index:&FileIndex,
500
501	results:&mut Vec<SearchResult>,
502) {
503	for (file_path, metadata) in &index.files {
504		if results.len() >= 1000 {
505			break;
506		}
507
508		if !MatchesFilters(file_path, metadata, path_filter, language_filter) {
509			continue;
510		}
511
512		if let Ok(content) = tokio::fs::read_to_string(file_path).await {
513			let matches = FindExactMatches(&content, query);
514
515			if !matches.is_empty() {
516				let relevance = CalculateRelevance(&matches, metadata) * 1.1; // Exact matches have higher relevance
517
518				results.push(SearchResult {
519					path:file_path.to_string_lossy().to_string(),
520					file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
521					matches,
522					relevance,
523					language:metadata.language.clone(),
524				});
525			}
526		}
527	}
528}
529
530/// Find matches in a single file with context
531async fn FindMatchesInFile(
532	file_path:&PathBuf,
533
534	query:&str,
535
536	case_sensitive:bool,
537
538	whole_word:bool,
539
540	index:&FileIndex,
541) -> Result<SearchResult> {
542	let content = tokio::fs::read_to_string(file_path)
543		.await
544		.map_err(|e| AirError::FileSystem(format!("Failed to read file: {}", e)))?;
545
546	let metadata = index
547		.files
548		.get(file_path)
549		.ok_or_else(|| AirError::Internal("File metadata not found in index".to_string()))?;
550
551	let matches = FindMatchesWithContext(&content, query, case_sensitive, whole_word);
552
553	let relevance = CalculateRelevance(&matches, metadata);
554
555	Ok(SearchResult {
556		path:file_path.to_string_lossy().to_string(),
557		file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
558		matches,
559		relevance,
560		language:metadata.language.clone(),
561	})
562}
563
564/// Find matches in content with surrounding context
565fn FindMatchesWithContext(content:&str, query:&str, case_sensitive:bool, whole_word:bool) -> Vec<SearchMatch> {
566	let mut matches = Vec::new();
567
568	let lines:Vec<&str> = content.lines().collect();
569
570	let search_in = |line:&str| -> Option<(usize, usize)> {
571		let line_to_search = if case_sensitive { line.to_string() } else { line.to_lowercase() };
572
573		let query_to_find = if case_sensitive { query.to_string() } else { query.to_lowercase() };
574
575		let start = if whole_word {
576			FindWholeWordMatch(&line_to_search, &query_to_find)
577		} else {
578			line_to_search.find(&query_to_find)
579		};
580
581		start.map(|s| (s, s + query.len()))
582	};
583
584	for (line_idx, line) in lines.iter().enumerate() {
585		let line_number = line_idx as u32 + 1;
586
587		if let Some((match_start, match_end)) = search_in(line) {
588			// Get context lines (2 before, 2 after)
589			let context_start = line_idx.saturating_sub(2);
590
591			let context_end = (line_idx + 3).min(lines.len());
592
593			let context_before = lines[context_start..line_idx].iter().map(|s| s.to_string()).collect();
594
595			let context_after = lines[line_idx + 1..context_end].iter().map(|s| s.to_string()).collect();
596
597			matches.push(SearchMatch {
598				line_number,
599				line_content:line.to_string(),
600				match_start,
601				match_end,
602				context_before,
603				context_after,
604			});
605		}
606	}
607
608	matches
609}
610
611/// Find whole word match with word boundary detection
612fn FindWholeWordMatch(line:&str, word:&str) -> Option<usize> {
613	let mut start = 0;
614
615	while let Some(pos) = line[start..].find(word) {
616		let actual_pos = start + pos;
617
618		// Check word boundary before
619		let valid_before = actual_pos == 0
620			|| line
621				.chars()
622				.nth(actual_pos - 1)
623				.map_or(true, |c| !c.is_alphanumeric() && c != '_');
624
625		// Check word boundary after
626		let match_end = actual_pos + word.len();
627
628		let valid_after =
629			match_end == line.len() || line.chars().nth(match_end).map_or(true, |c| !c.is_alphanumeric() && c != '_');
630
631		if valid_before && valid_after {
632			return Some(actual_pos);
633		}
634
635		start = actual_pos + 1;
636	}
637
638	None
639}
640
641/// Find regex matches in content
642fn FindRegexMatches(content:&str, regex:&Regex) -> Vec<SearchMatch> {
643	let mut matches = Vec::new();
644
645	let lines:Vec<&str> = content.lines().collect();
646
647	for (line_idx, line) in lines.iter().enumerate() {
648		let line_number = line_idx as u32 + 1;
649
650		for mat in regex.find_iter(line) {
651			matches.push(SearchMatch {
652				line_number,
653				line_content:line.to_string(),
654				match_start:mat.start(),
655				match_end:mat.end(),
656				context_before:Vec::new(),
657				context_after:Vec::new(),
658			});
659		}
660	}
661
662	matches
663}
664
665/// Find fuzzy matches using Levenshtein distance algorithm
666fn FindFuzzyMatches(content:&str, query:&str, case_sensitive:bool, max_distance:usize) -> Vec<SearchMatch> {
667	let mut matches = Vec::new();
668
669	let lines:Vec<&str> = content.lines().collect();
670
671	for (line_idx, line) in lines.iter().enumerate() {
672		let line_number = line_idx as u32 + 1;
673
674		let line_to_search = if case_sensitive { line.to_string() } else { line.to_lowercase() };
675
676		// Calculate Levenshtein distance for fuzzy matching
677		if let Some(pos) = line_to_search.find(query) {
678			// Check if the match is within the MaxDistance threshold
679			let distance = CalculateLevenshteinDistance(&line_to_search[pos..pos.saturating_add(query.len())], query);
680
681			if distance <= max_distance {
682				matches.push(SearchMatch {
683					line_number,
684					line_content:line.to_string(),
685					match_start:pos,
686					match_end:pos + query.len(),
687					context_before:Vec::new(),
688					context_after:Vec::new(),
689				});
690			}
691		}
692	}
693
694	matches
695}
696
697/// Find exact matches (word boundary and case-sensitive)
698fn FindExactMatches(content:&str, query:&str) -> Vec<SearchMatch> { FindMatchesWithContext(content, query, true, true) }
699
700/// Calculate Levenshtein distance between two strings
701fn CalculateLevenshteinDistance(s1:&str, s2:&str) -> usize {
702	let s1_chars:Vec<char> = s1.chars().collect();
703
704	let s2_chars:Vec<char> = s2.chars().collect();
705
706	let len1 = s1_chars.len();
707
708	let len2 = s2_chars.len();
709
710	// Create a 2D matrix to store distances
711	let mut dp = vec![vec![0usize; len2 + 1]; len1 + 1];
712
713	// Initialize the matrix
714	for i in 0..=len1 {
715		dp[i][0] = i;
716	}
717
718	for j in 0..=len2 {
719		dp[0][j] = j;
720	}
721
722	// Calculate distances
723	for i in 1..=len1 {
724		for j in 1..=len2 {
725			if s1_chars[i - 1] == s2_chars[j - 1] {
726				dp[i][j] = dp[i - 1][j - 1];
727			} else {
728				dp[i][j] = 1 + [
729					dp[i - 1][j],     // deletion
730					dp[i][j - 1],     // insertion
731					dp[i - 1][j - 1], // substitution
732				]
733				.into_iter()
734				.min()
735				.unwrap();
736			}
737		}
738	}
739
740	dp[len1][len2]
741}
742
743/// Calculate relevance score for search results
744fn CalculateRelevance(matches:&[SearchMatch], metadata:&FileMetadata) -> f64 {
745	let match_count = matches.len();
746
747	let line_count = metadata.line_count.unwrap_or(1) as f64;
748
749	// Base relevance: ratio of matching lines to total lines
750	let mut relevance = (match_count as f64 / line_count) * 10.0;
751
752	// Bonus for more matches
753	if match_count > 0 {
754		relevance += (match_count as f64).log10() * 0.5;
755	}
756
757	// Bonus for recently modified files
758	let days_old = (chrono::Utc::now() - metadata.modified).num_days() as f64;
759
760	relevance += 1.0 / (days_old + 1.0).max(1.0);
761
762	relevance.min(10.0).max(0.0)
763}
764
765/// Check if file matches filters
766pub fn MatchesFilters(
767	file_path:&PathBuf,
768
769	metadata:&FileMetadata,
770
771	path_filter:Option<&str>,
772
773	language_filter:Option<&str>,
774) -> bool {
775	// Check path filter
776	if let Some(search_path) = path_filter {
777		if !file_path.to_string_lossy().contains(search_path) {
778			return false;
779		}
780	}
781
782	// Check language filter
783	if let Some(lang) = language_filter {
784		if metadata.language.as_deref() != Some(lang) {
785			return false;
786		}
787	}
788
789	true
790}