Skip to main content

AirLibrary/Indexing/Scan/
ScanDirectory.rs

1//! # ScanDirectory
2//!
3//! ## File: Indexing/Scan/ScanDirectory.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides directory scanning functionality for the File Indexer service,
8//! handling recursive traversal of directories to discover files for indexing.
9//!
10//! ## Primary Responsibility
11//!
12//! Scan directories recursively to discover files matching include patterns
13//! while respecting exclude patterns and filesystem limits.
14//!
15//! ## Secondary Responsibilities
16//!
17//! - Validate directory permissions before scanning
18//! - Parallel file enumeration for performance
19//! - Skip directories like node_modules, target, .git
20//! - Collect files with metadata for batch processing
21//!
22//! ## Dependencies
23//!
24//! **External Crates:**
25//! - `ignore` - .gitignore-aware directory walking
26//! - `tokio` - Async runtime for I/O operations
27//!
28//! **Internal Modules:**
29//! - `crate::Result` - Error handling type
30//! - `crate::AirError` - Error types
31//! - `crate::Configuration::IndexingConfig` - Indexing configuration
32//!
33//! ## Dependents
34//!
35//! - `Indexing::mod::FileIndexer` - Main file indexer implementation
36//! - `Indexing::Background::StartWatcher` - Background task scanning
37//!
38//! ## VSCode Pattern Reference
39//!
40//! Inspired by VSCode's file system scanning in
41//! `src/vs/base/common/files/`
42//!
43//! ## Security Considerations
44//!
45//! - Path traversal protection through canonicalization
46//! - Symbolic link following disabled by default
47//! - Depth limits prevent infinite recursion
48//! - Permission checking before access
49//!
50//! ## Performance Considerations
51//!
52//! - Parallel directory scanning with limited concurrency
53//! - Batch collection of files for processing
54//! - Lazy evaluation with ignore crate
55//! - Early filtering by file patterns
56//!
57//! ## Error Handling Strategy
58//!
59//! Scan operations log warnings for individual errors and continue,
60//! returning a result only if the top-level operation fails.
61//!
62//! ## Thread Safety
63//!
64//! Scan operations are designed to be called from async tasks and
65//! return collectable results for parallel processing.
66
67use std::{path::Path, sync::Arc};
68
69use tokio::sync::Semaphore;
70
71use crate::{
72	AirError,
73	Configuration::IndexingConfig,
74	Indexing::{Scan::ScanFile::ValidateFileAccess, State::CreateState::FileIndex},
75	Result,
76	dev_log,
77};
78
/// Aggregate counters describing the outcome of a directory scan.
///
/// Returned alongside the list of discovered paths so callers can report
/// progress without re-walking the tree.
#[derive(Debug, Clone)]
pub struct ScanDirectoryResult {
	/// How many files were accepted and added to the scan list.
	pub files_found:u32,

	/// How many entries were passed over (for example oversized files or
	/// files that matched none of the include patterns).
	pub files_skipped:u32,

	/// How many problems were hit while walking or probing entries.
	pub errors:u32,

	/// Combined size, in bytes, of the accepted files.
	pub total_size:u64,
}
94
95/// Scan a directory recursively and collect matching files
96///
97/// Features:
98/// - Path traversal protection
99/// - Symbolic link handling (disabled by default)
100/// - File size validation
101/// - Permission error handling
102/// - Include/exclude pattern support
103/// - Parallel scanning with semaphore limits
104pub async fn ScanDirectory(
105	path:&str,
106
107	patterns:Vec<String>,
108
109	config:&IndexingConfig,
110
111	_max_parallel:usize,
112) -> Result<(Vec<std::path::PathBuf>, ScanDirectoryResult)> {
113	let directory_path = crate::Configuration::ConfigurationManager::ExpandPath(path)?;
114
115	// Validate directory exists and is accessible
116	if !directory_path.exists() {
117		return Err(AirError::FileSystem(format!("Directory does not exist: {}", path)));
118	}
119
120	if !directory_path.is_dir() {
121		return Err(AirError::FileSystem(format!("Path is not a directory: {}", path)));
122	}
123
124	// Check directory permissions
125	CheckDirectoryPermissions(&directory_path).await?;
126
127	// Build file patterns
128	let include_patterns = if patterns.is_empty() { config.FileTypes.clone() } else { patterns };
129
130	// Walk directory with .gitignore support
131	let walker = ignore::WalkBuilder::new(&directory_path)
132		.max_depth(Some(10)) // Prevent infinite recursion
133		.hidden(false)
134		.follow_links(false) // Don't follow symlinks by default
135		.build();
136
137	let mut files_to_scan:Vec<std::path::PathBuf> = Vec::new();
138
139	let mut files_found = 0u32;
140
141	let mut files_skipped = 0u32;
142
143	let mut errors = 0u32;
144
145	let mut total_size = 0u64;
146
147	// Collect all files first
148	for result in walker {
149		match result {
150			Ok(entry) => {
151				// Only index regular files (not directories or symlinks)
152				if entry.file_type().map(|ft| ft.is_file()).unwrap_or(false) {
153					let file_path = entry.path().to_path_buf();
154
155					// Check if file is a symbolic link
156					if entry.path_is_symlink() {
157						dev_log!("indexing", "[ScanDirectory] Skipping symlink: {}", file_path.display());
158
159						files_skipped += 1;
160
161						continue;
162					}
163
164					// Check file size limit
165					if let Ok(metadata) = entry.metadata() {
166						let file_size = metadata.len();
167
168						if file_size > config.MaxFileSizeMb as u64 * 1024 * 1024 {
169							dev_log!(
170								"indexing",
171								"warn: [ScanDirectory] Skipping oversized file: {} ({} bytes)",
172								file_path.display(),
173								file_size
174							);
175
176							files_skipped += 1;
177
178							continue;
179						}
180
181						// Check file pattern
182						if MatchesPatterns(&file_path, &include_patterns) {
183							// Try to get file access to validate permissions
184							if ValidateFileAccess(&file_path).await {
185								files_to_scan.push(file_path);
186
187								files_found += 1;
188
189								total_size += file_size;
190							} else {
191								dev_log!(
192									"indexing",
193									"warn: [ScanDirectory] Cannot access file (permission denied): {}",
194									file_path.display()
195								);
196
197								errors += 1;
198							}
199						} else {
200							files_skipped += 1;
201						}
202					} else {
203						errors += 1;
204					}
205				}
206			},
207
208			Err(e) => {
209				dev_log!("indexing", "warn: [ScanDirectory] Error walking directory: {}", e);
210
211				errors += 1;
212			},
213		}
214	}
215
216	dev_log!(
217		"indexing",
218		"[ScanDirectory] Directory scan completed: {} files, {} skipped, {} errors, {} bytes",
219		files_found,
220		files_skipped,
221		errors,
222		total_size
223	);
224
225	Ok((
226		files_to_scan,
227		ScanDirectoryResult { files_found, files_skipped, errors, total_size },
228	))
229}
230
231/// Scan a directory and remove deleted files from index
232pub async fn ScanAndRemoveDeleted(index:&mut FileIndex, directory_path:&Path) -> Result<u32> {
233	let mut paths_to_remove = Vec::new();
234
235	let all_paths:Vec<_> = index.files.keys().cloned().collect();
236
237	for path in all_paths {
238		if !path.exists() && path.starts_with(directory_path) {
239			paths_to_remove.push(path.clone());
240		}
241	}
242
243	let removed_count = paths_to_remove.len();
244
245	for path in paths_to_remove {
246		index.files.remove(&path);
247
248		index.file_symbols.remove(&path);
249
250		// Remove from symbol index
251		for (_, locations) in index.symbol_index.iter_mut() {
252			locations.retain(|loc| loc.file_path != path);
253		}
254
255		// Remove from content index
256		for (_, files) in index.content_index.iter_mut() {
257			files.retain(|p| p != &path);
258		}
259	}
260
261	Ok(removed_count as u32)
262}
263
264/// Check directory read permissions
265async fn CheckDirectoryPermissions(path:&Path) -> Result<()> {
266	tokio::task::spawn_blocking({
267		let path = path.to_path_buf();
268		move || {
269			std::fs::read_dir(&path)
270				.map_err(|e| AirError::FileSystem(format!("Cannot read directory {}: {}", path.display(), e)))?;
271			Ok(())
272		}
273	})
274	.await?
275}
276
277/// Check if file path matches any of the provided patterns
278pub fn MatchesPatterns(file_path:&std::path::Path, patterns:&[String]) -> bool {
279	if patterns.is_empty() {
280		return true;
281	}
282
283	let file_name = file_path.file_name().unwrap_or_default().to_string_lossy().to_string();
284
285	for pattern in patterns {
286		if MatchesPattern(&file_name, pattern) {
287			return true;
288		}
289	}
290
291	false
292}
293
/// Check whether `filename` matches a single pattern.
///
/// Two pattern forms are understood:
///
/// - `*.ext` matches any name that ends with `.ext` (the dot is part of the
///   suffix, so `*.rs` matches `main.rs` but not `cars`).
/// - Anything else must equal the file name exactly.
pub fn MatchesPattern(filename:&str, pattern:&str) -> bool {
	match pattern.strip_prefix('*') {
		// Keep the leading dot in the suffix so that only a real extension
		// boundary matches.
		Some(suffix) if suffix.starts_with('.') => filename.ends_with(suffix),

		_ => filename == pattern,
	}
}
304
/// Build the stock list of names and patterns that directory scans commonly
/// exclude: dependency and build output folders, version-control metadata,
/// Python caches and virtual environments, editor folders, and OS clutter.
pub fn GetDefaultExcludePatterns() -> Vec<String> {
	const DEFAULT_EXCLUDES:&[&str] = &[
		"node_modules",
		"target",
		".git",
		".svn",
		".hg",
		".bzr",
		"dist",
		"build",
		".next",
		".nuxt",
		"__pycache__",
		"*.pyc",
		".venv",
		"venv",
		"env",
		".env",
		".idea",
		".vscode",
		".DS_Store",
		"Thumbs.db",
	];

	DEFAULT_EXCLUDES.iter().map(|entry| (*entry).to_owned()).collect()
}
330
331/// Parallel scan of multiple directories
332pub async fn ScanDirectoriesParallel(
333	directories:Vec<String>,
334
335	patterns:Vec<String>,
336
337	config:&IndexingConfig,
338
339	max_parallel:usize,
340) -> Result<(Vec<std::path::PathBuf>, ScanDirectoryResult)> {
341	let semaphore = Arc::new(Semaphore::new(max_parallel));
342
343	let mut all_files = Vec::new();
344
345	let mut total_result = ScanDirectoryResult { files_found:0, files_skipped:0, errors:0, total_size:0 };
346
347	let mut scan_tasks = Vec::new();
348
349	for directory in directories {
350		let permit = semaphore.clone().acquire_owned().await.unwrap();
351
352		let config_clone = config.clone();
353
354		let patterns_clone = patterns.clone();
355
356		let task = tokio::spawn(async move {
357			let _permit = permit;
358			ScanDirectory(&directory, patterns_clone, &config_clone, max_parallel).await
359		});
360
361		scan_tasks.push(task);
362	}
363
364	// Collect results
365	for task in scan_tasks {
366		match task.await {
367			Ok(Ok((files, result))) => {
368				all_files.extend(files);
369
370				total_result.files_found += result.files_found;
371
372				total_result.files_skipped += result.files_skipped;
373
374				total_result.errors += result.errors;
375
376				total_result.total_size += result.total_size;
377			},
378
379			Ok(Err(e)) => {
380				dev_log!("indexing", "error: [ScanDirectory] Parallel scan failed: {}", e);
381
382				total_result.errors += 1;
383			},
384
385			Err(e) => {
386				dev_log!("indexing", "error: [ScanDirectory] Parallel task panicked: {}", e);
387
388				total_result.errors += 1;
389			},
390		}
391	}
392
393	Ok((all_files, total_result))
394}
395
396/// Get file count statistics for a directory without full scan
397pub async fn GetDirectoryStatistics(path:&str, max_depth:Option<usize>) -> Result<DirectoryStatistics> {
398	let directory_path = crate::Configuration::ConfigurationManager::ExpandPath(path)?;
399
400	if !directory_path.exists() || !directory_path.is_dir() {
401		return Err(AirError::FileSystem(format!("Invalid directory: {}", path)));
402	}
403
404	let mut file_count = 0u64;
405
406	let mut total_size = 0u64;
407
408	let mut directory_count = 0u64;
409
410	let mut hidden_count = 0u64;
411
412	let walker = ignore::WalkBuilder::new(&directory_path)
413		.max_depth(max_depth)
414		.hidden(true)
415		.follow_links(false)
416		.build();
417
418	for entry in walker.flatten() {
419		let file_type = entry.file_type().expect("Failed to get file type");
420
421		if file_type.is_file() {
422			file_count += 1;
423
424			if let Ok(metadata) = entry.metadata() {
425				total_size += metadata.len();
426			}
427		} else if file_type.is_dir() {
428			directory_count += 1;
429		}
430
431		if entry.depth() > 0
432			&& entry
433				.path()
434				.components()
435				.any(|c| c.as_os_str().to_string_lossy().starts_with('.'))
436		{
437			hidden_count += 1;
438		}
439	}
440
441	Ok(DirectoryStatistics { file_count, directory_count, hidden_count, total_size })
442}
443
/// Summary figures produced by a directory statistics walk.
#[derive(Debug, Clone)]
pub struct DirectoryStatistics {
	/// Number of regular files counted.
	pub file_count:u64,

	/// Number of directories counted.
	pub directory_count:u64,

	/// Number of entries classified as hidden (a dot-prefixed path component).
	pub hidden_count:u64,

	/// Combined size, in bytes, of the counted files.
	pub total_size:u64,
}