Skip to content

Commit e25192b

Browse files
committed
added better gitignore parsing
1 parent cf0968a commit e25192b

File tree

3 files changed

+77
-49
lines changed

3 files changed

+77
-49
lines changed

.github/workflows/CI.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,4 +138,5 @@ jobs:
138138
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
139139
with:
140140
command: upload
141-
args: --non-interactive --skip-existing wheels-linux-x86_64/* wheels-linux-x86/* wheels-linux-aarch64/* wheels-linux-armv7/* wheels-linux-ppc64le/* wheels-macos-x86_64/* wheels-macos-aarch64/* wheels-sdist/*
141+
args: --non-interactive --skip-existing wheels-linux-x86_64/* wheels-linux-x86/* wheels-linux-aarch64/* wheels-linux-armv7/* wheels-linux-ppc64le/* wheels-macos-x86_64/* wheels-macos-aarch64/* wheels-sdist/*
142+

src/file.rs

Lines changed: 74 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
use std::path::Path;
1+
use std::path::{Path, PathBuf};
2+
use ignore::gitignore::{Gitignore, GitignoreBuilder};
3+
use ignore::WalkBuilder;
24
use tantivy::{schema::Schema, IndexWriter, doc, Term};
35
use anyhow::Result;
46
use async_trait::async_trait;
@@ -11,7 +13,6 @@ use crate::intelligence::{TreeSitterFile, TSLanguage};
1113
use crate::symbol::SymbolLocations;
1214
use crate::schema::build_schema;
1315
use sha2::{Sha256, Digest};
14-
use globset::{Glob, GlobSet, GlobSetBuilder};
1516

1617
pub struct File {
1718
pub schema: Schema,
@@ -60,13 +61,13 @@ impl File {
6061
impl Indexable for File {
6162
async fn index_repository(&self, root_path: &Path, writer: &IndexWriter) -> Result<()> {
6263
let existing_docs = load_existing_docs(writer, &self.hash_field, &self.path_field)?;
63-
let mut gitignore = GlobSetBuilder::new();
64+
let gitignore_manager = GitignoreManager::new(root_path.to_path_buf()).await?;
6465

6566
traverse_and_index_files(
66-
root_path, writer, &self.schema, self.path_field, self.content_field,
67+
root_path, writer, self.path_field, self.content_field,
6768
self.symbol_locations_field, self.symbols_field, self.line_end_indices_field,
6869
self.lang_field, self.hash_field, self.content_insensitive_field,
69-
&existing_docs, &mut gitignore, root_path).await
70+
&existing_docs, &gitignore_manager).await
7071
}
7172

7273
fn schema(&self) -> Schema {
@@ -93,73 +94,96 @@ fn load_existing_docs(writer: &IndexWriter, hash_field: &tantivy::schema::Field,
9394
Ok(existing_docs)
9495
}
9596

96-
async fn parse_gitignore(current_path: &Path, builder: &mut GlobSetBuilder) -> Result<()> {
97-
let gitignore_path = current_path.join(".gitignore");
98-
99-
if gitignore_path.exists() {
100-
let contents = tokio::fs::read_to_string(&gitignore_path).await?;
101-
for line in contents.lines() {
102-
let trimmed_line = line.trim();
103-
if !trimmed_line.starts_with('#') && !trimmed_line.is_empty() {
104-
let absolute_pattern = if trimmed_line.starts_with('/') {
105-
// The pattern is already an absolute path, so we just use it as is
106-
current_path.join(trimmed_line.trim_start_matches('/'))
107-
} else {
108-
// The pattern is a relative path, so we join it with the current path
109-
current_path.join(trimmed_line)
110-
};
111-
let pattern = absolute_pattern.to_string_lossy().replace("\\", "/");
112-
// println!("Adding to gitignore: {}", pattern);
113-
builder.add(Glob::new(&pattern)?);
97+
struct GitignoreManager {
98+
root_path: PathBuf,
99+
gitignores: Vec<(PathBuf, Gitignore)>,
100+
}
101+
102+
impl GitignoreManager {
103+
async fn new(root_path: PathBuf) -> Result<Self> {
104+
let mut manager = GitignoreManager {
105+
root_path,
106+
gitignores: Vec::new(),
107+
};
108+
manager.load_gitignores().await?;
109+
Ok(manager)
110+
}
111+
112+
async fn load_gitignores(&mut self) -> Result<()> {
113+
let walk = WalkBuilder::new(&self.root_path)
114+
.hidden(false)
115+
.git_ignore(false)
116+
.build();
117+
118+
for entry in walk {
119+
let entry = entry?;
120+
let path = entry.path();
121+
if path.file_name() == Some(".gitignore".as_ref()) {
122+
let gitignore_dir = path.parent().unwrap().to_path_buf();
123+
let mut builder = GitignoreBuilder::new(&gitignore_dir);
124+
builder.add(path);
125+
match builder.build() {
126+
Ok(gitignore) => {
127+
self.gitignores.push((gitignore_dir, gitignore));
128+
},
129+
Err(err) => {
130+
eprintln!("Error building gitignore for {:?}: {}", path, err);
131+
// Optionally, you can choose to return the error or continue
132+
// return Err(err.into());
133+
}
134+
}
114135
}
115136
}
137+
138+
// Sort gitignores from most specific (deepest) to least specific (root)
139+
self.gitignores.sort_by(|a, b| b.0.components().count().cmp(&a.0.components().count()));
140+
141+
Ok(())
116142
}
117143

118-
Ok(())
144+
fn is_ignored(&self, path: &Path) -> bool {
145+
for (dir, gitignore) in &self.gitignores {
146+
if path.starts_with(dir) {
147+
let relative_path = path.strip_prefix(dir).unwrap();
148+
match gitignore.matched(relative_path, false) {
149+
ignore::Match::Ignore(_) => return true,
150+
ignore::Match::Whitelist(_) => return false,
151+
ignore::Match::None => continue,
152+
}
153+
}
154+
}
155+
false
156+
}
119157
}
120158

121-
122159
fn traverse_and_index_files<'a>(
123160
path: &'a Path,
124161
writer: &'a IndexWriter,
125-
schema: &'a Schema,
126162
path_field: tantivy::schema::Field,
127163
content_field: tantivy::schema::Field,
128164
symbol_locations_field: tantivy::schema::Field,
129165
symbols_field: tantivy::schema::Field,
130166
line_end_indices_field: tantivy::schema::Field,
131167
lang_field: tantivy::schema::Field,
132168
hash_field: tantivy::schema::Field,
133-
content_insensitive_field: tantivy::schema::Field, // New field
169+
content_insensitive_field: tantivy::schema::Field,
134170
existing_docs: &'a HashMap<String, String>,
135-
gitignore: &'a mut GlobSetBuilder,
136-
root_path: &'a Path,
171+
gitignore_manager: &'a GitignoreManager,
137172
) -> BoxFuture<'a, Result<()>> {
138173
Box::pin(async move {
139-
// Parse .gitignore in the current directory and update the builder
140-
parse_gitignore(path, gitignore).await?;
141-
142-
// Build the GlobSet from the builder
143-
let globset = gitignore.build()?;
144-
145174
let mut entries = fs::read_dir(path).await?;
146175
while let Some(entry) = entries.next_entry().await? {
147176
let path = entry.path();
148-
149-
// Convert the path to an absolute path
150-
let absolute_path = path.canonicalize()?;
151-
let absolute_path_str = absolute_path.to_string_lossy().replace("\\", "/");
152-
153-
// Skip paths that match .gitignore patterns
154-
if globset.is_match(&absolute_path_str) {
177+
178+
if gitignore_manager.is_ignored(&path) {
155179
continue;
156180
}
157-
158-
if path.is_dir() {
181+
182+
if path.is_dir() {
159183
traverse_and_index_files(
160-
&path, writer, schema, path_field, content_field, symbol_locations_field,
184+
&path, writer, path_field, content_field, symbol_locations_field,
161185
symbols_field, line_end_indices_field, lang_field, hash_field, content_insensitive_field,
162-
existing_docs, gitignore, root_path).await?;
186+
existing_docs, gitignore_manager).await?;
163187
} else if path.is_file() {
164188
let path_clone = path.clone();
165189
let content = spawn_blocking(move || std::fs::read(&path_clone)).await??;
@@ -173,6 +197,9 @@ fn traverse_and_index_files<'a>(
173197
let mut hasher = Sha256::new();
174198
hasher.update(&content_str);
175199
let hash = format!("{:x}", hasher.finalize());
200+
201+
let absolute_path = path.canonicalize()?;
202+
let absolute_path_str = absolute_path.to_string_lossy().replace("\\", "/");
176203

177204
let path_str = absolute_path_str.clone();
178205
if let Some(existing_hash) = existing_docs.get(&path_str) {
@@ -224,7 +251,7 @@ fn traverse_and_index_files<'a>(
224251
// Convert content to lower case for case-insensitive search
225252
let content_insensitive = content_str.to_lowercase();
226253

227-
// println!("{}", absolute_path_str);
254+
println!("{}", absolute_path_str);
228255

229256
let doc = tantivy::doc!(
230257
path_field => path_str,

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use std::path::Path;
1515
pub use file::File;
1616
pub use indexes::{Indexes, Indexable};
1717
pub use repository::Repository;
18-
use search::Searcher;
18+
pub use search::Searcher;
1919
pub use sync_handle::SyncHandle;
2020

2121
use pyo3::prelude::*;

0 commit comments

Comments
 (0)