Skip to content

Commit

Permalink
Merge pull request #5 from pisa-engine/python_indexer
Browse files Browse the repository at this point in the history
python indexer
  • Loading branch information
seanmacavaney authored Sep 17, 2024
2 parents 3d697bf + 94c35d4 commit f11f64d
Show file tree
Hide file tree
Showing 6 changed files with 177 additions and 7 deletions.
1 change: 1 addition & 0 deletions python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ crate-type = ["cdylib"]
[dependencies]
pyo3 = { version = "0.21.2", features = ["extension-module"] }
bmp = { path = "../" }
bincode = "1.3.3"
30 changes: 29 additions & 1 deletion python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,36 @@ pip install target/wheels/*.whl
```

## Usage
### Index

### Index from CIFF

```python
from bmp import ciff2bmp
ciff2bmp(ciff_file="/path/to/ciff", output="/path/to/index", bsize=32, compress_range=False)
```

### Index with Python

```python
from bmp import Indexer
indexer = Indexer('/path/to/index', bsize=32, compress_range=False)
indexer.add_document('doc1', {'a': 1, 'b': 5, 'c': 8}) # docid, vector
indexer.add_document('doc2', {'a': 2, 'c': 1, 'd': 8, 'f': 2})
...
indexer.finish()
```

### Search

```python
from bmp import search, Searcher

# batch operation
results = search(index="/path/to/index", queries="/path/to/queries", k=10, alpha=1.0, beta=1.0)
# -> str (TREC run file)

# query-by-query operation
searcher = Searcher("/path/to/index") # loads index into memory once
searcher.search({'tok1': 5.3, 'tok2': 1.1}, k=10, alpha=1.0, beta=1.0)
# -> Tuple[List[str], List[float]] (doc IDs, scores) for this query
```
2 changes: 1 addition & 1 deletion python/python/bmp/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from bmp._bmp import ciff2bmp, search, Searcher
from bmp._bmp import ciff2bmp, search, Searcher, InvertedIndexer, Indexer
121 changes: 121 additions & 0 deletions python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,125 @@ fn search(
Ok(to_trec(&q_ids, results, index.documents()))
}

/// Python-visible index builder for callers that already work with integer ids.
///
/// Documents are supplied as `(u32, u32)` pair vectors and terms as ready-made
/// posting lists; `finish` builds both indexes and writes them to `path`.
#[pyclass]
struct InvertedIndexer {
// Destination file the serialized index is written to by `finish`.
path: PathBuf,
// Block size handed to the inverted-index builder and to `fwd2bfwd`.
bsize: usize,
// Flag forwarded unchanged to `IndexBuilder::build`.
compress_range: bool,
// Accumulates document names, terms and posting lists.
inv_builder: bmp::index::inverted_index::IndexBuilder,
// Accumulates one `(u32, u32)` vector per document.
fwd_builder: bmp::index::forward_index::ForwardIndexBuilder,
}

#[pymethods]
impl InvertedIndexer {

#[new]
fn py_new(path: PathBuf, bsize: usize, compress_range: bool) -> PyResult<Self> {
Ok(InvertedIndexer {
path: path,
bsize: bsize,
compress_range: compress_range,
inv_builder: bmp::index::inverted_index::IndexBuilder::new(0, bsize),
fwd_builder: bmp::index::forward_index::ForwardIndexBuilder::new(0),
})
}

fn add_document(
&mut self,
doc_id: String,
vector: Vec<(u32, u32)>,
) -> PyResult<()> {
self.inv_builder.insert_document(&doc_id);
self.fwd_builder.insert_document(vector);
Ok(())
}

fn add_term(
&mut self,
term: String,
postings: Vec<(u32, u32)>,
) -> PyResult<()> {
self.inv_builder.insert_term(&term, postings);
Ok(())
}

fn finish(
&mut self,
) -> PyResult<()> {
let builder = std::mem::replace(&mut self.inv_builder, bmp::index::inverted_index::IndexBuilder::new(0, 0));
let inverted_index = builder.build(self.compress_range);
let forward_index = self.fwd_builder.build();
let b_forward_index = bmp::index::forward_index::fwd2bfwd(&forward_index, self.bsize);
let file = std::fs::File::create(self.path.clone()).expect("Failed to create file");
let writer = std::io::BufWriter::new(file);
// Serialize the index directly into a file using bincode
bincode::serialize_into(writer, &(&inverted_index, &b_forward_index))
.expect("Failed to serialize");
Ok(())
}
}

/// Python-visible index builder that accepts documents as `term -> weight`
/// maps and assigns integer term ids itself.
///
/// `finish` builds both indexes and writes them to `path`.
#[pyclass]
struct Indexer {
// Destination file the serialized index is written to by `finish`.
path: PathBuf,
// Block size handed to the inverted-index builder and to `fwd2bfwd`.
bsize: usize,
// Flag forwarded unchanged to `IndexBuilder::build`.
compress_range: bool,
// Accumulates document names, terms and posting lists.
inv_builder: bmp::index::inverted_index::IndexBuilder,
// Accumulates one `(term id, weight)` vector per document.
fwd_builder: bmp::index::forward_index::ForwardIndexBuilder,
// Maps each term string seen so far to the integer id assigned to it.
term_map: HashMap<String, u32>,
}

#[pymethods]
impl Indexer {

#[new]
fn py_new(path: PathBuf, bsize: usize, compress_range: bool) -> PyResult<Self> {
Ok(Indexer {
path: path,
bsize: bsize,
compress_range: compress_range,
inv_builder: bmp::index::inverted_index::IndexBuilder::new(0, bsize),
fwd_builder: bmp::index::forward_index::ForwardIndexBuilder::new(0),
term_map: HashMap::new(),
})
}

fn add_document(
&mut self,
doc_id: String,
vector: HashMap<String, u32>,
) -> PyResult<()> {
let doc_idx = self.inv_builder.insert_document(&doc_id);
let mut int_vector: Vec<(u32, u32)> = Vec::new();
for (term, weight) in &vector {
if !self.term_map.contains_key(term) {
self.term_map.insert(term.clone(), self.term_map.len() as u32);
self.inv_builder.insert_term(term, Vec::new());
}
let term_idx = self.term_map[term];
self.inv_builder.push_posting(term_idx, doc_idx, *weight);
int_vector.push((term_idx, *weight))
}
self.fwd_builder.insert_document(int_vector);
Ok(())
}

fn finish(
&mut self,
) -> PyResult<()> {
let builder = std::mem::replace(&mut self.inv_builder, bmp::index::inverted_index::IndexBuilder::new(0, 0));
let inverted_index = builder.build(self.compress_range);
let forward_index = self.fwd_builder.build();
let b_forward_index = bmp::index::forward_index::fwd2bfwd(&forward_index, self.bsize);
let file = std::fs::File::create(self.path.clone()).expect("Failed to create file");
let writer = std::io::BufWriter::new(file);
// Serialize the index directly into a file using bincode
bincode::serialize_into(writer, &(&inverted_index, &b_forward_index))
.expect("Failed to serialize");
Ok(())
}
}

/// A Python module implemented in Rust. The name of this function must match
/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to
/// import the module.
Expand All @@ -99,5 +218,7 @@ fn _bmp(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(ciff2bmp, m)?)?;
m.add_function(wrap_pyfunction!(search, m)?)?;
m.add_class::<Searcher>()?;
m.add_class::<InvertedIndexer>()?;
m.add_class::<Indexer>()?;
Ok(())
}
3 changes: 3 additions & 0 deletions src/index/forward_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ impl ForwardIndexBuilder {
self.forward_index.data[*doc_id as usize].push((term_id as u32, *score));
}
}
/// Appends `vector` as the entry for the next document in the forward index.
///
/// The vector is stored as given; no sorting or validation happens here.
pub fn insert_document(&mut self, vector: Vec<(u32, u32)>) {
    let documents = &mut self.forward_index.data;
    documents.push(vector);
}
pub fn build(&mut self) -> ForwardIndex {
for doc in &mut self.forward_index.data {
doc.sort_by_key(|d| d.0);
Expand Down
27 changes: 22 additions & 5 deletions src/index/inverted_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,15 @@ impl IndexBuilder {
self.terms.push(term.to_string());
}

pub fn insert_document(&mut self, name: &str) {
/// Pushes `(doc_id, tf)` onto the posting list associated with `term_id`.
///
/// This function assumes doc ids are added in increasing order; the order is
/// not checked here. `term_id` is used as a direct index into the posting
/// lists, so it must refer to a term that has already been inserted.
pub fn push_posting(&mut self, term_id: u32, doc_id: u32, tf: u32) {
    let postings = &mut self.posting_lists[term_id as usize];
    postings.push((doc_id, tf));
}

/// Registers a document name and returns the id assigned to it.
///
/// Ids are handed out sequentially: the returned id equals the number of
/// documents registered before this call.
///
/// # Panics
///
/// Panics if the new id would not fit in a `u32`, rather than silently
/// wrapping around and aliasing an existing document.
pub fn insert_document(&mut self, name: &str) -> u32 {
    let next = self.documents.len();
    assert!(
        next <= u32::MAX as usize,
        "too many documents: document ids must fit in 32 bits"
    );
    self.documents.push(name.to_string());
    // Checked above, so this cast cannot truncate.
    next as u32
}

fn compress(data: &[u8]) -> Vec<crate::index::posting_list::CompressedBlock> {
Expand All @@ -116,12 +123,16 @@ impl IndexBuilder {
}

pub fn build(self, compress_range: bool) -> Index {
let mut num_docs = self.num_documents;
if num_docs == 0 {
num_docs = self.documents.len();
}
let posting_lists: Vec<PostingList> = self
.posting_lists
.into_par_iter()
.map(|p_list| {
let range_size = self.bsize;
let blocks_num = div_ceil(self.num_documents, range_size);
let blocks_num = div_ceil(num_docs, range_size);
let mut range_maxes: Vec<u8> = vec![0; blocks_num];
p_list.iter().for_each(|&(docid, score)| {
let current_max = &mut range_maxes[docid as usize / range_size];
Expand All @@ -148,12 +159,18 @@ impl IndexBuilder {
.collect();

let mut build = MapBuilder::memory();
self.terms.iter().enumerate().for_each(|(index, term)| {
let _ = build.insert(term, index as u64);

let mut indexed_terms: Vec<(usize, &String)> = self.terms.iter().enumerate().collect();

// Sort the terms lexicographically while keeping the original indices
indexed_terms.sort_by(|a, b| a.1.cmp(b.1));

indexed_terms.iter().for_each(|(index, term)| {
let _ = build.insert(term, *index as u64);
});

Index {
num_documents: self.num_documents,
num_documents: num_docs,
posting_lists,
termmap: Map::new(build.into_inner().unwrap()).unwrap(),
documents: self.documents,
Expand Down

0 comments on commit f11f64d

Please sign in to comment.