From f4245b702549dbd38fd21d616b9853aadfba919f Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 22 Aug 2024 13:29:44 +0100 Subject: [PATCH 1/3] python indexer --- python/Cargo.toml | 1 + python/README.md | 40 +++++++++++++++++++++++- python/python/bmp/__init__.py | 2 +- python/src/lib.rs | 59 +++++++++++++++++++++++++++++++++++ src/index/forward_index.rs | 3 ++ src/index/inverted_index.rs | 8 +++-- 6 files changed, 109 insertions(+), 4 deletions(-) diff --git a/python/Cargo.toml b/python/Cargo.toml index 4fc9940..0e61797 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -17,3 +17,4 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.21.2", features = ["extension-module"] } bmp = { path = "../" } +bincode = "1.3.3" diff --git a/python/README.md b/python/README.md index a44b2fd..2b936ca 100644 --- a/python/README.md +++ b/python/README.md @@ -23,8 +23,46 @@ pip install target/wheels/*.whl ``` ## Usage -### Index + +### Index from CIFF + ```python from bmp import ciff2bmp ciff2bmp(ciff_file="/path/to/ciff", output="/path/to/index", bsize=32, compress_range=False) ``` + +### Index with Python + +```python +from bmp import Indexer +import string +import random +indexer = Indexer('/path/to/index', bsize=32, compress_range=False) +terms = [(c, []) for c in string.ascii_letters] +for doc in range(10_000): + dvec = [] + for idx in range(random.randrange(1, 10)): + tf = random.randrange(1, 1000) + tok = random.randrange(len(terms)) + dvec.append((tok, tf)) + terms[tok][1].append((doc, tf)) + indexer.add_document(f'doc{doc}', dvec) +for term, postings in terms: + indexer.add_term(term, postings) +indexer.finish() +``` + +#### Search + +```python +from bmp import search, Searcher + +# batch operation +results = search(index="/path/to/index", queries="/path/to/queries", k=10, alpha=1.0, beta=1.0) +# -> str (TREC run file) + +# query-by-query operation +searcher = Searcher("/path/to/index") # loads index into memory once +searcher.search({'tok1': 5.3, 
'tok2': 1.1}, k=10, alpha=1.0, beta=1.0) +# -> Tuple[List[str], List[float]] (doc IDs, scores) for this query +``` diff --git a/python/python/bmp/__init__.py b/python/python/bmp/__init__.py index 130a303..2d24761 100644 --- a/python/python/bmp/__init__.py +++ b/python/python/bmp/__init__.py @@ -1 +1 @@ -from bmp._bmp import ciff2bmp, search, Searcher +from bmp._bmp import ciff2bmp, search, Searcher, Indexer diff --git a/python/src/lib.rs b/python/src/lib.rs index e45c3e9..0c5eae4 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -91,6 +91,64 @@ fn search( Ok(to_trec(&q_ids, results, index.documents())) } +#[pyclass] +struct Indexer { + path: PathBuf, + bsize: usize, + compress_range: bool, + inv_builder: bmp::index::inverted_index::IndexBuilder, + fwd_builder: bmp::index::forward_index::ForwardIndexBuilder, +} + +#[pymethods] +impl Indexer { + + #[new] + fn py_new(path: PathBuf, bsize: usize, compress_range: bool) -> PyResult { + Ok(Indexer { + path: path, + bsize: bsize, + compress_range: compress_range, + inv_builder: bmp::index::inverted_index::IndexBuilder::new(0, bsize), + fwd_builder: bmp::index::forward_index::ForwardIndexBuilder::new(0), + }) + } + + fn add_document( + &mut self, + doc_id: String, + vector: Vec<(u32, u32)>, + ) -> PyResult<()> { + self.inv_builder.insert_document(&doc_id); + self.fwd_builder.insert_document(vector); + Ok(()) + } + + fn add_term( + &mut self, + term: String, + postings: Vec<(u32, u32)>, + ) -> PyResult<()> { + self.inv_builder.insert_term(&term, postings); + Ok(()) + } + + fn finish( + &mut self, + ) -> PyResult<()> { + let builder = std::mem::replace(&mut self.inv_builder, bmp::index::inverted_index::IndexBuilder::new(0, 0)); + let inverted_index = builder.build(self.compress_range); + let forward_index = self.fwd_builder.build(); + let b_forward_index = bmp::index::forward_index::fwd2bfwd(&forward_index, self.bsize); + let file = std::fs::File::create(self.path.clone()).expect("Failed to create file"); + let writer 
= std::io::BufWriter::new(file); + // Serialize the index directly into a file using bincode + bincode::serialize_into(writer, &(&inverted_index, &b_forward_index)) + .expect("Failed to serialize"); + Ok(()) + } +} + /// A Python module implemented in Rust. The name of this function must match /// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to /// import the module. @@ -99,5 +157,6 @@ fn _bmp(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(ciff2bmp, m)?)?; m.add_function(wrap_pyfunction!(search, m)?)?; m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/src/index/forward_index.rs b/src/index/forward_index.rs index 56d93e9..29762ba 100644 --- a/src/index/forward_index.rs +++ b/src/index/forward_index.rs @@ -54,6 +54,9 @@ impl ForwardIndexBuilder { self.forward_index.data[*doc_id as usize].push((term_id as u32, *score)); } } + pub fn insert_document(&mut self, vector: Vec<(u32, u32)>) { + self.forward_index.data.push(vector); + } pub fn build(&mut self) -> ForwardIndex { for doc in &mut self.forward_index.data { doc.sort_by_key(|d| d.0); diff --git a/src/index/inverted_index.rs b/src/index/inverted_index.rs index 02e8d57..f79e1a7 100644 --- a/src/index/inverted_index.rs +++ b/src/index/inverted_index.rs @@ -116,12 +116,16 @@ impl IndexBuilder { } pub fn build(self, compress_range: bool) -> Index { + let mut num_docs = self.num_documents; + if num_docs == 0 { + num_docs = self.documents.len(); + } let posting_lists: Vec = self .posting_lists .into_par_iter() .map(|p_list| { let range_size = self.bsize; - let blocks_num = div_ceil(self.num_documents, range_size); + let blocks_num = div_ceil(num_docs, range_size); let mut range_maxes: Vec = vec![0; blocks_num]; p_list.iter().for_each(|&(docid, score)| { let current_max = &mut range_maxes[docid as usize / range_size]; @@ -153,7 +157,7 @@ impl IndexBuilder { }); Index { - num_documents: self.num_documents, + num_documents: num_docs, posting_lists, termmap: 
Map::new(build.into_inner().unwrap()).unwrap(), documents: self.documents, From ceea6f6cdf23e8ce2c9858c17bb578aa01cfcd6e Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Tue, 17 Sep 2024 14:22:38 +0100 Subject: [PATCH 2/3] Indexer vs InvertedIndexer --- python/README.md | 16 ++------- python/python/bmp/__init__.py | 2 +- python/src/lib.rs | 68 +++++++++++++++++++++++++++++++++-- src/index/inverted_index.rs | 18 ++++++++-- 4 files changed, 84 insertions(+), 20 deletions(-) diff --git a/python/README.md b/python/README.md index 2b936ca..6318641 100644 --- a/python/README.md +++ b/python/README.md @@ -35,20 +35,10 @@ ciff2bmp(ciff_file="/path/to/ciff", output="/path/to/index", bsize=32, compress_ ```python from bmp import Indexer -import string -import random indexer = Indexer('/path/to/index', bsize=32, compress_range=False) -terms = [(c, []) for c in string.ascii_letters] -for doc in range(10_000): - dvec = [] - for idx in range(random.randrange(1, 10)): - tf = random.randrange(1, 1000) - tok = random.randrange(len(terms)) - dvec.append((tok, tf)) - terms[tok][1].append((doc, tf)) - indexer.add_document(f'doc{doc}', dvec) -for term, postings in terms: - indexer.add_term(term, postings) +indexer.add_document('doc1', {'a': 1, 'b': 5, 'c': 8}) # docid, vector +indexer.add_document('doc2', {'a': 2, 'c': 1, 'd': 8, 'f': 2}) +... 
indexer.finish() ``` diff --git a/python/python/bmp/__init__.py b/python/python/bmp/__init__.py index 2d24761..ae1609d 100644 --- a/python/python/bmp/__init__.py +++ b/python/python/bmp/__init__.py @@ -1 +1 @@ -from bmp._bmp import ciff2bmp, search, Searcher, Indexer +from bmp._bmp import ciff2bmp, search, Searcher, InvertedIndexer, Indexer diff --git a/python/src/lib.rs b/python/src/lib.rs index 0c5eae4..6ecbd54 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -92,7 +92,7 @@ fn search( } #[pyclass] -struct Indexer { +struct InvertedIndexer { path: PathBuf, bsize: usize, compress_range: bool, @@ -101,11 +101,11 @@ struct Indexer { } #[pymethods] -impl Indexer { +impl InvertedIndexer { #[new] fn py_new(path: PathBuf, bsize: usize, compress_range: bool) -> PyResult { - Ok(Indexer { + Ok(InvertedIndexer { path: path, bsize: bsize, compress_range: compress_range, @@ -149,6 +149,67 @@ impl Indexer { } } +#[pyclass] +struct Indexer { + path: PathBuf, + bsize: usize, + compress_range: bool, + inv_builder: bmp::index::inverted_index::IndexBuilder, + fwd_builder: bmp::index::forward_index::ForwardIndexBuilder, + term_map: HashMap, +} + +#[pymethods] +impl Indexer { + + #[new] + fn py_new(path: PathBuf, bsize: usize, compress_range: bool) -> PyResult { + Ok(Indexer { + path: path, + bsize: bsize, + compress_range: compress_range, + inv_builder: bmp::index::inverted_index::IndexBuilder::new(0, bsize), + fwd_builder: bmp::index::forward_index::ForwardIndexBuilder::new(0), + term_map: HashMap::new(), + }) + } + + fn add_document( + &mut self, + doc_id: String, + vector: HashMap, + ) -> PyResult<()> { + let doc_idx = self.inv_builder.insert_document(&doc_id); + let mut int_vector: Vec<(u32, u32)> = Vec::new(); + for (term, weight) in &vector { + if !self.term_map.contains_key(term) { + self.term_map.insert(term.clone(), self.term_map.len() as u32); + self.inv_builder.insert_term(term, Vec::new()); + } + let term_idx = self.term_map[term]; + 
self.inv_builder.push_posting(term_idx, doc_idx, *weight); + int_vector.push((term_idx, *weight)) + } + self.fwd_builder.insert_document(int_vector); + Ok(()) + } + + fn finish( + &mut self, + ) -> PyResult<()> { + let builder = std::mem::replace(&mut self.inv_builder, bmp::index::inverted_index::IndexBuilder::new(0, 0)); + let inverted_index = builder.build(self.compress_range); + let forward_index = self.fwd_builder.build(); + let b_forward_index = bmp::index::forward_index::fwd2bfwd(&forward_index, self.bsize); + let file = std::fs::File::create(self.path.clone()).expect("Failed to create file"); + let writer = std::io::BufWriter::new(file); + // Serialize the index directly into a file using bincode + bincode::serialize_into(writer, &(&inverted_index, &b_forward_index)) + .expect("Failed to serialize"); + Ok(()) + } +} + /// A Python module implemented in Rust. The name of this function must match /// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to /// import the module. 
@@ -157,6 +218,7 @@ fn _bmp(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(ciff2bmp, m)?)?; m.add_function(wrap_pyfunction!(search, m)?)?; m.add_class::<Searcher>()?; + m.add_class::<InvertedIndexer>()?; m.add_class::<Indexer>()?; Ok(()) } diff --git a/src/index/inverted_index.rs b/src/index/inverted_index.rs index f79e1a7..9b07ff5 100644 --- a/src/index/inverted_index.rs +++ b/src/index/inverted_index.rs @@ -93,8 +93,14 @@ impl IndexBuilder { self.terms.push(term.to_string()); } - pub fn insert_document(&mut self, name: &str) { + pub fn push_posting(&mut self, term_id: u32, doc_id: u32, tf: u32) { + self.posting_lists[term_id as usize].push((doc_id, tf)); + } + + pub fn insert_document(&mut self, name: &str) -> u32 { + let doc_id = self.documents.len(); self.documents.push(name.to_string()); + return doc_id as u32; } fn compress(data: &[u8]) -> Vec<u8> { @@ -152,8 +158,14 @@ impl IndexBuilder { .collect(); let mut build = MapBuilder::memory(); - self.terms.iter().enumerate().for_each(|(index, term)| { - let _ = build.insert(term, index as u64); + + let mut indexed_terms: Vec<(usize, &String)> = self.terms.iter().enumerate().collect(); + + // Sort the terms lexicographically while keeping the original indices + indexed_terms.sort_by(|a, b| a.1.cmp(b.1)); + + indexed_terms.iter().for_each(|(index, term)| { + let _ = build.insert(term, *index as u64); }); Index { From 94c35d4333353c0ca935d4f2eb74a594691ac102 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Tue, 17 Sep 2024 14:33:38 +0100 Subject: [PATCH 3/3] add comment --- src/index/inverted_index.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/index/inverted_index.rs b/src/index/inverted_index.rs index 9b07ff5..adfa52a 100644 --- a/src/index/inverted_index.rs +++ b/src/index/inverted_index.rs @@ -94,6 +94,7 @@ impl IndexBuilder { } pub fn push_posting(&mut self, term_id: u32, doc_id: u32, tf: u32) { + // Pushes the doc_id and tf to the posting associated with term_id.
This function assumes doc_ids are added in an increasing order. self.posting_lists[term_id as usize].push((doc_id, tf)); }