From f1edc2c55e1638fbae2fa4c676f277408ecc9fd7 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 27 Sep 2023 13:04:30 -0700 Subject: [PATCH 01/40] init changes --- Cargo.lock | 178 +++++++++++++++++++++------------------------------ Cargo.toml | 3 +- src/index.rs | 60 +++++++++++++---- 3 files changed, 122 insertions(+), 119 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 11246cb2..0e453e9e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,12 +2,6 @@ # It is not intended for manual editing. version = 3 -[[package]] -name = "Inflector" -version = "0.11.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3" - [[package]] name = "adler" version = "1.0.2" @@ -127,6 +121,12 @@ dependencies = [ "thiserror", ] +[[package]] +name = "binary-merge" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597bb81c80a54b6a4381b23faba8d7774b144c94cbd1d6fe3f1329bd776554ab" + [[package]] name = "bindgen" version = "0.65.1" @@ -269,6 +269,15 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "camino" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +dependencies = [ + "serde", +] + [[package]] name = "cc" version = "1.0.83" @@ -477,6 +486,18 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +[[package]] +name = "enum_dispatch" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e" +dependencies = [ + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.29", +] + [[package]] name = "env_logger" version = "0.10.0" @@ -542,37 +563,12 @@ dependencies = [ "num-traits", ] -[[package]] -name = "flume" -version = "0.10.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1657b4441c3403d9f7b3409e47575237dac27b1b5726df654a6ecbf92f0f7577" -dependencies = [ - "futures-core", - "futures-sink", - "nanorand", - "pin-project", - "spin", -] - [[package]] name = "funty" version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" -[[package]] -name = "futures-core" -version = "0.3.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" - -[[package]] -name = "futures-sink" -version = "0.3.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" - [[package]] name = "generic-array" version = "0.14.7" @@ -623,6 +619,12 @@ dependencies = [ "ahash", ] +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + [[package]] name = "hermit-abi" version = "0.3.2" @@ -688,6 +690,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "inplace-vec-builder" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf64c2edc8226891a71f127587a2861b132d2b942310843814d5001d99a1d307" +dependencies = [ + "smallvec", +] + [[package]] name = "is-terminal" version = 
"0.4.9" @@ -844,9 +855,9 @@ checksum = "5486aed0026218e61b8a01d5fbd5a0a134649abb71a0e53b7bc088529dced86e" [[package]] name = "memmap2" -version = "0.5.10" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" +checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" dependencies = [ "libc", ] @@ -881,15 +892,6 @@ version = "0.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2983372caf4480544083767bf2d27defafe32af49ab4df3a0b7fc90793a3664" -[[package]] -name = "nanorand" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3" -dependencies = [ - "getrandom", -] - [[package]] name = "needletail" version = "0.5.1" @@ -988,25 +990,27 @@ checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "ouroboros" -version = "0.15.6" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1358bd1558bd2a083fed428ffeda486fbfb323e698cdda7794259d592ca72db" +checksum = "1c86de06555b970aec45229b27291b53154f21a5743a163419f4e4c0b065dcde" dependencies = [ "aliasable", "ouroboros_macro", + "static_assertions", ] [[package]] name = "ouroboros_macro" -version = "0.15.6" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f7d21ccd03305a674437ee1248f3ab5d4b1db095cf1caf49f1713ddf61956b7" +checksum = "8cad0c4b129e9696e37cb712b243777b90ef489a0bfaa0ac34e7d9b860e4f134" dependencies = [ - "Inflector", + "heck", + "itertools", "proc-macro-error", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.29", ] [[package]] @@ -1061,39 +1065,20 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" -[[package]] -name = "pin-project" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fda4ed1c6c173e3fc7a83629421152e01d7b1f9b7f65fb301e490e8cfc656422" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.29", -] - [[package]] name = "piz" -version = "0.4.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58c75d1c00e6d407e283cc66d9d4fd0985ef1703c761520845b93c4f981bfb65" +checksum = "898b071c1938a2c92b95c18708cbf38f2566a01f0ab9dd7bdf4329987e5c2e17" dependencies = [ + "camino", "chrono", "codepage-437", "crc32fast", "flate2", "log", + "memchr", "thiserror", - "twoway", ] [[package]] @@ -1409,9 +1394,9 @@ dependencies = [ [[package]] name = "retain_mut" -version = "0.1.9" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4389f1d5789befaf6029ebd9f7dac4af7f7e3d61b69d4f30e2ac02b57e7712b0" +checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" [[package]] name = "rkyv" @@ -1443,9 +1428,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.9.0" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd539cab4e32019956fe7e0cf160bb6d4802f4be2b52c4253d76d3bb0f85a5f7" +checksum 
= "6106b5cf8587f5834158895e9715a3c6c9716c8aefab57f1f7680917191c7873" dependencies = [ "bytemuck", "byteorder", @@ -1603,15 +1588,18 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.12.0" -source = "git+https://github.com/sourmash-bio/sourmash?rev=ff1092f8f366339caa59d7203f623813228f4356#ff1092f8f366339caa59d7203f623813228f4356" dependencies = [ "az", "bytecount", "byteorder", + "camino", "cfg-if", + "chrono", "counter", + "csv", + "enum_dispatch", "fixedbitset", - "flume", + "getrandom", "getset", "histogram", "log", @@ -1641,15 +1629,6 @@ dependencies = [ "web-sys", ] -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -dependencies = [ - "lock_api", -] - [[package]] name = "static_assertions" version = "1.1.0" @@ -1787,16 +1766,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "twoway" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c57ffb460d7c24cd6eda43694110189030a3d1dfe418416d9468fd1c1d290b47" -dependencies = [ - "memchr", - "unchecked-index", -] - [[package]] name = "twox-hash" version = "1.6.3" @@ -1810,9 +1779,9 @@ dependencies = [ [[package]] name = "typed-builder" -version = "0.10.0" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89851716b67b937e393b3daa8423e67ddfc4bbbf1654bcf05488e95e0828db0c" +checksum = "64cba322cb9b7bc6ca048de49e83918223f35e7a86311267013afff257004870" dependencies = [ "proc-macro2", "quote", @@ -1825,12 +1794,6 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" -[[package]] -name = "unchecked-index" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eeba86d422ce181a719445e51872fa30f1f7413b62becb52e95ec91aa262d85c" - [[package]] name = "unicode-ident" version = "1.0.11" @@ -1857,10 +1820,13 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vec-collections" -version = "0.3.6" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f2390c4dc8ae8640c57d067b1a3d40bc05c124cc6bc7394d761b53435d41b76" +checksum = "3c9965c8f2ffed1dbcd16cafe18a009642f540fa22661c6cfd6309ddb02e4982" dependencies = [ + "binary-merge", + "inplace-vec-builder", + "lazy_static", "num-traits", "serde", "smallvec", diff --git a/Cargo.toml b/Cargo.toml index 8d1938f2..c67fc419 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.19.2", features = ["extension-module", "anyhow"] } rayon = "1.8.0" serde = { version = "1.0.136", features = ["derive"] } -sourmash = { git = "https://github.com/sourmash-bio/sourmash", "rev" = "ff1092f8f366339caa59d7203f623813228f4356" } +#sourmash = { git = "https://github.com/sourmash-bio/sourmash", "rev" = "ff1092f8f366339caa59d7203f623813228f4356" } +sourmash = { path = "../sourmash/src/core", features = ["branchwater"] } serde_json = "1.0.107" niffler = "2.4.0" log = "0.4.14" diff --git a/src/index.rs b/src/index.rs index bee725cd..21f61162 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,4 +1,10 @@ -use 
sourmash::index::revindex::RevIndex; +//use sourmash::index::revindex::RevIndex; +use sourmash::collection::Collection; +use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; +use sourmash::manifest::Manifest; +use sourmash::prelude::*; +use sourmash::signature::{Signature, SigsTrait}; +use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; use sourmash::sketch::Sketch; use std::path::Path; @@ -6,26 +12,56 @@ use crate::utils::{load_sigpaths_from_zip_or_pathlist, ReportType}; pub fn index>( siglist: P, - template: Sketch, + template: Sketch, + manifest: Option
<P>
, + selection: Selection, output: P, save_paths: bool, colors: bool, ) -> Result<(), Box> { println!("Loading siglist"); - let (index_sigs, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(&siglist, &template, ReportType::Index)?; + // let (index_sigs, _temp_dir) = load_sigpaths_from_zip_or_pathlist(&siglist)?; - // if index_sigs pathlist is empty, bail - if index_sigs.is_empty() { - bail!("No signatures to index loaded, exiting."); - } + // // if index_sigs pathlist is empty, bail + // if index_sigs.is_empty() { + // bail!("No signatures to index loaded, exiting."); + // } - // Create or open the RevIndex database with the provided output path and colors flag - let db = RevIndex::create(output.as_ref(), colors); + // // Create or open the RevIndex database with the provided output path and colors flag + // let db = RevIndex::create(output.as_ref(), colors); - // Index the signatures using the loaded template, threshold, and save_paths option - db.index(index_sigs, &template, 0.0, save_paths); + // // Index the signatures using the loaded template, threshold, and save_paths option + // db.index(index_sigs, &template, 0.0, save_paths); + + let manifest = if let Some(m) = manifest { + let rdr = std::fs::OpenOptions::new().read(true).open(m.as_ref())?; + Some(Manifest::from_reader(rdr)?) + } else { + None + }; + + let collection = if matches!(siglist.as_ref().extension(), Some("zip")) { + if let Some(m) = manifest { + let storage = ZipStorage::from_file(siglist)?; + Collection::new(m, InnerStorage::new(storage)) + } else { + Collection::from_zipfile(siglist)? + } + } else { + let manifest = manifest.ok_or_else(|| "Need a manifest")?; + let storage = FSStorage::builder() + .fullpath("".into()) + .subdir("".into()) + .build(); + Collection::new(manifest, InnerStorage::new(storage)) + }; + + RevIndex::create( + output.as_ref(), + collection.select(&selection)?.try_into()?, + colors, + )?; Ok(()) } From cb702b3842cd90498cc540126db99f3bbe6a97b8 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 27 Sep 2023 15:53:42 -0700 Subject: [PATCH 02/40] compiling code using newer mastiff branch --- Cargo.lock | 1 + Cargo.toml | 1 + src/check.rs | 5 +++-- src/index.rs | 14 ++++++++------ src/lib.rs | 34 +++++++++++++++++++++++++++++++++- src/mastiff_manygather.rs | 21 ++++++++++++++++++--- src/mastiff_manysearch.rs | 11 +++++++++-- src/utils.rs | 12 ++++++------ 8 files changed, 79 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0e453e9e..e802d28d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1221,6 +1221,7 @@ dependencies = [ "anyhow", "assert_cmd", "assert_matches", + "camino", "csv", "env_logger", "log", diff --git a/Cargo.toml b/Cargo.toml index c67fc419..f96dbfd8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ zip = "0.6" tempfile = "3.8" needletail = "0.5.1" csv = "1.2.2" +camino = "1.1.6" [dev-dependencies] assert_cmd = "2.0.4" diff --git a/src/check.rs b/src/check.rs index 3b6484ee..7df0ca2a 100644 --- a/src/check.rs +++ b/src/check.rs @@ -2,7 +2,8 @@ use std::path::Path; use crate::utils::is_revindex_database; -use sourmash::index::revindex::RevIndex; +use sourmash::index::revindex::{RevIndex, RevIndexOps}; + pub fn check>(index: P, quick: bool) -> Result<(), Box> { if !is_revindex_database(index.as_ref()) { @@ -13,7 +14,7 @@ pub fn check>(index: P, quick: bool) -> Result<(), Box>( - siglist: P, - template: Sketch, + siglist: PathBuf, + // template: Sketch, manifest: Option
<P>
, selection: Selection, output: P, @@ -41,7 +43,7 @@ pub fn index>( None }; - let collection = if matches!(siglist.as_ref().extension(), Some("zip")) { + let collection = if matches!(&siglist.extension(), Some("zip")) { if let Some(m) = manifest { let storage = ZipStorage::from_file(siglist)?; Collection::new(m, InnerStorage::new(storage)) diff --git a/src/lib.rs b/src/lib.rs index e7de2643..b924d2a8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,6 +16,8 @@ mod manysketch; mod mastiff_manygather; mod mastiff_manysearch; mod multisearch; +use sourmash::selection::Selection; +use sourmash::encodings::HashFunctions; #[pyfunction] fn do_manysearch( @@ -103,10 +105,24 @@ fn do_fastmultigather( // if a siglist path is a revindex, run mastiff_manygather. If not, run multigather let template = build_template(ksize, scaled, &moltype); if is_revindex_database(siglist_path.as_ref()) { + // build selection instead of template + let hash_function = match moltype.as_str() { + "dna" => HashFunctions::Murmur64Dna, + "protein" => HashFunctions::Murmur64Protein, + "dayhoff" => HashFunctions::Murmur64Dayhoff, + "hp" => HashFunctions::Murmur64Hp, + _ => panic!("Unknown molecule type: {}", moltype), + }; + let selection = Selection::builder() + .ksize(ksize.into()) + .scaled(scaled as u32) + .moltype(hash_function) + .build(); match mastiff_manygather::mastiff_manygather( query_filenames, siglist_path, template, + selection, threshold_bp, output_path, ) { @@ -160,9 +176,25 @@ fn do_index( save_paths: bool, colors: bool, ) -> anyhow::Result { + let hash_function = match moltype.as_str() { + "dna" => HashFunctions::Murmur64Dna, + "protein" => HashFunctions::Murmur64Protein, + "dayhoff" => HashFunctions::Murmur64Dayhoff, + "hp" => HashFunctions::Murmur64Hp, + _ => panic!("Unknown molecule type: {}", moltype), + }; + let selection = Selection::builder() + .ksize(ksize.into()) + .scaled(scaled as u32) + .moltype(hash_function) + .build(); + // match index::index(siglist, template, output, save_paths, colors) { + // convert siglist to PathBuf // build template from ksize, scaled let template = build_template(ksize, scaled, &moltype); - match index::index(siglist, template, output, save_paths, colors) { + let location = camino::Utf8PathBuf::from(siglist); + let manifest = None; + match index::index(location, manifest, selection, output, save_paths, colors) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 291f6b36..4ac8b5b3 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -6,7 +6,16 @@ use sourmash::signature::Signature; use sourmash::sketch::Sketch; use std::path::Path; -use sourmash::index::revindex::RevIndex; +// use sourmash::collection::Collection; +// use sourmash::selection::Selection;A +use sourmash::prelude::*; +// use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; +// use sourmash::manifest::Manifest; +// use sourmash::prelude::*; +// use sourmash::signature::{Signature, SigsTrait}; +// use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; + +use sourmash::index::revindex::{RevIndex, RevIndexOps}; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -22,6 +31,7 @@ pub fn mastiff_manygather>( queries_file: P, index: P, template: Sketch, + selection: Selection, threshold_bp: usize, output: Option
<P>
, ) -> Result<(), Box> { @@ -32,7 +42,7 @@ pub fn mastiff_manygather>( ); } // Open database once - let db = RevIndex::open(index.as_ref(), true); + let db = RevIndex::open(index.as_ref(), true)?; println!("Loaded DB"); // Load query paths @@ -89,6 +99,10 @@ pub fn mastiff_manygather>( match Signature::from_path(filename) { Ok(query_sig) => { let location = filename.display().to_string(); + // if let Some(q) = prepare_query(&query_sig, &selection) { + // query = Some(q); + // } + // let query = query.expect("Couldn't find a compatible MinHash"); if let Some(query) = prepare_query(&query_sig, &template, &location) { // let query_size = query.minhash.size() as f64; let threshold = threshold_bp / query.minhash.scaled() as usize; @@ -105,7 +119,8 @@ pub fn mastiff_manygather>( hash_to_color, threshold, &query.minhash, - &template, + // Some(selection.clone()), + None, ); // extract matches from Result diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 4681a8ef..654c1c17 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -6,7 +6,14 @@ use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::Sketch; use std::path::Path; -use sourmash::index::revindex::RevIndex; +// use sourmash::collection::Collection; +// use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; +// use sourmash::manifest::Manifest; +// use sourmash::prelude::*; +// use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; + +// use sourmash::index::revindex::RevIndex; +use sourmash::index::revindex::{RevIndex, RevIndexOps}; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -30,7 +37,7 @@ pub fn mastiff_manysearch>( ); } // Open database once - let db = RevIndex::open(index.as_ref(), true); + let db = RevIndex::open(index.as_ref(), true)?; println!("Loaded DB"); // Load query paths diff --git a/src/utils.rs b/src/utils.rs index 4cd36630..d592543c 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -19,8 +19,8 @@ use anyhow::{anyhow, Result}; use std::cmp::{Ordering, PartialOrd}; -use sourmash::prelude::FracMinHashOps; -use sourmash::prelude::MinHashOps; +// use sourmash::prelude::FracMinHashOps; +// use sourmash::prelude::HashOps; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; use sourmash::sketch::Sketch; @@ -791,10 +791,10 @@ pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Disp pub fn build_template(ksize: u8, scaled: usize, moltype: &str) -> Sketch { let hash_function = match moltype { - "dna" => HashFunctions::murmur64_DNA, - "protein" => HashFunctions::murmur64_protein, - "dayhoff" => HashFunctions::murmur64_dayhoff, - "hp" => HashFunctions::murmur64_hp, + "dna" => HashFunctions::Murmur64Dna, + "protein" => HashFunctions::Murmur64Protein, + "dayhoff" => HashFunctions::Murmur64Dayhoff, + "hp" => HashFunctions::Murmur64Hp, _ => panic!("Unknown molecule type: {}", moltype), }; //adjust ksize if not dna From db318bab88c17b33409c3232c61a5a70cd92f8af Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Mon, 2 Oct 2023 16:07:43 -0700 Subject: [PATCH 03/40] use selection --- Cargo.toml | 1 + src/mastiff_manygather.rs | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f96dbfd8..3e636aec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ pyo3 = { version = "0.19.2", features = ["extension-module", "anyhow"] } rayon = "1.8.0" serde = { version = "1.0.136", features = ["derive"] } #sourmash = { git = "https://github.com/sourmash-bio/sourmash", "rev" = "ff1092f8f366339caa59d7203f623813228f4356" } +#sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch = "lirber/mastiff", features = ["branchwater"] } sourmash = { path = "../sourmash/src/core", features = ["branchwater"] } serde_json = "1.0.107" niffler = "2.4.0" diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 4ac8b5b3..10954d8a 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -119,8 +119,7 @@ pub fn mastiff_manygather>( hash_to_color, threshold, &query.minhash, - // Some(selection.clone()), - None, + Some(selection.clone()), ); // extract matches from Result From 07c8362284429813a41f790fd3a4c31b8a906c12 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sun, 5 Nov 2023 14:13:02 -0800 Subject: [PATCH 04/40] rustfmt --- src/check.rs | 1 - src/index.rs | 4 ++-- src/lib.rs | 20 ++++++++++---------- src/mastiff_manygather.rs | 2 +- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/check.rs b/src/check.rs index 7df0ca2a..7fea2eca 100644 --- a/src/check.rs +++ b/src/check.rs @@ -4,7 +4,6 @@ use crate::utils::is_revindex_database; use sourmash::index::revindex::{RevIndex, RevIndexOps}; - pub fn check>(index: P, quick: bool) -> Result<(), Box> { if !is_revindex_database(index.as_ref()) { bail!( diff --git a/src/index.rs b/src/index.rs index 7ed445cf..b2c38d63 100644 --- a/src/index.rs +++ b/src/index.rs @@ -7,14 +7,14 @@ use sourmash::prelude::*; // use sourmash::signature::{Signature, SigsTrait}; use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; // use sourmash::sketch::Sketch; -use std::path::Path; use camino::Utf8PathBuf as PathBuf; +use std::path::Path; use crate::utils::{load_sigpaths_from_zip_or_pathlist, ReportType}; pub fn index>( siglist: PathBuf, - // template: Sketch, + // template: Sketch, manifest: Option
<P>
, selection: Selection, output: P, diff --git a/src/lib.rs b/src/lib.rs index b924d2a8..efb47082 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,8 +16,8 @@ mod manysketch; mod mastiff_manygather; mod mastiff_manysearch; mod multisearch; -use sourmash::selection::Selection; use sourmash::encodings::HashFunctions; +use sourmash::selection::Selection; #[pyfunction] fn do_manysearch( @@ -114,10 +114,10 @@ fn do_fastmultigather( _ => panic!("Unknown molecule type: {}", moltype), }; let selection = Selection::builder() - .ksize(ksize.into()) - .scaled(scaled as u32) - .moltype(hash_function) - .build(); + .ksize(ksize.into()) + .scaled(scaled as u32) + .moltype(hash_function) + .build(); match mastiff_manygather::mastiff_manygather( query_filenames, siglist_path, @@ -184,12 +184,12 @@ fn do_index( _ => panic!("Unknown molecule type: {}", moltype), }; let selection = Selection::builder() - .ksize(ksize.into()) - .scaled(scaled as u32) - .moltype(hash_function) - .build(); + .ksize(ksize.into()) + .scaled(scaled as u32) + .moltype(hash_function) + .build(); // match index::index(siglist, template, output, save_paths, colors) { - // convert siglist to PathBuf + // convert siglist to PathBuf // build template from ksize, scaled let template = build_template(ksize, scaled, &moltype); let location = camino::Utf8PathBuf::from(siglist); diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index e51af754..61a0d522 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -100,7 +100,7 @@ pub fn mastiff_manygather>( Ok(query_sig) => { let location = filename.display().to_string(); // if let Some(q) = prepare_query(&query_sig, &selection) { - // query = Some(q); + // query = Some(q); // } // let query = query.expect("Couldn't find a compatible MinHash"); if let Some(query) = prepare_query(&query_sig, &template, &location) { From ff4846920d326733b920bb351dfeae004048c413 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 21 Nov 2023 17:38:28 -0800 Subject: [PATCH 05/40] update deps --- Cargo.lock | 263 ++++------------------------------------------------- Cargo.toml | 2 +- 2 files changed, 17 insertions(+), 248 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 08ef6a0b..4391cffd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,17 +8,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" -[[package]] -name = "aes" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac1f845298e95f983ff1944b728ae08b8cebab80d684f0a832ed0fc74dfa27e2" -dependencies = [ - "cfg-if", - "cipher", - "cpufeatures", -] - [[package]] name = "ahash" version = "0.7.6" @@ -105,12 +94,6 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973" -[[package]] -name = "base64ct" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" - [[package]] name = "bgzip" version = "0.2.2" @@ -172,15 +155,6 @@ dependencies = [ "wyz", ] -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - [[package]] name = "bstr" version = "1.6.2" @@ -313,21 +287,11 @@ dependencies = [ 
"iana-time-zone", "js-sys", "num-traits", - "time 0.1.45", + "time", "wasm-bindgen", "windows-targets", ] -[[package]] -name = "cipher" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" -dependencies = [ - "crypto-common", - "inout", -] - [[package]] name = "clang-sys" version = "1.6.1" @@ -348,12 +312,6 @@ dependencies = [ "csv", ] -[[package]] -name = "constant_time_eq" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" - [[package]] name = "core-foundation-sys" version = "0.8.4" @@ -369,15 +327,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "cpufeatures" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" -dependencies = [ - "libc", -] - [[package]] name = "crc32fast" version = "1.3.2" @@ -420,16 +369,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "crypto-common" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" -dependencies = [ - "generic-array", - "typenum", -] - [[package]] name = "csv" version = "1.3.0" @@ -451,29 +390,12 @@ dependencies = [ "memchr", ] -[[package]] -name = "deranged" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2696e8a945f658fd14dc3b87242e6b80cd0f36ff04ea560fa39082368847946" - [[package]] name = "difflib" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" -[[package]] -name = "digest" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", - "subtle", -] - [[package]] name = "doc-comment" version = "0.3.3" @@ -569,16 +491,6 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] - [[package]] name = "getrandom" version = "0.2.10" @@ -637,15 +549,6 @@ version = "0.6.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "12cb882ccb290b8646e554b157ab0b71e64e8d5bef775cd66b6531e52d302669" -[[package]] -name = "hmac" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" -dependencies = [ - "digest", -] - [[package]] name = "humantime" version = "2.1.0" @@ -681,15 +584,6 @@ version = "2.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8" -[[package]] -name = "inout" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5" -dependencies = [ - "generic-array", -] - [[package]] name = 
"inplace-vec-builder" version = "0.1.1" @@ -855,9 +749,9 @@ checksum = "5486aed0026218e61b8a01d5fbd5a0a134649abb71a0e53b7bc088529dced86e" [[package]] name = "memmap2" -version = "0.7.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" +checksum = "deaba38d7abf1d4cca21cc89e932e542ba2b9258664d2a9ef0e61512039c9375" dependencies = [ "libc", ] @@ -918,7 +812,7 @@ dependencies = [ "flate2", "thiserror", "xz2", - "zstd 0.12.4", + "zstd", ] [[package]] @@ -973,15 +867,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "numsep" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad5c49c3e12c314efb1f43cba136031b657dcd59ee26936ab2be313c5e97da22" -dependencies = [ - "slicestring", -] - [[package]] name = "once_cell" version = "1.18.0" @@ -1036,29 +921,6 @@ dependencies = [ "windows-targets", ] -[[package]] -name = "password-hash" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700" -dependencies = [ - "base64ct", - "rand_core", - "subtle", -] - -[[package]] -name = "pbkdf2" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917" -dependencies = [ - "digest", - "hmac", - "password-hash", - "sha2", -] - [[package]] name = "peeking_take_while" version = "0.1.2" @@ -1532,28 +1394,6 @@ dependencies = [ "serde", ] -[[package]] -name = "sha1" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "sha2" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - [[package]] name = "shlex" version = "1.1.0" @@ -1572,18 +1412,6 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8542b68b8800c3cda649d2c72d688b6907b30f1580043135d61669d4aad1c175" -[[package]] -name = "size" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fed904c7fb2856d868b92464fc8fa597fce366edea1a9cbfaa8cb5fe080bd6d" - -[[package]] -name = "slicestring" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "636b979c5672ac7c2a1120ca0a9a6074cd090dadfec42af6f8a5baea1223d180" - [[package]] name = "smallvec" version = "1.11.0" @@ -1599,9 +1427,9 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.12.0" +source = "git+https://github.com/sourmash-bio/sourmash?branch=lirber/mastiff#cfe28341af5847235bbf853424ee4917995665ee" dependencies = [ "az", - "bytecount", "byteorder", "camino", "cfg-if", @@ -1620,7 +1448,6 @@ dependencies = [ "niffler", "nohash-hasher", "num-iter", - "numsep", "once_cell", "ouroboros", "piz", @@ -1631,7 +1458,6 @@ dependencies = [ "rocksdb", "serde", "serde_json", - "size", "thiserror", "twox-hash", "typed-builder", @@ -1646,12 +1472,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "subtle" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" - [[package]] name = "syn" version = "1.0.109" @@ -1745,23 +1565,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "time" -version = "0.3.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f6bb557fd245c28e6411aa56b6403c689ad95061f50e4be16c274e70a17e48" -dependencies = [ - "deranged", - "serde", - "time-core", -] - -[[package]] -name = "time-core" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" - [[package]] name = "tinyvec" version = "1.6.0" @@ -1799,12 +1602,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "typenum" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" - [[package]] name = "unicode-ident" version = "1.0.11" @@ -1873,9 +1670,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.87" +version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +checksum = "7daec296f25a1bae309c0cd5c29c4b260e510e6d813c286b19eaadf409d40fce" dependencies = [ "cfg-if", "serde", @@ -1885,9 +1682,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.87" +version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +checksum = "e397f4664c0e4e428e8313a469aaa58310d302159845980fd23b0f22a847f217" dependencies = [ "bumpalo", "log", @@ -1900,9 +1697,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.87" +version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +checksum = "5961017b3b08ad5f3fe39f1e79877f8ee7c23c5e5fd5eb80de95abc41f1f16b2" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1910,9 +1707,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.87" +version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +checksum = "c5353b8dab669f5e10f5bd76df26a9360c748f054f862ff5f3f8aae0c7fb3907" dependencies = [ "proc-macro2", "quote", @@ -1923,9 +1720,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.87" +version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +checksum = "0d046c5d029ba91a1ed14da14dca44b68bf2f124cfbaf741c54151fdb3e0750b" [[package]] name = "web-sys" @@ -2067,27 +1864,9 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" dependencies = [ - "aes", "byteorder", - "bzip2", - "constant_time_eq", "crc32fast", "crossbeam-utils", - "flate2", - "hmac", - "pbkdf2", - "sha1", - "time 0.3.28", - "zstd 0.11.2+zstd.1.5.2", -] - -[[package]] -name = "zstd" 
-version = "0.11.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" -dependencies = [ - "zstd-safe 5.0.2+zstd.1.5.2", ] [[package]] @@ -2096,17 +1875,7 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" dependencies = [ - "zstd-safe 6.0.6", -] - -[[package]] -name = "zstd-safe" -version = "5.0.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" -dependencies = [ - "libc", - "zstd-sys", + "zstd-safe", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 45cc21c5..0ce8b0e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ log = "0.4.14" env_logger = "0.10.1" simple-error = "0.3.0" anyhow = "1.0.75" -zip = "0.6" +zip = { version = "0.6", default-features = false } tempfile = "3.8" needletail = "0.5.1" csv = "1.3.0" From ddf6c8c3baef2ab19d7e44f3639817c6650c8bc2 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Thu, 30 Nov 2023 20:56:37 -0800 Subject: [PATCH 06/40] update to sourmash 0.12.0 --- Cargo.lock | 31 ++++++++++++++++--------------- Cargo.toml | 3 ++- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b5561ba8..bd83ed3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -630,9 +630,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.64" +version = "0.3.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +checksum = "cee9c64da59eae3b50095c18d3e74f8b73c0b86d2792824ff01bbce68ba229ca" dependencies = [ "wasm-bindgen", ] @@ -1427,7 +1427,8 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.12.0" -source = "git+https://github.com/sourmash-bio/sourmash?branch=lirber/mastiff#cfe28341af5847235bbf853424ee4917995665ee" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "760c7b049cc70294122c44c4e6d0922ed0e79a8e04f2d739b98a982027a9fd4a" dependencies = [ "az", "byteorder", @@ -1670,9 +1671,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.88" +version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7daec296f25a1bae309c0cd5c29c4b260e510e6d813c286b19eaadf409d40fce" +checksum = "0ed0d4f68a3015cc185aff4db9506a015f4b96f95303897bfa23f846db54064e" dependencies = [ "cfg-if", "serde", @@ -1682,9 +1683,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.88" +version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e397f4664c0e4e428e8313a469aaa58310d302159845980fd23b0f22a847f217" +checksum = "1b56f625e64f3a1084ded111c4d5f477df9f8c92df113852fa5a374dbda78826" dependencies = [ "bumpalo", "log", @@ -1697,9 +1698,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.88" +version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5961017b3b08ad5f3fe39f1e79877f8ee7c23c5e5fd5eb80de95abc41f1f16b2" +checksum = "0162dbf37223cd2afce98f3d0785506dcb8d266223983e4b5b525859e6e182b2" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1707,9 +1708,9 @@ dependencies = [ [[package]] name = 
"wasm-bindgen-macro-support" -version = "0.2.88" +version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5353b8dab669f5e10f5bd76df26a9360c748f054f862ff5f3f8aae0c7fb3907" +checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" dependencies = [ "proc-macro2", "quote", @@ -1720,15 +1721,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.88" +version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d046c5d029ba91a1ed14da14dca44b68bf2f124cfbaf741c54151fdb3e0750b" +checksum = "7ab9b36309365056cd639da3134bf87fa8f3d86008abf99e612384a6eecd459f" [[package]] name = "web-sys" -version = "0.3.64" +version = "0.3.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +checksum = "50c24a44ec86bb68fbecd1b3efed7e85ea5621b39b35ef2766b66cd984f8010f" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 67d2cd2b..1d1d3f47 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,8 @@ rayon = "1.8.0" serde = { version = "1.0.192", features = ["derive"] } #sourmash = { git = "https://github.com/sourmash-bio/sourmash", "rev" = "ff1092f8f366339caa59d7203f623813228f4356" } -sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch = "lirber/mastiff", features = ["branchwater"] } +#sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch = "lirber/mastiff", features = ["branchwater"] } +sourmash = { version = "0.12.0", features = ["branchwater"] } serde_json = "1.0.108" niffler = "2.4.0" log = "0.4.14" From b48ac885ef1e9faf6b6dd44e376997cc5c62114f Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 1 Dec 2023 15:22:29 -0800 Subject: [PATCH 07/40] fix index --- Cargo.lock | 1 + Cargo.toml | 2 +- src/index.rs | 11 +++++++++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bd83ed3b..9b850190 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1868,6 +1868,7 @@ dependencies = [ "byteorder", "crc32fast", "crossbeam-utils", + "flate2", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 1d1d3f47..c5952f98 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ log = "0.4.14" env_logger = "0.10.1" simple-error = "0.3.0" anyhow = "1.0.75" -zip = { version = "0.6", default-features = false } +zip = { version = "0.6", default-features = false, features = ["deflate"] } tempfile = "3.8" needletail = "0.5.1" csv = "1.3.0" diff --git a/src/index.rs b/src/index.rs index b2c38d63..6768d058 100644 --- a/src/index.rs +++ b/src/index.rs @@ -10,7 +10,7 @@ use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; use camino::Utf8PathBuf as PathBuf; use std::path::Path; -use crate::utils::{load_sigpaths_from_zip_or_pathlist, ReportType}; +use crate::utils::load_sketchlist_filenames; pub fn index>( siglist: PathBuf, @@ -51,7 +51,14 @@ pub fn index>( Collection::from_zipfile(siglist)? } } else { - let manifest = manifest.ok_or_else(|| "Need a manifest")?; + let manifest = manifest.unwrap_or_else(|| { + let sig_paths: Vec<_> = load_sketchlist_filenames(&siglist) + .unwrap_or_else(|_| panic!("Error loading siglist")) + .into_iter() + .map(|v| PathBuf::from_path_buf(v).unwrap()) + .collect(); + sig_paths.as_slice().into() + }); let storage = FSStorage::builder() .fullpath("".into()) .subdir("".into()) From f1145a16b4f799113eb134497180003482f4c9ac Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 1 Dec 2023 16:26:13 -0800 Subject: [PATCH 08/40] rm reporting line checks not in smash core idx --- src/python/tests/test_index.py | 38 ++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index 2968b356..eeb8f76a 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -89,7 +89,7 @@ def test_index_missing_siglist(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error loading siglist' in captured.err def test_index_bad_siglist(runtmp, capfd): @@ -103,7 +103,7 @@ def test_index_bad_siglist(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert "Error: invalid line in fromfile" in captured.err + assert 'Error loading siglist' in captured.err print(runtmp.last_result.err) @@ -128,21 +128,29 @@ def test_index_bad_siglist_2(runtmp, capfd): def test_index_empty_siglist(runtmp, capfd): + ## TODO: index:: do not write output if no signatures to write? + # OR, warn user? + # test empty siglist file siglist = runtmp.output('db-sigs.txt') output = runtmp.output('out.db') make_file_list(siglist, []) # empty - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', siglist, + # with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'index', siglist, '-o', output) captured = capfd.readouterr() + assert os.path.exists(output) # do we want an empty file, or no file? + print(runtmp.last_result.out) + print(runtmp.last_result.err) print(captured.err) - assert "No signatures to index loaded, exiting." in captured.err + # assert "No signatures to index loaded, exiting." in captured.err def test_index_nomatch_sig_in_siglist(runtmp, capfd): + ## TODO: index:: do not write output if no signatures to write? + # test index with a siglist file that has (only) a non-matching ksize sig siglist = runtmp.output('against.txt') db = runtmp.output('db.rdb') @@ -151,13 +159,16 @@ def test_index_nomatch_sig_in_siglist(runtmp, capfd): sig1 = get_test_data('1.fa.k21.sig.gz') make_file_list(siglist, [sig2, sig1]) - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', siglist, + # with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'index', siglist, '-o', db) captured = capfd.readouterr() + assert os.path.exists(db) # do we want an empty file, or no file? 
+ print(runtmp.last_result.out) + print(runtmp.last_result.err) print(captured.err) - assert "Couldn't find a compatible MinHash" in captured.err + # assert "Couldn't find a compatible MinHash" in captured.err def test_index_zipfile(runtmp, capfd): @@ -184,7 +195,7 @@ def test_index_zipfile(runtmp, capfd): assert 'index is done' in runtmp.last_result.err captured = capfd.readouterr() print(captured.err) - assert 'Found 3 filepaths' in captured.err + # assert 'Found 3 filepaths' in captured.err def test_index_zipfile_repeated_md5sums(runtmp, capfd): @@ -212,7 +223,7 @@ def test_index_zipfile_repeated_md5sums(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Found 3 filepaths' in captured.err + # assert 'Found 3 filepaths' in captured.err assert 'index is done' in runtmp.last_result.err @@ -243,8 +254,8 @@ def test_index_zipfile_multiparam(runtmp, capfd): assert 'index is done' in runtmp.last_result.err captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 5 index paths - no compatible signatures.' in captured.err - assert 'Found 4 filepaths' in captured.err + # assert 'WARNING: skipped 5 index paths - no compatible signatures.' in captured.err + # assert 'Found 4 filepaths' in captured.err def test_index_zipfile_bad(runtmp, capfd): @@ -266,7 +277,8 @@ def test_index_zipfile_bad(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert "Couldn't find End Of Central Directory Record" in captured.err + # assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err def test_index_check(runtmp): From a6785dcc08d4bc7ddb8a46194edcfe7177311d4f Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 7 Dec 2023 13:21:04 -0800 Subject: [PATCH 09/40] use selection instead of template --- src/mastiff_manygather.rs | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 61a0d522..2fcec3cc 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -7,7 +7,7 @@ use sourmash::sketch::Sketch; use std::path::Path; // use sourmash::collection::Collection; -// use sourmash::selection::Selection;A +// use sourmash::selection::Selection; use sourmash::prelude::*; // use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; // use sourmash::manifest::Manifest; @@ -15,7 +15,7 @@ use sourmash::prelude::*; // use sourmash::signature::{Signature, SigsTrait}; // use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; -use sourmash::index::revindex::{RevIndex, RevIndexOps}; +use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -24,8 +24,8 @@ use std::fs::File; use std::io::{BufWriter, Write}; use crate::utils::{ - is_revindex_database, load_sigpaths_from_zip_or_pathlist, prepare_query, ReportType, -}; + is_revindex_database, load_sigpaths_from_zip_or_pathlist, ReportType, +}; // prepare_query pub fn mastiff_manygather>( queries_file: P, @@ -96,27 +96,33 @@ pub fn mastiff_manygather>( let mut results = vec![]; // load query signature from path: - match Signature::from_path(filename) { + // todo: add reason text to expect instead of using match arms? + // note: can't keep track of failed paths if we do that? 
+ match Signature::from_path(filename).expect("REASON").swap_remove(0).select(&selection) { Ok(query_sig) => { - let location = filename.display().to_string(); - // if let Some(q) = prepare_query(&query_sig, &selection) { - // query = Some(q); + eprintln!("query_sig selection scaled: {}", selection.scaled()?.to_string()); + let mut query = None; + // if let Some(q) = prepare_query(query_sig, &selection) { + // query = Some(q); // } // let query = query.expect("Couldn't find a compatible MinHash"); - if let Some(query) = prepare_query(&query_sig, &template, &location) { + if let Some(q) = prepare_query(query_sig.clone(), &selection) { + query = Some(q); + let query = query.expect("Couldn't find a compatible MinHash"); + //if let Some(query) = prepare_query(&query_sig, &template, &location) { // let query_size = query.minhash.size() as f64; - let threshold = threshold_bp / query.minhash.scaled() as usize; + let threshold = threshold_bp / query.scaled() as usize; // mastiff gather code let (counter, query_colors, hash_to_color) = - db.prepare_gather_counters(&query.minhash); + db.prepare_gather_counters(&query); let matches = db.gather( counter, query_colors, hash_to_color, threshold, - &query.minhash, + &query, Some(selection.clone()), ); @@ -124,8 +130,8 @@ pub fn mastiff_manygather>( if let Ok(matches) = matches { for match_ in &matches { results.push(( - query.name.clone(), - query.md5sum.clone(), + query_sig.name().clone(), + query.md5sum().clone(), match_.name().clone(), match_.md5().clone(), match_.f_match(), // f_match_query From 21f20caeb75e07a8e99591708a909809f3b5867a Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 22 Jan 2024 17:46:33 -0800 Subject: [PATCH 10/40] rustfmt --- src/mastiff_manygather.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 2fcec3cc..a4293066 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -23,9 +23,7 @@ use std::sync::atomic::AtomicUsize; use std::fs::File; use std::io::{BufWriter, Write}; -use crate::utils::{ - is_revindex_database, load_sigpaths_from_zip_or_pathlist, ReportType, -}; // prepare_query +use crate::utils::{is_revindex_database, load_sigpaths_from_zip_or_pathlist, ReportType}; // prepare_query pub fn mastiff_manygather>( queries_file: P, @@ -98,9 +96,16 @@ pub fn mastiff_manygather>( // load query signature from path: // todo: add reason text to expect instead of using match arms? // note: can't keep track of failed paths if we do that? 
- match Signature::from_path(filename).expect("REASON").swap_remove(0).select(&selection) { + match Signature::from_path(filename) + .expect("REASON") + .swap_remove(0) + .select(&selection) + { Ok(query_sig) => { - eprintln!("query_sig selection scaled: {}", selection.scaled()?.to_string()); + eprintln!( + "query_sig selection scaled: {}", + selection.scaled()?.to_string() + ); let mut query = None; // if let Some(q) = prepare_query(query_sig, &selection) { // query = Some(q); @@ -109,7 +114,7 @@ pub fn mastiff_manygather>( if let Some(q) = prepare_query(query_sig.clone(), &selection) { query = Some(q); let query = query.expect("Couldn't find a compatible MinHash"); - //if let Some(query) = prepare_query(&query_sig, &template, &location) { + //if let Some(query) = prepare_query(&query_sig, &template, &location) { // let query_size = query.minhash.size() as f64; let threshold = threshold_bp / query.scaled() as usize; From 45b598fbf363c2c9baa98f5a50ba9bb63068c30d Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 22 Jan 2024 18:36:38 -0800 Subject: [PATCH 11/40] fix query file no exist errs --- Cargo.toml | 4 +- src/mastiff_manygather.rs | 122 ++++++++++++++------------- src/python/tests/test_multigather.py | 3 +- 3 files changed, 66 insertions(+), 63 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c0d47a42..dde6aa5d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,9 +12,9 @@ crate-type = ["cdylib"] pyo3 = { version = "0.20.2", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.195", features = ["derive"] } -#sourmash = { git = "https://github.com/sourmash-bio/sourmash", "rev" = "b780164a2cc6db1cf66e58ca5ea55b83b563921e" } +sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch = "select-downsample", features = ["branchwater"] } #sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch = "lirber/mastiff", features = ["branchwater"] } -sourmash = { version = "0.12.0", features = ["branchwater"] } +#sourmash = { version = "0.12.0", features = ["branchwater"] } serde_json = "1.0.111" niffler = "2.4.0" log = "0.4.14" diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index a4293066..6ef89adc 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -2,6 +2,7 @@ use anyhow::Result; use rayon::prelude::*; +use sourmash::ffi::signature; use sourmash::signature::Signature; use sourmash::sketch::Sketch; use std::path::Path; @@ -94,71 +95,74 @@ pub fn mastiff_manygather>( let mut results = vec![]; // load query signature from path: - // todo: add reason text to expect instead of using match arms? - // note: can't keep track of failed paths if we do that? 
- match Signature::from_path(filename) - .expect("REASON") - .swap_remove(0) - .select(&selection) - { - Ok(query_sig) => { - eprintln!( - "query_sig selection scaled: {}", - selection.scaled()?.to_string() - ); - let mut query = None; - // if let Some(q) = prepare_query(query_sig, &selection) { - // query = Some(q); - // } - // let query = query.expect("Couldn't find a compatible MinHash"); - if let Some(q) = prepare_query(query_sig.clone(), &selection) { - query = Some(q); - let query = query.expect("Couldn't find a compatible MinHash"); - //if let Some(query) = prepare_query(&query_sig, &template, &location) { - // let query_size = query.minhash.size() as f64; - let threshold = threshold_bp / query.scaled() as usize; - - // mastiff gather code - let (counter, query_colors, hash_to_color) = - db.prepare_gather_counters(&query); - - let matches = db.gather( - counter, - query_colors, - hash_to_color, - threshold, - &query, - Some(selection.clone()), - ); - - // extract matches from Result - if let Ok(matches) = matches { - for match_ in &matches { - results.push(( - query_sig.name().clone(), - query.md5sum().clone(), - match_.name().clone(), - match_.md5().clone(), - match_.f_match(), // f_match_query - match_.intersect_bp(), - )); // intersect_bp + match Signature::from_path(filename) { + Ok(mut signature) => { + match signature.swap_remove(0).select(&selection) { + Ok(query_sig) => { + eprintln!( + "query_sig selection scaled: {}", + selection.scaled()?.to_string() + ); + let mut query = None; + + if let Some(q) = prepare_query(query_sig.clone(), &selection) { + query = Some(q); + let query = query.expect("Couldn't find a compatible MinHash"); + + let threshold = threshold_bp / query.scaled() as usize; + + // mastiff gather code + let (counter, query_colors, hash_to_color) = + db.prepare_gather_counters(&query); + + let matches = db.gather( + counter, + query_colors, + hash_to_color, + threshold, + &query, + Some(selection.clone()), + ); + + // extract matches from Result + if let Ok(matches) = matches { + for match_ in &matches { + results.push(( + query_sig.name().clone(), + query.md5sum().clone(), + match_.name().clone(), + match_.md5().clone(), + match_.f_match(), // f_match_query + match_.intersect_bp(), + )); // intersect_bp + } + } else { + eprintln!("Error gathering matches: {:?}", matches.err()); + } + } else { + if !queryfile_name.ends_with(".zip") { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + filename.display() + ); + } + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + } + if results.is_empty() { + None + } else { + Some(results) } - } else { - eprintln!("Error gathering matches: {:?}", matches.err()); } - } else { - if !queryfile_name.ends_with(".zip") { + Err(err) => { + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + eprintln!("Error in processing: {}", err); eprintln!( - "WARNING: no compatible sketches in path '{}'", + "WARNING: could not process item from path '{}'", filename.display() ); + None } - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); - } - if results.is_empty() { - None - } else { - Some(results) } } Err(err) => { diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index a00d2b62..646e9309 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -244,7 +244,7 @@ def test_bad_query_2(runtmp, capfd, indexed): @pytest.mark.parametrize('indexed', [False, True]) def test_missing_query(runtmp, capfd, indexed): - # test 
missingfile in querylist + # test missing query query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -263,7 +263,6 @@ def test_missing_query(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert "WARNING: could not load sketches from path 'no-exist'" in captured.err assert "WARNING: 1 query paths failed to load. See error messages above." From 34ed1bbe14a941cc02c6514b629d47318a8a44d1 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 23 Jan 2024 15:14:07 -0800 Subject: [PATCH 12/40] update mastiff_manygather --- Cargo.toml | 4 +- src/mastiff_manygather.rs | 115 +++++++++++++++++--------------------- 2 files changed, 54 insertions(+), 65 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index dde6aa5d..490e4c56 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,8 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.20.2", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.195", features = ["derive"] } -sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch = "select-downsample", features = ["branchwater"] } -#sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch = "lirber/mastiff", features = ["branchwater"] } +sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch="select-downsample", features = ["branchwater"] } +#sourmash = { git = "https://github.com/sourmash-bio/sourmash", rev= "94b88cc314f781342721addc5ed35c531732a9b6", features = ["branchwater"] } #sourmash = { version = "0.12.0", features = ["branchwater"] } serde_json = "1.0.111" niffler = "2.4.0" diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 6ef89adc..1e50add6 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -5,6 +5,7 @@ use rayon::prelude::*; use sourmash::ffi::signature; use sourmash::signature::Signature; use sourmash::sketch::Sketch; +use sourmash::sketch::minhash::KmerMinHash; use std::path::Path; // use sourmash::collection::Collection; @@ -84,36 +85,25 @@ pub fn mastiff_manygather>( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let send = query_paths - .par_iter() - .filter_map(|filename| { - let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); - if i % 1000 == 0 { - eprintln!("Processed {} search sigs", i); - } - - let mut results = vec![]; - - // load query signature from path: - match Signature::from_path(filename) { - Ok(mut signature) => { - match signature.swap_remove(0).select(&selection) { - Ok(query_sig) => { - eprintln!( - "query_sig selection scaled: {}", - selection.scaled()?.to_string() - ); - let mut query = None; - - if let Some(q) = prepare_query(query_sig.clone(), &selection) { - query = Some(q); - let query = query.expect("Couldn't find a compatible MinHash"); - let threshold = threshold_bp / query.scaled() as usize; - - // mastiff gather code - let (counter, query_colors, hash_to_color) = - db.prepare_gather_counters(&query); + let send = query_paths + .par_iter() + .filter_map(|filename| { + // ... existing setup code ... + let threshold = threshold_bp / selection.scaled()? 
as usize; + + match Signature::from_path(filename) { + Ok(mut signatures) if !signatures.is_empty() => { + match signatures.swap_remove(0).select(&selection) { + Ok(query_sig) => { + let mut results = vec![]; + let mut found_compatible_sketch = false; + for sketch in query_sig.iter() { + if let Sketch::MinHash(query) = sketch { + found_compatible_sketch = true; + // eprintln!("query-size: {}", sketch.size()); + // Gather! + let (counter, query_colors, hash_to_color) = db.prepare_gather_counters(&query); let matches = db.gather( counter, @@ -123,8 +113,7 @@ pub fn mastiff_manygather>( &query, Some(selection.clone()), ); - - // extract matches from Result + // extract results if let Ok(matches) = matches { for match_ in &matches { results.push(( @@ -139,45 +128,45 @@ pub fn mastiff_manygather>( } else { eprintln!("Error gathering matches: {:?}", matches.err()); } - } else { - if !queryfile_name.ends_with(".zip") { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - filename.display() - ); - } - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - if results.is_empty() { - None - } else { - Some(results) + } + if !found_compatible_sketch { + if !queryfile_name.ends_with(".zip") { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + filename.display() + ); } + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - Err(err) => { - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - eprintln!("Error in processing: {}", err); - eprintln!( - "WARNING: could not process item from path '{}'", - filename.display() - ); + + if results.is_empty() { None + } else { + Some(results) } } + Err(err) => { + eprintln!("Error selecting sketches: {}", err); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + None + } } - Err(err) => { - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - eprintln!("Sketch loading error: {}", err); - eprintln!( - "WARNING: could not load sketches from path '{}'", - filename.display() - ); - None - } } - }) - .flatten() - .try_for_each_with(send, |s, m| s.send(m)); + Ok(_) => { + eprintln!("No signatures found in '{}'", filename.display()); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + None + } + Err(err) => { + eprintln!("WARNING: could not load sketches from path '{}': {}", filename.display(), err); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + None + } + } + }) + .flatten() + .try_for_each_with(send, |s, m| s.send(m)); // do some cleanup and error handling - if let Err(e) = send { @@ -203,7 +192,7 @@ pub fn mastiff_manygather>( } if failed_paths > 0 { eprintln!( - "WARNING: {} signature paths failed to load. See error messages above.", + "WARNING: {} query paths failed to load. See error messages above.", failed_paths ); } From 243d1067178c37763029263cd16084ae9e569199 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 23 Jan 2024 15:16:10 -0800 Subject: [PATCH 13/40] rustfmt --- src/mastiff_manygather.rs | 148 +++++++++++++++++++------------------- 1 file changed, 76 insertions(+), 72 deletions(-) diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 1e50add6..b581d73e 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -4,8 +4,8 @@ use rayon::prelude::*; use sourmash::ffi::signature; use sourmash::signature::Signature; -use sourmash::sketch::Sketch; use sourmash::sketch::minhash::KmerMinHash; +use sourmash::sketch::Sketch; use std::path::Path; // use sourmash::collection::Collection; @@ -85,88 +85,92 @@ pub fn mastiff_manygather>( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let send = query_paths - .par_iter() - .filter_map(|filename| { - // ... existing setup code ... - let threshold = threshold_bp / selection.scaled()? as usize; - - match Signature::from_path(filename) { - Ok(mut signatures) if !signatures.is_empty() => { - match signatures.swap_remove(0).select(&selection) { - Ok(query_sig) => { - let mut results = vec![]; - let mut found_compatible_sketch = false; - for sketch in query_sig.iter() { - if let Sketch::MinHash(query) = sketch { - found_compatible_sketch = true; - // eprintln!("query-size: {}", sketch.size()); - // Gather! - let (counter, query_colors, hash_to_color) = db.prepare_gather_counters(&query); - - let matches = db.gather( - counter, - query_colors, - hash_to_color, - threshold, - &query, - Some(selection.clone()), - ); - // extract results - if let Ok(matches) = matches { - for match_ in &matches { - results.push(( - query_sig.name().clone(), - query.md5sum().clone(), - match_.name().clone(), - match_.md5().clone(), - match_.f_match(), // f_match_query - match_.intersect_bp(), - )); // intersect_bp + .par_iter() + .filter_map(|filename| { + // ... existing setup code ... + let threshold = threshold_bp / selection.scaled()? as usize; + + match Signature::from_path(filename) { + Ok(mut signatures) if !signatures.is_empty() => { + match signatures.swap_remove(0).select(&selection) { + Ok(query_sig) => { + let mut results = vec![]; + let mut found_compatible_sketch = false; + for sketch in query_sig.iter() { + if let Sketch::MinHash(query) = sketch { + found_compatible_sketch = true; + // eprintln!("query-size: {}", sketch.size()); + // Gather! 
+ let (counter, query_colors, hash_to_color) = + db.prepare_gather_counters(&query); + + let matches = db.gather( + counter, + query_colors, + hash_to_color, + threshold, + &query, + Some(selection.clone()), + ); + // extract results + if let Ok(matches) = matches { + for match_ in &matches { + results.push(( + query_sig.name().clone(), + query.md5sum().clone(), + match_.name().clone(), + match_.md5().clone(), + match_.f_match(), // f_match_query + match_.intersect_bp(), + )); // intersect_bp + } + } else { + eprintln!("Error gathering matches: {:?}", matches.err()); } - } else { - eprintln!("Error gathering matches: {:?}", matches.err()); } } - } - if !found_compatible_sketch { - if !queryfile_name.ends_with(".zip") { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - filename.display() - ); + if !found_compatible_sketch { + if !queryfile_name.ends_with(".zip") { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + filename.display() + ); + } + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); - } - if results.is_empty() { + if results.is_empty() { + None + } else { + Some(results) + } + } + Err(err) => { + eprintln!("Error selecting sketches: {}", err); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); None - } else { - Some(results) } } - Err(err) => { - eprintln!("Error selecting sketches: {}", err); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None - } + } + Ok(_) => { + eprintln!("No signatures found in '{}'", filename.display()); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + None + } + Err(err) => { + eprintln!( + "WARNING: could not load sketches from path '{}': {}", + filename.display(), + err + ); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + None } } - Ok(_) => { - eprintln!("No signatures found in '{}'", filename.display()); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None - } - Err(err) => { - eprintln!("WARNING: could not load sketches from path '{}': {}", filename.display(), err); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None - } - } - }) - .flatten() - .try_for_each_with(send, |s, m| s.send(m)); + }) + .flatten() + .try_for_each_with(send, |s, m| s.send(m)); // do some cleanup and error handling - if let Err(e) = send { From 87219aa60e06dc8f7f4919eabd10f91c198de17f Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 23 Jan 2024 15:18:20 -0800 Subject: [PATCH 14/40] add cargo lock --- Cargo.lock | 248 ++++++++++++++++++++++++----------------------------- 1 file changed, 113 insertions(+), 135 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c457c851..e593bb3c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,12 +2,6 @@ # It is not intended for manual editing. 
version = 3 -[[package]] -name = "Inflector" -version = "0.11.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3" - [[package]] name = "adler" version = "1.0.2" @@ -110,6 +104,12 @@ dependencies = [ "thiserror", ] +[[package]] +name = "binary-merge" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597bb81c80a54b6a4381b23faba8d7774b144c94cbd1d6fe3f1329bd776554ab" + [[package]] name = "bindgen" version = "0.65.1" @@ -222,6 +222,12 @@ version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +[[package]] +name = "bytes" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" + [[package]] name = "bzip2" version = "0.4.4" @@ -248,6 +254,9 @@ name = "camino" version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +dependencies = [ + "serde", +] [[package]] name = "cc" @@ -405,6 +414,18 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +[[package]] +name = "enum_dispatch" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e" +dependencies = [ + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "env_logger" version = "0.10.2" @@ -459,37 +480,12 @@ dependencies = [ "num-traits", ] -[[package]] -name = "flume" -version = "0.10.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1657b4441c3403d9f7b3409e47575237dac27b1b5726df654a6ecbf92f0f7577" -dependencies = [ - "futures-core", - "futures-sink", - "nanorand", - "pin-project", - "spin", -] - [[package]] name = "funty" version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" -[[package]] -name = "futures-core" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" - -[[package]] -name = "futures-sink" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" - [[package]] name = "getrandom" version = "0.2.10" @@ -544,9 +540,12 @@ checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" [[package]] name = "histogram" -version = "0.6.9" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12cb882ccb290b8646e554b157ab0b71e64e8d5bef775cd66b6531e52d302669" +checksum = "de0f59c8ab5f8d1f1dd481174172ce418e2e306d665cdd8057c0bd457c447159" +dependencies = [ + "thiserror", +] [[package]] name = "humantime" @@ -583,6 +582,15 @@ version = "2.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8" +[[package]] +name = "inplace-vec-builder" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cf64c2edc8226891a71f127587a2861b132d2b942310843814d5001d99a1d307" +dependencies = [ + "smallvec", +] + [[package]] name = "is-terminal" version = "0.4.9" @@ -594,6 +602,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "itertools" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25db6b064527c5d482d0423354fcd07a89a2dfe07b67892e62411946db7f07b0" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.9" @@ -730,9 +747,9 @@ checksum = "5486aed0026218e61b8a01d5fbd5a0a134649abb71a0e53b7bc088529dced86e" [[package]] name = "memmap2" -version = "0.5.10" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" +checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92" dependencies = [ "libc", ] @@ -767,15 +784,6 @@ version = "0.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2983372caf4480544083767bf2d27defafe32af49ab4df3a0b7fc90793a3664" -[[package]] -name = "nanorand" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3" -dependencies = [ - "getrandom", -] - [[package]] name = "needletail" version = "0.5.1" @@ -857,15 +865,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "numsep" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad5c49c3e12c314efb1f43cba136031b657dcd59ee26936ab2be313c5e97da22" -dependencies = [ - "slicestring", -] - [[package]] name = "once_cell" version = "1.18.0" @@ -874,25 +873,27 @@ checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "ouroboros" -version = "0.15.6" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1358bd1558bd2a083fed428ffeda486fbfb323e698cdda7794259d592ca72db" +checksum = "97b7be5a8a3462b752f4be3ff2b2bf2f7f1d00834902e46be2a4d68b87b0573c" dependencies = [ "aliasable", "ouroboros_macro", + "static_assertions", ] [[package]] name = "ouroboros_macro" -version = "0.15.6" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f7d21ccd03305a674437ee1248f3ab5d4b1db095cf1caf49f1713ddf61956b7" +checksum = "b645dcde5f119c2c454a92d0dfa271a2a3b205da92e4292a68ead4bdbfde1f33" dependencies = [ - "Inflector", - "proc-macro-error", + "heck", + "itertools", "proc-macro2", + "proc-macro2-diagnostics", "quote", - "syn 1.0.109", + "syn 2.0.48", ] [[package]] @@ -924,39 +925,20 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" -[[package]] -name = "pin-project" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fda4ed1c6c173e3fc7a83629421152e01d7b1f9b7f65fb301e490e8cfc656422" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.48", -] - [[package]] name = "piz" -version = "0.4.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"58c75d1c00e6d407e283cc66d9d4fd0985ef1703c761520845b93c4f981bfb65" +checksum = "898b071c1938a2c92b95c18708cbf38f2566a01f0ab9dd7bdf4329987e5c2e17" dependencies = [ + "camino", "chrono", "codepage-437", "crc32fast", "flate2", "log", + "memchr", "thiserror", - "twoway", ] [[package]] @@ -1053,6 +1035,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "proc-macro2-diagnostics" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", + "version_check", + "yansi", +] + [[package]] name = "ptr_meta" version = "0.1.4" @@ -1264,12 +1259,13 @@ checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" [[package]] name = "rkyv" -version = "0.7.42" +version = "0.7.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0200c8230b013893c0b2d6213d6ec64ed2b9be2e0e016682b7224ff82cff5c58" +checksum = "527a97cdfef66f65998b5f3b637c26f5a5ec09cc52a3f9932313ac645f4190f5" dependencies = [ "bitvec", "bytecheck", + "bytes", "hashbrown", "ptr_meta", "rend", @@ -1281,9 +1277,9 @@ dependencies = [ [[package]] name = "rkyv_derive" -version = "0.7.42" +version = "0.7.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2e06b915b5c230a17d7a736d1e2e63ee753c256a8614ef3f5147b13a4f5541d" +checksum = "b5c462a1328c8e67e4d6dbad1eb0355dd43e8ab432c6e227a43657f16ade5033" dependencies = [ "proc-macro2", "quote", @@ -1292,9 +1288,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.9.0" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd539cab4e32019956fe7e0cf160bb6d4802f4be2b52c4253d76d3bb0f85a5f7" +checksum = "6106b5cf8587f5834158895e9715a3c6c9716c8aefab57f1f7680917191c7873" dependencies = [ "bytemuck", "byteorder", @@ -1403,18 +1399,6 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8542b68b8800c3cda649d2c72d688b6907b30f1580043135d61669d4aad1c175" -[[package]] -name = "size" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fed904c7fb2856d868b92464fc8fa597fce366edea1a9cbfaa8cb5fe080bd6d" - -[[package]] -name = "slicestring" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "636b979c5672ac7c2a1120ca0a9a6074cd090dadfec42af6f8a5baea1223d180" - [[package]] name = "smallvec" version = "1.11.0" @@ -1430,15 +1414,18 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.12.0" -source = "git+https://github.com/sourmash-bio/sourmash?rev=ff1092f8f366339caa59d7203f623813228f4356#ff1092f8f366339caa59d7203f623813228f4356" +source = "git+https://github.com/sourmash-bio/sourmash?branch=select-downsample#efd1ee420dbf872462c3bc56defd023a6a6234e5" dependencies = [ "az", - "bytecount", "byteorder", + "camino", "cfg-if", + "chrono", "counter", + "csv", + "enum_dispatch", "fixedbitset", - "flume", + "getrandom", "getset", "histogram", "log", @@ -1448,7 +1435,6 @@ dependencies = [ "niffler", "nohash-hasher", "num-iter", - "numsep", "once_cell", "ouroboros", "piz", @@ -1459,7 +1445,6 @@ dependencies = [ "rocksdb", "serde", "serde_json", - "size", "thiserror", "twox-hash", "typed-builder", @@ -1492,15 +1477,6 @@ dependencies = [ "zip", ] -[[package]] -name = "spin" -version = "0.9.8" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -dependencies = [ - "lock_api", -] - [[package]] name = "static_assertions" version = "1.1.0" @@ -1615,16 +1591,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "twoway" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c57ffb460d7c24cd6eda43694110189030a3d1dfe418416d9468fd1c1d290b47" -dependencies = [ - "memchr", - "unchecked-index", -] - [[package]] name = "twox-hash" version = "1.6.3" @@ -1638,20 +1604,23 @@ dependencies = [ [[package]] name = "typed-builder" -version = "0.10.0" +version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89851716b67b937e393b3daa8423e67ddfc4bbbf1654bcf05488e95e0828db0c" +checksum = "444d8748011b93cb168770e8092458cb0f8854f931ff82fdf6ddfbd72a9c933e" dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", + "typed-builder-macro", ] [[package]] -name = "unchecked-index" -version = "0.2.2" +name = "typed-builder-macro" +version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eeba86d422ce181a719445e51872fa30f1f7413b62becb52e95ec91aa262d85c" +checksum = "563b3b88238ec95680aef36bdece66896eaa7ce3c0f1b4f39d38fb2435261352" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] [[package]] name = "unicode-ident" @@ -1679,10 +1648,13 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vec-collections" -version = "0.3.6" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f2390c4dc8ae8640c57d067b1a3d40bc05c124cc6bc7394d761b53435d41b76" +checksum = "3c9965c8f2ffed1dbcd16cafe18a009642f540fa22661c6cfd6309ddb02e4982" dependencies = [ + "binary-merge", + "inplace-vec-builder", + "lazy_static", "num-traits", "serde", "smallvec", @@ -1972,6 +1944,12 @@ dependencies = [ "lzma-sys", ] +[[package]] +name = "yansi" +version = "1.0.0-rc.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1367295b8f788d371ce2dbc842c7b709c73ee1364d30351dd300ec2203b12377" + [[package]] name = "zip" version = "0.6.6" From 2fcf684d89f109b6763697d7f6556f36d2cce695 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 23 Jan 2024 20:47:53 -0800 Subject: [PATCH 15/40] switch to commit in latest br --- Cargo.lock | 298 +++++++++++++++++++++++------------------------------ Cargo.toml | 3 +- 2 files changed, 132 insertions(+), 169 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e593bb3c..a9e84652 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10,9 +10,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.7.6" +version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +checksum = "5a824f2aa7e75a0c98c5a504fceb80649e9c35265d44525b5f94de4771a395cd" dependencies = [ "getrandom", "once_cell", @@ -21,9 +21,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.0.5" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" dependencies = [ "memchr", ] @@ -51,9 +51,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.2" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15c4c2c83f81532e5845a733998b6971faca23490340a418e9b72a3ec9de12ea" +checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" [[package]] name = "anyhow" @@ -139,9 +139,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.0" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" [[package]] name = "bitvec" @@ -157,9 +157,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.6.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" +checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" dependencies = [ "memchr", "regex-automata", @@ -178,9 +178,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.13.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" [[package]] name = "bytecheck" @@ -206,21 +206,21 @@ dependencies = [ [[package]] name = "bytecount" -version = "0.6.3" +version = "0.6.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" +checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" [[package]] name = "bytemuck" -version = "1.13.1" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17febce684fd15d89027105661fec94afb475cb995fbc59d2865198446ba2eea" +checksum = "374d28ec25809ee0e23827c2ab573d729e293f281dfe393500e7ad618baa61c6" [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = 
"1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" @@ -285,24 +285,23 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.28" +version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ed24df0632f708f5f6d8082675bef2596f7084dee3dd55f632290bf35bfe0f" +checksum = "41daef31d7a747c5c847246f36de49ced6f7403b4cdabc807a97b5cc184cda7a" dependencies = [ "android-tzdata", "iana-time-zone", "js-sys", "num-traits", - "time", "wasm-bindgen", - "windows-targets 0.48.5", + "windows-targets 0.52.0", ] [[package]] name = "clang-sys" -version = "1.6.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" +checksum = "67523a3b4be3ce1989d607a828d036249522dd9c1c8de7f4dd2dae43a37369d1" dependencies = [ "glob", "libc", @@ -320,9 +319,9 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" [[package]] name = "counter" @@ -344,36 +343,28 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.15" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset", - "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.16" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" -dependencies = [ - "cfg-if", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "csv" @@ -451,9 +442,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "fixedbitset" @@ -463,9 +454,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flate2" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6c98ee8095e9d1dcbf2fcc6d95acccb90d1c81db1e44725c6a984b1dbdfb010" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" dependencies = [ "crc32fast", "miniz_oxide", @@ -488,14 +479,14 @@ checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "js-sys", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "wasm-bindgen", ] @@ -534,15 +525,15 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" -version = "0.3.2" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" +checksum = "5d3d0e0f38255e7fa3cf31335b3a56f05febd18025f4db5ef7a0cfb4f8da651f" [[package]] name = "histogram" -version = "0.8.4" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de0f59c8ab5f8d1f1dd481174172ce418e2e306d665cdd8057c0bd457c447159" +checksum = "e5ee9487899388cf1a1155759c39e3c156c5d198b6da1734053954a6e40e6d4d" dependencies = [ "thiserror", ] @@ -555,16 +546,16 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "iana-time-zone" -version = "0.1.57" +version = "0.1.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" +checksum = "b6a67363e2aa4443928ce15e57ebae94fd8949958fd1223c4cfc0cd473ad7539" dependencies = [ "android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows", + "windows-core", ] [[package]] @@ -593,13 +584,13 @@ dependencies = [ [[package]] name = "is-terminal" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" +checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455" dependencies = [ "hermit-abi", "rustix", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -613,24 +604,24 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.9" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] name = "jobserver" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" dependencies = [ "libc", ] [[package]] name = "js-sys" -version = "0.3.66" +version = "0.3.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cee9c64da59eae3b50095c18d3e74f8b73c0b86d2792824ff01bbce68ba229ca" +checksum = "9a1d36f1235bc969acba30b7f5990b864423a6068a10f7c90ae8f0112e3a59d1" dependencies = [ "wasm-bindgen", ] @@ -649,18 +640,18 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.151" +version = "0.2.152" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" [[package]] name = "libloading" -version = "0.7.4" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" 
+checksum = "c571b676ddfc9a8c12f1f3d3085a7b163966a8fd8098a90640953ce5f6170161" dependencies = [ "cfg-if", - "winapi", + "windows-sys 0.48.0", ] [[package]] @@ -681,9 +672,9 @@ dependencies = [ [[package]] name = "libz-sys" -version = "1.1.12" +version = "1.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d97137b25e321a73eef1418d1d5d2eda4d77e12813f8e6dead84bc52c5870a7b" +checksum = "295c17e837573c8c821dbaeb3cceb3d745ad082f7572191409e69cbc1b3fd050" dependencies = [ "cc", "pkg-config", @@ -692,15 +683,15 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "lock_api" -version = "0.4.10" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" dependencies = [ "autocfg", "scopeguard", @@ -741,9 +732,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "memchr" -version = "2.6.2" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5486aed0026218e61b8a01d5fbd5a0a134649abb71a0e53b7bc088529dced86e" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "memmap2" @@ -858,18 +849,18 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" dependencies = [ "autocfg", ] [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "ouroboros" @@ -908,13 +899,13 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.8" +version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.3.5", + "redox_syscall", "smallvec", "windows-targets 0.48.5", ] @@ -943,9 +934,9 @@ dependencies = [ [[package]] name = "pkg-config" -version = "0.3.27" +version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" [[package]] name = "ppv-lite86" @@ -985,9 +976,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.12" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c64d9ba0963cdcea2e1b2230fbae2bab30eb25a174be395c41e764bfb65dd62" +checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" dependencies = [ "proc-macro2", "syn 2.0.48", @@ -1028,9 +1019,9 @@ dependencies = [ [[package]] 
name = "proc-macro2" -version = "1.0.76" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -1195,15 +1186,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_syscall" version = "0.4.1" @@ -1215,9 +1197,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.4" +version = "1.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12de2eff854e5fa4b1295edd650e227e9d8fb0c9e90b12e7f36d6a6811791a29" +checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" dependencies = [ "aho-corasick", "memchr", @@ -1227,9 +1209,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.3.7" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49530408a136e16e5b486e883fbb6ba058e8e4e8ae6621a77b048b314336e629" +checksum = "3b7fa1134405e2ec9353fd416b17f8dacd46c473d7d3fd1cf202706a14eb792a" dependencies = [ "aho-corasick", "memchr", @@ -1238,15 +1220,15 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.7.5" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] name = "rend" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "581008d2099240d37fb08d77ad713bcaec2c4d89d50b5b21a8bb1996bbab68ab" +checksum = "a2571463863a6bd50c32f94402933f03457a3fbaf697a707c5be741e459f08fd" dependencies = [ "bytecheck", ] @@ -1315,11 +1297,11 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustix" -version = "0.38.28" +version = "0.38.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +checksum = "322394588aaf33c24007e8bb3238ee3e4c5c09c084ab32bc73890b99ff326bca" dependencies = [ - "bitflags 2.4.0", + "bitflags 2.4.2", "errno", "libc", "linux-raw-sys", @@ -1328,9 +1310,9 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.15" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" +checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" [[package]] name = "safemem" @@ -1383,9 +1365,9 @@ dependencies = [ [[package]] name = "shlex" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simdutf8" @@ -1401,9 +1383,9 @@ checksum = "8542b68b8800c3cda649d2c72d688b6907b30f1580043135d61669d4aad1c175" [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "sorted-iter" @@ -1414,7 +1396,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.12.0" -source = "git+https://github.com/sourmash-bio/sourmash?branch=select-downsample#efd1ee420dbf872462c3bc56defd023a6a6234e5" +source = "git+https://github.com/sourmash-bio/sourmash?rev=94b88cc314f781342721addc5ed35c531732a9b6#94b88cc314f781342721addc5ed35c531732a9b6" dependencies = [ "az", "byteorder", @@ -1513,9 +1495,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "target-lexicon" -version = "0.12.11" +version = "0.12.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d0e916b1148c8e263850e1ebcbd046f333e0683c724876bb0da63ea4373dc8a" +checksum = "69758bda2e78f098e4ccb393021a0963bb3442eac05f135c30f61b7370bbafae" [[package]] name = "tempfile" @@ -1525,16 +1507,16 @@ checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.4.1", + "redox_syscall", "rustix", "windows-sys 0.52.0", ] [[package]] name = "termcolor" -version = "1.2.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" dependencies = [ "winapi-util", ] @@ -1547,35 +1529,24 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" [[package]] name = "thiserror" -version = "1.0.47" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f" +checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.47" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b" +checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" dependencies = [ "proc-macro2", "quote", "syn 2.0.48", ] -[[package]] -name = "time" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi", -] - [[package]] name = "tinyvec" version = "1.6.0" @@ -1624,9 +1595,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unindent" @@ -1636,9 +1607,9 @@ checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" [[package]] name = "uuid" -version = "1.4.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79daa5ed5740825c40b389c5e50312b9c86df53fccd33f281df655642b43869d" +checksum = "f00cc9702ca12d3c81455259621e676d0f7251cec66a21e98fe2e9a37db93b2a" [[package]] name = "vcpkg" @@ -1676,12 +1647,6 @@ dependencies = [ "libc", ] 
-[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -1690,9 +1655,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.89" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ed0d4f68a3015cc185aff4db9506a015f4b96f95303897bfa23f846db54064e" +checksum = "b1223296a201415c7fad14792dbefaace9bd52b62d33453ade1c5b5f07555406" dependencies = [ "cfg-if", "serde", @@ -1702,9 +1667,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.89" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b56f625e64f3a1084ded111c4d5f477df9f8c92df113852fa5a374dbda78826" +checksum = "fcdc935b63408d58a32f8cc9738a0bffd8f05cc7c002086c6ef20b7312ad9dcd" dependencies = [ "bumpalo", "log", @@ -1717,9 +1682,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.89" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0162dbf37223cd2afce98f3d0785506dcb8d266223983e4b5b525859e6e182b2" +checksum = "3e4c238561b2d428924c49815533a8b9121c664599558a5d9ec51f8a1740a999" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1727,9 +1692,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.89" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" +checksum = "bae1abb6806dc1ad9e560ed242107c0f6c84335f1749dd4e8ddb012ebd5e25a7" dependencies = [ "proc-macro2", "quote", @@ -1740,15 +1705,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.89" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ab9b36309365056cd639da3134bf87fa8f3d86008abf99e612384a6eecd459f" +checksum = "4d91413b1c31d7539ba5ef2451af3f0b833a005eb27a631cec32bc0635a8602b" [[package]] name = "web-sys" -version = "0.3.66" +version = "0.3.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50c24a44ec86bb68fbecd1b3efed7e85ea5621b39b35ef2766b66cd984f8010f" +checksum = "58cd2333b6e0be7a39605f0e255892fd7418a682d8da8fe042fe25128794d2ed" dependencies = [ "js-sys", "wasm-bindgen", @@ -1772,9 +1737,9 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" dependencies = [ "winapi", ] @@ -1786,12 +1751,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] -name = "windows" -version = "0.48.0" +name = "windows-core" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.48.5", + "windows-targets 0.52.0", ] 
[[package]] @@ -1983,11 +1948,10 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.8+zstd.1.5.5" +version = "2.0.9+zstd.1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" +checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" dependencies = [ "cc", - "libc", "pkg-config", ] diff --git a/Cargo.toml b/Cargo.toml index b0986983..21b3976e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,8 +12,7 @@ crate-type = ["cdylib"] pyo3 = { version = "0.20.2", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.195", features = ["derive"] } -sourmash = { git = "https://github.com/sourmash-bio/sourmash", branch="select-downsample", features = ["branchwater"] } -#sourmash = { git = "https://github.com/sourmash-bio/sourmash", rev= "94b88cc314f781342721addc5ed35c531732a9b6", features = ["branchwater"] } +sourmash = { git = "https://github.com/sourmash-bio/sourmash", rev= "94b88cc314f781342721addc5ed35c531732a9b6", features = ["branchwater"] } #sourmash = { version = "0.12.0", features = ["branchwater"] } serde_json = "1.0.111" niffler = "2.4.0" From 86d6c1645bc1928bc808bf4b57820fb57667e3c2 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 23 Jan 2024 20:59:47 -0800 Subject: [PATCH 16/40] cleanup unused imports and code --- src/index.rs | 20 +------------------- src/mastiff_manygather.rs | 15 ++------------- src/mastiff_manysearch.rs | 12 +----------- 3 files changed, 4 insertions(+), 43 deletions(-) diff --git a/src/index.rs b/src/index.rs index 6768d058..23675614 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,20 +1,15 @@ -//use sourmash::index::revindex::RevIndex; +use camino::Utf8PathBuf as PathBuf; use sourmash::collection::Collection; use sourmash::index::revindex::RevIndex; -// use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; use sourmash::manifest::Manifest; use sourmash::prelude::*; -// use sourmash::signature::{Signature, SigsTrait}; use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; -// use sourmash::sketch::Sketch; -use camino::Utf8PathBuf as PathBuf; use std::path::Path; use crate::utils::load_sketchlist_filenames; pub fn index>( siglist: PathBuf, - // template: Sketch, manifest: Option

, selection: Selection, output: P, @@ -23,19 +18,6 @@ pub fn index>( ) -> Result<(), Box> { println!("Loading siglist"); - // let (index_sigs, _temp_dir) = load_sigpaths_from_zip_or_pathlist(&siglist)?; - - // // if index_sigs pathlist is empty, bail - // if index_sigs.is_empty() { - // bail!("No signatures to index loaded, exiting."); - // } - - // // Create or open the RevIndex database with the provided output path and colors flag - // let db = RevIndex::create(output.as_ref(), colors); - - // // Index the signatures using the loaded template, threshold, and save_paths option - // db.index(index_sigs, &template, 0.0, save_paths); - let manifest = if let Some(m) = manifest { let rdr = std::fs::OpenOptions::new().read(true).open(m.as_ref())?; Some(Manifest::from_reader(rdr)?) diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index b581d73e..a8f78bd7 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -2,22 +2,13 @@ use anyhow::Result; use rayon::prelude::*; -use sourmash::ffi::signature; use sourmash::signature::Signature; -use sourmash::sketch::minhash::KmerMinHash; use sourmash::sketch::Sketch; use std::path::Path; -// use sourmash::collection::Collection; -// use sourmash::selection::Selection; use sourmash::prelude::*; -// use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; -// use sourmash::manifest::Manifest; -// use sourmash::prelude::*; -// use sourmash::signature::{Signature, SigsTrait}; -// use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; -use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; +use sourmash::index::revindex::{RevIndex, RevIndexOps}; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -25,7 +16,7 @@ use std::sync::atomic::AtomicUsize; use std::fs::File; use std::io::{BufWriter, Write}; -use crate::utils::{is_revindex_database, load_sigpaths_from_zip_or_pathlist, ReportType}; // prepare_query +use crate::utils::{is_revindex_database, load_sigpaths_from_zip_or_pathlist, ReportType}; pub fn mastiff_manygather>( queries_file: P, @@ -88,7 +79,6 @@ pub fn mastiff_manygather>( let send = query_paths .par_iter() .filter_map(|filename| { - // ... existing setup code ... let threshold = threshold_bp / selection.scaled()? as usize; match Signature::from_path(filename) { @@ -100,7 +90,6 @@ pub fn mastiff_manygather>( for sketch in query_sig.iter() { if let Sketch::MinHash(query) = sketch { found_compatible_sketch = true; - // eprintln!("query-size: {}", sketch.size()); // Gather! let (counter, query_colors, hash_to_color) = db.prepare_gather_counters(&query); diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 654c1c17..40065d62 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -1,20 +1,10 @@ /// mastiff_manysearch: mastiff-indexed version of manysearch. 
use anyhow::Result; use rayon::prelude::*; - +use sourmash::index::revindex::{RevIndex, RevIndexOps}; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::Sketch; use std::path::Path; - -// use sourmash::collection::Collection; -// use sourmash::index::revindex::{prepare_query, RevIndex, RevIndexOps}; -// use sourmash::manifest::Manifest; -// use sourmash::prelude::*; -// use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; - -// use sourmash::index::revindex::RevIndex; -use sourmash::index::revindex::{RevIndex, RevIndexOps}; - use std::sync::atomic; use std::sync::atomic::AtomicUsize; From 13940cd6a466e37fd6d13b2e0747b03b3048b9f7 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 24 Jan 2024 18:26:09 -0800 Subject: [PATCH 17/40] init use collection for query loading --- src/lib.rs | 2 - src/mastiff_manygather.rs | 147 +++++++++++++++++--------------------- src/utils.rs | 51 ++++++++++++- 3 files changed, 112 insertions(+), 88 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index a05d1094..1d6a227d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -122,7 +122,6 @@ fn do_fastmultigather( match mastiff_manygather::mastiff_manygather( query_filenames, siglist_path, - template, selection, threshold_bp, output_path, @@ -192,7 +191,6 @@ fn do_index( // match index::index(siglist, template, output, save_paths, colors) { // convert siglist to PathBuf // build template from ksize, scaled - let template = build_template(ksize, scaled, &moltype); let location = camino::Utf8PathBuf::from(siglist); let manifest = None; match index::index(location, manifest, selection, output, save_paths, colors) { diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index a8f78bd7..e50eefc3 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -6,6 +6,9 @@ use sourmash::signature::Signature; use sourmash::sketch::Sketch; use std::path::Path; +// use camino::Utf8Path as Path; +// use camino::Utf8PathBuf as PathBuf; + use sourmash::prelude::*; use sourmash::index::revindex::{RevIndex, RevIndexOps}; @@ -16,12 +19,11 @@ use std::sync::atomic::AtomicUsize; use std::fs::File; use std::io::{BufWriter, Write}; -use crate::utils::{is_revindex_database, load_sigpaths_from_zip_or_pathlist, ReportType}; +use crate::utils::{is_revindex_database, load_collection}; //, ReportType}; pub fn mastiff_manygather>( - queries_file: P, + queries_file: String, index: P, - template: Sketch, selection: Selection, threshold_bp: usize, output: Option

, @@ -36,10 +38,7 @@ pub fn mastiff_manygather>( let db = RevIndex::open(index.as_ref(), true)?; println!("Loaded DB"); - // Load query paths - let queryfile_name = queries_file.as_ref().to_string_lossy().to_string(); - let (query_paths, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(&queries_file, &template, ReportType::Query)?; + let query_collection = load_collection(camino::Utf8PathBuf::from(queries_file), &selection)?; // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); @@ -76,90 +75,72 @@ pub fn mastiff_manygather>( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let send = query_paths - .par_iter() - .filter_map(|filename| { - let threshold = threshold_bp / selection.scaled()? as usize; - - match Signature::from_path(filename) { - Ok(mut signatures) if !signatures.is_empty() => { - match signatures.swap_remove(0).select(&selection) { - Ok(query_sig) => { - let mut results = vec![]; - let mut found_compatible_sketch = false; - for sketch in query_sig.iter() { - if let Sketch::MinHash(query) = sketch { - found_compatible_sketch = true; - // Gather! - let (counter, query_colors, hash_to_color) = - db.prepare_gather_counters(&query); - - let matches = db.gather( - counter, - query_colors, - hash_to_color, - threshold, - &query, - Some(selection.clone()), - ); - // extract results - if let Ok(matches) = matches { - for match_ in &matches { - results.push(( - query_sig.name().clone(), - query.md5sum().clone(), - match_.name().clone(), - match_.md5().clone(), - match_.f_match(), // f_match_query - match_.intersect_bp(), - )); // intersect_bp - } - } else { - eprintln!("Error gathering matches: {:?}", matches.err()); - } - } - } - if !found_compatible_sketch { - if !queryfile_name.ends_with(".zip") { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - filename.display() - ); - } - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + let send = query_collection + .par_iter() + .filter_map(|(idx, record)| { + let threshold = threshold_bp / selection.scaled()? as usize; + + match query_collection.sig_for_dataset(idx) { + Ok(query_sig) => { + let mut results = vec![]; + let mut found_compatible_sketch = false; + for sketch in query_sig.iter() { + if let Sketch::MinHash(query) = sketch { + found_compatible_sketch = true; + // Gather! 
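// NOTE (editor's hedged sketch, not part of this patch): `query_collection` above comes from
// the new `load_collection` helper that this same patch adds to src/utils.rs (its hunk appears
// later in this patch). Its core is: build a sourmash Collection from either a .zip archive or
// a newline-delimited list of signature paths, then restrict it to the requested selection:
let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") {
    // zip archives of signatures are loaded directly
    Collection::from_zipfile(&sigpath)?
} else {
    // otherwise treat the input as a "fromfile" list of signature paths
    let sig_paths = load_sketchlist_filenames_camino(&sigpath)?;
    Collection::from_paths(&sig_paths)?
};
let query_collection = collection.select(&selection)?;
// (end of editor's sketch)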
+ let (counter, query_colors, hash_to_color) = + db.prepare_gather_counters(&query); + + let matches = db.gather( + counter, + query_colors, + hash_to_color, + threshold, + &query, + Some(selection.clone()), + ); + // extract results + if let Ok(matches) = matches { + for match_ in &matches { + results.push(( + query_sig.name().clone(), + query.md5sum().clone(), + match_.name().clone(), + match_.md5().clone(), + match_.f_match(), // f_match_query + match_.intersect_bp(), + )); // intersect_bp } - - if results.is_empty() { - None - } else { - Some(results) - } - } - Err(err) => { - eprintln!("Error selecting sketches: {}", err); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None + } else { + eprintln!("Error gathering matches: {:?}", matches.err()); } } } - Ok(_) => { - eprintln!("No signatures found in '{}'", filename.display()); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None + if !found_compatible_sketch { + // if !queryfile_name.ends_with(".zip") { + // eprintln!( + // "WARNING: no compatible sketches in path '{}'", + // filename.display() + // ); + // } + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - Err(err) => { - eprintln!( - "WARNING: could not load sketches from path '{}': {}", - filename.display(), - err - ); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + + if results.is_empty() { None + } else { + Some(results) } } - }) - .flatten() - .try_for_each_with(send, |s, m| s.send(m)); + Err(err) => { + eprintln!("Error loading sketch: {}", err); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + None + } + } + }) + .flatten() + .try_for_each_with(send, |s, m| s.send(m)); // do some cleanup and error handling - if let Err(e) = send { diff --git a/src/utils.rs b/src/utils.rs index eefbaa14..64472ae2 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,6 +1,8 @@ /// Utility functions for sourmash_plugin_branchwater. use rayon::prelude::*; use sourmash::encodings::HashFunctions; +use sourmash::manifest::Manifest; +use sourmash::selection::Select; use std::fs::File; use std::io::Read; @@ -19,13 +21,13 @@ use anyhow::{anyhow, Result}; use std::cmp::{Ordering, PartialOrd}; -// use sourmash::prelude::FracMinHashOps; -// use sourmash::prelude::HashOps; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; use sourmash::sketch::Sketch; +use sourmash::collection::Collection; +use sourmash::selection::Selection; + -// use tempfile::tempdir; /// Track a name/minhash. pub struct SmallSignature { @@ -225,6 +227,31 @@ pub fn load_sketchlist_filenames>(sketchlist_filename: &P) -> Res Ok(sketchlist_filenames) } +pub fn load_sketchlist_filenames_camino>(sketchlist_filename: &P) -> Result> { + let sketchlist_file = BufReader::new(File::open(sketchlist_filename)?); + + let mut sketchlist_filenames: Vec = Vec::new(); + for line in sketchlist_file.lines() { + let line = match line { + Ok(v) => v, + Err(_) => { + return { + let filename = sketchlist_filename.as_ref().display(); + let msg = format!("invalid line in fromfile '{}'", filename); + Err(anyhow!(msg)) + } + } + }; + + if !line.is_empty() { + let path = camino::Utf8PathBuf::from(line); + sketchlist_filenames.push(path); + } + } + Ok(sketchlist_filenames) +} + + /// Loads signature file paths from a ZIP archive. 
/// /// This function extracts the contents of a ZIP archive containing @@ -649,6 +676,7 @@ pub fn load_sketches_from_zip_or_pathlist>( .map(|ext| ext == "zip") .unwrap_or(false) { + load_sketches_from_zip(sketchlist_path, template)? } else { let sketch_paths = load_sketchlist_filenames(&sketchlist_path)?; @@ -660,6 +688,23 @@ pub fn load_sketches_from_zip_or_pathlist>( Ok(sketchlist) } +pub fn load_collection( + sigpath: camino::Utf8PathBuf, + selection: &Selection, +) -> Result { + let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { + Collection::from_zipfile(&sigpath)? + } else { + let sig_paths: Vec<_> = load_sketchlist_filenames_camino(&sigpath) + .unwrap_or_else(|_| panic!("Error loading siglist")) + .into_iter() + .collect(); + Collection::from_paths(&sig_paths)? + }; + // return collection records that match selection + Ok(collection.select(&selection)?) +} + /// Uses the output of sketch loading functions to report the /// total number of sketches loaded, as well as the number of files, /// if any, that failed to load or contained no compatible sketches. From cd8be99889c41489bcdc6104380033d1a6f95de1 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 25 Jan 2024 18:28:54 -0800 Subject: [PATCH 18/40] ...collection loading in progress --- src/check.rs | 10 +- src/fastgather.rs | 215 ++++++++++++++++++-------- src/fastmultigather.rs | 152 +++++++++--------- src/lib.rs | 83 +++++----- src/mastiff_manygather.rs | 25 ++- src/mastiff_manysearch.rs | 87 ++++++----- src/python/tests/test_multigather.py | 8 +- src/utils.rs | 222 +++++++++++++++++++-------- 8 files changed, 482 insertions(+), 320 deletions(-) diff --git a/src/check.rs b/src/check.rs index 7fea2eca..1311318c 100644 --- a/src/check.rs +++ b/src/check.rs @@ -1,19 +1,17 @@ -use std::path::Path; - use crate::utils::is_revindex_database; use sourmash::index::revindex::{RevIndex, RevIndexOps}; -pub fn check>(index: P, quick: bool) -> Result<(), Box> { - if !is_revindex_database(index.as_ref()) { +pub fn check(index: camino::Utf8PathBuf, quick: bool) -> Result<(), Box> { + if !is_revindex_database(&index) { bail!( "'{}' is not a valid RevIndex database", - index.as_ref().display() + index ); } println!("Opening DB"); - let db = RevIndex::open(index.as_ref(), true)?; + let db = RevIndex::open(index, true)?; println!("Starting check"); db.check(quick); diff --git a/src/fastgather.rs b/src/fastgather.rs index 963a6232..2fef7522 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -1,50 +1,39 @@ /// fastgather: Run gather with a query against a list of files. use anyhow::Result; -use sourmash::signature::Signature; use sourmash::sketch::Sketch; -use std::path::Path; +use sourmash::signature::Signature; +use sourmash::selection::Selection; +use camino; +use std::collections::BinaryHeap; +use crate::utils::PrefetchResult; use crate::utils::{ - consume_query_by_gather, load_sigpaths_from_zip_or_pathlist, load_sketches_above_threshold, - prepare_query, write_prefetch, ReportType, + consume_query_by_gather, load_sketches_above_threshold, write_prefetch, ReportType, load_collection }; -pub fn fastgather + std::fmt::Debug + std::fmt::Display + Clone>( - query_filename: P, - matchlist_filename: P, +pub fn fastgather( + query_filepath: camino::Utf8PathBuf, + against_filepath: camino::Utf8PathBuf, threshold_bp: usize, ksize: u8, scaled: usize, - template: Sketch, - gather_output: Option
<P>
, - prefetch_output: Option
<P>
, + selection: &Selection, + gather_output: Option, + prefetch_output: Option, ) -> Result<()> { - let location = query_filename.to_string(); - eprintln!("Loading query from '{}'", location); - let query = { - let sigs = Signature::from_path(query_filename)?; - - prepare_query(&sigs, &template, &location) - }; - // did we find anything matching the desired template? - let query = match query { - Some(query) => query, - None => bail!("No sketch found with scaled={}, k={}", scaled, ksize), - }; - - // build the list of paths to match against. - eprintln!( - "Loading matchlist from '{}'", - matchlist_filename.as_ref().display() - ); - let matchlist_filename = matchlist_filename.as_ref().to_string_lossy().to_string(); - let (matchlist_paths, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(matchlist_filename, &template, ReportType::Against)?; + let query_collection = load_collection(&query_filepath, selection, ReportType::Query)?; - eprintln!("Loaded {} sig paths in matchlist", matchlist_paths.len()); + if query_collection.len() > 1 { + bail!("Found more than one compatible sketch from '{}'. Fastgather requires a single query sketch.", &query_filepath) + } + // build the list of paths to match against. + eprintln!("Loading matchlist from '{}'", against_filepath); + let against_collection = load_collection(&against_filepath, selection, ReportType::Against)?; + eprintln!("Loaded {} sig paths in matchlist", against_collection.len()); + // calculate the minimum number of hashes based on desired threshold let threshold_hashes: u64 = { let x = threshold_bp / scaled; @@ -60,41 +49,131 @@ pub fn fastgather + std::fmt::Debug + std::fmt::Display + Clone>( "using threshold overlap: {} {}", threshold_hashes, threshold_bp ); - - // load a set of sketches, filtering for those with overlaps > threshold - let result = load_sketches_above_threshold( - matchlist_paths, - &template, - &query.minhash, - threshold_hashes, - )?; - let matchlist = result.0; - let skipped_paths = result.1; - let failed_paths = result.2; - - if skipped_paths > 0 { - eprintln!( - "WARNING: skipped {} search paths - no compatible signatures.", - skipped_paths - ); - } - if failed_paths > 0 { - eprintln!( - "WARNING: {} search paths failed to load. See error messages above.", - failed_paths - ); - } - - if matchlist.is_empty() { - eprintln!("No search signatures loaded, exiting."); - return Ok(()); - } - - if prefetch_output.is_some() { - write_prefetch(&query, prefetch_output, &matchlist).ok(); - } - - // run the gather! 
- consume_query_by_gather(query, matchlist, threshold_hashes, gather_output).ok(); + query_collection.iter().for_each(|(idx, record)| { + // Load query sig + match query_collection.sig_for_dataset(idx) { + Ok(query_sig) => { + let location = query_sig.filename(); + let mut matchlist: BinaryHeap = BinaryHeap::new(); + let mut skipped_paths = 0; + let mut failed_paths = 0; + + for sketch in query_sig.iter() { + // Access query MinHash + if let Sketch::MinHash(query) = sketch { + let result = load_sketches_above_threshold( + against_collection, + &selection, + &query, + threshold_hashes, + ); + + match result { + Ok((loaded_matchlist, skipped, failed)) => { + matchlist.extend(loaded_matchlist); + skipped_paths += skipped; + failed_paths += failed; + } + Err(err) => { + eprintln!("Error loading sketches: {:?}", err); + failed_paths += 1; + } + } + } + } + + if skipped_paths > 0 { + eprintln!( + "WARNING: Skipped {} search paths - no compatible signatures.", + skipped_paths + ); + } + if failed_paths > 0 { + eprintln!( + "WARNING: {} search paths failed to load. See error messages above.", + failed_paths + ); + } + + if matchlist.is_empty() { + eprintln!("No search signatures loaded for '{}', exiting.", location); + return; // Return early if no search signatures loaded + } + + if let Some(prefetch_output) = &prefetch_output { + write_prefetch(&query_sig, Some(prefetch_output.clone()), &matchlist).ok(); + } + + // Run the gather! + if let Some(gather_output) = &gather_output { + if let Err(err) = consume_query_by_gather(query_sig, matchlist, threshold_hashes, Some(gather_output)) { + eprintln!("Error during gather: {:?}", err); + } + } + } + Err(_) => { + eprintln!("WARNING: Could not load query sketch '{}'", record.internal_location()); + } + } + }); Ok(()) } + +// query_collection.iter().for_each(|(idx, record)| { +// // Load query sig +// match query_collection.sig_for_dataset(idx) { +// Ok(query_sig) => { +// let location = query_sig.filename(); +// for sketch in query_sig.iter() { +// // Access query MinHash +// if let Sketch::MinHash(query) = sketch { +// let matchlist: BinaryHeap = sketchlist +// .par_iter() +// .filter_map(|sm| { +// // Call a function to load sketches above threshold +// let result = load_sketches_above_threshold( +// against_collection, +// &selection, +// &query, +// threshold_hashes, +// )?; +// let matchlist = result.0; +// let skipped_paths = result.1; +// let failed_paths = result.2; + +// if skipped_paths > 0 { +// eprintln!( +// "WARNING: skipped {} search paths - no compatible signatures.", +// skipped_paths +// ); +// } +// if failed_paths > 0 { +// eprintln!( +// "WARNING: {} search paths failed to load. See error messages above.", +// failed_paths +// ); +// } + +// if matchlist.is_empty() { +// eprintln!("No search signatures loaded, exiting."); +// return Ok(()); +// } + +// if prefetch_output.is_some() { +// write_prefetch(&query_sig, prefetch_output, &matchlist).ok(); +// } + +// // run the gather! 
+// consume_query_by_gather(query_sig, matchlist, threshold_hashes, gather_output).ok(); +// }); +// } +// } +// } +// } +// Err(_) => { +// eprintln!("WARNING: Could not load query sketch '{}'", record.internal_location()); +// } +// } +// }); +// Ok(()) +// } diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 915b6370..70850a37 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -2,32 +2,33 @@ use anyhow::Result; use rayon::prelude::*; -use sourmash::signature::Signature; +use sourmash::storage::SigStore; +use sourmash::{selection, signature::Signature}; use sourmash::sketch::Sketch; -use std::path::Path; +use sourmash::selection::Selection; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; +use camino::Utf8PathBuf; + use crate::utils::{ - consume_query_by_gather, load_sigpaths_from_zip_or_pathlist, - load_sketches_from_zip_or_pathlist, prepare_query, write_prefetch, PrefetchResult, ReportType, + consume_query_by_gather, load_collection, load_sigpaths_from_zip_or_pathlist, load_sketches_from_zip_or_pathlist, prepare_query, write_prefetch, PrefetchResult, ReportType }; -pub fn fastmultigather + std::fmt::Debug + Clone>( - query_filenames: P, - matchlist_filename: P, +pub fn fastmultigather( + query_filepath: camino::Utf8PathBuf, + against_filepath: camino::Utf8PathBuf, threshold_bp: usize, scaled: usize, - template: Sketch, + // template: Sketch, + selection: &Selection, ) -> Result<()> { // load the list of query paths - let queryfile_name = query_filenames.as_ref().to_string_lossy().to_string(); - let (querylist_paths, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(&query_filenames, &template, ReportType::Query)?; - println!("Loaded {} sig paths in querylist", querylist_paths.len()); + let query_collection = load_collection(&query_filepath, selection, ReportType::Query)?; + println!("Loaded {} sig paths in querylist", query_collection.len()); let threshold_hashes: u64 = { let x = threshold_bp / scaled; @@ -43,82 +44,77 @@ pub fn fastmultigather + std::fmt::Debug + Clone>( println!("threshold overlap: {} {}", threshold_hashes, threshold_bp); // Load all the against sketches - let sketchlist = - load_sketches_from_zip_or_pathlist(&matchlist_filename, &template, ReportType::Against)?; + let against_collection = load_collection(&against_filepath, selection, ReportType::Against)?; + // load actual signatures + let mut sketchlist: Vec = vec![]; + + for (idx, record) in against_collection.iter() { + if let Ok(sig) = against_collection.sig_for_dataset(idx) { + sketchlist.push(sig); + } else { + eprintln!("Failed to load 'against' record: {}", record.name()); + } + } // Iterate over all queries => do prefetch and gather! let processed_queries = AtomicUsize::new(0); let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - querylist_paths.par_iter().for_each(|q| { - // increment counter of # of queries - let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); - - // set query_label to the last path element. 
- let location = q.clone().into_os_string().into_string().unwrap(); - let location = location.split('/').last().unwrap().to_string(); - - let query = match Signature::from_path(dbg!(q)) { - Ok(sigs) => { - let mm = prepare_query(&sigs, &template, &location); - - if mm.is_none() { - if !queryfile_name.ends_with(".zip") { - eprintln!("WARNING: no compatible sketches in path '{}'", q.display()); + query_collection.par_iter().for_each(|(idx, record)| { + // increment counter of # of queries. q: could we instead use the index from par_iter()? + let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); + // Load query sig + match query_collection.sig_for_dataset(idx) { + Ok(query_sig) => { + let location = query_sig.filename(); + for sketch in query_sig.iter() { + // Access query MinHash + if let Sketch::MinHash(query) = sketch { + let matchlist: BinaryHeap = sketchlist + .par_iter() + .filter_map(|sm| { + let mut mm = None; + // Access against MinHash + if let Some(sketch) = sm.sketches().get(0) { + if let Sketch::MinHash(against_sketch) = sketch { + if let Ok(overlap) = against_sketch.count_common(&query, false) { + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: sm.name(), + md5sum: sm.md5sum().clone(), + minhash: against_sketch.clone(), + overlap, + }; + mm = Some(result); + } + } + } + } + mm + }) + .collect(); + if !matchlist.is_empty() { + let prefetch_output = format!("{}.prefetch.csv", location); + let gather_output = format!("{}.gather.csv", location); + + // Save initial list of matches to prefetch output + write_prefetch(&query_sig, Some(prefetch_output), &matchlist).ok(); + + // Now, do the gather! + consume_query_by_gather(query_sig.clone(), matchlist, threshold_hashes, Some(gather_output)).ok(); + } else { + println!("No matches to '{}'", location); } - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - mm - } - Err(err) => { - eprintln!("Sketch loading error: {}", err); - eprintln!( - "WARNING: could not load sketches from path '{}'", - q.display() - ); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None - } - }; - - if let Some(query) = query { - // filter first set of matches out of sketchlist - let matchlist: BinaryHeap = sketchlist - .par_iter() - .filter_map(|sm| { - let mut mm = None; - - if let Ok(overlap) = sm.minhash.count_common(&query.minhash, false) { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: sm.name.clone(), - md5sum: sm.md5sum.clone(), - minhash: sm.minhash.clone(), - overlap, - }; - mm = Some(result); - } - } - mm - }) - .collect(); - - if !matchlist.is_empty() { - let prefetch_output = format!("{location}.prefetch.csv"); - let gather_output = format!("{location}.gather.csv"); - - // save initial list of matches to prefetch output - write_prefetch(&query, Some(prefetch_output), &matchlist).ok(); - - // now, do the gather! - consume_query_by_gather(query, matchlist, threshold_hashes, Some(gather_output)) - .ok(); - } else { - println!("No matches to '{}'", location); } } - }); + Err(_) => { + eprintln!("WARNING: no compatible sketches in path '{}'", record.internal_location()); + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + } + } +}); println!( "Processed {} queries total.", diff --git a/src/lib.rs b/src/lib.rs index 1d6a227d..18f8e9de 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,12 @@ /// Python interface Rust code for sourmash_plugin_branchwater. 
use pyo3::prelude::*; +use sourmash::selection; #[macro_use] extern crate simple_error; mod utils; -use crate::utils::build_template; +use crate::utils::{build_template, build_selection}; use crate::utils::is_revindex_database; mod check; mod fastgather; @@ -20,6 +21,8 @@ mod pairwise; use sourmash::encodings::HashFunctions; use sourmash::selection::Selection; +use camino::Utf8PathBuf; + #[pyfunction] fn do_manysearch( querylist_path: String, @@ -30,13 +33,19 @@ fn do_manysearch( moltype: String, output_path: Option, ) -> anyhow::Result { + + let queryfile_path: camino::Utf8PathBuf = querylist_path.clone().into(); + let againstfile_path: camino::Utf8PathBuf = siglist_path.clone().into(); + let selection = build_selection(ksize, scaled, &moltype); + // if siglist_path is revindex, run mastiff_manysearch; otherwise run manysearch let template = build_template(ksize, scaled, &moltype); - if is_revindex_database(siglist_path.as_ref()) { + if is_revindex_database(&againstfile_path) { + // if is_revindex_database(siglist_path.as_ref()) { match mastiff_manysearch::mastiff_manysearch( - querylist_path, - siglist_path, - template, + queryfile_path, + againstfile_path, + &selection, threshold, output_path, ) { @@ -74,14 +83,17 @@ fn do_fastgather( output_path_prefetch: Option, output_path_gather: Option, ) -> anyhow::Result { - let template = build_template(ksize, scaled, &moltype); + let queryfile_path: camino::Utf8PathBuf = query_filename.into(); + let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); + let selection = build_selection(ksize, scaled, &moltype); + match fastgather::fastgather( - query_filename, - siglist_path, + queryfile_path, + againstfile_path, threshold_bp, ksize, scaled, - template, + &selection, output_path_prefetch, output_path_gather, ) { @@ -103,26 +115,17 @@ fn do_fastmultigather( moltype: String, output_path: Option, ) -> anyhow::Result { + + let queryfile_path: camino::Utf8PathBuf = query_filenames.into(); + let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); + let selection = build_selection(ksize, scaled, &moltype); + // if a siglist path is a revindex, run mastiff_manygather. 
If not, run multigather - let template = build_template(ksize, scaled, &moltype); - if is_revindex_database(siglist_path.as_ref()) { - // build selection instead of template - let hash_function = match moltype.as_str() { - "dna" => HashFunctions::Murmur64Dna, - "protein" => HashFunctions::Murmur64Protein, - "dayhoff" => HashFunctions::Murmur64Dayhoff, - "hp" => HashFunctions::Murmur64Hp, - _ => panic!("Unknown molecule type: {}", moltype), - }; - let selection = Selection::builder() - .ksize(ksize.into()) - .scaled(scaled as u32) - .moltype(hash_function) - .build(); + if is_revindex_database(&againstfile_path) { match mastiff_manygather::mastiff_manygather( - query_filenames, - siglist_path, - selection, + queryfile_path, + againstfile_path, + &selection, threshold_bp, output_path, ) { @@ -134,11 +137,11 @@ fn do_fastmultigather( } } else { match fastmultigather::fastmultigather( - query_filenames, - siglist_path, + queryfile_path, + againstfile_path, threshold_bp, scaled, - template, + &selection, ) { Ok(_) => Ok(0), Err(e) => { @@ -176,21 +179,7 @@ fn do_index( save_paths: bool, colors: bool, ) -> anyhow::Result { - let hash_function = match moltype.as_str() { - "dna" => HashFunctions::Murmur64Dna, - "protein" => HashFunctions::Murmur64Protein, - "dayhoff" => HashFunctions::Murmur64Dayhoff, - "hp" => HashFunctions::Murmur64Hp, - _ => panic!("Unknown molecule type: {}", moltype), - }; - let selection = Selection::builder() - .ksize(ksize.into()) - .scaled(scaled as u32) - .moltype(hash_function) - .build(); - // match index::index(siglist, template, output, save_paths, colors) { - // convert siglist to PathBuf - // build template from ksize, scaled + let selection = build_selection(ksize, scaled, &moltype); let location = camino::Utf8PathBuf::from(siglist); let manifest = None; match index::index(location, manifest, selection, output, save_paths, colors) { @@ -204,7 +193,8 @@ fn do_index( #[pyfunction] fn do_check(index: String, quick: bool) -> anyhow::Result { - match check::check(index, quick) { + let idx: camino::Utf8PathBuf = index.into(); + match check::check(idx, quick) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); @@ -223,6 +213,7 @@ fn do_multisearch( moltype: String, output_path: Option, ) -> anyhow::Result { + // let selection = build_selection(ksize, scaled, &moltype); let template = build_template(ksize, scaled, &moltype); match multisearch::multisearch( querylist_path, diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index e50eefc3..19da5728 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -19,26 +19,27 @@ use std::sync::atomic::AtomicUsize; use std::fs::File; use std::io::{BufWriter, Write}; -use crate::utils::{is_revindex_database, load_collection}; //, ReportType}; +use crate::utils::{is_revindex_database, load_collection, ReportType}; + pub fn mastiff_manygather>( - queries_file: String, - index: P, - selection: Selection, + queries_file: camino::Utf8PathBuf, + index: camino::Utf8PathBuf, + selection: &Selection, threshold_bp: usize, output: Option
<P>
, ) -> Result<(), Box> { - if !is_revindex_database(index.as_ref()) { + if !is_revindex_database(&index) { bail!( "'{}' is not a valid RevIndex database", - index.as_ref().display() + index ); } // Open database once - let db = RevIndex::open(index.as_ref(), true)?; + let db = RevIndex::open(index, true)?; println!("Loaded DB"); - let query_collection = load_collection(camino::Utf8PathBuf::from(queries_file), &selection)?; + let query_collection = load_collection(&queries_file, selection, ReportType::Query)?; // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); @@ -81,6 +82,7 @@ pub fn mastiff_manygather>( let threshold = threshold_bp / selection.scaled()? as usize; match query_collection.sig_for_dataset(idx) { + // match query_collection.sig_from_record(record) { // to be added in core Ok(query_sig) => { let mut results = vec![]; let mut found_compatible_sketch = false; @@ -117,12 +119,7 @@ pub fn mastiff_manygather>( } } if !found_compatible_sketch { - // if !queryfile_name.ends_with(".zip") { - // eprintln!( - // "WARNING: no compatible sketches in path '{}'", - // filename.display() - // ); - // } + eprintln!("WARNING: no compatible sketches in path '{}'", query_sig.filename()); let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 40065d62..9d4a45b0 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -4,39 +4,42 @@ use rayon::prelude::*; use sourmash::index::revindex::{RevIndex, RevIndexOps}; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::Sketch; +use sourmash::selection::Selection; use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{ - csvwriter_thread, is_revindex_database, load_sigpaths_from_zip_or_pathlist, prepare_query, + csvwriter_thread, is_revindex_database, load_collection, prepare_query, ReportType, SearchResult, }; pub fn mastiff_manysearch>( - queries_file: P, - index: P, - template: Sketch, + queries_path: camino::Utf8PathBuf, + index: camino::Utf8PathBuf, + selection: &Selection, minimum_containment: f64, output: Option
<P>
, ) -> Result<(), Box> { - if !is_revindex_database(index.as_ref()) { + if !is_revindex_database(&index) { bail!( "'{}' is not a valid RevIndex database", - index.as_ref().display() + index ); } // Open database once - let db = RevIndex::open(index.as_ref(), true)?; + let db = RevIndex::open(index, true)?; println!("Loaded DB"); // Load query paths - let queryfile_name = queries_file.as_ref().to_string_lossy().to_string(); - let (query_paths, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(&queries_file, &template, ReportType::Query)?; + let query_collection = load_collection(&queries_path, selection, ReportType::Query)?; - // if query_paths is empty, exit with error - if query_paths.is_empty() { + // let queryfile_name = queries_file.as_ref().to_string_lossy().to_string(); + // let (query_paths, _temp_dir) = + // load_sigpaths_from_zip_or_pathlist(&queries_file, &template, ReportType::Query)?; + + // if query_paths is empty, exit with error. this should already happen via load_collection, i think? + if query_collection.len() == 0 { bail!("No query signatures loaded, exiting."); } @@ -56,53 +59,49 @@ pub fn mastiff_manysearch>( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let send_result = query_paths + let send_result = query_collection .par_iter() - .filter_map(|filename| { + .filter_map(|(idx, record)| { let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); if i % 1000 == 0 { eprintln!("Processed {} search sigs", i); } let mut results = vec![]; - - // load query signature from path: - match Signature::from_path(filename) { + match query_collection.sig_for_dataset(idx) { Ok(query_sig) => { - let location = filename.display().to_string(); - if let Some(query) = prepare_query(&query_sig, &template, &location) { - let query_size = query.minhash.size() as f64; - // search mastiff db - let counter = db.counter_for_query(&query.minhash); - let matches = - db.matches_from_counter(counter, minimum_containment as usize); + for sketch in query_sig.iter() { + if let Sketch::MinHash(query_mh) = sketch { + // let location = query_sig.filename(); + let query_size = query_mh.size(); + let counter = db.counter_for_query(&query_mh); + let matches = db.matches_from_counter(counter, minimum_containment as usize); // filter the matches for containment - for (path, overlap) in matches { - let containment = overlap as f64 / query_size; - if containment >= minimum_containment { - results.push(SearchResult { - query_name: query.name.clone(), - query_md5: query.md5sum.clone(), - match_name: path.clone(), - containment, - intersect_hashes: overlap, - match_md5: None, - jaccard: None, - max_containment: None, - }); + for (path, overlap) in matches { + let containment = overlap as f64 / query_size as f64; + if containment >= minimum_containment { + results.push(SearchResult { + query_name: query_sig.name(), + query_md5: query_sig.md5sum(), + match_name: path.clone(), + containment, + intersect_hashes: overlap, + match_md5: None, + jaccard: None, + max_containment: None, + }); + } } - } - } else { + + } else { // for reading zips, this is likely not a useful warning and // would show up too often (every sig is stored as individual file). 
- if !queryfile_name.ends_with(".zip") { eprintln!( - "WARNING: no compatible sketches in path '{}'", - filename.display() + "WARNING: no compatible sketches in path '{}'", query_sig.filename() ); - } let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + } } if results.is_empty() { None @@ -115,7 +114,7 @@ pub fn mastiff_manysearch>( eprintln!("Sketch loading error: {}", err); eprintln!( "WARNING: could not load sketches from path '{}'", - filename.display() + record.internal_location() ); None } diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index 646e9309..26c85277 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -183,7 +183,8 @@ def test_missing_querylist(runtmp, capfd, indexed, zip_query): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + # assert 'Error: failed to load query' in captured.err + assert 'Error: No such file or directory' in captured.err @pytest.mark.parametrize('indexed', [False, True]) @@ -239,7 +240,10 @@ def test_bad_query_2(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + if not indexed: + assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + else: + assert "InvalidArchive" in captured.err @pytest.mark.parametrize('indexed', [False, True]) diff --git a/src/utils.rs b/src/utils.rs index 64472ae2..4efe1fd9 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -17,8 +17,7 @@ use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; -use anyhow::{anyhow, Result}; - +use anyhow::{anyhow, Result, Context}; use std::cmp::{Ordering, PartialOrd}; use sourmash::signature::{Signature, SigsTrait}; @@ -26,6 +25,8 @@ use sourmash::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; use sourmash::sketch::Sketch; use sourmash::collection::Collection; use sourmash::selection::Selection; +use sourmash::errors::SourmashError; +use sourmash::storage::SigStore; /// Track a name/minhash. @@ -172,9 +173,10 @@ pub fn prefetch( } /// Write list of prefetch matches. -pub fn write_prefetch + std::fmt::Debug + std::fmt::Display + Clone>( - query: &SmallSignature, - prefetch_output: Option

, +// pub fn write_prefetch + std::fmt::Debug + std::fmt::Display + Clone>( +pub fn write_prefetch( + query: &SigStore, + prefetch_output: Option, matchlist: &BinaryHeap, ) -> Result<()> { // Set up a writer for prefetch output @@ -193,7 +195,7 @@ pub fn write_prefetch + std::fmt::Debug + std::fmt::Display + Clo writeln!( &mut writer, "{},\"{}\",{},\"{}\",{},{}", - query.location, query.name, query.md5sum, m.name, m.md5sum, m.overlap + query.filename(), query.name(), query.md5sum(), m.name, m.md5sum, m.overlap ) .ok(); } @@ -464,55 +466,49 @@ pub fn load_sketches( /// those with a minimum overlap. pub fn load_sketches_above_threshold( - sketchlist_paths: Vec, - template: &Sketch, + against_collection: Collection, + selection: &Selection, query: &KmerMinHash, threshold_hashes: u64, ) -> Result<(BinaryHeap, usize, usize)> { let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let matchlist: BinaryHeap = sketchlist_paths - .par_iter() - .filter_map(|m| { - let sigs = Signature::from_path(m); - let location = m.display().to_string(); - - match sigs { - Ok(sigs) => { - let mut mm = None; - - if let Some(sm) = prepare_query(&sigs, template, &location) { - let mh = sm.minhash; - if let Ok(overlap) = mh.count_common(query, false) { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: sm.name, - md5sum: sm.md5sum, - minhash: mh, - overlap, - }; - mm = Some(result); - } + let matchlist: BinaryHeap = against_collection + .par_iter() + .filter_map(|(idx, against_record)| { + let mut mm = None; + // Load against into memory + if let Ok(against_sig) = against_collection.sig_for_dataset(idx) { + if let Some(sketch) = against_sig.sketches().get(0) { + if let Sketch::MinHash(against_mh) = sketch { + if let Ok(overlap) = against_mh.count_common(query, false) { + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: against_sig.name().to_string(), + md5sum: against_mh.md5sum().to_string(), + minhash: against_mh.clone(), + overlap, + }; + mm = Some(result); } - } else { - eprintln!("WARNING: no compatible sketches in path '{}'", m.display()); - let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - mm - } - Err(err) => { - eprintln!("Sketch loading error: {}", err); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - eprintln!( - "WARNING: could not load sketches from path '{}'", - m.display() - ); - None + } else { + eprintln!("WARNING: no compatible sketches in path '{}'", against_sig.filename()); + let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } + } else { + eprintln!("WARNING: no compatible sketches in path '{}'", against_sig.filename()); + let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - }) - .collect(); + } else { + // this shouldn't happen here anymore -- likely would happen at load_collection + eprintln!("WARNING: could not load sketches for record '{}'", against_record.internal_location()); + let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + } + mm + }) + .collect(); let skipped_paths = skipped_paths.load(atomic::Ordering::SeqCst); let failed_paths = failed_paths.load(atomic::Ordering::SeqCst); @@ -689,22 +685,100 @@ pub fn load_sketches_from_zip_or_pathlist>( } pub fn load_collection( - sigpath: camino::Utf8PathBuf, + sigpath: &camino::Utf8PathBuf, selection: &Selection, + report_type: ReportType, ) -> Result { + if !sigpath.exists() { + bail!("No such file or directory: '{}'", sigpath); + } let collection = if sigpath.extension().map_or(false, |ext| ext 
== "zip") { - Collection::from_zipfile(&sigpath)? + match Collection::from_zipfile(&sigpath) { + Ok(collection) => collection, + Err(_) => { + bail!("failed to load {} zipfile: '{}'", report_type, sigpath); + } + } } else { - let sig_paths: Vec<_> = load_sketchlist_filenames_camino(&sigpath) - .unwrap_or_else(|_| panic!("Error loading siglist")) - .into_iter() - .collect(); - Collection::from_paths(&sig_paths)? + let sig_paths = load_sketchlist_filenames_camino(&sigpath)?; + match Collection::from_paths(&sig_paths) { + Ok(collection) => collection, + Err(_) => { + bail!("failed to load {} signature paths: '{}'", report_type, sigpath); + } + } }; - // return collection records that match selection - Ok(collection.select(&selection)?) + + let n_total = collection.len(); + let selected = collection.select(&selection)?; + let n_skipped = n_total - selected.len(); + let n_failed = 0; // TODO: can we get list / number of failed paths from core??? + report_on_collection_loading(&selected, n_skipped, n_failed, report_type)?; + Ok(selected) +} + +pub fn report_on_collection_loading( + collection: &Collection, + skipped_paths: usize, + failed_paths: usize, + report_type: ReportType, +) -> Result<()> { + if failed_paths > 0 { + eprintln!( + "WARNING: {} {} paths failed to load. See error messages above.", + failed_paths, report_type + ); + } + if skipped_paths > 0 { + eprintln!( + "WARNING: skipped {} {} paths - no compatible signatures.", + skipped_paths, report_type + ); + } + + // Validate sketches + if collection.is_empty() { + bail!("No {} signatures loaded, exiting.", report_type); + } + eprintln!("Loaded {} {} signature(s)", collection.len(), report_type); + Ok(()) } +pub fn load_single_sig_from_collection( + query_collection: &Collection, // Replace with the actual type + selection: &Selection, +) -> Result { + let scaled = selection.scaled().unwrap(); + let ksize = selection.ksize().unwrap(); + + match query_collection.sig_for_dataset(0) { + Ok(sig) => Ok(sig), + Err(_) => Err(anyhow::anyhow!("No sketch found with scaled={}, k={}", scaled, ksize)), + } +} + +// pub fn load_single_sketch_from_sig<'a>(sig: &'a SigStore, selection: &'a Selection) -> Result<&'a KmerMinHash> { +// let sketch = sig.sketches().get(0).ok_or_else(|| { +// anyhow::anyhow!("No sketch found with scaled={}, k={}", selection.scaled().unwrap_or_default(), selection.ksize().unwrap_or_default()) +// })?; + +// if let Sketch::MinHash(mh) = sketch { +// Ok(mh) +// } else { +// Err(anyhow::anyhow!("No sketch found with scaled={}, k={}", selection.scaled().unwrap_or_default(), selection.ksize().unwrap_or_default())) +// } +// } + +// pub fn load_single_sig_and_sketch<'a>( +// query_collection: &'a Collection, +// selection: &'a Selection, +// ) -> Result<(SigStore, &'a KmerMinHash)> { +// let sig = load_single_sig_from_collection(query_collection, selection)?; +// let sketch = load_single_sketch_from_sig(&sig, selection)?; +// Ok((sig, sketch)) +// } + + /// Uses the output of sketch loading functions to report the /// total number of sketches loaded, as well as the number of files, /// if any, that failed to load or contained no compatible sketches. @@ -758,7 +832,7 @@ pub fn report_on_sketch_loading( /// removing matches in 'matchlist' from 'query'. pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Display + Clone>( - query: SmallSignature, + query: SigStore, matchlist: BinaryHeap, threshold_hashes: u64, gather_output: Option
<P>
, @@ -778,17 +852,25 @@ pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Disp let mut matching_sketches = matchlist; let mut rank = 0; - let mut last_hashes = query.minhash.size(); + let mut last_hashes = query.size(); let mut last_matches = matching_sketches.len(); - let location = query.location; - let mut query_mh = query.minhash; + // let location = query.location; + let location = query.filename(); + // let mut query_mh = query.minhash; + + let sketches = query.sketches(); + let orig_query_mh = match sketches.get(0) { + Some(Sketch::MinHash(mh)) => Ok(mh), + _ => Err(anyhow::anyhow!("No MinHash found")), + }?; + let mut query_mh = orig_query_mh.clone(); eprintln!( "{} iter {}: start: query hashes={} matches={}", location, rank, - query_mh.size(), + query.size(), matching_sketches.len() ); @@ -803,8 +885,8 @@ pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Disp "{},{},\"{}\",{},\"{}\",{},{}", location, rank, - query.name, - query.md5sum, + query.name(), + query.md5sum(), best_element.name, best_element.md5sum, best_element.overlap @@ -855,7 +937,23 @@ pub fn build_template(ksize: u8, scaled: usize, moltype: &str) -> Sketch { Sketch::MinHash(template_mh) } -pub fn is_revindex_database(path: &Path) -> bool { +pub fn build_selection(ksize: u8, scaled: usize, moltype: &str) -> Selection { + let hash_function = match moltype { + "dna" => HashFunctions::Murmur64Dna, + "protein" => HashFunctions::Murmur64Protein, + "dayhoff" => HashFunctions::Murmur64Dayhoff, + "hp" => HashFunctions::Murmur64Hp, + _ => panic!("Unknown molecule type: {}", moltype), + }; + + Selection::builder() + .ksize(ksize.into()) + .scaled(scaled as u32) + .moltype(hash_function) + .build() +} + +pub fn is_revindex_database(path: &camino::Utf8PathBuf) -> bool { // quick file check for Revindex database: // is path a directory that contains a file named 'CURRENT'? if path.is_dir() { From 32fc2d5a5dcda69ea60229e70b208f4602e2004c Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 26 Jan 2024 12:18:36 -0800 Subject: [PATCH 19/40] fix fastgather --- src/fastgather.rs | 187 +++++++++++++++------------------------------- 1 file changed, 60 insertions(+), 127 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index 2fef7522..2ff13509 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -28,6 +28,30 @@ pub fn fastgather( if query_collection.len() > 1 { bail!("Found more than one compatible sketch from '{}'. Fastgather requires a single query sketch.", &query_filepath) } + // load query sig into memory + let mut query_mh = None; + let mut query_sig = None; + for (idx, _record) in query_collection.iter() { + // Load query sig + match query_collection.sig_for_dataset(idx) { + Ok(query_sig) => { + for sketch in query_sig.iter() { + // Access query MinHash + if let Sketch::MinHash(query) = sketch { + query_mh = Some(query.clone()); + break; + } + } + } + Err(_) => { + bail!("No query sketch matching selection parameters.") // should not get here bc we already check this during collection loading? + } + } + + if query_mh.is_some() { + break; // Exit the loop if we found a MinHash sketch + } + } // build the list of paths to match against. 
eprintln!("Loading matchlist from '{}'", against_filepath); @@ -49,131 +73,40 @@ pub fn fastgather( "using threshold overlap: {} {}", threshold_hashes, threshold_bp ); - query_collection.iter().for_each(|(idx, record)| { - // Load query sig - match query_collection.sig_for_dataset(idx) { - Ok(query_sig) => { - let location = query_sig.filename(); - let mut matchlist: BinaryHeap = BinaryHeap::new(); - let mut skipped_paths = 0; - let mut failed_paths = 0; - - for sketch in query_sig.iter() { - // Access query MinHash - if let Sketch::MinHash(query) = sketch { - let result = load_sketches_above_threshold( - against_collection, - &selection, - &query, - threshold_hashes, - ); - - match result { - Ok((loaded_matchlist, skipped, failed)) => { - matchlist.extend(loaded_matchlist); - skipped_paths += skipped; - failed_paths += failed; - } - Err(err) => { - eprintln!("Error loading sketches: {:?}", err); - failed_paths += 1; - } - } - } - } - - if skipped_paths > 0 { - eprintln!( - "WARNING: Skipped {} search paths - no compatible signatures.", - skipped_paths - ); - } - if failed_paths > 0 { - eprintln!( - "WARNING: {} search paths failed to load. See error messages above.", - failed_paths - ); - } - - if matchlist.is_empty() { - eprintln!("No search signatures loaded for '{}', exiting.", location); - return; // Return early if no search signatures loaded - } - - if let Some(prefetch_output) = &prefetch_output { - write_prefetch(&query_sig, Some(prefetch_output.clone()), &matchlist).ok(); - } - - // Run the gather! - if let Some(gather_output) = &gather_output { - if let Err(err) = consume_query_by_gather(query_sig, matchlist, threshold_hashes, Some(gather_output)) { - eprintln!("Error during gather: {:?}", err); - } - } - } - Err(_) => { - eprintln!("WARNING: Could not load query sketch '{}'", record.internal_location()); - } - } - }); - Ok(()) -} -// query_collection.iter().for_each(|(idx, record)| { -// // Load query sig -// match query_collection.sig_for_dataset(idx) { -// Ok(query_sig) => { -// let location = query_sig.filename(); -// for sketch in query_sig.iter() { -// // Access query MinHash -// if let Sketch::MinHash(query) = sketch { -// let matchlist: BinaryHeap = sketchlist -// .par_iter() -// .filter_map(|sm| { -// // Call a function to load sketches above threshold -// let result = load_sketches_above_threshold( -// against_collection, -// &selection, -// &query, -// threshold_hashes, -// )?; -// let matchlist = result.0; -// let skipped_paths = result.1; -// let failed_paths = result.2; - -// if skipped_paths > 0 { -// eprintln!( -// "WARNING: skipped {} search paths - no compatible signatures.", -// skipped_paths -// ); -// } -// if failed_paths > 0 { -// eprintln!( -// "WARNING: {} search paths failed to load. See error messages above.", -// failed_paths -// ); -// } - -// if matchlist.is_empty() { -// eprintln!("No search signatures loaded, exiting."); -// return Ok(()); -// } - -// if prefetch_output.is_some() { -// write_prefetch(&query_sig, prefetch_output, &matchlist).ok(); -// } - -// // run the gather! 
-// consume_query_by_gather(query_sig, matchlist, threshold_hashes, gather_output).ok(); -// }); -// } -// } -// } -// } -// Err(_) => { -// eprintln!("WARNING: Could not load query sketch '{}'", record.internal_location()); -// } -// } -// }); -// Ok(()) -// } + // load a set of sketches, filtering for those with overlaps > threshold + let result = load_sketches_above_threshold( + against_collection, + &selection, + &query_mh.unwrap(), + threshold_hashes, + )?; + let matchlist = result.0; + let skipped_paths = result.1; + let failed_paths = result.2; + if skipped_paths > 0 { + eprintln!( + "WARNING: skipped {} search paths - no compatible signatures.", + skipped_paths + ); + } + if failed_paths > 0 { + eprintln!( + "WARNING: {} search paths failed to load. See error messages above.", + failed_paths + ); + } + + if matchlist.is_empty() { + eprintln!("No search signatures loaded, exiting."); + return Ok(()); + } + + if prefetch_output.is_some() { + write_prefetch(query_sig.as_ref().unwrap(), prefetch_output, &matchlist).ok(); + } + + // run the gather! + consume_query_by_gather(query_sig.clone().unwrap(), matchlist, threshold_hashes, gather_output).ok(); + Ok(()) +} \ No newline at end of file From 39fb7dceb8454dcccb90e4848bed1e8e7429e3de Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 26 Jan 2024 14:04:57 -0800 Subject: [PATCH 20/40] re-enable more permissive pathlist loading --- src/python/tests/test_search.py | 4 +- src/utils.rs | 169 +++++++++++++++++++------------- 2 files changed, 101 insertions(+), 72 deletions(-) diff --git a/src/python/tests/test_search.py b/src/python/tests/test_search.py index 5427d303..6f29ec9b 100644 --- a/src/python/tests/test_search.py +++ b/src/python/tests/test_search.py @@ -247,7 +247,7 @@ def test_missing_query(runtmp, capfd, indexed, zip_query): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err @pytest.mark.parametrize("indexed", [False, True]) @@ -273,7 +273,7 @@ def test_bad_query(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err + assert 'Error: invalid line in fromfile' in captured.err @pytest.mark.parametrize("indexed", [False, True]) diff --git a/src/utils.rs b/src/utils.rs index 4efe1fd9..7824113f 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -7,6 +7,7 @@ use sourmash::selection::Select; use std::fs::File; use std::io::Read; use std::io::{BufRead, BufReader, BufWriter, Write}; +use std::panic; use std::path::{Path, PathBuf}; use tempfile::tempdir; @@ -17,17 +18,17 @@ use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; -use anyhow::{anyhow, Result, Context}; +use anyhow::{anyhow, Context, Result}; use std::cmp::{Ordering, PartialOrd}; +use sourmash::collection::{self, Collection}; +use sourmash::errors::SourmashError; +use sourmash::manifest::Record; +use sourmash::selection::Selection; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; use sourmash::sketch::Sketch; -use sourmash::collection::Collection; -use sourmash::selection::Selection; -use sourmash::errors::SourmashError; -use sourmash::storage::SigStore; - +use sourmash::storage::{FSStorage, InnerStorage, SigStore}; /// Track a name/minhash. 
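For orientation, here is a minimal usage sketch of the collection-based loading that these patches introduce. It is illustrative only, not part of any patch: `build_selection`, `load_collection`, and `ReportType` are the crate-internal helpers added in this series, the `Collection` calls (`iter`, `sig_for_dataset`) are the ones already used in the diffs above, and the ksize/scaled/moltype values are arbitrary.

use anyhow::Result;
use camino::Utf8PathBuf;
use sourmash::signature::SigsTrait;
use sourmash::sketch::Sketch;

use crate::utils::{build_selection, load_collection, ReportType};

fn list_query_minhashes(query_path: &Utf8PathBuf) -> Result<()> {
    // k=31, scaled=1000, DNA (arbitrary example parameters)
    let selection = build_selection(31, 1000, "dna");

    // Accepts either a zip collection or a newline-delimited list of sig paths;
    // records that do not match `selection` are dropped during loading.
    let query_collection = load_collection(query_path, &selection, ReportType::Query)?;

    for (idx, record) in query_collection.iter() {
        // Materialize the signature behind this record and walk its sketches.
        let sig = query_collection.sig_for_dataset(idx)?;
        for sketch in sig.iter() {
            if let Sketch::MinHash(mh) = sketch {
                eprintln!("{}: {} hashes", record.name(), mh.size());
            }
        }
    }
    Ok(())
}

Each command then narrows this further (for example, fastgather bails unless exactly one query record survives selection), but the load-and-select path is shared.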
@@ -195,7 +196,12 @@ pub fn write_prefetch( writeln!( &mut writer, "{},\"{}\",{},\"{}\",{},{}", - query.filename(), query.name(), query.md5sum(), m.name, m.md5sum, m.overlap + query.filename(), + query.name(), + query.md5sum(), + m.name, + m.md5sum, + m.overlap ) .ok(); } @@ -229,31 +235,6 @@ pub fn load_sketchlist_filenames>(sketchlist_filename: &P) -> Res Ok(sketchlist_filenames) } -pub fn load_sketchlist_filenames_camino>(sketchlist_filename: &P) -> Result> { - let sketchlist_file = BufReader::new(File::open(sketchlist_filename)?); - - let mut sketchlist_filenames: Vec = Vec::new(); - for line in sketchlist_file.lines() { - let line = match line { - Ok(v) => v, - Err(_) => { - return { - let filename = sketchlist_filename.as_ref().display(); - let msg = format!("invalid line in fromfile '{}'", filename); - Err(anyhow!(msg)) - } - } - }; - - if !line.is_empty() { - let path = camino::Utf8PathBuf::from(line); - sketchlist_filenames.push(path); - } - } - Ok(sketchlist_filenames) -} - - /// Loads signature file paths from a ZIP archive. /// /// This function extracts the contents of a ZIP archive containing @@ -475,40 +456,49 @@ pub fn load_sketches_above_threshold( let failed_paths = AtomicUsize::new(0); let matchlist: BinaryHeap = against_collection - .par_iter() - .filter_map(|(idx, against_record)| { - let mut mm = None; - // Load against into memory - if let Ok(against_sig) = against_collection.sig_for_dataset(idx) { - if let Some(sketch) = against_sig.sketches().get(0) { - if let Sketch::MinHash(against_mh) = sketch { - if let Ok(overlap) = against_mh.count_common(query, false) { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: against_sig.name().to_string(), - md5sum: against_mh.md5sum().to_string(), - minhash: against_mh.clone(), - overlap, - }; - mm = Some(result); + .par_iter() + .filter_map(|(idx, against_record)| { + let mut mm = None; + // Load against into memory + if let Ok(against_sig) = against_collection.sig_for_dataset(idx) { + if let Some(sketch) = against_sig.sketches().get(0) { + if let Sketch::MinHash(against_mh) = sketch { + if let Ok(overlap) = against_mh.count_common(query, false) { + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: against_sig.name().to_string(), + md5sum: against_mh.md5sum().to_string(), + minhash: against_mh.clone(), + overlap, + }; + mm = Some(result); + } } + } else { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + against_sig.filename() + ); + let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } else { - eprintln!("WARNING: no compatible sketches in path '{}'", against_sig.filename()); + eprintln!( + "WARNING: no compatible sketches in path '{}'", + against_sig.filename() + ); let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } else { - eprintln!("WARNING: no compatible sketches in path '{}'", against_sig.filename()); + // this shouldn't happen here anymore -- likely would happen at load_collection + eprintln!( + "WARNING: could not load sketches for record '{}'", + against_record.internal_location() + ); let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - } else { - // this shouldn't happen here anymore -- likely would happen at load_collection - eprintln!("WARNING: could not load sketches for record '{}'", against_record.internal_location()); - let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); - } - mm - }) - .collect(); + mm + }) + .collect(); let skipped_paths = skipped_paths.load(atomic::Ordering::SeqCst); let 
failed_paths = failed_paths.load(atomic::Ordering::SeqCst); @@ -672,7 +662,6 @@ pub fn load_sketches_from_zip_or_pathlist>( .map(|ext| ext == "zip") .unwrap_or(false) { - load_sketches_from_zip(sketchlist_path, template)? } else { let sketch_paths = load_sketchlist_filenames(&sketchlist_path)?; @@ -692,6 +681,8 @@ pub fn load_collection( if !sigpath.exists() { bail!("No such file or directory: '{}'", sigpath); } + + let mut n_failed = 0; let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { match Collection::from_zipfile(&sigpath) { Ok(collection) => collection, @@ -700,19 +691,54 @@ pub fn load_collection( } } } else { - let sig_paths = load_sketchlist_filenames_camino(&sigpath)?; - match Collection::from_paths(&sig_paths) { - Ok(collection) => collection, - Err(_) => { - bail!("failed to load {} signature paths: '{}'", report_type, sigpath); - } - } + let sketchlist_file = BufReader::new(File::open(sigpath)?); + + let records: Vec = sketchlist_file + .lines() + .filter_map(|line| { + let path = match line { + Ok(path) => path, + Err(err) => { + eprintln!("Error: invalid line in fromfile"); + return None; // Skip + } + }; + + match Signature::from_path(&path) { + Ok(signatures) => { + let recs: Vec = signatures + .into_iter() + .flat_map(|v| Record::from_sig(&v, path.as_str())) + .collect(); + Some(recs) + } + Err(err) => { + eprintln!("Sketch loading error: {}", err); + eprintln!("WARNING: could not load sketches from path '{}'", path); + n_failed += 1; + None + } + } + }) + .flatten() + .collect(); + + let manifest: Manifest = records.into(); + + Collection::new( + manifest, + InnerStorage::new( + FSStorage::builder() + .fullpath("".into()) + .subdir("".into()) + .build(), + ), + ) }; let n_total = collection.len(); let selected = collection.select(&selection)?; let n_skipped = n_total - selected.len(); - let n_failed = 0; // TODO: can we get list / number of failed paths from core??? report_on_collection_loading(&selected, n_skipped, n_failed, report_type)?; Ok(selected) } @@ -753,7 +779,11 @@ pub fn load_single_sig_from_collection( match query_collection.sig_for_dataset(0) { Ok(sig) => Ok(sig), - Err(_) => Err(anyhow::anyhow!("No sketch found with scaled={}, k={}", scaled, ksize)), + Err(_) => Err(anyhow::anyhow!( + "No sketch found with scaled={}, k={}", + scaled, + ksize + )), } } @@ -778,7 +808,6 @@ pub fn load_single_sig_from_collection( // Ok((sig, sketch)) // } - /// Uses the output of sketch loading functions to report the /// total number of sketches loaded, as well as the number of files, /// if any, that failed to load or contained no compatible sketches. @@ -945,7 +974,7 @@ pub fn build_selection(ksize: u8, scaled: usize, moltype: &str) -> Selection { "hp" => HashFunctions::Murmur64Hp, _ => panic!("Unknown molecule type: {}", moltype), }; - + Selection::builder() .ksize(ksize.into()) .scaled(scaled as u32) From 15f7dba7a54828064fef8c1e8a00ee28b83561a1 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 26 Jan 2024 14:07:51 -0800 Subject: [PATCH 21/40] clean up ms --- src/mastiff_manysearch.rs | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 9d4a45b0..24fff34e 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -2,16 +2,15 @@ use anyhow::Result; use rayon::prelude::*; use sourmash::index::revindex::{RevIndex, RevIndexOps}; -use sourmash::signature::{Signature, SigsTrait}; -use sourmash::sketch::Sketch; use sourmash::selection::Selection; +use sourmash::signature::SigsTrait; +use sourmash::sketch::Sketch; use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{ - csvwriter_thread, is_revindex_database, load_collection, prepare_query, - ReportType, SearchResult, + csvwriter_thread, is_revindex_database, load_collection, ReportType, SearchResult, }; pub fn mastiff_manysearch>( @@ -22,10 +21,7 @@ pub fn mastiff_manysearch>( output: Option
<P>
, ) -> Result<(), Box> { if !is_revindex_database(&index) { - bail!( - "'{}' is not a valid RevIndex database", - index - ); + bail!("'{}' is not a valid RevIndex database", index); } // Open database once let db = RevIndex::open(index, true)?; @@ -34,10 +30,6 @@ pub fn mastiff_manysearch>( // Load query paths let query_collection = load_collection(&queries_path, selection, ReportType::Query)?; - // let queryfile_name = queries_file.as_ref().to_string_lossy().to_string(); - // let (query_paths, _temp_dir) = - // load_sigpaths_from_zip_or_pathlist(&queries_file, &template, ReportType::Query)?; - // if query_paths is empty, exit with error. this should already happen via load_collection, i think? if query_collection.len() == 0 { bail!("No query signatures loaded, exiting."); @@ -75,9 +67,10 @@ pub fn mastiff_manysearch>( // let location = query_sig.filename(); let query_size = query_mh.size(); let counter = db.counter_for_query(&query_mh); - let matches = db.matches_from_counter(counter, minimum_containment as usize); + let matches = + db.matches_from_counter(counter, minimum_containment as usize); - // filter the matches for containment + // filter the matches for containment for (path, overlap) in matches { let containment = overlap as f64 / query_size as f64; if containment >= minimum_containment { @@ -93,14 +86,12 @@ pub fn mastiff_manysearch>( }); } } - } else { - // for reading zips, this is likely not a useful warning and - // would show up too often (every sig is stored as individual file). eprintln!( - "WARNING: no compatible sketches in path '{}'", query_sig.filename() + "WARNING: no compatible sketches in path '{}'", + query_sig.filename() ); - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } if results.is_empty() { @@ -159,7 +150,5 @@ pub fn mastiff_manysearch>( ); } - // _temp_dir goes out of scope => is deleted. - Ok(()) } From 4e3b7ee297f6fe97813ed862a80e9246b192654c Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 26 Jan 2024 14:17:52 -0800 Subject: [PATCH 22/40] harmonize errors --- src/python/tests/test_multigather.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index 26c85277..3f59278c 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -208,7 +208,7 @@ def test_bad_query(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err + assert 'Error: invalid line in fromfile' in captured.err @pytest.mark.parametrize('indexed', [False, True]) @@ -240,10 +240,7 @@ def test_bad_query_2(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - if not indexed: - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err - else: - assert "InvalidArchive" in captured.err + assert "InvalidArchive" in captured.err @pytest.mark.parametrize('indexed', [False, True]) From 14ee1bdbce51f2c52843995ec79cd324c5622251 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 26 Jan 2024 16:16:02 -0800 Subject: [PATCH 23/40] harmonize error text and output filenames --- src/fastmultigather.rs | 115 ++++++++++++++++----------- src/python/tests/test_multigather.py | 55 ++++++------- src/utils.rs | 62 ++++++++++----- 3 files changed, 130 insertions(+), 102 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 70850a37..71a0e174 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -2,20 +2,22 @@ use anyhow::Result; use rayon::prelude::*; +use serde::Serialize; +use sourmash::selection::Selection; +use sourmash::sketch::Sketch; use sourmash::storage::SigStore; use sourmash::{selection, signature::Signature}; -use sourmash::sketch::Sketch; -use sourmash::selection::Selection; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; use crate::utils::{ - consume_query_by_gather, load_collection, load_sigpaths_from_zip_or_pathlist, load_sketches_from_zip_or_pathlist, prepare_query, write_prefetch, PrefetchResult, ReportType + consume_query_by_gather, load_collection, load_sigpaths_from_zip_or_pathlist, + load_sketches_from_zip_or_pathlist, prepare_query, write_prefetch, PrefetchResult, ReportType, }; pub fn fastmultigather( @@ -23,7 +25,6 @@ pub fn fastmultigather( against_filepath: camino::Utf8PathBuf, threshold_bp: usize, scaled: usize, - // template: Sketch, selection: &Selection, ) -> Result<()> { // load the list of query paths @@ -63,58 +64,76 @@ pub fn fastmultigather( query_collection.par_iter().for_each(|(idx, record)| { // increment counter of # of queries. q: could we instead use the index from par_iter()? - let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); + let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); // Load query sig - match query_collection.sig_for_dataset(idx) { - Ok(query_sig) => { - let location = query_sig.filename(); - for sketch in query_sig.iter() { - // Access query MinHash - if let Sketch::MinHash(query) = sketch { - let matchlist: BinaryHeap = sketchlist - .par_iter() - .filter_map(|sm| { - let mut mm = None; - // Access against MinHash - if let Some(sketch) = sm.sketches().get(0) { - if let Sketch::MinHash(against_sketch) = sketch { - if let Ok(overlap) = against_sketch.count_common(&query, false) { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: sm.name(), - md5sum: sm.md5sum().clone(), - minhash: against_sketch.clone(), - overlap, - }; - mm = Some(result); + match query_collection.sig_for_dataset(idx) { + Ok(query_sig) => { + let prefix = query_sig.name(); + let location = Utf8Path::new(&prefix).file_name().unwrap(); + for sketch in query_sig.iter() { + // Access query MinHash + if let Sketch::MinHash(query) = sketch { + let matchlist: BinaryHeap = sketchlist + .par_iter() + .filter_map(|sm| { + let mut mm = None; + // Access against MinHash + if let Some(sketch) = sm.sketches().get(0) { + if let Sketch::MinHash(against_sketch) = sketch { + if let Ok(overlap) = + against_sketch.count_common(&query, true) + { + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: sm.name(), + md5sum: sm.md5sum().clone(), + minhash: against_sketch.clone(), + overlap, + }; + mm = Some(result); + } } } } - } - mm - }) - .collect(); - if !matchlist.is_empty() { - let prefetch_output = format!("{}.prefetch.csv", location); - let gather_output = format!("{}.gather.csv", location); - - // Save initial list of 
matches to prefetch output - write_prefetch(&query_sig, Some(prefetch_output), &matchlist).ok(); - - // Now, do the gather! - consume_query_by_gather(query_sig.clone(), matchlist, threshold_hashes, Some(gather_output)).ok(); + mm + }) + .collect(); + if !matchlist.is_empty() { + let prefetch_output = format!("{}.prefetch.csv", location); + let gather_output = format!("{}.gather.csv", location); + + // Save initial list of matches to prefetch output + write_prefetch(&query_sig, Some(prefetch_output), &matchlist).ok(); + + // Now, do the gather! + consume_query_by_gather( + query_sig.clone(), + matchlist, + threshold_hashes, + Some(gather_output), + ) + .ok(); + } else { + println!("No matches to '{}'", location); + } } else { - println!("No matches to '{}'", location); + eprintln!( + "WARNING: no compatible sketches in path '{}'", + record.internal_location() + ); + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } } + Err(_) => { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + record.internal_location() + ); + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + } } - Err(_) => { - eprintln!("WARNING: no compatible sketches in path '{}'", record.internal_location()); - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); - } - } -}); + }); println!( "Processed {} queries total.", diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index 3f59278c..960bc68d 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -67,8 +67,8 @@ def test_simple(runtmp, zip_against): print(os.listdir(runtmp.output(''))) - g_output = runtmp.output('SRR606249.sig.gz.gather.csv') - p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') assert os.path.exists(p_output) # check prefetch output (only non-indexed gather) @@ -79,6 +79,7 @@ def test_simple(runtmp, zip_against): assert os.path.exists(g_output) df = pandas.read_csv(g_output) + print(df) assert len(df) == 3 keys = set(df.keys()) assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} @@ -109,9 +110,8 @@ def test_simple_zip_query(runtmp): print(os.listdir(runtmp.output(''))) - # outputs are based on md5sum, e.g. "{md5}.sig.gz.gather.csv" - g_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.gather.csv') - p_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.prefetch.csv') + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') # check prefetch output (only non-indexed gather) assert os.path.exists(p_output) @@ -294,10 +294,7 @@ def test_nomatch_query(runtmp, capfd, indexed, zip_query): captured = capfd.readouterr() print(captured.err) - if zip_query: - assert "WARNING: no compatible sketches in path " not in captured.err - else: - assert "WARNING: no compatible sketches in path " in captured.err + # assert "WARNING: no compatible sketches in path " in captured.err assert "WARNING: skipped 1 query paths - no compatible signatures." 
in captured.err @@ -324,7 +321,7 @@ def test_missing_against(runtmp, capfd, zip_against): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err def test_bad_against(runtmp, capfd): @@ -341,7 +338,7 @@ def test_bad_against(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err + assert 'Error: invalid line in fromfile' in captured.err def test_bad_against_2(runtmp, capfd): @@ -390,7 +387,7 @@ def test_bad_against_3(runtmp, capfd, zip_query): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert 'InvalidArchive' in captured.err def test_empty_against(runtmp, capfd): @@ -409,7 +406,7 @@ def test_empty_against(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert "Loaded 0 search signature(s)" in captured.err + assert "Sketch loading error: No such file or directory" in captured.err assert "Error: No search signatures loaded, exiting." in captured.err @@ -465,11 +462,8 @@ def test_md5(runtmp, zip_query): print(os.listdir(runtmp.output(''))) - g_output = runtmp.output('SRR606249.sig.gz.gather.csv') - p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') - if zip_query: - g_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.gather.csv') - p_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.prefetch.csv') + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') # check prefetch output (only non-indexed gather) assert os.path.exists(p_output) @@ -560,11 +554,8 @@ def test_csv_columns_vs_sourmash_prefetch(runtmp, zip_query, zip_against): finally: os.chdir(cwd) - g_output = runtmp.output('SRR606249.sig.gz.gather.csv') - p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') - if zip_query: - g_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.gather.csv') - p_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.prefetch.csv') + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') assert os.path.exists(p_output) assert os.path.exists(g_output) @@ -627,14 +618,14 @@ def test_simple_protein(runtmp): # test basic protein execution sigs = get_test_data('protein.zip') - sig_names = ["GCA_001593935.1_ASM159393v1_protein.faa.gz", "GCA_001593925.1_ASM159392v1_protein.faa.gz"] + sig_names = ["GCA_001593935", "GCA_001593925"] runtmp.sourmash('scripts', 'fastmultigather', sigs, sigs, '-s', '100', '--moltype', 'protein', '-k', '19') for qsig in sig_names: - g_output = runtmp.output(os.path.join(qsig + '.sig.gather.csv')) - p_output = runtmp.output(os.path.join(qsig + '.sig.prefetch.csv')) + g_output = runtmp.output(os.path.join(qsig + '.gather.csv')) + p_output = runtmp.output(os.path.join(qsig + '.prefetch.csv')) print(g_output) assert os.path.exists(g_output) assert os.path.exists(p_output) @@ -652,14 +643,14 @@ def test_simple_dayhoff(runtmp): # test basic protein execution sigs = get_test_data('dayhoff.zip') - sig_names = ["GCA_001593935.1_ASM159393v1_protein.faa.gz", "GCA_001593925.1_ASM159392v1_protein.faa.gz"] + sig_names = ["GCA_001593935", "GCA_001593925"] runtmp.sourmash('scripts', 'fastmultigather', sigs, sigs, '-s', '100', '--moltype', 'dayhoff', '-k', '19') for qsig in sig_names: - g_output = runtmp.output(os.path.join(qsig + '.sig.gather.csv')) - 
p_output = runtmp.output(os.path.join(qsig + '.sig.prefetch.csv')) + g_output = runtmp.output(os.path.join(qsig + '.gather.csv')) + p_output = runtmp.output(os.path.join(qsig + '.prefetch.csv')) print(g_output) assert os.path.exists(g_output) assert os.path.exists(p_output) @@ -677,14 +668,14 @@ def test_simple_hp(runtmp): # test basic protein execution sigs = get_test_data('hp.zip') - sig_names = ["GCA_001593935.1_ASM159393v1_protein.faa.gz", "GCA_001593925.1_ASM159392v1_protein.faa.gz"] + sig_names = ["GCA_001593935", "GCA_001593925"] runtmp.sourmash('scripts', 'fastmultigather', sigs, sigs, '-s', '100', '--moltype', 'hp', '-k', '19') for qsig in sig_names: - g_output = runtmp.output(os.path.join(qsig + '.sig.gather.csv')) - p_output = runtmp.output(os.path.join(qsig + '.sig.prefetch.csv')) + g_output = runtmp.output(os.path.join(qsig + '.gather.csv')) + p_output = runtmp.output(os.path.join(qsig + '.prefetch.csv')) print(g_output) assert os.path.exists(g_output) assert os.path.exists(p_output) diff --git a/src/utils.rs b/src/utils.rs index 7824113f..a6b07b02 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -4,7 +4,7 @@ use sourmash::encodings::HashFunctions; use sourmash::manifest::Manifest; use sourmash::selection::Select; -use std::fs::File; +use std::fs::{create_dir_all, File}; use std::io::Read; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::panic; @@ -161,7 +161,8 @@ pub fn prefetch( .filter_map(|result| { let mut mm = None; let searchsig = &result.minhash; - let overlap = searchsig.count_common(query_mh, false); + // TODO: fix Select so we can go back to downsample: false here + let overlap = searchsig.count_common(query_mh, true); if let Ok(overlap) = overlap { if overlap >= threshold_hashes { let result = PrefetchResult { overlap, ..result }; @@ -174,18 +175,27 @@ pub fn prefetch( } /// Write list of prefetch matches. -// pub fn write_prefetch + std::fmt::Debug + std::fmt::Display + Clone>( pub fn write_prefetch( query: &SigStore, prefetch_output: Option, matchlist: &BinaryHeap, -) -> Result<()> { - // Set up a writer for prefetch output - let prefetch_out: Box = match prefetch_output { - Some(path) => Box::new(BufWriter::new(File::create(path).unwrap())), - None => Box::new(std::io::stdout()), - }; - let mut writer = BufWriter::new(prefetch_out); +) -> Result<(), Box> { + // Define the writer to stdout by default + let mut writer: Box = Box::new(std::io::stdout()); + + if let Some(output_path) = &prefetch_output { + // Account for potential missing dir in output path + let directory_path = Path::new(output_path).parent(); + + // If a directory path exists in the filename, create it if it doesn't already exist + if let Some(dir) = directory_path { + create_dir_all(dir)?; + } + + let file = File::create(output_path)?; + writer = Box::new(BufWriter::new(file)); + } + writeln!( &mut writer, "query_filename,query_name,query_md5,match_name,match_md5,intersect_bp" @@ -860,18 +870,27 @@ pub fn report_on_sketch_loading( /// Execute the gather algorithm, greedy min-set-cov, by iteratively /// removing matches in 'matchlist' from 'query'. -pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Display + Clone>( +pub fn consume_query_by_gather( query: SigStore, matchlist: BinaryHeap, threshold_hashes: u64, - gather_output: Option
, + gather_output: Option, ) -> Result<()> { - // Set up a writer for gather output - let gather_out: Box = match gather_output { - Some(path) => Box::new(BufWriter::new(File::create(path).unwrap())), - None => Box::new(std::io::stdout()), - }; - let mut writer = BufWriter::new(gather_out); + // Define the writer to stdout by default + let mut writer: Box = Box::new(std::io::stdout()); + + if let Some(output_path) = &gather_output { + // Account for potential missing dir in output path + let directory_path = Path::new(output_path).parent(); + + // If a directory path exists in the filename, create it if it doesn't already exist + if let Some(dir) = directory_path { + create_dir_all(dir)?; + } + + let file = File::create(output_path)?; + writer = Box::new(BufWriter::new(file)); + } writeln!( &mut writer, "query_filename,rank,query_name,query_md5,match_name,match_md5,intersect_bp" @@ -881,12 +900,10 @@ pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Disp let mut matching_sketches = matchlist; let mut rank = 0; - let mut last_hashes = query.size(); let mut last_matches = matching_sketches.len(); // let location = query.location; - let location = query.filename(); - // let mut query_mh = query.minhash; + let location = query.filename(); // this is different (original fasta filename) than query.location was (sig name)!! let sketches = query.sketches(); let orig_query_mh = match sketches.get(0) { @@ -894,12 +911,13 @@ pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Disp _ => Err(anyhow::anyhow!("No MinHash found")), }?; let mut query_mh = orig_query_mh.clone(); + let mut last_hashes = orig_query_mh.size(); eprintln!( "{} iter {}: start: query hashes={} matches={}", location, rank, - query.size(), + orig_query_mh.size(), matching_sketches.len() ); From 363b90d382103e7b00857d957f483980ad04b635 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 30 Jan 2024 14:58:23 -0800 Subject: [PATCH 24/40] re-allow load from sig; upd manysearch --- Cargo.lock | 2 +- Cargo.toml | 4 +- src/check.rs | 5 +- src/fastgather.rs | 85 ++++++++------- src/fastmultigather.rs | 13 ++- src/lib.rs | 23 ++--- src/manysearch.rs | 148 ++++++++++++++------------- src/mastiff_manygather.rs | 117 +++++++++++---------- src/python/tests/test_gather.py | 15 +-- src/python/tests/test_multigather.py | 65 +++++++++--- src/python/tests/test_search.py | 32 +++--- src/utils.rs | 130 +++++++++++------------ 12 files changed, 348 insertions(+), 291 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a9e84652..fe8ac5a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1396,7 +1396,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.12.0" -source = "git+https://github.com/sourmash-bio/sourmash?rev=94b88cc314f781342721addc5ed35c531732a9b6#94b88cc314f781342721addc5ed35c531732a9b6" +source = "git+https://github.com/sourmash-bio/sourmash?rev=409aeb415ba8b04b9c09f203817d67791afa96da#409aeb415ba8b04b9c09f203817d67791afa96da" dependencies = [ "az", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 21b3976e..10b0afec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,8 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.20.2", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.195", features = ["derive"] } -sourmash = { git = "https://github.com/sourmash-bio/sourmash", rev= "94b88cc314f781342721addc5ed35c531732a9b6", features = ["branchwater"] } -#sourmash = { version = "0.12.0", features = ["branchwater"] } +sourmash = { git = "https://github.com/sourmash-bio/sourmash", rev= "409aeb415ba8b04b9c09f203817d67791afa96da", features = ["branchwater"] } +#sourmash = { version = "0.12.1", features = ["branchwater"] } serde_json = "1.0.111" niffler = "2.4.0" log = "0.4.14" diff --git a/src/check.rs b/src/check.rs index 1311318c..2995284b 100644 --- a/src/check.rs +++ b/src/check.rs @@ -4,10 +4,7 @@ use sourmash::index::revindex::{RevIndex, RevIndexOps}; pub fn check(index: camino::Utf8PathBuf, quick: bool) -> Result<(), Box> { if !is_revindex_database(&index) { - bail!( - "'{}' is not a valid RevIndex database", - index - ); + bail!("'{}' is not a valid RevIndex database", index); } println!("Opening DB"); diff --git a/src/fastgather.rs b/src/fastgather.rs index 2ff13509..5e8e3b07 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -1,20 +1,24 @@ /// fastgather: Run gather with a query against a list of files. 
use anyhow::Result; -use sourmash::sketch::Sketch; -use sourmash::signature::Signature; +use serde::Serialize; use sourmash::selection::Selection; -use camino; -use std::collections::BinaryHeap; +use sourmash::signature::Signature; +use sourmash::sketch::Sketch; +// use camino; use crate::utils::PrefetchResult; +use std::collections::BinaryHeap; + +use sourmash::prelude::Select; use crate::utils::{ - consume_query_by_gather, load_sketches_above_threshold, write_prefetch, ReportType, load_collection + consume_query_by_gather, load_collection, load_sketches_above_threshold, write_prefetch, + ReportType, }; pub fn fastgather( - query_filepath: camino::Utf8PathBuf, - against_filepath: camino::Utf8PathBuf, + query_filepath: &camino::Utf8PathBuf, + against_filepath: &camino::Utf8PathBuf, threshold_bp: usize, ksize: u8, scaled: usize, @@ -22,42 +26,45 @@ pub fn fastgather( gather_output: Option, prefetch_output: Option, ) -> Result<()> { + let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; + let mut query_sig = None; + let mut query_mh = None; - let query_collection = load_collection(&query_filepath, selection, ReportType::Query)?; + for (idx, record) in query_collection.iter() { + if let Ok(sig) = query_collection + .sig_for_dataset(idx) + .unwrap() + .select(&selection) + { + query_sig = Some(sig.clone()); - if query_collection.len() > 1 { - bail!("Found more than one compatible sketch from '{}'. Fastgather requires a single query sketch.", &query_filepath) - } - // load query sig into memory - let mut query_mh = None; - let mut query_sig = None; - for (idx, _record) in query_collection.iter() { - // Load query sig - match query_collection.sig_for_dataset(idx) { - Ok(query_sig) => { - for sketch in query_sig.iter() { - // Access query MinHash - if let Sketch::MinHash(query) = sketch { - query_mh = Some(query.clone()); - break; - } + for sketch in sig.iter() { + // Access query MinHash + if let Sketch::MinHash(mh) = sketch { + query_mh = Some(mh.clone()); + // eprintln!("mh mins: {:?}", mh.mins()); } } - Err(_) => { - bail!("No query sketch matching selection parameters.") // should not get here bc we already check this during collection loading? - } + } else { + eprintln!("Failed to load 'query sig: {}", record.name()); } + } + if query_mh.is_none() { + bail!("No query sketch matching selection parameters."); + } - if query_mh.is_some() { - break; // Exit the loop if we found a MinHash sketch - } + if query_collection.len() != 1 { + bail!( + "Fastgather requires a single query sketch. Check input: '{:?}'", + &query_filepath + ) } // build the list of paths to match against. eprintln!("Loading matchlist from '{}'", against_filepath); - let against_collection = load_collection(&against_filepath, selection, ReportType::Against)?; + let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; eprintln!("Loaded {} sig paths in matchlist", against_collection.len()); - + // calculate the minimum number of hashes based on desired threshold let threshold_hashes: u64 = { let x = threshold_bp / scaled; @@ -74,8 +81,8 @@ pub fn fastgather( threshold_hashes, threshold_bp ); - // load a set of sketches, filtering for those with overlaps > threshold - let result = load_sketches_above_threshold( + // load a set of sketches, filtering for those with overlaps > threshold + let result = load_sketches_above_threshold( against_collection, &selection, &query_mh.unwrap(), @@ -107,6 +114,12 @@ pub fn fastgather( } // run the gather! 
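The conversion above turns the base-pair threshold into a minimum number of shared hashes (threshold_bp / scaled). The closing part of the expression falls outside the hunk context, but the form used elsewhere in this plugin floors the result at 1 so that a threshold smaller than one scaled unit still requires some overlap; a self-contained sketch under that assumption:

    /// Minimum number of shared hashes for a bp threshold at a given scaled value.
    fn threshold_hashes(threshold_bp: usize, scaled: usize) -> u64 {
        let x = threshold_bp / scaled;
        if x > 0 {
            x as u64
        } else {
            1
        }
    }

    fn main() {
        assert_eq!(threshold_hashes(300_000, 100_000), 3);
        assert_eq!(threshold_hashes(50_000, 100_000), 1); // floored at 1
    }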
- consume_query_by_gather(query_sig.clone().unwrap(), matchlist, threshold_hashes, gather_output).ok(); + consume_query_by_gather( + query_sig.clone().unwrap(), + matchlist, + threshold_hashes, + gather_output, + ) + .ok(); Ok(()) -} \ No newline at end of file +} diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 71a0e174..d7537c8a 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -3,6 +3,7 @@ use anyhow::Result; use rayon::prelude::*; use serde::Serialize; +use sourmash::prelude::Select; use sourmash::selection::Selection; use sourmash::sketch::Sketch; use sourmash::storage::SigStore; @@ -16,8 +17,7 @@ use std::collections::BinaryHeap; use camino::{Utf8Path, Utf8PathBuf}; use crate::utils::{ - consume_query_by_gather, load_collection, load_sigpaths_from_zip_or_pathlist, - load_sketches_from_zip_or_pathlist, prepare_query, write_prefetch, PrefetchResult, ReportType, + consume_query_by_gather, load_collection, write_prefetch, PrefetchResult, ReportType, }; pub fn fastmultigather( @@ -50,7 +50,11 @@ pub fn fastmultigather( let mut sketchlist: Vec = vec![]; for (idx, record) in against_collection.iter() { - if let Ok(sig) = against_collection.sig_for_dataset(idx) { + if let Ok(sig) = against_collection.sig_for_dataset(idx) + // .unwrap() + // .select(&selection) // if we select here, we downsample and the md5sum changes! + // ...which means we would lose the original md5sum that is used in the standard gather results. + { sketchlist.push(sig); } else { eprintln!("Failed to load 'against' record: {}", record.name()); @@ -74,13 +78,14 @@ pub fn fastmultigather( // Access query MinHash if let Sketch::MinHash(query) = sketch { let matchlist: BinaryHeap = sketchlist - .par_iter() + .iter() .filter_map(|sm| { let mut mm = None; // Access against MinHash if let Some(sketch) = sm.sketches().get(0) { if let Sketch::MinHash(against_sketch) = sketch { if let Ok(overlap) = + // downsample here to just get downsampled mh and avoid changing md5sum against_sketch.count_common(&query, true) { if overlap >= threshold_hashes { diff --git a/src/lib.rs b/src/lib.rs index 18f8e9de..5e55e1f3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,8 +6,8 @@ use sourmash::selection; extern crate simple_error; mod utils; -use crate::utils::{build_template, build_selection}; use crate::utils::is_revindex_database; +use crate::utils::{build_selection, build_template}; mod check; mod fastgather; mod fastmultigather; @@ -33,15 +33,14 @@ fn do_manysearch( moltype: String, output_path: Option, ) -> anyhow::Result { - let queryfile_path: camino::Utf8PathBuf = querylist_path.clone().into(); let againstfile_path: camino::Utf8PathBuf = siglist_path.clone().into(); let selection = build_selection(ksize, scaled, &moltype); + eprintln!("selection scaled: {:?}", selection.scaled()); // if siglist_path is revindex, run mastiff_manysearch; otherwise run manysearch - let template = build_template(ksize, scaled, &moltype); if is_revindex_database(&againstfile_path) { - // if is_revindex_database(siglist_path.as_ref()) { + // if is_revindex_database(siglist_path.as_ref()) { match mastiff_manysearch::mastiff_manysearch( queryfile_path, againstfile_path, @@ -57,9 +56,9 @@ fn do_manysearch( } } else { match manysearch::manysearch( - querylist_path, - siglist_path, - template, + &queryfile_path, + &againstfile_path, + &selection, threshold, output_path, ) { @@ -85,11 +84,12 @@ fn do_fastgather( ) -> anyhow::Result { let queryfile_path: camino::Utf8PathBuf = query_filename.into(); let againstfile_path: 
camino::Utf8PathBuf = siglist_path.into(); + let selection = build_selection(ksize, scaled, &moltype); - + match fastgather::fastgather( - queryfile_path, - againstfile_path, + &queryfile_path, + &againstfile_path, threshold_bp, ksize, scaled, @@ -115,11 +115,10 @@ fn do_fastmultigather( moltype: String, output_path: Option, ) -> anyhow::Result { - let queryfile_path: camino::Utf8PathBuf = query_filenames.into(); let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); let selection = build_selection(ksize, scaled, &moltype); - + // if a siglist path is a revindex, run mastiff_manygather. If not, run multigather if is_revindex_database(&againstfile_path) { match mastiff_manygather::mastiff_manygather( diff --git a/src/manysearch.rs b/src/manysearch.rs index a95f8d69..53f25e3c 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -6,44 +6,47 @@ use anyhow::Result; use rayon::prelude::*; -use sourmash::signature::{Signature, SigsTrait}; +use sourmash::prelude::Select; +use sourmash::selection::Selection; +use sourmash::signature::SigsTrait; use sourmash::sketch::Sketch; +use sourmash::storage::SigStore; use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use crate::utils::{ - csvwriter_thread, load_sigpaths_from_zip_or_pathlist, load_sketches_from_zip_or_pathlist, - prepare_query, ReportType, SearchResult, -}; +use crate::utils::{csvwriter_thread, load_collection, ReportType, SearchResult}; pub fn manysearch>( - querylist: P, - siglist: P, - template: Sketch, + query_filepath: &camino::Utf8PathBuf, + against_filepath: &camino::Utf8PathBuf, + selection: &Selection, threshold: f64, output: Option
, ) -> Result<()> { // Read in list of query paths. - eprintln!( - "Reading list of queries from: '{}'", - querylist.as_ref().display() - ); - - // Load all queries into memory at once. - let queries = load_sketches_from_zip_or_pathlist(querylist, &template, ReportType::Query)?; - - // Load all _paths_, not signatures, into memory. - let siglist_name = siglist.as_ref().to_string_lossy().to_string(); - let (search_sigs_paths, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(siglist, &template, ReportType::Against)?; - - if search_sigs_paths.is_empty() { - bail!("No signatures to search loaded, exiting."); + eprintln!("Reading queries from: '{}'", query_filepath); + + // Load all query sigs into memory at once. + let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; + // load actual signatures + let mut query_sketchlist: Vec = vec![]; + + for (idx, record) in query_collection.iter() { + if let Ok(sig) = query_collection + .sig_for_dataset(idx) + .unwrap() + .select(&selection) + { + query_sketchlist.push(sig); + } else { + eprintln!("Failed to load 'query' sig: {}", record.name()); + } } - eprintln!("Loaded {} sig paths to search.", search_sigs_paths.len()); + // Load all _paths_, not signatures, into memory. + let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); @@ -61,9 +64,9 @@ pub fn manysearch>( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let send = search_sigs_paths + let send = against_collection .par_iter() - .filter_map(|filename| { + .filter_map(|(idx, record)| { let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); if i % 1000 == 0 { eprintln!("Processed {} search sigs", i); @@ -71,60 +74,65 @@ pub fn manysearch>( let mut results = vec![]; - // load search signature from path: - match Signature::from_path(filename) { - Ok(search_sigs) => { - let location = filename.display().to_string(); - if let Some(search_sm) = prepare_query(&search_sigs, &template, &location) { - // search for matches & save containment. 
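The manysearch rewrite in this hunk emits one row per query/target pair whose containment clears the threshold; the reported statistics are plain ratios over the shared-hash count and the two sketch sizes. As standalone arithmetic (function and variable names are illustrative):

    /// Containment, max-containment, and Jaccard for one query/target pair,
    /// given the number of shared hashes and the two sketch sizes.
    fn pair_stats(overlap: f64, query_size: f64, target_size: f64) -> (f64, f64, f64) {
        let containment_query_in_target = overlap / query_size;
        let containment_target_in_query = overlap / target_size;
        let max_containment = containment_query_in_target.max(containment_target_in_query);
        let jaccard = overlap / (query_size + target_size - overlap);
        (containment_query_in_target, max_containment, jaccard)
    }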
- for q in queries.iter() { - let overlap = - q.minhash.count_common(&search_sm.minhash, false).unwrap() as f64; - let query_size = q.minhash.size() as f64; - let target_size = search_sm.minhash.size() as f64; - - let containment_query_in_target = overlap / query_size; - let containment_in_target = overlap / target_size; - let max_containment = - containment_query_in_target.max(containment_in_target); - let jaccard = overlap / (target_size + query_size - overlap); - - if containment_query_in_target > threshold { - results.push(SearchResult { - query_name: q.name.clone(), - query_md5: q.md5sum.clone(), - match_name: search_sm.name.clone(), - containment: containment_query_in_target, - intersect_hashes: overlap as usize, - match_md5: Some(search_sm.md5sum.clone()), - jaccard: Some(jaccard), - max_containment: Some(max_containment), - }); + match against_collection.sig_for_dataset(idx) { + Ok(against_sig) => match against_sig.select(selection) { + Ok(against_sig) => { + for sketch in against_sig.iter() { + if let Sketch::MinHash(against_mh) = sketch { + for query_sig in query_sketchlist.iter() { + for sketch in query_sig.iter() { + if let Sketch::MinHash(query_mh) = sketch { + let overlap = + query_mh.count_common(&against_mh, false).unwrap() + as f64; + let query_size = query_mh.size() as f64; + let target_size = against_mh.size() as f64; + + let containment_query_in_target = overlap / query_size; + let containment_in_target = overlap / target_size; + let max_containment = containment_query_in_target + .max(containment_in_target); + let jaccard = + overlap / (target_size + query_size - overlap); + + if containment_query_in_target > threshold { + results.push(SearchResult { + query_name: query_sig.name(), + query_md5: query_mh.md5sum(), + match_name: against_sig.name(), + containment: containment_query_in_target, + intersect_hashes: overlap as usize, + match_md5: Some(against_mh.md5sum()), + jaccard: Some(jaccard), + max_containment: Some(max_containment), + }); + } + } + } + } } } - } else { - // for reading zips, this is likely not a useful warning and - // would show up too often (every sig is stored as individual file). - if !siglist_name.ends_with(".zip") { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - filename.display() - ); - } + } + Err(err) => { + eprintln!("Sketch selection error: {}", err); + eprintln!( + "WARNING: no compatible sketches in path '{}'", + record.internal_location() + ); let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - Some(results) - } + }, Err(err) => { - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); eprintln!("Sketch loading error: {}", err); eprintln!( - "WARNING: could not load sketches from path '{}'", - filename.display() + "WARNING: no compatible sketches in path '{}'", + record.internal_location() ); - None + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } + + Some(results) }) .flatten() .try_for_each_with(send, |s, m| s.send(m)); diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 19da5728..2175e759 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -21,7 +21,6 @@ use std::io::{BufWriter, Write}; use crate::utils::{is_revindex_database, load_collection, ReportType}; - pub fn mastiff_manygather>( queries_file: camino::Utf8PathBuf, index: camino::Utf8PathBuf, @@ -30,10 +29,7 @@ pub fn mastiff_manygather>( output: Option
, ) -> Result<(), Box> { if !is_revindex_database(&index) { - bail!( - "'{}' is not a valid RevIndex database", - index - ); + bail!("'{}' is not a valid RevIndex database", index); } // Open database once let db = RevIndex::open(index, true)?; @@ -77,67 +73,70 @@ pub fn mastiff_manygather>( let failed_paths = AtomicUsize::new(0); let send = query_collection - .par_iter() - .filter_map(|(idx, record)| { - let threshold = threshold_bp / selection.scaled()? as usize; - - match query_collection.sig_for_dataset(idx) { - // match query_collection.sig_from_record(record) { // to be added in core - Ok(query_sig) => { - let mut results = vec![]; - let mut found_compatible_sketch = false; - for sketch in query_sig.iter() { - if let Sketch::MinHash(query) = sketch { - found_compatible_sketch = true; - // Gather! - let (counter, query_colors, hash_to_color) = - db.prepare_gather_counters(&query); - - let matches = db.gather( - counter, - query_colors, - hash_to_color, - threshold, - &query, - Some(selection.clone()), - ); - // extract results - if let Ok(matches) = matches { - for match_ in &matches { - results.push(( - query_sig.name().clone(), - query.md5sum().clone(), - match_.name().clone(), - match_.md5().clone(), - match_.f_match(), // f_match_query - match_.intersect_bp(), - )); // intersect_bp + .par_iter() + .filter_map(|(idx, record)| { + let threshold = threshold_bp / selection.scaled()? as usize; + + match query_collection.sig_for_dataset(idx) { + // match query_collection.sig_from_record(record) { // to be added in core + Ok(query_sig) => { + let mut results = vec![]; + let mut found_compatible_sketch = false; + for sketch in query_sig.iter() { + if let Sketch::MinHash(query) = sketch { + found_compatible_sketch = true; + // Gather! + let (counter, query_colors, hash_to_color) = + db.prepare_gather_counters(&query); + + let matches = db.gather( + counter, + query_colors, + hash_to_color, + threshold, + &query, + Some(selection.clone()), + ); + // extract results + if let Ok(matches) = matches { + for match_ in &matches { + results.push(( + query_sig.name().clone(), + query.md5sum().clone(), + match_.name().clone(), + match_.md5().clone(), + match_.f_match(), // f_match_query + match_.intersect_bp(), + )); // intersect_bp + } + } else { + eprintln!("Error gathering matches: {:?}", matches.err()); } - } else { - eprintln!("Error gathering matches: {:?}", matches.err()); } } - } - if !found_compatible_sketch { - eprintln!("WARNING: no compatible sketches in path '{}'", query_sig.filename()); - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); - } + if !found_compatible_sketch { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + query_sig.filename() + ); + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + } - if results.is_empty() { + if results.is_empty() { + None + } else { + Some(results) + } + } + Err(err) => { + eprintln!("Error loading sketch: {}", err); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); None - } else { - Some(results) } } - Err(err) => { - eprintln!("Error loading sketch: {}", err); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None - } - } - }) - .flatten() - .try_for_each_with(send, |s, m| s.send(m)); + }) + .flatten() + .try_for_each_with(send, |s, m| s.send(m)); // do some cleanup and error handling - if let Err(e) = send { diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index 2b59ea2b..d4649a63 100644 --- a/src/python/tests/test_gather.py +++ 
b/src/python/tests/test_gather.py @@ -120,7 +120,8 @@ def test_missing_query(runtmp, capfd, zip_against): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err + @pytest.mark.parametrize('zip_against', [False, True]) def test_bad_query(runtmp, capfd, zip_against): @@ -132,9 +133,9 @@ def test_bad_query(runtmp, capfd, zip_against): sig47 = get_test_data('47.fa.sig.gz') sig63 = get_test_data('63.fa.sig.gz') - # since 'query' needs to be a sig, this breaks it. - make_file_list(query, [sig2]) - + # query doesn't need to be a sig anymore - sig, zip, or pathlist welcome + # as long as there's only one sketch that matches params + make_file_list(query, [sig2,sig47]) # [sig2] make_file_list(against_list, [sig2, sig47, sig63]) if zip_against: @@ -151,7 +152,7 @@ def test_bad_query(runtmp, capfd, zip_against): captured = capfd.readouterr() print(captured.err) - assert 'Error: expected value at line 1' in captured.err + assert 'Error: Fastgather requires a single query sketch. Check input:' in captured.err @pytest.mark.parametrize('zip_against', [False, True]) @@ -179,7 +180,7 @@ def test_missing_against(runtmp, capfd, zip_against): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err def test_bad_against(runtmp, capfd): @@ -199,7 +200,7 @@ def test_bad_against(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err + assert 'Error: invalid line in fromfile' in captured.err def test_bad_against_2(runtmp, capfd): diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index 960bc68d..7ec636ba 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -188,8 +188,10 @@ def test_missing_querylist(runtmp, capfd, indexed, zip_query): @pytest.mark.parametrize('indexed', [False, True]) -def test_bad_query(runtmp, capfd, indexed): - # test bad querylist (a sig file) +def test_sig_query(runtmp, capfd, indexed): + # sig file is now fine as a query + query = get_test_data('SRR606249.sig.gz') + against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') @@ -200,19 +202,37 @@ def test_bad_query(runtmp, capfd, indexed): if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + g_output = runtmp.output('out.csv') + else: + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', sig2, against_list, - '-s', '100000') + runtmp.sourmash('scripts', 'fastmultigather', query, against_list, + '-s', '100000', '-o', g_output) captured = capfd.readouterr() print(captured.err) + if not indexed: + # check prefetch output (only non-indexed gather) + assert os.path.exists(p_output) + df = pandas.read_csv(p_output) + assert len(df) == 3 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} - assert 'Error: invalid line in fromfile' in captured.err + # check gather output (both) + assert os.path.exists(g_output) + df = pandas.read_csv(g_output) + assert len(df) == 3 + keys = set(df.keys()) + if indexed: + assert keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 
'intersect_bp'} + else: + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} @pytest.mark.parametrize('indexed', [False, True]) -def test_bad_query_2(runtmp, capfd, indexed): +def test_bad_query(runtmp, capfd, indexed): # test with a bad query (a .sig.gz file renamed as zip file) against_list = runtmp.output('against.txt') @@ -324,24 +344,37 @@ def test_missing_against(runtmp, capfd, zip_against): assert 'Error: No such file or directory' in captured.err -def test_bad_against(runtmp, capfd): - # test bad 'against' file - in this case, use a .sig.gz file. +def test_sig_against(runtmp, capfd): + # against file can be a sig now query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', query, sig2, + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') + runtmp.sourmash('scripts', 'fastmultigather', query, sig2, '-s', '100000') captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile' in captured.err + # check prefetch output (only non-indexed gather) + assert os.path.exists(p_output) + df = pandas.read_csv(p_output) + assert len(df) == 1 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} + + # check gather output + assert os.path.exists(g_output) + df = pandas.read_csv(g_output) + assert len(df) == 1 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} -def test_bad_against_2(runtmp, capfd): +def test_bad_against(runtmp, capfd): # test bad 'against' file - in this case, one containing a nonexistent file query = get_test_data('SRR606249.sig.gz') query_list = runtmp.output('query.txt') @@ -363,8 +396,8 @@ def test_bad_against_2(runtmp, capfd): @pytest.mark.parametrize('zip_query', [False, True]) -def test_bad_against_3(runtmp, capfd, zip_query): - # test with a bad query (a .sig.gz file renamed as zip file) +def test_bad_against_2(runtmp, capfd, zip_query): + # test with a bad against (a .sig.gz file renamed as zip file) query = get_test_data('SRR606249.sig.gz') query_list = runtmp.output('query.txt') make_file_list(query_list, [query]) @@ -382,7 +415,7 @@ def test_bad_against_3(runtmp, capfd, zip_query): with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'fastmultigather', query_list, against_zip, - '-o', output) + '-s', '100000', '-o', output) captured = capfd.readouterr() print(captured.err) diff --git a/src/python/tests/test_search.py b/src/python/tests/test_search.py index 6f29ec9b..8af6bf3f 100644 --- a/src/python/tests/test_search.py +++ b/src/python/tests/test_search.py @@ -251,8 +251,8 @@ def test_missing_query(runtmp, capfd, indexed, zip_query): @pytest.mark.parametrize("indexed", [False, True]) -def test_bad_query(runtmp, capfd, indexed): - # test with a bad query (a .sig.gz file) +def test_sig_query(runtmp, capfd, indexed): + # test with a single sig query (a .sig.gz file) against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') @@ -266,14 +266,14 @@ def test_bad_query(runtmp, capfd, indexed): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch', sig2, against_list, + # with 
pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'manysearch', sig2, against_list, '-o', output) - captured = capfd.readouterr() - print(captured.err) + # captured = capfd.readouterr() + # print(captured.err) - assert 'Error: invalid line in fromfile' in captured.err + # assert 'Error: invalid line in fromfile' in captured.err @pytest.mark.parametrize("indexed", [False, True]) @@ -352,34 +352,34 @@ def test_missing_against(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err -def test_bad_against(runtmp, capfd): - # test with a bad against list (a .sig file in this case) +def test_nomatch_against(runtmp, capfd): + # nonmatching against file (num sig) query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') sig47 = get_test_data('47.fa.sig.gz') sig63 = get_test_data('63.fa.sig.gz') + # nomatch_sketch = get_test_data('genome-s11.fa.gz.sig') + nomatch_sketch = get_test_data('SRR606249.sig.gz') make_file_list(query_list, [sig2, sig47, sig63]) - #make_file_list(against_list, [sig2, sig47, sig63]) + make_file_list(against_list, [nomatch_sketch]) output = runtmp.output('out.csv') with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch', query_list, sig2, + runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) captured = capfd.readouterr() - print(captured.err) + assert "No search signatures loaded, exiting." in captured.err - assert 'Error: invalid line in fromfile ' in captured.err - -def test_bad_against_2(runtmp, capfd): +def test_bad_against(runtmp, capfd): # test with a bad against list (a missing file) query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') diff --git a/src/utils.rs b/src/utils.rs index a6b07b02..be99bab6 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -693,62 +693,81 @@ pub fn load_collection( } let mut n_failed = 0; - let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { + let mut collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { match Collection::from_zipfile(&sigpath) { Ok(collection) => collection, - Err(_) => { - bail!("failed to load {} zipfile: '{}'", report_type, sigpath); - } + Err(_) => bail!("failed to load {} zipfile: '{}'", report_type, sigpath), } } else { - let sketchlist_file = BufReader::new(File::open(sigpath)?); - - let records: Vec = sketchlist_file - .lines() - .filter_map(|line| { - let path = match line { - Ok(path) => path, - Err(err) => { - eprintln!("Error: invalid line in fromfile"); - return None; // Skip - } - }; - - match Signature::from_path(&path) { - Ok(signatures) => { - let recs: Vec = signatures - .into_iter() - .flat_map(|v| Record::from_sig(&v, path.as_str())) - .collect(); - Some(recs) - } - Err(err) => { - eprintln!("Sketch loading error: {}", err); - eprintln!("WARNING: could not load sketches from path '{}'", path); - n_failed += 1; - None - } + // if pathlist is just a signature path, load it into a collection + match Signature::from_path(sigpath) { + Ok(signatures) => { + // Load the collection from the signature + match Collection::from_sigs(signatures) { + Ok(collection) => collection, + Err(_) => bail!( + "loaded {} signatures but failed to load as collection: '{}'", + report_type, + sigpath + ), } - }) - .flatten() - .collect(); - - let manifest: Manifest = 
records.into(); - - Collection::new( - manifest, - InnerStorage::new( - FSStorage::builder() - .fullpath("".into()) - .subdir("".into()) - .build(), - ), - ) + } + // if not, try to load file as list of sig paths + Err(_) => { + // // using core fn doesn't allow us to ignore failed paths; I reimplement loading here to allow + let sketchlist_file = BufReader::new(File::open(sigpath)?); + let records: Vec = sketchlist_file + .lines() + .filter_map(|line| { + let path = line.ok()?; + match Signature::from_path(&path) { + Ok(signatures) => { + let recs: Vec = signatures + .into_iter() + .flat_map(|v| Record::from_sig(&v, &path)) + .collect(); + Some(recs) + } + Err(err) => { + eprintln!("Sketch loading error: {}", err); + eprintln!("WARNING: could not load sketches from path '{}'", path); + n_failed += 1; + None + } + } + }) + .flatten() + .collect(); + + let manifest: Manifest = records.into(); + Collection::new( + manifest, + InnerStorage::new( + FSStorage::builder() + .fullpath("".into()) + .subdir("".into()) + .build(), + ), + ) + } + } }; let n_total = collection.len(); - let selected = collection.select(&selection)?; + eprintln!("n_total: {}", n_total); + // collection = collection.select(selection)?; + let selected = collection.select(selection)?; + + if selected.len() == 1 { + let sig = selected.sig_for_dataset(0).unwrap(); + eprintln!("sig name: {:?}", sig.name()); + let mh = sig.minhash().unwrap(); + eprintln!("scaled= {:?}", mh.scaled()) + } + + eprintln!("selection_len: {}", selected.len()); let n_skipped = n_total - selected.len(); + // let n_skipped = n_total - collection.len(); report_on_collection_loading(&selected, n_skipped, n_failed, report_type)?; Ok(selected) } @@ -780,23 +799,6 @@ pub fn report_on_collection_loading( Ok(()) } -pub fn load_single_sig_from_collection( - query_collection: &Collection, // Replace with the actual type - selection: &Selection, -) -> Result { - let scaled = selection.scaled().unwrap(); - let ksize = selection.ksize().unwrap(); - - match query_collection.sig_for_dataset(0) { - Ok(sig) => Ok(sig), - Err(_) => Err(anyhow::anyhow!( - "No sketch found with scaled={}, k={}", - scaled, - ksize - )), - } -} - // pub fn load_single_sketch_from_sig<'a>(sig: &'a SigStore, selection: &'a Selection) -> Result<&'a KmerMinHash> { // let sketch = sig.sketches().get(0).ok_or_else(|| { // anyhow::anyhow!("No sketch found with scaled={}, k={}", selection.scaled().unwrap_or_default(), selection.ksize().unwrap_or_default()) From 0ea39b5b856ebda777fdb45e1b6ac6b322c0ffc7 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 30 Jan 2024 16:21:38 -0800 Subject: [PATCH 25/40] fix all except moltype selection --- src/fastgather.rs | 54 +++++++++++++-------------------- src/python/tests/test_gather.py | 23 ++++++++------ src/utils.rs | 23 ++++++++------ 3 files changed, 49 insertions(+), 51 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index 5e8e3b07..8680abaa 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -10,6 +10,7 @@ use crate::utils::PrefetchResult; use std::collections::BinaryHeap; use sourmash::prelude::Select; +use sourmash::signature::SigsTrait; use crate::utils::{ consume_query_by_gather, load_collection, load_sketches_above_threshold, write_prefetch, @@ -27,31 +28,6 @@ pub fn fastgather( prefetch_output: Option, ) -> Result<()> { let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; - let mut query_sig = None; - let mut query_mh = None; - - for (idx, record) in query_collection.iter() { - if let Ok(sig) = query_collection - .sig_for_dataset(idx) - .unwrap() - .select(&selection) - { - query_sig = Some(sig.clone()); - - for sketch in sig.iter() { - // Access query MinHash - if let Sketch::MinHash(mh) = sketch { - query_mh = Some(mh.clone()); - // eprintln!("mh mins: {:?}", mh.mins()); - } - } - } else { - eprintln!("Failed to load 'query sig: {}", record.name()); - } - } - if query_mh.is_none() { - bail!("No query sketch matching selection parameters."); - } if query_collection.len() != 1 { bail!( @@ -59,6 +35,22 @@ pub fn fastgather( &query_filepath ) } + // get single query sig and minhash + let query_sig = query_collection.sig_for_dataset(0)?; // need original md5sum, etc + // downsample + let query_sig_ds = query_sig.clone().select(selection)?; + let query_mh = match query_sig_ds.minhash() { + Some(query_mh) => query_mh, + None => { + bail!("No query sketch matching selection parameters."); + } + }; + // some debugging prints + // eprintln!("selection scaled: {:?}", selection.scaled()); + // eprintln!("selection ksize: {:?}", selection.ksize()); + // eprintln!("query ksize: {:?}", query_mh.ksize()); + // eprintln!("selection moltype: {:?}", selection.moltype()); + // eprintln!("query moltype: {:?}", query_sig.hash_function()); // build the list of paths to match against. eprintln!("Loading matchlist from '{}'", against_filepath); @@ -82,12 +74,8 @@ pub fn fastgather( ); // load a set of sketches, filtering for those with overlaps > threshold - let result = load_sketches_above_threshold( - against_collection, - &selection, - &query_mh.unwrap(), - threshold_hashes, - )?; + let result = + load_sketches_above_threshold(against_collection, &selection, &query_mh, threshold_hashes)?; let matchlist = result.0; let skipped_paths = result.1; let failed_paths = result.2; @@ -110,12 +98,12 @@ pub fn fastgather( } if prefetch_output.is_some() { - write_prefetch(query_sig.as_ref().unwrap(), prefetch_output, &matchlist).ok(); + write_prefetch(&query_sig, prefetch_output, &matchlist).ok(); } // run the gather! 
consume_query_by_gather( - query_sig.clone().unwrap(), + query_sig.clone(), matchlist, threshold_hashes, gather_output, diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index d4649a63..e56602b3 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_gather.py @@ -183,8 +183,8 @@ def test_missing_against(runtmp, capfd, zip_against): assert 'Error: No such file or directory' in captured.err -def test_bad_against(runtmp, capfd): - # test bad 'against' file - in this case, use a .sig.gz file. +def test_sig_against(runtmp, capfd): + # sig file is ok as against file now query = get_test_data('SRR606249.sig.gz') sig2 = get_test_data('2.fa.sig.gz') @@ -192,18 +192,23 @@ def test_bad_against(runtmp, capfd): g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastgather', query, sig2, + runtmp.sourmash('scripts', 'fastgather', query, sig2, '-o', g_output, '--output-prefetch', p_output, '-s', '100000') captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile' in captured.err + assert os.path.exists(g_output) + + df = pandas.read_csv(g_output) + assert len(df) == 1 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} + -def test_bad_against_2(runtmp, capfd): +def test_bad_against(runtmp, capfd): # test bad 'against' file - in this case, one containing a bad filename. query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -226,7 +231,7 @@ def test_bad_against_2(runtmp, capfd): assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err -def test_bad_against_3(runtmp, capfd): +def test_bad_against_2(runtmp, capfd): # test bad 'against' file - in this case, one containing an empty file query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -254,7 +259,7 @@ def test_bad_against_3(runtmp, capfd): assert "WARNING: 1 search paths failed to load. See error messages above." 
in captured.err -def test_bad_against_4(runtmp, capfd): +def test_bad_against_3(runtmp, capfd): # test with a bad against (a .sig.gz file renamed as zip file) query = get_test_data('SRR606249.sig.gz') @@ -276,7 +281,7 @@ def test_bad_against_4(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert 'InvalidArchive' in captured.err @pytest.mark.parametrize('zip_against', [False, True]) diff --git a/src/utils.rs b/src/utils.rs index be99bab6..bd1161b3 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -473,7 +473,8 @@ pub fn load_sketches_above_threshold( if let Ok(against_sig) = against_collection.sig_for_dataset(idx) { if let Some(sketch) = against_sig.sketches().get(0) { if let Sketch::MinHash(against_mh) = sketch { - if let Ok(overlap) = against_mh.count_common(query, false) { + // currently downsampling here to avoid changing md5sum + if let Ok(overlap) = against_mh.count_common(query, true) { if overlap >= threshold_hashes { let result = PrefetchResult { name: against_sig.name().to_string(), @@ -987,16 +988,20 @@ pub fn build_template(ksize: u8, scaled: usize, moltype: &str) -> Sketch { } pub fn build_selection(ksize: u8, scaled: usize, moltype: &str) -> Selection { - let hash_function = match moltype { - "dna" => HashFunctions::Murmur64Dna, - "protein" => HashFunctions::Murmur64Protein, - "dayhoff" => HashFunctions::Murmur64Dayhoff, - "hp" => HashFunctions::Murmur64Hp, - _ => panic!("Unknown molecule type: {}", moltype), - }; + // let hash_function = match moltype { + // "dna" => HashFunctions::Murmur64Dna, + // "protein" => HashFunctions::Murmur64Protein, + // "dayhoff" => HashFunctions::Murmur64Dayhoff, + // "hp" => HashFunctions::Murmur64Hp, + // _ => panic!("Unknown molecule type: {}", moltype), + // }; + let hash_function = HashFunctions::try_from(moltype) + .map_err(|_| panic!("Unknown molecule type: {}", moltype)) + .unwrap(); + let adjusted_ksize = if moltype == "dna" { ksize } else { ksize * 3 }; Selection::builder() - .ksize(ksize.into()) + .ksize(adjusted_ksize.into()) .scaled(scaled as u32) .moltype(hash_function) .build() From 912f717724d37ae17fc35912f0dbd76d20dd444a Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 30 Jan 2024 17:45:54 -0800 Subject: [PATCH 26/40] update fastgather and multisearch --- src/fastgather.rs | 14 -- src/lib.rs | 12 +- src/multisearch.rs | 114 +++++++++----- src/python/tests/test_gather.py | 20 ++- src/python/tests/test_multisearch.py | 36 ++--- src/python/tests/test_search.py | 2 +- src/utils.rs | 223 +++------------------------ 7 files changed, 137 insertions(+), 284 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index 8680abaa..82362c85 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -1,13 +1,8 @@ /// fastgather: Run gather with a query against a list of files. 
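The build_selection change in the utils.rs hunk above also encodes a sourmash convention worth spelling out: protein-family moltypes (protein, dayhoff, hp) record their ksize in DNA space, so the k given on the command line is multiplied by 3 before going into the Selection. An equivalent sketch, with the panic-inside-map_err of the patch replaced by the more conventional unwrap_or_else:

    use sourmash::encodings::HashFunctions;
    use sourmash::selection::Selection;

    fn build_selection(ksize: u8, scaled: usize, moltype: &str) -> Selection {
        let hash_function = HashFunctions::try_from(moltype)
            .unwrap_or_else(|_| panic!("unknown molecule type: {}", moltype));
        // protein/dayhoff/hp sketches store ksize multiplied by 3
        let adjusted_ksize = if moltype == "dna" { ksize } else { ksize * 3 };
        Selection::builder()
            .ksize(adjusted_ksize.into())
            .scaled(scaled as u32)
            .moltype(hash_function)
            .build()
    }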
use anyhow::Result; -use serde::Serialize; use sourmash::selection::Selection; -use sourmash::signature::Signature; -use sourmash::sketch::Sketch; // use camino; -use crate::utils::PrefetchResult; -use std::collections::BinaryHeap; use sourmash::prelude::Select; use sourmash::signature::SigsTrait; @@ -45,17 +40,8 @@ pub fn fastgather( bail!("No query sketch matching selection parameters."); } }; - // some debugging prints - // eprintln!("selection scaled: {:?}", selection.scaled()); - // eprintln!("selection ksize: {:?}", selection.ksize()); - // eprintln!("query ksize: {:?}", query_mh.ksize()); - // eprintln!("selection moltype: {:?}", selection.moltype()); - // eprintln!("query moltype: {:?}", query_sig.hash_function()); - // build the list of paths to match against. - eprintln!("Loading matchlist from '{}'", against_filepath); let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; - eprintln!("Loaded {} sig paths in matchlist", against_collection.len()); // calculate the minimum number of hashes based on desired threshold let threshold_hashes: u64 = { diff --git a/src/lib.rs b/src/lib.rs index 5e55e1f3..acdc7b61 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,5 @@ /// Python interface Rust code for sourmash_plugin_branchwater. use pyo3::prelude::*; -use sourmash::selection; #[macro_use] extern crate simple_error; @@ -18,8 +17,6 @@ mod mastiff_manygather; mod mastiff_manysearch; mod multisearch; mod pairwise; -use sourmash::encodings::HashFunctions; -use sourmash::selection::Selection; use camino::Utf8PathBuf; @@ -212,13 +209,16 @@ fn do_multisearch( moltype: String, output_path: Option, ) -> anyhow::Result { + let queryfile_path: camino::Utf8PathBuf = querylist_path.into(); + let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); + let selection = build_selection(ksize, scaled, &moltype); // let selection = build_selection(ksize, scaled, &moltype); let template = build_template(ksize, scaled, &moltype); match multisearch::multisearch( - querylist_path, - siglist_path, + &queryfile_path, + &againstfile_path, threshold, - template, + &selection, output_path, ) { Ok(_) => Ok(0), diff --git a/src/multisearch.rs b/src/multisearch.rs index 73fe9437..be9989f6 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -9,28 +9,59 @@ use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; +use sourmash::prelude::Select; +use sourmash::selection::Selection; use sourmash::signature::SigsTrait; use sourmash::sketch::Sketch; +use sourmash::storage::SigStore; -use crate::utils::{load_sketches_from_zip_or_pathlist, ReportType}; +use crate::utils::{load_collection, ReportType}; /// Search many queries against a list of signatures. /// /// Note: this function loads all _queries_ into memory, and iterates over /// database once. -pub fn multisearch>( - querylist: P, - againstlist: P, +pub fn multisearch( + query_filepath: &camino::Utf8PathBuf, + against_filepath: &camino::Utf8PathBuf, threshold: f64, - template: Sketch, - output: Option
, + selection: &Selection, + output: Option, ) -> Result<(), Box> { // Load all queries into memory at once. - let queries = load_sketches_from_zip_or_pathlist(&querylist, &template, ReportType::Query)?; + + // let queries = load_sketches_from_zip_or_pathlist(&querylist, &template, ReportType::Query)?; + let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; + let mut queries: Vec = vec![]; + for (idx, record) in query_collection.iter() { + if let Ok(sig) = query_collection.sig_from_record(record) + // .unwrap() + // .select(&selection) // if we select here, we downsample and the md5sum changes! + // ...which means we would lose the original md5sum that is used in the standard gather results. + { + queries.push(sig); + } else { + eprintln!("Failed to load 'against' record: {}", record.name()); + } + } // Load all against sketches into memory at once. - let against = load_sketches_from_zip_or_pathlist(&againstlist, &template, ReportType::Against)?; + // let against = load_sketches_from_zip_or_pathlist(&againstlist, &template, ReportType::Against)?; + let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; + let mut against: Vec = vec![]; + + for (idx, record) in against_collection.iter() { + if let Ok(sig) = against_collection.sig_from_record(record) + // .unwrap() + // .select(&selection) // if we select here, we downsample and the md5sum changes! + // ...which means we would lose the original md5sum that is used in the standard gather results. + { + against.push(sig); + } else { + eprintln!("Failed to load 'against' record: {}", record.name()); + } + } // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); @@ -66,39 +97,46 @@ pub fn multisearch>( .filter_map(|target| { let mut results = vec![]; - // search for matches & save containment. - for q in queries.iter() { - let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); - if i % 100000 == 0 { - eprintln!("Processed {} comparisons", i); + let ds_against_sig = target.clone().select(&selection).unwrap(); + if let Some(against_mh) = ds_against_sig.minhash() { + // search for matches & save containment. 
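// Editor's sketch (not part of the patch): the similarity metrics computed in the loop below,
// factored out as a standalone function for clarity; names are illustrative only.
fn similarity_metrics(overlap: f64, query_size: f64, target_size: f64) -> (f64, f64, f64, f64) {
    let containment_query_in_target = overlap / query_size;
    let containment_target_in_query = overlap / target_size;
    let max_containment = containment_query_in_target.max(containment_target_in_query);
    let jaccard = overlap / (query_size + target_size - overlap);
    (
        containment_query_in_target,
        containment_target_in_query,
        max_containment,
        jaccard,
    )
}
// e.g. similarity_metrics(50.0, 100.0, 200.0) == (0.5, 0.25, 0.5, 0.2)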
+ for query_sig in queries.iter() { + let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); + if i % 100000 == 0 { + eprintln!("Processed {} comparisons", i); + } + let ds_q = query_sig.clone().select(&selection).unwrap(); + let query_mh = ds_q.minhash()?; + let overlap = query_mh.count_common(&against_mh, false).unwrap() as f64; + // use downsampled sizes + let query_size = query_mh.size() as f64; + let target_size = against_mh.size() as f64; + + let containment_query_in_target = overlap / query_size; + let containment_in_target = overlap / target_size; + let max_containment = containment_query_in_target.max(containment_in_target); + let jaccard = overlap / (target_size + query_size - overlap); + + if containment_query_in_target > threshold { + results.push(( + query_sig.name(), + query_sig.md5sum(), + target.name(), + target.md5sum(), + containment_query_in_target, + max_containment, + jaccard, + overlap, + )) + } } - - let overlap = q.minhash.count_common(&target.minhash, false).unwrap() as f64; - let query_size = q.minhash.size() as f64; - let target_size = target.minhash.size() as f64; - - let containment_query_in_target = overlap / query_size; - let containment_in_target = overlap / target_size; - let max_containment = containment_query_in_target.max(containment_in_target); - let jaccard = overlap / (target_size + query_size - overlap); - - if containment_query_in_target > threshold { - results.push(( - q.name.clone(), - q.md5sum.clone(), - target.name.clone(), - target.md5sum.clone(), - containment_query_in_target, - max_containment, - jaccard, - overlap, - )) + if results.is_empty() { + None + } else { + Some(results) } - } - if results.is_empty() { - None } else { - Some(results) + None } }) .flatten() diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index e56602b3..d0376a02 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_gather.py @@ -310,13 +310,15 @@ def test_against_multisigfile(runtmp, zip_against): df = pandas.read_csv(g_output) if zip_against: assert len(df) == 3 + print(df) else: + print(df) assert len(df) == 1 # @CTB this is a bug :(. It should load multiple sketches properly! @pytest.mark.parametrize('zip_against', [False, True]) -def test_query_multisigfile(runtmp, zip_against): +def test_query_multisigfile(runtmp, capfd, zip_against): # test with a sigfile that contains multiple sketches against_list = runtmp.output('against.txt') @@ -335,12 +337,14 @@ def test_query_multisigfile(runtmp, zip_against): g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') - runtmp.sourmash('scripts', 'fastgather', combined, against_list, + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'fastgather', combined, against_list, '-o', g_output, '--output-prefetch', p_output, '-s', '100000') - # @CTB this should fail, not succeed :(. - df = pandas.read_csv(g_output) - assert len(df) == 1 + # this fails now :) + captured = capfd.readouterr() + print(captured.err) + assert "Error: Fastgather requires a single query sketch. 
Check input:" in captured.err @pytest.mark.parametrize('zip_against', [False, True]) @@ -555,7 +559,7 @@ def test_simple_protein(runtmp): # test basic protein execution sigs = get_test_data('protein.zip') - query = runtmp.output('query.sig') + query = runtmp.output('query.zip') against = runtmp.output('against.zip') # extract query from zip file runtmp.sourmash('sig', 'extract', sigs, '--name', 'GCA_001593935', '-o', query) @@ -582,7 +586,7 @@ def test_simple_dayhoff(runtmp): # test basic protein execution sigs = get_test_data('dayhoff.zip') - query = runtmp.output('query.sig') + query = runtmp.output('query.zip') against = runtmp.output('against.zip') # extract query from zip file runtmp.sourmash('sig', 'extract', sigs, '--name', 'GCA_001593935', '-o', query) @@ -609,7 +613,7 @@ def test_simple_hp(runtmp): # test basic protein execution sigs = get_test_data('hp.zip') - query = runtmp.output('query.sig') + query = runtmp.output('query.zip') against = runtmp.output('against.zip') # extract query from zip file runtmp.sourmash('sig', 'extract', sigs, '--name', 'GCA_001593935', '-o', query) diff --git a/src/python/tests/test_multisearch.py b/src/python/tests/test_multisearch.py index ef2ea222..ff2136b0 100644 --- a/src/python/tests/test_multisearch.py +++ b/src/python/tests/test_multisearch.py @@ -148,11 +148,11 @@ def test_missing_query(runtmp, capfd, zip_query): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err -def test_bad_query(runtmp, capfd): - # test with a bad query (a .sig.gz file) +def test_sig_query(runtmp, capfd): + # sig is ok as query now against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') @@ -163,17 +163,17 @@ def test_bad_query(runtmp, capfd): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', sig2, against_list, + runtmp.sourmash('scripts', 'multisearch', sig2, against_list, '-o', output) captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err - + assert os.path.exists(output) + df = pandas.read_csv(output) + assert len(df) == 1 -def test_bad_query_2(runtmp, capfd): +def test_bad_query(runtmp, capfd): # test with a bad query list (a missing file) query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -221,7 +221,7 @@ def test_bad_query_3(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert 'InvalidArchive' in captured.err @pytest.mark.parametrize("zip_db", [False, True]) @@ -250,11 +250,11 @@ def test_missing_against(runtmp, capfd, zip_db): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err -def test_bad_against(runtmp, capfd): - # test with a bad against list (a .sig file in this case) +def test_sig_against(runtmp, capfd): + # against can be sig now query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -267,17 +267,17 @@ def test_bad_against(runtmp, capfd): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', query_list, sig2, + runtmp.sourmash('scripts', 'multisearch', query_list, sig2, '-o', output) captured = capfd.readouterr() 
print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err - + assert os.path.exists(output) + df = pandas.read_csv(output) + assert len(df) == 1 -def test_bad_against_2(runtmp, capfd): +def test_bad_against(runtmp, capfd): # test with a bad against list (a missing file) query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -380,7 +380,7 @@ def test_load_only_one_bug(runtmp, capfd, zip_db): print(captured.err) assert not 'WARNING: skipped 1 paths - no compatible signatures.' in captured.err - assert not 'WARNING: no compatible sketches in path ' in captured.err + assert not 'WARNING: no compatible sketches in path' in captured.err @pytest.mark.parametrize("zip_query", [False, True]) diff --git a/src/python/tests/test_search.py b/src/python/tests/test_search.py index 8af6bf3f..2ab45907 100644 --- a/src/python/tests/test_search.py +++ b/src/python/tests/test_search.py @@ -327,7 +327,7 @@ def test_bad_query_3(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert 'InvalidArchive' in captured.err @pytest.mark.parametrize("indexed", [False, True]) diff --git a/src/utils.rs b/src/utils.rs index bd1161b3..0b69eb00 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -245,102 +245,6 @@ pub fn load_sketchlist_filenames>(sketchlist_filename: &P) -> Res Ok(sketchlist_filenames) } -/// Loads signature file paths from a ZIP archive. -/// -/// This function extracts the contents of a ZIP archive containing -/// signature files (with extensions ".sig" or ".sig.gz") to a temporary directory. -/// It returns the paths of these extracted signature files. -/// -/// # Arguments -/// -/// * `zip_path` - The path to the ZIP archive. -/// -/// # Returns -/// -/// Returns a tuple containing: -/// * A vector of `PathBuf` representing the paths to the extracted signature files. -/// * The `TempDir` representing the temporary directory where the files were extracted. -/// Since tempfile::TempDir creates a temporary directory that is automatically -/// deleted once the TempDir value goes out of scope, we return it here to move it -/// to the main function scope. -/// -/// # Errors -/// -/// Returns an error if: -/// * Unable to create a temporary directory. -/// * Unable to open or read the ZIP archive. -/// * Any other IO or file related error. 
-pub fn load_sigpaths_from_zip>( - zip_path: P, - template: &Sketch, - report_type: ReportType, -) -> Result<(Vec, tempfile::TempDir)> { - let mut signature_paths = Vec::new(); - let temp_dir = tempdir()?; - let zip_file = File::open(&zip_path)?; - let mut zip_archive = ZipArchive::new(zip_file)?; - - let mut skipped_paths = 0; - for i in 0..zip_archive.len() { - let mut file = zip_archive.by_index(i)?; - // make string copy to avoid file borrowing issues - let file_name_str = file.name().to_string(); - let file_name = Path::new(&file_name_str) - .file_name() - .unwrap() - .to_str() - .unwrap(); - // use contains to account for sig.gz_0 bug in sourmash - if file_name.contains(".sig") || file_name.contains(".sig.gz") { - // read file - let mut contents = Vec::new(); - file.read_to_end(&mut contents)?; - // get sig from file - let sigs = Signature::from_reader(&contents[..])?; - if sigs.len() > 1 { - return Err(anyhow::anyhow!( - "File '{}' has more than one signature.", - file_name - )); - } - let sig = &sigs[0]; // Directly take the first (only) signature - // check for compatible sketch - let is_compatible = if let Some(Sketch::MinHash(_)) = sig.select_sketch(template) { - true - } else if let Sketch::MinHash(template_mh) = template { - sig.sketches().iter().any(|sketch| { - matches!(sketch, Sketch::MinHash(ref_mh) if check_compatible_downsample(&ref_mh, template_mh).is_ok()) - }) - } else { - false - }; - - if is_compatible { - let path = temp_dir.path().join(file_name); - // write contents to new file - let mut new_file = File::create(&path)?; - new_file.write_all(&contents)?; - // add filepath to signature paths - signature_paths.push(path); - } else { - skipped_paths += 1; - } - } - } - if skipped_paths > 0 { - eprintln!( - "WARNING: skipped {} {} paths - no compatible signatures.", - skipped_paths, report_type - ); - } - eprintln!( - "loaded paths for {} signature files from zipfile {}", - signature_paths.len(), - zip_path.as_ref().display() - ); - Ok((signature_paths, temp_dir)) -} - pub fn load_fasta_fromfile>( sketchlist_filename: &P, ) -> Result> { @@ -467,22 +371,22 @@ pub fn load_sketches_above_threshold( let matchlist: BinaryHeap = against_collection .par_iter() - .filter_map(|(idx, against_record)| { - let mut mm = None; + .filter_map(|(_idx, against_record)| { + let mut results = Vec::new(); // Load against into memory - if let Ok(against_sig) = against_collection.sig_for_dataset(idx) { - if let Some(sketch) = against_sig.sketches().get(0) { + if let Ok(against_sig) = against_collection.sig_from_record(against_record) { + for sketch in against_sig.sketches() { if let Sketch::MinHash(against_mh) = sketch { // currently downsampling here to avoid changing md5sum if let Ok(overlap) = against_mh.count_common(query, true) { if overlap >= threshold_hashes { let result = PrefetchResult { - name: against_sig.name().to_string(), - md5sum: against_mh.md5sum().to_string(), + name: against_record.name().to_string(), + md5sum: against_mh.md5sum(), minhash: against_mh.clone(), overlap, }; - mm = Some(result); + results.push(result); } } } else { @@ -492,12 +396,6 @@ pub fn load_sketches_above_threshold( ); let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - } else { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - against_sig.filename() - ); - let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } else { // this shouldn't happen here anymore -- likely would happen at load_collection @@ -507,8 +405,13 @@ pub fn load_sketches_above_threshold( ); 
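// Editor's sketch (assumption -- the exact expression is not shown in this hunk): how the
// `overlap >= threshold_hashes` check above relates a base-pair threshold to hash counts.
// With FracMinHash sketches, each retained hash stands in for roughly `scaled` bp, so the
// minimum overlap in hashes is approximately threshold_bp / scaled (floored at 1).
fn bp_threshold_to_hashes(threshold_bp: u64, scaled: u64) -> u64 {
    std::cmp::max(1, threshold_bp / scaled)
}
// e.g. bp_threshold_to_hashes(50_000, 1_000) == 50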
let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - mm + if results.is_empty() { + None + } else { + Some(results) + } }) + .flatten() .collect(); let skipped_paths = skipped_paths.load(atomic::Ordering::SeqCst); @@ -580,50 +483,6 @@ pub fn load_sketches_from_zip>( Ok((sketchlist, skipped_paths, failed_paths)) } -/// Control function to read signature FILE PATHS from an input file. -/// If a ZIP archive is provided (detected via extension), -/// use `load_sigpaths_from_zip`. Otherwise, assume the -/// user provided a `fromfile` sketchlist and use -/// `load_sketchlist_filenames`. -/// -/// # Arguments -/// -/// * `sketchlist_path` - Path to either a ZIP archive or a list of signature file paths. -/// -/// # Returns -/// -/// Returns a tuple containing: -/// * A vector of `PathBuf` representing the signature file paths. -/// * If extracting from a zipfile, signature files will be extracted to a -/// `TempDir` temporary directory where they can be used individually. -pub fn load_sigpaths_from_zip_or_pathlist>( - sketchlist_path: P, - template: &Sketch, - report_type: ReportType, -) -> Result<(Vec, Option)> { - eprintln!( - "Reading list of filepaths from: '{}'", - sketchlist_path.as_ref().display() - ); - - let result = if sketchlist_path - .as_ref() - .extension() - .map(|ext| ext == "zip") - .unwrap_or(false) - { - let (paths, tempdir) = load_sigpaths_from_zip(&sketchlist_path, template, report_type)?; - (paths, Some(tempdir)) - } else { - let paths = load_sketchlist_filenames(&sketchlist_path)?; - (paths, None) - }; - - eprintln!("Found {} filepaths", result.0.len()); - // should we bail here if empty? - Ok(result) -} - pub enum ReportType { Query, Against, @@ -755,20 +614,8 @@ pub fn load_collection( }; let n_total = collection.len(); - eprintln!("n_total: {}", n_total); - // collection = collection.select(selection)?; let selected = collection.select(selection)?; - - if selected.len() == 1 { - let sig = selected.sig_for_dataset(0).unwrap(); - eprintln!("sig name: {:?}", sig.name()); - let mh = sig.minhash().unwrap(); - eprintln!("scaled= {:?}", mh.scaled()) - } - - eprintln!("selection_len: {}", selected.len()); let n_skipped = n_total - selected.len(); - // let n_skipped = n_total - collection.len(); report_on_collection_loading(&selected, n_skipped, n_failed, report_type)?; Ok(selected) } @@ -800,27 +647,6 @@ pub fn report_on_collection_loading( Ok(()) } -// pub fn load_single_sketch_from_sig<'a>(sig: &'a SigStore, selection: &'a Selection) -> Result<&'a KmerMinHash> { -// let sketch = sig.sketches().get(0).ok_or_else(|| { -// anyhow::anyhow!("No sketch found with scaled={}, k={}", selection.scaled().unwrap_or_default(), selection.ksize().unwrap_or_default()) -// })?; - -// if let Sketch::MinHash(mh) = sketch { -// Ok(mh) -// } else { -// Err(anyhow::anyhow!("No sketch found with scaled={}, k={}", selection.scaled().unwrap_or_default(), selection.ksize().unwrap_or_default())) -// } -// } - -// pub fn load_single_sig_and_sketch<'a>( -// query_collection: &'a Collection, -// selection: &'a Selection, -// ) -> Result<(SigStore, &'a KmerMinHash)> { -// let sig = load_single_sig_from_collection(query_collection, selection)?; -// let sketch = load_single_sketch_from_sig(&sig, selection)?; -// Ok((sig, sketch)) -// } - /// Uses the output of sketch loading functions to report the /// total number of sketches loaded, as well as the number of files, /// if any, that failed to load or contained no compatible sketches. 
@@ -988,20 +814,19 @@ pub fn build_template(ksize: u8, scaled: usize, moltype: &str) -> Sketch { } pub fn build_selection(ksize: u8, scaled: usize, moltype: &str) -> Selection { - // let hash_function = match moltype { - // "dna" => HashFunctions::Murmur64Dna, - // "protein" => HashFunctions::Murmur64Protein, - // "dayhoff" => HashFunctions::Murmur64Dayhoff, - // "hp" => HashFunctions::Murmur64Hp, - // _ => panic!("Unknown molecule type: {}", moltype), - // }; - let hash_function = HashFunctions::try_from(moltype) - .map_err(|_| panic!("Unknown molecule type: {}", moltype)) - .unwrap(); - let adjusted_ksize = if moltype == "dna" { ksize } else { ksize * 3 }; + let hash_function = match moltype { + "dna" => HashFunctions::Murmur64Dna, + "protein" => HashFunctions::Murmur64Protein, + "dayhoff" => HashFunctions::Murmur64Dayhoff, + "hp" => HashFunctions::Murmur64Hp, + _ => panic!("Unknown molecule type: {}", moltype), + }; + // let hash_function = HashFunctions::try_from(moltype) + // .map_err(|_| panic!("Unknown molecule type: {}", moltype)) + // .unwrap(); Selection::builder() - .ksize(adjusted_ksize.into()) + .ksize(ksize.into()) .scaled(scaled as u32) .moltype(hash_function) .build() From f5216f820b955c62a685e60c56611601f84afbf8 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 30 Jan 2024 18:38:55 -0800 Subject: [PATCH 27/40] update pairwise --- src/lib.rs | 5 +- src/pairwise.rs | 93 ++++++++++++++++++++----------- src/python/tests/test_pairwise.py | 24 +++----- 3 files changed, 70 insertions(+), 52 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index acdc7b61..c7c2d69f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -238,8 +238,9 @@ fn do_pairwise( moltype: String, output_path: Option, ) -> anyhow::Result { - let template = build_template(ksize, scaled, &moltype); - match pairwise::pairwise(siglist_path, threshold, template, output_path) { + let queryfile_path: camino::Utf8PathBuf = siglist_path.into(); + let selection = build_selection(ksize, scaled, &moltype); + match pairwise::pairwise(&queryfile_path, threshold, &selection, output_path) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); diff --git a/src/pairwise.rs b/src/pairwise.rs index 6e7fe7c4..c4c0a886 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -1,6 +1,7 @@ use anyhow::Result; /// pairwise: massively parallel in-memory pairwise comparisons. use rayon::prelude::*; +use sourmash::sketch::minhash::KmerMinHash; use std::fs::File; use std::io::{BufWriter, Write}; @@ -12,20 +13,41 @@ use std::sync::atomic::AtomicUsize; use sourmash::signature::SigsTrait; use sourmash::sketch::Sketch; -use crate::utils::{load_sketches_from_zip_or_pathlist, ReportType}; +use crate::utils::{load_collection, ReportType}; +use sourmash::prelude::Select; +use sourmash::selection::Selection; +use sourmash::storage::SigStore; /// Perform pairwise comparisons of all signatures in a list. /// /// Note: this function loads all _signatures_ into memory. pub fn pairwise>( - siglist: P, + sigpath: &camino::Utf8PathBuf, threshold: f64, - template: Sketch, + selection: &Selection, output: Option
, ) -> Result<(), Box> { // Load all sigs into memory at once. - let sigs = load_sketches_from_zip_or_pathlist(&siglist, &template, ReportType::Query)?; + let collection = load_collection(sigpath, selection, ReportType::Query)?; + + if collection.len() <= 1 { + bail!( + "Pairwise requires two or more sketches. Check input: '{:?}'", + &sigpath + ) + } + + let mut sketches: Vec<(KmerMinHash, String, String)> = Vec::new(); + for (_idx, record) in collection.iter() { + if let Ok(sig) = collection.sig_from_record(record) { + if let Some(ds_mh) = sig.clone().select(&selection)?.minhash().cloned() { + sketches.push((ds_mh, record.name().to_string(), record.md5().to_string())); + } + } else { + eprintln!("Failed to load record: {}", record.name()); + } + } // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); @@ -54,37 +76,40 @@ pub fn pairwise>( let processed_cmp = AtomicUsize::new(0); - sigs.par_iter().enumerate().for_each(|(i, q1)| { - for q2 in &sigs[(i + 1)..] { - let overlap = q1.minhash.count_common(&q2.minhash, false).unwrap() as f64; - let query1_size = q1.minhash.size() as f64; - let query2_size = q2.minhash.size() as f64; - - let containment_q1_in_q2 = overlap / query1_size; - let containment_q2_in_q1 = overlap / query2_size; - let max_containment = containment_q1_in_q2.max(containment_q2_in_q1); - let jaccard = overlap / (query1_size + query2_size - overlap); - - if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold { - send.send(( - q1.name.clone(), - q1.md5sum.clone(), - q2.name.clone(), - q2.md5sum.clone(), - containment_q1_in_q2, - max_containment, - jaccard, - overlap, - )) - .unwrap(); - } - - let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); - if i % 100000 == 0 { - eprintln!("Processed {} comparisons", i); + sketches + .par_iter() + .enumerate() + .for_each(|(idx, (q1, q1_name, q1_md5))| { + for (j, (q2, q2_name, q2_md5)) in sketches.iter().enumerate().skip(idx + 1) { + let overlap = q1.count_common(q2, false).unwrap() as f64; + let query1_size = q1.size() as f64; + let query2_size = q2.size() as f64; + + let containment_q1_in_q2 = overlap / query1_size; + let containment_q2_in_q1 = overlap / query2_size; + let max_containment = containment_q1_in_q2.max(containment_q2_in_q1); + let jaccard = overlap / (query1_size + query2_size - overlap); + + if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold { + send.send(( + q1_name.clone(), + q1_md5.clone(), + q2_name.clone(), + q2_md5.clone(), + containment_q1_in_q2, + max_containment, + jaccard, + overlap, + )) + .unwrap(); + } + + let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); + if i % 100000 == 0 { + eprintln!("Processed {} comparisons", i); + } } - } - }); + }); // do some cleanup and error handling - drop(send); // close the channel diff --git a/src/python/tests/test_pairwise.py b/src/python/tests/test_pairwise.py index 84bb2365..eeec42d4 100644 --- a/src/python/tests/test_pairwise.py +++ b/src/python/tests/test_pairwise.py @@ -115,15 +115,9 @@ def test_simple_threshold(runtmp, zip_query): -def test_bad_query(runtmp, capfd): - # test with a bad query (a .sig.gz file) - against_list = runtmp.output('against.txt') - +def test_sig_query(runtmp, capfd): + # sig query is ok now, but fails bc only one sig sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') - - make_file_list(against_list, [sig2, sig47, sig63]) output = 
runtmp.output('out.csv') @@ -133,18 +127,16 @@ def test_bad_query(runtmp, capfd): captured = capfd.readouterr() print(captured.err) + assert "Error: Pairwise requires two or more sketches. Check input" in captured.err - assert 'Error: invalid line in fromfile ' in captured.err - -def test_bad_query_2(runtmp, capfd): +def test_bad_query(runtmp, capfd): # test with a bad query list (a missing file) query_list = runtmp.output('query.txt') sig2 = get_test_data('2.fa.sig.gz') sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') - make_file_list(query_list, [sig2, "no-exist"]) + make_file_list(query_list, [sig2, sig47, "no-exist"]) output = runtmp.output('out.csv') @@ -160,7 +152,7 @@ def test_bad_query_2(runtmp, capfd): -def test_bad_query_3(runtmp, capfd): +def test_bad_query_2(runtmp, capfd): # test with a bad query (a .sig.gz file renamed as zip file) sig2 = get_test_data('2.fa.sig.gz') @@ -182,7 +174,7 @@ def test_bad_query_3(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert 'InvalidArchive' in captured.err @pytest.mark.parametrize("zip_db", [False, True]) @@ -203,7 +195,7 @@ def test_missing_query(runtmp, capfd, zip_db): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err From 893e0a7b52119d351b360df277979c2ec03d460d Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 30 Jan 2024 18:44:22 -0800 Subject: [PATCH 28/40] clean up a little --- src/lib.rs | 4 +- src/python/tests/test_pairwise.py | 2 - src/utils.rs | 106 ------------------------------ 3 files changed, 1 insertion(+), 111 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c7c2d69f..d2365afe 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,8 +5,8 @@ use pyo3::prelude::*; extern crate simple_error; mod utils; +use crate::utils::build_selection; use crate::utils::is_revindex_database; -use crate::utils::{build_selection, build_template}; mod check; mod fastgather; mod fastmultigather; @@ -212,8 +212,6 @@ fn do_multisearch( let queryfile_path: camino::Utf8PathBuf = querylist_path.into(); let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); let selection = build_selection(ksize, scaled, &moltype); - // let selection = build_selection(ksize, scaled, &moltype); - let template = build_template(ksize, scaled, &moltype); match multisearch::multisearch( &queryfile_path, &againstfile_path, diff --git a/src/python/tests/test_pairwise.py b/src/python/tests/test_pairwise.py index eeec42d4..55259e85 100644 --- a/src/python/tests/test_pairwise.py +++ b/src/python/tests/test_pairwise.py @@ -150,8 +150,6 @@ def test_bad_query(runtmp, capfd): assert "WARNING: 1 query paths failed to load. See error messages above." in captured.err - - def test_bad_query_2(runtmp, capfd): # test with a bad query (a .sig.gz file renamed as zip file) diff --git a/src/utils.rs b/src/utils.rs index 0b69eb00..eeff3ff9 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -420,69 +420,6 @@ pub fn load_sketches_above_threshold( Ok((matchlist, skipped_paths, failed_paths)) } -/// Loads all compatible sketches from a ZIP archive at the given path into memory. -/// Currently not parallelized; use a different zip crate to enable parallelization. -/// -/// # Arguments -/// -/// * `zip_path` - Path to the ZIP archive. -/// * `template` - Reference to the Sketch template. 
-/// -/// # Returns -/// -/// Returns a tuple containing: -/// * A vector of `SmallSignature`s. -/// * Number of paths that were skipped because they did not match the sketch parameters. -/// * Number of paths that failed to load. -/// -/// # Errors -/// -/// Returns an error if: -/// * Unable to open the ZIP file. -/// * ZIP archive is malformed. -pub fn load_sketches_from_zip>( - zip_path: P, - template: &Sketch, -) -> Result<(Vec, usize, usize)> { - let mut sketchlist = Vec::new(); - let zip_file = File::open(&zip_path)?; - let mut zip_archive = ZipArchive::new(zip_file)?; - let mut skipped_paths = 0; - let mut failed_paths = 0; - - // loop through, loading signatures - for i in 0..zip_archive.len() { - let mut file = zip_archive.by_index(i)?; - let file_name = Path::new(file.name()) - .file_name() - .unwrap() - .to_str() - .unwrap() - .to_owned(); - - if !file_name.contains(".sig") && !file_name.contains(".sig.gz") { - continue; - } - if let Ok(sigs) = Signature::from_reader(&mut file) { - if let Some(sm) = - prepare_query(&sigs, template, &zip_path.as_ref().display().to_string()) - { - sketchlist.push(sm); - } else { - // track number of paths that have no matching sigs - skipped_paths += 1; - } - } else { - // failed to load from this path - print error & track. - eprintln!("WARNING: could not load sketches from path '{}'", file_name); - failed_paths += 1; - } - } - drop(zip_archive); - println!("loaded {} signatures", sketchlist.len()); - Ok((sketchlist, skipped_paths, failed_paths)) -} - pub enum ReportType { Query, Against, @@ -500,49 +437,6 @@ impl std::fmt::Display for ReportType { } } -/// Control function to load compatible signatures from an input file. -/// If a ZIP archive is provided (detected via extension), -/// calls `load_sketches_from_zip`. Otherwise, assumes the -/// user provided a `fromfile` sketchlist and calls -/// `load_sketchlist_filenames`. -/// -/// # Arguments -/// -/// * `sketchlist_path` - Path to either a ZIP archive or a list of signature file paths. -/// * `template` - Reference to the Sketch template (used to load only compatible signatures). -/// * `report_type` - ReportType Enum. Are these 'query' or 'search' signatures? -/// -/// # Returns -/// -/// Returns a vector of `SmallSignature`s. -pub fn load_sketches_from_zip_or_pathlist>( - sketchlist_path: P, - template: &Sketch, - report_type: ReportType, -) -> Result> { - eprintln!( - "Reading list of {} paths from: '{}'", - report_type, - sketchlist_path.as_ref().display() - ); - - let (sketchlist, skipped_paths, failed_paths) = if sketchlist_path - .as_ref() - .extension() - .map(|ext| ext == "zip") - .unwrap_or(false) - { - load_sketches_from_zip(sketchlist_path, template)? - } else { - let sketch_paths = load_sketchlist_filenames(&sketchlist_path)?; - load_sketches(sketch_paths, template)? - }; - - report_on_sketch_loading(&sketchlist, skipped_paths, failed_paths, report_type)?; - - Ok(sketchlist) -} - pub fn load_collection( sigpath: &camino::Utf8PathBuf, selection: &Selection, From dbdff4a88047f4d267dd097f77fed31c204c9c21 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 15:18:05 -0800 Subject: [PATCH 29/40] clean up; unify sketch loading for pairwise/multisearch --- src/fastgather.rs | 1 - src/fastmultigather.rs | 5 +- src/mastiff_manygather.rs | 1 - src/multisearch.rs | 108 +++++---------- src/pairwise.rs | 20 +-- src/python/tests/test_pairwise.py | 4 +- src/utils.rs | 217 +++++------------------------- 7 files changed, 73 insertions(+), 283 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index 82362c85..ff4a07ea 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -5,7 +5,6 @@ use sourmash::selection::Selection; // use camino; use sourmash::prelude::Select; -use sourmash::signature::SigsTrait; use crate::utils::{ consume_query_by_gather, load_collection, load_sketches_above_threshold, write_prefetch, diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index d7537c8a..f28dcb85 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -2,19 +2,16 @@ use anyhow::Result; use rayon::prelude::*; -use serde::Serialize; -use sourmash::prelude::Select; use sourmash::selection::Selection; use sourmash::sketch::Sketch; use sourmash::storage::SigStore; -use sourmash::{selection, signature::Signature}; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; -use camino::{Utf8Path, Utf8PathBuf}; +use camino::Utf8Path; use crate::utils::{ consume_query_by_gather, load_collection, write_prefetch, PrefetchResult, ReportType, diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 2175e759..6a80a647 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -2,7 +2,6 @@ use anyhow::Result; use rayon::prelude::*; -use sourmash::signature::Signature; use sourmash::sketch::Sketch; use std::path::Path; diff --git a/src/multisearch.rs b/src/multisearch.rs index be9989f6..0d772276 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -4,18 +4,14 @@ use rayon::prelude::*; use std::fs::File; use std::io::{BufWriter, Write}; -use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use sourmash::prelude::Select; use sourmash::selection::Selection; use sourmash::signature::SigsTrait; -use sourmash::sketch::Sketch; -use sourmash::storage::SigStore; -use crate::utils::{load_collection, ReportType}; +use crate::utils::{load_collection, load_mh_with_name_and_md5, ReportType}; /// Search many queries against a list of signatures. /// @@ -31,37 +27,14 @@ pub fn multisearch( ) -> Result<(), Box> { // Load all queries into memory at once. - // let queries = load_sketches_from_zip_or_pathlist(&querylist, &template, ReportType::Query)?; let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; - let mut queries: Vec = vec![]; - for (idx, record) in query_collection.iter() { - if let Ok(sig) = query_collection.sig_from_record(record) - // .unwrap() - // .select(&selection) // if we select here, we downsample and the md5sum changes! - // ...which means we would lose the original md5sum that is used in the standard gather results. - { - queries.push(sig); - } else { - eprintln!("Failed to load 'against' record: {}", record.name()); - } - } + let queries = + load_mh_with_name_and_md5(query_collection, &selection, ReportType::Query).unwrap(); // Load all against sketches into memory at once. 
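// Editor's sketch (illustrative only, not part of the patch): the shared output pattern these
// commands use -- rayon workers send result rows over a bounded channel while a single
// dedicated thread owns the writer. The rayon side is replaced by a plain loop here to keep
// the example dependency-free; `writer_thread_demo` and its row type are made-up names.
use std::sync::mpsc::sync_channel;
use std::thread;

fn writer_thread_demo(rows: Vec<String>) {
    // bounded channel: producers block if the writer falls behind
    let (send, recv) = sync_channel::<String>(4);
    let writer = thread::spawn(move || {
        for row in recv {
            println!("{row}");
        }
    });
    for row in rows {
        send.send(row).unwrap();
    }
    drop(send); // close the channel so the writer loop terminates
    writer.join().unwrap();
}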
- // let against = load_sketches_from_zip_or_pathlist(&againstlist, &template, ReportType::Against)?; let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; - let mut against: Vec = vec![]; - - for (idx, record) in against_collection.iter() { - if let Ok(sig) = against_collection.sig_from_record(record) - // .unwrap() - // .select(&selection) // if we select here, we downsample and the md5sum changes! - // ...which means we would lose the original md5sum that is used in the standard gather results. - { - against.push(sig); - } else { - eprintln!("Failed to load 'against' record: {}", record.name()); - } - } + let against = + load_mh_with_name_and_md5(against_collection, &selection, ReportType::Against).unwrap(); // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); @@ -94,49 +67,42 @@ pub fn multisearch( let send = against .par_iter() - .filter_map(|target| { + .filter_map(|(against_mh, against_name, against_md5)| { let mut results = vec![]; - - let ds_against_sig = target.clone().select(&selection).unwrap(); - if let Some(against_mh) = ds_against_sig.minhash() { - // search for matches & save containment. - for query_sig in queries.iter() { - let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); - if i % 100000 == 0 { - eprintln!("Processed {} comparisons", i); - } - let ds_q = query_sig.clone().select(&selection).unwrap(); - let query_mh = ds_q.minhash()?; - let overlap = query_mh.count_common(&against_mh, false).unwrap() as f64; - // use downsampled sizes - let query_size = query_mh.size() as f64; - let target_size = against_mh.size() as f64; - - let containment_query_in_target = overlap / query_size; - let containment_in_target = overlap / target_size; - let max_containment = containment_query_in_target.max(containment_in_target); - let jaccard = overlap / (target_size + query_size - overlap); - - if containment_query_in_target > threshold { - results.push(( - query_sig.name(), - query_sig.md5sum(), - target.name(), - target.md5sum(), - containment_query_in_target, - max_containment, - jaccard, - overlap, - )) - } + // search for matches & save containment. + for (query_mh, query_name, query_md5) in queries.iter() { + let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); + if i % 100000 == 0 { + eprintln!("Processed {} comparisons", i); } - if results.is_empty() { - None - } else { - Some(results) + + let overlap = query_mh.count_common(&against_mh, false).unwrap() as f64; + // use downsampled sizes + let query_size = query_mh.size() as f64; + let target_size = against_mh.size() as f64; + + let containment_query_in_target = overlap / query_size; + let containment_in_target = overlap / target_size; + let max_containment = containment_query_in_target.max(containment_in_target); + let jaccard = overlap / (target_size + query_size - overlap); + + if containment_query_in_target > threshold { + results.push(( + query_name.clone(), + query_md5.clone(), + against_name.clone(), + against_md5.clone(), + containment_query_in_target, + max_containment, + jaccard, + overlap, + )) } - } else { + } + if results.is_empty() { None + } else { + Some(results) } }) .flatten() diff --git a/src/pairwise.rs b/src/pairwise.rs index c4c0a886..b6713d41 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -1,7 +1,6 @@ use anyhow::Result; /// pairwise: massively parallel in-memory pairwise comparisons. 
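// Editor's sketch (not part of the patch): the upper-triangle iteration pattern pairwise uses,
// where `skip(idx + 1)` ensures each unordered pair of sketches is compared exactly once.
fn pair_indices(n: usize) -> Vec<(usize, usize)> {
    let mut pairs = Vec::new();
    for i in 0..n {
        for j in (0..n).skip(i + 1) {
            pairs.push((i, j));
        }
    }
    pairs
}
// pair_indices(4) yields [(0,1), (0,2), (0,3), (1,2), (1,3), (2,3)] -- n*(n-1)/2 comparisons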
use rayon::prelude::*; -use sourmash::sketch::minhash::KmerMinHash; use std::fs::File; use std::io::{BufWriter, Write}; @@ -11,12 +10,9 @@ use std::sync::atomic; use std::sync::atomic::AtomicUsize; use sourmash::signature::SigsTrait; -use sourmash::sketch::Sketch; -use crate::utils::{load_collection, ReportType}; -use sourmash::prelude::Select; +use crate::utils::{load_collection, load_mh_with_name_and_md5, ReportType}; use sourmash::selection::Selection; -use sourmash::storage::SigStore; /// Perform pairwise comparisons of all signatures in a list. /// @@ -29,7 +25,7 @@ pub fn pairwise>( output: Option
, ) -> Result<(), Box> { // Load all sigs into memory at once. - let collection = load_collection(sigpath, selection, ReportType::Query)?; + let collection = load_collection(sigpath, selection, ReportType::Pairwise)?; if collection.len() <= 1 { bail!( @@ -37,17 +33,7 @@ pub fn pairwise>( &sigpath ) } - - let mut sketches: Vec<(KmerMinHash, String, String)> = Vec::new(); - for (_idx, record) in collection.iter() { - if let Ok(sig) = collection.sig_from_record(record) { - if let Some(ds_mh) = sig.clone().select(&selection)?.minhash().cloned() { - sketches.push((ds_mh, record.name().to_string(), record.md5().to_string())); - } - } else { - eprintln!("Failed to load record: {}", record.name()); - } - } + let sketches = load_mh_with_name_and_md5(collection, &selection, ReportType::Pairwise).unwrap(); // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); diff --git a/src/python/tests/test_pairwise.py b/src/python/tests/test_pairwise.py index 55259e85..0dd67c05 100644 --- a/src/python/tests/test_pairwise.py +++ b/src/python/tests/test_pairwise.py @@ -147,7 +147,7 @@ def test_bad_query(runtmp, capfd): print(captured.err) assert "WARNING: could not load sketches from path 'no-exist'" in captured.err - assert "WARNING: 1 query paths failed to load. See error messages above." in captured.err + assert "WARNING: 1 signature paths failed to load. See error messages above." in captured.err def test_bad_query_2(runtmp, capfd): @@ -241,7 +241,7 @@ def test_nomatch_query(runtmp, capfd, zip_query): captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 1 query paths - no compatible signatures' in captured.err + assert 'WARNING: skipped 1 signature paths - no compatible signatures' in captured.err @pytest.mark.parametrize("zip_db", [False, True]) diff --git a/src/utils.rs b/src/utils.rs index eeff3ff9..5cd49de1 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -5,40 +5,26 @@ use sourmash::manifest::Manifest; use sourmash::selection::Select; use std::fs::{create_dir_all, File}; -use std::io::Read; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::panic; use std::path::{Path, PathBuf}; -use tempfile::tempdir; -use zip::read::ZipArchive; - use std::sync::atomic; use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; -use anyhow::{anyhow, Context, Result}; +use anyhow::{anyhow, Result}; use std::cmp::{Ordering, PartialOrd}; -use sourmash::collection::{self, Collection}; -use sourmash::errors::SourmashError; +use sourmash::collection::Collection; use sourmash::manifest::Record; use sourmash::selection::Selection; use sourmash::signature::{Signature, SigsTrait}; -use sourmash::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; +use sourmash::sketch::minhash::KmerMinHash; use sourmash::sketch::Sketch; use sourmash::storage::{FSStorage, InnerStorage, SigStore}; -/// Track a name/minhash. - -pub struct SmallSignature { - pub location: String, - pub name: String, - pub md5sum: String, - pub minhash: KmerMinHash, -} - /// Structure to hold overlap information from comparisons. pub struct PrefetchResult { @@ -68,86 +54,6 @@ impl PartialEq for PrefetchResult { impl Eq for PrefetchResult {} -/// check to see if two KmerMinHash are compatible. -/// -/// CTB note: despite the name, downsampling is not performed? -/// Although it checks if they are compatible in one direction... 
- -pub fn check_compatible_downsample( - me: &KmerMinHash, - other: &KmerMinHash, -) -> Result<(), sourmash::Error> { - /* // ignore num minhashes. - if self.num != other.num { - return Err(Error::MismatchNum { - n1: self.num, - n2: other.num, - } - .into()); - } - */ - use sourmash::Error; - - if me.ksize() != other.ksize() { - return Err(Error::MismatchKSizes); - } - if me.hash_function() != other.hash_function() { - // TODO: fix this error - return Err(Error::MismatchDNAProt); - } - if me.max_hash() < other.max_hash() { - return Err(Error::MismatchScaled); - } - if me.seed() != other.seed() { - return Err(Error::MismatchSeed); - } - Ok(()) -} - -/// Given a vec of search Signatures, each containing one or more sketches, -/// and a template Sketch, return a compatible (& now downsampled) -/// Sketch from the search Signatures.. -/// -/// CTB note: this will return the first acceptable match, I think, ignoring -/// all others. - -pub fn prepare_query( - search_sigs: &[Signature], - template: &Sketch, - location: &str, -) -> Option { - for search_sig in search_sigs.iter() { - // find exact match for template? - if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) { - return Some(SmallSignature { - location: location.to_string().clone(), - name: search_sig.name(), - md5sum: mh.md5sum(), - minhash: mh.clone(), - }); - } else { - // no - try to find one that can be downsampled - if let Sketch::MinHash(template_mh) = template { - for sketch in search_sig.sketches() { - if let Sketch::MinHash(ref_mh) = sketch { - if check_compatible_downsample(&ref_mh, template_mh).is_ok() { - let max_hash = max_hash_for_scaled(template_mh.scaled()); - let mh = ref_mh.downsample_max_hash(max_hash).unwrap(); - return Some(SmallSignature { - location: location.to_string().clone(), - name: search_sig.name(), - md5sum: ref_mh.md5sum(), // original - minhash: mh, // downsampled - }); - } - } - } - } - } - } - None -} - /// Find sketches in 'sketchlist' that overlap with 'query' above /// specified threshold. @@ -319,42 +225,26 @@ pub fn load_fasta_fromfile>( Ok(results) } -/// Load a collection of sketches from a file in parallel. -pub fn load_sketches( - sketchlist_paths: Vec, - template: &Sketch, -) -> Result<(Vec, usize, usize)> { - let skipped_paths = AtomicUsize::new(0); - let failed_paths = AtomicUsize::new(0); - - let sketchlist: Vec = sketchlist_paths - .par_iter() - .filter_map(|m| { - let filename = m.display().to_string(); - - match Signature::from_path(m) { - Ok(sigs) => { - let sm = prepare_query(&sigs, template, &filename); - if sm.is_none() { - // track number of paths that have no matching sigs - let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); - } - sm - } - Err(err) => { - // failed to load from this path - print error & track. 
- eprintln!("Sketch loading error: {}", err); - eprintln!("WARNING: could not load sketches from path '{}'", filename); - let _i = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None - } +pub fn load_mh_with_name_and_md5( + collection: Collection, + selection: &Selection, + report_type: ReportType, +) -> Result> { + let mut sketchinfo: Vec<(KmerMinHash, String, String)> = Vec::new(); + for (_idx, record) in collection.iter() { + if let Ok(sig) = collection.sig_from_record(record) { + if let Some(ds_mh) = sig.clone().select(&selection)?.minhash().cloned() { + sketchinfo.push((ds_mh, record.name().to_string(), record.md5().to_string())); } - }) - .collect(); - - let skipped_paths = skipped_paths.load(atomic::Ordering::SeqCst); - let failed_paths = failed_paths.load(atomic::Ordering::SeqCst); - Ok((sketchlist, skipped_paths, failed_paths)) + } else { + bail!( + "Error: Failed to load {} record: {}", + report_type, + record.name() + ); + } + } + Ok(sketchinfo) } /// Load a collection of sketches from a file, filtering to keep only @@ -423,7 +313,7 @@ pub fn load_sketches_above_threshold( pub enum ReportType { Query, Against, - Index, + Pairwise, } impl std::fmt::Display for ReportType { @@ -431,7 +321,7 @@ impl std::fmt::Display for ReportType { let description = match self { ReportType::Query => "query", ReportType::Against => "search", - ReportType::Index => "index", + ReportType::Pairwise => "signature", }; write!(f, "{}", description) } @@ -447,7 +337,7 @@ pub fn load_collection( } let mut n_failed = 0; - let mut collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { + let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { match Collection::from_zipfile(&sigpath) { Ok(collection) => collection, Err(_) => bail!("failed to load {} zipfile: '{}'", report_type, sigpath), @@ -514,34 +404,7 @@ pub fn load_collection( Ok(selected) } -pub fn report_on_collection_loading( - collection: &Collection, - skipped_paths: usize, - failed_paths: usize, - report_type: ReportType, -) -> Result<()> { - if failed_paths > 0 { - eprintln!( - "WARNING: {} {} paths failed to load. See error messages above.", - failed_paths, report_type - ); - } - if skipped_paths > 0 { - eprintln!( - "WARNING: skipped {} {} paths - no compatible signatures.", - skipped_paths, report_type - ); - } - - // Validate sketches - if collection.is_empty() { - bail!("No {} signatures loaded, exiting.", report_type); - } - eprintln!("Loaded {} {} signature(s)", collection.len(), report_type); - Ok(()) -} - -/// Uses the output of sketch loading functions to report the +/// Uses the output of collection loading function to report the /// total number of sketches loaded, as well as the number of files, /// if any, that failed to load or contained no compatible sketches. /// If no sketches were loaded, bail. @@ -563,8 +426,8 @@ pub fn report_on_collection_loading( /// /// Returns an error if: /// * No signatures were successfully loaded. 
-pub fn report_on_sketch_loading( - sketchlist: &[SmallSignature], +pub fn report_on_collection_loading( + collection: &Collection, skipped_paths: usize, failed_paths: usize, report_type: ReportType, @@ -583,10 +446,10 @@ pub fn report_on_sketch_loading( } // Validate sketches - eprintln!("Loaded {} {} signature(s)", sketchlist.len(), report_type); - if sketchlist.is_empty() { + if collection.is_empty() { bail!("No {} signatures loaded, exiting.", report_type); } + eprintln!("Loaded {} {} signature(s)", collection.len(), report_type); Ok(()) } @@ -687,26 +550,6 @@ pub fn consume_query_by_gather( Ok(()) } -pub fn build_template(ksize: u8, scaled: usize, moltype: &str) -> Sketch { - let hash_function = match moltype { - "dna" => HashFunctions::Murmur64Dna, - "protein" => HashFunctions::Murmur64Protein, - "dayhoff" => HashFunctions::Murmur64Dayhoff, - "hp" => HashFunctions::Murmur64Hp, - _ => panic!("Unknown molecule type: {}", moltype), - }; - //adjust ksize if not dna - let adjusted_ksize = if moltype == "dna" { ksize } else { ksize * 3 }; - let max_hash = max_hash_for_scaled(scaled as u64); - let template_mh = KmerMinHash::builder() - .num(0u32) - .ksize(adjusted_ksize as u32) - .max_hash(max_hash) - .hash_function(hash_function) - .build(); - Sketch::MinHash(template_mh) -} - pub fn build_selection(ksize: u8, scaled: usize, moltype: &str) -> Selection { let hash_function = match moltype { "dna" => HashFunctions::Murmur64Dna, From ab339ba1b677465bdb9954386309ff762d028d5c Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 16:24:10 -0800 Subject: [PATCH 30/40] ...cleaner --- src/fastgather.rs | 19 +++++-- src/fastmultigather.rs | 19 +++++-- src/index.rs | 47 ++++----------- src/lib.rs | 69 +++++++++++++--------- src/manysearch.rs | 26 ++++++--- src/manysketch.rs | 12 ++-- src/mastiff_manygather.rs | 14 +++-- src/mastiff_manysearch.rs | 20 ++++--- src/multisearch.rs | 19 +++++-- src/pairwise.rs | 14 +++-- src/python/tests/test_index.py | 25 ++++---- src/utils.rs | 101 ++++++++++++++++++--------------- 12 files changed, 220 insertions(+), 165 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index ff4a07ea..280afd54 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -12,16 +12,22 @@ use crate::utils::{ }; pub fn fastgather( - query_filepath: &camino::Utf8PathBuf, - against_filepath: &camino::Utf8PathBuf, + query_filepath: String, + against_filepath: String, threshold_bp: usize, ksize: u8, scaled: usize, selection: &Selection, gather_output: Option, prefetch_output: Option, + allow_failed_sigpaths: bool, ) -> Result<()> { - let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; + let query_collection = load_collection( + &query_filepath, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; if query_collection.len() != 1 { bail!( @@ -40,7 +46,12 @@ pub fn fastgather( } }; // build the list of paths to match against. 
- let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; + let against_collection = load_collection( + &against_filepath, + selection, + ReportType::Against, + allow_failed_sigpaths, + )?; // calculate the minimum number of hashes based on desired threshold let threshold_hashes: u64 = { diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index f28dcb85..6fb1c932 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -18,14 +18,20 @@ use crate::utils::{ }; pub fn fastmultigather( - query_filepath: camino::Utf8PathBuf, - against_filepath: camino::Utf8PathBuf, + query_filepath: String, + against_filepath: String, threshold_bp: usize, scaled: usize, selection: &Selection, + allow_failed_sigpaths: bool, ) -> Result<()> { // load the list of query paths - let query_collection = load_collection(&query_filepath, selection, ReportType::Query)?; + let query_collection = load_collection( + &query_filepath, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; println!("Loaded {} sig paths in querylist", query_collection.len()); let threshold_hashes: u64 = { @@ -42,7 +48,12 @@ pub fn fastmultigather( println!("threshold overlap: {} {}", threshold_hashes, threshold_bp); // Load all the against sketches - let against_collection = load_collection(&against_filepath, selection, ReportType::Against)?; + let against_collection = load_collection( + &against_filepath, + selection, + ReportType::Against, + allow_failed_sigpaths, + )?; // load actual signatures let mut sketchlist: Vec = vec![]; diff --git a/src/index.rs b/src/index.rs index 23675614..6fa7e898 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,52 +1,25 @@ -use camino::Utf8PathBuf as PathBuf; -use sourmash::collection::Collection; use sourmash::index::revindex::RevIndex; -use sourmash::manifest::Manifest; use sourmash::prelude::*; -use sourmash::storage::{FSStorage, InnerStorage, ZipStorage}; use std::path::Path; -use crate::utils::load_sketchlist_filenames; +use crate::utils::{load_collection, ReportType}; pub fn index>( - siglist: PathBuf, - manifest: Option
, - selection: Selection, + siglist: String, + selection: &Selection, output: P, save_paths: bool, colors: bool, + allow_failed_sigpaths: bool, ) -> Result<(), Box> { println!("Loading siglist"); - let manifest = if let Some(m) = manifest { - let rdr = std::fs::OpenOptions::new().read(true).open(m.as_ref())?; - Some(Manifest::from_reader(rdr)?) - } else { - None - }; - - let collection = if matches!(&siglist.extension(), Some("zip")) { - if let Some(m) = manifest { - let storage = ZipStorage::from_file(siglist)?; - Collection::new(m, InnerStorage::new(storage)) - } else { - Collection::from_zipfile(siglist)? - } - } else { - let manifest = manifest.unwrap_or_else(|| { - let sig_paths: Vec<_> = load_sketchlist_filenames(&siglist) - .unwrap_or_else(|_| panic!("Error loading siglist")) - .into_iter() - .map(|v| PathBuf::from_path_buf(v).unwrap()) - .collect(); - sig_paths.as_slice().into() - }); - let storage = FSStorage::builder() - .fullpath("".into()) - .subdir("".into()) - .build(); - Collection::new(manifest, InnerStorage::new(storage)) - }; + let collection = load_collection( + &siglist, + selection, + ReportType::General, + allow_failed_sigpaths, + )?; RevIndex::create( output.as_ref(), diff --git a/src/lib.rs b/src/lib.rs index d2365afe..ab178564 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,7 +18,7 @@ mod mastiff_manysearch; mod multisearch; mod pairwise; -use camino::Utf8PathBuf; +use camino::Utf8PathBuf as PathBuf; #[pyfunction] fn do_manysearch( @@ -30,20 +30,20 @@ fn do_manysearch( moltype: String, output_path: Option, ) -> anyhow::Result { - let queryfile_path: camino::Utf8PathBuf = querylist_path.clone().into(); - let againstfile_path: camino::Utf8PathBuf = siglist_path.clone().into(); + let againstfile_path: PathBuf = siglist_path.clone().into(); let selection = build_selection(ksize, scaled, &moltype); eprintln!("selection scaled: {:?}", selection.scaled()); + let allow_failed_sigpaths = true; // if siglist_path is revindex, run mastiff_manysearch; otherwise run manysearch if is_revindex_database(&againstfile_path) { - // if is_revindex_database(siglist_path.as_ref()) { match mastiff_manysearch::mastiff_manysearch( - queryfile_path, + querylist_path, againstfile_path, &selection, threshold, output_path, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -53,11 +53,12 @@ fn do_manysearch( } } else { match manysearch::manysearch( - &queryfile_path, - &againstfile_path, + querylist_path, + siglist_path, &selection, threshold, output_path, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -79,20 +80,19 @@ fn do_fastgather( output_path_prefetch: Option, output_path_gather: Option, ) -> anyhow::Result { - let queryfile_path: camino::Utf8PathBuf = query_filename.into(); - let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); - let selection = build_selection(ksize, scaled, &moltype); + let allow_failed_sigpaths = true; match fastgather::fastgather( - &queryfile_path, - &againstfile_path, + query_filename, + siglist_path, threshold_bp, ksize, scaled, &selection, output_path_prefetch, output_path_gather, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -112,18 +112,19 @@ fn do_fastmultigather( moltype: String, output_path: Option, ) -> anyhow::Result { - let queryfile_path: camino::Utf8PathBuf = query_filenames.into(); - let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); + let againstfile_path: camino::Utf8PathBuf = siglist_path.clone().into(); let selection = build_selection(ksize, scaled, &moltype); + let allow_failed_sigpaths 
= true; // if a siglist path is a revindex, run mastiff_manygather. If not, run multigather if is_revindex_database(&againstfile_path) { match mastiff_manygather::mastiff_manygather( - queryfile_path, + query_filenames, againstfile_path, &selection, threshold_bp, output_path, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -133,11 +134,12 @@ fn do_fastmultigather( } } else { match fastmultigather::fastmultigather( - queryfile_path, - againstfile_path, + query_filenames, + siglist_path, threshold_bp, scaled, &selection, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -176,9 +178,15 @@ fn do_index( colors: bool, ) -> anyhow::Result { let selection = build_selection(ksize, scaled, &moltype); - let location = camino::Utf8PathBuf::from(siglist); - let manifest = None; - match index::index(location, manifest, selection, output, save_paths, colors) { + let allow_failed_sigpaths = false; + match index::index( + siglist, + &selection, + output, + save_paths, + colors, + allow_failed_sigpaths, + ) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); @@ -189,7 +197,7 @@ fn do_index( #[pyfunction] fn do_check(index: String, quick: bool) -> anyhow::Result { - let idx: camino::Utf8PathBuf = index.into(); + let idx: PathBuf = index.into(); match check::check(idx, quick) { Ok(_) => Ok(0), Err(e) => { @@ -209,15 +217,16 @@ fn do_multisearch( moltype: String, output_path: Option, ) -> anyhow::Result { - let queryfile_path: camino::Utf8PathBuf = querylist_path.into(); - let againstfile_path: camino::Utf8PathBuf = siglist_path.into(); let selection = build_selection(ksize, scaled, &moltype); + let allow_failed_sigpaths = true; + match multisearch::multisearch( - &queryfile_path, - &againstfile_path, + querylist_path, + siglist_path, threshold, &selection, output_path, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -236,9 +245,15 @@ fn do_pairwise( moltype: String, output_path: Option, ) -> anyhow::Result { - let queryfile_path: camino::Utf8PathBuf = siglist_path.into(); let selection = build_selection(ksize, scaled, &moltype); - match pairwise::pairwise(&queryfile_path, threshold, &selection, output_path) { + let allow_failed_sigpaths = true; + match pairwise::pairwise( + siglist_path, + threshold, + &selection, + output_path, + allow_failed_sigpaths, + ) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); diff --git a/src/manysearch.rs b/src/manysearch.rs index 53f25e3c..fa7c4db8 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -11,25 +11,30 @@ use sourmash::selection::Selection; use sourmash::signature::SigsTrait; use sourmash::sketch::Sketch; use sourmash::storage::SigStore; -use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{csvwriter_thread, load_collection, ReportType, SearchResult}; -pub fn manysearch>( - query_filepath: &camino::Utf8PathBuf, - against_filepath: &camino::Utf8PathBuf, +pub fn manysearch( + query_filepath: String, + against_filepath: String, selection: &Selection, threshold: f64, - output: Option
, + output: Option, + allow_failed_sigpaths: bool, ) -> Result<()> { // Read in list of query paths. eprintln!("Reading queries from: '{}'", query_filepath); // Load all query sigs into memory at once. - let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; + let query_collection = load_collection( + &query_filepath, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; // load actual signatures let mut query_sketchlist: Vec = vec![]; @@ -46,13 +51,18 @@ pub fn manysearch>( } // Load all _paths_, not signatures, into memory. - let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; + let against_collection = load_collection( + &against_filepath, + selection, + ReportType::Against, + allow_failed_sigpaths, + )?; // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); // & spawn a thread that is dedicated to printing to a buffered output - let thrd = csvwriter_thread(recv, output.as_ref()); + let thrd = csvwriter_thread(recv, output); // // Main loop: iterate (in parallel) over all search signature paths, diff --git a/src/manysketch.rs b/src/manysketch.rs index 67ff25ae..1fbe399d 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -3,10 +3,10 @@ use anyhow::{anyhow, Result}; use rayon::prelude::*; use crate::utils::{load_fasta_fromfile, sigwriter, Params, ZipMessage}; +use camino::Utf8Path as Path; use needletail::parse_fastx_file; use sourmash::cmd::ComputeParameters; use sourmash::signature::Signature; -use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -117,7 +117,7 @@ fn build_siginfo( let sig = Signature::builder() .hash_function("0.murmur64") .name(Some(name.to_string())) - .filename(Some(filename.to_string_lossy().into_owned())) + .filename(Some(filename.to_string())) .signatures(template) .build(); sigs.push(sig); @@ -128,12 +128,12 @@ fn build_siginfo( (sigs, params_vec) } -pub fn manysketch + Sync>( - filelist: P, +pub fn manysketch( + filelist: String, param_str: String, output: String, ) -> Result<(), Box> { - let fileinfo = match load_fasta_fromfile(&filelist) { + let fileinfo = match load_fasta_fromfile(filelist) { Ok(result) => result, Err(e) => bail!("Could not load fromfile csv. Underlying error: {}", e), }; @@ -206,7 +206,7 @@ pub fn manysketch + Sync>( let mut reader = match parse_fastx_file(filename) { Ok(r) => r, Err(err) => { - eprintln!("Error opening file {}: {:?}", filename.display(), err); + eprintln!("Error opening file {}: {:?}", filename, err); let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); return None; } diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 6a80a647..6755d54a 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -6,7 +6,7 @@ use sourmash::sketch::Sketch; use std::path::Path; // use camino::Utf8Path as Path; -// use camino::Utf8PathBuf as PathBuf; +use camino::Utf8PathBuf as PathBuf; use sourmash::prelude::*; @@ -21,11 +21,12 @@ use std::io::{BufWriter, Write}; use crate::utils::{is_revindex_database, load_collection, ReportType}; pub fn mastiff_manygather>( - queries_file: camino::Utf8PathBuf, - index: camino::Utf8PathBuf, + queries_file: String, + index: PathBuf, selection: &Selection, threshold_bp: usize, output: Option
, + allow_failed_sigpaths: bool, ) -> Result<(), Box> { if !is_revindex_database(&index) { bail!("'{}' is not a valid RevIndex database", index); @@ -34,7 +35,12 @@ pub fn mastiff_manygather>( let db = RevIndex::open(index, true)?; println!("Loaded DB"); - let query_collection = load_collection(&queries_file, selection, ReportType::Query)?; + let query_collection = load_collection( + &queries_file, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 24fff34e..4ef68830 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -1,11 +1,11 @@ /// mastiff_manysearch: mastiff-indexed version of manysearch. use anyhow::Result; +use camino::Utf8PathBuf as PathBuf; use rayon::prelude::*; use sourmash::index::revindex::{RevIndex, RevIndexOps}; use sourmash::selection::Selection; use sourmash::signature::SigsTrait; use sourmash::sketch::Sketch; -use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -13,12 +13,13 @@ use crate::utils::{ csvwriter_thread, is_revindex_database, load_collection, ReportType, SearchResult, }; -pub fn mastiff_manysearch>( - queries_path: camino::Utf8PathBuf, - index: camino::Utf8PathBuf, +pub fn mastiff_manysearch( + queries_path: String, + index: PathBuf, selection: &Selection, minimum_containment: f64, - output: Option
, + output: Option, + allow_failed_sigpaths: bool, ) -> Result<(), Box> { if !is_revindex_database(&index) { bail!("'{}' is not a valid RevIndex database", index); @@ -28,7 +29,12 @@ pub fn mastiff_manysearch>( println!("Loaded DB"); // Load query paths - let query_collection = load_collection(&queries_path, selection, ReportType::Query)?; + let query_collection = load_collection( + &queries_path, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; // if query_paths is empty, exit with error. this should already happen via load_collection, i think? if query_collection.len() == 0 { @@ -39,7 +45,7 @@ pub fn mastiff_manysearch>( let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); // & spawn a thread that is dedicated to printing to a buffered output - let thrd = csvwriter_thread(recv, output.as_ref()); + let thrd = csvwriter_thread(recv, output); // // Main loop: iterate (in parallel) over all search signature paths, diff --git a/src/multisearch.rs b/src/multisearch.rs index 0d772276..ad28c6ab 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -19,20 +19,31 @@ use crate::utils::{load_collection, load_mh_with_name_and_md5, ReportType}; /// database once. pub fn multisearch( - query_filepath: &camino::Utf8PathBuf, - against_filepath: &camino::Utf8PathBuf, + query_filepath: String, + against_filepath: String, threshold: f64, selection: &Selection, output: Option, + allow_failed_sigpaths: bool, ) -> Result<(), Box> { // Load all queries into memory at once. - let query_collection = load_collection(query_filepath, selection, ReportType::Query)?; + let query_collection = load_collection( + &query_filepath, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; let queries = load_mh_with_name_and_md5(query_collection, &selection, ReportType::Query).unwrap(); // Load all against sketches into memory at once. - let against_collection = load_collection(against_filepath, selection, ReportType::Against)?; + let against_collection = load_collection( + &against_filepath, + selection, + ReportType::Against, + allow_failed_sigpaths, + )?; let against = load_mh_with_name_and_md5(against_collection, &selection, ReportType::Against).unwrap(); diff --git a/src/pairwise.rs b/src/pairwise.rs index b6713d41..c714f9c8 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -19,21 +19,27 @@ use sourmash::selection::Selection; /// Note: this function loads all _signatures_ into memory. pub fn pairwise>( - sigpath: &camino::Utf8PathBuf, + siglist: String, threshold: f64, selection: &Selection, output: Option
, + allow_failed_sigpaths: bool, ) -> Result<(), Box> { // Load all sigs into memory at once. - let collection = load_collection(sigpath, selection, ReportType::Pairwise)?; + let collection = load_collection( + &siglist, + selection, + ReportType::General, + allow_failed_sigpaths, + )?; if collection.len() <= 1 { bail!( "Pairwise requires two or more sketches. Check input: '{:?}'", - &sigpath + &siglist ) } - let sketches = load_mh_with_name_and_md5(collection, &selection, ReportType::Pairwise).unwrap(); + let sketches = load_mh_with_name_and_md5(collection, &selection, ReportType::General).unwrap(); // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index eeb8f76a..432d7630 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -89,22 +89,22 @@ def test_index_missing_siglist(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error loading siglist' in captured.err + assert 'Error: No such file or directory' in captured.err -def test_index_bad_siglist(runtmp, capfd): - # test index with a bad siglist (.sig.gz file instead of pathlist) +def test_index_sig(runtmp, capfd): + # test index with a .sig.gz file instead of pathlist + # (should work now) sig2 = get_test_data('2.fa.sig.gz') output = runtmp.output('out.db') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', sig2, + runtmp.sourmash('scripts', 'index', sig2, '-o', output) captured = capfd.readouterr() print(captured.err) - assert 'Error loading siglist' in captured.err print(runtmp.last_result.err) + assert 'index is done' in runtmp.last_result.err def test_index_bad_siglist_2(runtmp, capfd): @@ -124,28 +124,25 @@ def test_index_bad_siglist_2(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error processing "no-exist"' in captured.err + assert "WARNING: could not load sketches from path 'no-exist'" in captured.err def test_index_empty_siglist(runtmp, capfd): - ## TODO: index:: do not write output if no signatures to write? - # OR, warn user? - # test empty siglist file siglist = runtmp.output('db-sigs.txt') output = runtmp.output('out.db') make_file_list(siglist, []) # empty - # with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', siglist, + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'index', siglist, '-o', output) captured = capfd.readouterr() - assert os.path.exists(output) # do we want an empty file, or no file? + assert not os.path.exists(output) # do we want an empty file, or no file? print(runtmp.last_result.out) print(runtmp.last_result.err) print(captured.err) - # assert "No signatures to index loaded, exiting." in captured.err + assert "Error: Signatures failed to load. Exiting." 
in captured.err def test_index_nomatch_sig_in_siglist(runtmp, capfd): diff --git a/src/utils.rs b/src/utils.rs index 5cd49de1..b31ba31b 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -7,7 +7,9 @@ use sourmash::selection::Select; use std::fs::{create_dir_all, File}; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::panic; -use std::path::{Path, PathBuf}; +// use std::path::{Path, PathBuf}; +use camino::Utf8Path as Path; +use camino::Utf8PathBuf as PathBuf; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -126,34 +128,32 @@ pub fn write_prefetch( } /// Load a list of filenames from a file. Exits on bad lines. -pub fn load_sketchlist_filenames>(sketchlist_filename: &P) -> Result> { - let sketchlist_file = BufReader::new(File::open(sketchlist_filename)?); - - let mut sketchlist_filenames: Vec = Vec::new(); - for line in sketchlist_file.lines() { - let line = match line { - Ok(v) => v, - Err(_) => { - return { - let filename = sketchlist_filename.as_ref().display(); - let msg = format!("invalid line in fromfile '{}'", filename); - Err(anyhow!(msg)) - } - } - }; - - if !line.is_empty() { - let mut path = PathBuf::new(); - path.push(line); - sketchlist_filenames.push(path); - } - } - Ok(sketchlist_filenames) -} - -pub fn load_fasta_fromfile>( - sketchlist_filename: &P, -) -> Result> { +// pub fn load_sketchlist_filenames>(sketchlist_filename: &P) -> Result> { +// let sketchlist_file = BufReader::new(File::open(sketchlist_filename)?); + +// let mut sketchlist_filenames: Vec = Vec::new(); +// for line in sketchlist_file.lines() { +// let line = match line { +// Ok(v) => v, +// Err(_) => { +// return { +// let filename = sketchlist_filename.as_ref().display(); +// let msg = format!("invalid line in fromfile '{}'", filename); +// Err(anyhow!(msg)) +// } +// } +// }; + +// if !line.is_empty() { +// let mut path = PathBuf::new(); +// path.push(line); +// sketchlist_filenames.push(path); +// } +// } +// Ok(sketchlist_filenames) +// } + +pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result> { let mut rdr = csv::Reader::from_path(sketchlist_filename)?; // Check for right header @@ -313,7 +313,7 @@ pub fn load_sketches_above_threshold( pub enum ReportType { Query, Against, - Pairwise, + General, } impl std::fmt::Display for ReportType { @@ -321,19 +321,22 @@ impl std::fmt::Display for ReportType { let description = match self { ReportType::Query => "query", ReportType::Against => "search", - ReportType::Pairwise => "signature", + ReportType::General => "signature", }; write!(f, "{}", description) } } pub fn load_collection( - sigpath: &camino::Utf8PathBuf, + siglist: &String, selection: &Selection, report_type: ReportType, + allow_failed: bool, ) -> Result { + let sigpath = PathBuf::from(siglist); + if !sigpath.exists() { - bail!("No such file or directory: '{}'", sigpath); + bail!("No such file or directory: '{}'", &sigpath); } let mut n_failed = 0; @@ -344,7 +347,7 @@ pub fn load_collection( } } else { // if pathlist is just a signature path, load it into a collection - match Signature::from_path(sigpath) { + match Signature::from_path(&sigpath) { Ok(signatures) => { // Load the collection from the signature match Collection::from_sigs(signatures) { @@ -358,7 +361,7 @@ pub fn load_collection( } // if not, try to load file as list of sig paths Err(_) => { - // // using core fn doesn't allow us to ignore failed paths; I reimplement loading here to allow + // using core fn doesn't allow us to ignore failed paths; I reimplement loading here to allow let sketchlist_file 
= BufReader::new(File::open(sigpath)?); let records: Vec = sketchlist_file .lines() @@ -400,7 +403,7 @@ pub fn load_collection( let n_total = collection.len(); let selected = collection.select(selection)?; let n_skipped = n_total - selected.len(); - report_on_collection_loading(&selected, n_skipped, n_failed, report_type)?; + report_on_collection_loading(&selected, n_skipped, n_failed, report_type, allow_failed)?; Ok(selected) } @@ -431,12 +434,16 @@ pub fn report_on_collection_loading( skipped_paths: usize, failed_paths: usize, report_type: ReportType, + allow_failed: bool, ) -> Result<()> { if failed_paths > 0 { eprintln!( "WARNING: {} {} paths failed to load. See error messages above.", failed_paths, report_type ); + if !allow_failed { + bail! {"Signatures failed to load. Exiting."} + } } if skipped_paths > 0 { eprintln!( @@ -715,21 +722,21 @@ pub fn make_manifest_row( n_hashes: sketch.size(), with_abundance: abund, name: sig.name().to_string(), - // filename: filename.display().to_string(), - filename: filename.to_str().unwrap().to_string(), + filename: filename.to_string(), } } -pub fn open_stdout_or_file>(output: Option
) -> Box { +pub fn open_stdout_or_file(output: Option) -> Box { // if output is a file, use open_output_file if let Some(path) = output { - Box::new(open_output_file(&path)) + let outpath: PathBuf = path.into(); + Box::new(open_output_file(&outpath)) } else { Box::new(std::io::stdout()) } } -pub fn open_output_file>(output: &P) -> BufWriter { +pub fn open_output_file(output: &PathBuf) -> BufWriter { let file = File::create(output).unwrap_or_else(|e| { eprintln!("Error creating output file: {:?}", e); std::process::exit(1); @@ -772,7 +779,10 @@ pub fn sigwriter + Send + 'static>( output: String, ) -> std::thread::JoinHandle> { std::thread::spawn(move || -> Result<()> { - let file_writer = open_output_file(&output); + // cast output as pathbuf + let outpath: PathBuf = output.into(); + + let file_writer = open_output_file(&outpath); let options = zip::write::FileOptions::default() .compression_method(zip::CompressionMethod::Stored) @@ -845,16 +855,15 @@ pub trait ResultType { fn format_fields(&self) -> Vec; } -pub fn csvwriter_thread( +pub fn csvwriter_thread( recv: std::sync::mpsc::Receiver, - output: Option
, + output: Option, ) -> std::thread::JoinHandle<()> where T: ResultType, - P: Clone + std::convert::AsRef, { // create output file - let out = open_stdout_or_file(output.as_ref()); + let out = open_stdout_or_file(output); // spawn a thread that is dedicated to printing to a buffered output std::thread::spawn(move || { let mut writer = out; From f769aee8ea297b4b4cf4b01b0d28f49b2326db9d Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 19:55:42 -0800 Subject: [PATCH 31/40] unify more code --- src/fastgather.rs | 14 +--- src/fastmultigather.rs | 120 +++++++++++---------------- src/lib.rs | 1 - src/mastiff_manysearch.rs | 5 -- src/python/tests/test_multigather.py | 6 +- src/python/tests/test_multisearch.py | 10 ++- src/python/tests/test_search.py | 13 +-- src/utils.rs | 67 +++++---------- 8 files changed, 90 insertions(+), 146 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index 280afd54..ab9a55a8 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -15,7 +15,6 @@ pub fn fastgather( query_filepath: String, against_filepath: String, threshold_bp: usize, - ksize: u8, scaled: usize, selection: &Selection, gather_output: Option, @@ -36,9 +35,8 @@ pub fn fastgather( ) } // get single query sig and minhash - let query_sig = query_collection.sig_for_dataset(0)?; // need original md5sum, etc - // downsample - let query_sig_ds = query_sig.clone().select(selection)?; + let query_sig = query_collection.sig_for_dataset(0)?; // need this for original md5sum + let query_sig_ds = query_sig.clone().select(selection)?; // downsample let query_mh = match query_sig_ds.minhash() { Some(query_mh) => query_mh, None => { @@ -98,12 +96,6 @@ pub fn fastgather( } // run the gather! - consume_query_by_gather( - query_sig.clone(), - matchlist, - threshold_hashes, - gather_output, - ) - .ok(); + consume_query_by_gather(query_sig, matchlist, threshold_hashes, gather_output).ok(); Ok(()) } diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 6fb1c932..1283fcc8 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -3,8 +3,6 @@ use anyhow::Result; use rayon::prelude::*; use sourmash::selection::Selection; -use sourmash::sketch::Sketch; -use sourmash::storage::SigStore; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -14,7 +12,8 @@ use std::collections::BinaryHeap; use camino::Utf8Path; use crate::utils::{ - consume_query_by_gather, load_collection, write_prefetch, PrefetchResult, ReportType, + consume_query_by_gather, load_collection, load_mh_with_name_and_md5, write_prefetch, + PrefetchResult, ReportType, }; pub fn fastmultigather( @@ -32,7 +31,6 @@ pub fn fastmultigather( ReportType::Query, allow_failed_sigpaths, )?; - println!("Loaded {} sig paths in querylist", query_collection.len()); let threshold_hashes: u64 = { let x = threshold_bp / scaled; @@ -55,90 +53,70 @@ pub fn fastmultigather( allow_failed_sigpaths, )?; // load actual signatures - let mut sketchlist: Vec = vec![]; - - for (idx, record) in against_collection.iter() { - if let Ok(sig) = against_collection.sig_for_dataset(idx) - // .unwrap() - // .select(&selection) // if we select here, we downsample and the md5sum changes! - // ...which means we would lose the original md5sum that is used in the standard gather results. 
- { - sketchlist.push(sig); - } else { - eprintln!("Failed to load 'against' record: {}", record.name()); - } - } + let against = + load_mh_with_name_and_md5(against_collection, &selection, ReportType::Against).unwrap(); // Iterate over all queries => do prefetch and gather! let processed_queries = AtomicUsize::new(0); let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - query_collection.par_iter().for_each(|(idx, record)| { - // increment counter of # of queries. q: could we instead use the index from par_iter()? + query_collection.par_iter().for_each(|(_idx, record)| { + // increment counter of # of queries. q: could we instead use the _idx from par_iter(), or will it vary based on thread? let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); // Load query sig - match query_collection.sig_for_dataset(idx) { + match query_collection.sig_from_record(record) { Ok(query_sig) => { let prefix = query_sig.name(); let location = Utf8Path::new(&prefix).file_name().unwrap(); - for sketch in query_sig.iter() { - // Access query MinHash - if let Sketch::MinHash(query) = sketch { - let matchlist: BinaryHeap = sketchlist - .iter() - .filter_map(|sm| { - let mut mm = None; - // Access against MinHash - if let Some(sketch) = sm.sketches().get(0) { - if let Sketch::MinHash(against_sketch) = sketch { - if let Ok(overlap) = - // downsample here to just get downsampled mh and avoid changing md5sum - against_sketch.count_common(&query, true) - { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: sm.name(), - md5sum: sm.md5sum().clone(), - minhash: against_sketch.clone(), - overlap, - }; - mm = Some(result); - } - } - } + if let Some(query_mh) = query_sig.minhash() { + let matchlist: BinaryHeap = against + .iter() + .filter_map(|(against_mh, against_name, against_md5)| { + let mut mm = None; + if let Ok(overlap) = against_mh.count_common(&query_mh, false) { + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: against_name.clone(), + md5sum: against_md5.clone(), + minhash: against_mh.clone(), + overlap, + }; + mm = Some(result); } - mm - }) - .collect(); - if !matchlist.is_empty() { - let prefetch_output = format!("{}.prefetch.csv", location); - let gather_output = format!("{}.gather.csv", location); - - // Save initial list of matches to prefetch output - write_prefetch(&query_sig, Some(prefetch_output), &matchlist).ok(); - - // Now, do the gather! - consume_query_by_gather( - query_sig.clone(), - matchlist, - threshold_hashes, - Some(gather_output), - ) - .ok(); - } else { - println!("No matches to '{}'", location); - } + } + mm + }) + .collect(); + if !matchlist.is_empty() { + let prefetch_output = format!("{}.prefetch.csv", location); + let gather_output = format!("{}.gather.csv", location); + + // Save initial list of matches to prefetch output + write_prefetch(&query_sig, Some(prefetch_output), &matchlist).ok(); + + // Now, do the gather! + consume_query_by_gather( + query_sig.clone(), + matchlist, + threshold_hashes, + Some(gather_output), + ) + .ok(); } else { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - record.internal_location() - ); - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + println!("No matches to '{}'", location); } + } else { + // different warning here? Could not load sig from record?? 
+ eprintln!( + "WARNING: no compatible sketches in path '{}'", + record.internal_location() + ); + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } Err(_) => { + // different warning here? Could not load sig from record?? eprintln!( "WARNING: no compatible sketches in path '{}'", record.internal_location() diff --git a/src/lib.rs b/src/lib.rs index ab178564..8d427f41 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -87,7 +87,6 @@ fn do_fastgather( query_filename, siglist_path, threshold_bp, - ksize, scaled, &selection, output_path_prefetch, diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 4ef68830..5bf716a8 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -36,11 +36,6 @@ pub fn mastiff_manysearch( allow_failed_sigpaths, )?; - // if query_paths is empty, exit with error. this should already happen via load_collection, i think? - if query_collection.len() == 0 { - bail!("No query signatures loaded, exiting."); - } - // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index 7ec636ba..1f96eed1 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -424,6 +424,7 @@ def test_bad_against_2(runtmp, capfd, zip_query): def test_empty_against(runtmp, capfd): + # like fastgather - exit gracefully. # test bad 'against' file - in this case, an empty one query = get_test_data('SRR606249.sig.gz') query_list = runtmp.output('query.txt') @@ -432,15 +433,14 @@ def test_empty_against(runtmp, capfd): against_list = runtmp.output('against.txt') make_file_list(against_list, []) - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, + runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, '-s', '100000') captured = capfd.readouterr() print(captured.err) assert "Sketch loading error: No such file or directory" in captured.err - assert "Error: No search signatures loaded, exiting." in captured.err + assert "No search signatures loaded, exiting." in captured.err @pytest.mark.parametrize('zip_against', [False, True]) diff --git a/src/python/tests/test_multisearch.py b/src/python/tests/test_multisearch.py index ff2136b0..a7b09931 100644 --- a/src/python/tests/test_multisearch.py +++ b/src/python/tests/test_multisearch.py @@ -300,8 +300,8 @@ def test_bad_against(runtmp, capfd): assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err -def test_empty_query(runtmp): - # test with an empty query list +def test_empty_query(runtmp, capfd): + # test with an empty query list - fail gracefully query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -314,11 +314,13 @@ def test_empty_query(runtmp): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, + runtmp.sourmash('scripts', 'multisearch', query_list, against_list, '-o', output) print(runtmp.last_result.err) + captured = capfd.readouterr() + print(captured.err) + assert "No query signatures loaded, exiting." 
in captured.err # @CTB diff --git a/src/python/tests/test_search.py b/src/python/tests/test_search.py index 2ab45907..c6c49c95 100644 --- a/src/python/tests/test_search.py +++ b/src/python/tests/test_search.py @@ -371,8 +371,8 @@ def test_nomatch_against(runtmp, capfd): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, + # with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) captured = capfd.readouterr() @@ -403,7 +403,7 @@ def test_bad_against(runtmp, capfd): @pytest.mark.parametrize("indexed", [False, True]) -def test_empty_query(runtmp, indexed): +def test_empty_query(runtmp, indexed, capfd): # test with an empty query list query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -420,11 +420,14 @@ def test_empty_query(runtmp, indexed): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, + # with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) print(runtmp.last_result.err) + captured = capfd.readouterr() + print(captured.err) + assert "No query signatures loaded, exiting." in captured.err @pytest.mark.parametrize("indexed", [False, True]) diff --git a/src/utils.rs b/src/utils.rs index b31ba31b..33e9b3dc 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -127,32 +127,6 @@ pub fn write_prefetch( Ok(()) } -/// Load a list of filenames from a file. Exits on bad lines. -// pub fn load_sketchlist_filenames>(sketchlist_filename: &P) -> Result> { -// let sketchlist_file = BufReader::new(File::open(sketchlist_filename)?); - -// let mut sketchlist_filenames: Vec = Vec::new(); -// for line in sketchlist_file.lines() { -// let line = match line { -// Ok(v) => v, -// Err(_) => { -// return { -// let filename = sketchlist_filename.as_ref().display(); -// let msg = format!("invalid line in fromfile '{}'", filename); -// Err(anyhow!(msg)) -// } -// } -// }; - -// if !line.is_empty() { -// let mut path = PathBuf::new(); -// path.push(line); -// sketchlist_filenames.push(path); -// } -// } -// Ok(sketchlist_filenames) -// } - pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result> { let mut rdr = csv::Reader::from_path(sketchlist_filename)?; @@ -265,27 +239,27 @@ pub fn load_sketches_above_threshold( let mut results = Vec::new(); // Load against into memory if let Ok(against_sig) = against_collection.sig_from_record(against_record) { - for sketch in against_sig.sketches() { - if let Sketch::MinHash(against_mh) = sketch { - // currently downsampling here to avoid changing md5sum - if let Ok(overlap) = against_mh.count_common(query, true) { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: against_record.name().to_string(), - md5sum: against_mh.md5sum(), - minhash: against_mh.clone(), - overlap, - }; - results.push(result); - } + if let Some(against_mh) = against_sig.minhash() { + // if let Some(against_mh) = against_sig.select(&selection).unwrap().minhash() { // downsample via select + // currently downsampling here to avoid changing md5sum + if let Ok(overlap) = against_mh.count_common(query, true) { + //downsample via count_common + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: against_record.name().to_string(), + md5sum: against_mh.md5sum(), + minhash: 
against_mh.clone(), + overlap, + }; + results.push(result); } - } else { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - against_sig.filename() - ); - let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } + } else { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + against_sig.filename() + ); + let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } else { // this shouldn't happen here anymore -- likely would happen at load_collection @@ -454,7 +428,8 @@ pub fn report_on_collection_loading( // Validate sketches if collection.is_empty() { - bail!("No {} signatures loaded, exiting.", report_type); + eprintln!("No {} signatures loaded, exiting.", report_type); + return Ok(()); } eprintln!("Loaded {} {} signature(s)", collection.len(), report_type); Ok(()) From 8d7781c8b7966e1f8c041ec9ce4e2b4d96b6eb10 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 20:07:12 -0800 Subject: [PATCH 32/40] rm unused save_paths option --- src/fastmultigather.rs | 12 ++++++------ src/index.rs | 1 - src/lib.rs | 10 +--------- src/python/sourmash_plugin_branchwater/__init__.py | 3 --- 4 files changed, 7 insertions(+), 19 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 1283fcc8..dc10e897 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -9,7 +9,7 @@ use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; -use camino::Utf8Path; +use camino::Utf8Path as PathBuf; use crate::utils::{ consume_query_by_gather, load_collection, load_mh_with_name_and_md5, write_prefetch, @@ -24,7 +24,7 @@ pub fn fastmultigather( selection: &Selection, allow_failed_sigpaths: bool, ) -> Result<()> { - // load the list of query paths + // load query collection let query_collection = load_collection( &query_filepath, selection, @@ -45,14 +45,14 @@ pub fn fastmultigather( println!("threshold overlap: {} {}", threshold_hashes, threshold_bp); - // Load all the against sketches + // load against collection let against_collection = load_collection( &against_filepath, selection, ReportType::Against, allow_failed_sigpaths, )?; - // load actual signatures + // load against sketches into memory, downsampling on the way let against = load_mh_with_name_and_md5(against_collection, &selection, ReportType::Against).unwrap(); @@ -64,11 +64,11 @@ pub fn fastmultigather( query_collection.par_iter().for_each(|(_idx, record)| { // increment counter of # of queries. q: could we instead use the _idx from par_iter(), or will it vary based on thread? 
let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); - // Load query sig + // Load query sig (downsampling happens here) match query_collection.sig_from_record(record) { Ok(query_sig) => { let prefix = query_sig.name(); - let location = Utf8Path::new(&prefix).file_name().unwrap(); + let location = PathBuf::new(&prefix).file_name().unwrap(); if let Some(query_mh) = query_sig.minhash() { let matchlist: BinaryHeap = against .iter() diff --git a/src/index.rs b/src/index.rs index 6fa7e898..0ed0a230 100644 --- a/src/index.rs +++ b/src/index.rs @@ -8,7 +8,6 @@ pub fn index>( siglist: String, selection: &Selection, output: P, - save_paths: bool, colors: bool, allow_failed_sigpaths: bool, ) -> Result<(), Box> { diff --git a/src/lib.rs b/src/lib.rs index 8d427f41..16df3ae4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -173,19 +173,11 @@ fn do_index( scaled: usize, moltype: String, output: String, - save_paths: bool, colors: bool, ) -> anyhow::Result { let selection = build_selection(ksize, scaled, &moltype); let allow_failed_sigpaths = false; - match index::index( - siglist, - &selection, - output, - save_paths, - colors, - allow_failed_sigpaths, - ) { + match index::index(siglist, &selection, output, colors, allow_failed_sigpaths) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 6aff91b3..def6fec7 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -189,8 +189,6 @@ def __init__(self, p): help='scaled factor at which to do comparisons') p.add_argument('-m', '--moltype', default='DNA', choices = ["DNA", "protein", "dayhoff", "hp"], help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)') - p.add_argument('--save-paths', action='store_true', - help='save paths to signatures into index. Default: save full sig into index') p.add_argument('-c', '--cores', default=0, type=int, help='number of cores to use (default is all available)') @@ -208,7 +206,6 @@ def main(self, args): args.scaled, args.moltype, args.output, - args.save_paths, False) # colors - currently must be false? if status == 0: notify(f"...index is done! results in '{args.output}'") From b6ebc7a18dbfe57b21ac255c822bf78ba49fdd90 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 20:23:16 -0800 Subject: [PATCH 33/40] use updated mh loading --- src/manysearch.rs | 92 ++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 57 deletions(-) diff --git a/src/manysearch.rs b/src/manysearch.rs index fa7c4db8..099b451d 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -15,7 +15,9 @@ use sourmash::storage::SigStore; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use crate::utils::{csvwriter_thread, load_collection, ReportType, SearchResult}; +use crate::utils::{ + csvwriter_thread, load_collection, load_mh_with_name_and_md5, ReportType, SearchResult, +}; pub fn manysearch( query_filepath: String, @@ -35,22 +37,11 @@ pub fn manysearch( ReportType::Query, allow_failed_sigpaths, )?; - // load actual signatures - let mut query_sketchlist: Vec = vec![]; - - for (idx, record) in query_collection.iter() { - if let Ok(sig) = query_collection - .sig_for_dataset(idx) - .unwrap() - .select(&selection) - { - query_sketchlist.push(sig); - } else { - eprintln!("Failed to load 'query' sig: {}", record.name()); - } - } + // load query sketches into memory, downsampling on the way + let query_sketchlist = + load_mh_with_name_and_md5(query_collection, &selection, ReportType::Query).unwrap(); - // Load all _paths_, not signatures, into memory. + // Against: Load all _paths_, not signatures, into memory. let against_collection = load_collection( &against_filepath, selection, @@ -76,7 +67,7 @@ pub fn manysearch( let send = against_collection .par_iter() - .filter_map(|(idx, record)| { + .filter_map(|(_idx, record)| { let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); if i % 1000 == 0 { eprintln!("Processed {} search sigs", i); @@ -84,54 +75,41 @@ pub fn manysearch( let mut results = vec![]; - match against_collection.sig_for_dataset(idx) { - Ok(against_sig) => match against_sig.select(selection) { - Ok(against_sig) => { - for sketch in against_sig.iter() { - if let Sketch::MinHash(against_mh) = sketch { - for query_sig in query_sketchlist.iter() { - for sketch in query_sig.iter() { - if let Sketch::MinHash(query_mh) = sketch { - let overlap = - query_mh.count_common(&against_mh, false).unwrap() - as f64; - let query_size = query_mh.size() as f64; - let target_size = against_mh.size() as f64; - - let containment_query_in_target = overlap / query_size; - let containment_in_target = overlap / target_size; - let max_containment = containment_query_in_target - .max(containment_in_target); - let jaccard = - overlap / (target_size + query_size - overlap); - - if containment_query_in_target > threshold { - results.push(SearchResult { - query_name: query_sig.name(), - query_md5: query_mh.md5sum(), - match_name: against_sig.name(), - containment: containment_query_in_target, - intersect_hashes: overlap as usize, - match_md5: Some(against_mh.md5sum()), - jaccard: Some(jaccard), - max_containment: Some(max_containment), - }); - } - } - } - } + // against downsampling happens here + match against_collection.sig_from_record(record) { + Ok(against_sig) => { + if let Some(against_mh) = against_sig.minhash() { + for (query_mh, query_name, query_md5) in query_sketchlist.iter() { + let overlap = query_mh.count_common(&against_mh, false).unwrap() as f64; + let query_size = query_mh.size() as f64; + let target_size = against_mh.size() as f64; + let containment_query_in_target = overlap / query_size; + let containment_in_target = overlap / target_size; + let max_containment = + 
containment_query_in_target.max(containment_in_target); + let jaccard = overlap / (target_size + query_size - overlap); + + if containment_query_in_target > threshold { + results.push(SearchResult { + query_name: query_name.clone(), + query_md5: query_md5.clone(), + match_name: against_sig.name(), + containment: containment_query_in_target, + intersect_hashes: overlap as usize, + match_md5: Some(against_sig.md5sum()), + jaccard: Some(jaccard), + max_containment: Some(max_containment), + }); } } - } - Err(err) => { - eprintln!("Sketch selection error: {}", err); + } else { eprintln!( "WARNING: no compatible sketches in path '{}'", record.internal_location() ); let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - }, + } Err(err) => { eprintln!("Sketch loading error: {}", err); eprintln!( From a463ac88c5597887c59465e4f7f0f68d0a346e42 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 21:13:46 -0800 Subject: [PATCH 34/40] standardize indexed writing using local struct for now --- src/manysearch.rs | 7 +-- src/mastiff_manygather.rs | 115 ++++++++++++++------------------------ src/mastiff_manysearch.rs | 3 +- src/utils.rs | 40 ++++++++++++- 4 files changed, 84 insertions(+), 81 deletions(-) diff --git a/src/manysearch.rs b/src/manysearch.rs index 099b451d..767bb7d2 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -27,17 +27,14 @@ pub fn manysearch( output: Option, allow_failed_sigpaths: bool, ) -> Result<()> { - // Read in list of query paths. - eprintln!("Reading queries from: '{}'", query_filepath); - - // Load all query sigs into memory at once. + // Load query collection let query_collection = load_collection( &query_filepath, selection, ReportType::Query, allow_failed_sigpaths, )?; - // load query sketches into memory, downsampling on the way + // load all query sketches into memory, downsampling on the way let query_sketchlist = load_mh_with_name_and_md5(query_collection, &selection, ReportType::Query).unwrap(); diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 6755d54a..48eb61c8 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -1,31 +1,22 @@ /// mastiff_manygather: mastiff-indexed version of fastmultigather. use anyhow::Result; -use rayon::prelude::*; - -use sourmash::sketch::Sketch; -use std::path::Path; - -// use camino::Utf8Path as Path; use camino::Utf8PathBuf as PathBuf; - -use sourmash::prelude::*; - +use rayon::prelude::*; use sourmash::index::revindex::{RevIndex, RevIndexOps}; - +use sourmash::prelude::*; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use std::fs::File; -use std::io::{BufWriter, Write}; - -use crate::utils::{is_revindex_database, load_collection, ReportType}; +use crate::utils::{ + csvwriter_thread, is_revindex_database, load_collection, BranchwaterGatherResult, ReportType, +}; -pub fn mastiff_manygather>( +pub fn mastiff_manygather( queries_file: String, index: PathBuf, selection: &Selection, threshold_bp: usize, - output: Option
, + output: Option, allow_failed_sigpaths: bool, ) -> Result<(), Box> { if !is_revindex_database(&index) { @@ -43,29 +34,12 @@ pub fn mastiff_manygather>( )?; // set up a multi-producer, single-consumer channel. - let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); + // let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); + let (send, recv) = + std::sync::mpsc::sync_channel::(rayon::current_num_threads()); // & spawn a thread that is dedicated to printing to a buffered output - let out: Box = match output { - Some(path) => Box::new(BufWriter::new(File::create(path).unwrap())), - None => Box::new(std::io::stdout()), - }; - let thrd = std::thread::spawn(move || { - let mut writer = BufWriter::new(out); - writeln!( - &mut writer, - "query_name,query_md5,match_name,match_md5,f_match_query,intersect_bp" - ) - .unwrap(); - for (query, query_md5, m, m_md5, f_match_query, intersect_bp) in recv.into_iter() { - writeln!( - &mut writer, - "\"{}\",{},\"{}\",{},{},{}", - query, query_md5, m, m_md5, f_match_query, intersect_bp - ) - .ok(); - } - }); + let thrd = csvwriter_thread(recv, output); // // Main loop: iterate (in parallel) over all search signature paths, @@ -79,47 +53,44 @@ pub fn mastiff_manygather>( let send = query_collection .par_iter() - .filter_map(|(idx, record)| { + .filter_map(|(_idx, record)| { let threshold = threshold_bp / selection.scaled()? as usize; - match query_collection.sig_for_dataset(idx) { - // match query_collection.sig_from_record(record) { // to be added in core + // query downsampling happens here + match query_collection.sig_from_record(record) { Ok(query_sig) => { let mut results = vec![]; - let mut found_compatible_sketch = false; - for sketch in query_sig.iter() { - if let Sketch::MinHash(query) = sketch { - found_compatible_sketch = true; - // Gather! - let (counter, query_colors, hash_to_color) = - db.prepare_gather_counters(&query); - - let matches = db.gather( - counter, - query_colors, - hash_to_color, - threshold, - &query, - Some(selection.clone()), - ); - // extract results - if let Ok(matches) = matches { - for match_ in &matches { - results.push(( - query_sig.name().clone(), - query.md5sum().clone(), - match_.name().clone(), - match_.md5().clone(), - match_.f_match(), // f_match_query - match_.intersect_bp(), - )); // intersect_bp - } - } else { - eprintln!("Error gathering matches: {:?}", matches.err()); + if let Some(query_mh) = query_sig.minhash() { + // Gather! 
+ let (counter, query_colors, hash_to_color) = + db.prepare_gather_counters(&query_mh); + + let matches = db.gather( + counter, + query_colors, + hash_to_color, + threshold, + &query_mh, + Some(selection.clone()), + ); + // extract results TODO: ADD REST OF GATHER COLUMNS + if let Ok(matches) = matches { + for match_ in &matches { + results.push( + (BranchwaterGatherResult { + query_name: query_sig.name().clone(), + query_md5: query_sig.md5sum().clone(), + match_name: match_.name().clone(), + match_md5: match_.md5().clone(), + f_match_query: match_.f_match(), + intersect_bp: match_.intersect_bp(), + }), + ); } + } else { + eprintln!("Error gathering matches: {:?}", matches.err()); } - } - if !found_compatible_sketch { + } else { eprintln!( "WARNING: no compatible sketches in path '{}'", query_sig.filename() diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 5bf716a8..c2ddc8b4 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -10,7 +10,8 @@ use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{ - csvwriter_thread, is_revindex_database, load_collection, ReportType, SearchResult, + csvwriter_thread, is_revindex_database, load_collection, open_stdout_or_file, ReportType, + SearchResult, }; pub fn mastiff_manysearch( diff --git a/src/utils.rs b/src/utils.rs index 33e9b3dc..1cbdc41a 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -4,12 +4,11 @@ use sourmash::encodings::HashFunctions; use sourmash::manifest::Manifest; use sourmash::selection::Select; +use camino::Utf8Path as Path; +use camino::Utf8PathBuf as PathBuf; use std::fs::{create_dir_all, File}; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::panic; -// use std::path::{Path, PathBuf}; -use camino::Utf8Path as Path; -use camino::Utf8PathBuf as PathBuf; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -199,6 +198,8 @@ pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result Vec<&'static str> { + vec![ + "query_name", + "query_md5", + "match_name", + "match_md5", + "f_match_query", + "intersect_bp", + ] + } + fn format_fields(&self) -> Vec { + vec![ + format!("\"{}\"", self.query_name), // Wrap query_name with quotes + self.query_md5.clone(), + format!("\"{}\"", self.match_name), // Wrap match_name with quotes + self.match_md5.clone(), + self.f_match_query.to_string(), + self.intersect_bp.to_string(), + ] + } +} + pub struct ManifestRow { pub md5: String, pub md5short: String, From c7b865b458046ddb799f0cb303298253f4f9f0ad Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 21:35:07 -0800 Subject: [PATCH 35/40] clean up sketch loading and file opening/writing --- src/mastiff_manygather.rs | 1 - src/mastiff_manysearch.rs | 64 ++++++++++++++++++--------------------- src/multisearch.rs | 61 ++++++++++++++----------------------- src/pairwise.rs | 64 +++++++++++++++------------------------ src/utils.rs | 38 +++++++++++++++++++++++ 5 files changed, 115 insertions(+), 113 deletions(-) diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 48eb61c8..8f19307e 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -34,7 +34,6 @@ pub fn mastiff_manygather( )?; // set up a multi-producer, single-consumer channel. 
- // let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index c2ddc8b4..cc5efd57 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -5,13 +5,11 @@ use rayon::prelude::*; use sourmash::index::revindex::{RevIndex, RevIndexOps}; use sourmash::selection::Selection; use sourmash::signature::SigsTrait; -use sourmash::sketch::Sketch; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{ - csvwriter_thread, is_revindex_database, load_collection, open_stdout_or_file, ReportType, - SearchResult, + csvwriter_thread, is_revindex_database, load_collection, ReportType, SearchResult, }; pub fn mastiff_manysearch( @@ -55,46 +53,44 @@ pub fn mastiff_manysearch( let send_result = query_collection .par_iter() - .filter_map(|(idx, record)| { + .filter_map(|(_idx, record)| { let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); if i % 1000 == 0 { eprintln!("Processed {} search sigs", i); } let mut results = vec![]; - match query_collection.sig_for_dataset(idx) { + // query downsample happens here + match query_collection.sig_from_record(record) { Ok(query_sig) => { - for sketch in query_sig.iter() { - if let Sketch::MinHash(query_mh) = sketch { - // let location = query_sig.filename(); - let query_size = query_mh.size(); - let counter = db.counter_for_query(&query_mh); - let matches = - db.matches_from_counter(counter, minimum_containment as usize); - - // filter the matches for containment - for (path, overlap) in matches { - let containment = overlap as f64 / query_size as f64; - if containment >= minimum_containment { - results.push(SearchResult { - query_name: query_sig.name(), - query_md5: query_sig.md5sum(), - match_name: path.clone(), - containment, - intersect_hashes: overlap, - match_md5: None, - jaccard: None, - max_containment: None, - }); - } + if let Some(query_mh) = query_sig.minhash() { + let query_size = query_mh.size(); + let counter = db.counter_for_query(&query_mh); + let matches = + db.matches_from_counter(counter, minimum_containment as usize); + + // filter the matches for containment + for (path, overlap) in matches { + let containment = overlap as f64 / query_size as f64; + if containment >= minimum_containment { + results.push(SearchResult { + query_name: query_sig.name(), + query_md5: query_sig.md5sum(), + match_name: path.clone(), + containment, + intersect_hashes: overlap, + match_md5: None, + jaccard: None, + max_containment: None, + }); } - } else { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - query_sig.filename() - ); - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } + } else { + eprintln!( + "WARNING: no compatible sketches in path '{}'", + query_sig.filename() + ); + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } if results.is_empty() { None diff --git a/src/multisearch.rs b/src/multisearch.rs index ad28c6ab..0ecb6fdf 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -1,17 +1,14 @@ -use anyhow::Result; /// multisearch: massively parallel in-memory sketch search. 
+use anyhow::Result; use rayon::prelude::*; - -use std::fs::File; -use std::io::{BufWriter, Write}; - -use std::sync::atomic; -use std::sync::atomic::AtomicUsize; - use sourmash::selection::Selection; use sourmash::signature::SigsTrait; +use std::sync::atomic; +use std::sync::atomic::AtomicUsize; -use crate::utils::{load_collection, load_mh_with_name_and_md5, ReportType}; +use crate::utils::{ + csvwriter_thread, load_collection, load_mh_with_name_and_md5, MultiSearchResult, ReportType, +}; /// Search many queries against a list of signatures. /// @@ -48,25 +45,11 @@ pub fn multisearch( load_mh_with_name_and_md5(against_collection, &selection, ReportType::Against).unwrap(); // set up a multi-producer, single-consumer channel. - let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); - - // & spawn a thread that is dedicated to printing to a buffered output - let out: Box = match output { - Some(path) => Box::new(BufWriter::new(File::create(path).unwrap())), - None => Box::new(std::io::stdout()), - }; - let thrd = std::thread::spawn(move || { - let mut writer = BufWriter::new(out); - writeln!(&mut writer, "query_name,query_md5,match_name,match_md5,containment,max_containment,jaccard,intersect_hashes").unwrap(); - for (query, query_md5, m, m_md5, cont, max_cont, jaccard, overlap) in recv.into_iter() { - writeln!( - &mut writer, - "\"{}\",{},\"{}\",{},{},{},{},{}", - query, query_md5, m, m_md5, cont, max_cont, jaccard, overlap - ) - .ok(); - } - }); + let (send, recv) = + std::sync::mpsc::sync_channel::(rayon::current_num_threads()); + + // // & spawn a thread that is dedicated to printing to a buffered output + let thrd = csvwriter_thread(recv, output); // // Main loop: iterate (in parallel) over all search signature paths, @@ -98,16 +81,18 @@ pub fn multisearch( let jaccard = overlap / (target_size + query_size - overlap); if containment_query_in_target > threshold { - results.push(( - query_name.clone(), - query_md5.clone(), - against_name.clone(), - against_md5.clone(), - containment_query_in_target, - max_containment, - jaccard, - overlap, - )) + results.push( + (MultiSearchResult { + query_name: query_name.clone(), + query_md5: query_md5.clone(), + match_name: against_name.clone(), + match_md5: against_md5.clone(), + containment: containment_query_in_target, + max_containment: max_containment, + jaccard: jaccard, + intersect_hashes: overlap, + }), + ) } } if results.is_empty() { diff --git a/src/pairwise.rs b/src/pairwise.rs index c714f9c8..fa61e0de 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -1,28 +1,24 @@ -use anyhow::Result; /// pairwise: massively parallel in-memory pairwise comparisons. +use anyhow::Result; use rayon::prelude::*; - -use std::fs::File; -use std::io::{BufWriter, Write}; -use std::path::Path; - use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use sourmash::signature::SigsTrait; - -use crate::utils::{load_collection, load_mh_with_name_and_md5, ReportType}; +use crate::utils::{ + csvwriter_thread, load_collection, load_mh_with_name_and_md5, MultiSearchResult, ReportType, +}; use sourmash::selection::Selection; +use sourmash::signature::SigsTrait; /// Perform pairwise comparisons of all signatures in a list. /// /// Note: this function loads all _signatures_ into memory. -pub fn pairwise>( +pub fn pairwise( siglist: String, threshold: f64, selection: &Selection, - output: Option
, + output: Option, allow_failed_sigpaths: bool, ) -> Result<(), Box> { // Load all sigs into memory at once. @@ -42,25 +38,11 @@ pub fn pairwise>( let sketches = load_mh_with_name_and_md5(collection, &selection, ReportType::General).unwrap(); // set up a multi-producer, single-consumer channel. - let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); - - // & spawn a thread that is dedicated to printing to a buffered output - let out: Box = match output { - Some(path) => Box::new(BufWriter::new(File::create(path).unwrap())), - None => Box::new(std::io::stdout()), - }; - let thrd = std::thread::spawn(move || { - let mut writer = BufWriter::new(out); - writeln!(&mut writer, "query_name,query_md5,match_name,match_md5,containment,max_containment,jaccard,intersect_hashes").unwrap(); - for (query, query_md5, m, m_md5, cont, max_cont, jaccard, overlap) in recv.into_iter() { - writeln!( - &mut writer, - "\"{}\",{},\"{}\",{},{},{},{},{}", - query, query_md5, m, m_md5, cont, max_cont, jaccard, overlap - ) - .ok(); - } - }); + let (send, recv) = + std::sync::mpsc::sync_channel::(rayon::current_num_threads()); + + // // & spawn a thread that is dedicated to printing to a buffered output + let thrd = csvwriter_thread(recv, output); // // Main loop: iterate (in parallel) over all signature, @@ -83,16 +65,18 @@ pub fn pairwise>( let jaccard = overlap / (query1_size + query2_size - overlap); if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold { - send.send(( - q1_name.clone(), - q1_md5.clone(), - q2_name.clone(), - q2_md5.clone(), - containment_q1_in_q2, - max_containment, - jaccard, - overlap, - )) + send.send( + (MultiSearchResult { + query_name: q1_name.clone(), + query_md5: q1_md5.clone(), + match_name: q2_name.clone(), + match_md5: q2_md5.clone(), + containment: containment_q1_in_q2, + max_containment: max_containment, + jaccard: jaccard, + intersect_hashes: overlap, + }), + ) .unwrap(); } diff --git a/src/utils.rs b/src/utils.rs index 1cbdc41a..00405718 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -644,6 +644,44 @@ impl ResultType for BranchwaterGatherResult { } } +pub struct MultiSearchResult { + pub query_name: String, + pub query_md5: String, + pub match_name: String, + pub match_md5: String, + pub containment: f64, + pub max_containment: f64, + pub jaccard: f64, + pub intersect_hashes: f64, +} + +impl ResultType for MultiSearchResult { + fn header_fields() -> Vec<&'static str> { + vec![ + "query_name", + "query_md5", + "match_name", + "match_md5", + "containment", + "max_containment", + "jaccard", + "intersect_hashes", + ] + } + + fn format_fields(&self) -> Vec { + vec![ + format!("\"{}\"", self.query_name), // Wrap query_name with quotes + self.query_md5.clone(), + format!("\"{}\"", self.match_name), // Wrap match_name with quotes + self.match_md5.clone(), + self.containment.to_string(), + self.max_containment.to_string(), + self.jaccard.to_string(), + self.intersect_hashes.to_string(), + ] + } +} pub struct ManifestRow { pub md5: String, pub md5short: String, From 14af130add291b35f5e0f4914de52a7d0e24ae68 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Thu, 1 Feb 2024 21:44:36 -0800 Subject: [PATCH 36/40] apply clippy suggestions --- src/fastgather.rs | 3 +-- src/fastmultigather.rs | 6 +++--- src/index.rs | 2 +- src/manysearch.rs | 13 ++++--------- src/manysketch.rs | 2 +- src/mastiff_manygather.rs | 22 ++++++++++------------ src/mastiff_manysearch.rs | 2 +- src/multisearch.rs | 28 +++++++++++++--------------- src/pairwise.rs | 26 ++++++++++++-------------- src/utils.rs | 21 ++++++--------------- 10 files changed, 52 insertions(+), 73 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index ab9a55a8..f70b11e3 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -68,8 +68,7 @@ pub fn fastgather( ); // load a set of sketches, filtering for those with overlaps > threshold - let result = - load_sketches_above_threshold(against_collection, &selection, &query_mh, threshold_hashes)?; + let result = load_sketches_above_threshold(against_collection, query_mh, threshold_hashes)?; let matchlist = result.0; let skipped_paths = result.1; let failed_paths = result.2; diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index dc10e897..a91c33d5 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -54,7 +54,7 @@ pub fn fastmultigather( )?; // load against sketches into memory, downsampling on the way let against = - load_mh_with_name_and_md5(against_collection, &selection, ReportType::Against).unwrap(); + load_mh_with_name_and_md5(against_collection, selection, ReportType::Against).unwrap(); // Iterate over all queries => do prefetch and gather! let processed_queries = AtomicUsize::new(0); @@ -73,8 +73,8 @@ pub fn fastmultigather( let matchlist: BinaryHeap = against .iter() .filter_map(|(against_mh, against_name, against_md5)| { - let mut mm = None; - if let Ok(overlap) = against_mh.count_common(&query_mh, false) { + let mut mm: Option = None; + if let Ok(overlap) = against_mh.count_common(query_mh, false) { if overlap >= threshold_hashes { let result = PrefetchResult { name: against_name.clone(), diff --git a/src/index.rs b/src/index.rs index 0ed0a230..3747e6f5 100644 --- a/src/index.rs +++ b/src/index.rs @@ -22,7 +22,7 @@ pub fn index>( RevIndex::create( output.as_ref(), - collection.select(&selection)?.try_into()?, + collection.select(selection)?.try_into()?, colors, )?; diff --git a/src/manysearch.rs b/src/manysearch.rs index 767bb7d2..1ffd7c28 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -5,19 +5,14 @@ /// database once. use anyhow::Result; use rayon::prelude::*; - -use sourmash::prelude::Select; -use sourmash::selection::Selection; -use sourmash::signature::SigsTrait; -use sourmash::sketch::Sketch; -use sourmash::storage::SigStore; - use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{ csvwriter_thread, load_collection, load_mh_with_name_and_md5, ReportType, SearchResult, }; +use sourmash::selection::Selection; +use sourmash::signature::SigsTrait; pub fn manysearch( query_filepath: String, @@ -36,7 +31,7 @@ pub fn manysearch( )?; // load all query sketches into memory, downsampling on the way let query_sketchlist = - load_mh_with_name_and_md5(query_collection, &selection, ReportType::Query).unwrap(); + load_mh_with_name_and_md5(query_collection, selection, ReportType::Query).unwrap(); // Against: Load all _paths_, not signatures, into memory. 
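
A note on the clippy cleanups in this patch: most of them drop a redundant borrow on bindings that are already references, for example count_common(&query_mh, false) becoming count_common(query_mh, false), and &selection becoming selection. A minimal, self-contained sketch of the pattern that clippy::needless_borrow flags; the function names below are illustrative and not from this crate:

    fn count_common(a: &[u64], b: &[u64]) -> usize {
        a.iter().filter(|x| b.contains(*x)).count()
    }

    fn demo(query: &[u64], against: &[u64]) -> usize {
        // query is already a reference; writing count_common(&query, against)
        // would build a &&[u64] that only compiles thanks to deref coercion,
        // so clippy suggests passing it directly.
        count_common(query, against)
    }
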
let against_collection = load_collection( @@ -77,7 +72,7 @@ pub fn manysearch( Ok(against_sig) => { if let Some(against_mh) = against_sig.minhash() { for (query_mh, query_name, query_md5) in query_sketchlist.iter() { - let overlap = query_mh.count_common(&against_mh, false).unwrap() as f64; + let overlap = query_mh.count_common(against_mh, false).unwrap() as f64; let query_size = query_mh.size() as f64; let target_size = against_mh.size() as f64; let containment_query_in_target = overlap / query_size; diff --git a/src/manysketch.rs b/src/manysketch.rs index 1fbe399d..a4eefc7a 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -158,7 +158,7 @@ pub fn manysketch( let send = std::sync::Arc::new(send); // & spawn a thread that is dedicated to printing to a buffered output - let thrd = sigwriter::<&str>(recv, output); + let thrd = sigwriter(recv, output); // parse param string into params_vec, print error if fail let param_result = parse_params_str(param_str); diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 8f19307e..cb794735 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -62,29 +62,27 @@ pub fn mastiff_manygather( if let Some(query_mh) = query_sig.minhash() { // Gather! let (counter, query_colors, hash_to_color) = - db.prepare_gather_counters(&query_mh); + db.prepare_gather_counters(query_mh); let matches = db.gather( counter, query_colors, hash_to_color, threshold, - &query_mh, + query_mh, Some(selection.clone()), ); // extract results TODO: ADD REST OF GATHER COLUMNS if let Ok(matches) = matches { for match_ in &matches { - results.push( - (BranchwaterGatherResult { - query_name: query_sig.name().clone(), - query_md5: query_sig.md5sum().clone(), - match_name: match_.name().clone(), - match_md5: match_.md5().clone(), - f_match_query: match_.f_match(), - intersect_bp: match_.intersect_bp(), - }), - ); + results.push(BranchwaterGatherResult { + query_name: query_sig.name().clone(), + query_md5: query_sig.md5sum().clone(), + match_name: match_.name().clone(), + match_md5: match_.md5().clone(), + f_match_query: match_.f_match(), + intersect_bp: match_.intersect_bp(), + }); } } else { eprintln!("Error gathering matches: {:?}", matches.err()); diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index cc5efd57..0b7c163d 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -65,7 +65,7 @@ pub fn mastiff_manysearch( Ok(query_sig) => { if let Some(query_mh) = query_sig.minhash() { let query_size = query_mh.size(); - let counter = db.counter_for_query(&query_mh); + let counter = db.counter_for_query(query_mh); let matches = db.matches_from_counter(counter, minimum_containment as usize); diff --git a/src/multisearch.rs b/src/multisearch.rs index 0ecb6fdf..9e2fe6d7 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -32,7 +32,7 @@ pub fn multisearch( allow_failed_sigpaths, )?; let queries = - load_mh_with_name_and_md5(query_collection, &selection, ReportType::Query).unwrap(); + load_mh_with_name_and_md5(query_collection, selection, ReportType::Query).unwrap(); // Load all against sketches into memory at once. let against_collection = load_collection( @@ -42,7 +42,7 @@ pub fn multisearch( allow_failed_sigpaths, )?; let against = - load_mh_with_name_and_md5(against_collection, &selection, ReportType::Against).unwrap(); + load_mh_with_name_and_md5(against_collection, selection, ReportType::Against).unwrap(); // set up a multi-producer, single-consumer channel. 
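
For context on the channel setup just below: every command in this series uses the same writer-thread pattern, a bounded std::sync::mpsc channel sized to the rayon thread count plus a single dedicated consumer thread that owns the output while the worker threads only send results. A minimal sketch with a plain String payload and stdout standing in for the crate's result structs and CSV writer:

    use std::sync::mpsc::sync_channel;
    use std::thread;

    fn main() {
        let (send, recv) = sync_channel::<String>(4);
        // The consumer drains the channel until every sender has been dropped.
        let writer = thread::spawn(move || {
            for line in recv {
                println!("{line}");
            }
        });
        for i in 0..10 {
            send.send(format!("result {i}")).unwrap();
        }
        drop(send); // close the channel so the writer thread can exit
        writer.join().unwrap();
    }
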
let (send, recv) = @@ -70,7 +70,7 @@ pub fn multisearch( eprintln!("Processed {} comparisons", i); } - let overlap = query_mh.count_common(&against_mh, false).unwrap() as f64; + let overlap = query_mh.count_common(against_mh, false).unwrap() as f64; // use downsampled sizes let query_size = query_mh.size() as f64; let target_size = against_mh.size() as f64; @@ -81,18 +81,16 @@ pub fn multisearch( let jaccard = overlap / (target_size + query_size - overlap); if containment_query_in_target > threshold { - results.push( - (MultiSearchResult { - query_name: query_name.clone(), - query_md5: query_md5.clone(), - match_name: against_name.clone(), - match_md5: against_md5.clone(), - containment: containment_query_in_target, - max_containment: max_containment, - jaccard: jaccard, - intersect_hashes: overlap, - }), - ) + results.push(MultiSearchResult { + query_name: query_name.clone(), + query_md5: query_md5.clone(), + match_name: against_name.clone(), + match_md5: against_md5.clone(), + containment: containment_query_in_target, + max_containment, + jaccard, + intersect_hashes: overlap, + }) } } if results.is_empty() { diff --git a/src/pairwise.rs b/src/pairwise.rs index fa61e0de..fbfac585 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -35,7 +35,7 @@ pub fn pairwise( &siglist ) } - let sketches = load_mh_with_name_and_md5(collection, &selection, ReportType::General).unwrap(); + let sketches = load_mh_with_name_and_md5(collection, selection, ReportType::General).unwrap(); // set up a multi-producer, single-consumer channel. let (send, recv) = @@ -54,7 +54,7 @@ pub fn pairwise( .par_iter() .enumerate() .for_each(|(idx, (q1, q1_name, q1_md5))| { - for (j, (q2, q2_name, q2_md5)) in sketches.iter().enumerate().skip(idx + 1) { + for (q2, q2_name, q2_md5) in sketches.iter().skip(idx + 1) { let overlap = q1.count_common(q2, false).unwrap() as f64; let query1_size = q1.size() as f64; let query2_size = q2.size() as f64; @@ -65,18 +65,16 @@ pub fn pairwise( let jaccard = overlap / (query1_size + query2_size - overlap); if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold { - send.send( - (MultiSearchResult { - query_name: q1_name.clone(), - query_md5: q1_md5.clone(), - match_name: q2_name.clone(), - match_md5: q2_md5.clone(), - containment: containment_q1_in_q2, - max_containment: max_containment, - jaccard: jaccard, - intersect_hashes: overlap, - }), - ) + send.send(MultiSearchResult { + query_name: q1_name.clone(), + query_md5: q1_md5.clone(), + match_name: q2_name.clone(), + match_md5: q2_md5.clone(), + containment: containment_q1_in_q2, + max_containment, + jaccard, + intersect_hashes: overlap, + }) .unwrap(); } diff --git a/src/utils.rs b/src/utils.rs index 00405718..06f1ddb6 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -4,26 +4,22 @@ use sourmash::encodings::HashFunctions; use sourmash::manifest::Manifest; use sourmash::selection::Select; +use anyhow::{anyhow, Result}; use camino::Utf8Path as Path; use camino::Utf8PathBuf as PathBuf; +use std::cmp::{Ordering, PartialOrd}; +use std::collections::BinaryHeap; use std::fs::{create_dir_all, File}; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::panic; - use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use std::collections::BinaryHeap; - -use anyhow::{anyhow, Result}; -use std::cmp::{Ordering, PartialOrd}; - use sourmash::collection::Collection; use sourmash::manifest::Record; use sourmash::selection::Selection; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::KmerMinHash; 
-use sourmash::sketch::Sketch; use sourmash::storage::{FSStorage, InnerStorage, SigStore}; /// Structure to hold overlap information from comparisons. @@ -208,7 +204,7 @@ pub fn load_mh_with_name_and_md5( let mut sketchinfo: Vec<(KmerMinHash, String, String)> = Vec::new(); for (_idx, record) in collection.iter() { if let Ok(sig) = collection.sig_from_record(record) { - if let Some(ds_mh) = sig.clone().select(&selection)?.minhash().cloned() { + if let Some(ds_mh) = sig.clone().select(selection)?.minhash().cloned() { sketchinfo.push((ds_mh, record.name().to_string(), record.md5().to_string())); } } else { @@ -227,7 +223,6 @@ pub fn load_mh_with_name_and_md5( pub fn load_sketches_above_threshold( against_collection: Collection, - selection: &Selection, query: &KmerMinHash, threshold_hashes: u64, ) -> Result<(BinaryHeap, usize, usize)> { @@ -475,11 +470,7 @@ pub fn consume_query_by_gather( // let location = query.location; let location = query.filename(); // this is different (original fasta filename) than query.location was (sig name)!! - let sketches = query.sketches(); - let orig_query_mh = match sketches.get(0) { - Some(Sketch::MinHash(mh)) => Ok(mh), - _ => Err(anyhow::anyhow!("No MinHash found")), - }?; + let orig_query_mh = query.minhash().unwrap(); let mut query_mh = orig_query_mh.clone(); let mut last_hashes = orig_query_mh.size(); @@ -821,7 +812,7 @@ pub enum ZipMessage { WriteManifest, } -pub fn sigwriter + Send + 'static>( +pub fn sigwriter( recv: std::sync::mpsc::Receiver, output: String, ) -> std::thread::JoinHandle> { From 13c96d12329a40ae6b736ae5aa697fa0aabdbadb Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 2 Feb 2024 09:53:16 -0800 Subject: [PATCH 37/40] add back SmallSignature and use --- src/fastmultigather.rs | 10 +++---- src/manysearch.rs | 11 ++++---- src/multisearch.rs | 18 ++++++------ src/pairwise.rs | 63 ++++++++++++++++++++---------------------- src/utils.rs | 21 +++++++++++--- 5 files changed, 67 insertions(+), 56 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index a91c33d5..4f61e89c 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -72,14 +72,14 @@ pub fn fastmultigather( if let Some(query_mh) = query_sig.minhash() { let matchlist: BinaryHeap = against .iter() - .filter_map(|(against_mh, against_name, against_md5)| { + .filter_map(|(against)| { let mut mm: Option = None; - if let Ok(overlap) = against_mh.count_common(query_mh, false) { + if let Ok(overlap) = against.minhash.count_common(query_mh, false) { if overlap >= threshold_hashes { let result = PrefetchResult { - name: against_name.clone(), - md5sum: against_md5.clone(), - minhash: against_mh.clone(), + name: against.name.clone(), + md5sum: against.md5sum.clone(), + minhash: against.minhash.clone(), overlap, }; mm = Some(result); diff --git a/src/manysearch.rs b/src/manysearch.rs index 1ffd7c28..b1546c05 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -71,9 +71,10 @@ pub fn manysearch( match against_collection.sig_from_record(record) { Ok(against_sig) => { if let Some(against_mh) = against_sig.minhash() { - for (query_mh, query_name, query_md5) in query_sketchlist.iter() { - let overlap = query_mh.count_common(against_mh, false).unwrap() as f64; - let query_size = query_mh.size() as f64; + for query in query_sketchlist.iter() { + let overlap = + query.minhash.count_common(against_mh, false).unwrap() as f64; + let query_size = query.minhash.size() as f64; let target_size = against_mh.size() as f64; let containment_query_in_target = 
overlap / query_size; let containment_in_target = overlap / target_size; @@ -83,8 +84,8 @@ pub fn manysearch( if containment_query_in_target > threshold { results.push(SearchResult { - query_name: query_name.clone(), - query_md5: query_md5.clone(), + query_name: query.name.clone(), + query_md5: query.md5sum.clone(), match_name: against_sig.name(), containment: containment_query_in_target, intersect_hashes: overlap as usize, diff --git a/src/multisearch.rs b/src/multisearch.rs index 9e2fe6d7..55ccc54c 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -61,19 +61,19 @@ pub fn multisearch( let send = against .par_iter() - .filter_map(|(against_mh, against_name, against_md5)| { + .filter_map(|(against)| { let mut results = vec![]; // search for matches & save containment. - for (query_mh, query_name, query_md5) in queries.iter() { + for query in queries.iter() { let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); if i % 100000 == 0 { eprintln!("Processed {} comparisons", i); } - let overlap = query_mh.count_common(against_mh, false).unwrap() as f64; + let overlap = query.minhash.count_common(&against.minhash, false).unwrap() as f64; // use downsampled sizes - let query_size = query_mh.size() as f64; - let target_size = against_mh.size() as f64; + let query_size = query.minhash.size() as f64; + let target_size = against.minhash.size() as f64; let containment_query_in_target = overlap / query_size; let containment_in_target = overlap / target_size; @@ -82,10 +82,10 @@ pub fn multisearch( if containment_query_in_target > threshold { results.push(MultiSearchResult { - query_name: query_name.clone(), - query_md5: query_md5.clone(), - match_name: against_name.clone(), - match_md5: against_md5.clone(), + query_name: query.name.clone(), + query_md5: query.md5sum.clone(), + match_name: against.name.clone(), + match_md5: against.md5sum.clone(), containment: containment_query_in_target, max_containment, jaccard, diff --git a/src/pairwise.rs b/src/pairwise.rs index fbfac585..e206bf2b 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -50,40 +50,37 @@ pub fn pairwise( let processed_cmp = AtomicUsize::new(0); - sketches - .par_iter() - .enumerate() - .for_each(|(idx, (q1, q1_name, q1_md5))| { - for (q2, q2_name, q2_md5) in sketches.iter().skip(idx + 1) { - let overlap = q1.count_common(q2, false).unwrap() as f64; - let query1_size = q1.size() as f64; - let query2_size = q2.size() as f64; - - let containment_q1_in_q2 = overlap / query1_size; - let containment_q2_in_q1 = overlap / query2_size; - let max_containment = containment_q1_in_q2.max(containment_q2_in_q1); - let jaccard = overlap / (query1_size + query2_size - overlap); - - if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold { - send.send(MultiSearchResult { - query_name: q1_name.clone(), - query_md5: q1_md5.clone(), - match_name: q2_name.clone(), - match_md5: q2_md5.clone(), - containment: containment_q1_in_q2, - max_containment, - jaccard, - intersect_hashes: overlap, - }) - .unwrap(); - } - - let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); - if i % 100000 == 0 { - eprintln!("Processed {} comparisons", i); - } + sketches.par_iter().enumerate().for_each(|(idx, query)| { + for against in sketches.iter().skip(idx + 1) { + let overlap = query.minhash.count_common(&against.minhash, false).unwrap() as f64; + let query1_size = query.minhash.size() as f64; + let query2_size = against.minhash.size() as f64; + + let containment_q1_in_q2 = overlap / query1_size; + let containment_q2_in_q1 = overlap / 
query2_size; + let max_containment = containment_q1_in_q2.max(containment_q2_in_q1); + let jaccard = overlap / (query1_size + query2_size - overlap); + + if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold { + send.send(MultiSearchResult { + query_name: query.name.clone(), + query_md5: query.md5sum.clone(), + match_name: against.name.clone(), + match_md5: against.md5sum.clone(), + containment: containment_q1_in_q2, + max_containment, + jaccard, + intersect_hashes: overlap, + }) + .unwrap(); } - }); + + let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); + if i % 100000 == 0 { + eprintln!("Processed {} comparisons", i); + } + } + }); // do some cleanup and error handling - drop(send); // close the channel diff --git a/src/utils.rs b/src/utils.rs index 06f1ddb6..8e664412 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -22,6 +22,14 @@ use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::KmerMinHash; use sourmash::storage::{FSStorage, InnerStorage, SigStore}; +/// Track a name/minhash. + +pub struct SmallSignature { + pub location: String, + pub name: String, + pub md5sum: String, + pub minhash: KmerMinHash, +} /// Structure to hold overlap information from comparisons. pub struct PrefetchResult { @@ -200,12 +208,17 @@ pub fn load_mh_with_name_and_md5( collection: Collection, selection: &Selection, report_type: ReportType, -) -> Result> { - let mut sketchinfo: Vec<(KmerMinHash, String, String)> = Vec::new(); +) -> Result> { + let mut sketchinfo: Vec = Vec::new(); for (_idx, record) in collection.iter() { if let Ok(sig) = collection.sig_from_record(record) { - if let Some(ds_mh) = sig.clone().select(selection)?.minhash().cloned() { - sketchinfo.push((ds_mh, record.name().to_string(), record.md5().to_string())); + if let Some(minhash) = sig.clone().select(selection)?.minhash().cloned() { + sketchinfo.push(SmallSignature { + location: record.internal_location().to_string(), + name: sig.name(), + md5sum: sig.md5sum(), + minhash, + }) } } else { bail!( From 2453c9b4912bc50bdad3534700a2aee444512c7d Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 2 Feb 2024 09:57:02 -0800 Subject: [PATCH 38/40] rename fn back to load_sketches --- src/fastmultigather.rs | 7 +++---- src/manysearch.rs | 7 ++----- src/multisearch.rs | 8 +++----- src/pairwise.rs | 4 ++-- src/utils.rs | 2 +- 5 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 4f61e89c..91e57e23 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -12,8 +12,8 @@ use std::collections::BinaryHeap; use camino::Utf8Path as PathBuf; use crate::utils::{ - consume_query_by_gather, load_collection, load_mh_with_name_and_md5, write_prefetch, - PrefetchResult, ReportType, + consume_query_by_gather, load_collection, load_sketches, write_prefetch, PrefetchResult, + ReportType, }; pub fn fastmultigather( @@ -53,8 +53,7 @@ pub fn fastmultigather( allow_failed_sigpaths, )?; // load against sketches into memory, downsampling on the way - let against = - load_mh_with_name_and_md5(against_collection, selection, ReportType::Against).unwrap(); + let against = load_sketches(against_collection, selection, ReportType::Against).unwrap(); // Iterate over all queries => do prefetch and gather! 
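
The SmallSignature struct introduced in the previous patch replaces the (KmerMinHash, String, String) tuples, so call sites such as the against list here read against.minhash and against.name instead of destructuring by position. An illustrative consumer, with Vec<u64> standing in for the real KmerMinHash type:

    pub struct SmallSignature {
        pub location: String,
        pub name: String,
        pub md5sum: String,
        pub minhash: Vec<u64>, // stand-in for sourmash::sketch::minhash::KmerMinHash
    }

    fn names_above_threshold(sketches: &[SmallSignature], min_size: usize) -> Vec<&str> {
        sketches
            .iter()
            .filter(|s| s.minhash.len() >= min_size)
            .map(|s| s.name.as_str())
            .collect()
    }
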
let processed_queries = AtomicUsize::new(0); diff --git a/src/manysearch.rs b/src/manysearch.rs index b1546c05..d7ff7808 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -8,9 +8,7 @@ use rayon::prelude::*; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use crate::utils::{ - csvwriter_thread, load_collection, load_mh_with_name_and_md5, ReportType, SearchResult, -}; +use crate::utils::{csvwriter_thread, load_collection, load_sketches, ReportType, SearchResult}; use sourmash::selection::Selection; use sourmash::signature::SigsTrait; @@ -30,8 +28,7 @@ pub fn manysearch( allow_failed_sigpaths, )?; // load all query sketches into memory, downsampling on the way - let query_sketchlist = - load_mh_with_name_and_md5(query_collection, selection, ReportType::Query).unwrap(); + let query_sketchlist = load_sketches(query_collection, selection, ReportType::Query).unwrap(); // Against: Load all _paths_, not signatures, into memory. let against_collection = load_collection( diff --git a/src/multisearch.rs b/src/multisearch.rs index 55ccc54c..569d9f7d 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -7,7 +7,7 @@ use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{ - csvwriter_thread, load_collection, load_mh_with_name_and_md5, MultiSearchResult, ReportType, + csvwriter_thread, load_collection, load_sketches, MultiSearchResult, ReportType, }; /// Search many queries against a list of signatures. @@ -31,8 +31,7 @@ pub fn multisearch( ReportType::Query, allow_failed_sigpaths, )?; - let queries = - load_mh_with_name_and_md5(query_collection, selection, ReportType::Query).unwrap(); + let queries = load_sketches(query_collection, selection, ReportType::Query).unwrap(); // Load all against sketches into memory at once. let against_collection = load_collection( @@ -41,8 +40,7 @@ pub fn multisearch( ReportType::Against, allow_failed_sigpaths, )?; - let against = - load_mh_with_name_and_md5(against_collection, selection, ReportType::Against).unwrap(); + let against = load_sketches(against_collection, selection, ReportType::Against).unwrap(); // set up a multi-producer, single-consumer channel. let (send, recv) = diff --git a/src/pairwise.rs b/src/pairwise.rs index e206bf2b..aca9f797 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -5,7 +5,7 @@ use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{ - csvwriter_thread, load_collection, load_mh_with_name_and_md5, MultiSearchResult, ReportType, + csvwriter_thread, load_collection, load_sketches, MultiSearchResult, ReportType, }; use sourmash::selection::Selection; use sourmash::signature::SigsTrait; @@ -35,7 +35,7 @@ pub fn pairwise( &siglist ) } - let sketches = load_mh_with_name_and_md5(collection, selection, ReportType::General).unwrap(); + let sketches = load_sketches(collection, selection, ReportType::General).unwrap(); // set up a multi-producer, single-consumer channel. 
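
The pairwise and multisearch loops both derive their statistics from the downsampled sketch sizes and the shared-hash count. Written out once as a free function, purely for illustration (the crate computes these inline):

    fn similarity(overlap: f64, query_size: f64, target_size: f64) -> (f64, f64, f64, f64) {
        let containment_query_in_target = overlap / query_size;
        let containment_target_in_query = overlap / target_size;
        let max_containment = containment_query_in_target.max(containment_target_in_query);
        // Jaccard via inclusion-exclusion on the union size.
        let jaccard = overlap / (query_size + target_size - overlap);
        (
            containment_query_in_target,
            containment_target_in_query,
            max_containment,
            jaccard,
        )
    }
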
let (send, recv) = diff --git a/src/utils.rs b/src/utils.rs index 8e664412..ed4a2606 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -204,7 +204,7 @@ pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result Date: Fri, 2 Feb 2024 11:54:23 -0800 Subject: [PATCH 39/40] use serde serialize for writing instead of custom traits --- src/fastgather.rs | 7 +- src/fastmultigather.rs | 2 +- src/multisearch.rs | 2 +- src/utils.rs | 202 +++++++++-------------------------------- 4 files changed, 46 insertions(+), 167 deletions(-) diff --git a/src/fastgather.rs b/src/fastgather.rs index f70b11e3..349ed974 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -1,10 +1,7 @@ /// fastgather: Run gather with a query against a list of files. use anyhow::Result; - -use sourmash::selection::Selection; -// use camino; - use sourmash::prelude::Select; +use sourmash::selection::Selection; use crate::utils::{ consume_query_by_gather, load_collection, load_sketches_above_threshold, write_prefetch, @@ -43,7 +40,7 @@ pub fn fastgather( bail!("No query sketch matching selection parameters."); } }; - // build the list of paths to match against. + // load collection to match against. let against_collection = load_collection( &against_filepath, selection, diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 91e57e23..1ed14f10 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -71,7 +71,7 @@ pub fn fastmultigather( if let Some(query_mh) = query_sig.minhash() { let matchlist: BinaryHeap = against .iter() - .filter_map(|(against)| { + .filter_map(|against| { let mut mm: Option = None; if let Ok(overlap) = against.minhash.count_common(query_mh, false) { if overlap >= threshold_hashes { diff --git a/src/multisearch.rs b/src/multisearch.rs index 569d9f7d..c4f33843 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -59,7 +59,7 @@ pub fn multisearch( let send = against .par_iter() - .filter_map(|(against)| { + .filter_map(|against| { let mut results = vec![]; // search for matches & save containment. 
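
This patch replaces the hand-rolled ResultType trait with serde (see the src/utils.rs hunk below): each result struct derives Serialize, and csv::Writer produces the header row from the field names and handles quoting. A minimal sketch of that approach, assuming the csv and serde crates as declared elsewhere in this series:

    use serde::Serialize;

    #[derive(Serialize)]
    struct Row {
        query_name: String,
        containment: f64,
    }

    fn write_rows(rows: &[Row]) -> Result<(), Box<dyn std::error::Error>> {
        let mut wtr = csv::Writer::from_writer(std::io::stdout());
        for row in rows {
            // The header is emitted automatically before the first record.
            wtr.serialize(row)?;
        }
        wtr.flush()?;
        Ok(())
    }
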
for query in queries.iter() { diff --git a/src/utils.rs b/src/utils.rs index ed4a2606..4d1cc244 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -7,6 +7,9 @@ use sourmash::selection::Select; use anyhow::{anyhow, Result}; use camino::Utf8Path as Path; use camino::Utf8PathBuf as PathBuf; +use csv::Writer; +use serde::ser::Serializer; +use serde::Serialize; use std::cmp::{Ordering, PartialOrd}; use std::collections::BinaryHeap; use std::fs::{create_dir_all, File}; @@ -568,6 +571,7 @@ pub fn is_revindex_database(path: &camino::Utf8PathBuf) -> bool { } } +#[derive(Serialize)] pub struct SearchResult { pub query_name: String, pub query_md5: String, @@ -579,43 +583,7 @@ pub struct SearchResult { pub max_containment: Option, } -impl ResultType for SearchResult { - fn header_fields() -> Vec<&'static str> { - vec![ - "query_name", - "query_md5", - "match_name", - "containment", - "intersect_hashes", - "match_md5", - "jaccard", - "max_containment", - ] - } - - fn format_fields(&self) -> Vec { - vec![ - format!("\"{}\"", self.query_name), // Wrap query_name with quotes - self.query_md5.clone(), - format!("\"{}\"", self.match_name), // Wrap match_name with quotes - self.containment.to_string(), - self.intersect_hashes.to_string(), - match &self.match_md5 { - Some(md5) => md5.clone(), - None => "".to_string(), - }, - match &self.jaccard { - Some(jaccard) => jaccard.to_string(), - None => "".to_string(), - }, - match &self.max_containment { - Some(max_containment) => max_containment.to_string(), - None => "".to_string(), - }, - ] - } -} - +#[derive(Serialize)] pub struct BranchwaterGatherResult { pub query_name: String, pub query_md5: String, @@ -625,29 +593,7 @@ pub struct BranchwaterGatherResult { pub intersect_bp: usize, } -impl ResultType for BranchwaterGatherResult { - fn header_fields() -> Vec<&'static str> { - vec![ - "query_name", - "query_md5", - "match_name", - "match_md5", - "f_match_query", - "intersect_bp", - ] - } - fn format_fields(&self) -> Vec { - vec![ - format!("\"{}\"", self.query_name), // Wrap query_name with quotes - self.query_md5.clone(), - format!("\"{}\"", self.match_name), // Wrap match_name with quotes - self.match_md5.clone(), - self.f_match_query.to_string(), - self.intersect_bp.to_string(), - ] - } -} - +#[derive(Serialize)] pub struct MultiSearchResult { pub query_name: String, pub query_md5: String, @@ -659,33 +605,7 @@ pub struct MultiSearchResult { pub intersect_hashes: f64, } -impl ResultType for MultiSearchResult { - fn header_fields() -> Vec<&'static str> { - vec![ - "query_name", - "query_md5", - "match_name", - "match_md5", - "containment", - "max_containment", - "jaccard", - "intersect_hashes", - ] - } - - fn format_fields(&self) -> Vec { - vec![ - format!("\"{}\"", self.query_name), // Wrap query_name with quotes - self.query_md5.clone(), - format!("\"{}\"", self.match_name), // Wrap match_name with quotes - self.match_md5.clone(), - self.containment.to_string(), - self.max_containment.to_string(), - self.jaccard.to_string(), - self.intersect_hashes.to_string(), - ] - } -} +#[derive(Serialize)] pub struct ManifestRow { pub md5: String, pub md5short: String, @@ -694,50 +614,24 @@ pub struct ManifestRow { pub num: u32, pub scaled: u64, pub n_hashes: usize, - pub with_abundance: bool, + pub with_abundance: BoolPython, pub name: String, pub filename: String, pub internal_location: String, } -pub fn bool_to_python_string(b: bool) -> String { - match b { - true => "True".to_string(), - false => "False".to_string(), - } -} - -impl ResultType for ManifestRow { - fn 
header_fields() -> Vec<&'static str> { - vec![ - "internal_location", - "md5", - "md5short", - "ksize", - "moltype", - "num", - "scaled", - "n_hashes", - "with_abundance", - "name", - "filename", - ] - } +// A wrapper type for booleans to customize serialization +pub struct BoolPython(bool); - fn format_fields(&self) -> Vec { - vec![ - self.internal_location.clone(), - self.md5.clone(), - self.md5short.clone(), - self.ksize.to_string(), - self.moltype.clone(), - self.num.to_string(), - self.scaled.to_string(), - self.n_hashes.to_string(), - bool_to_python_string(self.with_abundance), - format!("\"{}\"", self.name), // Wrap name with quotes - self.filename.clone(), - ] +impl Serialize for BoolPython { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self.0 { + true => serializer.serialize_str("True"), + false => serializer.serialize_str("False"), + } } } @@ -771,7 +665,7 @@ pub fn make_manifest_row( num, scaled, n_hashes: sketch.size(), - with_abundance: abund, + with_abundance: BoolPython(abund), name: sig.name().to_string(), filename: filename.to_string(), } @@ -876,24 +770,27 @@ pub fn sigwriter( println!("Writing manifest"); // Start the CSV file inside the zip zip.start_file("SOURMASH-MANIFEST.csv", options).unwrap(); - // write manifest version line writeln!(&mut zip, "# SOURMASH-MANIFEST-VERSION: 1.0").unwrap(); - // Write the header - let header = ManifestRow::header_fields(); - if let Err(e) = writeln!(&mut zip, "{}", header.join(",")) { - eprintln!("Error writing header: {:?}", e); - } + // scoped block for csv writing + { + let mut csv_writer = Writer::from_writer(&mut zip); - // Write each manifest row - for row in &manifest_rows { - let formatted_fields = row.format_fields(); // Assuming you have a format_fields method on ManifestRow - if let Err(e) = writeln!(&mut zip, "{}", formatted_fields.join(",")) { - eprintln!("Error writing item: {:?}", e); + for row in &manifest_rows { + if let Err(e) = csv_writer.serialize(row) { + eprintln!("Error writing item: {:?}", e); + } + } + // CSV writer must be manually flushed to ensure all data is written + if let Err(e) = csv_writer.flush() { + eprintln!("Error flushing CSV writer: {:?}", e); } + } // drop csv writer here + + // Properly finish writing to the ZIP file + if let Err(e) = zip.finish() { + eprintln!("Error finalizing ZIP file: {:?}", e); } - // finalize the zip file writing. 
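
The BoolPython newtype above exists so the manifest CSV keeps sourmash's Python-style booleans ("True"/"False") rather than serde's default "true"/"false". A self-contained usage sketch; the demo struct and values are made up, and the newtype simply mirrors the one added to src/utils.rs:

    use serde::{Serialize, Serializer};

    pub struct BoolPython(bool);

    impl Serialize for BoolPython {
        fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
            serializer.serialize_str(if self.0 { "True" } else { "False" })
        }
    }

    #[derive(Serialize)]
    struct ManifestDemo {
        name: String,
        with_abundance: BoolPython,
    }

    fn demo() {
        let mut wtr = csv::Writer::from_writer(vec![]);
        wtr.serialize(ManifestDemo {
            name: "sig1".into(),
            with_abundance: BoolPython(true),
        })
        .unwrap();
        let csv_text = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
        assert_eq!(csv_text, "name,with_abundance\nsig1,True\n");
    }
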
- zip.finish().unwrap(); } } } @@ -901,37 +798,22 @@ pub fn sigwriter( }) } -pub trait ResultType { - fn header_fields() -> Vec<&'static str>; - fn format_fields(&self) -> Vec; -} - -pub fn csvwriter_thread( +pub fn csvwriter_thread( recv: std::sync::mpsc::Receiver, output: Option, -) -> std::thread::JoinHandle<()> -where - T: ResultType, -{ +) -> std::thread::JoinHandle<()> { // create output file let out = open_stdout_or_file(output); // spawn a thread that is dedicated to printing to a buffered output std::thread::spawn(move || { - let mut writer = out; - - let header = T::header_fields(); - if let Err(e) = writeln!(&mut writer, "{}", header.join(",")) { - eprintln!("Error writing header: {:?}", e); - } - writer.flush().unwrap(); + let mut writer = Writer::from_writer(out); - for item in recv.iter() { - let formatted_fields = item.format_fields(); - if let Err(e) = writeln!(&mut writer, "{}", formatted_fields.join(",")) { + for res in recv.iter() { + if let Err(e) = writer.serialize(res) { eprintln!("Error writing item: {:?}", e); } - writer.flush().unwrap(); } + writer.flush().expect("Failed to flush writer."); }) } From 22376199379fb14c21386d2ba9b4b46875c672bd Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 2 Feb 2024 13:25:01 -0800 Subject: [PATCH 40/40] narrow down multisketch issue --- src/python/tests/test_gather.py | 35 +++++++++++++++++++++++++++------ src/utils.rs | 10 +++++++++- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index d0376a02..2e975613 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_gather.py @@ -304,17 +304,40 @@ def test_against_multisigfile(runtmp, zip_against): g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') + runtmp.sourmash('scripts', 'fastgather', query, combined, + '-o', g_output, '--output-prefetch', p_output, + '-s', '100000') + df = pandas.read_csv(g_output) + assert len(df) == 3 + print(df) + + +def test_against_multisigfile_in_pathlist(runtmp): + # test against a sigfile that contains multiple sketches + query = get_test_data('SRR606249.sig.gz') + against_list = runtmp.output('against.txt') + + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + combined = runtmp.output('combined.sig.gz') + runtmp.sourmash('sig', 'cat', sig2, sig47, sig63, '-o', combined) + make_file_list(against_list, [combined]) + + g_output = runtmp.output('gather.csv') + p_output = runtmp.output('prefetch.csv') + runtmp.sourmash('scripts', 'fastgather', query, against_list, '-o', g_output, '--output-prefetch', p_output, '-s', '100000') df = pandas.read_csv(g_output) - if zip_against: - assert len(df) == 3 - print(df) - else: - print(df) - assert len(df) == 1 + print(df) + assert len(df) == 3 # @CTB this is a bug :(. It should load multiple sketches properly! + # @NTP: see pathlist loading in load_collection. 
When we build + # records from a signature, all records from the same signature + # are read in, but end up having the same name/md5sum @pytest.mark.parametrize('zip_against', [False, True]) diff --git a/src/utils.rs b/src/utils.rs index 4d1cc244..53bf39b2 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -252,7 +252,12 @@ pub fn load_sketches_above_threshold( // Load against into memory if let Ok(against_sig) = against_collection.sig_from_record(against_record) { if let Some(against_mh) = against_sig.minhash() { - // if let Some(against_mh) = against_sig.select(&selection).unwrap().minhash() { // downsample via select + eprintln!( + "against_sig info: name: {}, md5:{},", + against_sig.name(), + against_sig.md5sum() + ); + eprintln!("against_mh info: md5:{},", against_mh.md5sum()); // currently downsampling here to avoid changing md5sum if let Ok(overlap) = against_mh.count_common(query, true) { //downsample via count_common @@ -356,6 +361,8 @@ pub fn load_collection( let path = line.ok()?; match Signature::from_path(&path) { Ok(signatures) => { + // TODO: Handling for multisig files: Split into separate sigs so records are unique? + // Currently, we end up with a single record let recs: Vec = signatures .into_iter() .flat_map(|v| Record::from_sig(&v, &path)) @@ -374,6 +381,7 @@ pub fn load_collection( .collect(); let manifest: Manifest = records.into(); + eprintln!("len manifest: {}", manifest.len()); Collection::new( manifest, InnerStorage::new(