Skip to content

Commit

Permalink
Merge pull request #1409 from nextstrain/feat/reduce-minimizer-threshold
Browse files Browse the repository at this point in the history
  • Loading branch information
ivan-aksamentov authored Feb 13, 2024
2 parents 84af471 + eae52b4 commit 1690c28
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 13 deletions.
10 changes: 8 additions & 2 deletions docs/user/nextclade-cli/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -275,10 +275,16 @@ For short help type: `nextclade -h`, for extended help type: `nextclade --help`.
* `-r`, `--output-results-tsv <OUTPUT_RESULTS_TSV>` — Path to output results TSV file
* `--min-score <MIN_SCORE>` — Minimum value of the score being considered for a detection

Default value: `0.3`
Default value: `0.1`
* `--min-hits <MIN_HITS>` — Minimum number of the index hits required for a detection

Default value: `10`
Default value: `5`
* `--max-score-gap <MAX_SCORE_GAP>` — Maximum score difference between two adjacent dataset matches, after which the less fitting datasets are not considered

Default value: `0.2`
* `--all-matches` — Whether to consider all datasets

Default value: `false`
* `-j`, `--jobs <JOBS>` — Number of processing jobs. If not specified, all available CPU threads will be used
* `--server <SERVER>` — Use custom dataset server
* `-x`, `--proxy <PROXY>` — Pass all traffic over proxy server. HTTP, HTTPS, and SOCKS5 proxies are supported
Expand Down
24 changes: 16 additions & 8 deletions packages/nextclade-cli/src/cli/nextclade_seq_sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use nextclade::io::fasta::{FastaReader, FastaRecord, FastaWriter};
use nextclade::io::fs::path_to_string;
use nextclade::make_error;
use nextclade::sort::minimizer_index::{MinimizerIndexJson, MINIMIZER_INDEX_ALGO_VERSION};
use nextclade::sort::minimizer_search::{run_minimizer_search, MinimizerSearchRecord};
use nextclade::sort::minimizer_search::{run_minimizer_search, MinimizerSearchDatasetResult, MinimizerSearchRecord};
use nextclade::utils::option::{OptionMapMutFallible, OptionMapRefFallible};
use nextclade::utils::string::truncate;
use ordered_float::OrderedFloat;
Expand Down Expand Up @@ -138,6 +138,7 @@ pub fn run(args: &NextcladeSortArgs, minimizer_index: &MinimizerIndexJson, verbo
#[derive(Clone, Default, Debug, Serialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
struct SeqSortCsvEntry<'a> {
index: usize,
seq_name: &'a str,
dataset: Option<&'a str>,
score: Option<f64>,
Expand All @@ -153,6 +154,7 @@ fn writer_thread(
output_dir,
output_path,
output_results_tsv,
search_params,
..
} = args;

Expand All @@ -171,13 +173,20 @@ fn writer_thread(
output_results_tsv.map_ref_fallible(|output_results_tsv| CsvStructFileWriter::new(output_results_tsv, b'\t'))?;

for record in result_receiver {
stats.print_seq(&record);
let datasets = &{
if search_params.all_matches {
record.result.datasets
} else {
record.result.datasets.into_iter().take(1).collect_vec()
}
};

let datasets = &record.result.datasets;
stats.print_seq(datasets, &record.fasta_record.seq_name);

if datasets.is_empty() {
results_csv.map_mut_fallible(|results_csv| {
results_csv.write(&SeqSortCsvEntry {
index: record.fasta_record.index,
seq_name: &record.fasta_record.seq_name,
dataset: None,
score: None,
Expand All @@ -189,6 +198,7 @@ fn writer_thread(
for dataset in datasets {
results_csv.map_mut_fallible(|results_csv| {
results_csv.write(&SeqSortCsvEntry {
index: record.fasta_record.index,
seq_name: &record.fasta_record.seq_name,
dataset: Some(&dataset.name),
score: Some(dataset.score),
Expand Down Expand Up @@ -258,19 +268,17 @@ impl StatsPrinter {
}
}

pub fn print_seq(&mut self, record: &MinimizerSearchRecord) {
pub fn print_seq(&mut self, datasets: &[MinimizerSearchDatasetResult], seq_name: &str) {
if !self.enabled {
return;
}

let datasets = record
.result
.datasets
let datasets = datasets
.iter()
.sorted_by_key(|dataset| -OrderedFloat(dataset.score))
.collect_vec();

print!("{:<40}", truncate(&record.fasta_record.seq_name, 40));
print!("{:<40}", truncate(seq_name, 40));

if datasets.is_empty() {
println!(" │ {:40} │ {:>10.3} │ {:>10} │", "undetected".red(), "", "");
Expand Down
17 changes: 16 additions & 1 deletion packages/nextclade/src/sort/minimizer_search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ pub fn run_minimizer_search(
let max_score = scores.iter().copied().fold(0.0, f64::max);
let total_hits: u64 = hit_counts.iter().sum();

let datasets = izip!(&index.references, hit_counts, scores)
let mut datasets = izip!(&index.references, hit_counts, scores)
.filter_map(|(ref_info, n_hits, score)| {
(n_hits >= search_params.min_hits && score >= search_params.min_score).then_some(MinimizerSearchDatasetResult {
name: ref_info.name.clone(),
Expand All @@ -74,6 +74,21 @@ pub fn run_minimizer_search(
.sorted_by_key(|result| -OrderedFloat(result.score))
.collect_vec();

// if there is more than one dataset, check whether there is a gap > 0.2 in their scores
// if so, only keep those with scores above the gap
if datasets.len() > 1 {
let mut chop: usize = 0;
for i in 1..datasets.len() {
if datasets[i - 1].score > datasets[i].score + search_params.max_score_gap {
chop = i;
break;
}
}
if chop > 0 {
datasets.truncate(chop);
}
}

Ok(MinimizerSearchResult {
total_hits,
max_score,
Expand Down
25 changes: 23 additions & 2 deletions packages/nextclade/src/sort/params.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,35 @@ pub struct NextcladeSeqSortParams {
#[clap(long)]
#[clap(default_value_t = NextcladeSeqSortParams::default().min_hits)]
pub min_hits: u64,

/// Maximum score difference between two adjacent dataset matches, after which the less fitting datasets
/// are not considered.
///
/// This argument will truncate the list of datasets considered for a detection, such that if there is a large enough
/// difference in score ("gap") in the list, all datasets that are worse than the dataset before the gap are removed
/// from consideration. This allows, in situation when there's 2 or more groups of similar datasets, to filter-out
/// the groups that are worse than the best group.
#[clap(long)]
#[clap(default_value_t = NextcladeSeqSortParams::default().max_score_gap)]
pub max_score_gap: f64,

/// Whether to consider all datasets
///
/// By default, only the top matching dataset is considered. When this flag is provided,
/// all datasets reaching the matching criteria are considered.
#[clap(long)]
#[clap(default_value_t = NextcladeSeqSortParams::default().all_matches)]
pub all_matches: bool,
}

#[allow(clippy::derivable_impls)]
impl Default for NextcladeSeqSortParams {
fn default() -> Self {
Self {
min_score: 0.3,
min_hits: 10,
min_score: 0.1,
min_hits: 5,
all_matches: false,
max_score_gap: 0.2,
}
}
}

0 comments on commit 1690c28

Please sign in to comment.