Skip to content

Commit

Permalink
Add fastq_minqual and fastq_truncee_rate options
Browse files Browse the repository at this point in the history
  • Loading branch information
torognes committed Feb 27, 2025
1 parent 78013bf commit d580a2c
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 40 deletions.
44 changes: 30 additions & 14 deletions man/vsearch.1
Original file line number Diff line number Diff line change
Expand Up @@ -1886,6 +1886,12 @@ merged sequence. The default is 1.
.BI \-\-fastq_minovlen\~ "positive integer"
When using \-\-fastq_mergepairs, specify the minimum overlap between
the merged reads. The default is 10. Must be at least 5.
.TAG fastq_minqual
.TP
.BI \-\-fastq_minqual\~ "positive integer"
When using \-\-fastq_filter or \-\-fastx_filter, discard reads having
any base with a quality score below the given value. The default is 0,
which discards none.
.TAG fastq_nostagger
.TP
.B \-\-fastq_nostagger
Expand Down Expand Up @@ -2030,6 +2036,15 @@ default, \fIk\fR = 4.
When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences so
that their total expected error is not higher than the specified
value.
.TAG fastq_truncee_rate
.TP
.BI \-\-fastq_truncee_rate\~ real
When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences so
that their average expected error per base is not higher than the
specified value. The sequence is truncated at the first position
where this limit would be exceeded. The average expected error per base is calculated as the
total expected number of errors divided by the length of the sequence
after truncation.
.TAG fastq_trunclen
.TP
.BI \-\-fastq_trunclen\~ "positive integer"
Expand Down Expand Up @@ -2095,26 +2110,27 @@ corresponding output will be written to the files specified with the
files if the input is in FASTA format. The sequences are first trimmed
and then filtered based on the remaining bases. Sequences may be
trimmed using the options \-\-fastq_stripleft, \-\-fastq_stripright,
\-\-fastq_truncee, \-\-fastq_trunclen, \-\-fastq_trunclen_keep and
\-\-fastq_truncqual. The sequences may be filtered using the options
\-\-fastq_maxee, \-\-fastq_maxee_rate, \-\-fastq_maxlen,
\-\-fastq_maxns, \-\-fastq_minlen (default 1), \-\-fastq_trunclen,
\-\-maxsize, and \-\-minsize. Sequences not satisfying the
requirements are discarded. For pairs of sequences, both sequences in
a pair must satisfy the requirements, otherwise both are discarded. If
no shortening or filtering options are given, all sequences are
written to the output files, possibly after conversion from FASTQ to
FASTA format. The \-\-relabel option may be used to relabel the output
\-\-fastq_truncee, \-\-fastq_truncee_rate, \-\-fastq_trunclen,
\-\-fastq_trunclen_keep and \-\-fastq_truncqual. The sequences may be
filtered using the options \-\-fastq_maxee, \-\-fastq_maxee_rate,
\-\-fastq_maxlen, \-\-fastq_maxns, \-\-fastq_minlen (default 1),
\-\-fastq_minqual, \-\-fastq_trunclen, \-\-maxsize, and
\-\-minsize. Sequences not satisfying the requirements are
discarded. For pairs of sequences, both sequences in a pair must
satisfy the requirements, otherwise both are discarded. If no
shortening or filtering options are given, all sequences are written
to the output files, possibly after conversion from FASTQ to FASTA
format. The \-\-relabel option may be used to relabel the output
sequences. The \-\-eeout option may be used to output the expected
number of errors in each sequence. After all sequences have been
processed, the number of kept and discarded sequences will be shown,
as well as how many of the kept sequences were trimmed. When the input
is in FASTA format, the following options are not accepted because
quality scores are not available: \-\-eeout, \-\-fastq_ascii,
\-\-fastq_eeout, \-\-fastq_maxee, \-\-fastq_maxee_rate, \-\-fastq_out,
\-\-fastq_qmax, \-\-fastq_qmin, \-\-fastq_truncee,
\-\-fastq_truncqual, \-\-fastqout_discarded,
\-\-fastqout_discarded_rev, \-\-fastqout_rev.
\-\-fastq_eeout, \-\-fastq_maxee, \-\-fastq_maxee_rate,
\-\-fastq_minqual, \-\-fastq_out, \-\-fastq_qmax, \-\-fastq_qmin,
\-\-fastq_truncee, \-\-fastq_truncee_rate, \-\-fastq_truncqual,
\-\-fastqout_discarded, \-\-fastqout_discarded_rev, \-\-fastqout_rev.
.TAG fastx_revcomp
.TP
.BI \-\-fastx_revcomp \0filename
Expand Down
36 changes: 10 additions & 26 deletions src/filter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -183,12 +183,18 @@ auto analyse(fastx_handle h) -> struct analysis_res
res.ee += e;

if ((qual <= opt_fastq_truncqual) ||
(res.ee > opt_fastq_truncee))
(res.ee > opt_fastq_truncee) ||
(res.ee > opt_fastq_truncee_rate * (i + 1)))
{
res.ee -= e;
res.length = i;
break;
}

if (qual < opt_fastq_minqual)
{
res.discarded = true;
}
}

/* filter by expected errors (ee) */
Expand Down Expand Up @@ -287,12 +293,14 @@ auto filter(bool fastq_only, char * filename) -> void
(opt_fastq_qmax < 41) ||
(opt_fastq_qmin > 0) ||
(opt_fastq_truncee < dbl_max) ||
(opt_fastq_truncee_rate < dbl_max) ||
(opt_fastq_truncqual < long_min) ||
(opt_fastq_minqual > 0) ||
opt_fastqout_discarded ||
opt_fastqout_discarded_rev ||
opt_fastqout_rev)
{
fatal("The following options are not accepted with the fastx_filter command when the input is a FASTA file, because quality scores are not available: eeout, fastq_ascii, fastq_eeout, fastq_maxee, fastq_maxee_rate, fastq_out, fastq_qmax, fastq_qmin, fastq_truncee, fastq_truncqual, fastqout_discarded, fastqout_discarded_rev, fastqout_rev");
fatal("The following options are not accepted with the fastx_filter command when the input is a FASTA file, because quality scores are not available: eeout, fastq_ascii, fastq_eeout, fastq_maxee, fastq_maxee_rate, fastq_minqual, fastq_out, fastq_qmax, fastq_qmin, fastq_truncee, fastq_truncee_rate, fastq_truncqual, fastqout_discarded, fastqout_discarded_rev, fastqout_rev");
}
}

Expand All @@ -311,30 +319,6 @@ auto filter(bool fastq_only, char * filename) -> void
{
fatal("The forward and reverse input sequence must in the same format, either FASTA or FASTQ");
}

if (! (h2->is_fastq || h2->is_empty))
{
if (fastq_only)
{
fatal("FASTA input files not allowed with fastq_filter, consider using fastx_filter command instead");
}
else if (opt_eeout ||
(opt_fastq_ascii != 33) ||
opt_fastq_eeout ||
(opt_fastq_maxee < dbl_max) ||
(opt_fastq_maxee_rate < dbl_max) ||
opt_fastqout ||
(opt_fastq_qmax < 41) ||
(opt_fastq_qmin > 0) ||
(opt_fastq_truncee < dbl_max) ||
(opt_fastq_truncqual < long_min) ||
opt_fastqout_discarded ||
opt_fastqout_discarded_rev ||
opt_fastqout_rev)
{
fatal("The following options are not accepted with the fastx_filter command when the input is a FASTA file, because quality scores are not available: eeout, fastq_ascii, fastq_eeout, fastq_maxee, fastq_maxee_rate, fastq_out, fastq_qmax, fastq_qmin, fastq_truncee, fastq_truncqual, fastqout_discarded, fastqout_discarded_rev, fastqout_rev");
}
}
}

FILE * fp_fastaout = nullptr;
Expand Down
22 changes: 22 additions & 0 deletions src/vsearch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ double opt_fastq_maxdiffpct;
double opt_fastq_maxee;
double opt_fastq_maxee_rate;
double opt_fastq_truncee;
double opt_fastq_truncee_rate;
double opt_id;
double opt_lca_cutoff;
double opt_max_unmasked_pct;
Expand Down Expand Up @@ -284,6 +285,7 @@ int64_t opt_fastq_maxns;
int64_t opt_fastq_minlen;
int64_t opt_fastq_minmergelen;
int64_t opt_fastq_minovlen;
int64_t opt_fastq_minqual;
int64_t opt_fastq_qmax;
int64_t opt_fastq_qmaxout;
int64_t opt_fastq_qmin;
Expand Down Expand Up @@ -839,6 +841,7 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
opt_fastq_minlen = 1;
opt_fastq_minmergelen = 0;
opt_fastq_minovlen = 10;
opt_fastq_minqual = 0;
opt_fastq_nostagger = true;
opt_fastq_qmax = 41;
opt_fastq_qmaxout = 41;
Expand All @@ -848,6 +851,7 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
opt_fastq_stripleft = 0;
opt_fastq_stripright = 0;
opt_fastq_truncee = dbl_max;
opt_fastq_truncee_rate = dbl_max;
opt_fastq_trunclen = -1;
opt_fastq_trunclen_keep = -1;
opt_fastq_truncqual = long_min;
Expand Down Expand Up @@ -1082,6 +1086,7 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
option_fastq_minlen,
option_fastq_minmergelen,
option_fastq_minovlen,
option_fastq_minqual,
option_fastq_nostagger,
option_fastq_qmax,
option_fastq_qmaxout,
Expand All @@ -1093,6 +1098,7 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
option_fastq_stripright,
option_fastq_tail,
option_fastq_truncee,
option_fastq_truncee_rate,
option_fastq_trunclen,
option_fastq_trunclen_keep,
option_fastq_truncqual,
Expand Down Expand Up @@ -1330,6 +1336,7 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
{"fastq_minlen", required_argument, nullptr, 0 },
{"fastq_minmergelen", required_argument, nullptr, 0 },
{"fastq_minovlen", required_argument, nullptr, 0 },
{"fastq_minqual", required_argument, nullptr, 0 },
{"fastq_nostagger", no_argument, nullptr, 0 },
{"fastq_qmax", required_argument, nullptr, 0 },
{"fastq_qmaxout", required_argument, nullptr, 0 },
Expand All @@ -1341,6 +1348,7 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
{"fastq_stripright", required_argument, nullptr, 0 },
{"fastq_tail", required_argument, nullptr, 0 },
{"fastq_truncee", required_argument, nullptr, 0 },
{"fastq_truncee_rate", required_argument, nullptr, 0 },
{"fastq_trunclen", required_argument, nullptr, 0 },
{"fastq_trunclen_keep", required_argument, nullptr, 0 },
{"fastq_truncqual", required_argument, nullptr, 0 },
Expand Down Expand Up @@ -2603,6 +2611,14 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
opt_n_mismatch = true;
break;

case option_fastq_minqual:
opt_fastq_minqual = args_getlong(optarg);
break;

case option_fastq_truncee_rate:
opt_fastq_truncee_rate = args_getdouble(optarg);
break;

default:
fatal("Internal error in option parsing");
}
Expand Down Expand Up @@ -3469,11 +3485,13 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
option_fastq_maxlen,
option_fastq_maxns,
option_fastq_minlen,
option_fastq_minqual,
option_fastq_qmax,
option_fastq_qmin,
option_fastq_stripleft,
option_fastq_stripright,
option_fastq_truncee,
option_fastq_truncee_rate,
option_fastq_trunclen,
option_fastq_trunclen_keep,
option_fastq_truncqual,
Expand Down Expand Up @@ -3611,11 +3629,13 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
option_fastq_maxlen,
option_fastq_maxns,
option_fastq_minlen,
option_fastq_minqual,
option_fastq_qmax,
option_fastq_qmin,
option_fastq_stripleft,
option_fastq_stripright,
option_fastq_truncee,
option_fastq_truncee_rate,
option_fastq_trunclen,
option_fastq_trunclen_keep,
option_fastq_truncqual,
Expand Down Expand Up @@ -5498,11 +5518,13 @@ auto cmd_help(struct Parameters const & parameters) -> void {
" --fastq_maxlen INT discard if length of sequence is longer\n"
" --fastq_maxns INT discard if number of N's is higher\n"
" --fastq_minlen INT discard if length of sequence is shorter\n"
" --fastq_minqual INT discard if any base quality value lower (0)\n"
" --fastq_qmax INT maximum base quality value for FASTQ input (41)\n"
" --fastq_qmin INT minimum base quality value for FASTQ input (0)\n"
" --fastq_stripleft INT delete given number of bases from the 5' end\n"
" --fastq_stripright INT delete given number of bases from the 3' end\n"
" --fastq_truncee REAL truncate to given maximum expected error\n"
" --fastq_truncee_rate REAL truncate to given maximum expected error rate\n"
" --fastq_trunclen INT truncate to given length (discard if shorter)\n"
" --fastq_trunclen_keep INT truncate to given length (keep if shorter)\n"
" --fastq_truncqual INT truncate to given minimum base quality\n"
Expand Down
2 changes: 2 additions & 0 deletions src/vsearch.h
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,7 @@ extern double opt_fastq_maxdiffpct;
extern double opt_fastq_maxee;
extern double opt_fastq_maxee_rate;
extern double opt_fastq_truncee;
extern double opt_fastq_truncee_rate;
extern double opt_id;
extern double opt_lca_cutoff;
extern double opt_max_unmasked_pct;
Expand Down Expand Up @@ -404,6 +405,7 @@ extern int64_t opt_fastq_maxns;
extern int64_t opt_fastq_minlen;
extern int64_t opt_fastq_minmergelen;
extern int64_t opt_fastq_minovlen;
extern int64_t opt_fastq_minqual;
extern int64_t opt_fastq_qmax;
extern int64_t opt_fastq_qmaxout;
extern int64_t opt_fastq_qmin;
Expand Down

0 comments on commit d580a2c

Please sign in to comment.