diff --git a/README.md b/README.md index cca82934..c5ccb5f6 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Most of the nucleotide based commands and options in USEARCH version 7 are suppo ## Getting Help -If you can't find an answer in the [VSEARCH documentation](https://github.com/torognes/vsearch/releases/download/v2.10.4/vsearch_manual.pdf), please visit the [VSEARCH Web Forum](https://groups.google.com/forum/#!forum/vsearch-forum) to post a question or start a discussion. +If you can't find an answer in the [VSEARCH documentation](https://github.com/torognes/vsearch/releases/download/v2.11.0/vsearch_manual.pdf), please visit the [VSEARCH Web Forum](https://groups.google.com/forum/#!forum/vsearch-forum) to post a question or start a discussion. ## Example @@ -37,9 +37,9 @@ In the example below, VSEARCH will identify sequences in the file database.fsa t **Source distribution** To download the source distribution from a [release](https://github.com/torognes/vsearch/releases) and build the executable and the documentation, use the following commands: ``` -wget https://github.com/torognes/vsearch/archive/v2.10.4.tar.gz -tar xzf v2.10.4.tar.gz -cd vsearch-2.10.4 +wget https://github.com/torognes/vsearch/archive/v2.11.0.tar.gz +tar xzf v2.11.0.tar.gz +cd vsearch-2.11.0 ./autogen.sh ./configure make @@ -68,43 +68,43 @@ Binary distributions are provided for x86-64 systems running GNU/Linux, macOS (v Download the appropriate executable for your system using the following commands if you are using a Linux x86_64 system: ```sh -wget https://github.com/torognes/vsearch/releases/download/v2.10.4/vsearch-2.10.4-linux-x86_64.tar.gz -tar xzf vsearch-2.10.4-linux-x86_64.tar.gz +wget https://github.com/torognes/vsearch/releases/download/v2.11.0/vsearch-2.11.0-linux-x86_64.tar.gz +tar xzf vsearch-2.11.0-linux-x86_64.tar.gz ``` Or these commands if you are using a Linux ppc64le system: ```sh -wget 
https://github.com/torognes/vsearch/releases/download/v2.10.4/vsearch-2.10.4-linux-ppc64le.tar.gz -tar xzf vsearch-2.10.4-linux-ppc64le.tar.gz +wget https://github.com/torognes/vsearch/releases/download/v2.11.0/vsearch-2.11.0-linux-ppc64le.tar.gz +tar xzf vsearch-2.11.0-linux-ppc64le.tar.gz ``` Or these commands if you are using a Linux aarch64 system: ```sh -wget https://github.com/torognes/vsearch/releases/download/v2.10.4/vsearch-2.10.4-linux-aarch64.tar.gz -tar xzf vsearch-2.10.4-linux-aarch64.tar.gz +wget https://github.com/torognes/vsearch/releases/download/v2.11.0/vsearch-2.11.0-linux-aarch64.tar.gz +tar xzf vsearch-2.11.0-linux-aarch64.tar.gz ``` Or these commands if you are using a Mac: ```sh -wget https://github.com/torognes/vsearch/releases/download/v2.10.4/vsearch-2.10.4-macos-x86_64.tar.gz -tar xzf vsearch-2.10.4-macos-x86_64.tar.gz +wget https://github.com/torognes/vsearch/releases/download/v2.11.0/vsearch-2.11.0-macos-x86_64.tar.gz +tar xzf vsearch-2.11.0-macos-x86_64.tar.gz ``` Or if you are using Windows, download and extract (unzip) the contents of this file: ``` -https://github.com/torognes/vsearch/releases/download/v2.10.4/vsearch-2.10.4-win-x86_64.zip +https://github.com/torognes/vsearch/releases/download/v2.11.0/vsearch-2.11.0-win-x86_64.zip ``` -Linux and Mac: You will now have the binary distribution in a folder called `vsearch-2.10.4-linux-x86_64` or `vsearch-2.10.4-macos-x86_64` in which you will find three subfolders `bin`, `man` and `doc`. We recommend making a copy or a symbolic link to the vsearch binary `bin/vsearch` in a folder included in your `$PATH`, and a copy or a symbolic link to the vsearch man page `man/vsearch.1` in a folder included in your `$MANPATH`. The PDF version of the manual is available in `doc/vsearch_manual.pdf`. +Linux and Mac: You will now have the binary distribution in a folder called `vsearch-2.11.0-linux-x86_64` or `vsearch-2.11.0-macos-x86_64` in which you will find three subfolders `bin`, `man` and `doc`. 
We recommend making a copy or a symbolic link to the vsearch binary `bin/vsearch` in a folder included in your `$PATH`, and a copy or a symbolic link to the vsearch man page `man/vsearch.1` in a folder included in your `$MANPATH`. The PDF version of the manual is available in `doc/vsearch_manual.pdf`. -Windows: You will now have the binary distribution in a folder called `vsearch-2.10.4-win-x86_64`. The vsearch executable is called `vsearch.exe`. The manual in PDF format is called `vsearch_manual.pdf`. +Windows: You will now have the binary distribution in a folder called `vsearch-2.11.0-win-x86_64`. The vsearch executable is called `vsearch.exe`. The manual in PDF format is called `vsearch_manual.pdf`. -**Documentation** The VSEARCH user's manual is available in the `man` folder in the form of a [man page](https://github.com/torognes/vsearch/blob/master/man/vsearch.1). A pdf version ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.10.4/vsearch_manual.pdf)) will be generated by `make`. To install the manpage manually, copy the `vsearch.1` file or a create a symbolic link to `vsearch.1` in a folder included in your `$MANPATH`. The manual in both formats is also available with the binary distribution. The manual in PDF form ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.10.4/vsearch_manual.pdf)) is also attached to the latest [release](https://github.com/torognes/vsearch/releases). +**Documentation** The VSEARCH user's manual is available in the `man` folder in the form of a [man page](https://github.com/torognes/vsearch/blob/master/man/vsearch.1). A pdf version ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.11.0/vsearch_manual.pdf)) will be generated by `make`. To install the manpage manually, copy the `vsearch.1` file or create a symbolic link to `vsearch.1` in a folder included in your `$MANPATH`.
The manual in both formats is also available with the binary distribution. The manual in PDF form ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.11.0/vsearch_manual.pdf)) is also attached to the latest [release](https://github.com/torognes/vsearch/releases). ## Plugins, packages, and wrappers @@ -176,11 +176,11 @@ The code is written in C++ but most of it is actually mostly C with some C++ syn File | Description ---|--- -**abundance.cc** | Code for extracting and printing abundance information from FASTA headers **align.cc** | New Needleman-Wunsch global alignment, serial. Only for testing. **align_simd.cc** | SIMD parallel global alignment of 1 query with 8 database sequences **allpairs.cc** | All-vs-all optimal global pairwise alignment (no heuristics) **arch.cc** | Architecture specific code (Mac/Linux) +**attributes.cc** | Extraction and printing of attributes in FASTA headers **bitmap.cc** | Implementation of bitmaps **chimera.cc** | Chimera detection **city.cc** | CityHash code diff --git a/configure.ac b/configure.ac index 6cad8077..9a3a1bb7 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. 
AC_PREREQ([2.63]) -AC_INIT([vsearch], [2.10.4], [torognes@ifi.uio.no]) +AC_INIT([vsearch], [2.11.0], [torognes@ifi.uio.no]) AC_CANONICAL_TARGET AM_INIT_AUTOMAKE([subdir-objects]) AC_LANG([C++]) diff --git a/man/vsearch.1 b/man/vsearch.1 index e348ac5e..3ae865dd 100644 --- a/man/vsearch.1 +++ b/man/vsearch.1 @@ -1,5 +1,5 @@ .\" ============================================================================ -.TH vsearch 1 "January 10, 2019" "version 2.10.4" "USER COMMANDS" +.TH vsearch 1 "February 13, 2019" "version 2.11.0" "USER COMMANDS" .\" ============================================================================ .SH NAME vsearch \(em chimera detection, clustering, dereplication and @@ -51,9 +51,9 @@ FASTA/FASTQ file processing: \fBvsearch\fR (\-\-fastq_eestats | \-\-fastq_eestats2) \fIfastqfile\fR \-\-output \fIoutputfile\fR [\fIoptions\fR] .PP -\fBvsearch\fR \-\-fastq_filter \fIfastqfile\fR (\-\-fastaout | -\-\-fastaout_discarded | \-\-fastqout | \-\-fastqout_discarded) -\fIoutputfile\fR [\fIoptions\fR] +\fBvsearch\fR \-\-fastq_filter \fIfastqfile\fR [\-\-reverse +\fIfastqfile\fR] (\-\-fastaout | \-\-fastaout_discarded | \-\-fastqout | +\-\-fastqout_discarded | \-\-fastaout_rev | \-\-fastaout_discarded_rev | \-\-fastqout_rev | \-\-fastqout_discarded_rev) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-fastq_join \fIfastqfile\fR \-\-reverse \fIfastqfile\fR (\-\-fastaout | \-\-fastqout) \fIoutputfile\fR @@ -68,7 +68,11 @@ FASTA/FASTQ file processing: \fBvsearch\fR \-\-fastq_stats \fIfastqfile\fR [\-\-log \fIlogfile\fR] [\fIoptions\fR] .PP -\fBvsearch\fR \-\-fastx_revcomp \fIfastxfile\fR (\-\-fastaout | +\fBvsearch\fR \-\-fastx_filter \fIinputfile\fR [\-\-reverse +\fIinputfile\fR] (\-\-fastaout | \-\-fastaout_discarded | \-\-fastqout | +\-\-fastqout_discarded | \-\-fastaout_rev | \-\-fastaout_discarded_rev | \-\-fastqout_rev | \-\-fastqout_discarded_rev) \fIoutputfile\fR [\fIoptions\fR] .PP +\fBvsearch\fR \-\-fastx_revcomp \fIinputfile\fR (\-\-fastaout |
\-\-fastqout) \fIoutputfile\fR [\fIoptions\fR] .PP \fBvsearch\fR \-\-sff_convert \fIsff-file\fR \-\-fastqout @@ -957,15 +961,15 @@ file. FASTA/FASTQ file processing options: .RS .PP -Analyse, shorten, filter, convert or merge sequences in FASTQ files, -or reverse complement sequences in FASTA or FASTQ files. The +Analyse, trim, filter, convert or merge sequences in FASTQ files, or +reverse complement sequences in FASTA or FASTQ files. The \-\-fastq_chars command can be used to analyse FASTQ files to identify the quality encoding and the range of quality score values used. To convert between different FASTQ file variants, use the \-\-fastq_convert command. Statistical analysis of the quality and length of the sequences in a FASTQ file may be performed with the \-\-fastq_stats, \-\-fastq_eestats, and \-\-fastq_eestats2 -commands. Sequences may be shortened, filtered and converted by the +commands. Sequences may be trimmed, filtered and converted by the \-\-fastq_filter or \-\-fastx_filter commands. Paired-end reads can be merged using the \-\-fastq_mergepairs command. The \-\-fastx_revcomp command reverse-complements sequences. Finally, the \-\-sff_convert @@ -975,7 +979,9 @@ command can be used to convert SFF files to FASTQ. .B \-\-eeout When using \-\-fastq_filter or \-\-fastq_mergepairs, include the number of expected errors (ee) in the sequence header of FASTQ and -FASTA files. This option is a synonym of the \-\-fastq_eeout option. +FASTA files. This option is a synonym of the \-\-fastq_eeout +option. Use the \-\-xee option to remove this information from +headers. .TP .BI \-\-eetabbedout \0filename When specified with the \-\-fastq_mergepairs command, write statistics @@ -992,6 +998,11 @@ When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter, write to the given FASTA-formatted file the sequences passing the filter, or the merged sequences. 
.TP +.BI \-\-fastaout_rev \0filename +When using \-\-fastq_filter or \-\-fastx_filter, +write to the given FASTA-formatted file the reverse reads passing the +filter. +.TP .BI \-\-fastaout_notmerged_fwd \0filename When using \-\-fastq_mergepairs, write forward reads not merged to the specified FASTA file. @@ -1004,6 +1015,11 @@ specified FASTA file. Write sequences that do not pass the filter of the \-\-fastq_filter or \-\-fastx_filter command to the given FASTA-formatted file. .TP +.BI \-\-fastaout_discarded_rev \0filename +Write reverse reads that do not pass the filter of the +\-\-fastq_filter or \-\-fastx_filter command to the given +FASTA-formatted file. +.TP .B \-\-fastq_allowmergestagger When using \-\-fastq_mergepairs, allow to merge staggered read pairs. Staggered pairs are pairs where the 3' end of the reverse read @@ -1051,9 +1067,11 @@ be limited using the \-\-fastq_qminout and \-\-fastq_qmaxout options. The output file is specified with the \-\-fastqout option. .TP .B \-\-fastq_eeout -When using \-\-fastq_filter or \-\-fastq_mergepairs, include the -number of expected errors (ee) in the sequence header of FASTQ and -FASTA files. This option is a synonym of the \-\-eeout option. +When using \-\-fastq_filter, \-\-fastx_filter or \-\-fastq_mergepairs, +include the number of expected errors (ee) in the sequence header of +FASTQ and FASTA files. This option is a synonym of the \-\-eeout +option. Use the \-\-xee option to remove this information from +headers. .TP .BI \-\-fastq_eestats \0filename Analyze a FASTQ file and report statistics on the distributions of @@ -1098,7 +1116,7 @@ as its argument. The default setting is "0.5,1.0,2.0" that indicates that expected error levels of 0.5, 1.0 and 2.0 should be used. .TP .BI \-\-fastq_filter \0filename -Shorten and/or filter sequences in the given FASTQ file. Similar to +Trim and/or filter sequences in the given FASTQ file. Similar to the \-\-fastx_filter command, but works only on FASTQ files.
See \-\-fastx_filter for details. .TP @@ -1341,10 +1359,19 @@ When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter, write to the given FASTQ-formatted file the sequences passing the filter, or the merged sequences. .TP +.BI \-\-fastqout_rev \0filename +When using \-\-fastq_filter or \-\-fastx_filter, +write to the given FASTQ-formatted file the reverse reads passing the +filter. +.TP .BI \-\-fastqout_discarded \0filename When using \-\-fastq_filter or \-\-fastx_filter, write sequences that do not pass the filter to the given FASTQ-formatted file. .TP +.BI \-\-fastqout_discarded_rev \0filename +When using \-\-fastq_filter or \-\-fastx_filter, write reverse reads that +do not pass the filter to the given FASTQ-formatted file. +.TP .BI \-\-fastqout_notmerged_fwd \0filename When using \-\-fastq_mergepairs, write forward reads not merged to the specified FASTQ file. @@ -1354,26 +1381,34 @@ When using \-\-fastq_mergepairs, write reverse reads not merged to the specified FASTQ file. .TP .BI \-\-fastx_filter \0filename -Shorten and/or filter the sequences in the given FASTA or FASTQ file -and output the remaining sequences to the FASTQ file specified with -the \-\-fastqout option and to the FASTA file specified with the -\-\-fastaout option. The discarded sequences are written to the files +Trim and/or filter the sequences in the given FASTA or FASTQ file and +output the remaining sequences to the FASTQ file specified with the +\-\-fastqout option and/or to the FASTA file specified with the +\-\-fastaout option. Discarded sequences are written to the files specified with the \-\-fastaout_discarded and \-\-fastqout_discarded options. The input format (FASTA or FASTQ) is automatically -detected. Output can not be written to FASTQ files if the input is in -FASTA format. Sequences may be shortened using the options -\-\-fastq_stripleft, \-\-fastq_stripright, \-\-fastq_truncee, -\-\-fastq_trunclen, \-\-fastq_trunclen_keep and -\-\-fastq_truncqual. 
The sequences may be filtered using the options +detected. If the input consists of paired sequences, an input file +with reverse reads may be specified with the \-\-reverse option, and +corresponding output will be written to the files specified with the +\-\-fastqout_rev, \-\-fastaout_rev, \-\-fastqout_discarded_rev, and +\-\-fastaout_discarded_rev options. Output cannot be written to FASTQ files +if the input is in FASTA format. The sequences are first trimmed and +then filtered based on the remaining bases. Sequences may be trimmed +using the options \-\-fastq_stripleft, \-\-fastq_stripright, +\-\-fastq_truncee, \-\-fastq_trunclen, \-\-fastq_trunclen_keep and +\-\-fastq_truncqual. The sequences may be filtered using the options \-\-fastq_maxee, \-\-fastq_maxee_rate, \-\-fastq_maxlen, -\-\-fastq_maxns, \-\-fastq_minlen, \-\-fastq_trunclen, \-\-maxsize, -and \-\-minsize. If shortening results in an empty sequence, it is -discarded. The sequences are first shortened and then filtered based -on the remaining bases. If no shortening or filtering options are -given, all sequences are written to the output files, possibly after -conversion from FASTQ to FASTA format. The \-\-relabel option may be -used to relabel the output sequences. The \-\-eeout may be used to -output the expected number of errors in each sequence. +\-\-fastq_maxns, \-\-fastq_minlen (default 1), \-\-fastq_trunclen, +\-\-maxsize, and \-\-minsize. Sequences not satisfying the +requirements are discarded. For pairs of sequences, both sequences in +a pair must satisfy the requirements, otherwise both are +discarded. If no trimming or filtering options are given, all +sequences are written to the output files, possibly after conversion +from FASTQ to FASTA format. The \-\-relabel option may be used to +relabel the output sequences. The \-\-eeout option may be used to output the +expected number of errors in each sequence.
After all sequences have +been processed, the number of kept and discarded sequences will be +shown, as well as how many of the kept sequences were trimmed. .TP .BI \-\-fastx_revcomp \0filename Reverse-complement the sequences in the given FASTA or FASTQ file to a @@ -1426,8 +1461,9 @@ Please see the description of the same option under Chimera detection for details. .TP .BI \-\-reverse \0filename -When using \-\-fastq_mergepairs or \-\-fastq_join, specify the FASTQ -file containing containing the reverse reads. +When using \-\-fastq_filter, \-\-fastx_filter, \-\-fastq_mergepairs or +\-\-fastq_join, specify the FASTQ file containing the +reverse reads. .TP .BI \-\-sff_convert \0filename Convert the given SFF file to FASTQ. The FASTQ output file is @@ -1447,6 +1483,11 @@ default no clipping is performed. .B \-\-xsize Strip abundance information from the headers when writing the output file. +.TP +.B \-\-xee +Strip information about expected errors (ee) from the output file +headers. This information is added by the \-\-fastq_eeout and +\-\-eeout options. .RE .PP .\" ---------------------------------------------------------------------------- @@ -3508,6 +3549,12 @@ Fixed serious bug in x86_64 SIMD alignment code introduced in version 2.10.3. Added link to BioConda in README. Fixed bug in fastq_stats with sequence length 1. Fixed use of equals symbol in UC files for identical sequences with cluster_fast. +.TP +.BR v2.11.0\~ "released February 13th, 2019" +Added ability to trim and filter paired-end reads using the reverse +option with the fastx_filter and fastq_filter commands. Added \-\-xee +option to remove ee attributes from FASTA headers. Minor invisible +improvement to the progress indicator.
.RE .LP .\" ============================================================================ diff --git a/src/Makefile.am b/src/Makefile.am index af876016..24b5d9a9 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -15,11 +15,11 @@ AM_CFLAGS=$(AM_CXXFLAGS) export MACOSX_DEPLOYMENT_TARGET VSEARCHHEADERS=\ -abundance.h \ align.h \ align_simd.h \ allpairs.h \ arch.h \ +attributes.h \ bitmap.h \ chimera.h \ city.h \ @@ -108,11 +108,11 @@ endif endif __top_builddir__bin_vsearch_SOURCES = $(VSEARCHHEADERS) \ -abundance.cc \ align.cc \ align_simd.cc \ allpairs.cc \ arch.cc \ +attributes.cc \ bitmap.cc \ chimera.cc \ cluster.cc \ diff --git a/src/align.cc b/src/align.cc index d86dd394..f2325883 100644 --- a/src/align.cc +++ b/src/align.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/align.h b/src/align.h index 56d7accc..d74de465 100644 --- a/src/align.h +++ b/src/align.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/align_simd.h b/src/align_simd.h index 3d98a92e..7a8a3dc1 100644 --- a/src/align_simd.h +++ b/src/align_simd.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/allpairs.cc b/src/allpairs.cc index 4a67f56b..54183afa 100644 --- a/src/allpairs.cc +++ b/src/allpairs.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , @@ -226,6 +226,7 @@ void allpairs_output_results(int hit_count, strlen(query_head), 0, count_matched, + -1.0, -1, -1, 0, 0.0); } else @@ -240,6 +241,7 @@ void allpairs_output_results(int hit_count, strlen(query_head), 0, count_notmatched, + -1.0, -1, -1, 0, 0.0); } } diff --git a/src/allpairs.h b/src/allpairs.h index 09b928b7..b8b57844 100644 --- a/src/allpairs.h +++ b/src/allpairs.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/arch.cc b/src/arch.cc index dca18443..783a5a76 100644 --- a/src/arch.cc +++ b/src/arch.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/arch.h b/src/arch.h index bef2792f..1e022b25 100644 --- a/src/arch.h +++ b/src/arch.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/abundance.cc b/src/attributes.cc similarity index 53% rename from src/abundance.cc rename to src/attributes.cc index 18f53b73..b22663a0 100644 --- a/src/abundance.cc +++ b/src/attributes.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , @@ -64,14 +64,17 @@ bool header_find_attribute(const char * header, int header_length, const char * attribute, int * start, - int * end) + int * end, + bool allow_decimal) { /* Identify the first occurence of the pattern (^|;)size=([0-9]+)(;|$) in the header string, where "size=" is the specified attribute. + If allow_decimal is true, a dot (.) is allowed within the digits. */ const char * digit_chars = "0123456789"; + const char * digit_chars_decimal = "0123456789."; if ((! header) || (! attribute)) return false; @@ -98,7 +101,9 @@ bool header_find_attribute(const char * header, continue; } - int digits = (int) strspn(header + i + alen, digit_chars); + int digits + = (int) strspn(header + i + alen, + (allow_decimal ? 
digit_chars_decimal : digit_chars)); /* check for at least one digit */ if (digits == 0) @@ -122,13 +127,18 @@ bool header_find_attribute(const char * header, return false; } -int64_t abundance_get(char * header, int header_length) +int64_t header_get_size(char * header, int header_length) { /* read size/abundance annotation */ int64_t abundance = 1; int start = 0; int end = 0; - if (header_find_attribute(header, header_length, "size=", & start, & end)) + if (header_find_attribute(header, + header_length, + "size=", + & start, + & end, + false)) { int64_t number = atol(header + start + 5); if (number > 0) @@ -139,29 +149,109 @@ int64_t abundance_get(char * header, int header_length) return abundance; } -void abundance_fprint_header_strip_size(FILE * fp, - char * header, - int header_length) +void header_fprint_strip_size_ee(FILE * fp, + char * header, + int header_length, + bool strip_size, + bool strip_ee) { - int start = 0; - int end = 0; - if (header_find_attribute(header, header_length, "size=", & start, & end)) + int attributes = 0; + int attribute_start[2]; + int attribute_end[2]; + + /* look for size attribute */ + + int size_start = 0; + int size_end = 0; + bool size_found = false; + if (strip_size) + size_found = header_find_attribute(header, + header_length, + "size=", + & size_start, + & size_end, + false); + if (size_found) + { + attribute_start[attributes] = size_start; + attribute_end[attributes] = size_end; + attributes++; + } + + /* look for ee attribute */ + + int ee_start = 0; + int ee_end = 0; + bool ee_found = false; + if (strip_ee) + ee_found = header_find_attribute(header, + header_length, + "ee=", + & ee_start, + & ee_end, + true); + if (ee_found) + { + attribute_start[attributes] = ee_start; + attribute_end[attributes] = ee_end; + attributes++; + } + + /* sort */ + + if (attributes > 1) { - if (start <= 1) + if (attribute_start[0] > attribute_start[1]) { - if (end < header_length - 1) - fprintf(fp, "%s", header + end + 1); + /* swap */ + + 
int s = attribute_start[0]; + int e = attribute_end[0]; + attribute_start[0] = attribute_start[1]; + attribute_end[0] = attribute_end[1]; + attribute_start[1] = s; + attribute_end[1] = e; } - else + } + + /* print */ + + if (attributes == 0) + { + fprintf(fp, "%.*s", header_length, header); + } + else + { + int prev_end = 0; + for (int i = 0; i < attributes; i++) + { + /* print part of header in front of this attribute */ + if (attribute_start[i] > prev_end + 1) + { + fprintf(fp, "%.*s", + attribute_start[i] - prev_end - 1, + header + prev_end); + } + prev_end = attribute_end[i]; + } + + /* print the rest, if any */ + if (header_length > prev_end + 1) { - if (end >= header_length - 1) - fprintf(fp, "%.*s", start - 1, header); - else - fprintf(fp, "%.*s;%.*s", - start - 1, header, - header_length - end - 1, header + end + 1); + fprintf(fp, "%.*s", + header_length - prev_end, + header + prev_end); } } - else - fprintf(fp, "%s", header); +} + +void header_fprint_strip_size(FILE * fp, + char * header, + int header_length) +{ + header_fprint_strip_size_ee(fp, + header, + header_length, + true, + false); } diff --git a/src/abundance.h b/src/attributes.h similarity index 80% rename from src/abundance.h rename to src/attributes.h index 56cd002c..6a01e651 100644 --- a/src/abundance.h +++ b/src/attributes.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , @@ -58,16 +58,14 @@ */ -int64_t abundance_get(char * header, int header_length); +int64_t header_get_size(char * header, int header_length); -void abundance_fprint_header_with_size(FILE * fp, - char * header, - int header_length, - uint64_t size); +void header_fprint_strip_size(FILE * fp, + char * header, + int header_length); -void abundance_fprint_header_strip_size(FILE * fp, - char * header, - int header_length); - -char * abundance_strip_size(char * header, - int header_length); +void header_fprint_strip_size_ee(FILE * fp, + char * header, + int header_length, + bool strip_size, + bool strip_ee); diff --git a/src/bitmap.cc b/src/bitmap.cc index 07a45d29..59ca0fcc 100644 --- a/src/bitmap.cc +++ b/src/bitmap.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/bitmap.h b/src/bitmap.h index c51627ba..96bf5807 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/chimera.cc b/src/chimera.cc index d8ae2c6d..946d3432 100644 --- a/src/chimera.cc +++ b/src/chimera.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , @@ -812,27 +812,27 @@ int eval_parents(struct chimera_info_s * ci) fprintf(fp_uchimealns, "Query (%5d nt) ", ci->query_len); if (opt_xsize) - abundance_fprint_header_strip_size(fp_uchimealns, - ci->query_head, - ci->query_head_len); + header_fprint_strip_size(fp_uchimealns, + ci->query_head, + ci->query_head_len); else fprintf(fp_uchimealns, "%s", ci->query_head); fprintf(fp_uchimealns, "\nParentA (%5" PRIu64 " nt) ", db_getsequencelen(seqno_a)); if (opt_xsize) - abundance_fprint_header_strip_size(fp_uchimealns, - db_getheader(seqno_a), - db_getheaderlen(seqno_a)); + header_fprint_strip_size(fp_uchimealns, + db_getheader(seqno_a), + db_getheaderlen(seqno_a)); else fprintf(fp_uchimealns, "%s", db_getheader(seqno_a)); fprintf(fp_uchimealns, "\nParentB (%5" PRIu64 " nt) ", db_getsequencelen(seqno_b)); if (opt_xsize) - abundance_fprint_header_strip_size(fp_uchimealns, - db_getheader(seqno_b), - db_getheaderlen(seqno_b)); + header_fprint_strip_size(fp_uchimealns, + db_getheader(seqno_b), + db_getheaderlen(seqno_b)); else fprintf(fp_uchimealns, "%s", db_getheader(seqno_b)); fprintf(fp_uchimealns, "\n\n"); @@ -911,17 +911,17 @@ int eval_parents(struct chimera_info_s * ci) if (opt_xsize) { - abundance_fprint_header_strip_size(fp_uchimeout, - ci->query_head, - ci->query_head_len); + header_fprint_strip_size(fp_uchimeout, + ci->query_head, + ci->query_head_len); fprintf(fp_uchimeout, "\t"); - abundance_fprint_header_strip_size(fp_uchimeout, - db_getheader(seqno_a), - db_getheaderlen(seqno_a)); + header_fprint_strip_size(fp_uchimeout, + db_getheader(seqno_a), + db_getheaderlen(seqno_a)); fprintf(fp_uchimeout, "\t"); - abundance_fprint_header_strip_size(fp_uchimeout, - db_getheader(seqno_b), - db_getheaderlen(seqno_b)); + header_fprint_strip_size(fp_uchimeout, + db_getheader(seqno_b), + db_getheaderlen(seqno_b)); fprintf(fp_uchimeout, "\t"); } else @@ -938,13 +938,13 @@ int eval_parents(struct chimera_info_s * ci) if (opt_xsize) { if (QA >= QB) - 
abundance_fprint_header_strip_size(fp_uchimeout, - db_getheader(seqno_a), - db_getheaderlen(seqno_a)); + header_fprint_strip_size(fp_uchimeout, + db_getheader(seqno_a), + db_getheaderlen(seqno_a)); else - abundance_fprint_header_strip_size(fp_uchimeout, - db_getheader(seqno_b), - db_getheaderlen(seqno_b)); + header_fprint_strip_size(fp_uchimeout, + db_getheader(seqno_b), + db_getheaderlen(seqno_b)); fprintf(fp_uchimeout, "\t"); } else @@ -1343,6 +1343,7 @@ uint64_t chimera_thread_core(struct chimera_info_s * ci) ci->query_head_len, ci->query_size, chimera_count, + -1.0, -1, -1, opt_fasta_score ? @@ -1365,6 +1366,7 @@ uint64_t chimera_thread_core(struct chimera_info_s * ci) ci->query_head_len, ci->query_size, borderline_count, + -1.0, -1, -1, opt_fasta_score ? @@ -1384,9 +1386,9 @@ uint64_t chimera_thread_core(struct chimera_info_s * ci) fprintf(fp_uchimeout, "0.0000\t"); if (opt_xsize) - abundance_fprint_header_strip_size(fp_uchimeout, - ci->query_head, - ci->query_head_len); + header_fprint_strip_size(fp_uchimeout, + ci->query_head, + ci->query_head_len); else fprintf(fp_uchimeout, "%s", ci->query_head); @@ -1411,6 +1413,7 @@ uint64_t chimera_thread_core(struct chimera_info_s * ci) ci->query_head_len, ci->query_size, nonchimera_count, + -1.0, -1, -1, opt_fasta_score ? diff --git a/src/chimera.h b/src/chimera.h index 468b46b6..267dc426 100644 --- a/src/chimera.h +++ b/src/chimera.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/cluster.cc b/src/cluster.cc index a0dd6bc0..416c8910 100644 --- a/src/cluster.cc +++ b/src/cluster.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , @@ -418,6 +418,7 @@ void cluster_core_results_hit(struct hit * best, strlen(query_head), qsize, count_matched, + -1.0, -1, -1, 0, 0.0); } @@ -468,6 +469,7 @@ void cluster_core_results_nohit(int clusterno, strlen(query_head), qsize, count_notmatched, + -1.0, -1, -1, 0, 0.0); } @@ -1231,6 +1233,7 @@ void cluster(char * dbname, db_getheaderlen(seqno), cluster_abundance[clusterno], clusterno+1, + -1.0, -1, -1, 0, 0.0); if (opt_uc) diff --git a/src/cluster.h b/src/cluster.h index 1602871d..3e7a9211 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/cpu.cc b/src/cpu.cc index e50acbc2..457263ce 100644 --- a/src/cpu.cc +++ b/src/cpu.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/cpu.h b/src/cpu.h index 5cfc9808..034271ad 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/db.cc b/src/db.cc index 4298cb0a..f7afb0da 100644 --- a/src/db.cc +++ b/src/db.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/db.h b/src/db.h index 14524534..bc5b00c4 100644 --- a/src/db.h +++ b/src/db.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/dbhash.cc b/src/dbhash.cc index f47a4af3..a8519843 100644 --- a/src/dbhash.cc +++ b/src/dbhash.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/dbhash.h b/src/dbhash.h index 21765208..2fa09364 100644 --- a/src/dbhash.h +++ b/src/dbhash.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/dbindex.cc b/src/dbindex.cc index 49e33b8f..dbfab895 100644 --- a/src/dbindex.cc +++ b/src/dbindex.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/dbindex.h b/src/dbindex.h index 09d17b55..39b628c1 100644 --- a/src/dbindex.h +++ b/src/dbindex.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/derep.cc b/src/derep.cc index bbc9cdf8..447eb25e 100644 --- a/src/derep.cc +++ b/src/derep.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , @@ -621,6 +621,7 @@ void derep_fulllength() strlen(bp->header), size, relabel_count, + -1.0, -1, -1, 0, 0.0); if (relabel_count == opt_topn) break; @@ -1013,6 +1014,7 @@ void derep_prefix() db_getheaderlen(bp->seqno_first), size, relabel_count, + -1.0, -1, -1, 0, 0.0); if (relabel_count == opt_topn) break; diff --git a/src/derep.h b/src/derep.h index 399087ac..8bbbf082 100644 --- a/src/derep.h +++ b/src/derep.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/dynlibs.cc b/src/dynlibs.cc index 3bbcaf9a..1ca0880c 100644 --- a/src/dynlibs.cc +++ b/src/dynlibs.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/dynlibs.h b/src/dynlibs.h index 6db1d626..1c5cdaab 100644 --- a/src/dynlibs.h +++ b/src/dynlibs.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/eestats.cc b/src/eestats.cc index 59015b77..bd3e43e9 100644 --- a/src/eestats.cc +++ b/src/eestats.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/eestats.h b/src/eestats.h index 53f410cb..b56e73d9 100644 --- a/src/eestats.h +++ b/src/eestats.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/fasta.cc b/src/fasta.cc index 2ee33228..6e7dd972 100644 --- a/src/fasta.cc +++ b/src/fasta.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , @@ -249,7 +249,7 @@ bool fasta_next(fastx_handle h, int64_t fasta_get_abundance(fastx_handle h) { - return abundance_get(h->header_buffer.data, h->header_buffer.length); + return header_get_size(h->header_buffer.data, h->header_buffer.length); } uint64_t fasta_get_position(fastx_handle h) @@ -332,6 +332,7 @@ void fasta_print_general(FILE * fp, int header_len, int abundance, int ordinal, + double ee, int clustersize, int clusterid, const char * score_name, @@ -348,10 +349,16 @@ void fasta_print_general(FILE * fp, fprint_seq_digest_md5(fp, seq, len); else if (opt_relabel && (ordinal > 0)) fprintf(fp, "%s%d", opt_relabel, ordinal); - else if (opt_xsize || (opt_sizeout && abundance > 0)) - abundance_fprint_header_strip_size(fp, header, header_len); else - fprintf(fp, "%s", header); + { + bool xsize = opt_xsize || (opt_sizeout && (abundance > 0)); + bool xee = opt_xee || ((opt_eeout || opt_fastq_eeout) && (ee >= 0.0)); + header_fprint_strip_size_ee(fp, + header, + header_len, + xsize, + xee); + } if (clustersize > 0) fprintf(fp, ";seqs=%d", clustersize); @@ -359,9 +366,12 @@ void fasta_print_general(FILE * fp, if (clusterid >= 0) fprintf(fp, ";clusterid=%d", clusterid); - if ((abundance > 0) && opt_sizeout) + if (opt_sizeout && (abundance > 0)) fprintf(fp, ";size=%u", abundance); + if ((opt_eeout || opt_fastq_eeout) && (ee >= 0.0)) + fprintf(fp, ";ee=%.4lf", ee); + if (score_name) fprintf(fp, ";%s=%.4lf", score_name, score); @@ -387,6 +397,7 @@ void fasta_print_db_relabel(FILE * fp, db_getheaderlen(seqno), db_getabundance(seqno), ordinal, + -1.0, -1, -1, 0, 0.0); } @@ -401,6 +412,7 @@ void fasta_print_db(FILE * fp, uint64_t seqno) db_getheaderlen(seqno), db_getabundance(seqno), 0, + -1.0, -1, -1, 0, 0.0); } diff --git a/src/fasta.h b/src/fasta.h index 87b9e66a..0a5ea54c 100644 --- a/src/fasta.h +++ b/src/fasta.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and 
Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , @@ -92,6 +92,7 @@ void fasta_print_general(FILE * fp, int header_len, int abundance, int ordinal, + double ee, int clustersize, int clusterid, const char * score_name, diff --git a/src/fastq.cc b/src/fastq.cc index 8e1bbb89..58f50d1c 100644 --- a/src/fastq.cc +++ b/src/fastq.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , @@ -431,29 +431,7 @@ char * fastq_get_sequence(fastx_handle h) int64_t fastq_get_abundance(fastx_handle h) { - return abundance_get(h->header_buffer.data, h->header_buffer.length); -} - -void fastq_print_header(FILE * fp, char * header) -{ - fprintf(fp, "@%s\n", header); -} - -void fastq_print_sequence(FILE * fp, char * sequence) -{ - fprintf(fp, "%s\n", sequence); -} - -void fastq_print_quality(FILE * fp, char * quality) -{ - fprintf(fp, "+\n%s\n", quality); -} - -void fastq_print(FILE * fp, char * header, char * sequence, char * quality) -{ - fastq_print_header(fp, header); - fastq_print_sequence(fp, sequence); - fastq_print_quality(fp, quality); + return header_get_size(h->header_buffer.data, h->header_buffer.length); } void fastq_print_general(FILE * fp, @@ -464,8 +442,7 @@ void fastq_print_general(FILE * fp, char * quality, int abundance, int ordinal, - const char * score_name, - double score) + double ee) { fprintf(fp, "@"); @@ -475,25 +452,33 @@ void fastq_print_general(FILE * fp, fprint_seq_digest_md5(fp, seq, len); else if (opt_relabel && (ordinal > 0)) fprintf(fp, "%s%d", opt_relabel, ordinal); - else if (opt_xsize || (opt_sizeout && (abundance > 0))) - abundance_fprint_header_strip_size(fp, - header, - header_len); else - fprintf(fp, "%s", header); + { + bool xsize = opt_xsize || 
(opt_sizeout && (abundance > 0)); + bool xee = opt_xee || ((opt_eeout || opt_fastq_eeout) && (ee >= 0.0)); + header_fprint_strip_size_ee(fp, + header, + header_len, + xsize, + xee); + } - if ((abundance > 0) && opt_sizeout) + if (opt_sizeout && (abundance > 0)) fprintf(fp, ";size=%u", abundance); - if (score_name) - fprintf(fp, ";%s=%.4lf", score_name, score); + if ((opt_eeout || opt_fastq_eeout) && (ee >= 0.0)) + fprintf(fp, ";ee=%.4lf", ee); if (opt_relabel_keep && - ((opt_relabel && (ordinal > 0))|| opt_relabel_sha1 || opt_relabel_md5)) - fprintf(fp, " %s", header); + ((opt_relabel && (ordinal > 0)) || opt_relabel_sha1 || opt_relabel_md5)) + fprintf(fp, " %.*s", header_len, header); - fprintf(fp, "\n"); + fprintf(fp, "\n%.*s\n+\n%.*s\n", len, seq, len, quality); +} - fastq_print_sequence(fp, seq); - fastq_print_quality(fp, quality); +void fastq_print(FILE * fp, char * header, char * sequence, char * quality) +{ + int slen = strlen(sequence); + int hlen = strlen(header); + fastq_print_general(fp, sequence, slen, header, hlen, quality, 0, 0, -1.0); } diff --git a/src/fastq.h b/src/fastq.h index fd73ebaa..da63adfe 100644 --- a/src/fastq.h +++ b/src/fastq.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , @@ -86,5 +86,4 @@ void fastq_print_general(FILE * fp, char * quality, int abundance, int ordinal, - const char * score_name, - double score); + double ee); diff --git a/src/fastqjoin.cc b/src/fastqjoin.cc index e96b3b8b..e4d47451 100644 --- a/src/fastqjoin.cc +++ b/src/fastqjoin.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , @@ -193,8 +193,7 @@ void fastq_join() qual, 0, total + 1, - 0, - 0); + -1.0); } if (opt_fastaout) @@ -207,6 +206,7 @@ void fastq_join() fastq_get_header_length(fastq_fwd), 0, total + 1, + -1.0, -1, -1, 0, diff --git a/src/fastqjoin.h b/src/fastqjoin.h index 0fc992e3..836b78f9 100644 --- a/src/fastqjoin.h +++ b/src/fastqjoin.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/fastqops.cc b/src/fastqops.cc index a69edfbb..3340d32d 100644 --- a/src/fastqops.cc +++ b/src/fastqops.cc @@ -747,6 +747,7 @@ void fastx_revcomp() hlen, 0, count, + -1.0, -1, -1, 0, 0.0); if (opt_fastqout) @@ -758,7 +759,7 @@ void fastx_revcomp() qual_buffer, 0, count, - 0, 0.0); + -1.0); progress_update(fastx_get_position(h)); } @@ -794,6 +795,7 @@ void fastq_convert() progress_init("Reading FASTQ file", filesize); + int j = 1; while(fastq_next(h, 0, chrmap_no_change)) { /* header */ @@ -844,8 +846,10 @@ void fastq_convert() } quality[length] = 0; - fastq_print(fp_fastqout, header, sequence, quality); + int hlen = fastq_get_header_length(h); + fastq_print_general(fp_fastqout, sequence, length, header, hlen, quality, 0, j, -1.0); + j++; progress_update(fastq_get_position(h)); } diff --git a/src/fastx.cc b/src/fastx.cc index 60a51857..73538842 100644 --- a/src/fastx.cc +++ b/src/fastx.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/fastx.h b/src/fastx.h index f90e8811..027607d8 100644 --- a/src/fastx.h +++ b/src/fastx.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/filter.cc b/src/filter.cc index c1daa1b4..a708693d 100644 --- a/src/filter.cc +++ b/src/filter.cc @@ -80,26 +80,156 @@ inline int fastq_get_qual(char q) return qual; } +struct analysis_res +{ + bool discarded; + bool truncated; + int start; + int length; + double ee; +}; + +struct analysis_res analyse(fastx_handle h) +{ + struct analysis_res res = { false, false, 0, 0, -1.0 }; + res.length = fastx_get_sequence_length(h); + int64_t old_length = res.length; + + /* strip left (5') end */ + if (opt_fastq_stripleft < res.length) + { + res.start += opt_fastq_stripleft; + res.length -= opt_fastq_stripleft; + } + else + { + res.start = res.length; + res.length = 0; + } + + /* strip right (3') end */ + if (opt_fastq_stripright < res.length) + res.length -= opt_fastq_stripright; + else + res.length = 0; + + /* truncate trailing (3') part */ + if (opt_fastq_trunclen >= 0) + if (res.length > opt_fastq_trunclen) + res.length = opt_fastq_trunclen; + + /* truncate trailing (3') part, but keep if short */ + if (opt_fastq_trunclen_keep >= 0) + if (res.length > opt_fastq_trunclen_keep) + res.length = opt_fastq_trunclen_keep; + + if (h->is_fastq) + { + /* truncate by quality and expected errors (ee) */ + res.ee = 0.0; + char * q = fastx_get_quality(h) + res.start; + for (int64_t i = 0; i < res.length; i++) + { + int qual = fastq_get_qual(q[i]); + double e = exp10(-0.1 * qual); + res.ee += e; + + if ((qual <= opt_fastq_truncqual) || + (res.ee > opt_fastq_truncee)) + { + res.ee -= e; + res.length = i; + break; + } + } + + /* filter by expected errors (ee) */ + if (res.ee > 
opt_fastq_maxee) + res.discarded = true; + if ((res.length > 0) && (res.ee / res.length > opt_fastq_maxee_rate)) + res.discarded = true; + } + + /* filter by length */ + if ((opt_fastq_trunclen >= 0) && (res.length < opt_fastq_trunclen)) + res.discarded = true; + if (res.length < opt_fastq_minlen) + res.discarded = true; + if (res.length > opt_fastq_maxlen) + res.discarded = true; + + /* filter by n's */ + int64_t ncount = 0; + char * p = fastx_get_sequence(h) + res.start; + for (int64_t i = 0; i < res.length; i++) + { + int pc = p[i]; + if ((pc == 'N') || (pc == 'n')) + ncount++; + } + if (ncount > opt_fastq_maxns) + res.discarded = true; + + /* filter by abundance */ + int64_t abundance = fastx_get_abundance(h); + if ((opt_minsize > 0) && (abundance < opt_minsize)) + res.discarded = true; + if ((opt_maxsize > 0) && (abundance > opt_maxsize)) + res.discarded = true; + + res.truncated = res.length < old_length; + + return res; +} + void filter(bool fastq_only, char * filename) { - fastx_handle h = fastx_open(filename); + if ((!opt_fastqout) && (!opt_fastaout) && + (!opt_fastqout_discarded) && (!opt_fastaout_discarded) && + (!opt_fastqout_rev) && (!opt_fastaout_rev) && + (!opt_fastqout_discarded_rev) && (!opt_fastaout_discarded_rev)) + fatal("No output files specified"); + + fastx_handle h1 = 0; + fastx_handle h2 = 0; - if (!h) + h1 = fastx_open(filename); + + if (!h1) fatal("Unrecognized file type (not proper FASTA or FASTQ format)"); - if (fastq_only && ! h->is_fastq) + if (fastq_only && ! h1->is_fastq) fatal("FASTA input files not allowed with fastq_filter, consider using fastx_filter command instead"); - if ((opt_fastqout || opt_fastqout_discarded) && ! h->is_fastq) - fatal("Cannot write FASTQ output with a FASTA input file, lacking quality scores"); + if ((opt_fastqout || opt_fastqout_discarded) && ! 
h1->is_fastq) + fatal("Cannot write FASTQ output with FASTA input file (no quality scores)"); + + uint64_t filesize = fastx_get_size(h1); + + if (opt_reverse) + { + h2 = fastx_open(opt_reverse); + + if (!h2) + fatal("Unrecognized file type (not proper FASTA or FASTQ format) for reverse reads"); - uint64_t filesize = fastx_get_size(h); + if (fastq_only && ! h2->is_fastq) + fatal("FASTA input files not allowed with fastq_filter, consider using fastx_filter command instead"); + + if ((opt_fastqout_rev || opt_fastqout_discarded_rev) && ! h2->is_fastq) + fatal("Cannot write FASTQ output with a FASTA input file, lacking quality scores"); + } FILE * fp_fastaout = 0; FILE * fp_fastqout = 0; FILE * fp_fastaout_discarded = 0; FILE * fp_fastqout_discarded = 0; + FILE * fp_fastaout_rev = 0; + FILE * fp_fastqout_rev = 0; + FILE * fp_fastaout_discarded_rev = 0; + FILE * fp_fastqout_discarded_rev = 0; + if (opt_fastaout) { fp_fastaout = fopen_output(opt_fastaout); @@ -128,12 +258,35 @@ void filter(bool fastq_only, char * filename) fatal("Unable to open FASTQ output file for writing"); } - uint64_t header_alloc = 0; - char * header = 0; - if (opt_relabel) + if (h2) { - header_alloc = strlen(opt_relabel) + 25; - header = (char*) xmalloc(header_alloc); + if (opt_fastaout_rev) + { + fp_fastaout_rev = fopen_output(opt_fastaout_rev); + if (!fp_fastaout_rev) + fatal("Unable to open FASTA output file for writing"); + } + + if (opt_fastqout_rev) + { + fp_fastqout_rev = fopen_output(opt_fastqout_rev); + if (!fp_fastqout_rev) + fatal("Unable to open FASTQ output file for writing"); + } + + if (opt_fastaout_discarded_rev) + { + fp_fastaout_discarded_rev = fopen_output(opt_fastaout_discarded_rev); + if (!fp_fastaout_discarded_rev) + fatal("Unable to open FASTA output file for writing"); + } + + if (opt_fastqout_discarded_rev) + { + fp_fastqout_discarded_rev = fopen_output(opt_fastqout_discarded_rev); + if (!fp_fastqout_discarded_rev) + fatal("Unable to open FASTQ output file for writing"); + 
} } progress_init("Reading input file", filesize); @@ -142,165 +295,152 @@ void filter(bool fastq_only, char * filename) int64_t discarded = 0; int64_t truncated = 0; - while(fastx_next(h, 0, chrmap_no_change)) + while(fastx_next(h1, 0, chrmap_no_change)) { - int64_t length = fastx_get_sequence_length(h); - char * d = fastx_get_header(h); - char * p = fastx_get_sequence(h); - char * q = fastx_get_quality(h); - int64_t abundance = fastx_get_abundance(h); - - /* strip initial part */ - if (opt_fastq_stripleft > 0) - { - if (opt_fastq_stripleft < length) - { - p += opt_fastq_stripleft; - q += opt_fastq_stripleft; - length -= opt_fastq_stripleft; - } - else - { - p += length; - q += length; - length = 0; - } - } + if (h2 && ! fastx_next(h2, 0, chrmap_no_change)) + fatal("More forward reads than reverse reads"); - /* strip right end */ - if (opt_fastq_stripright > 0) - { - if (opt_fastq_stripright < length) - length -= opt_fastq_stripright; - else - length = 0; - } + struct analysis_res res1 = { false, false, 0, 0, 0.0 } ; + struct analysis_res res2 = { false, false, 0, 0, -1.0 } ; + + res1 = analyse(h1); + if (h2) + res2 = analyse(h2); - /* truncate trailing part */ - if (opt_fastq_trunclen >= 0) + if (res1.discarded || res2.discarded) { - if (length >= opt_fastq_trunclen) - length = opt_fastq_trunclen; - else - length = 0; - } + /* discard the sequence(s) */ - /* truncate trailing part, but keep if short */ - if ((opt_fastq_trunclen_keep >= 0) && (length > opt_fastq_trunclen_keep)) - length = opt_fastq_trunclen_keep; + discarded++; - /* quality and ee truncation */ - double ee = 0.0; - if (h->is_fastq) - { - for (int64_t i = 0; i < length; i++) + if (opt_fastaout_discarded) + fasta_print_general(fp_fastaout_discarded, + 0, + fastx_get_sequence(h1) + res1.start, + res1.length, + fastx_get_header(h1), + fastx_get_header_length(h1), + fastx_get_abundance(h1), + discarded, + res1.ee, + -1, + -1, + 0, + 0.0); + + if (opt_fastqout_discarded) + 
fastq_print_general(fp_fastqout_discarded, + fastx_get_sequence(h1) + res1.start, + res1.length, + fastx_get_header(h1), + fastx_get_header_length(h1), + fastx_get_quality(h1) + res1.start, + fastx_get_abundance(h1), + discarded, + res1.ee); + + if (h2) { - int qual = fastq_get_qual(q[i]); - ee += exp10(- qual / 10.0); - - if ((qual <= opt_fastq_truncqual) || - (ee > opt_fastq_truncee)) - { - ee -= exp10(- qual / 10.0); - length = i; - break; - } + if (opt_fastaout_discarded_rev) + fasta_print_general(fp_fastaout_discarded_rev, + 0, + fastx_get_sequence(h2) + res2.start, + res2.length, + fastx_get_header(h2), + fastx_get_header_length(h2), + fastx_get_abundance(h2), + discarded, + res2.ee, + -1, + -1, + 0, + 0.0); + + if (opt_fastqout_discarded_rev) + fastq_print_general(fp_fastqout_discarded_rev, + fastx_get_sequence(h2) + res2.start, + res2.length, + fastx_get_header(h2), + fastx_get_header_length(h2), + fastx_get_quality(h2) + res2.start, + fastx_get_abundance(h2), + discarded, + res2.ee); } } - - /* count n's */ - int64_t ncount = 0; - for (int64_t i = 0; i < length; i++) - { - int pc = p[i]; - if ((pc == 'N') || (pc == 'n')) - ncount++; - } - - if ((length >= opt_fastq_minlen) && - (length <= opt_fastq_maxlen) && - ((opt_fastq_trunclen < 0) || (length >= opt_fastq_trunclen)) && - (ncount <= opt_fastq_maxns) && - (ee <= opt_fastq_maxee) && - ((length == 0) || (ee / length <= opt_fastq_maxee_rate)) && - ((opt_minsize == 0) || (abundance >= opt_minsize)) && - ((opt_maxsize == 0) || (abundance <= opt_maxsize))) + else { - /* keep the sequence */ + /* keep the sequence(s) */ kept++; - if ((uint64_t)(length) < fastx_get_sequence_length(h)) - { - truncated++; - p[length] = 0; - if (h->is_fastq) - q[length] = 0; - } + if (res1.truncated || res2.truncated) + truncated++; if (opt_fastaout) fasta_print_general(fp_fastaout, 0, - p, - length, - d, - fastx_get_header_length(h), - abundance, + fastx_get_sequence(h1) + res1.start, + res1.length, + fastx_get_header(h1), + 
fastx_get_header_length(h1), + fastx_get_abundance(h1), kept, + res1.ee, -1, -1, - (opt_eeout || opt_fastq_eeout) ? "ee" : 0, - ee); + 0, + 0.0); if (opt_fastqout) fastq_print_general(fp_fastqout, - p, - length, - d, - fastx_get_header_length(h), - q, - abundance, + fastx_get_sequence(h1) + res1.start, + res1.length, + fastx_get_header(h1), + fastx_get_header_length(h1), + fastx_get_quality(h1) + res1.start, + fastx_get_abundance(h1), kept, - (opt_eeout || opt_fastq_eeout) ? "ee" : 0, - ee); - } - else - { - /* discard the sequence */ - - discarded++; - - if (opt_fastaout_discarded) - fasta_print_general(fp_fastaout_discarded, - 0, - p, - length, - d, - fastx_get_header_length(h), - abundance, - discarded, - -1, - -1, - (opt_eeout || opt_fastq_eeout) ? "ee" : 0, - ee); + res1.ee); - if (opt_fastqout_discarded) - fastq_print_general(fp_fastqout_discarded, - p, - length, - d, - fastx_get_header_length(h), - q, - abundance, - discarded, - (opt_eeout || opt_fastq_eeout) ? "ee" : 0, - ee); + if (h2) + { + if (opt_fastaout_rev) + fasta_print_general(fp_fastaout_rev, + 0, + fastx_get_sequence(h1) + res2.start, + res2.length, + fastx_get_header(h2), + fastx_get_header_length(h2), + fastx_get_abundance(h2), + kept, + res2.ee, + -1, + -1, + 0, + 0.0); + + if (opt_fastqout_rev) + fastq_print_general(fp_fastqout_rev, + fastx_get_sequence(h2) + res2.start, + res2.length, + fastx_get_header(h2), + fastx_get_header_length(h2), + fastx_get_quality(h2) + res2.start, + fastx_get_abundance(h2), + kept, + res2.ee); + } } - progress_update(fastx_get_position(h)); + progress_update(fastx_get_position(h1)); } + progress_done(); + if (h2 && fastx_next(h2, 0, chrmap_no_change)) + fatal("More reverse reads than forward reads"); + if (! 
opt_quiet) fprintf(stderr, "%" PRId64 " sequences kept (of which %" PRId64 " truncated), %" PRId64 " sequences discarded.\n", @@ -315,8 +455,22 @@ void filter(bool fastq_only, char * filename) truncated, discarded); - if (header) - xfree(header); + if (h2) + { + if (opt_fastaout_rev) + fclose(fp_fastaout_rev); + + if (opt_fastqout_rev) + fclose(fp_fastqout_rev); + + if (opt_fastaout_discarded_rev) + fclose(fp_fastaout_discarded_rev); + + if (opt_fastqout_discarded_rev) + fclose(fp_fastqout_discarded_rev); + + fastx_close(h2); + } if (opt_fastaout) fclose(fp_fastaout); @@ -330,7 +484,7 @@ void filter(bool fastq_only, char * filename) if (opt_fastqout_discarded) fclose(fp_fastqout_discarded); - fastx_close(h); + fastx_close(h1); } void fastq_filter() diff --git a/src/kmerhash.cc b/src/kmerhash.cc index c32d121a..e418d997 100644 --- a/src/kmerhash.cc +++ b/src/kmerhash.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/kmerhash.h b/src/kmerhash.h index 40a931b7..2c1bf694 100644 --- a/src/kmerhash.h +++ b/src/kmerhash.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/linmemalign.cc b/src/linmemalign.cc index 49d3e2a5..7e359def 100644 --- a/src/linmemalign.cc +++ b/src/linmemalign.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/linmemalign.h b/src/linmemalign.h index 125e2d62..a8b36654 100644 --- a/src/linmemalign.h +++ b/src/linmemalign.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/maps.cc b/src/maps.cc index c7620c6b..f6cd28a4 100644 --- a/src/maps.cc +++ b/src/maps.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/maps.h b/src/maps.h index 50ba402a..55ba14ab 100644 --- a/src/maps.h +++ b/src/maps.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/mask.cc b/src/mask.cc index 9f8db82b..90ae662c 100644 --- a/src/mask.cc +++ b/src/mask.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , @@ -337,6 +337,7 @@ void fastx_mask() db_getheaderlen(i), db_getabundance(i), kept, + -1.0, -1, -1, 0, 0.0); if (opt_fastqout) @@ -348,7 +349,7 @@ void fastx_mask() db_getquality(i), db_getabundance(i), kept, - 0, 0.0); + -1.0); } progress_update(i); diff --git a/src/mask.h b/src/mask.h index b176ced5..162f4afa 100644 --- a/src/mask.h +++ b/src/mask.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/mergepairs.cc b/src/mergepairs.cc index 3cbc6795..4b3b7bd9 100644 --- a/src/mergepairs.cc +++ b/src/mergepairs.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , @@ -372,7 +372,6 @@ void keep(merge_data_t * ip) ip->merged_quality, 0, merged, - (opt_eeout || opt_fastq_eeout) ? "ee" : 0, ip->ee_merged); } @@ -386,10 +385,11 @@ void keep(merge_data_t * ip) strlen(ip->merged_header), 0, merged, + ip->ee_merged, -1, -1, - (opt_eeout || opt_fastq_eeout) ? 
"ee" : 0, - ip->ee_merged); + 0, + 0.0); } if (opt_eetabbedout) @@ -476,7 +476,7 @@ void discard(merge_data_t * ip) ip->fwd_quality, 0, notmerged, - 0, 0.0); + -1.0); if (opt_fastqout_notmerged_rev) fastq_print_general(fp_fastqout_notmerged_rev, @@ -487,7 +487,7 @@ void discard(merge_data_t * ip) ip->rev_quality, 0, notmerged, - 0, 0.0); + -1.0); if (opt_fastaout_notmerged_fwd) fasta_print_general(fp_fastaout_notmerged_fwd, @@ -498,6 +498,7 @@ void discard(merge_data_t * ip) strlen(ip->fwd_header), 0, notmerged, + -1.0, -1, -1, 0, 0.0); @@ -510,6 +511,7 @@ void discard(merge_data_t * ip) strlen(ip->rev_header), 0, notmerged, + -1.0, -1, -1, 0, 0.0); } diff --git a/src/mergepairs.h b/src/mergepairs.h index cc5917bb..72f96430 100644 --- a/src/mergepairs.h +++ b/src/mergepairs.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/minheap.cc b/src/minheap.cc index 34aee07b..5314c664 100644 --- a/src/minheap.cc +++ b/src/minheap.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/minheap.h b/src/minheap.h index e3fb2392..4d9d3479 100644 --- a/src/minheap.h +++ b/src/minheap.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/msa.cc b/src/msa.cc index 7a1ed4e0..fa24d7ee 100644 --- a/src/msa.cc +++ b/src/msa.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , @@ -261,7 +261,7 @@ void msa(FILE * fp_msaout, FILE * fp_consout, FILE * fp_profile, db_getheader(target_seqno), db_getheaderlen(target_seqno), db_getabundance(target_seqno), - 0, -1, -1, 0, 0.0); + 0, -1.0, -1, -1, 0, 0.0); } if (rc_buffer) @@ -334,6 +334,7 @@ void msa(FILE * fp_msaout, FILE * fp_consout, FILE * fp_profile, db_getheaderlen(centroid_seqno), totalabundance, cluster+1, + -1.0, target_count, opt_clusterout_id ? cluster : -1, 0, 0.0); @@ -349,6 +350,7 @@ void msa(FILE * fp_msaout, FILE * fp_consout, FILE * fp_profile, db_getheaderlen(centroid_seqno), totalabundance, cluster+1, + -1.0, target_count, opt_clusterout_id ? cluster : -1, 0, 0.0); diff --git a/src/msa.h b/src/msa.h index 9660cce4..a12f7aa1 100644 --- a/src/msa.h +++ b/src/msa.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/otutable.cc b/src/otutable.cc index aecf7e15..bf6f7d07 100644 --- a/src/otutable.cc +++ b/src/otutable.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/otutable.h b/src/otutable.h index 2e293945..ab97e017 100644 --- a/src/otutable.h +++ b/src/otutable.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/rerep.cc b/src/rerep.cc index d4d4f8dc..e8400b97 100644 --- a/src/rerep.cc +++ b/src/rerep.cc @@ -3,7 +3,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , @@ -98,6 +98,7 @@ void rereplicate() fasta_get_header_length(fh), 1, i, + -1.0, -1, -1, 0, 0.0); } diff --git a/src/rerep.h b/src/rerep.h index 960c6e8d..d554f268 100644 --- a/src/rerep.h +++ b/src/rerep.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/results.cc b/src/results.cc index 638ce94b..17b25fbb 100644 --- a/src/results.cc +++ b/src/results.cc @@ -83,6 +83,7 @@ void results_show_fastapairs_one(FILE * fp, strlen(query_head), 0, 0, + -1.0, -1, -1, 0, @@ -101,6 +102,7 @@ void results_show_fastapairs_one(FILE * fp, db_getheaderlen(hp->target), 0, 0, + -1.0, -1, -1, 0, diff --git a/src/results.h b/src/results.h index 5c533955..a744ca71 100644 --- a/src/results.h +++ b/src/results.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/search.cc b/src/search.cc index 42e23a9c..b3dd902f 100644 --- a/src/search.cc +++ b/src/search.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , @@ -218,6 +218,7 @@ void search_output_results(int hit_count, strlen(query_head), qsize, count_matched, + -1.0, -1, -1, 0, 0.0); } else @@ -232,6 +233,7 @@ void search_output_results(int hit_count, strlen(query_head), qsize, count_notmatched, + -1.0, -1, -1, 0, 0.0); } @@ -725,6 +727,7 @@ void usearch_global(char * cmdline, char * progheader) db_getheaderlen(i), dbmatched[i], count_dbmatched, + -1.0, -1, -1, 0, 0.0); } else @@ -739,6 +742,7 @@ void usearch_global(char * cmdline, char * progheader) db_getheaderlen(i), db_getabundance(i), count_dbnotmatched, + -1.0, -1, -1, 0, 0.0); } } diff --git a/src/search.h b/src/search.h index e04ad7b4..7aeb41ee 100644 --- a/src/search.h +++ b/src/search.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/searchcore.cc b/src/searchcore.cc index e926e7cd..af5b7f95 100644 --- a/src/searchcore.cc +++ b/src/searchcore.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/searchcore.h b/src/searchcore.h index 9b4930dd..893c2b65 100644 --- a/src/searchcore.h +++ b/src/searchcore.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/searchexact.cc b/src/searchexact.cc index 7db5b0ed..649c14aa 100644 --- a/src/searchexact.cc +++ b/src/searchexact.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , @@ -292,6 +292,7 @@ void search_exact_output_results(int hit_count, strlen(query_head), qsize, count_matched, + -1.0, -1, -1, 0, 0.0); } else @@ -306,6 +307,7 @@ void search_exact_output_results(int hit_count, strlen(query_head), qsize, count_notmatched, + -1.0, -1, -1, 0, 0.0); } @@ -754,6 +756,7 @@ void search_exact(char * cmdline, char * progheader) db_getheaderlen(i), dbmatched[i], count_dbmatched, + -1.0, -1, -1, 0, 0.0); } else @@ -768,6 +771,7 @@ void search_exact(char * cmdline, char * progheader) db_getheaderlen(i), 0, count_dbnotmatched, + -1.0, -1, -1, 0, 0.0); } } diff --git a/src/searchexact.h b/src/searchexact.h index 41f95e5c..9214fb0d 100644 --- a/src/searchexact.h +++ b/src/searchexact.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/sffconvert.cc b/src/sffconvert.cc index a2105094..4264996d 100644 --- a/src/sffconvert.cc +++ b/src/sffconvert.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , @@ -335,7 +335,7 @@ void sff_convert() read_name, strlen(read_name), qual + clip_start, - 0, 0, 0, 0); + 0, 0, -1.0); xfree(read_name); xfree(bases); diff --git a/src/sffconvert.h b/src/sffconvert.h index be6c1ced..155ef884 100644 --- a/src/sffconvert.h +++ b/src/sffconvert.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/showalign.cc b/src/showalign.cc index 214d896a..eeffd20a 100644 --- a/src/showalign.cc +++ b/src/showalign.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/showalign.h b/src/showalign.h index c293d896..71e2dc4c 100644 --- a/src/showalign.h +++ b/src/showalign.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/shuffle.cc b/src/shuffle.cc index 41559af3..b845356a 100644 --- a/src/shuffle.cc +++ b/src/shuffle.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/shuffle.h b/src/shuffle.h index 7b8aa240..4bdfec6a 100644 --- a/src/shuffle.h +++ b/src/shuffle.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/sintax.cc b/src/sintax.cc index 499a1907..ae6e5e44 100644 --- a/src/sintax.cc +++ b/src/sintax.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/sintax.h b/src/sintax.h index 0de16433..6c5db293 100644 --- a/src/sintax.h +++ b/src/sintax.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/sortbylength.cc b/src/sortbylength.cc index 86f9a245..bb7ae929 100644 --- a/src/sortbylength.cc +++ b/src/sortbylength.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/sortbylength.h b/src/sortbylength.h index f60e0d51..b0761976 100644 --- a/src/sortbylength.h +++ b/src/sortbylength.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/sortbysize.cc b/src/sortbysize.cc index 63a4b62d..e6739a5b 100644 --- a/src/sortbysize.cc +++ b/src/sortbysize.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/sortbysize.h b/src/sortbysize.h index b485bf77..daefae7e 100644 --- a/src/sortbysize.h +++ b/src/sortbysize.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/subsample.cc b/src/subsample.cc index 97debd48..0d173340 100644 --- a/src/subsample.cc +++ b/src/subsample.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , @@ -188,6 +188,7 @@ void subsample() db_getheaderlen(i), ab_sub, samples, + -1.0, -1, -1, 0, 0.0); if (opt_fastqout) @@ -199,7 +200,7 @@ void subsample() db_getquality(i), ab_sub, samples, - 0, 0.0); + -1.0); } if (ab_discarded > 0) @@ -215,6 +216,7 @@ void subsample() db_getheaderlen(i), ab_discarded, discarded, + -1.0, -1, -1, 0, 0.0); if (opt_fastqout_discarded) @@ -226,7 +228,7 @@ void subsample() db_getquality(i), ab_discarded, discarded, - 0, 0.0); + -1.0); } progress_update(i); } diff --git a/src/subsample.h b/src/subsample.h index 209f467a..6817cab4 100644 --- a/src/subsample.h +++ b/src/subsample.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/udb.cc b/src/udb.cc index 5ce1151b..5b9b7eaf 100644 --- a/src/udb.cc +++ b/src/udb.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , @@ -455,8 +455,8 @@ void udb_read(const char * filename, progress_init("Parsing abundances", seqcount); for(unsigned int i = 0; i < seqcount; i++) { - seqindex[i].size = abundance_get(datap + seqindex[i].header_p, - seqindex[i].headerlen); + seqindex[i].size = header_get_size(datap + seqindex[i].header_p, + seqindex[i].headerlen); progress_update(i+1); } progress_done(); diff --git a/src/udb.h b/src/udb.h index 03f2817b..ae63bb93 100644 --- a/src/udb.h +++ b/src/udb.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/unique.cc b/src/unique.cc index 10aa02b2..064f7465 100644 --- a/src/unique.cc +++ b/src/unique.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/unique.h b/src/unique.h index 170b81cc..7d5731c2 100644 --- a/src/unique.h +++ b/src/unique.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/userfields.cc b/src/userfields.cc index ab581199..67c986cd 100644 --- a/src/userfields.cc +++ b/src/userfields.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/userfields.h b/src/userfields.h index 823db52d..1ce7c690 100644 --- a/src/userfields.h +++ b/src/userfields.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , diff --git a/src/util.cc b/src/util.cc index e7483b41..6f95d17a 100644 --- a/src/util.cc +++ b/src/util.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , @@ -65,8 +65,7 @@ static const char * progress_prompt; static uint64_t progress_next; static uint64_t progress_size; -static uint64_t progress_chunk; -static const uint64_t progress_granularity = 200; +static uint64_t progress_pct; static bool progress_show; void progress_init(const char * prompt, uint64_t size) @@ -74,28 +73,32 @@ void progress_init(const char * prompt, uint64_t size) progress_show = isatty(fileno(stderr)) && (!opt_quiet) && (!opt_no_progress); progress_prompt = prompt; progress_size = size; - progress_chunk = size < progress_granularity ? - 1 : size / progress_granularity; - progress_next = 0; + progress_pct = 0; + progress_next = ((progress_pct + 1) * progress_size + 99) / 100; if (! opt_quiet) { fprintf(stderr, "%s", prompt); if (progress_show) - fprintf(stderr, " %.0f%%", 0.0); + fprintf(stderr, " %d%%", 0); } } void progress_update(uint64_t progress) { - if (progress_show && (progress >= progress_next)) + if ((progress >= progress_next) && progress_show) { if (progress_size > 0) - fprintf(stderr, " \r%s %.0f%%", progress_prompt, - 100.0 * progress / progress_size); + { + progress_pct = 100 * progress / progress_size; + fprintf(stderr, + " \r%s %" PRIu64 "%%", + progress_prompt, + progress_pct); + progress_next = ((progress_pct + 1) * progress_size + 99) / 100; + } else fprintf(stderr, " \r%s 0%%", progress_prompt); - progress_next = progress + progress_chunk; } } @@ -105,7 +108,7 @@ void progress_done() { if (progress_show) fprintf(stderr, " \r%s", progress_prompt); - fprintf(stderr, " %.0f%%\n", 100.0); + fprintf(stderr, " %d%%\n", 100); } } diff --git a/src/util.h b/src/util.h index ef77b8ea..6909e67f 100644 --- a/src/util.h +++ b/src/util.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. 
Contact: Torbjorn Rognes , diff --git a/src/vsearch.cc b/src/vsearch.cc index 18b17bf2..8da194c4 100644 --- a/src/vsearch.cc +++ b/src/vsearch.cc @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes , @@ -79,6 +79,7 @@ bool opt_relabel_sha1; bool opt_samheader; bool opt_sff_clip; bool opt_sizeorder; +bool opt_xee; bool opt_xsize; char * opt_allpairs_global; char * opt_alnout; @@ -101,8 +102,10 @@ char * opt_derep_prefix; char * opt_eetabbedout; char * opt_fastaout; char * opt_fastaout_discarded; +char * opt_fastaout_discarded_rev; char * opt_fastaout_notmerged_fwd; char * opt_fastaout_notmerged_rev; +char * opt_fastaout_rev; char * opt_fastapairs; char * opt_fastq_chars; char * opt_fastq_convert; @@ -114,8 +117,10 @@ char * opt_fastq_mergepairs; char * opt_fastq_stats; char * opt_fastqout; char * opt_fastqout_discarded; +char * opt_fastqout_discarded_rev; char * opt_fastqout_notmerged_fwd; char * opt_fastqout_notmerged_rev; +char * opt_fastqout_rev; char * opt_fastx_filter; char * opt_fastx_mask; char * opt_fastx_revcomp; @@ -668,6 +673,8 @@ void args_init(int argc, char **argv) opt_fasta_width = 80; opt_fastaout = 0; opt_fastaout_discarded = 0; + opt_fastaout_discarded_rev = 0; + opt_fastaout_rev = 0; opt_fastapairs = 0; opt_fastq_allowmergestagger = 0; opt_fastq_ascii = 33; @@ -707,6 +714,8 @@ void args_init(int argc, char **argv) opt_fastq_truncqual = LONG_MIN; opt_fastqout = 0; opt_fastqout_discarded = 0; + opt_fastqout_discarded_rev = 0; + opt_fastqout_rev = 0; opt_fastx_filter = 0; opt_fastx_mask = 0; opt_fastx_revcomp = 0; @@ -841,6 +850,7 @@ void args_init(int argc, char **argv) opt_wordlength = 8; opt_xn = 8.0; opt_xsize = 0; + opt_xee = 0; opterr = 1; @@ -1051,6 +1061,11 @@ void args_init(int argc, char **argv) {"join_padgapq", 
required_argument, 0, 0 }, {"sff_convert", required_argument, 0, 0 }, {"sff_clip", no_argument, 0, 0 }, + {"fastaout_rev", required_argument, 0, 0 }, + {"fastaout_discarded_rev",required_argument, 0, 0 }, + {"fastqout_rev", required_argument, 0, 0 }, + {"fastqout_discarded_rev",required_argument, 0, 0 }, + {"xee", no_argument, 0, 0 }, { 0, 0, 0, 0 } }; @@ -1917,6 +1932,26 @@ void args_init(int argc, char **argv) opt_sff_clip = 1; break; + case 204: + opt_fastaout_rev = optarg; + break; + + case 205: + opt_fastaout_discarded_rev = optarg; + break; + + case 206: + opt_fastqout_rev = optarg; + break; + + case 207: + opt_fastqout_discarded_rev = optarg; + break; + + case 208: + opt_xee = 1; + break; + default: fatal("Internal error in option parsing"); } @@ -2349,39 +2384,6 @@ void cmd_help() " --log FILENAME output file for fastq_stats statistics\n" " --output FILENAME output file for fastq_eestats(2) statistics\n" "\n" - "Filtering\n" - " --fastx_filter FILENAME filter and truncate sequences in FASTA/FASTQ file\n" - " --fastq_filter FILENAME filter and truncate sequences in FASTQ file\n" - " Parameters\n" - " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" - " --fastq_maxee REAL maximum expected error value for filter\n" - " --fastq_maxee_rate REAL maximum expected error rate for filter\n" - " --fastq_maxlen INT maximum length of sequence for filter\n" - " --fastq_maxns INT maximum number of N's for filter\n" - " --fastq_minlen INT minimum length of sequence for filter\n" - " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" - " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" - " --fastq_stripleft INT bases on the left to delete\n" - " --fastq_stripright INT bases on the right to delete\n" - " --fastq_truncee REAL maximum total expected error for truncation\n" - " --fastq_trunclen INT truncate reads to length INT (discard if shorter)\n" - " --fastq_trunclen_keep INT truncate reads to length INT (keep if 
shorter)\n" - " --fastq_truncqual INT minimum base quality value for truncation\n" - " --maxsize INT maximum abundance\n" - " --minsize INT minimum abundance\n" - " Output\n" - " --eeout include expected errors in output\n" - " --fastaout FILENAME FASTA output filename for passed sequences\n" - " --fastaout_discarded FNAME FASTA filename for discarded sequences\n" - " --fastqout FILENAME FASTQ output filename for passed sequences\n" - " --fastqout_discarded FNAME FASTQ filename for discarded sequences\n" - " --relabel STRING relabel filtered sequences with given prefix\n" - " --relabel_keep keep the old label after the new when relabelling\n" - " --relabel_md5 relabel filtered sequences with md5 digest\n" - " --relabel_sha1 relabel filtered sequences with sha1 digest\n" - " --sizeout include abundance information when relabelling\n" - " --xsize strip abundance information in output\n" - "\n" "Masking (new)\n" " --fastx_mask FILENAME mask sequences in the given FASTA or FASTQ file\n" " Parameters\n" @@ -2440,11 +2442,12 @@ void cmd_help() " --fastaout FILENAME FASTA output filename for merged sequences\n" " --fastaout_notmerged_fwd FN FASTA filename for non-merged forward sequences\n" " --fastaout_notmerged_rev FN FASTA filename for non-merged reverse sequences\n" - " --fastq_eeout include expected errors in FASTQ output\n" + " --fastq_eeout include expected errors (ee) in FASTQ output\n" " --fastqout FILENAME FASTQ output filename for merged sequences\n" " --fastqout_notmerged_fwd FN FASTQ filename for non-merged forward sequences\n" " --fastqout_notmerged_rev FN FASTQ filename for non-merged reverse sequences\n" " --label_suffix suffix to append to label of merged sequences\n" + " --xee remove expected errors (ee) info from output\n" "\n" "Pairwise alignment\n" " --allpairs_global FILENAME perform global alignment of all sequence pairs\n" @@ -2582,6 +2585,45 @@ void cmd_help() " Output\n" " --tabbedout FILENAME write results to given tab-delimited file\n" "\n" + 
"Trimming and filtering\n" + " --fastx_filter FILENAME trim and filter sequences in FASTA/FASTQ file\n" + " --fastq_filter FILENAME trim and filter sequences in FASTQ file\n" + " --reverse FILENAME FASTQ file with other end of paired-end reads\n" + " Parameters\n" + " --fastq_ascii INT FASTQ input quality score ASCII base char (33)\n" + " --fastq_maxee REAL discard if expected error value is higher\n" + " --fastq_maxee_rate REAL discard if expected error rate is higher\n" + " --fastq_maxlen INT discard if length of sequence is longer\n" + " --fastq_maxns INT discard if number of N's is higher\n" + " --fastq_minlen INT discard if length of sequence is shorter\n" + " --fastq_qmax INT maximum base quality value for FASTQ input (41)\n" + " --fastq_qmin INT minimum base quality value for FASTQ input (0)\n" + " --fastq_stripleft INT delete given number of bases from the 5' end\n" + " --fastq_stripright INT delete given number of bases from the 3' end\n" + " --fastq_truncee REAL truncate to given maximum expected error\n" + " --fastq_trunclen INT truncate to given length (discard if shorter)\n" + " --fastq_trunclen_keep INT truncate to given length (keep if shorter)\n" + " --fastq_truncqual INT truncate to given minimum base quality\n" + " --maxsize INT discard if abundance of sequence is above\n" + " --minsize INT discard if abundance of sequence is below\n" + " Output\n" + " --eeout include expected errors in output\n" + " --fastaout FN FASTA filename for passed sequences\n" + " --fastaout_discarded FN FASTA filename for discarded sequences\n" + " --fastaout_discarded_rev FN FASTA filename for discarded reverse sequences\n" + " --fastaout_rev FN FASTA filename for passed reverse sequences\n" + " --fastqout FN FASTQ filename for passed sequences\n" + " --fastqout_discarded FN FASTQ filename for discarded sequences\n" + " --fastqout_discarded_rev FN FASTQ filename for discarded reverse sequences\n" + " --fastqout_rev FN FASTQ filename for passed reverse sequences\n" + " 
--relabel STRING relabel filtered sequences with given prefix\n" + " --relabel_keep keep the old label after the new when relabelling\n" + " --relabel_md5 relabel filtered sequences with md5 digest\n" + " --relabel_sha1 relabel filtered sequences with sha1 digest\n" + " --sizeout include abundance information when relabelling\n" + " --xee remove expected errors (ee) info from output\n" + " --xsize strip abundance information in output\n" + "\n" "UDB files\n" " --makeudb_usearch FILENAME make UDB file from given FASTA file\n" " --udb2fasta FILENAME output FASTA file from given UDB file\n" @@ -2875,22 +2917,6 @@ void cmd_uchime() chimera(); } -void cmd_fastq_filter() -{ - if ((!opt_fastqout) && (!opt_fastaout) && - (!opt_fastqout_discarded) && (!opt_fastaout_discarded)) - fatal("No output files specified"); - fastq_filter(); -} - -void cmd_fastx_filter() -{ - if ((!opt_fastqout) && (!opt_fastaout) && - (!opt_fastqout_discarded) && (!opt_fastaout_discarded)) - fatal("No output files specified"); - fastx_filter(); -} - void cmd_fastq_mergepairs() { if (!opt_reverse) @@ -3006,9 +3032,9 @@ int main(int argc, char** argv) else if (opt_fastq_stats) fastq_stats(); else if (opt_fastq_filter) - cmd_fastq_filter(); + fastq_filter(); else if (opt_fastx_filter) - cmd_fastx_filter(); + fastx_filter(); else if (opt_fastx_revcomp) cmd_fastx_revcomp(); else if (opt_search_exact) diff --git a/src/vsearch.h b/src/vsearch.h index 6df4e550..da76b280 100644 --- a/src/vsearch.h +++ b/src/vsearch.h @@ -189,7 +189,7 @@ #include "xstring.h" #include "align_simd.h" #include "maps.h" -#include "abundance.h" +#include "attributes.h" #include "db.h" #include "align.h" #include "unique.h" @@ -249,6 +249,7 @@ extern bool opt_relabel_sha1; extern bool opt_samheader; extern bool opt_sff_clip; extern bool opt_sizeorder; +extern bool opt_xee; extern bool opt_xsize; extern char * opt_allpairs_global; extern char * opt_alnout; @@ -271,8 +272,10 @@ extern char * opt_derep_prefix; extern char * 
opt_eetabbedout; extern char * opt_fastaout; extern char * opt_fastaout_discarded; +extern char * opt_fastaout_discarded_rev; extern char * opt_fastaout_notmerged_fwd; extern char * opt_fastaout_notmerged_rev; +extern char * opt_fastaout_rev; extern char * opt_fastapairs; extern char * opt_fastq_chars; extern char * opt_fastq_convert; @@ -284,6 +287,8 @@ extern char * opt_fastq_mergepairs; extern char * opt_fastq_stats; extern char * opt_fastqout; extern char * opt_fastqout_discarded; +extern char * opt_fastqout_discarded_rev; +extern char * opt_fastqout_rev; extern char * opt_fastqout_notmerged_fwd; extern char * opt_fastqout_notmerged_rev; extern char * opt_fastx_filter; diff --git a/src/xstring.h b/src/xstring.h index fdb8ec81..d14f4ab3 100644 --- a/src/xstring.h +++ b/src/xstring.h @@ -2,7 +2,7 @@ VSEARCH: a versatile open source tool for metagenomics - Copyright (C) 2014-2018, Torbjorn Rognes, Frederic Mahe and Tomas Flouri + Copyright (C) 2014-2019, Torbjorn Rognes, Frederic Mahe and Tomas Flouri All rights reserved. Contact: Torbjorn Rognes ,