diff --git a/README.md b/README.md index 431e1967..2a3020b3 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Most of the nucleotide based commands and options in USEARCH version 7 are suppo ## Getting Help -If you can't find an answer in the [VSEARCH documentation](https://github.com/torognes/vsearch/releases/download/v2.10.0/vsearch_manual.pdf), please visit the [VSEARCH Web Forum](https://groups.google.com/forum/#!forum/vsearch-forum) to post a question or start a discussion. +If you can't find an answer in the [VSEARCH documentation](https://github.com/torognes/vsearch/releases/download/v2.10.1/vsearch_manual.pdf), please visit the [VSEARCH Web Forum](https://groups.google.com/forum/#!forum/vsearch-forum) to post a question or start a discussion. ## Example @@ -37,9 +37,9 @@ In the example below, VSEARCH will identify sequences in the file database.fsa t **Source distribution** To download the source distribution from a [release](https://github.com/torognes/vsearch/releases) and build the executable and the documentation, use the following commands: ``` -wget https://github.com/torognes/vsearch/archive/v2.10.0.tar.gz -tar xzf v2.10.0.tar.gz -cd vsearch-2.10.0 +wget https://github.com/torognes/vsearch/archive/v2.10.1.tar.gz +tar xzf v2.10.1.tar.gz +cd vsearch-2.10.1 ./autogen.sh ./configure make @@ -70,36 +70,36 @@ Binary distributions are provided for x86-64 systems running GNU/Linux, macOS (v Download the appropriate executable for your system using the following commands if you are using a Linux x86_64 system: ```sh -wget https://github.com/torognes/vsearch/releases/download/v2.10.0/vsearch-2.10.0-linux-x86_64.tar.gz -tar xzf vsearch-2.10.0-linux-x86_64.tar.gz +wget https://github.com/torognes/vsearch/releases/download/v2.10.1/vsearch-2.10.1-linux-x86_64.tar.gz +tar xzf vsearch-2.10.1-linux-x86_64.tar.gz ``` Or these commands if you are using a Linux ppc64le system: ```sh -wget https://github.com/torognes/vsearch/releases/download/v2.10.0/vsearch-2.10.0-linux-ppc64le.tar.gz -tar xzf vsearch-2.10.0-linux-ppc64le.tar.gz +wget https://github.com/torognes/vsearch/releases/download/v2.10.1/vsearch-2.10.1-linux-ppc64le.tar.gz +tar xzf vsearch-2.10.1-linux-ppc64le.tar.gz ``` Or these commands if you are using a Mac: ```sh -wget https://github.com/torognes/vsearch/releases/download/v2.10.0/vsearch-2.10.0-macos-x86_64.tar.gz -tar xzf vsearch-2.10.0-macos-x86_64.tar.gz +wget https://github.com/torognes/vsearch/releases/download/v2.10.1/vsearch-2.10.1-macos-x86_64.tar.gz +tar xzf vsearch-2.10.1-macos-x86_64.tar.gz ``` Or if you are using Windows, download and extract (unzip) the contents of this file: ``` -https://github.com/torognes/vsearch/releases/download/v2.10.0/vsearch-2.10.0-win-x86_64.zip +https://github.com/torognes/vsearch/releases/download/v2.10.1/vsearch-2.10.1-win-x86_64.zip ``` -Linux and Mac: You will now have the binary distribution in a folder called `vsearch-2.10.0-linux-x86_64` or `vsearch-2.10.0-macos-x86_64` in which you will find three subfolders `bin`, `man` and `doc`. We recommend making a copy or a symbolic link to the vsearch binary `bin/vsearch` in a folder included in your `$PATH`, and a copy or a symbolic link to the vsearch man page `man/vsearch.1` in a folder included in your `$MANPATH`. The PDF version of the manual is available in `doc/vsearch_manual.pdf`. +Linux and Mac: You will now have the binary distribution in a folder called `vsearch-2.10.1-linux-x86_64` or `vsearch-2.10.1-macos-x86_64` in which you will find three subfolders `bin`, `man` and `doc`. We recommend making a copy or a symbolic link to the vsearch binary `bin/vsearch` in a folder included in your `$PATH`, and a copy or a symbolic link to the vsearch man page `man/vsearch.1` in a folder included in your `$MANPATH`. The PDF version of the manual is available in `doc/vsearch_manual.pdf`. -Windows: You will now have the binary distribution in a folder called `vsearch-2.10.0-win-x86_64`. The vsearch executable is called `vsearch.exe`. The manual in PDF format is called `vsearch_manual.pdf`. +Windows: You will now have the binary distribution in a folder called `vsearch-2.10.1-win-x86_64`. The vsearch executable is called `vsearch.exe`. The manual in PDF format is called `vsearch_manual.pdf`. -**Documentation** The VSEARCH user's manual is available in the `man` folder in the form of a [man page](https://github.com/torognes/vsearch/blob/master/man/vsearch.1). A pdf version ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.10.0/vsearch_manual.pdf)) will be generated by `make`. To install the manpage manually, copy the `vsearch.1` file or a create a symbolic link to `vsearch.1` in a folder included in your `$MANPATH`. The manual in both formats is also available with the binary distribution. The manual in PDF form ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.10.0/vsearch_manual.pdf)) is also attached to the latest [release](https://github.com/torognes/vsearch/releases). +**Documentation** The VSEARCH user's manual is available in the `man` folder in the form of a [man page](https://github.com/torognes/vsearch/blob/master/man/vsearch.1). A pdf version ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.10.1/vsearch_manual.pdf)) will be generated by `make`. To install the manpage manually, copy the `vsearch.1` file or a create a symbolic link to `vsearch.1` in a folder included in your `$MANPATH`. The manual in both formats is also available with the binary distribution. The manual in PDF form ([vsearch_manual.pdf](https://github.com/torognes/vsearch/releases/download/v2.10.1/vsearch_manual.pdf)) is also attached to the latest [release](https://github.com/torognes/vsearch/releases). ## Plugins, packages, and wrappers diff --git a/configure.ac b/configure.ac index ffde5517..2700f574 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ([2.63]) -AC_INIT([vsearch], [2.10.0], [torognes@ifi.uio.no]) +AC_INIT([vsearch], [2.10.1], [torognes@ifi.uio.no]) AC_CANONICAL_TARGET AM_INIT_AUTOMAKE([subdir-objects]) AC_LANG([C++]) diff --git a/man/vsearch.1 b/man/vsearch.1 index 8fb58bcb..0ff0428a 100644 --- a/man/vsearch.1 +++ b/man/vsearch.1 @@ -1,5 +1,5 @@ .\" ============================================================================ -.TH vsearch 1 "December 6, 2018" "version 2.10.0" "USER COMMANDS" +.TH vsearch 1 "December 7, 2018" "version 2.10.0" "USER COMMANDS" .\" ============================================================================ .SH NAME vsearch \(em chimera detection, clustering, dereplication and @@ -3487,6 +3487,12 @@ architectures. Update documentation of rereplicate command. Added the sff_convert commmand to convert SFF files to FASTQ. Added some additional option argument checks. Fixed segmentation fault bug after some fatal errors when a log file was specified. +.TP +.BR v2.10.1\~ "released December 7th, 2018" +Improved sff_convert command. It will now read several variants of the +SFF format. It is also able to read from a pipe. Warnings are given if +there are minor problems. Errors messages have been improved. Minor +speed and memory usage improvements. .RE .LP .\" ============================================================================ diff --git a/src/sffconvert.cc b/src/sffconvert.cc index e429de28..a2105094 100644 --- a/src/sffconvert.cc +++ b/src/sffconvert.cc @@ -86,23 +86,48 @@ struct sff_read_header_s uint16_t clip_adapter_right; } read_header; +uint64_t fskip(FILE * fp, uint64_t length) +{ + /* read given amount of data from a stream and ignore it */ + /* used instead of seeking in order to work with pipes */ +#define BLOCKSIZE 4096 + char buffer[BLOCKSIZE]; + + uint64_t skipped = 0; + uint64_t rest = length; + + while (rest > 0) + { + uint64_t want = ((rest > BLOCKSIZE) ? BLOCKSIZE : rest); + uint64_t got = fread(buffer, 1, want, fp); + skipped += got; + rest -= got; + if (got < want) + break; + } + return skipped; +} + void sff_convert() { if (! opt_fastqout) - fatal("No output file for sff_convert specified with --fastqout"); + fatal("No output file for sff_convert specified with --fastqout."); FILE * fp_fastqout = fopen_output(opt_fastqout); if (!fp_fastqout) - fatal("Unable to open fastq output file for writing"); + fatal("Unable to open FASTQ output file for writing."); FILE * fp_sff = fopen_input(opt_sff_convert); if (!fp_sff) - fatal("Unable to open sff input file for reading"); + fatal("Unable to open SFF input file for reading."); /* read and check header */ + uint64_t filepos = 0; + if (fread(&sff_header, 1, 31, fp_sff) < 31) - fatal("Unable to read from sff file, or file too small"); + fatal("Unable to read from SFF file. File may be truncated."); + filepos += 31; sff_header.magic_number = bswap_32(sff_header.magic_number); sff_header.version = bswap_32(sff_header.version); @@ -113,48 +138,52 @@ void sff_convert() sff_header.key_length = bswap_16(sff_header.key_length); sff_header.flows_per_read = bswap_16(sff_header.flows_per_read); -#if 0 - fprintf(stderr, "Magic number: %d\n", sff_header.magic_number); - fprintf(stderr, "Version: %d\n", sff_header.version); - fprintf(stderr, "Index offset: %lld\n", sff_header.index_offset); - fprintf(stderr, "Index length: %d\n", sff_header.index_length); - fprintf(stderr, "Number of reads: %d\n", sff_header.number_of_reads); - fprintf(stderr, "Header length: %d\n", sff_header.header_length); - fprintf(stderr, "Key length: %d\n", sff_header.key_length); - fprintf(stderr, "Flows per read: %d\n", sff_header.flows_per_read); - fprintf(stderr, "Flowgram format: %d\n", sff_header.flowgram_format_code); -#endif - if (sff_header.magic_number != sff_magic) - fatal("Invalid SFF magic number. Must be 0x2e736666 (.sff)."); + fatal("Invalid SFF file. Incorrect magic number. Must be 0x2e736666 (.sff)."); if (sff_header.version != 1) - fatal("Invalid SFF version. Must be 1."); + fatal("Invalid SFF file. Incorrect version. Must be 1."); if (sff_header.flowgram_format_code != 1) - fatal("Invalid SFF flowgram format code. Must be 1."); + fatal("Invalid SFF file. Incorrect flowgram format code. Must be 1."); if (sff_header.header_length != 8 * ((31 + sff_header.flows_per_read + sff_header.key_length + 7) / 8)) - fatal("Invalid SFF header length"); + fatal("Invalid SFF file. Incorrect header length."); if (sff_header.key_length != 4) - fatal("Invalid SFF key length. Must be 4."); + fatal("Invalid SFF file. Incorrect key length. Must be 4."); + + if ((sff_header.index_length > 0) && (sff_header.index_length < 8)) + fatal("Invalid SFF file. Incorrect index size. Must be at least 8."); /* read and check flow chars, key and padding */ - char * flow_chars = (char *) xmalloc(sff_header.flows_per_read + 1); + if (fskip(fp_sff, sff_header.flows_per_read) < sff_header.flows_per_read) + fatal("Invalid SFF file. Unable to read flow characters. File may be truncated."); + filepos += sff_header.flows_per_read; - if (fread(flow_chars, 1, sff_header.flows_per_read, fp_sff) < sff_header.flows_per_read) - fatal("Unable to read flow charactersfrom sff file, or file too small"); + char * key_sequence = (char *) xmalloc(sff_header.key_length + 1); + if (fread(key_sequence, 1, sff_header.key_length, fp_sff) < sff_header.key_length) + fatal("Invalid SFF file. Unable to read key sequence. File may be truncated."); + key_sequence[sff_header.key_length] = 0; + filepos += sff_header.key_length; - flow_chars[sff_header.flows_per_read] = 0; + uint32_t padding_length = sff_header.header_length - sff_header.flows_per_read - sff_header.key_length - 31; + if (fskip(fp_sff, padding_length) < padding_length) + fatal("Invalid SFF file. Unable to read padding. File may be truncated."); + filepos += padding_length; - char * key_sequence = (char *) xmalloc(sff_header.key_length + 1); + double totallength = 0.0; + uint32_t minimum = UINT_MAX; + uint32_t maximum = 0; - if (fread(key_sequence, 1, sff_header.key_length, fp_sff) < sff_header.key_length) - fatal("Unable to read key sequence from sff file, or file too small"); + bool index_done = (sff_header.index_offset == 0) || (sff_header.index_length == 0); + bool index_odd = false; + char index_kind[9]; - key_sequence[sff_header.key_length] = 0; + uint32_t index_padding = 0; + if ((sff_header.index_length & 7) > 0) + index_padding = 8 - (sff_header.index_length & 7); if (! opt_quiet) { @@ -170,31 +199,36 @@ void sff_convert() fprintf(fp_log, "Key sequence: %s\n", key_sequence); } - char padding[8]; - - uint32_t padding_length = sff_header.header_length - sff_header.flows_per_read - sff_header.key_length - 31; - -#if 0 - fprintf(stderr, "Padding length: %d\n", padding_length); - fprintf(stderr, "\n"); -#endif - - if ((padding_length > 0) && - (fread(padding, 1, padding_length, fp_sff) < padding_length)) - fatal("Unable to read padding from sff file, or file too small"); - - double totallength = 0.0; - uint32_t minimum = UINT_MAX; - uint32_t maximum = 0; - progress_init("Converting SFF: ", sff_header.number_of_reads); for(uint32_t read_no = 0; read_no < sff_header.number_of_reads; read_no++) { + /* check if the index block is here */ + + if (! index_done) + { + if (filepos == sff_header.index_offset) + { + if (fread(index_kind, 1, 8, fp_sff) < 8) + fatal("Invalid SFF file. Unable to read index header. File may be truncated."); + filepos += 8; + index_kind[8] = 0; + + uint64 index_size = sff_header.index_length - 8 + index_padding; + if (fskip(fp_sff, index_size) != index_size) + fatal("Invalid SFF file. Unable to read entire index. File may be truncated."); + + filepos += index_size; + index_done = true; + index_odd = true; + } + } + /* read and check each read header */ if (fread(&read_header, 1, 16, fp_sff) < 16) - fatal("Unable to read read header from sff file, or file too small"); + fatal("Invalid SFF file. Unable to read read header. File may be truncated."); + filepos += 16; read_header.read_header_length = bswap_16(read_header.read_header_length); read_header.name_length = bswap_16(read_header.name_length); @@ -205,75 +239,72 @@ void sff_convert() read_header.clip_adapter_right = bswap_16(read_header.clip_adapter_right); if (read_header.read_header_length != 8 * ((16 + read_header.name_length + 7) / 8)) - fatal("Invalid SFF read header length"); + fatal("Invalid SFF file. Incorrect read header length."); + if (read_header.clip_qual_left > read_header.number_of_bases) + fatal("Invalid SFF file. Incorrect clip_qual_left value."); + if (read_header.clip_adapter_left > read_header.number_of_bases) + fatal("Invalid SFF file. Incorrect clip_adapter_left value."); + if (read_header.clip_qual_right > read_header.number_of_bases) + fatal("Invalid SFF file. Incorrect clip_qual_right value."); + if (read_header.clip_adapter_right > read_header.number_of_bases) + fatal("Invalid SFF file. Incorrect clip_adapter_right value."); char * read_name = (char *) xmalloc(read_header.name_length + 1); - if (fread(read_name, 1, read_header.name_length, fp_sff) < read_header.name_length) - fatal("Unable to read read name from sff file, or file too small"); - + fatal("Invalid SFF file. Unable to read read name. File may be truncated."); + filepos += read_header.name_length; read_name[read_header.name_length] = 0; uint32_t read_header_padding_length = read_header.read_header_length - read_header.name_length - 16; - - char read_header_padding[8]; - - if (fread(read_header_padding, 1, read_header_padding_length, fp_sff) < read_header_padding_length) - fatal("Unable to read read header padding from sff file, or file too small"); - - if (read_header.clip_qual_left > read_header.number_of_bases) - fatal("Invalid clip_qual_left value"); - if (read_header.clip_adapter_left > read_header.number_of_bases) - fatal("Invalid clip_adapter_left value"); - if (read_header.clip_qual_right > read_header.number_of_bases) - fatal("Invalid clip_qual_right value"); - if (read_header.clip_adapter_right > read_header.number_of_bases) - fatal("Invalid clip_adapter_right value"); + if (fskip(fp_sff, read_header_padding_length) < read_header_padding_length) + fatal("Invalid SFF file. Unable to read read header padding. File may be truncated."); + filepos += read_header_padding_length; /* read and check the flowgram and sequence */ - uint16_t * flowgram_values = (uint16_t *) xmalloc(sff_header.flows_per_read * 2); - if (fread(flowgram_values, 1, 2 * sff_header.flows_per_read, fp_sff) < sff_header.flows_per_read) - fatal("Unable to read flowgram values from sff file, or file too small"); + if (fskip(fp_sff, 2 * sff_header.flows_per_read) < sff_header.flows_per_read) + fatal("Invalid SFF file. Unable to read flowgram values. File may be truncated."); + filepos += 2 * sff_header.flows_per_read; - uint8_t * flow_index_per_base = (uint8_t *) xmalloc(read_header.number_of_bases); - if (fread(flow_index_per_base, 1, read_header.number_of_bases, fp_sff) < read_header.number_of_bases) - fatal("Unable to read flow indices from sff file, or file too small"); + if (fskip(fp_sff, read_header.number_of_bases) < read_header.number_of_bases) + fatal("Invalid SFF file. Unable to read flow indices. File may be truncated."); + filepos += read_header.number_of_bases; char * bases = (char *) xmalloc(read_header.number_of_bases + 1); if (fread(bases, 1, read_header.number_of_bases, fp_sff) < read_header.number_of_bases) - fatal("Unable to read read length from sff file, or file too small"); + fatal("Invalid SFF file. Unable to read read length. File may be truncated."); bases[read_header.number_of_bases] = 0; - - uint8_t * quality_scores = (uint8_t *) xmalloc(read_header.number_of_bases); - if (fread(quality_scores, 1, read_header.number_of_bases, fp_sff) < read_header.number_of_bases) - fatal("Unable to read quality scores from sff file, or file too small"); + filepos += read_header.number_of_bases; char * qual = (char *) xmalloc(read_header.number_of_bases + 1); + if (fread(qual, 1, read_header.number_of_bases, fp_sff) < read_header.number_of_bases) + fatal("Invalid SFF file. Unable to read quality scores. File may be truncated."); + filepos += read_header.number_of_bases; + /* convert quality scores to ascii characters */ for(uint32_t base_no = 0; base_no < read_header.number_of_bases; base_no++) { - uint8_t q = quality_scores[base_no]; + int q = qual[base_no]; if (q < opt_fastq_qminout) q = opt_fastq_qminout; if (q > opt_fastq_qmaxout) q = opt_fastq_qmaxout; - char c = opt_fastq_asciiout + q; - qual[base_no] = c; + qual[base_no] = opt_fastq_asciiout + q; } qual[read_header.number_of_bases] = 0; uint32_t read_data_length = (2 * sff_header.flows_per_read + 3 * read_header.number_of_bases); uint32_t read_data_padded_length = 8 * ((read_data_length + 7) / 8); uint32_t read_data_padding_length = read_data_padded_length - read_data_length; - char read_data_padding[8]; - if (fread(read_data_padding, 1, read_data_padding_length, fp_sff) < read_data_padding_length) - fatal("Unable to read read data padding from sff file, or file too small"); - uint32_t clip_start = 0; - uint32_t clip_end = read_header.number_of_bases; + if (fskip(fp_sff, read_data_padding_length) < read_data_padding_length) + fatal("Invalid SFF file. Unable to read read data padding. File may be truncated."); + filepos += read_data_padding_length; + uint32_t clip_start = 0; clip_start = MAX(1, MAX(read_header.clip_qual_left, read_header.clip_adapter_left)) - 1; + + uint32_t clip_end = read_header.number_of_bases; clip_end = MIN((read_header.clip_qual_right == 0 ? read_header.number_of_bases : read_header.clip_qual_right), (read_header.clip_adapter_right == 0 ? read_header.number_of_bases : read_header.clip_adapter_right)); /* make the clipped bases lowercase and the rest uppercase */ @@ -285,7 +316,12 @@ void sff_convert() bases[i] = toupper(bases[i]); } - if (! opt_sff_clip) + if (opt_sff_clip) + { + bases[clip_end] = 0; + qual[clip_end] = 0; + } + else { clip_start = 0; clip_end = read_header.number_of_bases; @@ -293,9 +329,6 @@ void sff_convert() uint32_t length = clip_end - clip_start; - bases[clip_end] = 0; - qual[clip_end] = 0; - fastq_print_general(fp_fastqout, bases + clip_start, length, @@ -305,10 +338,7 @@ void sff_convert() 0, 0, 0, 0); xfree(read_name); - xfree(flowgram_values); - xfree(flow_index_per_base); xfree(bases); - xfree(quality_scores); xfree(qual); totallength += length; @@ -321,10 +351,70 @@ void sff_convert() } progress_done(); + /* check if the index block is here */ + + if (! index_done) + { + if (filepos == sff_header.index_offset) + { + if (fread(index_kind, 1, 8, fp_sff) < 8) + fatal("Invalid SFF file. Unable to read index header. File may be truncated."); + filepos += 8; + index_kind[8] = 0; + + uint64 index_size = sff_header.index_length - 8; + if (fskip(fp_sff, index_size) != index_size) + fatal("Invalid SFF file. Unable to read entire index. File may be truncated."); + + filepos += index_size; + index_done = true; + + /* try to skip padding, if any */ + + if (index_padding > 0) + { + uint64_t got = fskip(fp_sff, index_padding); + if ((got < index_padding) && (got != 0)) + fprintf(stderr, "WARNING: Additional data at end of SFF file ignored\n"); + } + } + } + + if (! index_done) + { + fprintf(stderr, "WARNING: SFF index missing\n"); + if (opt_log) + fprintf(fp_log, "WARNING: SFF index missing\n"); + } + + if (index_odd) + { + fprintf(stderr, "WARNING: Index at unusual position in file\n"); + if (opt_log) + fprintf(fp_log, "WARNING: Index at unusual position in file\n"); + } + + /* ignore the rest of file */ + + /* try reading just another byte */ + + if (fskip(fp_sff, 1) > 0) + { + fprintf(stderr, "WARNING: Additional data at end of SFF file ignored\n"); + if (opt_log) + fprintf(fp_log, "WARNING: Additional data at end of SFF file ignored\n"); + } + + fclose(fp_sff); + fclose(fp_fastqout); + double average = totallength / sff_header.number_of_reads; if (! opt_quiet) { + if (sff_header.index_length > 0) + fprintf(stderr, "Index type: %s\n", index_kind); + fprintf(stderr, "\nSFF file read successfully.\n"); if (sff_header.number_of_reads > 0) fprintf(stderr, "Sequence length: minimum %d, average %.1f, maximum %d\n", minimum, @@ -334,6 +424,9 @@ void sff_convert() if (opt_log) { + if (sff_header.index_length > 0) + fprintf(fp_log, "Index type: %s\n", index_kind); + fprintf(fp_log, "\nSFF file read successfully.\n"); if (sff_header.number_of_reads > 0) fprintf(fp_log, "Sequence length: minimum %d, average %.1f, maximum %d\n", minimum, @@ -341,19 +434,5 @@ void sff_convert() maximum); } - if (sff_header.index_offset > 0) - { - size_t filepos = xftello(fp_sff); - - if (filepos != sff_header.index_offset) - fatal("Invalid sff file (wrong index offset)"); - - if (fseek(fp_sff, sff_header.index_length, SEEK_CUR) != 0) - fatal("Unable to seek past index in SFF file"); - } - - /* ignore padding and rest of file */ - - fclose(fp_sff); - fclose(fp_fastqout); + xfree(key_sequence); }