diff --git a/doc/vsearch.1 b/doc/vsearch.1 index e3e7ab72..a20b64b7 100644 --- a/doc/vsearch.1 +++ b/doc/vsearch.1 @@ -1,5 +1,5 @@ .\" ============================================================================ -.TH vsearch 1 "January 23, 2015" "version 1.0.10" "USER COMMANDS" +.TH vsearch 1 "February 5, 2015" "version 1.0.11" "USER COMMANDS" .\" ============================================================================ .SH NAME vsearch \(em chimera detection, clustering, dereplication, masking, pairwise alignment, searching, shuffling and sorting of amplicons from metagenomic projects. @@ -22,7 +22,7 @@ Clustering: .RS \fBvsearch\fR (--cluster_fast | --cluster_size | --cluster_smallmem) \fIfastafile\fR (--alnout | --blast6out | --centroids | --clusters | ---msaout | --uc | --userout) \fIoutputfile\fR --id \fIreal\fR +--msaout | --samout | --uc | --userout) \fIoutputfile\fR --id \fIreal\fR [\fIoptions\fR] .PP .RE @@ -41,14 +41,14 @@ Masking: Pairwise alignment: .RS \fBvsearch\fR --allpairs_global \fIfastafile\fR (--alnout | ---blast6out | --matched | --notmatched | --uc | --userout) +--blast6out | --matched | --notmatched | --samout | --uc | --userout) \fIoutputfile\fR (--acceptall | --id \fIreal\fR) [\fIoptions\fR] .PP .RE Searching: .RS \fBvsearch\fR --usearch_global \fIfastafile\fR --db \fIfastafile\fR -(--alnout | --blast6out | --uc | --userout) \fIoutputfile\fR --id +(--alnout | --blast6out | --samout | --uc | --userout) \fIoutputfile\fR --id \fIreal\fR [\fIoptions\fR] .PP .RE @@ -434,8 +434,9 @@ just a decreasing length ordering. .TP Most searching options also apply to clustering: .br ---alnout, --blast6out, --userout, --userfields, --fastapairs, --matched, ---notmatched, --maxaccept, --maxreject, score filtering, gap penalties, masking. (see the Searching section). +--alnout, --samout, --blast6out, --userout, --userfields, --fastapairs, +--matched, --notmatched, --maxaccept, --maxreject, +score filtering, gap penalties, masking. (see the Searching section). .RE .PP .\" ---------------------------------------------------------------------------- @@ -549,7 +550,7 @@ Pairwise alignment options: Perform optimal global pairwise alignments of all vs. all fasta sequences contained in \fIfilename\fR. The results of the n * (n-1) / 2 alignments are written to the result files specified with --alnout, ---blast6out, --fastapairs --matched, --notmatched, --uc or --userout +--blast6out, --fastapairs --matched, --notmatched, --samout, --uc or --userout (see Searching section below). Specify either the --acceptall option to output all pairwise alignments, or specify an identity level with --id to discard weak alignments. Most other accept/reject options (see @@ -790,7 +791,7 @@ Reject the target sequence if the alignment contains at least .BI --maxhits\~ "positive integer" Maximum number of hits to show once the search is terminated (hits are sorted by decreasing identity). Unlimited by default value. \fBIt -applies to alnout, blast6out, uc, userout, fastapairs\fR. +applies to alnout, blast6out, samout, uc, userout, fastapairs\fR. .TP .BI --maxid \0real Reject the target sequence if its percentage of identity with the @@ -859,9 +860,9 @@ Write query sequences not matching database target sequences to .TP .B --output_no_hits Write both matching and non-matching queries to --alnout, --blast6out, -and --userout output files (--uc and --uc_allhits output files always -feature non-matching queries). Non-matching queries are labelled "No -hits" in --alnout files. +--samout and --userout output files (--uc and --uc_allhits output files +always feature non-matching queries). Non-matching queries are labelled +"No hits" in --alnout files. .TP .BI --qmask\~ "none|dust|soft" Mask simple repeats and low-complexity regions in query sequences @@ -882,6 +883,10 @@ Reject the target sequence if the alignment ends with gaps. Width of alignment lines in --alnout output. The default value is 64. Set to 0 to eliminate wrapping. .TP +.BI --samout \0filename +Write alignment results to \fIfilename\fR in the SAM format. +Output order may vary when using multiple threads. +.TP .B --self Reject the alignment if the query and target labels are identical. .TP diff --git a/doc/vsearch_manual.pdf b/doc/vsearch_manual.pdf index fd2b92c4..afb69d38 100644 Binary files a/doc/vsearch_manual.pdf and b/doc/vsearch_manual.pdf differ diff --git a/eval/eval.sh b/eval/eval.sh index 4cc67310..a48f142f 100755 --- a/eval/eval.sh +++ b/eval/eval.sh @@ -7,7 +7,7 @@ THREADS=0 DUPLICATES=100 DIR=. DB=../data/Rfam_11_0.fasta - +ID=0.5 if [ $(uname -s) == "Linux" ]; then VSEARCH=$(ls -v ../bin/vsearch*linux* | tail -1) @@ -45,7 +45,7 @@ echo Running search /usr/bin/time $PROG \ --usearch_global $DIR/qq.fsa \ --db $DIR/db.fsa \ - --id 0.5 \ + --id $ID \ --maxaccepts 1 \ --maxrejects 32 \ --strand plus \ diff --git a/src/allpairs.cc b/src/allpairs.cc index 1f1b9203..a30d4977 100644 --- a/src/allpairs.cc +++ b/src/allpairs.cc @@ -34,6 +34,7 @@ static int qmatches; static int queries; static long progress = 0; static FILE * fp_alnout = 0; +static FILE * fp_samout = 0; static FILE * fp_userout = 0; static FILE * fp_blast6out = 0; static FILE * fp_uc = 0; @@ -85,6 +86,15 @@ void allpairs_output_results(int hit_count, qseqlen, qsequence_rc); + if (fp_samout) + results_show_samout(fp_samout, + hits, + toreport, + query_head, + qsequence, + qseqlen, + qsequence_rc); + if (toreport) { double top_hit_id = hits[0].id; @@ -493,6 +503,13 @@ void allpairs_global(char * cmdline, char * progheader) fprintf(fp_alnout, "%s\n", progheader); } + if (opt_samout) + { + fp_samout = fopen(opt_samout, "w"); + if (! fp_samout) + fatal("Unable to open SAM output file for writing"); + } + if (opt_userout) { fp_userout = fopen(opt_userout, "w"); @@ -585,5 +602,7 @@ void allpairs_global(char * cmdline, char * progheader) fclose(fp_userout); if (fp_alnout) fclose(fp_alnout); + if (fp_samout) + fclose(fp_samout); show_rusage(); } diff --git a/src/cluster.cc b/src/cluster.cc index 715e30b8..f1842286 100644 --- a/src/cluster.cc +++ b/src/cluster.cc @@ -37,6 +37,7 @@ static int clusters = 0; static FILE * fp_centroids = 0; static FILE * fp_uc = 0; static FILE * fp_alnout = 0; +static FILE * fp_samout = 0; static FILE * fp_userout = 0; static FILE * fp_blast6out = 0; static FILE * fp_fastapairs = 0; @@ -285,6 +286,11 @@ void cluster_core_results_hit(struct hit * best, best, 1, query_head, qsequence, qseqlen, qsequence_rc); + if (fp_samout) + results_show_samout(fp_samout, + best, 1, query_head, + qsequence, qseqlen, qsequence_rc); + if (fp_fastapairs) results_show_fastapairs_one(fp_fastapairs, best, @@ -884,6 +890,13 @@ void cluster(char * dbname, fprintf(fp_alnout, "%s\n", progheader); } + if (opt_samout) + { + fp_samout = fopen(opt_samout, "w"); + if (! fp_samout) + fatal("Unable to open SAM output file for writing"); + } + if (opt_userout) { fp_userout = fopen(opt_userout, "w"); @@ -1170,6 +1183,8 @@ void cluster(char * dbname, fclose(fp_userout); if (fp_alnout) fclose(fp_alnout); + if (fp_samout) + fclose(fp_samout); if (fp_uc) fclose(fp_uc); if (fp_centroids) diff --git a/src/derep.cc b/src/derep.cc index 47f908f9..61ae106b 100644 --- a/src/derep.cc +++ b/src/derep.cc @@ -1,5 +1,5 @@ /* - Copyright (C) 2014 Torbjorn Rognes + Copyright (C) 2014-2015 Torbjorn Rognes This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as diff --git a/src/results.cc b/src/results.cc index 06d026b6..735a2b9f 100644 --- a/src/results.cc +++ b/src/results.cc @@ -457,3 +457,215 @@ void results_show_alnout(FILE * fp, fprintf(fp,"No hits\n"); } } + +bool inline nucleotide_equal(char a, char b) +{ + return chrmap_4bit[(int)a] == chrmap_4bit[(int)b]; +} + +void build_sam_strings(char * alignment, + char * queryseq, + char * targetseq, + xstring * cigar, + xstring * md) +{ + /* + convert cigar to sam format: + add "1" to operations without run length + flip direction of indels in cigar string + + build MD-string with substitutions + */ + + cigar->empty(); + md->empty(); + + char * p = alignment; + char * e = p + strlen(p); + + int qpos = 0; + int tpos = 0; + + int matched = 0; + bool flag = 0; /* 1: MD string ends with a number */ + + while(p < e) + { + int run = 1; + int scanned = 0; + sscanf(p, "%d%n", & run, & scanned); + p += scanned; + char op = *p++; + + switch (op) + { + case 'M': + cigar->add_d(run); + cigar->add_c('M'); + + for(int i=0; iadd_d(matched); + matched = 0; + flag = 1; + } + + md->add_c(targetseq[tpos]); + flag = 0; + } + qpos++; + tpos++; + } + + break; + + case 'D': + cigar->add_d(run); + cigar->add_c('I'); + qpos += run; + break; + + case 'I': + cigar->add_d(run); + cigar->add_c('D'); + + if (!flag) + { + md->add_d(matched); + matched = 0; + flag = 1; + } + + md->add_c('^'); + for(int i=0; iadd_c(targetseq[tpos++]); + flag = 0; + break; + } + } + + if (!flag) + { + md->add_d(matched); + matched = 0; + flag = 1; + } +} + +void results_show_samout(FILE * fp, + struct hit * hits, + int hitcount, + char * query_head, + char * qsequence, + long qseqlen, + char * rc) +{ + /* + SAM format output + + http://samtools.github.io/hts-specs/SAMv1.pdf + http://www.drive5.com/usearch/manual/sam_files.html + http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#sam-output + http://davetang.org/muse/2011/01/28/perl-and-sam/ + + 1: qname, query template name + 2: flag, bitwise flag (12 bits) + (0x004=unmapped, 0x010=rev strand, 0x100 sec. alignment) + 3: rname, reference sequence name + 4: pos, 1-based leftmost mapping position (1) + 5: mapq, mapping quality (255) + 6: cigar, cigar string (MID) + 7: rnext, ref name of next/paired read (*) + 8: pnest, position of next/paired read (0) + 9: tlen, obs template length (target length) + 10: seq, segment of sequence + 11: qual, ascii of phred based quality+33 (*) + 12: optional tags (tag:type:value) + + Optional tags AS, XN, XM, XO, XG, NM, MD and YT used in usearch8. + + Usearch8: + + AS:i:? alignment score (i.e percent identity) + XN:i:? next best alignment score (always 0?) + XM:i:? number of mismatches + XO:i:? number of gap opens (excluding terminal gaps) + XG:i:? number of gap extensions (excluding terminal gaps) + NM:i:? edit distance (sum of XM and XG) + MD:Z:? variant string + YT:Z:UU string representing alignment type + + */ + + if (hitcount > 0) + { + double top_hit_id = hits[0].id; + + for(int t = 0; t < hitcount; t++) + { + struct hit * hp = hits + t; + + if (opt_top_hits_only && (hp->id < top_hit_id)) + break; + + /* + + */ + + xstring cigar; + xstring md; + + build_sam_strings(hp->nwalignment, + hp->strand ? rc : qsequence, + db_getsequence(hp->target), + & cigar, + & md); + + fprintf(fp, + "%s\t%u\t%s\t%lu\t%u\t%s\t%s\t%lu\t%lu\t%s\t%s\t" + "AS:i:%.0f\tXN:i:%d\tXM:i:%d\tXO:i:%d\t" + "XG:i:%d\tNM:i:%d\tMD:Z:%s\tYT:Z:%s\n", + query_head, + 0x10 * hp->strand | (t>0 ? 0x100 : 0), + db_getheader(hp->target), + 1UL, + 255, + cigar.get_string(), + "*", + 0UL, + 0UL, + hp->strand ? rc : qsequence, + "*", + hp->id, + 0, + hp->mismatches, + hp->internal_gaps, + hp->internal_indels, + hp->mismatches + hp->internal_indels, + md.get_string(), + "UU"); + } + } + else if (opt_output_no_hits) + { + fprintf(fp, + "%s\t%u\t%s\t%lu\t%u\t%s\t%s\t%lu\t%lu\t%s\t%s\n", + query_head, + 0x04, + "*", + 0UL, + 255, + "*", + "*", + 0UL, + 0UL, + qsequence, + "*"); + } +} diff --git a/src/results.h b/src/results.h index 43cedcb4..6af94c98 100644 --- a/src/results.h +++ b/src/results.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2014 Torbjorn Rognes + Copyright (C) 2014-2015 Torbjorn Rognes This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as @@ -54,3 +54,11 @@ void results_show_fastapairs_one(FILE * fp, char * qsequence, long qseqlen, char * rc); + +void results_show_samout(FILE * fp, + struct hit * hits, + int hitcount, + char * query_head, + char * qsequence, + long qseqlen, + char * rc); diff --git a/src/search.cc b/src/search.cc index bfe943b4..0ce5d4ca 100644 --- a/src/search.cc +++ b/src/search.cc @@ -1,5 +1,5 @@ /* - Copyright (C) 2014 Torbjorn Rognes + Copyright (C) 2014-2015 Torbjorn Rognes This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as @@ -36,6 +36,7 @@ static pthread_mutex_t mutex_output; static int qmatches; static int queries; static int * dbmatched; +static FILE * fp_samout = 0; static FILE * fp_alnout = 0; static FILE * fp_userout = 0; static FILE * fp_blast6out = 0; @@ -67,6 +68,15 @@ void search_output_results(int hit_count, qseqlen, qsequence_rc); + if (fp_samout) + results_show_samout(fp_samout, + hits, + toreport, + query_head, + qsequence, + qseqlen, + qsequence_rc); + if (toreport) { double top_hit_id = hits[0].id; @@ -415,6 +425,13 @@ void search_prep(char * cmdline, char * progheader) fprintf(fp_alnout, "%s\n", progheader); } + if (opt_samout) + { + fp_samout = fopen(opt_samout, "w"); + if (! fp_samout) + fatal("Unable to open SAM output file for writing"); + } + if (opt_userout) { fp_userout = fopen(opt_userout, "w"); @@ -504,6 +521,8 @@ void search_done() fclose(fp_userout); if (fp_alnout) fclose(fp_alnout); + if (fp_samout) + fclose(fp_samout); show_rusage(); } diff --git a/src/string.h b/src/string.h new file mode 100644 index 00000000..a217a741 --- /dev/null +++ b/src/string.h @@ -0,0 +1,104 @@ +/* + Copyright (C) 2015 Torbjorn Rognes + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + + Contact: Torbjorn Rognes , + Department of Informatics, University of Oslo, + PO Box 1080 Blindern, NO-0316 Oslo, Norway +*/ + +static char empty_string[1] = ""; + +class xstring +{ + char * string; + size_t length; + size_t alloc; + + public: + + xstring() + { + length = 0; + alloc = 0; + string = 0; + } + + ~xstring() + { + if (alloc > 0) + free(string); + alloc = 0; + string = 0; + length = 0; + } + + void empty() + { + length = 0; + } + + char * get_string() + { + if (length > 0) + return string; + else + return empty_string; + } + + size_t get_length() + { + return length; + } + + void add_c(char c) + { + size_t needed = 1; + if (length + needed + 1 > alloc) + { + alloc = length + needed + 1; + string = (char*) xrealloc(string, alloc); + } + string[length] = c; + length += 1; + string[length] = 0; + } + + void add_d(int d) + { + int needed = snprintf(0, 0, "%d", d); + if (needed < 0) + fatal("snprintf failed"); + if (length + needed + 1 > alloc) + { + alloc = length + needed + 1; + string = (char*) xrealloc(string, alloc); + } + sprintf(string + length, "%d", d); + length += needed; + } + + void add_s(char * s) + { + size_t needed = strlen(s); + if (length + needed + 1 > alloc) + { + alloc = length + needed + 1; + string = (char*) xrealloc(string, alloc); + } + strcpy(string + length, s); + length += needed; + } +}; diff --git a/src/vsearch.cc b/src/vsearch.cc index 561ba670..ffc26140 100644 --- a/src/vsearch.cc +++ b/src/vsearch.cc @@ -46,6 +46,7 @@ char * opt_notmatched; char * opt_output; char * opt_pattern; char * opt_relabel; +char * opt_samout; char * opt_shuffle; char * opt_sortbylength; char * opt_sortbysize; @@ -471,6 +472,7 @@ void args_init(int argc, char **argv) opt_relabel = 0; opt_rightjust = 0; opt_rowlen = 64; + opt_samout = 0; opt_seed = 0; opt_self = 0; opt_selfid = 0; @@ -607,6 +609,7 @@ void args_init(int argc, char **argv) {"allpairs_global", required_argument, 0, 0 }, {"acceptall", no_argument, 0, 0 }, {"cluster_size", required_argument, 0, 0 }, + {"samout", required_argument, 0, 0 }, { 0, 0, 0, 0 } }; @@ -1164,6 +1167,11 @@ void args_init(int argc, char **argv) opt_cluster_size = optarg; break; + case 103: + /* samout */ + opt_samout = optarg; + break; + default: fatal("Internal error in option parsing"); } @@ -1377,6 +1385,7 @@ void cmd_help() "\n" "Searching options\n" " --alnout FILENAME filename for human-readable alignment output\n" + " --samout FILENAME filename for SAM format output\n" " --blast6out FILENAME filename for blast-like tab-separated output\n" " --db FILENAME filename for FASTA formatted database for search\n" " --dbmask none|dust|soft mask db with dust, soft or no method (dust)\n" @@ -1473,7 +1482,8 @@ void cmd_usearch_global() if ((!opt_alnout) && (!opt_userout) && (!opt_uc) && (!opt_blast6out) && (!opt_matched) && (!opt_notmatched) && - (!opt_dbmatched) && (!opt_dbnotmatched)) + (!opt_dbmatched) && (!opt_dbnotmatched) && + (!opt_samout)) fatal("No output files specified"); if (!opt_db) diff --git a/src/vsearch.h b/src/vsearch.h index 891fdd87..4470abdc 100644 --- a/src/vsearch.h +++ b/src/vsearch.h @@ -54,10 +54,11 @@ #include #endif +#include "util.h" +#include "string.h" #include "align_simd.h" #include "maps.h" #include "arch.h" -#include "util.h" #include "db.h" #include "query.h" #include "align.h" @@ -83,7 +84,7 @@ #include "allpairs.h" #define PROG_NAME "vsearch" -#define PROG_VERSION "v1.0.10" +#define PROG_VERSION "v1.0.11" #ifdef __APPLE__ #define PROG_ARCH "osx_x86_64" @@ -130,6 +131,7 @@ extern char * opt_notmatched; extern char * opt_output; extern char * opt_pattern; extern char * opt_relabel; +extern char * opt_samout; extern char * opt_shuffle; extern char * opt_sortbylength; extern char * opt_sortbysize; diff --git a/test/search.sh b/test/search.sh index c4ff3552..d57a111e 100755 --- a/test/search.sh +++ b/test/search.sh @@ -25,7 +25,7 @@ else fi fi -/usr/bin/time $PROG \ +CMD="/usr/bin/time $PROG \ --usearch_global $Q \ --db $DB \ --threads $T \ @@ -43,4 +43,11 @@ fi --fastapairs fastapairs.$P.fsa \ --dbmatched dbmatched.$P.fsa \ --dbnotmatched dbnotmatched.$P.fsa \ - --blast6out blast6out.$P.bl6 + --blast6out blast6out.$P.bl6" + +echo Search test +echo +echo Running command: $CMD +echo + +$CMD