diff --git a/NEWS b/NEWS index 0d8eb5e72..a1d1ad3de 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,10 @@ ## Release a.b +Changes affecting the whole of bcftools, or multiple commands: + +* Support auto indexing during writing BCF and VCF.gz via new `--write-index` option + + Changes affecting specific commands: * bcftools annotate diff --git a/bcftools.h b/bcftools.h index c3f7ded16..bba71e3b6 100644 --- a/bcftools.h +++ b/bcftools.h @@ -1,6 +1,6 @@ /* bcftools.h -- utility function declarations. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -49,6 +49,9 @@ void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2 // newline will be added by the function. void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2); +// For on the fly index creation with --write-index +int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname); + void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd); const char *hts_bcf_wmode(int file_type); const char *hts_bcf_wmode2(int file_type, const char *fname); diff --git a/csq.c b/csq.c index 49812d4de..364acebd1 100644 --- a/csq.c +++ b/csq.c @@ -574,6 +574,8 @@ typedef struct _args_t // text tab-delimited output (out) or vcf/bcf output (out_fh) FILE *out; htsFile *out_fh; + char *index_fn; + int write_index; // vcf bcf_srs_t *sr; @@ -1536,6 +1538,7 @@ void init_data(args_t *args) if ( args->hdr_nsmpl ) bcf_hdr_printf(args->hdr,"##FORMAT=",args->bcsq_tag); if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); + if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } if ( args->verbosity > 0 ) fprintf(stderr,"Calling...\n"); } @@ -1571,7 +1574,18 @@ void destroy_data(args_t *args) if ( args->smpl ) smpl_ilist_destroy(args->smpl); int ret; if ( args->out_fh ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } ret = hts_close(args->out_fh); + } else ret = fclose(args->out); if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); @@ -4272,6 +4286,7 @@ static const char *usage(void) " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" " --threads INT Use multithreading with worker threads [0]\n" " -v, --verbose INT Verbosity level 0-2 [1]\n" + " --write-index Automatically index the output files [off]\n" "\n" "Example:\n" " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" @@ -4321,6 +4336,7 @@ int main_csq(int argc, char *argv[]) {"targets-file",1,0,'T'}, {"targets-overlap",required_argument,NULL,5}, {"no-version",no_argument,NULL,3}, + {"write-index",no_argument,NULL,6}, {0,0,0,0} }; int c, targets_is_file = 0, regions_is_file = 0; @@ -4409,6 +4425,7 @@ int main_csq(int argc, char *argv[]) targets_overlap = parse_overlap_option(optarg); if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; + case 6 : args->write_index = 1; break; case 'h': case '?': error("%s",usage()); default: error("The option not recognised: %s\n\n", optarg); break; diff --git a/doc/bcftools.1 b/doc/bcftools.1 index 0e3d5290e..e212755ab 100644 --- a/doc/bcftools.1 +++ b/doc/bcftools.1 @@ -1,13 +1,13 @@ '\" t .\" Title: bcftools .\" Author: [see the "AUTHOR(S)" section] -.\" Generator: Asciidoctor 2.0.16.dev -.\" Date: 2023-02-21 +.\" Generator: Asciidoctor 2.0.16 +.\" Date: 2023-03-10 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "BCFTOOLS" "1" "2023-02-21" "\ \&" "\ \&" +.TH "BCFTOOLS" "1" "2023-03-10" "\ \&" "\ \&" .ie \n(.g .ds Aq \(aq .el .ds Aq ' .ss \n[.ss] 0 @@ -51,10 +51,10 @@ standard input (stdin) and outputs to the standard output (stdout). Several commands can thus be combined with Unix pipes. .SS "VERSION" .sp -This manual page was last updated \fB2023\-02\-21\fP and refers to bcftools git version \fB1.17\fP. +This manual page was last updated \fB2023\-03\-10 08:27 GMT\fP and refers to bcftools git version \fB1.17\-10\-g9655089+\fP. .SS "BCF1" .sp -The BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP +The obsolete BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP compatible with this version of bcftools. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0.1.19 to convert to VCF, which can then be read by @@ -75,6 +75,9 @@ done with \fIbcftools view\fP. Users are now required to choose between the old samtools calling model (\fI\-c/\-\-consensus\-caller\fP) and the new multiallelic calling model (\fI\-m/\-\-multiallelic\-caller\fP). The multiallelic calling model is recommended for most tasks. +.SS "FILTERING EXPRESSIONS" +.sp +See \fBEXPRESSIONS\fP .SH "LIST OF COMMANDS" .sp For a full list of available commands, run \fBbcftools\fP without arguments. For a full @@ -344,6 +347,17 @@ Some helper scripts are bundled with the bcftools code. . sp -1 . IP \(bu 2.3 .\} +\fBgff2gff\fP .. converts a GFF file to the format required by \fBcsq\fP +.RE +.sp +.RS 4 +.ie n \{\ +\h'-04'\(bu\h'+03'\c +.\} +.el \{\ +. sp -1 +. IP \(bu 2.3 +.\} \fBplot\-vcfstats\fP .. plots the output of \fBstats\fP .RE .SH "COMMANDS AND OPTIONS" @@ -597,6 +611,11 @@ Same as \fB\-\-regions\-overlap\fP but for \fB\-t/\-T\fP. Use multithreading with \fIINT\fP worker threads. The option is currently used only for the compression of the output stream, only when \fI\-\-output\-type\fP is \fIb\fP or \fIz\fP. Default: 0. .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output files. Can be used only for compressed BCF and VCF output. +.RE .SS "bcftools annotate \fI[OPTIONS]\fP \fIFILE\fP" .sp Add or remove annotations. @@ -881,6 +900,11 @@ except GT. To remove all INFO tags except "FOO" and "BAR", use "INFO" can be abbreviated to "INF" and "FORMAT" to "FMT". .RE .sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE +.sp \fBExamples:\fP .sp .if n .RS 4 @@ -1017,6 +1041,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "Input/output options:" .sp \fB\-A, \-\-keep\-alts\fP @@ -1401,6 +1430,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "bcftools consensus \fI[OPTIONS]\fP \fIFILE\fP" .sp Create consensus sequence by applying VCF variants to a reference fasta file. @@ -1617,6 +1651,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "VCF output options:" .sp \fB\-\-no\-version\fP @@ -1987,6 +2026,7 @@ transcripts in malformatted GFFs with incorrect phase .RS 4 GFF3 annotation file (required), such as \c .URL "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens" "" "." +The script \fBgff2gff\fP can help with conversion from non\-standard GFF formats. An example of a minimal working GFF file: .RE .sp @@ -2137,6 +2177,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE +.sp \fBExamples:\fP .sp .if n .RS 4 @@ -2366,6 +2411,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "bcftools gtcheck [\fIOPTIONS\fP] [\fB\-g\fP \fIgenotypes.vcf.gz\fP] \fIquery.vcf.gz\fP" .sp Checks sample identity. The program can operate in two modes. If the \fB\-g\fP @@ -2676,6 +2726,11 @@ see \fBCommon Options\fP list of input files to output given as 1\-based indices. With \fB\-p\fP and no \fB\-w\fP, all files are written. .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file. This is done automatically with the \fB\-p\fP option. +.RE .SS "Examples:" .sp Create intersection and complements of two sets saving the output in dir/* @@ -2876,6 +2931,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "bcftools mpileup [\fIOPTIONS\fP] \fB\-f\fP \fIref.fa\fP \fIin.bam\fP [\fIin2.bam\fP [...]]" .sp Generate VCF or BCF containing genotype likelihoods for one or multiple @@ -3209,6 +3269,11 @@ BQB. .fi .if n .RE .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "Options for SNP/INDEL genotype likelihood computation" .sp \fB\-X, \-\-config\fP \fISTR\fP @@ -3528,6 +3593,11 @@ see \fBCommon Options\fP maximum distance between two records to consider when locally sorting variants which changed position during the realignment .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "bcftools [plugin \fINAME\fP|+\fINAME\fP] \fI[OPTIONS]\fP \fIFILE\fP \(em \fI[PLUGIN OPTIONS]\fP" .sp A common framework for various utilities. The plugins can be used @@ -3601,6 +3671,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "Plugin options:" .sp \fB\-h, \-\-help\fP @@ -4725,6 +4800,11 @@ see \fBCommon Options\fP .RS 4 Use this directory to store temporary files .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "bcftools stats [\fIOPTIONS\fP] \fIA.vcf.gz\fP [\fIB.vcf.gz\fP]" .sp Parses VCF or BCF and produces text file stats which is suitable for machine @@ -4943,6 +5023,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-write\-index\fP +.RS 4 +Automatically index the output file +.RE .SS "Subset options:" .sp \fB\-a, \-\-trim\-alt\-alleles\fP @@ -5137,7 +5222,7 @@ important libraries used by bcftools. .SS "bcftools [\fI\-\-version\-only\fP]" .sp Display the full bcftools version number in a machine\-readable format. -.SH "EXPRESSIONS" +.SH "FILTERING EXPRESSIONS" .sp These filtering expressions are accepted by most of the commands. .sp @@ -5919,7 +6004,18 @@ bcftools view \-i \*(Aq%ID!="." & MAF[0]<0.01\*(Aq .if n .RE .sp Please refer to the documentation of your shell for details. -.SH "SCRIPTS AND OPTIONS" +.SH "SCRIPTS" +.SS "gff2gff" +.sp +Attempts to fix a GFF file to be correctly parsed by \fBcsq\fP. +.sp +.if n .RS 4 +.nf +.fam C +zcat in.gff.gz | gff2gff | gzip \-c > out.gff.gz +.fam +.fi +.if n .RE .SS "plot\-vcfstats [\fIOPTIONS\fP] \fIfile.vchk\fP [...]" .sp Script for processing output of \fBbcftools stats\fP. It can merge @@ -6013,8 +6109,10 @@ Please report any bugs you encounter on the github website: \c .sp Heng Li from the Sanger Institute wrote the original C version of htslib, samtools and bcftools. Bob Handsaker from the Broad Institute implemented the -BGZF library. Petr Danecek, Shane McCarthy and John Marshall are maintaining -and further developing bcftools. Many other people contributed to the program +BGZF library. Petr Danecek is maintaining and further developing bcftools, together +with the rest of the \c +.URL "https://www.sanger.ac.uk/tool/samtools\-bcftools\-htslib" "samtools team" "." +Many other people contributed to the program and to the file format specifications, both directly and indirectly by providing patches, testing and reporting bugs. We thank them all. .SH "RESOURCES" diff --git a/doc/bcftools.html b/doc/bcftools.html index 5a4f5ae51..c4dd7d498 100644 --- a/doc/bcftools.html +++ b/doc/bcftools.html @@ -4,7 +4,7 @@ - + bcftools(1) @@ -50,13 +50,13 @@

DESCRIPTION

VERSION

-

This manual page was last updated 2023-02-21 and refers to bcftools git version 1.17.

+

This manual page was last updated 2023-03-10 08:27 GMT and refers to bcftools git version 1.17-10-g9655089+.

BCF1

-

The BCF1 format output by versions of samtools <= 0.1.19 is not +

The obsolete BCF1 format output by versions of samtools <= 0.1.19 is not compatible with this version of bcftools. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0.1.19 to convert to VCF, which can then be read by @@ -79,6 +79,12 @@

VARIANT CALLING

is recommended for most tasks.

+
+

FILTERING EXPRESSIONS

+
+

See EXPRESSIONS

+
+
@@ -172,6 +178,9 @@

LIST OF SCRIPTS

@@ -417,6 +426,10 @@

Common Options

Use multithreading with INT worker threads. The option is currently used only for the compression of the output stream, only when --output-type is b or z. Default: 0.

+
--write-index
+
+

Automatically index the output files. Can be used only for compressed BCF and VCF output.

+
@@ -668,6 +681,10 @@

bcftools annotate [OPTIONS] FILE

"^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER). "INFO" can be abbreviated to "INF" and "FORMAT" to "FMT".

+
--write-index
+
+

Automatically index the output file

+
@@ -797,6 +814,10 @@

File format options:

see Common Options

+
--write-index
+
+

Automatically index the output file

+
@@ -1161,6 +1182,10 @@

bcftools concat [OPTIONS] FILE1 FILE2

see Common Options

+
--write-index
+
+

Automatically index the output file

+
@@ -1356,6 +1381,10 @@

VCF input options:

see Common Options

+
--write-index
+
+

Automatically index the output file

+
@@ -1738,6 +1767,7 @@

bcftools csq [OPTIONS] FILE

-g, --gff-annot FILE

GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens. +The script gff2gff can help with conversion from non-standard GFF formats. An example of a minimal working GFF file:

@@ -1871,6 +1901,10 @@

bcftools csq [OPTIONS] FILE

see Common Options

+
--write-index
+
+

Automatically index the output file

+
@@ -2084,6 +2118,10 @@

bcftools filter [OPTIONS] FILE

see Common Options

+
--write-index
+
+

Automatically index the output file

+
@@ -2394,6 +2432,10 @@

bcftools isec [OPTIONS] A.vcf.gz B.vcf.gzlist of input files to output given as 1-based indices. With -p and no -w, all files are written.

+
--write-index
+
+

Automatically index the output file. This is done automatically with the -p option.

+
@@ -2577,6 +2619,10 @@

bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz<

see Common Options

+
--write-index
+
+

Automatically index the output file

+

@@ -2889,6 +2935,10 @@

Output options

+
--write-index
+
+

Automatically index the output file

+
@@ -3179,6 +3229,10 @@

bcftools norm [OPTIONS] file.vcf.gz

maximum distance between two records to consider when locally sorting variants which changed position during the realignment

+
--write-index
+
+

Automatically index the output file

+
@@ -3254,6 +3308,10 @@

VCF output options:

see Common Options

+
--write-index
+
+

Automatically index the output file

+
@@ -4136,6 +4194,10 @@

bcftools sort [OPTIONS] file.bcf

Use this directory to store temporary files

+
--write-index
+
+

Automatically index the output file

+
@@ -4339,6 +4401,10 @@

Output options

see Common Options

+
--write-index
+
+

Automatically index the output file

+
@@ -4538,7 +4604,7 @@

bcftools [--version-only]

-

EXPRESSIONS

+

FILTERING EXPRESSIONS

These filtering expressions are accepted by most of the commands.

@@ -4974,9 +5040,24 @@

EXPRESSIONS

-

SCRIPTS AND OPTIONS

+

SCRIPTS

+

gff2gff

+
+

Attempts to fix a GFF file to be correctly parsed by csq.

+
+
+
+
+
+
zcat in.gff.gz | gff2gff | gzip -c > out.gff.gz
+
+
+
+
+
+

plot-vcfstats [OPTIONS] file.vchk […​]

Script for processing output of bcftools stats. It can merge @@ -5077,8 +5158,9 @@

AUTHORS

Heng Li from the Sanger Institute wrote the original C version of htslib, samtools and bcftools. Bob Handsaker from the Broad Institute implemented the -BGZF library. Petr Danecek, Shane McCarthy and John Marshall are maintaining -and further developing bcftools. Many other people contributed to the program +BGZF library. Petr Danecek is maintaining and further developing bcftools, together +with the rest of the samtools team. +Many other people contributed to the program and to the file format specifications, both directly and indirectly by providing patches, testing and reporting bugs. We thank them all.

@@ -5119,7 +5201,7 @@

COPYING

diff --git a/doc/bcftools.txt b/doc/bcftools.txt index b1a5f07c4..3fd09cf23 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -52,7 +52,7 @@ commands can thus be combined with Unix pipes. This manual page was last updated *{date}* and refers to bcftools git version *{version}*. === BCF1 -The BCF1 format output by versions of samtools \<= 0.1.19 is *not* +The obsolete BCF1 format output by versions of samtools \<= 0.1.19 is *not* compatible with this version of bcftools. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions \<= 0.1.19 to convert to VCF, which can then be read by @@ -71,6 +71,10 @@ calling model ('-m/--multiallelic-caller'). The multiallelic calling model is recommended for most tasks. +=== FILTERING EXPRESSIONS +See *<>* + + LIST OF COMMANDS ---------------- For a full list of available commands, run *bcftools* without arguments. For a full @@ -105,6 +109,7 @@ LIST OF SCRIPTS --------------- Some helper scripts are bundled with the bcftools code. +- *<>* .. converts a GFF file to the format required by *<>* - *<>* .. plots the output of *<>* @@ -298,6 +303,9 @@ Such a file can be easily created from a VCF using: Use multithreading with 'INT' worker threads. The option is currently used only for the compression of the output stream, only when '--output-type' is 'b' or 'z'. Default: 0. +*--write-index*:: + Automatically index the output files. Can be used only for compressed BCF and VCF output. + [[annotate]] === bcftools annotate '[OPTIONS]' 'FILE' @@ -501,6 +509,9 @@ Add or remove annotations. "^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER). "INFO" can be abbreviated to "INF" and "FORMAT" to "FMT". +*--write-index*:: + Automatically index the output file + *Examples:* ---- # Remove three fields @@ -604,6 +615,9 @@ demand. The original calling model can be invoked with the *-c* option. *--threads* 'INT':: see *<>* +*--write-index*:: + Automatically index the output file + ==== Input/output options: *-A, --keep-alts*:: @@ -878,6 +892,9 @@ are concatenated without being recompressed, which is very fast.. *--threads* 'INT':: see *<>* +*--write-index*:: + Automatically index the output file + [[consensus]] === bcftools consensus '[OPTIONS]' 'FILE' @@ -1021,6 +1038,9 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the *--targets-overlap* '0'|'1'|'2':: see *<>* +*--write-index*:: + Automatically index the output file + ==== VCF output options: *--no-version*:: @@ -1290,6 +1310,7 @@ output VCF and are ignored for the prediction analysis. *-g, --gff-annot* 'FILE':: GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens. + The script *<>* can help with conversion from non-standard GFF formats. An example of a minimal working GFF file: ---- # The program looks for "CDS", "exon", "three_prime_UTR" and "five_prime_UTR" lines, @@ -1392,6 +1413,9 @@ output VCF and are ignored for the prediction analysis. *--targets-overlap* '0'|'1'|'2':: see *<>* +*--write-index*:: + Automatically index the output file + *Examples:* ---- # Basic usage @@ -1559,6 +1583,9 @@ And similarly here, the second is filtered: *--threads* 'INT':: see *<>* +*--write-index*:: + Automatically index the output file + [[gtcheck]] @@ -1815,6 +1842,9 @@ in the other files. list of input files to output given as 1-based indices. With *-p* and no *-w*, all files are written. +*--write-index*:: + Automatically index the output file. This is done automatically with the *-p* option. + ==== Examples: Create intersection and complements of two sets saving the output in dir/* @@ -1951,6 +1981,8 @@ For "vertical" merge take a look at *<>* or *<>* +*--write-index*:: + Automatically index the output file [[mpileup]] === bcftools mpileup ['OPTIONS'] *-f* 'ref.fa' 'in.bam' ['in2.bam' [...]] @@ -2199,6 +2231,9 @@ INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for used by the earlier Bcftools releases. For excample BQBZ becomes BQB. +*--write-index*:: + Automatically index the output file + ==== Options for SNP/INDEL genotype likelihood computation *-X, --config* 'STR':: @@ -2428,6 +2463,8 @@ the *<>* option is supplied. maximum distance between two records to consider when locally sorting variants which changed position during the realignment +*--write-index*:: + Automatically index the output file [[plugin]] @@ -2485,6 +2522,9 @@ the usage examples that each plugin comes with. *--threads* 'INT':: see *<>* +*--write-index*:: + Automatically index the output file + ==== Plugin options: *-h, --help*:: @@ -3105,6 +3145,9 @@ Transition probabilities: *-T, --temp-dir* 'DIR':: Use this directory to store temporary files +*--write-index*:: + Automatically index the output file + [[stats]] @@ -3252,6 +3295,9 @@ Convert between VCF and BCF. Former *bcftools subset*. *--threads* 'INT':: see *<>* +*--write-index*:: + Automatically index the output file + ==== Subset options: *-a, --trim-alt-alleles*:: @@ -3403,8 +3449,8 @@ Display the full bcftools version number in a machine-readable format. [[expressions]] -EXPRESSIONS ------------ +FILTERING EXPRESSIONS +--------------------- These filtering expressions are accepted by most of the commands. @@ -3662,8 +3708,17 @@ that the whole expression is passed to the program as intended: Please refer to the documentation of your shell for details. -SCRIPTS AND OPTIONS -------------------- +SCRIPTS +------- + +[[gff2gff]] +=== gff2gff +Attempts to fix a GFF file to be correctly parsed by *<>*. + +-- + zcat in.gff.gz | gff2gff | gzip -c > out.gff.gz +-- + [[plot-vcfstats]] === plot-vcfstats ['OPTIONS'] 'file.vchk' [...] @@ -3729,8 +3784,9 @@ AUTHORS ------- Heng Li from the Sanger Institute wrote the original C version of htslib, samtools and bcftools. Bob Handsaker from the Broad Institute implemented the -BGZF library. Petr Danecek, Shane McCarthy and John Marshall are maintaining -and further developing bcftools. Many other people contributed to the program +BGZF library. Petr Danecek is maintaining and further developing bcftools, together +with the rest of the https://www.sanger.ac.uk/tool/samtools-bcftools-htslib[samtools team]. +Many other people contributed to the program and to the file format specifications, both directly and indirectly by providing patches, testing and reporting bugs. We thank them all. diff --git a/misc/gff2gff b/misc/gff2gff index 27485fad8..4e0fc2cf2 100755 --- a/misc/gff2gff +++ b/misc/gff2gff @@ -39,7 +39,7 @@ sub error my (@msg) = @_; if ( scalar @msg ) { confess @msg; } print - "About: Attempt to fix a GFF file to be correctly parse by bcftools/csq, see\n", + "About: Attempt to fix a GFF file to be correctly parsed by bcftools/csq, see\n", " the man page for the description of the expected format\n", " http://samtools.github.io/bcftools/bcftools-man.html#csq\n", "Usage: gff2gff [OPTIONS]\n", diff --git a/mpileup.c b/mpileup.c index 9b21b1873..768216c7d 100644 --- a/mpileup.c +++ b/mpileup.c @@ -1,6 +1,6 @@ /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools - Copyright (C) 2008-2022 Genome Research Ltd. + Copyright (C) 2008-2023 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -101,6 +101,8 @@ typedef struct { int indels_v20; int argc; char **argv; + int write_index; + char *index_fn; } mplp_conf_t; typedef struct { @@ -849,6 +851,7 @@ static int mpileup(mplp_conf_t *conf) for (i=0; ibcf_hdr, smpl[i]); if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output"); + if ( conf->write_index && init_index(conf->bcf_fp,conf->bcf_hdr,conf->output_fname,&conf->index_fn)<0 ) error("Error: failed to initialise index for %s\n",conf->output_fname); conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ, conf->delta_baseQ); @@ -958,6 +961,15 @@ static int mpileup(mplp_conf_t *conf) bcf_destroy1(conf->bcf_rec); if (conf->bcf_fp) { + if ( conf->write_index ) + { + if ( bcf_idx_save(conf->bcf_fp)<0 ) + { + if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname); + error("Error: cannot write to index %s\n",conf->index_fn); + } + free(conf->index_fn); + } if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname); bcf_hdr_destroy(conf->bcf_hdr); bcf_call_destroy(conf->bca); @@ -1227,6 +1239,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" " 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n" " --threads INT Use multithreading with INT worker threads [0]\n" + " --write-index Automatically index the output files [off]\n" "\n" "SNP/INDEL genotype likelihoods options:\n" " -X, --config STR Specify platform specific profiles (see below)\n" @@ -1375,6 +1388,7 @@ int main_mpileup(int argc, char *argv[]) {"seed", required_argument, NULL, 13}, {"ambig-reads", required_argument, NULL, 14}, {"ar", required_argument, NULL, 14}, + {"write-index",no_argument,NULL,21}, {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { @@ -1497,6 +1511,7 @@ int main_mpileup(int argc, char *argv[]) } break; case 20: mplp.indels_v20 = 1; break; + case 21: mplp.write_index = 1; break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; diff --git a/plugins/contrast.c b/plugins/contrast.c index 71d9d3d45..624bfeead 100644 --- a/plugins/contrast.c +++ b/plugins/contrast.c @@ -1,19 +1,19 @@ /* The MIT License - Copyright (c) 2018-2021 Genome Research Ltd. + Copyright (c) 2018-2023 Genome Research Ltd. Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -69,6 +69,8 @@ typedef struct int ncontrol_gts, mcontrol_gts, ntotal, nskipped, ntested, ncase_al, ncase_gt; kstring_t case_als_smpl, case_gts_smpl; int max_AC, nals[4]; // nals: number of control-ref, control-alt, case-ref and case-alt alleles in the region + char *index_fn; + int write_index; } args_t; @@ -81,7 +83,7 @@ const char *about(void) static const char *usage_text(void) { - return + return "\n" "About: Runs a basic association test, per-site or in a region, and checks for novel alleles and\n" " genotypes in two groups of samples. Adds the following INFO annotations:\n" @@ -108,6 +110,7 @@ static const char *usage_text(void) " -t, --targets REG Similar to -r but streams rather than index-jumps\n" " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" + " --write-index Automatically index the output files [off]\n" "\n" "Example:\n" " # Test if any of the samples a,b is different from the samples c,d,e\n" @@ -233,6 +236,7 @@ static void init_data(args_t *args) args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->max_AC_str ) { @@ -251,6 +255,15 @@ static void init_data(args_t *args) static void destroy_data(args_t *args) { bcf_hdr_destroy(args->hdr_out); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); free(args->case_als_smpl.s); free(args->case_gts_smpl.s); @@ -314,7 +327,7 @@ static int process_record(args_t *args, bcf1_t *rec) for (j=0; j 31 ) { @@ -353,7 +366,7 @@ static int process_record(args_t *args, bcf1_t *rec) for (j=0; j 31 ) { @@ -365,7 +378,7 @@ static int process_record(args_t *args, bcf1_t *rec) args->nskipped++; return -1; } - if ( !(control_als & (1<annots & PRINT_NASSOC ) bcf_update_info_int32(args->hdr_out, rec, "NASSOC", nals, 4); - if ( args->case_als_smpl.l ) + if ( args->case_als_smpl.l ) { bcf_update_info_string(args->hdr_out, rec, "NOVELAL", args->case_als_smpl.s); args->ncase_al++; } - if ( args->case_gts_smpl.l ) + if ( args->case_gts_smpl.l ) { bcf_update_info_string(args->hdr_out, rec, "NOVELGT", args->case_gts_smpl.s); args->ncase_gt++; @@ -472,13 +485,14 @@ int run(int argc, char **argv) {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"targets-overlap",required_argument,NULL,4}, + {"write-index",no_argument,NULL,5}, {NULL,0,NULL,0} }; int c; char *tmp; while ((c = getopt_long(argc, argv, "O:o:i:e:r:R:t:T:0:1:a:f:",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 1 : args->force_samples = 1; break; case 'f': args->max_AC_str = optarg; break; @@ -522,6 +536,7 @@ int run(int argc, char **argv) args->targets_overlap = parse_overlap_option(optarg); if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; + case 5 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; diff --git a/plugins/gvcfz.c b/plugins/gvcfz.c index d9ddb6643..abb25d997 100644 --- a/plugins/gvcfz.c +++ b/plugins/gvcfz.c @@ -1,5 +1,5 @@ -/* - Copyright (C) 2017-2021 Genome Research Ltd. +/* + Copyright (C) 2017-2023 Genome Research Ltd. Author: Petr Danecek @@ -9,10 +9,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -78,6 +78,8 @@ typedef struct char **argv, *region, *target, *fname, *output_fname, *keep_tags; bcf_hdr_t *hdr_in, *hdr_out; bcf_srs_t *sr; + char *index_fn; + int write_index; } args_t; @@ -88,18 +90,19 @@ const char *about(void) static const char *usage_text(void) { - return + return "\n" "About: Compress gVCF file by resizing gVCF blocks according to specified criteria.\n" "\n" "Usage: bcftools +gvcfz [Options]\n" "Plugin options:\n" - " -a, --trim-alt-alleles trim alternate alleles not seen in the genotypes\n" - " -e, --exclude exclude sites for which the expression is true\n" - " -i, --include include sites for which the expression is true\n" - " -g, --group-by EXPR group gVCF blocks according to the expression\n" - " -o, --output FILE write gVCF output to the FILE\n" + " -a, --trim-alt-alleles Trim alternate alleles not seen in the genotypes\n" + " -e, --exclude Exclude sites for which the expression is true\n" + " -i, --include Include sites for which the expression is true\n" + " -g, --group-by EXPR Group gVCF blocks according to the expression\n" + " -o, --output FILE Write gVCF output to the FILE\n" " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n" + " --write-index Automatically index the output files [off]\n" "Examples:\n" " # Compress blocks by GQ and DP. Multiple blocks separated by a semicolon can be defined\n" " bcftools +gvcfz input.bcf -g'PASS:GQ>60 & DP<20; PASS:GQ>40 & DP<15; Flt1:QG>20; Flt2:-'\n" @@ -136,7 +139,7 @@ static void init_groups(args_t *args) beg = ++end; while ( *end && *end!=';' ) end++; char tmp = *end; *end = 0; - if ( strcmp(flt,"PASS") ) + if ( strcmp(flt,"PASS") ) { bcf_hdr_printf(args->hdr_out, "##FILTER=", flt, hdr_str); if (bcf_hdr_sync(args->hdr_out) < 0) @@ -174,6 +177,15 @@ static void destroy_data(args_t *args) free(args->grp); if ( args->filter ) filter_destroy(args->filter); + if ( args->write_index ) + { + if ( bcf_idx_save(args->fh_out)<0 ) + { + if ( hts_close(args->fh_out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->fh_out)!=0 ) error("failed to close %s\n", args->output_fname); bcf_sr_destroy(args->sr); @@ -203,7 +215,7 @@ static void flush_block(args_t *args, bcf1_t *rec) if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,"PL",&gvcf->pl,3) != 0 ) error("Could not update FORMAT/PL at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); } - if ( gvcf->grp < args->ngrp && args->grp[gvcf->grp].flt_id >= 0 ) + if ( gvcf->grp < args->ngrp && args->grp[gvcf->grp].flt_id >= 0 ) bcf_add_filter(args->hdr_out, gvcf->rec, args->grp[gvcf->grp].flt_id); if ( bcf_write(args->fh_out, args->hdr_out, gvcf->rec)!=0 ) error("Failed to write the header\n"); @@ -323,13 +335,14 @@ int run(int argc, char **argv) {"stats",required_argument,NULL,'s'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, + {"write-index",no_argument,NULL,1}, {NULL,0,NULL,0} }; int c; char *tmp; while ((c = getopt_long(argc, argv, "vr:R:t:T:o:O:g:i:e:a",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 'a': args->trim_alts = 1; break; case 'e': @@ -358,6 +371,7 @@ int run(int argc, char **argv) if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; + case 1 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; @@ -385,6 +399,7 @@ int run(int argc, char **argv) set_wmode(wmode,args->output_type,args->output_fname,args->clevel); args->fh_out = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( bcf_hdr_write(args->fh_out, args->hdr_out)!=0 ) error("Failed to write the header\n"); + if ( args->write_index && init_index(args->fh_out,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); while ( bcf_sr_next_line(args->sr) ) process_gvcf(args); flush_block(args, NULL); diff --git a/plugins/isecGT.c b/plugins/isecGT.c index c31af38ec..d83e8fdf8 100644 --- a/plugins/isecGT.c +++ b/plugins/isecGT.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2016-2021 Genome Research Ltd. + Copyright (C) 2016-2023 Genome Research Ltd. Author: Petr Danecek @@ -45,6 +45,8 @@ typedef struct bcf_srs_t *sr; bcf_hdr_t *hdr_a, *hdr_b; htsFile *out_fh; + char *index_fn; + int write_index; } args_t; @@ -67,6 +69,7 @@ static const char *usage_text(void) " -R, --regions-file FILE Restrict to regions listed in a file\n" " -t, --targets REGION Similar to -r but streams rather than index-jumps\n" " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" + " --write-index Automatically index the output files [off]\n" "\n"; } @@ -84,6 +87,7 @@ int run(int argc, char **argv) {"targets-file",required_argument,NULL,'T'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, + {"write-index",no_argument,NULL,1}, {NULL,0,NULL,0} }; int c; @@ -115,6 +119,7 @@ int run(int argc, char **argv) case 'R': args->regions_list = optarg; args->regions_is_file = 1; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; args->targets_is_file = 1; break; + case 1 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; @@ -146,6 +151,7 @@ int run(int argc, char **argv) args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr_a)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr_a,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); while ( bcf_sr_next_line(args->sr) ) { @@ -179,7 +185,15 @@ int run(int argc, char **argv) if ( dirty ) bcf_update_genotypes(args->hdr_a, line_a, args->arr_a, ngt_a*smpl->n); if ( bcf_write(args->out_fh, args->hdr_a, line_a)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); } - + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("Close failed: %s\n",args->output_fname); smpl_ilist_destroy(smpl); bcf_sr_destroy(args->sr); diff --git a/plugins/mendelian.c b/plugins/mendelian.c deleted file mode 100644 index 65a65fe1c..000000000 --- a/plugins/mendelian.c +++ /dev/null @@ -1,689 +0,0 @@ -/* The MIT License - - Copyright (c) 2015-2022 Genome Research Ltd. - - Author: Petr Danecek - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. - - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include // for isatty -#include "../bcftools.h" -#include "../regidx.h" - -#define MODE_COUNT 1 -#define MODE_LIST_GOOD 2 -#define MODE_LIST_BAD 4 -#define MODE_DELETE 8 -#define MODE_ANNOTATE 16 -#define MODE_LIST_SKIP 32 - -typedef struct -{ - int nok, nbad; - int imother,ifather,ichild; -} -trio_t; - -typedef struct -{ - int mpl, fpl, cpl; // ploidies - mother, father, child - int mal, fal; // expect an allele from mother and father -} -rule_t; - -typedef struct _args_t -{ - regidx_t *rules; - regitr_t *itr, *itr_ori; - bcf_hdr_t *hdr; - htsFile *out_fh; - int32_t *gt_arr; - int mode; - int ngt_arr, nrec; - trio_t *trios; - int ntrios, mtrios; - int output_type, clevel; - char *output_fname; - bcf_srs_t *sr; -} -args_t; - -static args_t args; -static int parse_rules(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr); -static bcf1_t *process(bcf1_t *rec); - -const char *about(void) -{ - return "Count Mendelian consistent / inconsistent genotypes [DEPRECATED, use mendelian2 instead]\n"; -} - -typedef struct -{ - const char *alias, *about, *rules; -} -rules_predef_t; - -static rules_predef_t rules_predefs[] = -{ - { .alias = "GRCh37", - .about = "Human Genome reference assembly GRCh37 / hg19, both chr naming conventions", - .rules = - " X:1-60000 M/M + F > M\n" - " X:1-60000 M/M + F > M/F\n" - " X:2699521-154931043 M/M + F > M\n" - " X:2699521-154931043 M/M + F > M/F\n" - " Y:1-59373566 . + F > F\n" - " MT:1-16569 M + F > M\n" - "\n" - " chrX:1-60000 M/M + F > M\n" - " chrX:1-60000 M/M + F > M/F\n" - " chrX:2699521-154931043 M/M + F > M\n" - " chrX:2699521-154931043 M/M + F > M/F\n" - " chrY:1-59373566 . + F > F\n" - " chrM:1-16569 M + F > M\n" - }, - { .alias = "GRCh38", - .about = "Human Genome reference assembly GRCh38 / hg38, both chr naming conventions", - .rules = - " X:1-9999 M/M + F > M\n" - " X:1-9999 M/M + F > M/F\n" - " X:2781480-155701381 M/M + F > M\n" - " X:2781480-155701381 M/M + F > M/F\n" - " Y:1-57227415 . + F > F\n" - " MT:1-16569 M + F > M\n" - "\n" - " chrX:1-9999 M/M + F > M\n" - " chrX:1-9999 M/M + F > M/F\n" - " chrX:2781480-155701381 M/M + F > M\n" - " chrX:2781480-155701381 M/M + F > M/F\n" - " chrY:1-57227415 . + F > F\n" - " chrM:1-16569 M + F > M\n" - }, - { - .alias = NULL, - .about = NULL, - .rules = NULL, - } -}; - - -const char *usage(void) -{ - return - "\n" - "About: Count Mendelian consistent / inconsistent genotypes. Note that this plugin is DEPRECATED and\n" - " will not be supported in the future. Please use the newer plugin +mendelian2 instead.\n" - "Usage: bcftools +mendelian [Options]\n" - "Options:\n" - " -c, --count Count the number of consistent sites [DEPRECATED, use `-m c` instead]\n" - " -d, --delete Delete inconsistent genotypes (set to \"./.\") [DEPRECATED, use `-m d` instead]\n" - " -l, --list [+x] List consistent (+) or inconsistent (x) sites [DEPRECATED, use `-m +` or `-m x` instead]\n" - " -m, --mode [+acdux] Output mode (the default is `-m c`):\n" - " + .. list consistent sites\n" - " a .. add INFO/MERR annotation with the number of inconsistent samples\n" - " c .. print counts, a text summary with the number of errors per trio\n" - " d .. delete inconsistent genotypes (set to \"./.\")\n" - " u .. list uninformative sites\n" - " x .. list inconsistent sites\n" - " -o, --output FILE Write output to a file [standard output]\n" - " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n" - " -r, --rules ASSEMBLY[?] Predefined rules, 'list' to print available settings, append '?' for details\n" - " -R, --rules-file FILE Inheritance rules, see example below\n" - " -t, --trio M,F,C Names of mother, father and the child\n" - " -T, --trio-file FILE List of trios, one per line (mother,father,child)\n" - " -p, --ped FILE PED file\n" - "\n" - "Example:\n" - " # Default inheritance patterns, override with -r\n" - " # region maternal_ploidy + paternal > offspring\n" - " X:1-60000 M/M + F > M\n" - " X:1-60000 M/M + F > M/F\n" - " X:2699521-154931043 M/M + F > M\n" - " X:2699521-154931043 M/M + F > M/F\n" - " Y:1-59373566 . + F > F\n" - " MT:1-16569 M + F > M\n" - "\n" - " bcftools +mendelian in.vcf -t Mother,Father,Child -c\n" - "\n"; -} - -regidx_t *init_rules(args_t *args, char *alias) -{ - const rules_predef_t *rules = rules_predefs; - if ( !alias ) alias = "GRCh37"; - - int detailed = 0, len = strlen(alias); - if ( alias[len-1]=='?' ) { detailed = 1; alias[len-1] = 0; } - - while ( rules->alias && strcasecmp(alias,rules->alias) ) rules++; - - if ( !rules->alias ) - { - fprintf(stderr,"\nPRE-DEFINED INHERITANCE RULES\n\n"); - fprintf(stderr," * Columns are: CHROM:BEG-END MATERNAL_PLOIDY + PATERNAL_PLOIDY > OFFSPRING\n"); - fprintf(stderr," * Coordinates are 1-based inclusive.\n\n"); - rules = rules_predefs; - while ( rules->alias ) - { - fprintf(stderr,"%s\n .. %s\n\n", rules->alias,rules->about); - if ( detailed ) - fprintf(stderr,"%s\n", rules->rules); - rules++; - } - fprintf(stderr,"Run as --rules (e.g. --rules GRCh37).\n"); - fprintf(stderr,"To see the detailed ploidy definition, append a question mark (e.g. --rules GRCh37?).\n"); - fprintf(stderr,"\n"); - exit(-1); - } - else if ( detailed ) - { - fprintf(stderr,"%s", rules->rules); - exit(-1); - } - return regidx_init_string(rules->rules, parse_rules, NULL, sizeof(rule_t), &args); -} - -static int parse_rules(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) -{ - // e.g. "Y:1-59373566 . + F > . # daugther" - - // eat any leading spaces - char *ss = (char*) line; - while ( *ss && isspace(*ss) ) ss++; - if ( !*ss ) return -1; // skip empty lines - - // chromosome name, beg, end - char *tmp, *se = ss; - while ( se[1] && !isspace(se[1]) ) se++; - while ( se > ss && isdigit(*se) ) se--; - if ( *se!='-' ) error("Could not parse the region: %s\n", line); - *end = strtol(se+1, &tmp, 10) - 1; - if ( tmp==se+1 ) error("Could not parse the region:%s\n",line); - while ( se > ss && *se!=':' ) se--; - *beg = strtol(se+1, &tmp, 10) - 1; - if ( tmp==se+1 ) error("Could not parse the region:%s\n",line); - - *chr_beg = ss; - *chr_end = se-1; - - // skip region - while ( *ss && !isspace(*ss) ) ss++; - while ( *ss && isspace(*ss) ) ss++; - - rule_t *rule = (rule_t*) payload; - memset(rule, 0, sizeof(rule_t)); - - // maternal ploidy - se = ss; - while ( *se && !isspace(*se) ) se++; - int err = 0; - if ( se - ss == 1 ) - { - if ( *ss=='M' ) rule->mpl = 1; - else if ( *ss=='.' ) rule->mpl = 0; - else err = 1; - } - else if ( se - ss == 3 ) - { - if ( !strncmp(ss,"M/M",3) ) rule->mpl = 2; - else err = 1; - } - else err = 1; - if ( err ) error("Could not parse the maternal ploidy, only \"M\", \"M/M\" and \".\" currently supported: %s\n",line); - - // skip "+" - while ( *se && isspace(*se) ) se++; - if ( *se != '+' ) error("Could not parse the line: %s\n",line); - se++; - while ( *se && isspace(*se) ) se++; - - // paternal ploidy - ss = se; - while ( *se && !isspace(*se) ) se++; - if ( se - ss == 1 ) - { - if ( *ss=='F' ) rule->fpl = 1; - else err = 1; - } - else err = 1; - if ( err ) error("Could not parse the paternal ploidy, only \"F\" is currently supported: %s [%s]\n",line, ss); - - // skip ">" - while ( *se && isspace(*se) ) se++; - if ( *se != '>' ) error("Could not parse the line: %s\n",line); - se++; - while ( *se && isspace(*se) ) se++; - - // ploidy in offspring - ss = se; - while ( *se && !isspace(*se) ) se++; - if ( se - ss == 3 ) - { - if ( !strncmp(ss,"M/F",3) ) { rule->cpl = 2; rule->fal = 1; rule->mal = 1; } - else err = 1; - } - else if ( se - ss == 1 ) - { - if ( *ss=='F' ) { rule->cpl = 1; rule->fal = 1; } - else if ( *ss=='M' ) { rule->cpl = 1; rule->mal = 1; } - else err = 1; - } - else err = 1; - if ( err ) error("Could not parse the offspring's ploidy, only \"M\", \"F\" or \"M/F\" is currently supported: %s\n",line); - - return 0; -} - -void parse_ped(args_t *args, char *fname) -{ - htsFile *fp = hts_open(fname, "r"); - if ( !fp ) error("Could not read: %s\n", fname); - - kstring_t str = {0,0,0}; - if ( hts_getline(fp, KS_SEP_LINE, &str) <= 0 ) error("Empty file: %s\n", fname); - - int moff = 0, *off = NULL; - do - { - int ncols = ksplit_core(str.s,0,&moff,&off); - if ( ncols<4 ) error("Could not parse the ped file: %s\n", str.s); - - int ifather = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[2]]); - int imother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[3]]); - int ichild = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[1]]); - - // The code in process() makes an attempt to work with partial families, - // the support is not complete though and can lead to core dumps. Therefore - // enforcing full trios for now. - // if ( ( ifather<0 && imother<0 ) || ichild<0 ) continue; - if ( ifather<0 || imother<0 || ichild<0 ) continue; - - args->ntrios++; - hts_expand0(trio_t,args->ntrios,args->mtrios,args->trios); - trio_t *trios = &args->trios[args->ntrios-1]; - trios->ifather = ifather; - trios->imother = imother; - trios->ichild = ichild; - - } while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); - if ( !args->ntrios ) error("No complete trios found in the PED and VCF\n"); - - free(str.s); - free(off); - hts_close(fp); -} - -int run(int argc, char **argv) -{ - char *trio_samples = NULL, *trio_file = NULL, *ped_fname = NULL, *rules_fname = NULL, *rules_string = NULL; - memset(&args,0,sizeof(args_t)); - args.mode = 0; - args.output_fname = "-"; - args.clevel = -1; - - static struct option loptions[] = - { - {"trio",1,0,'t'}, - {"trio-file",1,0,'T'}, - {"ped",1,0,'p'}, - {"delete",0,0,'d'}, - {"list",1,0,'l'}, - {"mode",1,0,'m'}, - {"count",0,0,'c'}, - {"rules",1,0,'r'}, - {"rules-file",1,0,'R'}, - {"output",required_argument,NULL,'o'}, - {"output-type",required_argument,NULL,'O'}, - {0,0,0,0} - }; - int c; - char *tmp; - while ((c = getopt_long(argc, argv, "?ht:T:p:l:m:cdr:R:o:O:",loptions,NULL)) >= 0) - { - switch (c) - { - case 'o': args.output_fname = optarg; break; - case 'O': - switch (optarg[0]) { - case 'b': args.output_type = FT_BCF_GZ; break; - case 'u': args.output_type = FT_BCF; break; - case 'z': args.output_type = FT_VCF_GZ; break; - case 'v': args.output_type = FT_VCF; break; - default: - { - args.clevel = strtol(optarg,&tmp,10); - if ( *tmp || args.clevel<0 || args.clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); - } - }; - if ( optarg[1] ) - { - args.clevel = strtol(optarg+1,&tmp,10); - if ( *tmp || args.clevel<0 || args.clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); - } - break; - case 'R': rules_fname = optarg; break; - case 'r': rules_string = optarg; break; - case 'd': - args.mode |= MODE_DELETE; - fprintf(stderr,"Warning: -d will be deprecated, please use `-m d` instead.\n"); - break; - case 'c': - args.mode |= MODE_COUNT; - fprintf(stderr,"Warning: -c will be deprecated, please use `-m c` instead.\n"); - break; - case 'l': - if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD; - else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD; - else error("The argument not recognised: --list %s\n", optarg); - fprintf(stderr,"Warning: -l will be deprecated, please use -m instead.\n"); - break; - case 'm': - if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD; - else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD; - else if ( !strcmp("a",optarg) ) args.mode |= MODE_ANNOTATE; - else if ( !strcmp("d",optarg) ) args.mode |= MODE_DELETE; - else if ( !strcmp("c",optarg) ) args.mode |= MODE_COUNT; - else if ( !strcmp("u",optarg) ) args.mode |= MODE_LIST_SKIP; - else error("The argument not recognised: --mode %s\n", optarg); - break; - case 't': trio_samples = optarg; break; - case 'T': trio_file = optarg; break; - case 'p': ped_fname = optarg; break; - case 'h': - case '?': - default: error("%s",usage()); break; - } - } - if ( rules_fname ) - args.rules = regidx_init(rules_fname, parse_rules, NULL, sizeof(rule_t), &args); - else - args.rules = init_rules(&args, rules_string); - if ( !args.rules ) return -1; - args.itr = regitr_init(args.rules); - args.itr_ori = regitr_init(args.rules); - - char *fname = NULL; - if ( optind>=argc || argv[optind][0]=='-' ) - { - if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin - else error("%s",usage()); - } - else - fname = argv[optind]; - - if ( !trio_samples && !trio_file && !ped_fname ) error("Expected the -t/T or -p option\n"); - if ( !args.mode ) args.mode = MODE_COUNT; - if ( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD|MODE_LIST_SKIP)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD|MODE_LIST_SKIP; - if ( args.mode&MODE_ANNOTATE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD|MODE_LIST_SKIP)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD|MODE_LIST_SKIP; - - FILE *log_fh = stderr; - if ( args.mode==MODE_COUNT ) - { - log_fh = strcmp("-",args.output_fname) ? fopen(args.output_fname,"w") : stdout; - if ( !log_fh ) error("Error: cannot write to %s\n", args.output_fname); - } - - args.sr = bcf_sr_init(); - if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args.sr->errnum)); - args.hdr = bcf_sr_get_header(args.sr, 0); - if ( args.mode!=MODE_COUNT ) - { - char wmode[8]; - set_wmode(wmode,args.output_type,args.output_fname,args.clevel); - args.out_fh = hts_open(args.output_fname ? args.output_fname : "-", wmode); - if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); - if ( args.mode&MODE_ANNOTATE ) - bcf_hdr_append(args.hdr, "##INFO="); - if ( bcf_hdr_write(args.out_fh, args.hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args.output_fname); - } - - int i, n = 0; - char **list; - if ( trio_samples ) - { - args.ntrios = 1; - args.trios = (trio_t*) calloc(1,sizeof(trio_t)); - list = hts_readlist(trio_samples, 0, &n); - if ( n!=3 ) error("Expected three sample names with -t\n"); - args.trios[0].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]); - args.trios[0].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]); - args.trios[0].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]); - if ( args.trios[0].imother<0 ) error("The sample is not present in the VCF: %s\n",list[0]); - if ( args.trios[0].ifather<0 ) error("The sample is not present in the VCF: %s\n",list[1]); - if ( args.trios[0].ichild<0 ) error("The sample is not present in the VCF: %s\n",list[2]); - for (i=0; ierrcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode); - if ( args.out_fh && bcf_write1(args.out_fh, args.hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args.output_fname); - } - } - if ( args.out_fh && hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); - - if ( args.mode & MODE_COUNT ) - { - fprintf(log_fh,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio (mother,father,child)\n"); - for (i=0; inok,trio->nbad,args.nrec-(trio->nok+trio->nbad), - bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother), - bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather), - bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild) - ); - } - } - if ( log_fh!=stderr && log_fh!=stdout && fclose(log_fh) ) error("Error: close failed for %s\n", args.output_fname); - - free(args.gt_arr); - free(args.trios); - regitr_destroy(args.itr); - regitr_destroy(args.itr_ori); - regidx_destroy(args.rules); - bcf_sr_destroy(args.sr); - return 0; -} - -static void warn_ploidy(bcf1_t *rec) -{ - static int warned = 0; - if ( warned ) return; - fprintf(stderr,"Incorrect ploidy at %s:%"PRId64", skipping the trio. (This warning is printed only once.)\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); - warned = 1; -} - -bcf1_t *process(bcf1_t *rec) -{ - bcf1_t *dflt = args.mode&MODE_LIST_SKIP ? rec : NULL; - args.nrec++; - - if ( rec->n_allele > 63 ) return dflt; // we use 64bit bitmask below - - int ngt = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr); - if ( ngt<0 ) return dflt; - if ( ngt!=2*bcf_hdr_nsamples(args.hdr) && ngt!=bcf_hdr_nsamples(args.hdr) ) return dflt; - ngt /= bcf_hdr_nsamples(args.hdr); - - int itr_set = regidx_overlap(args.rules, bcf_seqname(args.hdr,rec),rec->pos,rec->pos, args.itr_ori); - - int i, nbad = 0, ngood = 0, needs_update = 0; - for (i=0; iimother<0 ) - { - a = bcf_gt_missing; - b = bcf_int32_vector_end; - } - else - { - a = args.gt_arr[ngt*trio->imother]; - b = ngt==2 ? args.gt_arr[ngt*trio->imother+1] : bcf_int32_vector_end; - } - if ( trio->ifather<0 ) - { - c = bcf_gt_missing; - d = bcf_int32_vector_end; - } - else - { - c = args.gt_arr[ngt*trio->ifather]; - d = ngt==2 ? args.gt_arr[ngt*trio->ifather+1] : bcf_int32_vector_end; - } - e = args.gt_arr[ngt*trio->ichild]; - f = ngt==2 ? args.gt_arr[ngt*trio->ichild+1] : bcf_int32_vector_end; - - // skip sites with missing data in child - if ( bcf_gt_is_missing(e) || bcf_gt_is_missing(f) ) continue; - - uint64_t mother = 0, father = 0,child1,child2; - - int is_ok = 0; - if ( !itr_set ) - { - if ( f==bcf_int32_vector_end ) { warn_ploidy(rec); continue; } - - // All M,F,C genotypes are diploid. Missing data are considered consistent. - child1 = 1<mal || !rule->fal ) continue; // wrong rule (haploid), but this is a diploid GT - if ( !mother ) mother = child1|child2; - if ( !father ) father = child1|child2; - if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) ) is_ok = 1; - continue; - } - if ( rule->mal ) - { - if ( mother && !(child1&mother) ) continue; - } - if ( rule->fal ) - { - if ( father && !(child1&father) ) continue; - } - is_ok = 1; - } - } - if ( is_ok ) - { - trio->nok++; - ngood++; - } - else - { - trio->nbad++; - nbad++; - if ( args.mode&MODE_DELETE ) - { - args.gt_arr[ngt*trio->imother] = bcf_gt_missing; - if ( b!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->imother+1] = bcf_gt_missing; // should be always true - args.gt_arr[ngt*trio->ifather] = bcf_gt_missing; - if ( d!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ifather+1] = bcf_gt_missing; - args.gt_arr[ngt*trio->ichild] = bcf_gt_missing; - if ( f!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ichild+1] = bcf_gt_missing; - needs_update = 1; - } - } - } - - if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt*bcf_hdr_nsamples(args.hdr)) ) - error("Could not update GT field at %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); - - if ( args.mode&MODE_ANNOTATE ) bcf_update_info_int32(args.hdr, rec, "MERR", &nbad, 1); - if ( args.mode&MODE_LIST_GOOD && ngood ) return rec; - if ( args.mode&MODE_LIST_BAD && nbad ) return rec; - if ( args.mode&MODE_LIST_SKIP && !ngood && !nbad ) return rec; - - return NULL; -} diff --git a/plugins/mendelian2.c b/plugins/mendelian2.c index f1d5c7b02..e44d01c2c 100644 --- a/plugins/mendelian2.c +++ b/plugins/mendelian2.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2015-2022 Genome Research Ltd. + Copyright (c) 2015-2023 Genome Research Ltd. Author: Petr Danecek @@ -114,6 +114,8 @@ typedef struct _args_t int ngt_arr; stats_t stats; // common per-site and per-sample stats int nref_only, nmany_als; // per-site stats + char *index_fn; + int write_index; } args_t; @@ -140,6 +142,7 @@ static const char *usage_text(void) " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" " --no-version Do not append version and command line to the header\n" + " --write-index Automatically index the output files [off]\n" "\n" "Options:\n" " -m, --mode c|[adeEgmMS] Output mode, the default is `-m c`. Multiple modes can be combined in VCF/BCF\n" @@ -476,6 +479,7 @@ static void init_data(args_t *args) args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } } @@ -488,7 +492,19 @@ static void destroy_data(args_t *args) free(args->trio); free(args->gt_arr); free(args->rule); - if ( args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + if ( args->out_fh ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + } bcf_hdr_destroy(args->hdr_out); bcf_sr_destroy(args->sr); free(args); @@ -784,6 +800,7 @@ int run(int argc, char **argv) {"targets-overlap",required_argument,NULL,15}, {"include",required_argument,0,'i'}, {"exclude",required_argument,0,'e'}, + {"write-index",no_argument,NULL,3}, {0,0,0,0} }; int c; @@ -843,6 +860,7 @@ int run(int argc, char **argv) case 'p': args->pfm = optarg; break; case 1 : args->rules_str = optarg; break; case 2 : args->rules_fname = optarg; break; + case 3 : args->write_index = 1; break; case 'h': case '?': default: error("%s",usage_text()); break; diff --git a/plugins/prune.c b/plugins/prune.c index 57ae83a5a..1593e7306 100644 --- a/plugins/prune.c +++ b/plugins/prune.c @@ -1,5 +1,5 @@ -/* - Copyright (C) 2017-2021 Genome Research Ltd. +/* + Copyright (C) 2017-2023 Genome Research Ltd. Author: Petr Danecek @@ -9,10 +9,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -69,6 +69,8 @@ typedef struct htsFile *out_fh; bcf_hdr_t *hdr; bcf_srs_t *sr; + char *index_fn; + int write_index; } args_t; @@ -79,7 +81,7 @@ const char *about(void) static const char *usage_text(void) { - return + return "\n" "About: Prune sites by missingness or linkage disequilibrium.\n" "\n" @@ -103,6 +105,7 @@ static const char *usage_text(void) " -t, --targets REGION Similar to -r but streams rather than index-jumps\n" " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" " -w, --window INT[bp|kb|Mb] The window size of INT sites or INT bp/kb/Mb for the -n/-l options [100kb]\n" + " --write-index Automatically index the output files [off]\n" "Examples:\n" " # Discard records with r2 bigger than 0.6 in a window of 1000 sites\n" " bcftools +prune -m 0.6 -w 1000 input.bcf -Ob -o output.bcf\n" @@ -183,6 +186,7 @@ static void init_data(args_t *args) } } if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); args->ld_filter_id = -1; if ( args->ld_filter && strcmp(".",args->ld_filter) ) args->ld_filter_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, args->ld_filter); @@ -211,6 +215,15 @@ static void destroy_data(args_t *args) { if ( args->filter ) filter_destroy(args->filter); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); vcfbuf_destroy(args->vcfbuf); bcf_sr_destroy(args->sr); @@ -303,20 +316,22 @@ int run(int argc, char **argv) {"nsites-per-win",required_argument,NULL,'n'}, {"nsites-per-win-mode",required_argument,NULL,'N'}, {"window",required_argument,NULL,'w'}, + {"write-index",no_argument,NULL,4}, {NULL,0,NULL,0} }; int c; char *tmp; while ((c = getopt_long(argc, argv, "vr:R:t:T:m:o:O:a:f:i:e:n:N:w:k",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 1 : args->rand_missing = 1; break; case 2 : args->af_tag = optarg; break; - case 3 : + case 3 : args->rseed = strtol(optarg,&tmp,10); if ( tmp==optarg || *tmp ) error("Could not parse: --random-seed %s\n", optarg); break; + case 4 : args->write_index = 1; break; case 'k': args->keep_sites = 1; break; case 'e': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); @@ -324,7 +339,7 @@ int run(int argc, char **argv) case 'i': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; - case 'a': + case 'a': { int n, i; char **tag = hts_readlist(optarg,0,&n); @@ -352,9 +367,9 @@ int run(int argc, char **argv) free(tag); args->ld_mask |= LD_ANNOTATE; } - break; + break; case 'f': args->ld_filter = optarg; break; - case 'n': + case 'n': args->nsites = strtod(optarg,&tmp); if ( tmp==optarg || *tmp ) error("Could not parse: --nsites-per-win %s\n", optarg); break; @@ -364,7 +379,7 @@ int run(int argc, char **argv) else if ( !strcasecmp(optarg,"rand") ) args->nsites_mode = optarg; else error("The mode \"%s\" is not recognised\n",optarg); break; - case 'm': + case 'm': if ( !strncasecmp("R2=",optarg,3) ) { args->ld_max_set[VCFBUF_LD_IDX_R2] = 1; @@ -388,7 +403,7 @@ int run(int argc, char **argv) if ( !tmp || *tmp ) error("Could not parse: --max %s\n", optarg); args->ld_mask |= LD_SET_MAX; break; - case 'w': + case 'w': args->ld_win = strtod(optarg,&tmp); if ( !*tmp ) break; if ( tmp==optarg ) error("Could not parse: --window %s\n", optarg); @@ -398,9 +413,9 @@ int run(int argc, char **argv) else error("Could not parse: --window %s\n", optarg); break; case 'T': args->target_is_file = 1; // fall-through - case 't': args->target = optarg; break; + case 't': args->target = optarg; break; case 'R': args->region_is_file = 1; // fall-through - case 'r': args->region = optarg; break; + case 'r': args->region = optarg; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { @@ -439,7 +454,7 @@ int run(int argc, char **argv) else args->fname = argv[optind]; init_data(args); - + while ( bcf_sr_next_line(args->sr) ) process(args); flush(args,1); diff --git a/plugins/remove-overlaps.c b/plugins/remove-overlaps.c index 2e8e6b0dd..bd0304497 100644 --- a/plugins/remove-overlaps.c +++ b/plugins/remove-overlaps.c @@ -1,5 +1,5 @@ -/* - Copyright (C) 2017-2021 Genome Research Ltd. +/* + Copyright (C) 2017-2023 Genome Research Ltd. Author: Petr Danecek @@ -9,10 +9,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -52,6 +52,8 @@ typedef struct htsFile *out_fh; bcf_hdr_t *hdr; bcf_srs_t *sr; + char *index_fn; + int write_index; } args_t; @@ -62,7 +64,7 @@ const char *about(void) static const char *usage_text(void) { - return + return "\n" "About: Remove overlapping variants.\n" "\n" @@ -80,6 +82,7 @@ static const char *usage_text(void) " -R, --regions-file FILE restrict to regions listed in a file\n" " -t, --targets REGION similar to -r but streams rather than index-jumps\n" " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" + " --write-index Automatically index the output files [off]\n" "\n"; } @@ -100,6 +103,7 @@ static void init_data(args_t *args) args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); args->vcfbuf = vcfbuf_init(args->hdr, 0); if ( args->rmdup ) @@ -114,6 +118,15 @@ static void destroy_data(args_t *args) { if ( args->filter ) filter_destroy(args->filter); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); vcfbuf_destroy(args->vcfbuf); bcf_sr_destroy(args->sr); @@ -168,13 +181,14 @@ int run(int argc, char **argv) {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {"verbose",no_argument,NULL,'v'}, + {"write-index",no_argument,NULL,1}, {NULL,0,NULL,0} }; int c; char *tmp; while ((c = getopt_long(argc, argv, "r:R:t:T:o:O:i:e:vpd",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 'd': args->rmdup = 1; break; case 'p': args->print_overlaps = 1; break; @@ -186,9 +200,9 @@ int run(int argc, char **argv) if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'T': args->target_is_file = 1; // fall-through - case 't': args->target = optarg; break; + case 't': args->target = optarg; break; case 'R': args->region_is_file = 1; // fall-through - case 'r': args->region = optarg; break; + case 'r': args->region = optarg; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { @@ -208,6 +222,7 @@ int run(int argc, char **argv) if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; + case 1 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; @@ -223,7 +238,7 @@ int run(int argc, char **argv) else args->fname = argv[optind]; init_data(args); - + while ( bcf_sr_next_line(args->sr) ) process(args); flush(args,1); diff --git a/plugins/scatter.c b/plugins/scatter.c index af358fc4f..e42edd877 100644 --- a/plugins/scatter.c +++ b/plugins/scatter.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (C) 2020-2021 Giulio Genovese + Copyright (C) 2020-2023 Giulio Genovese Author: Giulio Genovese @@ -39,6 +39,7 @@ typedef struct { htsFile *fh; // output file handle char *fname; // output file name + char *index_fn; } subset_t; @@ -60,6 +61,7 @@ typedef struct char **hts_opts; int nhts_opts; bcf_hdr_t *hdr; + int write_index; } args_t; @@ -95,6 +97,7 @@ static const char *usage_text(void) " -x, --extra STRING Output records not overlapping listed regions in separate file\n" " -p, --prefix STRING Prepend string to output VCF names\n" " --hts-opts LIST Low-level options to pass to HTSlib, e.g. block_size=32768\n" + " --write-index Automatically index the output files [off]\n" "\n" "Examples:\n" " # Scatter a VCF file by shards with 10000 variants each\n" @@ -200,6 +203,7 @@ static void open_set(subset_t *set, args_t *args) if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_plugin"); } if ( bcf_hdr_write(set->fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__, args->str.s); + if ( args->write_index && init_index(set->fh,args->hdr,args->str.s,&set->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->str.s); } static void init_data(args_t *args) @@ -260,7 +264,17 @@ static void destroy_data(args_t *args) for (i=0; insets; i++) { subset_t *set = &args->sets[i]; - if (set->fname) { + if (set->fname) + { + if ( args->write_index ) + { + if ( bcf_idx_save(set->fh)<0 ) + { + if ( hts_close(set->fh)!=0 ) error("Error: close failed .. %s\n", set->fname); + error("Error: cannot write to index %s\n", set->index_fn); + } + free(set->index_fn); + } if ( hts_close(set->fh)!=0 ) error("Error: close failed .. %s\n", set->fname); free(set->fname); } @@ -338,6 +352,7 @@ int run(int argc, char **argv) {"extra",required_argument,NULL,'x'}, {"prefix",required_argument,NULL,'p'}, {"hts-opts",required_argument,NULL,5}, + {"write-index",no_argument,NULL,6}, {NULL,0,NULL,0} }; int c; @@ -395,6 +410,7 @@ int run(int argc, char **argv) case 'x': args->extra = optarg; break; case 'p': args->prefix = optarg; break; case 5 : args->hts_opts = hts_readlist(optarg, 0, &args->nhts_opts); break; + case 6 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; diff --git a/plugins/split-vep.c b/plugins/split-vep.c index 446ff4639..172c46299 100644 --- a/plugins/split-vep.c +++ b/plugins/split-vep.c @@ -127,6 +127,8 @@ typedef struct int allow_undef_tags; int genes_mode; // --gene-list +FILE, one of GENES_* mode, prioritize or restrict int print_header; + char *index_fn; + int write_index; } args_t; @@ -234,6 +236,7 @@ static const char *usage_text(void) " -t, --targets REG Similar to -r but streams rather than index-jumps\n" " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" + " --write-index Automatically index the output files [off]\n" "\n" "Examples:\n" " # List available fields of the INFO/CSQ annotation\n" @@ -903,7 +906,19 @@ static void destroy_data(args_t *args) free(args->csq_str); if ( args->filter ) filter_destroy(args->filter); if ( args->convert ) convert_destroy(args->convert); - if ( args->fh_vcf && hts_close(args->fh_vcf)!=0 ) error("Error: close failed .. %s\n",args->output_fname); + if ( args->fh_vcf ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->fh_vcf)<0 ) + { + if ( hts_close(args->fh_vcf)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->fh_vcf)!=0 ) error("Error: close failed .. %s\n",args->output_fname); + } if ( args->fh_bgzf && bgzf_close(args->fh_bgzf)!=0 ) error("Error: close failed .. %s\n",args->output_fname); free(args); } @@ -1325,6 +1340,7 @@ int run(int argc, char **argv) {"targets-overlap",required_argument,NULL,4}, {"no-version",no_argument,NULL,2}, {"allow-undef-tags",no_argument,0,'u'}, + {"write-index",no_argument,NULL,6}, {NULL,0,NULL,0} }; int c; @@ -1390,6 +1406,7 @@ int run(int argc, char **argv) if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 5 : args->gene_fields_str = optarg; break; + case 6 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; @@ -1440,6 +1457,7 @@ int run(int argc, char **argv) args->fh_vcf = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_split-vep"); if ( bcf_hdr_write(args->fh_vcf, args->hdr_out)!=0 ) error("Failed to write the header to %s\n", args->output_fname); + if ( args->write_index && init_index(args->fh_vcf,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } while ( bcf_sr_next_line(args->sr) ) process_record(args, bcf_sr_get_line(args->sr,0)); diff --git a/plugins/split.c b/plugins/split.c index a362e0ed9..011981d42 100644 --- a/plugins/split.c +++ b/plugins/split.c @@ -1,5 +1,5 @@ -/* - Copyright (C) 2017-2021 Genome Research Ltd. +/* + Copyright (C) 2017-2023 Genome Research Ltd. Author: Petr Danecek @@ -9,10 +9,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -52,6 +52,7 @@ typedef struct char *fname; // output file name filter_t *filter; bcf_hdr_t *hdr; + char *index_fn; } subset_t; @@ -70,6 +71,7 @@ typedef struct subset_t *sets; int nsets, nhts_opts; char **hts_opts; + int write_index; } args_t; @@ -80,7 +82,7 @@ const char *about(void) static const char *usage_text(void) { - return + return "\n" "About: Split VCF by sample, creating single- or multi-sample VCFs. The output files are named\n" " by sample names whenever possible, with the characters from the set [ \\t:/\\] replaced\n" @@ -124,6 +126,7 @@ static const char *usage_text(void) " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" " --hts-opts LIST Low-level options to pass to HTSlib, e.g. block_size=32768\n" + " --write-index Automatically index the output files [off]\n" "\n" "Examples:\n" " # Split a VCF file\n" @@ -485,6 +488,7 @@ static void init_data(args_t *args) for (j=0; jnsmpl; j++) set->hdr->samples[j] = set->rename ? set->rename[j] : args->hdr_in->samples[set->smpl[j]]; if ( bcf_hdr_write(set->fh, set->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,str.s); + if ( args->write_index && init_index(set->fh,set->hdr,str.s,&set->index_fn)<0 ) error("Error: failed to initialise index for %s\n",str.s); if ( args->filter_str ) set->filter = filter_init(set->hdr, args->filter_str); } @@ -500,6 +504,15 @@ static void destroy_data(args_t *args) for (i=0; insets; i++) { subset_t *set = &args->sets[i]; + if ( args->write_index ) + { + if ( bcf_idx_save(set->fh)<0 ) + { + if ( hts_close(set->fh)!=0 ) error("Error: close failed .. %s\n", set->fname); + error("Error: cannot write to index %s\n", set->index_fn); + } + free(set->index_fn); + } if ( hts_close(set->fh)!=0 ) error("Error: close failed .. %s\n",set->fname); free(set->fname); free(set->smpl); @@ -596,7 +609,7 @@ static void process(args_t *args) bcf_unpack(rec, BCF_UN_ALL); int i; - bcf1_t *out = NULL; + bcf1_t *out = NULL; for (i=0; insets; i++) { subset_t *set = &args->sets[i]; @@ -641,13 +654,14 @@ int run(int argc, char **argv) {"groups-file",required_argument,NULL,'G'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, + {"write-index",no_argument,NULL,4}, {NULL,0,NULL,0} }; int c; char *tmp; while ((c = getopt_long(argc, argv, "vr:R:t:T:o:O:i:e:k:S:G:",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 1 : args->hts_opts = hts_readlist(optarg,0,&args->nhts_opts); break; case 'k': args->keep_tags = optarg; break; @@ -658,11 +672,11 @@ int run(int argc, char **argv) if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'T': args->target = optarg; args->target_is_file = 1; break; - case 't': args->target = optarg; break; + case 't': args->target = optarg; break; case 'R': args->region = optarg; args->region_is_file = 1; break; case 'S': args->samples_fname = optarg; break; case 'G': args->groups_fname = optarg; break; - case 'r': args->region = optarg; break; + case 'r': args->region = optarg; break; case 'o': args->output_dir = optarg; break; case 'O': switch (optarg[0]) { @@ -690,6 +704,7 @@ int run(int argc, char **argv) args->targets_overlap = parse_overlap_option(optarg); if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; + case 4 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; @@ -708,7 +723,7 @@ int run(int argc, char **argv) if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n"); init_data(args); - + while ( bcf_sr_next_line(args->sr) ) process(args); destroy_data(args); diff --git a/plugins/trio-dnm2.c b/plugins/trio-dnm2.c index 4783458b2..7cbf7fbcd 100644 --- a/plugins/trio-dnm2.c +++ b/plugins/trio-dnm2.c @@ -125,6 +125,8 @@ typedef struct int need_QS; int strictly_novel; priors_t priors, priors_X, priors_XX; + char *index_fn; + int write_index; } args_t; @@ -179,6 +181,7 @@ static const char *usage_text(void) " --use-NAIVE A naive calling model which uses only FMT/GT to determine DNMs\n" " --with-pAD Do not use FMT/QS but parental FMT/AD\n" " --with-pPL Do not use FMT/QS but parental FMT/PL. Equals to DNG with bugs fixed (more FPs, fewer FNs)\n" + " --write-index Automatically index the output files [off]\n" "\n" "Example:\n" " # Annotate VCF with FORMAT/DNM, run for a single trio\n" @@ -767,6 +770,7 @@ static void init_data(args_t *args) args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->dnm_score_type & DNM_FLOAT ) args->dnm_qual_float = (float*) malloc(sizeof(*args->dnm_qual_float)*bcf_hdr_nsamples(args->hdr)); @@ -796,6 +800,15 @@ static void destroy_data(args_t *args) free(args->ad); free(args->qs); free(args->qs3); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); bcf_hdr_destroy(args->hdr_out); bcf_sr_destroy(args->sr); @@ -1582,6 +1595,7 @@ int run(int argc, char **argv) {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"targets-overlap",required_argument,NULL,15}, + {"write-index",no_argument,NULL,16}, {NULL,0,NULL,0} }; int c; @@ -1670,6 +1684,7 @@ int run(int argc, char **argv) args->targets_overlap = parse_overlap_option(optarg); if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; + case 16 : args->write_index = 1; break; case 'X': args->chrX_list_str = optarg; break; case 'u': set_option(args,optarg); break; case 'e': diff --git a/plugins/variant-distance.c b/plugins/variant-distance.c index a1aeb9aef..1d195c133 100644 --- a/plugins/variant-distance.c +++ b/plugins/variant-distance.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2022 Genome Research Ltd. + Copyright (C) 2022-2023 Genome Research Ltd. Author: Petr Danecek @@ -63,6 +63,8 @@ typedef struct bcf_hdr_t *hdr; bcf_srs_t *sr; vcfbuf_t *buf; + char *index_fn; + int write_index; } args_t; @@ -91,6 +93,7 @@ static const char *usage_text(void) " -t, --targets REGION Similar to -r but streams rather than index-jumps\n" " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n" + " --write-index Automatically index the output files [off]\n" "Examples:\n" " bcftools +variant-distance input.bcf -Ob -o output.bcf\n" "\n"; @@ -126,6 +129,7 @@ static void init_data(args_t *args) if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); args->buf = vcfbuf_init(args->hdr, 0); vcfbuf_set_opt(args->buf,int,VCFBUF_DUMMY,1) @@ -134,6 +138,15 @@ static void destroy_data(args_t *args) { if ( args->filter ) filter_destroy(args->filter); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); bcf_sr_destroy(args->sr); vcfbuf_destroy(args->buf); @@ -233,6 +246,7 @@ int run(int argc, char **argv) {"targets-overlap",required_argument,NULL,2}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, + {"write-index",no_argument,NULL,4}, {NULL,0,NULL,0} }; int c; @@ -286,6 +300,7 @@ int run(int argc, char **argv) if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; + case 4 : args->write_index = 1; break; case 'h': case '?': default: error("%s", usage_text()); break; diff --git a/vcfannotate.c b/vcfannotate.c index 0f2e86380..da2c2b95e 100644 --- a/vcfannotate.c +++ b/vcfannotate.c @@ -118,6 +118,8 @@ typedef struct _args_t htsFile *out_fh; int output_type, n_threads, clevel; bcf_sr_regions_t *tgts; + char *index_fn; + int write_index; regidx_t *tgt_idx; // keep everything in memory only with .tab annotation file and -c BEG,END columns regitr_t *tgt_itr; @@ -2888,6 +2890,7 @@ static void init_data(args_t *args) if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } } @@ -2950,7 +2953,19 @@ static void destroy_data(args_t *args) convert_destroy(args->set_ids); if ( args->filter ) filter_destroy(args->filter); - if (args->out_fh) hts_close(args->out_fh); + if (args->out_fh) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + } free(args->sample_map); free(args->merge_method_str.s); } @@ -3264,7 +3279,6 @@ static void annotate(args_t *args, bcf1_t *line) has_overlap = 1; } } -fprintf(stderr,"has_overlap=%d mark=%s\n",has_overlap,args->mark_sites); if ( args->set_ids ) { args->tmpks.l = 0; @@ -3325,6 +3339,7 @@ static void usage(args_t *args) fprintf(stderr, " --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); fprintf(stderr, " -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n"); fprintf(stderr, " --threads INT Number of extra output compression threads [0]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " http://samtools.github.io/bcftools/howtos/annotate.html\n"); @@ -3381,6 +3396,7 @@ int main_vcfannotate(int argc, char *argv[]) {"min-overlap",required_argument,NULL,12}, {"no-version",no_argument,NULL,8}, {"force",no_argument,NULL,'f'}, + {"write-index",no_argument,NULL,13}, {NULL,0,NULL,0} }; char *tmp; @@ -3457,6 +3473,7 @@ int main_vcfannotate(int argc, char *argv[]) case 10 : args->single_overlaps = 1; break; case 11 : args->rename_annots = optarg; break; case 12 : args->min_overlap_str = optarg; break; + case 13 : args->write_index = 1; break; case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } diff --git a/vcfcall.c b/vcfcall.c index 1cd6f504c..d2f6e2c5f 100644 --- a/vcfcall.c +++ b/vcfcall.c @@ -1,6 +1,6 @@ /* vcfcall.c -- SNP/indel variant calling from VCF/BCF. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -97,6 +97,8 @@ typedef struct int argc; char **argv; + char *index_fn; + int write_index; // int flag, prior_type, n1, n_sub, *sublist, n_perm; // uint32_t *trio_aux; @@ -715,6 +717,7 @@ static void init_data(args_t *args) if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call"); if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->aux.hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->flag&CF_INS_MISSED ) init_missed_line(args); } @@ -753,6 +756,15 @@ static void destroy_data(args_t *args) free(args->str.s); if ( args->gvcf ) gvcf_destroy(args->gvcf); bcf_hdr_destroy(args->aux.hdr); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); bcf_sr_destroy(args->aux.srs); } @@ -908,6 +920,7 @@ static void usage(args_t *args) fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n"); fprintf(stderr, " -V, --skip-variants TYPE Skip indels/snps\n"); fprintf(stderr, " -v, --variants-only Output variant sites only\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Consensus/variant calling options:\n"); fprintf(stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n"); @@ -990,6 +1003,7 @@ int main_vcfcall(int argc, char *argv[]) {"chromosome-X",no_argument,NULL,'X'}, {"chromosome-Y",no_argument,NULL,'Y'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; @@ -1076,6 +1090,7 @@ int main_vcfcall(int argc, char *argv[]) args.regions_overlap = parse_overlap_option(optarg); if ( args.regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; + case 10: args.write_index = 1; break; default: usage(&args); } } diff --git a/vcfconcat.c b/vcfconcat.c index 74fd036b8..6943154af 100644 --- a/vcfconcat.c +++ b/vcfconcat.c @@ -1,6 +1,6 @@ /* vcfconcat.c -- Concatenate or combine VCF/BCF files. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -46,6 +46,8 @@ typedef struct _args_t int output_type, n_threads, record_cmd_line, clevel; bcf_hdr_t *out_hdr; int *seen_seq; + char *index_fn; + int write_index; // phasing int *start_pos, start_tid, ifname; @@ -142,6 +144,7 @@ static void init_data(args_t *args) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->tpool); } if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->allow_overlaps ) { @@ -203,7 +206,16 @@ static void destroy_data(args_t *args) int i; if ( args->out_fh ) { - if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n"); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n",args->output_fname?args->output_fname:"stdout"); } if ( args->tpool && !args->files ) { @@ -264,7 +276,7 @@ static void phased_flush(args_t *args) bcf1_t *brec = args->buf[i+1]; int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa); - if ( nGTs < 0 ) + if ( nGTs < 0 ) { if ( !gt_absent_warned ) { @@ -359,7 +371,7 @@ static void phased_flush(args_t *args) bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl); PQ_printed = 1; for (j=0; jphase_qual[j] < args->min_PQ ) + if ( args->phase_qual[j] < args->min_PQ ) { args->phase_set[j] = rec->pos+1; args->phase_set_changed = 1; @@ -931,6 +943,7 @@ static void usage(args_t *args) fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(stderr, " -v, --verbose 0|1 Set verbosity level [1]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -969,6 +982,7 @@ int main_vcfconcat(int argc, char *argv[]) {"file-list",required_argument,NULL,'f'}, {"min-PQ",required_argument,NULL,'q'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,13}, {NULL,0,NULL,0} }; char *tmp; @@ -980,7 +994,7 @@ int main_vcfconcat(int argc, char *argv[]) case 'R': args->regions_list = optarg; args->regions_is_file = 1; break; case 'd': args->remove_dups = optarg; break; case 'D': args->remove_dups = "exact"; break; - case 'q': + case 'q': args->min_PQ = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg); break; @@ -1021,6 +1035,7 @@ int main_vcfconcat(int argc, char *argv[]) args->verbose = strtol(optarg, &tmp, 0); if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n"); break; + case 13 : args->write_index = 1; break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); diff --git a/vcfconvert.c b/vcfconvert.c index ce5ed9981..a82cc74e0 100644 --- a/vcfconvert.c +++ b/vcfconvert.c @@ -1,6 +1,6 @@ /* vcfconvert.c -- convert between VCF/BCF and related formats. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -59,7 +59,7 @@ struct _args_t bcf_hdr_t *header; void (*convert_func)(struct _args_t *); struct { - int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing; + int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing; } n; kstring_t str; int32_t *gts; @@ -70,6 +70,8 @@ struct _args_t char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns; char *outfname, *infname, *ref_fname, *sex_fname; int argc, n_threads, record_cmd_line, keep_duplicates, clevel; + char *index_fn; + int write_index; }; static void destroy_data(args_t *args) @@ -160,7 +162,7 @@ static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) // REF,ALT args->str.l = 0; se = ++ss; - while ( se < tsv->se && *se!='_' ) se++; + while ( se < tsv->se && *se!='_' ) se++; if ( *se!='_' ) return -1; kputsn(ss,se-ss,&args->str); ss = ++se; @@ -269,12 +271,12 @@ static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr) if ( aa >= ab ) { if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0); - else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); + else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); } - else if ( ab >= bb ) + else if ( ab >= bb ) { args->gts[2*i+0] = bcf_gt_unphased(0); - args->gts[2*i+1] = bcf_gt_unphased(1); + args->gts[2*i+1] = bcf_gt_unphased(1); } else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); } @@ -293,7 +295,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); } // up is short for "unphased" - int nup = 0; + int nup = 0; for (i=0; iss + 4*i + nup; @@ -324,11 +326,11 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) break; default : fprintf(stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss); - return -1; + return -1; } if( ss[all*2+up+1]=='*' ) up = up + 1; } - + if(up && up != 2) { fprintf(stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss); @@ -356,13 +358,13 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) static void gensample_to_vcf(args_t *args) { /* - * Inpute: IMPUTE2 output (indentation changed here for clarity): + * Inpute: IMPUTE2 output (indentation changed here for clarity): * * 20:62116619_C_T 20:62116619 62116619 C T 0.969 0.031 0 ... * --- 20:62116698_C_A 62116698 C A 1 0 0 ... * * Second column is expected in the form of CHROM:POS_REF_ALT. We use second - * column because the first can be empty ("--") when filling sites from reference + * column because the first can be empty ("--") when filling sites from reference * panel. When the option --vcf-ids is given, the first column is used to set the * VCF ID. * @@ -455,6 +457,7 @@ static void gensample_to_vcf(args_t *args) if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); bcf1_t *rec = bcf_init(); nsamples -= 2; @@ -474,6 +477,15 @@ static void gensample_to_vcf(args_t *args) } while ( hts_getline(gen_fh, KS_SEP_LINE, &line)>0 ); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); if ( hts_close(gen_fh) ) error("Close failed: %s\n", gen_fname); bcf_hdr_destroy(args->header); @@ -589,6 +601,7 @@ static void haplegendsample_to_vcf(args_t *args) if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); bcf1_t *rec = bcf_init(); args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2); @@ -616,6 +629,15 @@ static void haplegendsample_to_vcf(args_t *args) } } + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname); if ( hts_close(leg_fh) ) error("Close failed: %s\n", leg_fname); @@ -731,6 +753,7 @@ static void hapsample_to_vcf(args_t *args) if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); bcf1_t *rec = bcf_init(); nsamples -= 2; @@ -749,6 +772,15 @@ static void hapsample_to_vcf(args_t *args) } while ( hts_getline(hap_fh, KS_SEP_LINE, &line)>0 ); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); if ( hts_close(hap_fh) ) error("Close failed: %s\n", hap_fname); bcf_hdr_destroy(args->header); @@ -784,7 +816,7 @@ char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname) } for (i=0; isex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); @@ -915,7 +947,7 @@ static void vcf_to_gensample(args_t *args) nok++; } } - fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", + fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup); if ( str.m ) free(str.s); @@ -976,7 +1008,7 @@ static void vcf_to_haplegendsample(args_t *args) { char *sample2sex = NULL; if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); - + int i; BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu"); str.l = 0; @@ -1078,7 +1110,7 @@ static void vcf_to_hapsample(args_t *args) kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %ID %POS %REF %FIRST_ALT ", &str); else kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); - + if ( args->hap2dip ) kputs("%_GT_TO_HAP2\n", &str); else @@ -1229,7 +1261,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[] if ( alleles[a0]<0 ) alleles[a0] = (*nals)++; if ( alleles[a1]<0 ) alleles[a1] = (*nals)++; - gts[0] = bcf_gt_unphased(alleles[a0]); + gts[0] = bcf_gt_unphased(alleles[a0]); gts[1] = ss[1] ? bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end; if ( ref==a0 && ref==a1 ) args->n.hom_rr++; // hom ref: RR @@ -1265,7 +1297,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr) } ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2); if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); - if ( ret==-2 ) + if ( ret==-2 ) { // something else than a SNP free(ref); @@ -1275,7 +1307,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr) args->str.l = 0; kputc(ref[0], &args->str); - for (i=0; i<5; i++) + for (i=0; i<5; i++) { if ( alleles[i]>0 ) { @@ -1321,6 +1353,7 @@ static void tsv_to_vcf(args_t *args) if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA"); if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n"); @@ -1350,6 +1383,15 @@ static void tsv_to_vcf(args_t *args) if ( hts_close(in_fh) ) error("Close failed: %s\n", args->infname); free(line.s); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } bcf_hdr_destroy(args->header); if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname); tsv_destroy(tsv); @@ -1377,6 +1419,7 @@ static void vcf_to_vcf(args_t *args) bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,args->header,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); while ( bcf_sr_next_line(args->files) ) { @@ -1389,6 +1432,15 @@ static void vcf_to_vcf(args_t *args) } if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); } + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname); } @@ -1409,6 +1461,7 @@ static void gvcf_to_vcf(args_t *args) bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert"); if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); + if ( args->write_index && init_index(out_fh,hdr,args->outfname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->outfname); int32_t *itmp = NULL, nitmp = 0; @@ -1419,7 +1472,7 @@ static void gvcf_to_vcf(args_t *args) { int pass = filter_test(args->filter, line, NULL); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) + if ( !pass ) { if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); continue; @@ -1469,6 +1522,15 @@ static void gvcf_to_vcf(args_t *args) } } free(itmp); + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh) ) error("Close failed: %s\n", args->outfname); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname); } @@ -1497,6 +1559,7 @@ static void usage(void) fprintf(stderr, " -o, --output FILE Output file name [stdout]\n"); fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); fprintf(stderr, " -G, --gensample2vcf ... |,\n"); @@ -1590,6 +1653,7 @@ int main_vcfconvert(int argc, char *argv[]) {"fasta-ref",required_argument,NULL,'f'}, {"no-version",no_argument,NULL,10}, {"keep-duplicates",no_argument,NULL,12}, + {"write-index",no_argument,NULL,16}, {NULL,0,NULL,0} }; char *tmp; @@ -1618,6 +1682,7 @@ int main_vcfconvert(int argc, char *argv[]) case 7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break; case 8 : error("The --chrom option has been deprecated, please use --3N6 instead\n"); break; case 15 : args->gen_3N6 = 1; break; + case 16 : args->write_index = 1; break; case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break; case 'f': args->ref_fname = optarg; break; case 'c': args->columns = optarg; break; @@ -1667,7 +1732,7 @@ int main_vcfconvert(int argc, char *argv[]) else args->infname = argv[optind]; } if ( !args->infname ) usage(); - + if ( args->convert_func ) args->convert_func(args); else vcf_to_vcf(args); diff --git a/vcffilter.c b/vcffilter.c index 6a3ae4772..8665409d1 100644 --- a/vcffilter.c +++ b/vcffilter.c @@ -1,6 +1,6 @@ /* vcffilter.c -- Apply fixed-threshold filters. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -77,6 +77,8 @@ typedef struct _args_t char **argv, *output_fname, *targets_list, *regions_list, *mask_list; int argc, record_cmd_line, mask_is_file, mask_overlap, mask_negate; regidx_t *mask; + char *index_fn; + int write_index; } args_t; @@ -491,6 +493,7 @@ static void usage(args_t *args) fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -533,13 +536,14 @@ int main_vcffilter(int argc, char *argv[]) {"SnpGap",required_argument,NULL,'g'}, {"IndelGap",required_argument,NULL,'G'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,12}, {NULL,0,NULL,0} }; char *tmp; while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:M:o:O:g:G:S:",loptions,NULL)) >= 0) { switch (c) { case 'g': - args->snp_gap = strtol(optarg,&tmp,10); + args->snp_gap = strtol(optarg,&tmp,10); if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg); if ( *tmp==':' ) { @@ -625,6 +629,7 @@ int main_vcffilter(int argc, char *argv[]) else if ( !strcasecmp(optarg,"2") ) args->mask_overlap = 2; else error("Could not parse: --mask-overlap %s\n",optarg); break; + case 12 : args->write_index = 1; break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); @@ -672,6 +677,7 @@ int main_vcffilter(int argc, char *argv[]) init_data(args); if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); while ( bcf_sr_next_line(args->files) ) { bcf1_t *line = bcf_sr_get_line(args->files, 0); @@ -713,7 +719,15 @@ int main_vcffilter(int argc, char *argv[]) } } buffered_filters(args, NULL); - + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); destroy_data(args); bcf_sr_destroy(args->files); diff --git a/vcfisec.c b/vcfisec.c index a755a85b4..4ee29b4c8 100644 --- a/vcfisec.c +++ b/vcfisec.c @@ -1,6 +1,6 @@ /* vcfisec.c -- Create intersections, unions and complements of VCF files. - Copyright (C) 2012-2022 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. Author: Petr Danecek @@ -60,6 +60,8 @@ typedef struct char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list; char *isec_exact; int argc, record_cmd_line; + char *index_fn; + int write_index; } args_t; @@ -148,6 +150,8 @@ void isec_vcf(args_t *args) if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); + if ( args->write_index && init_index(out_fh,files->readers[args->iwrite].header,args->output_fname,&args->index_fn)<0 ) + error("Error: failed to initialise index for %s\n",args->output_fname?args->output_fname:"standard output"); } if ( !args->nwrite && !out_std && !args->prefix ) fprintf(stderr,"Note: -w option not given, printing list of sites...\n"); @@ -253,7 +257,19 @@ void isec_vcf(args_t *args) } } if ( str.s ) free(str.s); - if ( out_fh && hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? args->output_fname : "-"); + if ( out_fh ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(out_fh)<0 ) + { + if ( hts_close(out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? args->output_fname : "-"); + } } static void add_filter(args_t *args, char *expr, int logic) @@ -481,6 +497,7 @@ static void usage(void) fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(stderr, " -w, --write LIST List of files to write with -p given as 1-based indexes. By default, all files are written\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # Create intersection and complements of two sets saving the output in dir/*\n"); @@ -537,6 +554,7 @@ int main_vcfisec(int argc, char *argv[]) {"output-type",required_argument,NULL,'O'}, {"threads",required_argument,NULL,9}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; char *tmp; @@ -608,6 +626,7 @@ int main_vcfisec(int argc, char *argv[]) break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 10 : args->write_index = 1; break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); diff --git a/vcfmerge.c b/vcfmerge.c index 621f4102c..2cf54fb1e 100644 --- a/vcfmerge.c +++ b/vcfmerge.c @@ -1,6 +1,6 @@ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - Copyright (C) 2012-2022 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. Author: Petr Danecek @@ -166,6 +166,8 @@ typedef struct int argc, n_threads, record_cmd_line, clevel; int local_alleles; // the value of -L option int keep_AC_AN; + char *index_fn; + int write_index; } args_t; @@ -3087,6 +3089,7 @@ void merge_vcf(args_t *args) if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); return; } + else if ( args->write_index && init_index(args->out_fh,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init(); args->maux = maux_init(args); @@ -3124,7 +3127,16 @@ void merge_vcf(args_t *args) info_rules_destroy(args); maux_destroy(args->maux); bcf_hdr_destroy(args->out_hdr); - if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname?args->output_fname:"stdout"); bcf_destroy1(args->out_line); kh_destroy(strdict, args->tmph); if ( args->tmps.m ) free(args->tmps.s); @@ -3159,6 +3171,7 @@ static void usage(void) fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -3200,6 +3213,7 @@ int main_vcfmerge(int argc, char *argv[]) {"no-version",no_argument,NULL,8}, {"no-index",no_argument,NULL,10}, {"filter-logic",required_argument,NULL,'F'}, + {"write-index",no_argument,NULL,11}, {NULL,0,NULL,0} }; char *tmp; @@ -3271,6 +3285,7 @@ int main_vcfmerge(int argc, char *argv[]) case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 10 : args->no_index = 1; break; + case 11 : args->write_index = 1; break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); diff --git a/vcfnorm.c b/vcfnorm.c index 9538f8d01..e836c6b32 100644 --- a/vcfnorm.c +++ b/vcfnorm.c @@ -1,6 +1,6 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -105,6 +105,8 @@ typedef struct int use_star_allele, ma_use_ref_allele; char *old_rec_tag; htsFile *out; + char *index_fn; + int write_index; } args_t; @@ -2018,6 +2020,7 @@ static void normalize_vcf(args_t *args) hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p); if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm"); if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out,args->out_hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); bcf1_t *line; int prev_rid = -1, prev_pos = -1, prev_type = 0; @@ -2081,6 +2084,15 @@ static void normalize_vcf(args_t *args) if ( j>0 ) flush_buffer(args, args->out, j); } flush_buffer(args, args->out, args->rbuf.n); + if ( args->write_index ) + { + if ( bcf_idx_save(args->out)<0 ) + { + if ( hts_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); fprintf(stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped); @@ -2121,6 +2133,7 @@ static void usage(void) fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); fprintf(stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # normalize and left-align indels\n"); @@ -2181,6 +2194,7 @@ int main_vcfnorm(int argc, char *argv[]) {"check-ref",required_argument,NULL,'c'}, {"strict-filter",no_argument,NULL,'s'}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,14}, {NULL,0,NULL,0} }; char *tmp; @@ -2204,6 +2218,7 @@ int main_vcfnorm(int argc, char *argv[]) else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0; else error("Invalid argument to --multi-overlaps\n"); break; + case 14 : args->write_index = 1; break; case 'N': args->do_indels = 0; break; case 'd': if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS; diff --git a/vcfplugin.c b/vcfplugin.c index 45686680a..687751961 100644 --- a/vcfplugin.c +++ b/vcfplugin.c @@ -1,6 +1,6 @@ /* vcfplugin.c -- plugin modules for operating on VCF/BCF files. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -149,6 +149,8 @@ typedef struct _args_t char **argv, *output_fname, *regions_list, *targets_list; int argc, drop_header, verbose, record_cmd_line, plist_only; + char *index_fn; + int write_index; } args_t; @@ -548,6 +550,7 @@ static void init_data(args_t *args) if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(args->out_fh,args->hdr_out,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } } @@ -569,7 +572,19 @@ static void destroy_data(args_t *args) } if ( args->filter ) filter_destroy(args->filter); - if (args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + if (args->out_fh ) + { + if ( args->write_index ) + { + if ( bcf_idx_save(args->out_fh)<0 ) + { + if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } + if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + } } static void usage(args_t *args) @@ -598,6 +613,7 @@ static void usage(args_t *args) fprintf(stderr, " -l, --list-plugins List available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n"); fprintf(stderr, " -v, --verbose Print verbose information, -vv increases verbosity\n"); fprintf(stderr, " -V, --version Print version string and exit\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -643,9 +659,9 @@ int main_plugin(int argc, char *argv[]) if ( argv[1][0]!='-' ) { args->verbose = is_verbose(argc, argv); - plugin_name = argv[1]; - argc--; - argv++; + plugin_name = argv[1]; + argc--; + argv++; load_plugin(args, plugin_name, 1, &args->plugin); if ( args->plugin.run ) { @@ -675,6 +691,7 @@ int main_plugin(int argc, char *argv[]) {"targets-file",required_argument,NULL,'T'}, {"targets-overlap",required_argument,NULL,2}, {"no-version",no_argument,NULL,8}, + {"write-index",no_argument,NULL,10}, {NULL,0,NULL,0} }; char *tmp; @@ -723,6 +740,7 @@ int main_plugin(int argc, char *argv[]) break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; + case 10 : args->write_index = 1; break; case '?': case 'h': usage_only = 1; break; default: error("Unknown argument: %s\n", optarg); diff --git a/vcfsort.c b/vcfsort.c index 1de2b2867..3b208a0d3 100644 --- a/vcfsort.c +++ b/vcfsort.c @@ -1,6 +1,6 @@ /* vcfsort.c -- sort subcommand - Copyright (C) 2017-2022 Genome Research Ltd. + Copyright (C) 2017-2023 Genome Research Ltd. Author: Petr Danecek @@ -62,6 +62,8 @@ typedef struct _args_t uint8_t *mem_block; size_t nbuf, mbuf, nblk; blk_t *blk; + char *index_fn; + int write_index; } args_t; @@ -300,6 +302,7 @@ void merge_blocks(args_t *args) set_wmode(wmode,args->output_type,args->output_fname,args->clevel); htsFile *out = hts_open(args->output_fname ? args->output_fname : "-", wmode); if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( args->write_index && init_index(out,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname); while ( bhp->ndat ) { blk_t *blk = bhp->dat[0]; @@ -307,6 +310,15 @@ void merge_blocks(args_t *args) khp_delete(blk, bhp); blk_read(args, bhp, args->hdr, blk); } + if ( args->write_index ) + { + if ( bcf_idx_save(out)<0 ) + { + if ( hts_close(out)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout"); + error("Error: cannot write to index %s\n", args->index_fn); + } + free(args->index_fn); + } if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname); clean_files(args); @@ -333,6 +345,7 @@ static void usage(args_t *args) #else fprintf(stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n"); #endif + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } @@ -395,6 +408,7 @@ int main_sort(int argc, char *argv[]) {"output-file",required_argument,NULL,'o'}, {"output",required_argument,NULL,'o'}, {"help",no_argument,NULL,'h'}, + {"write-index",no_argument,NULL,1}, {0,0,0,0} }; char *tmp; @@ -423,6 +437,7 @@ int main_sort(int argc, char *argv[]) if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); } break; + case 1 : args->write_index = 1; break; case 'h': case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); diff --git a/vcfview.c b/vcfview.c index 7f7c835e7..e09efa0bc 100644 --- a/vcfview.c +++ b/vcfview.c @@ -1,6 +1,6 @@ /* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Shane McCarthy @@ -534,42 +534,11 @@ static void usage(args_t *args) fprintf(stderr, " -u/U, --uncalled/--exclude-uncalled Select/exclude sites without a called genotype\n"); fprintf(stderr, " -v/V, --types/--exclude-types LIST Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n"); fprintf(stderr, " -x/X, --private/--exclude-private Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); + fprintf(stderr, " --write-index Automatically index the output files [off]\n"); fprintf(stderr, "\n"); exit(1); } -// See also samtools/sam_utils.c auto_index() -int init_index(args_t *args, bcf_hdr_t *hdr) { - int min_shift = 14; // CSI - char *fn = args->fn_out; - - if (!fn || !*fn || strcmp(fn, "-") == 0) - return -1; - - char *delim = strstr(fn, HTS_IDX_DELIM); - if (delim) { - delim += strlen(HTS_IDX_DELIM); - args->index_fn = strdup(delim); - if (!args->index_fn) - return -1; - - size_t l = strlen(args->index_fn); - if (l >= 4 && - (strcmp(args->index_fn + l - 4, ".tbi") == 0 || - strcmp(args->index_fn + l - 4, ".bai") == 0)) - min_shift = 0; - } else { - if (!(args->index_fn = malloc(strlen(fn)+6))) - return -1; - sprintf(args->index_fn, "%s.csi", fn); - } - - if (bcf_idx_init(args->out, hdr, min_shift, args->index_fn) < 0) - return -1; - - return 0; -} - int main_vcfview(int argc, char *argv[]) { int c; @@ -820,10 +789,7 @@ int main_vcfview(int argc, char *argv[]) else if ( args->output_type & FT_BCF ) error("BCF output requires header, cannot proceed with -H\n"); - if (args->write_index) { - if (init_index(args, out_hdr) < 0) - error("Failed to initialise index\n"); - } + if ( args->write_index && init_index(args->out,out_hdr,args->fn_out,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->fn_out); int ret = 0; if (!args->header_only) @@ -838,13 +804,17 @@ int main_vcfview(int argc, char *argv[]) if ( ret ) fprintf(stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum)); } - if (args->write_index) { + if (args->write_index) + { if (bcf_idx_save(args->out) < 0) + { + if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout"); error("Error: cannot write to index %s\n", args->index_fn); + } free(args->index_fn); } - hts_close(args->out); + if ( hts_close(args->out)!=0 ) error("Error: close failed %s\n", args->fn_out?args->fn_out:"stdout"); destroy_data(args); bcf_sr_destroy(args->files); free(args); diff --git a/version.c b/version.c index 4306d4011..38417a78b 100644 --- a/version.c +++ b/version.c @@ -1,6 +1,6 @@ /* version.c -- report version numbers for plugins. - Copyright (C) 2014-2021 Genome Research Ltd. + Copyright (C) 2014-2023 Genome Research Ltd. Author: Petr Danecek @@ -72,22 +72,26 @@ const char *hts_bcf_wmode(int file_type) const char *hts_bcf_wmode2(int file_type, const char *fname) { if ( !fname ) return hts_bcf_wmode(file_type); - int len = strlen(fname); - if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ); - if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF); - if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ); - if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ); + const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL; + if ( !end ) end = fname ? fname + strlen(fname) : fname; + int len = end - fname; + if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) return hts_bcf_wmode(FT_BCF|FT_GZ); + if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) return hts_bcf_wmode(FT_VCF); + if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) return hts_bcf_wmode(FT_VCF|FT_GZ); + if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) return hts_bcf_wmode(FT_VCF|FT_GZ); return hts_bcf_wmode(file_type); } void set_wmode(char dst[8], int file_type, const char *fname, int clevel) { const char *ret = NULL; - int len = fname ? strlen(fname) : 0; - if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ); - else if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) ret = hts_bcf_wmode(FT_VCF); - else if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); - else if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); + const char *end = fname ? strstr(fname, HTS_IDX_DELIM) : NULL; + if ( !end ) end = fname ? fname + strlen(fname) : fname; + int len = end - fname; + if ( len >= 4 && !strncasecmp(".bcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ); + else if ( len >= 4 && !strncasecmp(".vcf",fname+len-4,4) ) ret = hts_bcf_wmode(FT_VCF); + else if ( len >= 7 && !strncasecmp(".vcf.gz",fname+len-7,7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); + else if ( len >= 8 && !strncasecmp(".vcf.bgz",fname+len-8,8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ); else ret = hts_bcf_wmode(file_type); if ( clevel>=0 && clevel<=9 ) { @@ -107,3 +111,33 @@ int parse_overlap_option(const char *arg) else if ( strcasecmp(arg, "variant") == 0 || strcmp(arg, "2") == 0 ) return 2; else return -1; } + +// See also samtools/sam_utils.c auto_index() +int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname) +{ + int min_shift = 14; // CSI + + if ( !fname || !*fname || !strcmp(fname, "-") ) return -1; + + char *delim = strstr(fname, HTS_IDX_DELIM); + if (delim) + { + delim += strlen(HTS_IDX_DELIM); + *idx_fname = strdup(delim); + if ( !*idx_fname ) return -1; + + size_t l = strlen(*idx_fname); + if ( l >= 4 && strcmp(*idx_fname + l - 4, ".tbi")==0 ) min_shift = 0; + } + else + { + if ( !(*idx_fname = malloc(strlen(fname)+6)) ) return -1; + sprintf(*idx_fname, "%s.csi", fname); + } + + if ( bcf_idx_init(fh, hdr, min_shift, *idx_fname) < 0 ) return -1; + + return 0; +} + +