diff --git a/NEWS b/NEWS
index 0d8eb5e72..a1d1ad3de 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,10 @@
## Release a.b
+Changes affecting the whole of bcftools, or multiple commands:
+
+* Support automatic indexing while writing BCF and VCF.gz output via the new `--write-index` option
+
+
Changes affecting specific commands:
* bcftools annotate
diff --git a/bcftools.h b/bcftools.h
index c3f7ded16..bba71e3b6 100644
--- a/bcftools.h
+++ b/bcftools.h
@@ -1,6 +1,6 @@
/* bcftools.h -- utility function declarations.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek
@@ -49,6 +49,9 @@ void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2
// newline will be added by the function.
void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2);
+// For on-the-fly index creation with --write-index
+int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname);
+
void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
const char *hts_bcf_wmode(int file_type);
const char *hts_bcf_wmode2(int file_type, const char *fname);
diff --git a/csq.c b/csq.c
index 49812d4de..364acebd1 100644
--- a/csq.c
+++ b/csq.c
@@ -574,6 +574,8 @@ typedef struct _args_t
// text tab-delimited output (out) or vcf/bcf output (out_fh)
FILE *out;
htsFile *out_fh;
+ char *index_fn;
+ int write_index;
// vcf
bcf_srs_t *sr;
@@ -1536,6 +1538,7 @@ void init_data(args_t *args)
if ( args->hdr_nsmpl )
bcf_hdr_printf(args->hdr,"##FORMAT=",args->bcsq_tag);
if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
+ if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
}
if ( args->verbosity > 0 ) fprintf(stderr,"Calling...\n");
}
@@ -1571,7 +1574,18 @@ void destroy_data(args_t *args)
if ( args->smpl ) smpl_ilist_destroy(args->smpl);
int ret;
if ( args->out_fh )
+ {
+ if ( args->write_index )
+ {
+ if ( bcf_idx_save(args->out_fh)<0 )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+ error("Error: cannot write to index %s\n", args->index_fn);
+ }
+ free(args->index_fn);
+ }
ret = hts_close(args->out_fh);
+ }
else
ret = fclose(args->out);
if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
@@ -4272,6 +4286,7 @@ static const char *usage(void)
" --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
" --threads INT Use multithreading with worker threads [0]\n"
" -v, --verbose INT Verbosity level 0-2 [1]\n"
+ " --write-index Automatically index the output files [off]\n"
"\n"
"Example:\n"
" bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
@@ -4321,6 +4336,7 @@ int main_csq(int argc, char *argv[])
{"targets-file",1,0,'T'},
{"targets-overlap",required_argument,NULL,5},
{"no-version",no_argument,NULL,3},
+ {"write-index",no_argument,NULL,6},
{0,0,0,0}
};
int c, targets_is_file = 0, regions_is_file = 0;
@@ -4409,6 +4425,7 @@ int main_csq(int argc, char *argv[])
targets_overlap = parse_overlap_option(optarg);
if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
break;
+ case 6 : args->write_index = 1; break;
case 'h':
case '?': error("%s",usage());
default: error("The option not recognised: %s\n\n", optarg); break;
diff --git a/doc/bcftools.1 b/doc/bcftools.1
index 0e3d5290e..e212755ab 100644
--- a/doc/bcftools.1
+++ b/doc/bcftools.1
@@ -1,13 +1,13 @@
'\" t
.\" Title: bcftools
.\" Author: [see the "AUTHOR(S)" section]
-.\" Generator: Asciidoctor 2.0.16.dev
-.\" Date: 2023-02-21
+.\" Generator: Asciidoctor 2.0.16
+.\" Date: 2023-03-10
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
-.TH "BCFTOOLS" "1" "2023-02-21" "\ \&" "\ \&"
+.TH "BCFTOOLS" "1" "2023-03-10" "\ \&" "\ \&"
.ie \n(.g .ds Aq \(aq
.el .ds Aq '
.ss \n[.ss] 0
@@ -51,10 +51,10 @@ standard input (stdin) and outputs to the standard output (stdout). Several
commands can thus be combined with Unix pipes.
.SS "VERSION"
.sp
-This manual page was last updated \fB2023\-02\-21\fP and refers to bcftools git version \fB1.17\fP.
+This manual page was last updated \fB2023\-03\-10 08:27 GMT\fP and refers to bcftools git version \fB1.17\-10\-g9655089+\fP.
.SS "BCF1"
.sp
-The BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP
+The obsolete BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP
compatible with this version of bcftools. To read BCF1 files one can use
the view command from old versions of bcftools packaged with samtools
versions <= 0.1.19 to convert to VCF, which can then be read by
@@ -75,6 +75,9 @@ done with \fIbcftools view\fP. Users are now required to choose between the old
samtools calling model (\fI\-c/\-\-consensus\-caller\fP) and the new multiallelic
calling model (\fI\-m/\-\-multiallelic\-caller\fP). The multiallelic calling model
is recommended for most tasks.
+.SS "FILTERING EXPRESSIONS"
+.sp
+See the \fBFILTERING EXPRESSIONS\fP section below.
.SH "LIST OF COMMANDS"
.sp
For a full list of available commands, run \fBbcftools\fP without arguments. For a full
@@ -344,6 +347,17 @@ Some helper scripts are bundled with the bcftools code.
. sp -1
. IP \(bu 2.3
.\}
+\fBgff2gff\fP .. converts a GFF file to the format required by \fBcsq\fP
+.RE
+.sp
+.RS 4
+.ie n \{\
+\h'-04'\(bu\h'+03'\c
+.\}
+.el \{\
+. sp -1
+. IP \(bu 2.3
+.\}
\fBplot\-vcfstats\fP .. plots the output of \fBstats\fP
.RE
.SH "COMMANDS AND OPTIONS"
@@ -597,6 +611,11 @@ Same as \fB\-\-regions\-overlap\fP but for \fB\-t/\-T\fP.
Use multithreading with \fIINT\fP worker threads. The option is currently used only for the compression of the
output stream, only when \fI\-\-output\-type\fP is \fIb\fP or \fIz\fP. Default: 0.
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output files. Can be used only for compressed BCF and VCF output.
+.RE
.SS "bcftools annotate \fI[OPTIONS]\fP \fIFILE\fP"
.sp
Add or remove annotations.
@@ -881,6 +900,11 @@ except GT. To remove all INFO tags except "FOO" and "BAR", use
"INFO" can be abbreviated to "INF" and "FORMAT" to "FMT".
.RE
.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
+.sp
\fBExamples:\fP
.sp
.if n .RS 4
@@ -1017,6 +1041,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "Input/output options:"
.sp
\fB\-A, \-\-keep\-alts\fP
@@ -1401,6 +1430,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "bcftools consensus \fI[OPTIONS]\fP \fIFILE\fP"
.sp
Create consensus sequence by applying VCF variants to a reference fasta file.
@@ -1617,6 +1651,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "VCF output options:"
.sp
\fB\-\-no\-version\fP
@@ -1987,6 +2026,7 @@ transcripts in malformatted GFFs with incorrect phase
.RS 4
GFF3 annotation file (required), such as \c
.URL "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens" "" "."
+The script \fBgff2gff\fP can help with conversion from non\-standard GFF formats.
An example of a minimal working GFF file:
.RE
.sp
@@ -2137,6 +2177,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
+.sp
\fBExamples:\fP
.sp
.if n .RS 4
@@ -2366,6 +2411,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "bcftools gtcheck [\fIOPTIONS\fP] [\fB\-g\fP \fIgenotypes.vcf.gz\fP] \fIquery.vcf.gz\fP"
.sp
Checks sample identity. The program can operate in two modes. If the \fB\-g\fP
@@ -2676,6 +2726,11 @@ see \fBCommon Options\fP
list of input files to output given as 1\-based indices. With \fB\-p\fP and no
\fB\-w\fP, all files are written.
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file. This is done automatically with the \fB\-p\fP option.
+.RE
.SS "Examples:"
.sp
Create intersection and complements of two sets saving the output in dir/*
@@ -2876,6 +2931,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "bcftools mpileup [\fIOPTIONS\fP] \fB\-f\fP \fIref.fa\fP \fIin.bam\fP [\fIin2.bam\fP [...]]"
.sp
Generate VCF or BCF containing genotype likelihoods for one or multiple
@@ -3209,6 +3269,11 @@ BQB.
.fi
.if n .RE
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "Options for SNP/INDEL genotype likelihood computation"
.sp
\fB\-X, \-\-config\fP \fISTR\fP
@@ -3528,6 +3593,11 @@ see \fBCommon Options\fP
maximum distance between two records to consider when locally
sorting variants which changed position during the realignment
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "bcftools [plugin \fINAME\fP|+\fINAME\fP] \fI[OPTIONS]\fP \fIFILE\fP \(em \fI[PLUGIN OPTIONS]\fP"
.sp
A common framework for various utilities. The plugins can be used
@@ -3601,6 +3671,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "Plugin options:"
.sp
\fB\-h, \-\-help\fP
@@ -4725,6 +4800,11 @@ see \fBCommon Options\fP
.RS 4
Use this directory to store temporary files
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "bcftools stats [\fIOPTIONS\fP] \fIA.vcf.gz\fP [\fIB.vcf.gz\fP]"
.sp
Parses VCF or BCF and produces text file stats which is suitable for machine
@@ -4943,6 +5023,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
+.sp
+\fB\-\-write\-index\fP
+.RS 4
+Automatically index the output file
+.RE
.SS "Subset options:"
.sp
\fB\-a, \-\-trim\-alt\-alleles\fP
@@ -5137,7 +5222,7 @@ important libraries used by bcftools.
.SS "bcftools [\fI\-\-version\-only\fP]"
.sp
Display the full bcftools version number in a machine\-readable format.
-.SH "EXPRESSIONS"
+.SH "FILTERING EXPRESSIONS"
.sp
These filtering expressions are accepted by most of the commands.
.sp
@@ -5919,7 +6004,18 @@ bcftools view \-i \*(Aq%ID!="." & MAF[0]<0.01\*(Aq
.if n .RE
.sp
Please refer to the documentation of your shell for details.
-.SH "SCRIPTS AND OPTIONS"
+.SH "SCRIPTS"
+.SS "gff2gff"
+.sp
+Attempts to fix a GFF file so that it can be parsed correctly by \fBcsq\fP.
+.sp
+.if n .RS 4
+.nf
+.fam C
+zcat in.gff.gz | gff2gff | gzip \-c > out.gff.gz
+.fam
+.fi
+.if n .RE
.SS "plot\-vcfstats [\fIOPTIONS\fP] \fIfile.vchk\fP [...]"
.sp
Script for processing output of \fBbcftools stats\fP. It can merge
@@ -6013,8 +6109,10 @@ Please report any bugs you encounter on the github website: \c
.sp
Heng Li from the Sanger Institute wrote the original C version of htslib,
samtools and bcftools. Bob Handsaker from the Broad Institute implemented the
-BGZF library. Petr Danecek, Shane McCarthy and John Marshall are maintaining
-and further developing bcftools. Many other people contributed to the program
+BGZF library. Petr Danecek is maintaining and further developing bcftools, together
+with the rest of the \c
+.URL "https://www.sanger.ac.uk/tool/samtools\-bcftools\-htslib" "samtools team" "."
+Many other people contributed to the program
and to the file format specifications, both directly and indirectly by
providing patches, testing and reporting bugs. We thank them all.
.SH "RESOURCES"
diff --git a/doc/bcftools.html b/doc/bcftools.html
index 5a4f5ae51..c4dd7d498 100644
--- a/doc/bcftools.html
+++ b/doc/bcftools.html
@@ -4,7 +4,7 @@
-
+
bcftools(1)
@@ -50,13 +50,13 @@
DESCRIPTION
VERSION
-
This manual page was last updated 2023-02-21 and refers to bcftools git version 1.17.
+
This manual page was last updated 2023-03-10 08:27 GMT and refers to bcftools git version 1.17-10-g9655089+.
BCF1
-
The BCF1 format output by versions of samtools <= 0.1.19 is not
+
The obsolete BCF1 format output by versions of samtools <= 0.1.19 is not
compatible with this version of bcftools. To read BCF1 files one can use
the view command from old versions of bcftools packaged with samtools
versions <= 0.1.19 to convert to VCF, which can then be read by
@@ -79,6 +79,12 @@
Use multithreading with INT worker threads. The option is currently used only for the compression of the
output stream, only when --output-type is b or z. Default: 0.
+
--write-index
+
+
Automatically index the output files. Can be used only for compressed BCF and VCF output.
+
@@ -668,6 +681,10 @@
bcftools annotate [OPTIONS]FILE
"^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER).
"INFO" can be abbreviated to "INF" and "FORMAT" to "FMT".
These filtering expressions are accepted by most of the commands.
@@ -4974,9 +5040,24 @@
EXPRESSIONS
-
SCRIPTS AND OPTIONS
+
SCRIPTS
+
gff2gff
+
+
Attempts to fix a GFF file to be correctly parsed by csq.
+
+
+
+
+
+
zcat in.gff.gz | gff2gff | gzip -c > out.gff.gz
+
+
+
+
+
+
plot-vcfstats [OPTIONS] file.vchk […]
Script for processing output of bcftools stats. It can merge
@@ -5077,8 +5158,9 @@
AUTHORS
Heng Li from the Sanger Institute wrote the original C version of htslib,
samtools and bcftools. Bob Handsaker from the Broad Institute implemented the
-BGZF library. Petr Danecek, Shane McCarthy and John Marshall are maintaining
-and further developing bcftools. Many other people contributed to the program
+BGZF library. Petr Danecek is maintaining and further developing bcftools, together
+with the rest of the samtools team.
+Many other people contributed to the program
and to the file format specifications, both directly and indirectly by
providing patches, testing and reporting bugs. We thank them all.