Skip to content

Commit

Permalink
Support auto indexing during writing BCF and VCF.gz via new `--write-…
Browse files Browse the repository at this point in the history
…index` option
  • Loading branch information
pd3 committed Mar 10, 2023
1 parent 9655089 commit 9b352b7
Show file tree
Hide file tree
Showing 32 changed files with 845 additions and 875 deletions.
5 changes: 5 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
## Release a.b

Changes affecting the whole of bcftools, or multiple commands:

* Support automatic indexing when writing BCF and VCF.gz output via the new `--write-index` option


Changes affecting specific commands:

* bcftools annotate
Expand Down
5 changes: 4 additions & 1 deletion bcftools.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* bcftools.h -- utility function declarations.
Copyright (C) 2013-2022 Genome Research Ltd.
Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <[email protected]>
Expand Down Expand Up @@ -49,6 +49,9 @@ void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2
// newline will be added by the function.
void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2);

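// For on-the-fly index creation with --write-index. Callers treat a negative
// return value as failure and free the index file name returned via idx_fname.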
int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname);

void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
const char *hts_bcf_wmode(int file_type);
const char *hts_bcf_wmode2(int file_type, const char *fname);
Expand Down
17 changes: 17 additions & 0 deletions csq.c
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,8 @@ typedef struct _args_t
// text tab-delimited output (out) or vcf/bcf output (out_fh)
FILE *out;
htsFile *out_fh;
char *index_fn;
int write_index;

// vcf
bcf_srs_t *sr;
Expand Down Expand Up @@ -1536,6 +1538,7 @@ void init_data(args_t *args)
if ( args->hdr_nsmpl )
bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
}
if ( args->verbosity > 0 ) fprintf(stderr,"Calling...\n");
}
Expand Down Expand Up @@ -1571,7 +1574,18 @@ void destroy_data(args_t *args)
if ( args->smpl ) smpl_ilist_destroy(args->smpl);
int ret;
if ( args->out_fh )
{
if ( args->write_index )
{
if ( bcf_idx_save(args->out_fh)<0 )
{
if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
error("Error: cannot write to index %s\n", args->index_fn);
}
free(args->index_fn);
}
ret = hts_close(args->out_fh);
}
else
ret = fclose(args->out);
if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
Expand Down Expand Up @@ -4272,6 +4286,7 @@ static const char *usage(void)
" --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
" --threads INT Use multithreading with <int> worker threads [0]\n"
" -v, --verbose INT Verbosity level 0-2 [1]\n"
" --write-index Automatically index the output files [off]\n"
"\n"
"Example:\n"
" bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
Expand Down Expand Up @@ -4321,6 +4336,7 @@ int main_csq(int argc, char *argv[])
{"targets-file",1,0,'T'},
{"targets-overlap",required_argument,NULL,5},
{"no-version",no_argument,NULL,3},
{"write-index",no_argument,NULL,6},
{0,0,0,0}
};
int c, targets_is_file = 0, regions_is_file = 0;
Expand Down Expand Up @@ -4409,6 +4425,7 @@ int main_csq(int argc, char *argv[])
targets_overlap = parse_overlap_option(optarg);
if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
break;
case 6 : args->write_index = 1; break;
case 'h':
case '?': error("%s",usage());
default: error("The option not recognised: %s\n\n", optarg); break;
Expand Down
116 changes: 107 additions & 9 deletions doc/bcftools.1
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
'\" t
.\" Title: bcftools
.\" Author: [see the "AUTHOR(S)" section]
.\" Generator: Asciidoctor 2.0.16.dev
.\" Date: 2023-02-21
.\" Generator: Asciidoctor 2.0.16
.\" Date: 2023-03-10
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "BCFTOOLS" "1" "2023-02-21" "\ \&" "\ \&"
.TH "BCFTOOLS" "1" "2023-03-10" "\ \&" "\ \&"
.ie \n(.g .ds Aq \(aq
.el .ds Aq '
.ss \n[.ss] 0
Expand Down Expand Up @@ -51,10 +51,10 @@ standard input (stdin) and outputs to the standard output (stdout). Several
commands can thus be combined with Unix pipes.
.SS "VERSION"
.sp
This manual page was last updated \fB2023\-02\-21\fP and refers to bcftools git version \fB1.17\fP.
This manual page was last updated \fB2023\-03\-10 08:27 GMT\fP and refers to bcftools git version \fB1.17\-10\-g9655089+\fP.
.SS "BCF1"
.sp
The BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP
The obsolete BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP
compatible with this version of bcftools. To read BCF1 files one can use
the view command from old versions of bcftools packaged with samtools
versions <= 0.1.19 to convert to VCF, which can then be read by
Expand All @@ -75,6 +75,9 @@ done with \fIbcftools view\fP. Users are now required to choose between the old
samtools calling model (\fI\-c/\-\-consensus\-caller\fP) and the new multiallelic
calling model (\fI\-m/\-\-multiallelic\-caller\fP). The multiallelic calling model
is recommended for most tasks.
.SS "FILTERING EXPRESSIONS"
.sp
See \fBEXPRESSIONS\fP
.SH "LIST OF COMMANDS"
.sp
For a full list of available commands, run \fBbcftools\fP without arguments. For a full
Expand Down Expand Up @@ -344,6 +347,17 @@ Some helper scripts are bundled with the bcftools code.
. sp -1
. IP \(bu 2.3
.\}
\fBgff2gff\fP .. converts a GFF file to the format required by \fBcsq\fP
.RE
.sp
.RS 4
.ie n \{\
\h'-04'\(bu\h'+03'\c
.\}
.el \{\
. sp -1
. IP \(bu 2.3
.\}
\fBplot\-vcfstats\fP .. plots the output of \fBstats\fP
.RE
.SH "COMMANDS AND OPTIONS"
Expand Down Expand Up @@ -597,6 +611,11 @@ Same as \fB\-\-regions\-overlap\fP but for \fB\-t/\-T\fP.
Use multithreading with \fIINT\fP worker threads. The option is currently used only for the compression of the
output stream, only when \fI\-\-output\-type\fP is \fIb\fP or \fIz\fP. Default: 0.
.RE
.sp
\fB\-\-write\-index\fP
.RS 4
Automatically index the output files. Can be used only for compressed BCF and VCF output.
.RE
.SS "bcftools annotate \fI[OPTIONS]\fP \fIFILE\fP"
.sp
Add or remove annotations.
Expand Down Expand Up @@ -881,6 +900,11 @@ except GT. To remove all INFO tags except "FOO" and "BAR", use
"INFO" can be abbreviated to "INF" and "FORMAT" to "FMT".
.RE
.sp
\fB\-\-write\-index\fP
.RS 4
Automatically index the output file
.RE
.sp
\fBExamples:\fP
.sp
.if n .RS 4
Expand Down Expand Up @@ -1017,6 +1041,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
.sp
\fB\-\-write\-index\fP
.RS 4
Automatically index the output file
.RE
.SS "Input/output options:"
.sp
\fB\-A, \-\-keep\-alts\fP
Expand Down Expand Up @@ -1401,6 +1430,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
.sp
\fB\-\-write\-index\fP
.RS 4
Automatically index the output file
.RE
.SS "bcftools consensus \fI[OPTIONS]\fP \fIFILE\fP"
.sp
Create consensus sequence by applying VCF variants to a reference fasta file.
Expand Down Expand Up @@ -1617,6 +1651,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
.sp
\fB\-\-write\-index\fP
.RS 4
Automatically index the output file
.RE
.SS "VCF output options:"
.sp
\fB\-\-no\-version\fP
Expand Down Expand Up @@ -1987,6 +2026,7 @@ transcripts in malformatted GFFs with incorrect phase
.RS 4
GFF3 annotation file (required), such as \c
.URL "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens" "" "."
The script \fBgff2gff\fP can help with conversion from non\-standard GFF formats.
An example of a minimal working GFF file:
.RE
.sp
Expand Down Expand Up @@ -2137,6 +2177,11 @@ see \fBCommon Options\fP
see \fBCommon Options\fP
.RE
.sp
\fB\-\-write\-index\fP
.RS 4
Automatically index the output file
.RE
.sp
\fBExamples:\fP
.sp
.if n .RS 4
Expand Down Expand Up @@ -2366,6 +2411,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
.sp
\fB\-\-write\-index\fP
.RS 4
Automatically index the output file
.RE
.SS "bcftools gtcheck [\fIOPTIONS\fP] [\fB\-g\fP \fIgenotypes.vcf.gz\fP] \fIquery.vcf.gz\fP"
.sp
Checks sample identity. The program can operate in two modes. If the \fB\-g\fP
Expand Down Expand Up @@ -2676,6 +2726,11 @@ see \fBCommon Options\fP
list of input files to output given as 1\-based indices. With \fB\-p\fP and no
\fB\-w\fP, all files are written.
.RE
.sp
\fB\-\-write\-index\fP
.RS 4
Automatically index the output file. When the \fB\-p\fP option is used, the output is indexed automatically.
.RE
.SS "Examples:"
.sp
Create intersection and complements of two sets saving the output in dir/*
Expand Down Expand Up @@ -2876,6 +2931,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
.sp
\fB\-\-write\-index\fP
.RS 4
Automatically index the output file
.RE
.SS "bcftools mpileup [\fIOPTIONS\fP] \fB\-f\fP \fIref.fa\fP \fIin.bam\fP [\fIin2.bam\fP [...]]"
.sp
Generate VCF or BCF containing genotype likelihoods for one or multiple
Expand Down Expand Up @@ -3209,6 +3269,11 @@ BQB.
.fi
.if n .RE
.RE
.sp
\fB\-\-write\-index\fP
.RS 4
Automatically index the output file
.RE
.SS "Options for SNP/INDEL genotype likelihood computation"
.sp
\fB\-X, \-\-config\fP \fISTR\fP
Expand Down Expand Up @@ -3528,6 +3593,11 @@ see \fBCommon Options\fP
maximum distance between two records to consider when locally
sorting variants which changed position during the realignment
.RE
.sp
\fB\-\-write\-index\fP
.RS 4
Automatically index the output file
.RE
.SS "bcftools [plugin \fINAME\fP|+\fINAME\fP] \fI[OPTIONS]\fP \fIFILE\fP \(em \fI[PLUGIN OPTIONS]\fP"
.sp
A common framework for various utilities. The plugins can be used
Expand Down Expand Up @@ -3601,6 +3671,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
.sp
\fB\-\-write\-index\fP
.RS 4
Automatically index the output file
.RE
.SS "Plugin options:"
.sp
\fB\-h, \-\-help\fP
Expand Down Expand Up @@ -4725,6 +4800,11 @@ see \fBCommon Options\fP
.RS 4
Use this directory to store temporary files
.RE
.sp
\fB\-\-write\-index\fP
.RS 4
Automatically index the output file
.RE
.SS "bcftools stats [\fIOPTIONS\fP] \fIA.vcf.gz\fP [\fIB.vcf.gz\fP]"
.sp
Parses VCF or BCF and produces text file stats which is suitable for machine
Expand Down Expand Up @@ -4943,6 +5023,11 @@ see \fBCommon Options\fP
.RS 4
see \fBCommon Options\fP
.RE
.sp
\fB\-\-write\-index\fP
.RS 4
Automatically index the output file
.RE
.SS "Subset options:"
.sp
\fB\-a, \-\-trim\-alt\-alleles\fP
Expand Down Expand Up @@ -5137,7 +5222,7 @@ important libraries used by bcftools.
.SS "bcftools [\fI\-\-version\-only\fP]"
.sp
Display the full bcftools version number in a machine\-readable format.
.SH "EXPRESSIONS"
.SH "FILTERING EXPRESSIONS"
.sp
These filtering expressions are accepted by most of the commands.
.sp
Expand Down Expand Up @@ -5919,7 +6004,18 @@ bcftools view \-i \*(Aq%ID!="." & MAF[0]<0.01\*(Aq
.if n .RE
.sp
Please refer to the documentation of your shell for details.
.SH "SCRIPTS AND OPTIONS"
.SH "SCRIPTS"
.SS "gff2gff"
.sp
Attempts to fix a GFF file so that it can be correctly parsed by \fBcsq\fP.
.sp
.if n .RS 4
.nf
.fam C
zcat in.gff.gz | gff2gff | gzip \-c > out.gff.gz
.fam
.fi
.if n .RE
.SS "plot\-vcfstats [\fIOPTIONS\fP] \fIfile.vchk\fP [...]"
.sp
Script for processing output of \fBbcftools stats\fP. It can merge
Expand Down Expand Up @@ -6013,8 +6109,10 @@ Please report any bugs you encounter on the github website: \c
.sp
Heng Li from the Sanger Institute wrote the original C version of htslib,
samtools and bcftools. Bob Handsaker from the Broad Institute implemented the
BGZF library. Petr Danecek, Shane McCarthy and John Marshall are maintaining
and further developing bcftools. Many other people contributed to the program
BGZF library. Petr Danecek is maintaining and further developing bcftools, together
with the rest of the \c
.URL "https://www.sanger.ac.uk/tool/samtools\-bcftools\-htslib" "samtools team" "."
Many other people contributed to the program
and to the file format specifications, both directly and indirectly by
providing patches, testing and reporting bugs. We thank them all.
.SH "RESOURCES"
Expand Down
Loading

0 comments on commit 9b352b7

Please sign in to comment.