diff --git a/.appveyor.yml b/.appveyor.yml index 9720dc2dd..ca4d5049b 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -7,6 +7,7 @@ version: 'vers.{build}' branches: except: - gh-pages + - /.*/ # Appveyor builds are currently disabled. # Do not build on tags (GitHub and BitBucket) skip_tags: true diff --git a/.ci_helpers/clone b/.ci_helpers/clone index a01071d8b..34054b863 100755 --- a/.ci_helpers/clone +++ b/.ci_helpers/clone @@ -5,13 +5,25 @@ # omitted or if there is no branch with that name, checks out origin/HEAD # from the samtools/htslib repository. -repository=$1 -localdir=$2 -branch=$3 +echo CLONE: ${@+"$@"} + +owner=$1 +repository="https://github.com/$owner/$2" +localdir=$3 +branch=$4 +htslib_PR=$5 ref='' [ -n "$branch" ] && ref=$(git ls-remote --heads "$repository" "$branch" 2>/dev/null) [ -z "$ref" ] && repository='https://github.com/samtools/htslib.git' set -x -git clone --recurse-submodules --shallow-submodules --depth=1 ${ref:+--branch="$branch"} "$repository" "$localdir" +git clone --recurse-submodules --shallow-submodules --depth=2 ${ref:+--branch="$branch"} "$repository" "$localdir" + +# NB: "samtools" as the owner/organisation, not the repo name +if [ "x$owner" = "xsamtools" -a -z "$ref" -a "x$htslib_PR" != "x" ] +then + cd "$localdir" + git fetch origin "pull/$htslib_PR/head" + git checkout FETCH_HEAD +fi diff --git a/.cirrus.yml b/.cirrus.yml index 06edba506..4e9927e67 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -21,8 +21,16 @@ timeout_in: 10m # clone with our own commands too. clone_template: &HTSLIB_CLONE htslib_clone_script: | - .ci_helpers/clone "https://github.com/${CIRRUS_REPO_OWNER}/htslib" "${HTSDIR}" "${CIRRUS_BRANCH}" - + # Tricky, but when run as a PR Cirrus-CI obscures the branch name and + # replaces it by pull/. This means we can't automatically get PRs + # to test whether the user has a similarly named branch to compile and + # test against. 
+ # + # Instead if we add htslib#NUM into the first line of the commit then + # we will use that PR from htslib instead. This is only needed when + # making a PR, so for development prior to the PR being made the + # CIRRUS_BRANCH will be used in preference. + .ci_helpers/clone ${CIRRUS_REPO_OWNER} htslib "${HTSDIR}" "${CIRRUS_BRANCH}" `printenv CIRRUS_CHANGE_TITLE | sed -n 's/.*htslib#\([0-9]*\).*/\1/p'` #-------------------------------------------------- # Template: bcftools compile and test diff --git a/.gitattributes b/.gitattributes index 9d42c7c43..d765010ac 100644 --- a/.gitattributes +++ b/.gitattributes @@ -10,3 +10,9 @@ .git* export-ignore .ci_helpers export-ignore README.md export-ignore + +# Prevent Windows cr-lf endings. +test/** -text +test/**.c text +test/**.h text +test/**.pl text diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml new file mode 100644 index 000000000..49a14f37f --- /dev/null +++ b/.github/workflows/windows-build.yml @@ -0,0 +1,52 @@ +name: Windows/MinGW-W64 CI +on: [push, pull_request] + +jobs: + build: + runs-on: windows-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Set up MSYS2 MinGW-W64 + uses: msys2/setup-msys2@v2 + with: + msystem: mingw64 + update: false + install: >- + mingw-w64-x86_64-toolchain + mingw-w64-x86_64-autotools + mingw-w64-x86_64-curl + mingw-w64-x86_64-libdeflate + mingw-w64-x86_64-tools-git + mingw-w64-x86_64-zlib + mingw-w64-x86_64-bzip2 + mingw-w64-x86_64-xz + - name: Clone htslib + shell: msys2 {0} + run: | + export PATH="$PATH:/mingw64/bin:/c/Program Files/Git/bin" + export MSYSTEM=MINGW64 + htslib_pr=`git log -2 --format='%s' | sed -n 's/.*htslib#\([0-9]*\).*/\1/p'` + .ci_helpers/clone ${GITHUB_REPOSITORY_OWNER} htslib htslib ${GITHUB_HEAD_REF:-$GITHUB_REF_NAME} $htslib_pr + pushd . 
+ cd htslib + autoreconf -i + popd + - name: Compile bcftools + shell: msys2 {0} + run: | + export PATH="$PATH:/mingw64/bin:/c/Program Files/Git/bin" + export MSYSTEM=MINGW64 + autoheader + autoconf -Wno-syntax + ./configure --enable-werror + make -j4 + - name: Check bcftools + shell: msys2 {0} + run: | + export PATH="$PATH:/mingw64/bin:/c/Program Files/Git/bin" + export MSYSTEM=MINGW64 + make check + diff --git a/Makefile b/Makefile index 27738bef6..ffead3ecb 100644 --- a/Makefile +++ b/Makefile @@ -196,7 +196,10 @@ libbcftools.a: $(OBJS) vcfplugin.o: EXTRA_CPPFLAGS += -DPLUGINPATH='"$(pluginpath)"' -%.dll %.cygdll: %.c version.h version.c libbcftools.a $(HTSLIB_DLL) +%.dll: %.c version.h version.c libbcftools.a $(HTSLIB_DLL) + $(CC) $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(LDFLAGS) -o $@ version.c $< $(PLUGIN_LIBS) + +%.cygdll: %.c version.h version.c libbcftools.a $(HTSLIB_DLL) $(CC) $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(LDFLAGS) -o $@ version.c $< $(PLUGIN_LIBS) %.so: %.c version.h version.c diff --git a/NEWS b/NEWS index fbd3d159f..b65b6330d 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,9 @@ ## Release a.b +Changes affecting the whole of bcftools, or multiple commands: + +* Add support for matching lines by ID (#1739) + Changes affecting specific commands: diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 47ec59ec7..ae844b4bf 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -163,7 +163,6 @@ specific commands to see if they apply. 'id';; only records with identical ID column are compatible. - Supported by *<>* only. *-f, --apply-filters* 'LIST':: Skip sites where FILTER column does not contain any of the strings listed @@ -501,7 +500,7 @@ Add or remove annotations. 
*-O, --output-type* 'b'|'u'|'z'|'v'[0-9]:: see *<>* -*--pair-logic* 'snps'|'indels'|'both'|'all'|'some'|'exact':: +*--pair-logic* 'snps'|'indels'|'both'|'all'|'some'|'exact'|'id':: Controls how to match records from the annotation file to the target VCF. Effective only when *-a* is a VCF or BCF. The option replaces the former uninuitive *--collapse*. @@ -1919,7 +1918,7 @@ on the options, the program can output records from one (or more) files which have (or do not have) corresponding records with the same position in the other files. -*-c, --collapse* 'snps'|'indels'|'both'|'all'|'some'|'none':: +*-c, --collapse* 'snps'|'indels'|'both'|'all'|'some'|'none'|'id':: see *<>* *-C, --complement*:: diff --git a/test/annotate35.1.out b/test/annotate35.1.out new file mode 100644 index 000000000..e6f36b8da --- /dev/null +++ b/test/annotate35.1.out @@ -0,0 +1,12 @@ +##fileformat=VCFv4.2 +##FILTER= +##INFO= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 902 ID6 N . PASS src=ID6 +chr1 902 ID5 N . PASS src=ID5 +chr1 902 ID4 N . PASS src=ID4 +chr1 902 ID3 N . PASS src=ID3 +chr1 902 ID2 N . PASS src=ID2 +chr1 902 ID1 N . PASS src=ID1 diff --git a/test/annotate35.2.out b/test/annotate35.2.out new file mode 100644 index 000000000..888d290e7 --- /dev/null +++ b/test/annotate35.2.out @@ -0,0 +1,12 @@ +##fileformat=VCFv4.2 +##FILTER= +##INFO= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 902 ID6 N . PASS src=ID6;dst=ID6 +chr1 902 ID5 N . PASS src=ID5;dst=ID5 +chr1 902 ID4 N . PASS src=ID4;dst=ID4 +chr1 902 ID3 N . PASS src=ID3;dst=ID3 +chr1 902 ID2 N . PASS src=ID2;dst=ID2 +chr1 902 ID1 N . PASS src=ID1;dst=ID1 diff --git a/test/annotate35.vcf b/test/annotate35.vcf new file mode 100644 index 000000000..2000257d2 --- /dev/null +++ b/test/annotate35.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +##INFO= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 902 ID6 N . PASS src=ID6 +chr1 902 ID5 N . PASS src=ID5 +chr1 902 ID4 N . 
PASS src=ID4 +chr1 902 ID3 N . PASS src=ID3 +chr1 902 ID2 N . PASS src=ID2 +chr1 902 ID1 N . PASS src=ID1 diff --git a/test/annots35.tab b/test/annots35.tab new file mode 100644 index 000000000..faa69496d --- /dev/null +++ b/test/annots35.tab @@ -0,0 +1,6 @@ +chr1 902 ID4 N ID4 +chr1 902 ID2 N ID2 +chr1 902 ID6 N ID6 +chr1 902 ID1 N ID1 +chr1 902 ID3 N ID3 +chr1 902 ID5 N ID5 diff --git a/test/annots35.vcf b/test/annots35.vcf new file mode 100644 index 000000000..dc1a1fdc9 --- /dev/null +++ b/test/annots35.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +##INFO= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 902 ID4 N . PASS src=ID4 +chr1 902 ID2 N . PASS src=ID2 +chr1 902 ID6 N . PASS src=ID6 +chr1 902 ID3 N . PASS src=ID3 +chr1 902 ID1 N . PASS src=ID1 +chr1 902 ID5 N . PASS src=ID5 diff --git a/test/isec.match-id.1.1.vcf b/test/isec.match-id.1.1.vcf new file mode 100644 index 000000000..7f198c443 --- /dev/null +++ b/test/isec.match-id.1.1.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 902 ID1 A C . . . +chr1 902 ID2 A C . . . +chr1 902 ID3 A C . . . diff --git a/test/isec.match-id.1.2.vcf b/test/isec.match-id.1.2.vcf new file mode 100644 index 000000000..e9b1189f4 --- /dev/null +++ b/test/isec.match-id.1.2.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 902 ID3 A C . . . +chr1 902 ID2 A C . . . +chr1 902 ID1 A C . . . diff --git a/test/isec.match-id.1.out b/test/isec.match-id.1.out new file mode 100644 index 000000000..fa11a726d --- /dev/null +++ b/test/isec.match-id.1.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##FILTER= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 902 ID3 A C . . . +chr1 902 ID2 A C . . . +chr1 902 ID1 A C . . . 
diff --git a/test/isec.match-id.2.out b/test/isec.match-id.2.out new file mode 100644 index 000000000..104495322 --- /dev/null +++ b/test/isec.match-id.2.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##FILTER= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 902 ID1 A C . . . +chr1 902 ID2 A C . . . +chr1 902 ID3 A C . . . diff --git a/test/test.pl b/test/test.pl index b77ab7046..04637bab1 100755 --- a/test/test.pl +++ b/test/test.pl @@ -49,6 +49,8 @@ run_test(\&test_vcf_stats,$opts,in=>['stats.counts'],out=>'stats.counts.chk',args=>'-s -'); run_test(\&test_vcf_stats,$opts,in=>['stats.counts'],out=>'stats.counts.2.chk',args=>q[-s - -i 'type="snp"']); run_test(\&test_vcf_stats,$opts,in=>['stats.vaf'],out=>'stats.vaf.1.chk',args=>q[-s -]); +run_test(\&test_vcf_isec,$opts,in=>['isec.match-id.1.1','isec.match-id.1.2'],out=>'isec.match-id.1.out',args=>'-n =2 -w 2 --no-version'); +run_test(\&test_vcf_isec,$opts,in=>['isec.match-id.1.1','isec.match-id.1.2'],out=>'isec.match-id.2.out',args=>'-n =2 -w 2 -c id --no-version'); run_test(\&test_vcf_isec,$opts,in=>['isec.a','isec.b'],out=>'isec.ab.out',args=>'-n =2'); run_test(\&test_vcf_isec,$opts,in=>['isec.a','isec.b'],out=>'isec.ab.flt.out',args=>'-n =2 -i"STRLEN(REF)==2"'); run_test(\&test_vcf_isec,$opts,in=>['isec.a','isec.b'],out=>'isec.ab.both.out',args=>'-n =2 -c both'); @@ -524,6 +526,8 @@ run_test(\&test_vcf_sort,$opts,in=>'sort',out=>'sort.out',args=>q[-m 0],fmt=>'%CHROM\\t%POS\\t%REF,%ALT\\n'); run_test(\&test_vcf_sort,$opts,in=>'sort',out=>'sort.out',args=>q[-m 1000],fmt=>'%CHROM\\t%POS\\t%REF,%ALT\\n'); run_test(\&test_vcf_regions,$opts,in=>'regions'); +run_test(\&test_vcf_annotate,$opts,in=>'annotate35',vcf=>'annots35',out=>'annotate35.1.out',args=>q[-c CHROM,POS,~ID,REF,ALT,INFO/src]); +run_test(\&test_vcf_annotate,$opts,in=>'annotate35',tab=>'annots35',out=>'annotate35.2.out',args=>q[-c CHROM,POS,~ID,REF,ALT,dst:=src]); 
run_test(\&test_vcf_annotate,$opts,in=>'annotate.escape.1',tab=>'annotate.escape.1',out=>'annotate.escape.1.1.out',args=>q[-c CHROM,POS,ISTR,FMT/FSTR]); run_test(\&test_vcf_annotate,$opts,in=>'annotate.match.1',tab=>'annotate.match.1',out=>'annotate.match.1.1.out',args=>q[-c CHROM,POS,-,-,SCORE,~X,-,- -i'STR={X}']); run_test(\&test_vcf_annotate,$opts,in=>'annotate.match.1',tab=>'annotate.match.1',out=>'annotate.match.1.2.out',args=>q[-c CHROM,POS,REF,ALT,SCORE,-,~X,- -i'INT={X}']); @@ -1478,7 +1482,9 @@ sub test_vcf_isec my $files = join(' ',@files); $args{args} =~ s/{PATH}/$$opts{path}/g; test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools isec $args{args} $files"); - test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools isec -Ob $args{args} $files"); + + # Either improve or disable completely: the output type does not make sense in all modes + # test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools isec -Ob $args{args} $files"); } sub test_vcf_isec2 { diff --git a/vcfannotate.c b/vcfannotate.c index b66c8cf51..5fc0c63ab 100644 --- a/vcfannotate.c +++ b/vcfannotate.c @@ -2313,7 +2313,7 @@ static void init_columns(args_t *args) col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); col->replace = replace; - if ( args->pair_logic==-1 ) bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,BCF_SR_PAIR_BOTH_REF); + if ( args->pair_logic==-1 ) args->pair_logic = BCF_SR_PAIR_ANY; } else args->alt_idx = icol; } @@ -2321,7 +2321,6 @@ static void init_columns(args_t *args) { if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n"); if ( str.s[0]=='~' ) replace = MATCH_VALUE; - if ( args->tgts_is_vcf && (replace & MATCH_VALUE) ) error("todo: -c ~ID with -a VCF?\n"); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; memset(col,0,sizeof(*col)); @@ -2330,7 +2329,11 @@ static void init_columns(args_t *args) col->setter = args->tgts_is_vcf ? 
vcf_setter_id : setter_id; col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); - if ( replace & MATCH_VALUE ) args->match_id = icol; + if ( replace & MATCH_VALUE ) + { + args->match_id = icol; + if ( args->tgts_is_vcf ) args->pair_logic = (args->pair_logic==-1) ? BCF_SR_PAIR_ID : args->pair_logic|BCF_SR_PAIR_ID; + } } else if ( !strcasecmp("~INFO/END",str.s) && !args->tgts_is_vcf ) { @@ -3122,6 +3125,11 @@ static void init_data(args_t *args) &args->index_fn, args->write_index) < 0 ) error("Error: failed to initialise index for %s\n",args->output_fname); } + if ( args->tgts_is_vcf ) + { + if ( args->pair_logic==-1 ) args->pair_logic = BCF_SR_PAIR_SOME; + bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,args->pair_logic); + } } static void destroy_data(args_t *args) @@ -3650,7 +3658,7 @@ static void usage(args_t *args) fprintf(stderr, " --no-version Do not append version and command line to the header\n"); fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); - fprintf(stderr, " --pair-logic STR Matching records by , see man page for details [some]\n"); + fprintf(stderr, " --pair-logic STR Matching records by , see man page for details [some]\n"); fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n"); fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); @@ -3784,6 +3792,7 @@ int main_vcfannotate(int argc, char *argv[]) else if ( !strcmp(optarg,"some") ) args->pair_logic |= BCF_SR_PAIR_SOME; else if ( !strcmp(optarg,"none") ) args->pair_logic = BCF_SR_PAIR_EXACT; else if ( !strcmp(optarg,"exact") ) args->pair_logic = BCF_SR_PAIR_EXACT; + else if ( !strcmp(optarg,"id") ) args->pair_logic |= BCF_SR_PAIR_ID; else 
error("The --pair-logic string \"%s\" not recognised.\n", optarg); break; case 3 : @@ -3829,7 +3838,6 @@ int main_vcfannotate(int argc, char *argv[]) { args->tgts_is_vcf = 1; args->files->require_index = 1; - bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,args->pair_logic>=0 ? args->pair_logic : BCF_SR_PAIR_SOME); if ( args->min_overlap_str ) error("The --min-overlap option cannot be used when annotating from a VCF\n"); } } diff --git a/vcfisec.c b/vcfisec.c index 24a45685b..e28a956dc 100644 --- a/vcfisec.c +++ b/vcfisec.c @@ -1,6 +1,6 @@ /* vcfisec.c -- Create intersections, unions and complements of VCF files. - Copyright (C) 2012-2023 Genome Research Ltd. + Copyright (C) 2012-2024 Genome Research Ltd. Author: Petr Danecek @@ -460,7 +460,7 @@ static void destroy_data(args_t *args) { if ( !args->fnames[i] ) continue; if ( hts_close(args->fh_out[i])!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fnames[i]); - int is_tbi = !args->write_index + int is_tbi = !args->write_index || (args->write_index&127) == HTS_FMT_TBI; if ( args->output_type==FT_VCF_GZ && is_tbi ) { @@ -476,8 +476,8 @@ static void destroy_data(args_t *args) free(args->fh_out); free(args->fnames); if ( args->fh_sites ) fclose(args->fh_sites); - if ( args->write ) free(args->write); } + free(args->write); } static void usage(void) @@ -487,7 +487,7 @@ static void usage(void) fprintf(stderr, "Usage: bcftools isec [options] [...]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -c, --collapse STRING Treat as identical records with , see man page for details [none]\n"); + fprintf(stderr, " -c, --collapse STRING Treat as identical records with , see man page for details [none]\n"); fprintf(stderr, " -C, --complement Output positions present only in the first file but missing in the others\n"); fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n"); fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER 
strings (e.g. \"PASS,.\")\n"); @@ -597,6 +597,7 @@ int main_vcfisec(int argc, char *argv[]) else if ( !strcmp(optarg,"all") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"some") ) args->files->collapse |= COLLAPSE_SOME; else if ( !strcmp(optarg,"none") ) args->files->collapse = COLLAPSE_NONE; + else if ( !strcmp(optarg,"id") ) args->files->collapse |= BCF_SR_PAIR_ID; else error("The --collapse string \"%s\" not recognised.\n", optarg); break; case 'f': args->files->apply_filters = optarg; break;