diff --git a/.appveyor.yml b/.appveyor.yml deleted file mode 100644 index 8fe288094..000000000 --- a/.appveyor.yml +++ /dev/null @@ -1,43 +0,0 @@ -# version format. -# you can use {branch} name in version format too -# version: 1.0.{build}-{branch} -version: 'vers.{build}' - -# branches to build -branches: - # Blacklist - except: - - gh-pages - -# Do not build on tags (GitHub and BitBucket) -skip_tags: true - -# Skipping commits affecting specific files (GitHub only). More details here: /docs/appveyor-yml -#skip_commits: -# files: -# - docs/* -# - '**/*.html' - -# Appveyor Windows images are based on Visual studio version -image: Visual Studio 2019 - -# We use Mingw/Msys, so use pacman for installs -install: - - set HOME=. - - set MSYSTEM=MINGW64 - - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH% - - set MINGWPREFIX=x86_64-w64-mingw32 - - "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-autotools mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-tools-git\"" - -build_script: - - set HOME=. 
- - set MSYSTEM=MINGW64 - - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH% - - git submodule update --init --recursive - - "sh -lc \"autoreconf -i && ./configure --enable-werror CFLAGS='-g -O3' && make -j2\"" - -#build_script: -# - make - -test_script: - - "sh -lc \"make test-shlib-exports && make test\"" diff --git a/.cirrus.yml b/.cirrus.yml index fc4405b08..6da99dde0 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -35,21 +35,22 @@ compile_template: &COMPILE if test "$USE_CONFIG" = "yes"; then MAKE_OPTS= autoreconf -i - eval ./configure --enable-plugins --enable-werror $CONFIG_OPTS CFLAGS=\"-g -O3 $CFLAGS\" || \ + eval ./configure --enable-plugins --enable-werror $CONFIG_OPTS CFLAGS=\"$CFLAGS\" || \ ( cat config.log; false ) else MAKE_OPTS=-e fi + make cc-version $MAKE_OPTS if test "x$DO_MAINTAINER_CHECKS" = "xyes"; then - make maintainer-check + make maintainer-check $MAKE_OPTS fi make -j 4 $MAKE_OPTS test_template: &TEST test_script: | - make test-shlib-exports - make test - if test "x$DO_UNTRACKED_FILE_CHECK" = "xyes"; then make check-untracked ; fi + make test-shlib-exports $MAKE_OPTS + make test $MAKE_OPTS + if test "x$DO_UNTRACKED_FILE_CHECK" = "xyes"; then make check-untracked $MAKE_OPTS ; fi #-------------------------------------------------- # Task: linux builds. @@ -71,10 +72,14 @@ gcc_task: DO_MAINTAINER_CHECKS: yes DO_UNTRACKED_FILE_CHECK: yes USE_CONFIG: no + CFLAGS: -g -O2 -Wall -Werror -fvisibility=hidden - environment: USE_CONFIG: yes - CFLAGS: -std=c99 -pedantic -Wformat=2 + # ubsan is incompatible with some -Wformat opts so we do that on clang. 
+ CFLAGS: -g -O3 -fsanitize=address,undefined -DHTS_ALLOW_UNALIGNED=0 -Wno-format-truncation -Wno-format-overflow + LDFLAGS: -fsanitize=address,undefined USE_LIBDEFLATE: yes + UBSAN_OPTIONS: print_stacktrace=1:halt_on_error=1 install_script: | apt-get update @@ -105,11 +110,12 @@ ubuntu_task: matrix: - environment: USE_CONFIG: yes + CFLAGS: -g -O3 DO_UNTRACKED_FILE_CHECK: yes - environment: + # Cirrus-CI's clang isn't installed with ubsan, so we do that in gcc USE_CONFIG: yes - CFLAGS: -g -Wall -O3 -fsanitize=address - LDFLAGS: -fsanitize=address + CFLAGS: -g -O3 -std=c99 -pedantic -Wall -Wformat -Wformat=2 USE_LIBDEFLATE: yes # NB: we could consider building a docker image with these @@ -137,7 +143,7 @@ rocky_task: LC_ALL: C CIRRUS_CLONE_DEPTH: 1 USE_CONFIG: yes - CFLAGS: -std=gnu90 + CFLAGS: -g -O3 -std=gnu90 -Wall -Wformat -Wformat=2 -Wextra -Wno-sign-compare -Wno-unused-parameter -Wno-missing-field-initializers # NB: we could consider building a docker image with these # preinstalled and specifying that instead, to speed up testing. 
@@ -182,11 +188,10 @@ arm_ubuntu_task: macosx_task: name: macosx + clang macos_instance: - image: ghcr.io/cirruslabs/macos-ventura-base:latest + image: ghcr.io/cirruslabs/macos-runner:sonoma environment: CC: clang - CFLAGS: "-Wall -arch arm64 -arch x86_64" LDFLAGS: "-arch arm64 -arch x86_64" LIBDEFLATE_CFLAGS: "-arch arm64 -arch x86_64" LC_ALL: C @@ -195,9 +200,11 @@ macosx_task: matrix: - environment: USE_CONFIG: no + CFLAGS: "-g -O3 -Wall -Werror -arch arm64 -arch x86_64" - environment: USE_CONFIG: yes USE_LIBDEFLATE: yes + CFLAGS: "-g -O3 -Wall -arch arm64 -arch x86_64" package_install_script: | HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz git \ diff --git a/.gitattributes b/.gitattributes index 5d9850bc7..2d5a80e04 100644 --- a/.gitattributes +++ b/.gitattributes @@ -24,3 +24,14 @@ test/index_dos.sam -text # Remove the text attribute from various faidx test files test/faidx/faidx*.fa* -text test/faidx/fastqs*.fq* -text +test/fastq/*.fa -text +test/fastq/*.fq -text +*.tst -text +*.out -text +*.crai -text +*.bai -text +*.csi -text +*.gzi -text +*.bcf -text +*.sam -text +*.sam.gz -text diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml new file mode 100644 index 000000000..bf6f5ae53 --- /dev/null +++ b/.github/workflows/windows-build.yml @@ -0,0 +1,41 @@ +name: Windows/MinGW-W64 CI +on: [push, pull_request] + +jobs: + build: + runs-on: windows-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + - name: Set up MSYS2 MinGW-W64 + uses: msys2/setup-msys2@v2 + with: + msystem: mingw64 + update: false + install: >- + mingw-w64-x86_64-autotools + mingw-w64-x86_64-bzip2 + mingw-w64-x86_64-curl + mingw-w64-x86_64-libdeflate + mingw-w64-x86_64-toolchain + mingw-w64-x86_64-tools-git + mingw-w64-x86_64-xz + mingw-w64-x86_64-zlib + - name: Compile htslib + shell: msys2 {0} + run: | + export PATH="/mingw64/bin:$PATH:/c/Program Files/Git/bin" + export MSYSTEM=MINGW64 + autoreconf 
-i + ./configure --enable-werror + make cc-version + make -j6 + - name: Check Htslib + shell: msys2 {0} + run: | + export PATH="/mingw64/bin:$PATH:/c/Program Files/Git/bin" + export MSYSTEM=MINGW64 + make test-shlib-exports && make check + diff --git a/.gitignore b/.gitignore index 8b4d74ca1..817b123d7 100644 --- a/.gitignore +++ b/.gitignore @@ -45,8 +45,9 @@ shlib-exports-*.txt /bgzip /htsfile /tabix +/test/*/FAIL* +/test/bgzf_boundaries/*.tmp.* /test/faidx/*.tmp* -/test/faidx/FAIL* /test/fieldarith /test/hfile /test/hts_endian @@ -56,7 +57,6 @@ shlib-exports-*.txt /test/plugins-dlhts /test/sam /test/tabix/*.tmp.* -/test/tabix/FAIL* /test/test-bcf-sr /test/test-bcf-translate /test/test-bcf_set_variant_type @@ -66,8 +66,10 @@ shlib-exports-*.txt /test/test_index /test/test_introspection /test/test_kfunc +/test/test_khash /test/test_kstring /test/test_mod +/test/test_nibbles /test/test-parse-reg /test/test_realn /test/test-regidx diff --git a/Makefile b/Makefile index 99142c865..ef9b5a9a4 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile for htslib, a C library for high-throughput sequencing data formats. # -# Copyright (C) 2013-2023 Genome Research Ltd. +# Copyright (C) 2013-2024 Genome Research Ltd. 
# # Author: John Marshall # @@ -85,8 +85,10 @@ BUILT_TEST_PROGRAMS = \ test/test_expr \ test/test_faidx \ test/test_kfunc \ + test/test_khash \ test/test_kstring \ test/test_mod \ + test/test_nibbles \ test/test_realn \ test/test-regidx \ test/test_str2int \ @@ -111,8 +113,14 @@ BUILT_THRASH_PROGRAMS = \ test/thrash_threads6 \ test/thrash_threads7 -all: lib-static lib-shared $(BUILT_PROGRAMS) plugins $(BUILT_TEST_PROGRAMS) \ - htslib_static.mk htslib-uninstalled.pc +all: lib-static lib-shared $(BUILT_PROGRAMS) plugins \ + $(BUILT_TEST_PROGRAMS) htslib_static.mk htslib-uninstalled.pc + +# Report compiler and version +cc-version: + -@$(CC) --version 2>/dev/null || true + -@$(CC) --qversion 2>/dev/null || true + -@$(CC) -V 2>/dev/null || true ALL_CPPFLAGS = -I. $(CPPFLAGS) @@ -150,8 +158,8 @@ LIBHTS_SOVERSION = 3 # is not strictly necessary and should be removed the next time # LIBHTS_SOVERSION is bumped (see #1144 and # https://developer.apple.com/library/archive/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html#//apple_ref/doc/uid/TP40002013-SW23) -MACH_O_COMPATIBILITY_VERSION = 3.1.20 -MACH_O_CURRENT_VERSION = 3.1.20 +MACH_O_COMPATIBILITY_VERSION = 3.1.21 +MACH_O_CURRENT_VERSION = 3.1.21 # Force version.h to be remade if $(PACKAGE_VERSION) has changed. 
version.h: $(if $(wildcard version.h),$(if $(findstring "$(PACKAGE_VERSION)",$(shell cat version.h)),,force)) @@ -209,6 +217,7 @@ LIBHTS_OBJS = \ region.o \ sam.o \ sam_mods.o \ + simd.o \ synced_bcf_reader.o \ vcf_sweep.o \ tbx.o \ @@ -278,6 +287,10 @@ config.h: echo '#endif' >> $@ echo '#define HAVE_DRAND48 1' >> $@ echo '#define HAVE_LIBCURL 1' >> $@ + if [ "x$(HTS_HAVE_CPUID)" != "x" ]; then \ + echo '#define HAVE_DECL___CPUID_COUNT 1' >> $@ ; \ + echo '#define HAVE_DECL___GET_CPUID_MAX 1' >> $@ ; \ + fi if [ "x$(HTS_BUILD_SSE4)" != "x" ]; then \ echo '#define HAVE_POPCNT 1' >> $@ ; \ echo '#define HAVE_SSE4_1 1' >> $@ ; \ @@ -292,6 +305,13 @@ config.h: if [ "x$(HTS_BUILD_AVX512)" != "x" ] ; then \ echo '#define HAVE_AVX512 1' >> $@ ; \ fi + echo '#if defined __x86_64__ || defined __arm__ || defined __aarch64__' >> $@ + echo '#define HAVE_ATTRIBUTE_CONSTRUCTOR 1' >> $@ + echo '#endif' >> $@ + echo '#if (defined(__x86_64__) || defined(_M_X64))' >> $@ + echo '#define HAVE_ATTRIBUTE_TARGET 1' >> $@ + echo '#define HAVE_BUILTIN_CPU_SUPPORT_SSSE3 1' >> $@ + echo '#endif' >> $@ # And similarly for htslib.pc.tmp ("pkg-config template"). 
No dependency # on htslib.pc.in listed, as if that file is newer the usual way to regenerate @@ -451,6 +471,7 @@ hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) sam.o sam.pico: sam.c config.h $(fuzz_settings_h) $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) sam_mods.o sam_mods.pico: sam_mods.c config.h $(htslib_sam_h) $(textutils_internal_h) +simd.o simd.pico: simd.c config.h $(htslib_sam_h) $(sam_internal_h) tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h) faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h) $(hts_internal_h) bcf_sr_sort.o bcf_sr_sort.pico: bcf_sr_sort.c config.h $(bcf_sr_sort_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h) @@ -512,10 +533,10 @@ htsfile: htsfile.o libhts.a tabix: tabix.o libhts.a $(CC) $(LDFLAGS) -o $@ tabix.o libhts.a $(LIBS) -lpthread -annot-tsv.o: annot-tsv.c config.h $(htslib_hts_h) $(htslib_hts_defs_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_regidx_h) +annot-tsv.o: annot-tsv.c config.h $(htslib_hts_h) $(htslib_hts_defs_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_regidx_h) $(textutils_internal_h) bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_hfile_h) htsfile.o: htsfile.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) -tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) 
$(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h) +tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h) $(htslib_thread_pool_h) # Runes to check that the htscodecs submodule is present ifdef HTSCODECS_SOURCES @@ -552,7 +573,7 @@ htscodecs/htscodecs/version.h: force vers=`cd $(srcdir)/htscodecs && git describe --always --dirty --match 'v[0-9]\.[0-9]*'` && \ case "$$vers" in \ v*) vers=$${vers#v} ;; \ - *) iv=`awk '/^AC_INIT/ { match($$0, /^AC_INIT\(htscodecs, *([0-9](\.[0-9])*)\)/, m); print substr($$0, m[1, "start"], m[1, "length"]) }' $(srcdir)/htscodecs/configure.ac` ; vers="$$iv$${vers:+-g$$vers}" ;; \ + *) iv=`awk '/^AC_INIT\(htscodecs,/ { match($$0, /[0-9]+(\.[0-9]+)*/); print substr($$0, RSTART, RLENGTH) }' $(srcdir)/htscodecs/configure.ac` ; vers="$$iv$${vers:+-g$$vers}" ;; \ esac ; \ if ! 
grep -s -q '"'"$$vers"'"' $@ ; then \ echo 'Updating $@ : #define HTSCODECS_VERSION_TEXT "'"$$vers"'"' ; \ @@ -591,7 +612,9 @@ check test: all $(HTSCODECS_TEST_TARGETS) test/hts_endian test/test_expr test/test_kfunc + test/test_khash test/test_kstring + test/test_nibbles -v test/test_str2int test/test_time_funcs test/fieldarith test/fieldarith.sam @@ -643,23 +666,29 @@ test/sam: test/sam.o libhts.a $(CC) $(LDFLAGS) -o $@ test/sam.o libhts.a $(LIBS) -lpthread test/test_bgzf: test/test_bgzf.o libhts.a - $(CC) $(LDFLAGS) -o $@ test/test_bgzf.o libhts.a -lz $(LIBS) -lpthread + $(CC) $(LDFLAGS) -o $@ test/test_bgzf.o libhts.a $(LIBS) -lpthread test/test_expr: test/test_expr.o libhts.a - $(CC) $(LDFLAGS) -o $@ test/test_expr.o libhts.a -lz $(LIBS) -lpthread + $(CC) $(LDFLAGS) -o $@ test/test_expr.o libhts.a $(LIBS) -lpthread test/test_faidx: test/test_faidx.o libhts.a - $(CC) $(LDFLAGS) -o $@ test/test_faidx.o libhts.a -lz $(LIBS) -lpthread + $(CC) $(LDFLAGS) -o $@ test/test_faidx.o libhts.a $(LIBS) -lpthread test/test_kfunc: test/test_kfunc.o libhts.a - $(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a -lz $(LIBS) -lpthread + $(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a $(LIBS) -lpthread + +test/test_khash: test/test_khash.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_khash.o libhts.a $(LIBS) -lpthread test/test_kstring: test/test_kstring.o libhts.a - $(CC) $(LDFLAGS) -o $@ test/test_kstring.o libhts.a -lz $(LIBS) -lpthread + $(CC) $(LDFLAGS) -o $@ test/test_kstring.o libhts.a $(LIBS) -lpthread test/test_mod: test/test_mod.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_mod.o libhts.a $(LIBS) -lpthread +test/test_nibbles: test/test_nibbles.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_nibbles.o libhts.a $(LIBS) -lpthread + test/test_realn: test/test_realn.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_realn.o libhts.a $(LIBS) -lpthread @@ -688,10 +717,10 @@ test/test-vcf-sweep: test/test-vcf-sweep.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test-vcf-sweep.o libhts.a $(LIBS) 
-lpthread test/test-bcf-sr: test/test-bcf-sr.o libhts.a - $(CC) $(LDFLAGS) -o $@ test/test-bcf-sr.o libhts.a -lz $(LIBS) -lpthread + $(CC) $(LDFLAGS) -o $@ test/test-bcf-sr.o libhts.a $(LIBS) -lpthread test/test-bcf-translate: test/test-bcf-translate.o libhts.a - $(CC) $(LDFLAGS) -o $@ test/test-bcf-translate.o libhts.a -lz $(LIBS) -lpthread + $(CC) $(LDFLAGS) -o $@ test/test-bcf-translate.o libhts.a $(LIBS) -lpthread test/test_introspection: test/test_introspection.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_introspection.o libhts.a $(LIBS) -lpthread @@ -760,8 +789,10 @@ test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_fa test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(hfile_internal_h) test/test_expr.o: test/test_expr.c config.h $(htslib_hts_expr_h) test/test_kfunc.o: test/test_kfunc.c config.h $(htslib_kfunc_h) +test/test_khash.o: test/test_khash.c config.h $(htslib_khash_h) $(htslib_kroundup_h) test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h) test/test_mod.o: test/test_mod.c config.h $(htslib_sam_h) +test/test_nibbles.o: test/test_nibbles.c config.h $(htslib_sam_h) $(sam_internal_h) test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_sam_h) test/test_realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h) test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(textutils_internal_h) @@ -784,25 +815,25 @@ test/usepublic.o: test/usepublic.cpp config.h $(htslib_bgzf_h) $(htslib_cram_h) test/thrash_threads1: test/thrash_threads1.o libhts.a - $(CC) $(LDFLAGS) -o $@ test/thrash_threads1.o libhts.a -lz $(LIBS) -lpthread + $(CC) $(LDFLAGS) -o $@ test/thrash_threads1.o libhts.a $(LIBS) -lpthread test/thrash_threads2: test/thrash_threads2.o libhts.a - $(CC) $(LDFLAGS) -o $@ test/thrash_threads2.o libhts.a -lz $(LIBS) -lpthread + $(CC) $(LDFLAGS) -o $@ 
test/thrash_threads2.o libhts.a $(LIBS) -lpthread test/thrash_threads3: test/thrash_threads3.o libhts.a - $(CC) $(LDFLAGS) -o $@ test/thrash_threads3.o libhts.a -lz $(LIBS) -lpthread + $(CC) $(LDFLAGS) -o $@ test/thrash_threads3.o libhts.a $(LIBS) -lpthread test/thrash_threads4: test/thrash_threads4.o libhts.a - $(CC) $(LDFLAGS) -o $@ test/thrash_threads4.o libhts.a -lz $(LIBS) -lpthread + $(CC) $(LDFLAGS) -o $@ test/thrash_threads4.o libhts.a $(LIBS) -lpthread test/thrash_threads5: test/thrash_threads5.o libhts.a - $(CC) $(LDFLAGS) -o $@ test/thrash_threads5.o libhts.a -lz $(LIBS) -lpthread + $(CC) $(LDFLAGS) -o $@ test/thrash_threads5.o libhts.a $(LIBS) -lpthread test/thrash_threads6: test/thrash_threads6.o libhts.a - $(CC) $(LDFLAGS) -o $@ test/thrash_threads6.o libhts.a -lz $(LIBS) -lpthread + $(CC) $(LDFLAGS) -o $@ test/thrash_threads6.o libhts.a $(LIBS) -lpthread test/thrash_threads7: test/thrash_threads7.o libhts.a - $(CC) $(LDFLAGS) -o $@ test/thrash_threads7.o libhts.a -lz $(LIBS) -lpthread + $(CC) $(LDFLAGS) -o $@ test/thrash_threads7.o libhts.a $(LIBS) -lpthread test_thrash: $(BUILT_THRASH_PROGRAMS) @@ -905,8 +936,9 @@ htslib-uninstalled.pc: htslib.pc.tmp testclean: - -rm -f test/*.tmp test/*.tmp.* test/faidx/*.tmp* test/faidx/FAIL* \ - test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* \ + -rm -f test/*.tmp test/*.tmp.* test/faidx/*.tmp* \ + test/longrefs/*.tmp.* test/tabix/*.tmp.* \ + test/bgzf_boundaries/*.tmp.* test/*/FAIL* \ header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt -rm -rf htscodecs/tests/test.out @@ -970,3 +1002,4 @@ force: .PHONY: clean-dylib install-dylib .PHONY: test_htscodecs_rans4x8 test_htscodecs_rans4x16 test_htscodecs_arith .PHONY: test_htscodecs_tok3 test_htscodecs_fqzcomp test_htscodecs_varint +.PHONY: cc-version diff --git a/NEWS b/NEWS index 83dcaa5b9..8825c30d1 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,135 @@ +Noteworthy changes in release 1.21 (12th September 2024) 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The primary user-visible changes in this release are updates to the +annot-tsv tool and some speed improvements. Full details of other +changes and bugs fixed are below. + +Notice: this is the last SAMtools / HTSlib release where CRAM 3.0 will be +the default CRAM version. From the next we will change to CRAM 3.1 +unless the version is explicitly specified, for example using +"samtools view -O cram,version=3.0". + + +Updates +------- + +* Extend annot-tsv with several new command line options. + --delim permits use of other delimiters. + --headers for selection of other header formats. + --no-header-idx to suppress column index numbers in header. + Also removed -h as it is now short for --headers. Note --help + still works. (PR #1779) + +* Allow annot-tsv -a to rename annotations. (PR #1709) + +* Extend annot-tsv --overlap to be able to specify the overlap + fraction separately for source and target. (PR #1811) + +* Added new APIs to facilitate low-level CRAM container manipulations, + used by the new "samtools cat" region filtering code. Functions are: + cram_container_get_coords() + cram_filter_container() + cram_index_extents() + cram_container_num2offset() + cram_container_offset2num() + cram_num_containers() + cram_num_containers_between() + Also improved cram_index_query() to cope with HTS_IDX_NOCOOR regions. + (PR #1771) + +* Bgzip now retains file modification and access times when + compressing and decompressing. (PR #1727, fixes #1718. Requested by + Gert Hulselmans.) + +* Use FNV1a for string hashing in khash. The old algorithm was + particularly weak with base-64 style strings and led to a large + number of collisions. (PR #1806. Fixes samtools/samtools#2066, + reported by Hans-Joachim Ruscheweyh) + +* Improve the speed of the nibble2base() function on Intel (PR + #1667, PR #1764, PR #1786, PR #1802, thanks to Ruben Vorderman) and + ARM (PR #1795, thanks to John Marshall). 
+ +* bgzf_getline() will now warn if it encounters UTF-16 data. + (PR #1487, thanks to John Marshall) + +* Speed up bgzf_read(). While this does not reduce CPU significantly, + it does increase the maximum parallelism available permitting 10-15% + faster decoding. (PR #1772, PR #1800, Issue #1798) + +* Speed up faidx by use of better isgraph methods (PR #1797) and + whole-line reading (PR #1799, thanks to John Marshall). + +* Speed up kputll() function, speeding up BAM -> SAM conversion by + about 5% and also samtools depth. (PR #1805) + +* Added more example code, covering fasta/fastq indexing, tabix + indexing and use of the thread pool. (PR #1666) + +Build Changes +------------- + +* Code warning fixes for pedantic compilers (PR #1777) and avoid + some undefined behaviour (PR #1810, PR #1816, PR #1828). + +* Windows based CI has been migrated from AppVeyor to GitHub Actions. + (PR #1796, PR #1803, PR #1808) + +* Miscellaneous minor build infrastructure and code fixes. + (PR #1807, PR #1829, both thanks to John Marshall) + +* Updated htscodecs submodule to version 1.6.1 (PR #1828) + +* Fixed an awk script in the Makefile that only worked with gawk. (PR #1831) + +Bug fixes +--------- + +* Fix small OSS-Fuzz reported issues with CRAM encoding and long + CIGARS and/or illegal positions. (PR #1775, PR #1801, PR #1817) + +* Fix issues with on-the-fly indexing of VCF/BCF (bcftools --write-index) + when not using multiple threads. (PR #1837. Fixes samtools/bcftools#2267, + reported by Giulio Genovese) + +* Stricter limits on POS / MPOS / TLEN in sam_parse1(). This fixes + a signed overflow reported by OSS-Fuzz and should help prevent other + as-yet undetected bugs. (PR #1812) + +* Check that the underlying file open worked for preload: URLs. Fixes + a NULL pointer dereference reported by OSS-Fuzz. (PR #1821) + +* Fix an infinite loop in hts_itr_query() when given extremely large + positions which cause integer overflow. 
Also adds hts_bin_maxpos() + and hts_idx_maxpos() functions. + (PR #1774, thanks to John Marshall and reported by Jesus Alberto + Munoz Mesa) + +* Fix an out of bounds read in hts_itr_multi_next() when switching + chromosomes. This bug is present in releases 1.11 to 1.20. + (PR #1788. Fixes samtools/samtools#2063, reported by acorvelo) + +* Work around parsing problems with colons in CHROM names. + Fixes samtools/bcftools#2139. (PR #1781, John Marshall / James Bonfield) + +* Correct the CPU detection for Mac OS X 10.7. cpuid is used by + htscodecs (see samtools/htscodecs#116), and the corresponding + changes in htslib are PR #1785. Reported by Ryan Carsten Schmidt. + +* Make BAM zero-length intervals work the same as CRAM; permitted and + returning overlapping records. (PR #1787. Fixes + samtools/samtools#2060, reported by acorvelo) + +* Replace assert() with abort() in BCF synced reader. This is not an + ideal solution, but it gives consistent behaviour when compiling + with or without NDEBUG. (PR #1791, thanks to Martin Pollard) + +* Fixed failure to change the write block size on compressed SAM or VCF + files due to an internal type confusion. 
(PR #1826) + +* Fixed an out-of-bounds read in cram_codec_iter_next() (PR #1832) + Noteworthy changes in release 1.20 (15th April 2024) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/README.md b/README.md index 47afdba2a..2906855ba 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -[![Build Status](https://api.cirrus-ci.com/github/samtools/htslib.svg?branch=develop)](https://api.cirrus-ci.com/github/samtools/htslib) -[![Build status](https://ci.appveyor.com/api/projects/status/v46hkwyfjp3l8nd3/branch/develop?svg=true)](https://ci.appveyor.com/project/samtools/htslib/branch/develop) +[![Build Status](https://api.cirrus-ci.com/github/samtools/htslib.svg?branch=develop)](https://cirrus-ci.com/github/samtools/htslib) +[![Build status](https://github.com/samtools/htslib/actions/workflows/windows-build.yml/badge.svg)](https://github.com/samtools/htslib/actions/workflows/windows-build.yml?query=branch%3Adevelop) [![Github All Releases](https://img.shields.io/github/downloads/samtools/htslib/total.svg)](https://github.com/samtools/htslib) HTSlib is an implementation of a unified C library for accessing common file diff --git a/annot-tsv.1 b/annot-tsv.1 index df3b06e91..3a6034b11 100644 --- a/annot-tsv.1 +++ b/annot-tsv.1 @@ -1,7 +1,7 @@ '\" t -.TH annot-tsv 1 "15 April 2024" "htslib-1.20" "Bioinformatics tools" +.TH annot-tsv 1 "12 September 2024" "htslib-1.21" "Bioinformatics tools" .\" -.\" Copyright (C) 2015, 2017-2018, 2023 Genome Research Ltd. +.\" Copyright (C) 2015, 2017-2018, 2023-2024 Genome Research Ltd. 
.\" .\" Author: Petr Danecek .\" @@ -108,6 +108,11 @@ Target file to be extend with annotations from Add the same annotations multiple times if multiple overlaps are found .RE .PP +.B \-\-help +.RS 4 +This help message +.RE +.PP .BR \-\-max\-annots " INT" .RS 4 Add at most INT annotations per column to save time when many overlaps are found with a single region @@ -138,18 +143,42 @@ number of source base pairs in the overlap .RE .RE .PP +.BR \-d ", " \-\-delim " SRC:TGT" +.RS 4 +Column delimiter in the source and the target file. For example, if both files are comma-delimited, run with +"--delim ,:," or simply "--delim ,". If the source file is comma-delimited and the target file is tab-delimited, +run with "-d $',:\\t'". +.RE +.PP +.BR \-h ", " \-\-headers " SRC:TGT" +.RS 4 +Line number of the header row with column names. By default the first line is interpreted as the header if it starts with the comment +character ("#"); otherwise numeric indices are expected. However, if the first line does not start with "#" but still +contains the column names, use "--headers 1:1". To ignore an existing header (skip comment lines) and use numeric indices, +use "--headers 0:0" which is equivalent to "--ignore-headers". When a negative value is given, it is interpreted as the number of +lines from the end of the comment block. Specifically, "--headers -1" takes the column names from the last line of +the comment block (e.g., the "#CHROM" line in the VCF format). +.RE +.PP .BR \-H ", " \-\-ignore\-headers .RS 4 Ignore the headers completely and use numeric indexes even when a header exists .RE .PP -.BR \-O ", " \-\-overlap " FLOAT" +.BR \-I ", " \-\-no\-hdr\-idx +.RS 4 +Suppress index numbers in the printed header. If given twice, drop the entire header. +.RE +.PP +.BR \-O ", " \-\-overlap " FLOAT,[FLOAT]" .RS 4 -Minimum overlap as a fraction of region length in at least one of the overlapping regions. 
If also +Minimum overlap as a fraction of region length in SRC and TGT, respectively (with two numbers), or in +at least one of the overlapping regions (with a single number). If also .BR \-r ", " \-\-reciprocal is given, require at least .I FLOAT -overlap with respect to both regions +overlap with respect to both regions. Two identical numbers are equivalent to running with +.BR \-r ", " \-\-reciprocal .RE .PP .BR \-r ", " \-\-reciprocal diff --git a/annot-tsv.c b/annot-tsv.c index 4661e6e0f..494c43744 100644 --- a/annot-tsv.c +++ b/annot-tsv.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2018-2023 Genome Research Ltd. + Copyright (C) 2018-2024 Genome Research Ltd. Author: Petr Danecek @@ -44,6 +44,7 @@ #include "htslib/kseq.h" #include "htslib/bgzf.h" #include "htslib/regidx.h" +#include "textutils_internal.h" #define ANN_NBP 1 #define ANN_FRAC 2 @@ -71,6 +72,7 @@ typedef struct cols_t *core, *match, *transfer, *annots; int *core_idx, *match_idx, *transfer_idx, *annots_idx; int *nannots_added; // for --max-annots: the number of annotations added + char delim; int grow_n; kstring_t line; // one buffered line, a byproduct of reading the header htsFile *fp; @@ -100,11 +102,11 @@ typedef struct { nbp_t *nbp; dat_t dst, src; - char *core_str, *match_str, *transfer_str, *annots_str; + char *core_str, *match_str, *transfer_str, *annots_str, *headers_str, *delim_str; char *temp_dir, *out_fname; BGZF *out_fp; - int allow_dups, reciprocal, ignore_headers, max_annots, mode; - double overlap; + int allow_dups, max_annots, mode, no_write_hdr, overlap_either; + double overlap_src, overlap_dst; regidx_t *idx; regitr_t *itr; kstring_t tmp_kstr; @@ -282,7 +284,7 @@ int parse_tab_with_payload(const char *line, char **chr_beg, char **chr_end, hts dat_t *dat = (dat_t*) usr; - cols_t *cols = cols_split(line, NULL, '\t'); + cols_t *cols = cols_split(line, NULL, dat->delim); *((cols_t**)payload) = cols; if ( cols->n < dat->core_idx[0] ) error("Expected at least %d columns, found %d: 
%s\n",dat->core_idx[0]+1,cols->n,line); @@ -315,86 +317,136 @@ void free_payload(void *payload) cols_destroy(cols); } -// Parse header if present (first line has a leading #) or create a dummy header with -// numeric column names. If dummy is set, read first data line (without a leading #) -// and create a dummy header. -void parse_header(dat_t *dat, char *fname, int dummy) +// Parse header if present, the parameter nth_row indicates the header row line number: +// 0 .. ignore headers, create numeric field names, 1-based indices +// N>0 .. N-th line, all previous lines are discarded +// N<0 .. N-th line from the end of the comment block (comment lines are prefixed with #), +// all preceding lines are discarded. +// When autodetect is set, the argument nth_row is ignored. +// Note this makes no attempt to preserve comment lines on output +void parse_header(dat_t *dat, char *fname, int nth_row, int autodetect) { dat->fp = hts_open(fname,"r"); if ( !dat->fp ) error("Failed to open: %s\n", fname); + // buffer comment lines when N<0 + int nbuf = 0; + char **buf = NULL; + if ( nth_row < 0 ) + { + buf = calloc(-nth_row,sizeof(*buf)); + if ( !buf ) error("Out of memory, failed to allocate %zu bytes\n",(-nth_row)*sizeof(*buf)); + } + + int irow = 0; cols_t *cols = NULL; while ( hts_getline(dat->fp, KS_SEP_LINE, &dat->line) > 0 ) { - if ( dat->line.s[0]=='#' ) + if ( autodetect ) + { + // if the first line is a comment line, use it as a header. Otherwise go + // with numeric indices + nth_row = dat->line.s[0]=='#' ? 1 : 0; + break; + } + if ( nth_row==0 ) + { + // N=0 .. comment lines to be ignored, read until we get to the first data line + if ( dat->line.s[0]=='#' ) continue; + break; + } + if ( nth_row>0 ) { - // this is a header or comment line - if ( dummy ) continue; - cols = cols_split(dat->line.s, NULL, '\t'); + // N>0 .. regardless of this being a comment or data line, read until Nth line + if ( ++irow < nth_row ) continue; break; } + // N<0 .. 
keep abs(N) comment lines in a sliding buffer + if ( dat->line.s[0]!='#' ) break; // data line + if ( nbuf == -nth_row ) + { + // one more comment line and the buffer is full. We could use round buffer + // for efficiency, but the assumption is abs(nth_row) is small + free(buf[0]); + memmove(buf, &buf[1], (nbuf-1)*sizeof(*buf)); + nbuf--; + } + buf[nbuf++] = strdup(dat->line.s); + } + + int keep_line = 0; + if ( nth_row < 0 ) + { + if ( nbuf!=-nth_row ) + error("Found %d header lines in %s, cannot fetch N=%d from the end\n",nbuf,fname,-nth_row); + cols = cols_split(buf[0], NULL, dat->delim); + keep_line = 1; + } + else + cols = cols_split(dat->line.s, NULL, dat->delim); - // this a data line, we must be in a dummy mode - cols = cols_split(dat->line.s, NULL, '\t'); - assert(cols && cols->n); - assert(cols->off[0][0] != '#'); + if ( !dat->line.l ) error("Failed to read: %s\n", fname); + assert(cols && cols->n); + if ( nth_row == 0 ) // create numeric indices + { // create a dummy header with numeric field names kstring_t str = {0,0,0}; int i, n = cols->n; for (i=0; i0 ) kputc('\t', &str); + if ( i>0 ) kputc(dat->delim, &str); kputw(i+1, &str); } cols_destroy(cols); - cols = cols_split(str.s, NULL, '\t'); + cols = cols_split(str.s, NULL, dat->delim); free(str.s); dat->hdr.dummy = 1; - - break; + keep_line = 1; } - if ( !dat->line.l ) error("Failed to read: %s\n", fname); - assert(cols && cols->n); dat->hdr.name2idx = khash_str2int_init(); int i; for (i=0; in; i++) { char *ss = cols->off[i]; - while ( *ss && (*ss=='#' || isspace(*ss)) ) ss++; + while ( *ss && (*ss=='#' || isspace_c(*ss)) ) ss++; if ( !*ss ) error("Could not parse the header field \"%s\": %s\n", cols->off[i],dat->line.s); if ( *ss=='[' ) { char *se = ss+1; - while ( *se && isdigit(*se) ) se++; + while ( *se && isdigit_c(*se) ) se++; if ( *se==']' ) ss = se + 1; } - while ( *ss && (*ss=='#' || isspace(*ss)) ) ss++; + while ( *ss && (*ss=='#' || isspace_c(*ss)) ) ss++; if ( !*ss ) error("Could not parse 
the header field \"%s\": %s\n", cols->off[i],dat->line.s); cols->off[i] = ss; khash_str2int_set(dat->hdr.name2idx, cols->off[i], i); } dat->hdr.cols = cols; - if ( !dat->hdr.dummy ) dat->line.l = 0; + if ( !keep_line ) dat->line.l = 0; + + for (i=0; ihdr.dummy ) return; + if ( args->no_write_hdr>1 ) return; int i; kstring_t str = {0,0,0}; kputc('#', &str); for (i=0; ihdr.cols->n; i++) { - if ( i>0 ) kputc('\t', &str); - ksprintf(&str,"[%d]", i+1); + if ( i>0 ) kputc(dat->delim, &str); + if ( !args->no_write_hdr ) ksprintf(&str,"[%d]", i+1); kputs(dat->hdr.cols->off[i], &str); } if ( dat->hdr.annots ) { for (i=0; ihdr.annots->n; i++) { - if ( str.l > 1 ) kputc('\t', &str); + if ( str.l > 1 ) kputc(dat->delim, &str); kputs(dat->hdr.annots->off[i], &str); } } @@ -434,8 +486,30 @@ void sanity_check_columns(char *fname, hdr_t *hdr, cols_t *cols, int **col2idx, } void init_data(args_t *args) { - parse_header(&args->dst, args->dst.fname, args->ignore_headers); - parse_header(&args->src, args->src.fname, args->ignore_headers); + if ( !args->delim_str ) + args->dst.delim = args->src.delim = '\t'; + else if ( strlen(args->delim_str)==1 ) + args->dst.delim = args->src.delim = *args->delim_str; + else if ( strlen(args->delim_str)==3 && args->delim_str[1]==':' ) + args->src.delim = args->delim_str[0], args->dst.delim = args->delim_str[2]; + else + error("Could not parse the option --delim %s\n",args->delim_str); + + // --headers, determine header row index + int isrc = 0, idst = 0, autodetect = 1; + if ( args->headers_str ) + { + cols_t *tmp = cols_split(args->headers_str, NULL, ':'); + char *rmme; + isrc = strtol(tmp->off[0],&rmme,10); + if ( *rmme || tmp->off[0]==rmme ) error("Could not parse the option --headers %s\n",args->headers_str); + idst = strtol(tmp->n==2 ? tmp->off[1] : tmp->off[0],&rmme,10); + if ( *rmme || (tmp->n==2 ? 
tmp->off[1] : tmp->off[0])==rmme ) error("Could not parse the option --headers %s\n",args->headers_str); + cols_destroy(tmp); + autodetect = 0; + } + parse_header(&args->dst, args->dst.fname, idst, autodetect); + parse_header(&args->src, args->src.fname, isrc, autodetect); // -c, core columns if ( !args->core_str ) args->core_str = "chr,beg,end:chr,beg,end"; @@ -608,17 +682,17 @@ static void write_annots(args_t *args) { if ( args->dst.annots_idx[i]==ANN_NBP ) { - kputc('\t',&args->tmp_kstr); + kputc(args->dst.delim,&args->tmp_kstr); kputw(len,&args->tmp_kstr); } else if ( args->dst.annots_idx[i]==ANN_FRAC ) { - kputc('\t',&args->tmp_kstr); + kputc(args->dst.delim,&args->tmp_kstr); kputd((double)len/(args->nbp->end - args->nbp->beg + 1),&args->tmp_kstr); } else if ( args->dst.annots_idx[i]==ANN_CNT ) { - kputc('\t',&args->tmp_kstr); + kputc(args->dst.delim,&args->tmp_kstr); kputw(args->nbp->n/2,&args->tmp_kstr); } } @@ -662,18 +736,20 @@ void process_line(args_t *args, char *line, size_t size) int has_match = 0, annot_len = 0; while ( regitr_overlap(args->itr) ) { - if ( args->overlap ) + if ( args->overlap_src || args->overlap_dst ) { - double len1 = end - beg + 1; - double len2 = args->itr->end - args->itr->beg + 1; + double len_dst = end - beg + 1; + double len_src = args->itr->end - args->itr->beg + 1; double isec = (args->itr->end < end ? args->itr->end : end) - (args->itr->beg > beg ? args->itr->beg : beg) + 1; - if ( args->reciprocal ) + int pass_dst = isec/len_dst < args->overlap_dst ? 0 : 1; + int pass_src = isec/len_src < args->overlap_src ? 
0 : 1; + if ( args->overlap_either ) { - if ( isec/len1 < args->overlap || isec/len2 < args->overlap ) continue; + if ( !pass_dst && !pass_src ) continue; } else { - if ( isec/len1 < args->overlap && isec/len2 < args->overlap ) continue; + if ( !pass_dst || !pass_src ) continue; } } cols_t *src_cols = regitr_payload(args->itr,cols_t*); @@ -758,7 +834,7 @@ void process_line(args_t *args, char *line, size_t size) write_string(args, dst_cols->off[0], 0); for (i=1; in; i++) { - write_string(args, "\t", 1); + write_string(args, &args->dst.delim, 1); write_string(args, dst_cols->off[i], 0); } write_annots(args); @@ -796,6 +872,7 @@ static const char *usage_text(void) "\n" "Other options:\n" " --allow-dups Add annotations multiple times\n" + " --help This help message\n" " --max-annots INT Adding at most INT annotations per column to save\n" " time in big regions\n" " --version Print version string and exit\n" @@ -804,9 +881,15 @@ static const char *usage_text(void) " frac .. fraction of the target region with an\n" " overlap\n" " nbp .. number of source base pairs in the overlap\n" - " -H, --ignore-headers Use numeric indexes, ignore the headers completely\n" - " -O, --overlap FLOAT Minimum required overlap (non-reciprocal, unless -r\n" - " is given)\n" + " -d, --delim SRC:TGT Column delimiter in SRC and TGT file\n" + " -h, --headers SRC:TGT Header row line number, 0:0 is equivalent to -H, negative\n" + " value counts from the end of comment line block [1:1]\n" + " -H, --ignore-headers Use numeric indices, ignore the headers completely\n" + " -I, --no-header-idx Suppress index numbers in the printed header. 
If given\n" + " twice, drop the entire header\n" + " -O, --overlap FLOAT[,FLOAT] Minimum required overlap with respect to SRC,TGT.\n" + " If single value, the bigger overlap is considered.\n" + " Identical values are equivalent to running with -r.\n" " -r, --reciprocal Apply the -O requirement to both overlapping\n" " intervals\n" " -x, --drop-overlaps Drop overlapping regions (precludes -f)\n" @@ -847,18 +930,22 @@ int main(int argc, char **argv) {"target-file",required_argument,NULL,'t'}, {"allow-dups",no_argument,NULL,0}, {"max-annots",required_argument,NULL,2}, + {"no-header-idx",required_argument,NULL,'I'}, {"version",no_argument,NULL,1}, {"annotate",required_argument,NULL,'a'}, + {"headers",no_argument,NULL,'h'}, {"ignore-headers",no_argument,NULL,'H'}, {"overlap",required_argument,NULL,'O'}, {"reciprocal",no_argument,NULL,'r'}, {"drop-overlaps",no_argument,NULL,'x'}, - {"help",no_argument,NULL,'h'}, + {"delim",required_argument,NULL,'d'}, + {"help",no_argument,NULL,4}, {NULL,0,NULL,0} }; char *tmp = NULL; int c; - while ((c = getopt_long(argc, argv, "hc:f:m:o:s:t:a:HO:rx",loptions,NULL)) >= 0) + int reciprocal = 0; + while ((c = getopt_long(argc, argv, "c:f:m:o:s:t:a:HO:rxh:Id:",loptions,NULL)) >= 0) { switch (c) { @@ -873,22 +960,33 @@ int main(int argc, char **argv) args->max_annots = strtod(optarg, &tmp); if ( tmp==optarg || *tmp ) error("Could not parse --max-annots %s\n", optarg); break; - case 'H': args->ignore_headers = 1; break; - case 'r': args->reciprocal = 1; break; + case 'I': args->no_write_hdr++; break; + case 'd': args->delim_str = optarg; break; + case 'h': args->headers_str = optarg; break; + case 'H': args->headers_str = "0:0"; break; + case 'r': reciprocal = 1; break; case 'c': args->core_str = optarg; break; case 't': args->dst.fname = optarg; break; case 'm': args->match_str = optarg; break; case 'a': args->annots_str = optarg; break; case 'o': args->out_fname = optarg; break; case 'O': - args->overlap = strtod(optarg, &tmp); - if ( 
tmp==optarg || *tmp ) error("Could not parse --overlap %s\n", optarg); - if ( args->overlap<0 || args->overlap>1 ) error("Expected value from the interval [0,1]: --overlap %s\n", optarg); + args->overlap_src = strtod(optarg, &tmp); + if ( tmp==optarg || (*tmp && *tmp!=',') ) error("Could not parse --overlap %s\n", optarg); + if ( args->overlap_src<0 || args->overlap_src>1 ) error("Expected value(s) from the interval [0,1]: --overlap %s\n", optarg); + if ( *tmp ) + { + args->overlap_dst = strtod(tmp+1, &tmp); + if ( *tmp ) error("Could not parse --overlap %s\n", optarg); + if ( args->overlap_dst<0 || args->overlap_dst>1 ) error("Expected value(s) from the interval [0,1]: --overlap %s\n", optarg); + } + else + args->overlap_either = 1; break; case 's': args->src.fname = optarg; break; case 'f': args->transfer_str = optarg; break; case 'x': args->mode = PRINT_NONMATCHING; break; - case 'h': printf("\nVersion: %s\n%s\n",hts_version(),usage_text()); exit(EXIT_SUCCESS); break; + case 4 : printf("\nVersion: %s\n%s\n",hts_version(),usage_text()); exit(EXIT_SUCCESS); break; case '?': // fall through default: error("\nVersion: %s\n%s\n",hts_version(),usage_text()); break; } @@ -908,13 +1006,27 @@ int main(int argc, char **argv) else args->mode = PRINT_MATCHING|PRINT_NONMATCHING; } if ( (args->transfer_str || args->annots_str) && !(args->mode & PRINT_MATCHING) ) error("The option -x cannot be combined with -f and -a\n"); + if ( reciprocal ) + { + if ( args->overlap_dst && args->overlap_src && args->overlap_dst!=args->overlap_src ) + error("The combination of --reciprocal with --overlap %f,%f makes no sense: expected single value or identical values\n",args->overlap_src,args->overlap_dst); + if ( !args->overlap_src ) + args->overlap_src = args->overlap_dst; + else + args->overlap_dst = args->overlap_src; + args->overlap_either = 0; + } init_data(args); write_header(args, &args->dst); while ( read_next_line(&args->dst) ) { int i; - for (i=0; idst.grow_n; i++) kputs("\t.", 
&args->dst.line); + for (i=0; idst.grow_n; i++) + { + kputc(args->dst.delim, &args->dst.line); + kputc('.', &args->dst.line); + } process_line(args, args->dst.line.s, args->dst.line.l); args->dst.line.l = 0; } diff --git a/bgzip.1 b/bgzip.1 index fe4225b43..1e115d044 100644 --- a/bgzip.1 +++ b/bgzip.1 @@ -1,4 +1,4 @@ -.TH bgzip 1 "15 April 2024" "htslib-1.20" "Bioinformatics tools" +.TH bgzip 1 "12 September 2024" "htslib-1.21" "Bioinformatics tools" .SH NAME .PP bgzip \- Block compression/decompression utility diff --git a/bgzip.c b/bgzip.c index 129343fb5..687b29d47 100644 --- a/bgzip.c +++ b/bgzip.c @@ -48,7 +48,7 @@ static const int WINDOW_SIZE = BGZF_BLOCK_SIZE; -static void error(const char *format, ...) +static void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...) { va_list ap; va_start(ap, format); @@ -57,7 +57,7 @@ static void error(const char *format, ...) exit(EXIT_FAILURE); } -static int ask_yn() +static int ask_yn(void) { char line[1024]; if (fgets(line, sizeof line, stdin) == NULL) @@ -362,8 +362,7 @@ int main(int argc, char **argv) } else { ret = 2; //explicit N - no overwrite, continue and return 2 - if (hclose(f_src) < 0) - ; //ignoring return value + hclose_abruptly(f_src); free(name); continue; } @@ -689,7 +688,7 @@ int main(int argc, char **argv) if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) error("Could not load index: %s.gzi\n", argv[optind]); } - if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start); + if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %ld-th (uncompressd) byte\n", start); } if (threads > 1) diff --git a/configure.ac b/configure.ac index 49f2cbc70..87e928d47 100644 --- a/configure.ac +++ b/configure.ac @@ -1,6 +1,6 @@ # Configure script for htslib, a C library for high-throughput sequencing data. # -# Copyright (C) 2015-2023 Genome Research Ltd. +# Copyright (C) 2015-2024 Genome Research Ltd. 
# # Author: John Marshall # @@ -35,7 +35,7 @@ m4_include([m4/hts_hide_dynamic_syms.m4]) m4_include([m4/pkg.m4]) dnl Copyright notice to be copied into the generated configure script -AC_COPYRIGHT([Portions copyright (C) 2020-2023 Genome Research Ltd. +AC_COPYRIGHT([Portions copyright (C) 2020-2024 Genome Research Ltd. This configure script is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law.]) @@ -82,6 +82,14 @@ AC_CHECK_DECL([_XOPEN_SOURCE], [], [AC_DEFINE([_XOPEN_SOURCE], [600], [Specify X/Open requirements])], []) +dnl Check that we have cpuid, and if so run the x86 SIMD checks +AC_CHECK_DECLS([__get_cpuid_max, __cpuid_count], [ + hts_have_cpuid=yes +], [ + hts_have_cpuid=no +], [[#include ]]) + +AS_IF(test "x$hts_have_cpuid" = "xyes", [ dnl Options for rANS32x16 sse4.1 version - sse4.1 HTS_CHECK_COMPILE_FLAGS_NEEDED([sse4.1], [-msse4.1 -mssse3 -mpopcnt], [AC_LANG_PROGRAM([[ @@ -100,6 +108,7 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([sse4.1], [-msse4.1 -mssse3 -mpopcnt], AC_DEFINE([HAVE_POPCNT],1,[Defined to 1 if rANS source using popcnt can be compiled.]) AC_DEFINE([HAVE_SSE4_1],1,[Defined to 1 if rANS source using SSE4.1 can be compiled. 
]) + dnl Propagate HTSlib's unaligned access preference to htscodecs AH_VERBATIM([UBSAN],[ /* Prevent unaligned access in htscodecs SSE4 rANS codec */ @@ -139,7 +148,9 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f -mpopcnt], #ifdef __x86_64__ __m512i a = _mm512_set1_epi32(1); __m512i b = _mm512_add_epi32(a, a); - return _mm_popcnt_u32(*((char *) &b)); + __m256i c = _mm512_castsi512_si256(b); + __m256i d = _mm512_extracti64x4_epi64(a, 1); + return _mm_popcnt_u32(*((char *) &c)) + (*(char *) &d); #endif ]])], [ hts_cflags_avx512="$flags_needed" @@ -148,6 +159,37 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f -mpopcnt], AC_DEFINE([HAVE_AVX512],1,[Defined to 1 if rANS source using AVX512F can be compiled.]) ]) +dnl Check for working __builtin_cpu_supports (ssse3 is broken on some clangs) +AC_MSG_CHECKING([for working __builtin_cpu_supports("ssse3")]) +AC_LINK_IFELSE([AC_LANG_PROGRAM([],[ + if (__builtin_cpu_supports("ssse3")) { + return 0; + } +])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_BUILTIN_CPU_SUPPORT_SSSE3], 1, + [Defined to 1 if __builtin_cpu_supports("ssse3") works]) +], [ + AC_MSG_RESULT([no]) +]) + +dnl Check for function attribute used in conjunction with __builtin_cpu_supports +AC_MSG_CHECKING([for __attribute__((target))]) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + __attribute__((target("ssse3"))) + int zero(void) { + return 0; + } +]], [[zero();]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_ATTRIBUTE_TARGET], 1, + [Define if __attribute__((target(...))) is available.]) +], [ + AC_MSG_RESULT([no]) +]) + +]) dnl End of AS_IF(hts_have_cpuid) + dnl Avoid chicken-and-egg problem where pkg-config supplies the dnl PKG_PROG_PKG_CONFIG macro, but we want to use it to check dnl for pkg-config... 
@@ -289,6 +331,25 @@ AC_CHECK_FUNCS([gmtime_r fsync drand48 srand48_deterministic]) # Darwin has a dubious fdatasync() symbol, but no declaration in AC_CHECK_DECL([fdatasync(int)], [AC_CHECK_FUNCS(fdatasync)]) +AC_MSG_CHECKING([for __attribute__((constructor))]) +AC_LINK_IFELSE([AC_LANG_PROGRAM([[ + static __attribute__((constructor)) void noop(void) {} +]], [])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_ATTRIBUTE_CONSTRUCTOR], 1, + [Define if __attribute__((constructor)) is available.]) +], [AC_MSG_RESULT([no])]) + +AC_MSG_CHECKING([for clock_gettime with CLOCK_PROCESS_CPUTIME_ID]) +AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include ]], [[ + struct timespec ts; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); +]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_CLOCK_GETTIME_CPUTIME], 1, + [Define if clock_gettime exists and accepts CLOCK_PROCESS_CPUTIME_ID.]) +], [AC_MSG_RESULT([no])]) + if test $enable_plugins != no; then AC_SEARCH_LIBS([dlsym], [dl], [], [MSG_ERROR([dlsym() not found diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 86e2ef96e..2b2ad6029 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020, 2022-2023 Genome Research Ltd. +Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -3004,8 +3004,8 @@ int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s, * Returns the used size of the bam record on success * -1 on failure. 
*/ -static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, - cram_record *cr, int rec, bam_seq_t **bam) { +int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, + cram_record *cr, int rec, bam_seq_t **bam) { int ret, rg_len; char name_a[1024], *name; int name_len; @@ -3172,7 +3172,7 @@ static cram_container *cram_first_slice(cram_fd *fd) { return c; } -static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { +cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { cram_container *c_curr; // container being consumed via cram_get_seq() cram_slice *s_curr = NULL; diff --git a/cram/cram_decode.h b/cram/cram_decode.h index 400eb6beb..16d87a073 100644 --- a/cram/cram_decode.h +++ b/cram/cram_decode.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2013, 2018 Genome Research Ltd. +Copyright (c) 2012-2013, 2018, 2024 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -94,6 +94,15 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b); +/*! INTERNAL: + * Loads and decodes the next slice worth of data. + * + * @return + * Returns cram slice pointer on success; + * NULL on failure + */ +cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp); + /*! INTERNAL: * Decode an entire slice from container blocks. Fills out s->crecs[] array. * @@ -105,6 +114,22 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, sam_hdr_t *hdr); +/*! INTERNAL: + * Converts a cram in-memory record into a bam in-memory record. We + * pass a pointer to a bam_seq_t pointer along with the a pointer to + * the allocated size. These can initially be pointers to NULL and zero. 
+ * + * This function will reallocate the bam buffer as required and update + * (*bam)->alloc accordingly, allowing it to be used within a loop + * efficiently without needing to allocate new bam objects over and + * over again. + * + * Returns the used size of the bam record on success + * -1 on failure. + */ +int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, + cram_record *cr, int rec, bam_seq_t **bam); + /* * Drains and frees the decode read-queue for a multi-threaded reader. */ diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 4a762f7b0..5d22db54d 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -3401,6 +3401,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, c->num_bases += cr->len; cr->apos = bam_pos(b)+1; + if (cr->apos < 0 || cr->apos > INT64_MAX/2) + goto err; if (c->pos_sorted) { if (cr->apos < s->last_apos && !fd->ap_delta) { c->pos_sorted = 0; @@ -3439,6 +3441,11 @@ static int process_one_read(cram_fd *fd, cram_container *c, int64_t apos = cr->apos-1, spos = 0; int64_t MD_last = apos; // last position of edit in MD tag + if (apos < 0) { + hts_log_error("Mapped read with position <= 0 is disallowed"); + return -1; + } + cr->cigar = s->ncigar; cr->ncigar = bam_cigar_len(b); while (cr->cigar + cr->ncigar >= s->cigar_alloc) { diff --git a/cram/cram_external.c b/cram/cram_external.c index 7455185ad..4943750dd 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2015, 2018-2020, 2022-2023 Genome Research Ltd. +Copyright (c) 2015, 2018-2020, 2022-2024 Genome Research Ltd. 
Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -121,6 +121,16 @@ int cram_container_is_empty(cram_fd *fd) { return fd->empty_container; } +void cram_container_get_coords(cram_container *c, + int *refid, hts_pos_t *start, hts_pos_t *span) { + if (refid) + *refid = c->ref_seq_id; + if (start) + *start = c->ref_seq_start; + if (span) + *span = c->ref_seq_span; +} + /* *----------------------------------------------------------------------------- @@ -281,7 +291,7 @@ static cram_codec *cram_codec_iter_next(cram_codec_iter *iter, iter->curr_map = iter->curr_map->next; return cc; } - } while (iter->idx <= CRAM_MAP_HASH); + } while (iter->idx < CRAM_MAP_HASH); // End of codecs return NULL; @@ -683,6 +693,7 @@ int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) { cram_free_block(blk); return -1; } + if (cram_write_block(out, blk) != 0) { cram_free_block(blk); return -1; @@ -704,6 +715,192 @@ int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) { return 0; } +/* + * Discards the next containers worth of data. + * Only the cram structure has been read so far. 
+ * + * Returns 0 on success, + * -1 on failure + */ +static int cram_skip_container(cram_fd *in, cram_container *c) { + // Compression header + cram_block *blk; + if (!(blk = cram_read_block(in))) + return -1; + cram_free_block(blk); + + int i; + for (i = 0; i < c->num_landmarks; i++) { + cram_block_slice_hdr *hdr; + + if (!(blk = cram_read_block(in))) + return -1; + if (!(hdr = cram_decode_slice_header(in, blk))) { + cram_free_block(blk); + return -1; + } + cram_free_block(blk); + + int num_blocks = cram_slice_hdr_get_num_blocks(hdr), j; + for (j = 0; j < num_blocks; j++) { + blk = cram_read_block(in); + if (!blk) { + cram_free_slice_header(hdr); + return -1; + } + cram_free_block(blk); + } + cram_free_slice_header(hdr); + } + + return 0; +} + + +/* + * Copies a container, but filtering it down to a specific region, + * which has already been set on the 'in' fd. + * + * This is used in e.g. samtools cat where we specified a region and discover + * that a region doesn't entirely span the container, so we have to select + * which reads we need to copy out of it. + * + * If ref_id is non-NULL we also return the last ref_id we filtered. + * This can be -2 if it's multi-ref and we observe more than one reference, + * and actual ref_id >= -1 if it's multi-ref and we observe just one ref or + * it's fixed reference. + * + * Returns 0 on success + * -1 on error + */ +int cram_filter_container(cram_fd *in, cram_fd *out, cram_container *c, + int *ref_id) { + int err = 0, fixed_ref = -3; + + if (ref_id) + *ref_id = c->ref_seq_id; + + int rid = in->range.refid == -2 ? 
-1 : in->range.refid; + if (rid != c->ref_seq_id || + in->range.start > c->ref_seq_start + c->ref_seq_span-1) + // Except for multi-ref cases + if (c->ref_seq_id != -2) + return cram_skip_container(in, c); + + // Container compression header + cram_block *blk = cram_read_block(in); + if (!blk) + return -1; + c->comp_hdr = cram_decode_compression_header(in, blk); + in->ctr = c; + + // If it's multi-ref but a constant ref-id, then we can still do + // basic level chromosome filtering. Similarly multi-ref where we're + // _already_ in ref "*" (unmapped) means we can just copy the container + // as there are no positions to filter on and "*" sorts to the end. + // TODO: how to tell "already in" though? + if (c->ref_seq_id == -2) { + cram_codec *cd = c->comp_hdr->codecs[DS_RI]; + if (cd && cd->codec == E_HUFFMAN && cd->u.huffman.ncodes == 1 && + // this check should be always true anyway + rid == cd->u.huffman.codes[0].symbol) + // We're in multi-ref mode, but actually the entire container + // matches. So if we're in whole-chromosome mode we can just + // copy. + if (in->range.start <= 1 && + in->range.end >= (INT64_MAX&(0xffffffffULL<<32))) { + if (ref_id) + *ref_id = rid; + err |= cram_write_container(out, c) < 0; + err |= cram_write_block(out, blk); + return cram_copy_slice(in, out, c->num_landmarks) | -err; + } + } + + // A simple read-write loop with region filtering automatically due to + // an earlier CRAM_OPT_RANGE request. + // + // We can hit EOF when reaching the end of the range, but we still need + // to manually check we don't attempt to read beyond this single container. 
+ + cram_range rng_copy = in->range; + in->range.start = INT64_MIN; + in->range.end = INT64_MAX; + + bam1_t *b = bam_init1(); + while ((c->curr_slice < c->max_slice || + c->slice->curr_rec < c->slice->max_rec)) { + cram_slice *s; + if (c->slice && c->slice->curr_rec < c->slice->max_rec) + s = c->slice; + else if (c->curr_slice < c->max_slice) + s = cram_next_slice(in, &c); + else + break; // end of container + c->slice = s; + + // This is more efficient if we check as a cram record instead of a + // bam record as we don't have to parse CIGAR end. + cram_record *cr = &c->slice->crecs[c->slice->curr_rec]; + if (fixed_ref == -3) + fixed_ref = cr->ref_id; + else if (fixed_ref != cr->ref_id) + fixed_ref = -2; + + if (rng_copy.refid != cr->ref_id) { + if (rng_copy.refid == -2) { + if (cr->ref_id > -1) { + // Want unmapped, but have mapped + c->slice->curr_rec++; + continue; + } + } else { + if (rng_copy.refid > cr->ref_id || rng_copy.refid == -1) { + // multi-ref and not at the correct ref yet + c->slice->curr_rec++; + continue; + } else { + // multi-ref and beyond the desired ref + break; + } + } + } + + // Correct ref, but check the desired region + if (cr->aend < rng_copy.start) { + c->slice->curr_rec++; + continue; + } + if (cr->apos > rng_copy.end) + break; + + // Broadly rquivalent to cram_get_bam_seq, but starting from 'cr' + err |= cram_to_bam(in->header, in, s, cr, s->curr_rec++, &b) < 0; + + if (cram_put_bam_seq(out, b) < 0) { + err |= 1; + break; + } + } + bam_destroy1(b); + + if (ref_id) + *ref_id = fixed_ref; + + in->range = rng_copy; + + // Avoids double frees as we stole the container from our other + // file descriptor. + in->ctr = NULL; + in->ctr_mt = NULL; + + err |= cram_flush(out); + cram_free_block(blk); + + return -err; +} + + /* * Renumbers RG numbers in a cram compression header. 
* diff --git a/cram/cram_index.c b/cram/cram_index.c index 0908736ab..77c953d6c 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2013-2020, 2023 Genome Research Ltd. +Copyright (c) 2013-2020, 2023-2024 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -410,6 +410,9 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos, // Continue from a previous search. // We switch to just scanning the linked list, as the nested // lists are typically short. + if (refid == HTS_IDX_NOCOOR) + refid = -1; + e = from->e_next; if (e && e->refid == refid && e->start <= pos) return e; @@ -423,6 +426,7 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos, // fail, or already there, dealt with elsewhere. return NULL; + case -1: case HTS_IDX_NOCOOR: refid = -1; pos = 0; @@ -844,3 +848,193 @@ int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) { return (bgzf_close(fp) >= 0)? 0 : -4; } + +// internal recursive step +static int64_t cram_num_containers_between_(cram_index *e, int64_t *last_pos, + int64_t nct, + off_t cstart, off_t cend, + int64_t *first, int64_t *last) { + int64_t nc = 0, i; + + if (e->offset) { + if (e->offset != *last_pos) { + if (e->offset >= cstart && (!cend || e->offset <= cend)) { + if (first && *first < 0) + *first = nct; + if (last) + *last = nct; + } + nc++; + } + // else a new multi-ref in same container + *last_pos = e->offset; + } + + for (i = 0; i < e->nslice; i++) + nc += cram_num_containers_between_(&e->e[i], last_pos, nc + nct, + cstart, cend, first, last); + + return nc; +} + +/*! Returns the number of containers in the CRAM file within given offsets. + * + * The cstart and cend offsets are the locations of the start of containers + * as returned by index_container_offset. + * + * If non-NULL, first and last will hold the inclusive range of container + * numbers, counting from zero. 
+ * + * @return + * Returns the number of containers, equivalent to *last-*first+1. + */ +int64_t cram_num_containers_between(cram_fd *fd, + off_t cstart, off_t cend, + int64_t *first, int64_t *last) { + int64_t nc = 0, i; + int64_t last_pos = -99; + int64_t l_first = -1, l_last = -1; + + for (i = 0; i < fd->index_sz; i++) { + int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end + nc += cram_num_containers_between_(&fd->index[j], &last_pos, nc, + cstart, cend, &l_first, &l_last); + } + + if (first) + *first = l_first; + if (last) + *last = l_last; + + return l_last - l_first + 1; +} + +/* + * Queries the total number of distinct containers in the index. + * Note there may be more containers in the file than in the index, as we + * are not required to have an index entry for every one. + */ +int64_t cram_num_containers(cram_fd *fd) { + return cram_num_containers_between(fd, 0, 0, NULL, NULL); +} + + +/*! Returns the byte offset for the start of the n^th container. + * + * The index must have previously been loaded, otherwise <0 is returned. + */ +static cram_index *cram_container_num2offset_(cram_index *e, int num, + int64_t *last_pos, int *nc) { + if (e->offset) { + if (e->offset != *last_pos) { + if (*nc == num) + return e; + (*nc)++; + } + // else a new multi-ref in same container + *last_pos = e->offset; + } + + int i; + for (i = 0; i < e->nslice; i++) { + cram_index *tmp = cram_container_num2offset_(&e->e[i], num, + last_pos, nc); + if (tmp) + return tmp; + } + + + return NULL; +} + +off_t cram_container_num2offset(cram_fd *fd, int64_t num) { + int nc = 0, i; + int64_t last_pos = -9; + cram_index *e = NULL; + + for (i = 0; i < fd->index_sz; i++) { + int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end + if (!fd->index[j].nslice) + continue; + if ((e = cram_container_num2offset_(&fd->index[j], num, + &last_pos, &nc))) + break; + } + + return e ? e->offset : -1; +} + + +/*! Returns the container number for the first container at offset >= pos. 
+ * + * The index must have previously been loaded, otherwise <0 is returned. + */ +static cram_index *cram_container_offset2num_(cram_index *e, off_t pos, + int64_t *last_pos, int *nc) { + if (e->offset) { + if (e->offset != *last_pos) { + if (e->offset >= pos) + return e; + (*nc)++; + } + // else a new multi-ref in same container + *last_pos = e->offset; + } + + int i; + for (i = 0; i < e->nslice; i++) { + cram_index *tmp = cram_container_offset2num_(&e->e[i], pos, + last_pos, nc); + if (tmp) + return tmp; + } + + + return NULL; +} + +int64_t cram_container_offset2num(cram_fd *fd, off_t pos) { + int nc = 0, i; + int64_t last_pos = -9; + cram_index *e = NULL; + + for (i = 0; i < fd->index_sz; i++) { + int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end + if (!fd->index[j].nslice) + continue; + if ((e = cram_container_offset2num_(&fd->index[j], pos, + &last_pos, &nc))) + break; + } + + return e ? nc : -1; +} + +/*! + * Returns the file offsets of CRAM containers covering a specific region + * query. Note both offsets are the START of the container. + * + * first will point to the start of the first overlapping container + * last will point to the start of the last overlapping container + * + * Returns 0 on success + * <0 on failure + */ +int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end, + off_t *first, off_t *last) { + cram_index *ci; + + if (first) { + if (!(ci = cram_index_query(fd, refid, start, NULL))) + return -1; + *first = ci->offset; + } + + if (last) { + if (!(ci = cram_index_query_last(fd, refid, end))) + return -1; + *last = ci->offset; + } + + return 0; +} diff --git a/cram/cram_io.c b/cram/cram_io.c index 247423354..7f7ffca49 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1984,11 +1984,15 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, // We also get large fluctuations based on genome coordinate for // e.g. 
SA:Z and SC series, but we consider the typical scale of // delta between blocks and use this to look for abnormality. + + // Equivalent to (but minus possible integer overflow) + // (b->uncomp_size + 1000)/4 > metrics->input_avg_sz+1000 || + // b->uncomp_size + 1000 < (metrics->input_avg_sz+1000)/4) if (metrics->input_avg_sz && - (b->uncomp_size + 1000 > 4*(metrics->input_avg_sz+1000) || - b->uncomp_size + 1000 < (metrics->input_avg_sz+1000)/4) && - ABS(b->uncomp_size-metrics->input_avg_sz) - > 10*metrics->input_avg_delta) { + (b->uncomp_size/4 - 750 > metrics->input_avg_sz || + b->uncomp_size < metrics->input_avg_sz/4 - 750) && + ABS(b->uncomp_size-metrics->input_avg_sz)/10 + > metrics->input_avg_delta) { metrics->next_trial = 0; } diff --git a/faidx.c b/faidx.c index ce8fe5d9f..ed39c0ca0 100644 --- a/faidx.c +++ b/faidx.c @@ -1,6 +1,6 @@ /* faidx.c -- FASTA and FASTQ random access. - Copyright (C) 2008, 2009, 2013-2020, 2022 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013-2020, 2022, 2024 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. Author: Heng Li @@ -43,6 +43,29 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kstring.h" #include "hts_internal.h" +// Faster isgraph; assumes ASCII +static inline int isgraph_(unsigned char c) { + return c > ' ' && c <= '~'; +} + +#ifdef isgraph +# undef isgraph +#endif +#define isgraph isgraph_ + +// An optimised bgzf_getc. +// We could consider moving this to bgzf.h, but our own code uses it here only. +static inline int bgzf_getc_(BGZF *fp) { + if (fp->block_offset+1 < fp->block_length) { + int c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; + fp->uncompressed_address++; + return c; + } + + return bgzf_getc(fp); +} +#define bgzf_getc bgzf_getc_ + typedef struct { int id; // faidx_t->name[id] is for this struct. 
uint32_t line_len, line_blen; @@ -692,9 +715,8 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) { static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, uint64_t offset, hts_pos_t beg, hts_pos_t end, hts_pos_t *len) { - char *s; - size_t l; - int c = 0; + char *buffer, *s; + ssize_t nread, remaining, firstline_len, firstline_blen; int ret; if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) { @@ -720,26 +742,57 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, return NULL; } - l = 0; - s = (char*)malloc((size_t) end - beg + 2); - if (!s) { + // Over-allocate so there is extra space for one end-of-line sequence + buffer = (char*)malloc((size_t) end - beg + val->line_len - val->line_blen + 1); + if (!buffer) { *len = -1; return NULL; } - while ( l < end - beg && (c=bgzf_getc(fai->bgzf))>=0 ) - if (isgraph(c)) s[l++] = c; - if (c < 0) { - hts_log_error("Failed to retrieve block: %s", - c == -1 ? "unexpected end of file" : "error reading file"); - free(s); - *len = -1; - return NULL; + remaining = *len = end - beg; + firstline_blen = val->line_blen - beg % val->line_blen; + + // Special case when the entire interval requested is within a single FASTA/Q line + if (remaining <= firstline_blen) { + nread = bgzf_read_small(fai->bgzf, buffer, remaining); + if (nread < remaining) goto error; + buffer[nread] = '\0'; + return buffer; + } + + s = buffer; + firstline_len = val->line_len - beg % val->line_blen; + + // Read the (partial) first line and its line terminator, but increment s past the + // line contents only, so the terminator characters will be overwritten by the next line. + nread = bgzf_read_small(fai->bgzf, s, firstline_len); + if (nread < firstline_len) goto error; + s += firstline_blen; + remaining -= firstline_blen; + + // Similarly read complete lines and their line terminator characters, but overwrite the latter. 
+ while (remaining > val->line_blen) { + nread = bgzf_read_small(fai->bgzf, s, val->line_len); + if (nread < (ssize_t) val->line_len) goto error; + s += val->line_blen; + remaining -= val->line_blen; } - s[l] = '\0'; - *len = l; - return s; + if (remaining > 0) { + nread = bgzf_read_small(fai->bgzf, s, remaining); + if (nread < remaining) goto error; + s += remaining; + } + + *s = '\0'; + return buffer; + +error: + hts_log_error("Failed to retrieve block: %s", + (nread == 0)? "unexpected end of file" : "error reading file"); + free(buffer); + *len = -1; + return NULL; } static int fai_get_val(const faidx_t *fai, const char *str, diff --git a/header.c b/header.c index 5161034f4..7f62074f0 100644 --- a/header.c +++ b/header.c @@ -2358,7 +2358,7 @@ void sam_hdr_incr_ref(sam_hdr_t *bh) { * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free()) * NULL on failure */ -sam_hrecs_t *sam_hrecs_new() { +sam_hrecs_t *sam_hrecs_new(void) { sam_hrecs_t *hrecs = calloc(1, sizeof(*hrecs)); if (!hrecs) diff --git a/hfile.c b/hfile.c index fc87049ca..552b71774 100644 --- a/hfile.c +++ b/hfile.c @@ -703,7 +703,7 @@ static int is_preload_url_remote(const char *url){ static hFILE *hopen_preload(const char *url, const char *mode){ hFILE* fp = hopen(url + 8, mode); - return hpreload(fp); + return fp ? 
hpreload(fp) : NULL; } hFILE *hdopen(int fd, const char *mode) @@ -976,7 +976,7 @@ void hfile_shutdown(int do_close_plugin) pthread_mutex_unlock(&plugins_lock); } -static void hfile_exit() +static void hfile_exit(void) { hfile_shutdown(0); pthread_mutex_destroy(&plugins_lock); @@ -1082,7 +1082,7 @@ static int init_add_plugin(void *obj, int (*init)(struct hFILE_plugin *), * Returns 0 on success, * <0 on failure */ -static int load_hfile_plugins() +static int load_hfile_plugins(void) { static const struct hFILE_scheme_handler data = { hopen_mem, hfile_always_local, "built-in", 80 }, diff --git a/hfile_libcurl.c b/hfile_libcurl.c index e70550eab..3463acf43 100644 --- a/hfile_libcurl.c +++ b/hfile_libcurl.c @@ -277,7 +277,7 @@ static void free_auth(auth_token *tok) { free(tok); } -static void libcurl_exit() +static void libcurl_exit(void) { if (curl_share_cleanup(curl.share) == CURLSHE_OK) curl.share = NULL; diff --git a/hfile_s3_write.c b/hfile_s3_write.c index d54945839..a501645ca 100644 --- a/hfile_s3_write.c +++ b/hfile_s3_write.c @@ -822,7 +822,7 @@ static hFILE *vhopen_s3_write(const char *url, const char *mode, va_list args) { } -static void s3_write_exit() { +static void s3_write_exit(void) { if (curl_share_cleanup(curl.share) == CURLSHE_OK) curl.share = NULL; diff --git a/hts.c b/hts.c index cf0a07d9f..a8a8bead2 100644 --- a/hts.c +++ b/hts.c @@ -81,7 +81,7 @@ KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal) HTSLIB_EXPORT int hts_verbose = HTS_LOG_WARNING; -const char *hts_version() +const char *hts_version(void) { return HTS_VERSION_TEXT; } @@ -431,6 +431,27 @@ static int is_text_only(const unsigned char *u, const unsigned char *ulim) return 1; } +static inline int +alternate_zeros(const unsigned char *u, const unsigned char *ulim) +{ + for (; u < ulim; u += 2) + if (*u != '\0') return 0; + return 1; +} + +static int is_utf16_text(const unsigned char *u, const unsigned char *ulim) +{ + if (ulim - u >= 6 && + ((u[0] == 0xfe && u[1] 
== 0xff && alternate_zeros(u+2, ulim)) || + (u[0] == 0xff && u[1] == 0xfe && alternate_zeros(u+3, ulim)))) + return 2; + else if (ulim - u >= 8 && + (alternate_zeros(u, ulim) || alternate_zeros(u+1, ulim))) + return 1; + else + return 0; +} + static int is_fastaq(const unsigned char *u, const unsigned char *ulim) { const unsigned char *eol = memchr(u, '\n', ulim - u); @@ -1301,7 +1322,7 @@ int hts_parse_opt_list(htsFormat *fmt, const char *str) { * -1 on failure. */ int hts_parse_format(htsFormat *format, const char *str) { - char fmt[8]; + char fmt[9]; const char *cp = scan_keyword(str, ',', fmt, sizeof fmt); format->version.minor = 0; // unknown @@ -1743,7 +1764,7 @@ static hFILE *hts_hfile(htsFile *fp) { case bcf: // fall through case bam: return bgzf_hfile(fp->fp.bgzf); case cram: return cram_hfile(fp->fp.cram); - case text_format: return fp->fp.hfile; + case text_format: // fall through case vcf: // fall through case fastq_format: // fall through case fasta_format: // fall through @@ -1961,6 +1982,12 @@ hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname) return fp; } +int hts_is_utf16_text(const kstring_t *str) +{ + const unsigned char *u = (const unsigned char *) (str->s); + return (str->l > 0 && str->s)? is_utf16_text(u, u + str->l) : 0; +} + // For VCF/BCF backward sweeper. Not exposing these functions because their // future is uncertain. Things will probably have to change with hFILE... 
BGZF *hts_get_bgzfp(htsFile *fp) @@ -2030,6 +2057,8 @@ char **hts_readlist(const char *string, int is_file, int *_n) while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) { if (str.l == 0) continue; + if (n == 0 && hts_is_utf16_text(&str)) + hts_log_warning("'%s' appears to be encoded as UTF-16", string); if (hts_resize(char*, n + 1, &m, &s, 0) < 0) goto err; s[n] = strdup(str.s); @@ -2089,6 +2118,8 @@ char **hts_readlines(const char *fn, int *_n) str.s = 0; str.l = str.m = 0; while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) { if (str.l == 0) continue; + if (n == 0 && hts_is_utf16_text(&str)) + hts_log_warning("'%s' appears to be encoded as UTF-16", fn); if (hts_resize(char *, n + 1, &m, &s, 0) < 0) goto err; s[n] = strdup(str.s); @@ -2446,9 +2477,14 @@ int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) return ret; } +static inline hts_pos_t hts_idx_maxpos(const hts_idx_t *idx) +{ + return hts_bin_maxpos(idx->min_shift, idx->n_lvls); +} + int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) { - int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3); + hts_pos_t maxpos = hts_idx_maxpos(idx); if (tid < 0 || (beg <= maxpos && end <= maxpos)) return 0; @@ -3222,7 +3258,7 @@ static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, size_t reg_bin_count = 0, hash_bin_count; int res; - if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg >= end) + if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg > end) return -1; hash_bin_count = kh_n_buckets(bidx); @@ -3341,6 +3377,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t khint_t k; bidx_t *bidx; uint64_t min_off, max_off; + hts_pos_t idx_maxpos; hts_itr_t *iter; uint32_t unmapped = 0, rel_off; @@ -3385,6 +3422,9 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t if ( !kh_size(bidx) ) { iter->finished = 1; return iter; } + idx_maxpos = hts_idx_maxpos(idx); + if (beg >= idx_maxpos) 
{ iter->finished = 1; return iter; } + rel_off = beg>>idx->min_shift; // compute min_off bin = hts_bin_first(idx->n_lvls) + rel_off; @@ -3427,7 +3467,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t // compute max_off: a virtual offset from a bin to the right of end // First check if end lies within the range of the index (it won't // if it's HTS_POS_MAX) - if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) { + if (end <= idx_maxpos) { bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; if (bin >= idx->n_bins) bin = 0; while (1) { @@ -3513,7 +3553,7 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) bidx_t *bidx; uint64_t min_off, max_off, t_off = (uint64_t)-1; int tid; - hts_pos_t beg, end; + hts_pos_t beg, end, idx_maxpos; hts_reglist_t *curr_reg; uint32_t unmapped = 0, rel_off; @@ -3555,6 +3595,8 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) else unmapped = 1; + idx_maxpos = hts_idx_maxpos(idx); + for(j=0; j<curr_reg->count; j++) { hts_pair32_t *curr_intv = &curr_reg->intervals[j]; if (curr_intv->end < curr_intv->beg) @@ -3562,6 +3604,8 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) beg = curr_intv->beg; end = curr_intv->end; + if (beg >= idx_maxpos) + continue; rel_off = beg>>idx->min_shift; /* Compute 'min_off' by searching the lowest level bin containing 'beg'.
@@ -3606,7 +3650,7 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) // compute max_off: a virtual offset from a bin to the right of end // First check if end lies within the range of the index (it // won't if it's HTS_POS_MAX) - if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) { + if (end <= idx_maxpos) { bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; if (bin >= idx->n_bins) bin = 0; while (1) { @@ -3782,7 +3826,7 @@ void hts_itr_destroy(hts_itr_t *iter) } } -static inline long long push_digit(long long i, char c) +static inline unsigned long long push_digit(unsigned long long i, char c) { // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so int digit = c - '0'; @@ -3791,7 +3835,7 @@ static inline long long push_digit(long long i, char c) long long hts_parse_decimal(const char *str, char **strend, int flags) { - long long n = 0; + unsigned long long n = 0; int digits = 0, decimals = 0, e = 0, lost = 0; char sign = '+', esign = '+'; const char *s, *str_orig = str; @@ -4405,11 +4449,12 @@ int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r) break; uint64_t max = iter->off[j].max; - if ((max>>32) != tid) + if ((max>>32) != tid) { tid = HTS_IDX_START; // => no range limit - - if (end < rl->intervals[max & 0xffffffff].end) - end = rl->intervals[max & 0xffffffff].end; + } else { + if (end < rl->intervals[max & 0xffffffff].end) + end = rl->intervals[max & 0xffffffff].end; + } if (v < iter->off[j].v) v = iter->off[j].v; j++; @@ -5050,7 +5095,7 @@ int hts_resize_array_(size_t item_size, size_t num, size_t size_sz, return 0; } -void hts_lib_shutdown() +void hts_lib_shutdown(void) { hfile_shutdown(1); } @@ -5064,7 +5109,7 @@ void hts_set_log_level(enum htsLogLevel level) hts_verbose = level; } -enum htsLogLevel hts_get_log_level() +enum htsLogLevel hts_get_log_level(void) { return hts_verbose; } diff --git a/hts_expr.c b/hts_expr.c index 5e5a132ea..dfd15b151 100644 --- a/hts_expr.c +++ b/hts_expr.c @@ -1,6 
+1,6 @@ /* hts_expr.c -- filter expression parsing and processing. - Copyright (C) 2020-2022 Genome Research Ltd. + Copyright (C) 2020-2022, 2024 Genome Research Ltd. Author: James Bonfield @@ -527,8 +527,10 @@ static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, } else if (res->is_str || val.is_str) { hts_expr_val_free(&val); return -1; + } else { + res->is_true = + (res->d = ((int64_t)res->d & (int64_t)val.d)) != 0; } - res->is_true = (res->d = ((int64_t)res->d & (int64_t)val.d)) != 0; } else { break; } @@ -560,8 +562,10 @@ static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, } else if (res->is_str || val.is_str) { hts_expr_val_free(&val); return -1; + } else { + res->is_true = + (res->d = ((int64_t)res->d ^ (int64_t)val.d)) != 0; } - res->is_true = (res->d = ((int64_t)res->d ^ (int64_t)val.d)) != 0; } else { break; } @@ -593,8 +597,10 @@ static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, } else if (res->is_str || val.is_str) { hts_expr_val_free(&val); return -1; + } else { + res->is_true = + (res->d = ((int64_t)res->d | (int64_t)val.d)) != 0; } - res->is_true = (res->d = ((int64_t)res->d | (int64_t)val.d)) != 0; } else { break; } diff --git a/hts_internal.h b/hts_internal.h index 191a55d16..52f29e6c1 100644 --- a/hts_internal.h +++ b/hts_internal.h @@ -87,6 +87,9 @@ typedef struct hts_cram_idx_t { struct cram_fd *cram; } hts_cram_idx_t; +// Determine whether the string's contents appear to be UTF-16-encoded text. +// Returns 1 if they are, 2 if there is also a BOM, or 0 otherwise. +int hts_is_utf16_text(const kstring_t *str); // Entry point to hFILE_multipart backend. struct hFILE *hopen_htsget_redirect(struct hFILE *hfile, const char *mode); @@ -120,18 +123,6 @@ const char *hts_plugin_path(void); */ int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped); -/* - * bgzf analogue to hts_idx_amend_last. 
- * - * This is needed when multi-threading and writing indices on the fly. - * At the point of writing a record we know the virtual offset for start - * and end, but that end virtual offset may be the end of the current - * block. In standard indexing our end virtual offset becomes the start - * of the next block. Thus to ensure bit for bit compatibility we - * detect this boundary case and fix it up here. - */ -void bgzf_idx_amend_last(BGZF *fp, hts_idx_t *hidx, uint64_t offset); - static inline int find_file_extension(const char *fn, char ext_out[static HTS_MAX_EXT_LEN]) { const char *delim = fn ? strstr(fn, HTS_IDX_DELIM) : NULL, *ext; diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh index 48d0159c6..c9fc0a821 100755 --- a/hts_probe_cc.sh +++ b/hts_probe_cc.sh @@ -2,7 +2,7 @@ # Check compiler options for non-configure builds and create Makefile fragment # -# Copyright (C) 2022-2023 Genome Research Ltd. +# Copyright (C) 2022-2024 Genome Research Ltd. # # Author: Rob Davies # @@ -51,6 +51,12 @@ run_compiler () # again with it to see if the flag is needed. run_test () { + if [ $have_cpuid -ne 1 ] ; then + # Only test for and build SSE / AVX code if cpuid works as + # otherwise it won't be executed, even if present + echo "$3 =" + return + fi rm -f conftest conftest.err conftest.c cat - > conftest.c if run_compiler ; then @@ -66,6 +72,27 @@ run_test () echo "# Compiler probe results, generated by $0" +# Check for cpuid +rm -f conftest conftest.err conftest.c +cat > conftest.c <<'EOF' +#include +#include +int main(int argc, char **argv) { + unsigned int a, b, c, d; + int level = __get_cpuid_max(0, NULL); + if (level > 0) + __cpuid_count(1, 0, a, b, c, d); + return 0; +} +EOF +if run_compiler ; then + echo "HTS_HAVE_CPUID = 1" + have_cpuid=1 +else + echo "HTS_HAVE_CPUID =" + have_cpuid=0 +fi + # Check for sse4.1 etc. 
support run_test "-msse4.1 -mpopcnt -mssse3" HTS_CFLAGS_SSE4 HTS_BUILD_SSE4 <<'EOF' #ifdef __x86_64__ @@ -104,7 +131,9 @@ run_test "-mavx512f -mpopcnt" HTS_CFLAGS_AVX512 HTS_BUILD_AVX512 <<'EOF' int main(int argc, char **argv) { __m512i a = _mm512_set1_epi32(1); __m512i b = _mm512_add_epi32(a, a); - return _mm_popcnt_u32(*((char *) &b)); + __m256i c = _mm512_castsi512_si256(b); + __m256i d = _mm512_extracti64x4_epi64(a, 1); + return _mm_popcnt_u32(*((char *) &c)) + (*(char *) &d); } #else int main(int argc, char **argv) { return 0; } diff --git a/htscodecs b/htscodecs index ffda7310c..51794289a 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit ffda7310c4b3292955561d6c3b1743cb82bfe26b +Subproject commit 51794289ac47455209c333182b6768f99a613948 diff --git a/htsfile.1 b/htsfile.1 index 89a2fe446..e22fdbcda 100644 --- a/htsfile.1 +++ b/htsfile.1 @@ -1,4 +1,4 @@ -.TH htsfile 1 "15 April 2024" "htslib-1.20" "Bioinformatics tools" +.TH htsfile 1 "12 September 2024" "htslib-1.21" "Bioinformatics tools" .SH NAME htsfile \- identify high-throughput sequencing data files .\" diff --git a/htsfile.c b/htsfile.c index 9af4ae31b..25af3f584 100644 --- a/htsfile.c +++ b/htsfile.c @@ -46,7 +46,7 @@ int show_headers = 1; int verbose = 0; int status = EXIT_SUCCESS; /* Exit status from main */ -void error(const char *format, ...) +void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...) 
{ int err = errno; va_list args; diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7 index 3bd868c71..44de65771 100644 --- a/htslib-s3-plugin.7 +++ b/htslib-s3-plugin.7 @@ -1,4 +1,4 @@ -.TH htslib-s3-plugin 7 "15 April 2024" "htslib-1.20" "Bioinformatics tools" +.TH htslib-s3-plugin 7 "12 September 2024" "htslib-1.21" "Bioinformatics tools" .SH NAME htslib-s3-plugin \- htslib AWS S3 plugin .\" diff --git a/htslib.map b/htslib.map index e342f55b5..52ad738bb 100644 --- a/htslib.map +++ b/htslib.map @@ -640,3 +640,13 @@ HTSLIB_1.18 { HTSLIB_1.20 { tbx_conf_gaf; } HTSLIB_1.18; + +HTSLIB_1.21 { + cram_container_get_coords; + cram_container_num2offset; + cram_container_offset2num; + cram_filter_container; + cram_index_extents; + cram_num_containers; + cram_num_containers_between; +} HTSLIB_1.20; diff --git a/htslib/bgzf.h b/htslib/bgzf.h index ea4ec3ece..87d4c6a3b 100644 --- a/htslib/bgzf.h +++ b/htslib/bgzf.h @@ -3,7 +3,7 @@ /* Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011, 2012 Attractive Chaos - Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022-2023 Genome Research Ltd + Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022-2024 Genome Research Ltd Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -31,6 +31,7 @@ #define HTSLIB_BGZF_H #include +#include #include #include "hts_defs.h" @@ -143,6 +144,26 @@ typedef struct BGZF BGZF; HTSLIB_EXPORT ssize_t bgzf_read(BGZF *fp, void *data, size_t length) HTS_RESULT_USED; +/** + * bgzf_read optimised for small quantities, as a static inline + * See bgzf_read() normal function for return values. + */ +static inline ssize_t bgzf_read_small(BGZF *fp, void *data, size_t length) { + // A block length of 0 implies current block isn't loaded (see + // bgzf_seek_common). 
That gives negative available so careful on types + if ((ssize_t)length < fp->block_length - fp->block_offset) { + // Short cut the common and easy mode + memcpy((uint8_t *)data, + (uint8_t *)fp->uncompressed_block + fp->block_offset, + length); + fp->block_offset += length; + fp->uncompressed_address += length; + return length; + } else { + return bgzf_read(fp, data, length); + } +} + /** * Write _length_ bytes from _data_ to the file. If no I/O errors occur, * the complete _length_ bytes will be written (or queued for writing). @@ -155,6 +176,24 @@ typedef struct BGZF BGZF; HTSLIB_EXPORT ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) HTS_RESULT_USED; +/** + * bgzf_write optimised for small quantities, as a static inline + * See bgzf_write() normal function for return values. + */ +static inline +ssize_t bgzf_write_small(BGZF *fp, const void *data, size_t length) { + if (fp->is_compressed + && (size_t) (BGZF_BLOCK_SIZE - fp->block_offset) > length) { + // Short cut the common and easy mode + memcpy((uint8_t *)fp->uncompressed_block + fp->block_offset, + data, length); + fp->block_offset += length; + return length; + } else { + return bgzf_write(fp, data, length); + } +} + /** * Write _length_ bytes from _data_ to the file, the index will be used to * decide the amount of uncompressed data to be written to each bgzip block. diff --git a/htslib/cram.h b/htslib/cram.h index e0b51839c..ddc44bbba 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -1,7 +1,7 @@ /// @file htslib/cram.h /// CRAM format-specific API functions. /* - Copyright (C) 2015, 2016, 2018-2020, 2022-2023 Genome Research Ltd. + Copyright (C) 2015, 2016, 2018-2020, 2022-2024 Genome Research Ltd. 
Author: James Bonfield @@ -209,6 +209,11 @@ HTSLIB_EXPORT int cram_container_is_empty(cram_fd *fd); +/* Returns chromosome and start/span from container struct */ +HTSLIB_EXPORT +void cram_container_get_coords(cram_container *c, + int *refid, hts_pos_t *start, hts_pos_t *span); + /* *----------------------------------------------------------------------------- * cram_block @@ -329,6 +334,18 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out, HTSLIB_EXPORT int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice); +/* + * Copies a container, but filtering it down to a specific region (as + * already specified in 'in') + * + * Returns 0 on success + * -1 on EOF + * -2 on error + */ +HTSLIB_EXPORT +int cram_filter_container(cram_fd *in, cram_fd *out, cram_container *c, + int *ref_id); + /* * Decodes a CRAM block compression header. * Returns header ptr on success @@ -744,6 +761,62 @@ static inline void sam_hdr_free(SAM_hdr *hdr) { sam_hdr_destroy(hdr); } HTSLIB_EXPORT refs_t *cram_get_refs(htsFile *fd); +/*! + * Returns the file offsets of CRAM slices covering a specific region + * query. Note both offsets are the START of the slice. + * + * first will point to the start of the first overlapping slice + * last will point to the start of the last overlapping slice + * + * @return + * Returns 0 on success + * <0 on failure + */ +HTSLIB_EXPORT +int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end, + off_t *first, off_t *last); + +/*! Returns the total number of containers in the CRAM index. + * + * Note the index is not required to have an entry for every container, but it + * will always have an index entry for the start of each chromosome. + * (Although in practice our indices do contain one entry per container.) + * + * This is equivalent to cram_num_containers_between(fd, 0, 0, NULL, NULL) + */ +HTSLIB_EXPORT +int64_t cram_num_containers(cram_fd *fd); + +/*! Returns the number of containers in the CRAM index within given offsets.
+ * + * The cstart and cend offsets are the locations of the start of containers + * as returned by index_container_offset. + * + * If non-NULL, first and last will hold the inclusive range of container + * numbers, counting from zero. + * + * @return + * Returns the number of containers, equivalent to *last-*first+1. + */ +HTSLIB_EXPORT +int64_t cram_num_containers_between(cram_fd *fd, + off_t cstart, off_t cend, + int64_t *first, int64_t *last); + +/*! Returns the byte offset for the start of the n^th container. + * + * The index must have previously been loaded, otherwise <0 is returned. + */ +HTSLIB_EXPORT +off_t cram_container_num2offset(cram_fd *fd, int64_t n); + +/*! Returns the container number for the first container at offset >= pos. + * + * The index must have previously been loaded, otherwise <0 is returned. + */ +HTSLIB_EXPORT +int64_t cram_container_offset2num(cram_fd *fd, off_t pos); + /**@}*/ #ifdef __cplusplus diff --git a/htslib/hts.h b/htslib/hts.h index c5d99aba1..4f85424cf 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -489,7 +489,7 @@ const char *hts_version(void); // Immediately after release, bump ZZ to 90 to distinguish in-development // Git repository builds from the release; you may wish to increment this // further when significant features are merged. -#define HTS_VERSION 102000 +#define HTS_VERSION 102100 /*! 
@abstract Introspection on the features enabled in htslib * @@ -1534,6 +1534,13 @@ static inline int hts_bin_bot(int bin, int n_lvls) return (bin - hts_bin_first(l)) << (n_lvls - l) * 3; } +/// Compute the (0-based exclusive) maximum position covered by a binning index +static inline hts_pos_t hts_bin_maxpos(int min_shift, int n_lvls) +{ + hts_pos_t one = 1; + return one << (min_shift + n_lvls * 3); +} + /************** * Endianness * **************/ diff --git a/htslib/hts_defs.h b/htslib/hts_defs.h index e714e8fda..b5cded341 100644 --- a/htslib/hts_defs.h +++ b/htslib/hts_defs.h @@ -1,6 +1,6 @@ /* hts_defs.h -- Miscellaneous definitions. - Copyright (C) 2013-2015,2017, 2019-2020 Genome Research Ltd. + Copyright (C) 2013-2015,2017, 2019-2020, 2024 Genome Research Ltd. Author: John Marshall diff --git a/htslib/khash.h b/htslib/khash.h index 4cea91020..02e4917c8 100644 --- a/htslib/khash.h +++ b/htslib/khash.h @@ -1,7 +1,7 @@ /* The MIT License Copyright (c) 2008, 2009, 2011 by Attractive Chaos - Copyright (C) 2014-2015, 2018 Genome Research Ltd. + Copyright (C) 2014-2015, 2018, 2024 Genome Research Ltd. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -356,7 +356,39 @@ static const double __ac_HASH_UPPER = 0.77; __ac_set_isdel_true(h->flags, x); \ --h->size; \ } \ - } + } \ + SCOPE int kh_stats_##name(kh_##name##_t *h, khint_t *empty, \ + khint_t *deleted, khint_t *hist_size, \ + khint_t **hist_out) \ + { \ + khint_t i, *hist = NULL, dist_max = 0, k, dist, step; \ + khint_t mask = h->n_buckets - 1; \ + *empty = *deleted = *hist_size = 0; \ + hist = (khint_t *) calloc(1, sizeof(*hist)); \ + if (!hist) { return -1; } \ + for (i = kh_begin(h); i < kh_end(h); ++i) { \ + if (__ac_isempty(h->flags, i)) { (*empty)++; continue; } \ + if (__ac_isdel(h->flags, i)) { (*deleted)++; continue; } \ + k = __hash_func(h->keys[i]) & (h->n_buckets - 1); \ + dist = 0; \ + step = 0; \ + while (k != i) { \ + dist++; \ + k = (k + (++step)) & mask; \ + } \ + if (dist_max <= dist) { \ + khint_t *new_hist = (khint_t *) realloc(hist, sizeof(*new_hist) * (dist + 1)); \ + if (!new_hist) { free(hist); return -1; } \ + for (k = dist_max + 1; k <= dist; k++) new_hist[k] = 0; \ + hist = new_hist; \ + dist_max = dist; \ + } \ + hist[dist]++; \ + } \ + *hist_out = hist; \ + *hist_size = dist_max + 1; \ + return 0; \ + } #define KHASH_DECLARE(name, khkey_t, khval_t) \ __KHASH_TYPE(name, khkey_t, khval_t) \ @@ -391,6 +423,7 @@ static const double __ac_HASH_UPPER = 0.77; @abstract 64-bit integer comparison function */ #define kh_int64_hash_equal(a, b) ((a) == (b)) + /*! @function @abstract const char* hash function @param s Pointer to a null terminated string @@ -402,12 +435,28 @@ static kh_inline khint_t __ac_X31_hash_string(const char *s) if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; return h; } + +/*! 
@function + @abstract const char* FNV1a hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static kh_inline khint_t __ac_FNV1a_hash_string(const char *s) +{ + const khint_t offset_basis = 2166136261; + const khint_t FNV_prime = 16777619; + khint_t h = offset_basis; + for (; *s; ++s) h = (h ^ (uint8_t) *s) * FNV_prime; + return h; +} + /*! @function @abstract Another interface to const char* hash function @param key Pointer to a nul terminated string [const char*] @return The hash value [khint_t] */ -#define kh_str_hash_func(key) __ac_X31_hash_string(key) +#define kh_str_hash_func(key) __ac_FNV1a_hash_string(key) + /*! @function @abstract Const char* comparison function */ @@ -426,12 +475,29 @@ static kh_inline khint_t __ac_X31_hash_kstring(const kstring_t ks) h = (h << 5) - h + (khint_t)ks.s[i]; return h; } + +/*! @function + @abstract Kstring hash function + @param s Pointer to a kstring + @return The hash value + */ +static kh_inline khint_t __ac_FNV1a_hash_kstring(const kstring_t ks) +{ + const khint_t offset_basis = 2166136261; + const khint_t FNV_prime = 16777619; + khint_t h = offset_basis; + size_t i; + for (i = 0; i < ks.l; i++) + h = (h ^ (uint8_t) ks.s[i]) * FNV_prime; + return h; +} + /*! @function @abstract Interface to kstring hash function. @param key Pointer to a khash; permits hashing on non-nul terminated strings. @return The hash value [khint_t] */ -#define kh_kstr_hash_func(key) __ac_X31_hash_kstring(key) +#define kh_kstr_hash_func(key) __ac_FNV1a_hash_kstring(key) /*! @function @abstract kstring comparison function */ @@ -604,6 +670,19 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) code; \ } } +/*! 
@function + @abstract Gather hash table statistics + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param empty[out] Number of empty hash bins + @param deleted[out] Number of hash bins with the deleted flag + @param hist_size[out] Size of @p hist array + @param hist[out] Probe count histogram + @return 0 on success; -1 on failure + */ +#define kh_stats(name, h, empty, deleted, hist_size, hist) \ + kh_stats_##name(h, empty, deleted, hist_size, hist) + /* More convenient interfaces */ /*! @function diff --git a/htslib/kstring.h b/htslib/kstring.h index 53a19806d..ebb2f9363 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -1,7 +1,7 @@ /* The MIT License Copyright (C) 2011 by Attractive Chaos - Copyright (C) 2013-2014, 2016, 2018-2020, 2022 Genome Research Ltd. + Copyright (C) 2013-2014, 2016, 2018-2020, 2022, 2024 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -375,17 +375,63 @@ static inline int kputw(int c, kstring_t *s) static inline int kputll(long long c, kstring_t *s) { - char buf[32]; - int i, l = 0; - unsigned long long x = c; - if (c < 0) x = -x; - do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); - if (c < 0) buf[l++] = '-'; - if (ks_resize(s, s->l + l + 2) < 0) - return EOF; - for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; - s->s[s->l] = 0; - return 0; + // Worst case expansion. One check reduces function size + // and aids inlining chance. Memory overhead is minimal. 
+ if (ks_resize(s, s->l + 23) < 0) + return EOF; + + unsigned long long x = c; + if (c < 0) { + x = -x; + s->s[s->l++] = '-'; + } + + if (x <= UINT32_MAX) + return kputuw(x, s); + + static const char kputull_dig2r[] = + "00010203040506070809" + "10111213141516171819" + "20212223242526272829" + "30313233343536373839" + "40414243444546474849" + "50515253545556575859" + "60616263646566676869" + "70717273747576777879" + "80818283848586878889" + "90919293949596979899"; + unsigned int l, j; + char *cp; + + // Find out how long the number is (could consider clzll) + uint64_t m = 1; + l = 0; + if (sizeof(long long)==sizeof(uint64_t) && x >= 10000000000000000000ULL) { + // avoids overflow below + l = 20; + } else { + do { + l++; + m *= 10; + } while (x >= m); + } + + // Add digits two at a time + j = l; + cp = s->s + s->l; + while (x >= 10) { + const char *d = &kputull_dig2r[2*(x%100)]; + x /= 100; + memcpy(&cp[j-=2], d, 2); + } + + // Last one (if necessary). We know that x < 10 by now. + if (j == 1) + cp[0] = x + '0'; + + s->l += l; + s->s[s->l] = 0; + return 0; } static inline int kputl(long c, kstring_t *s) { diff --git a/htslib/vcf.h b/htslib/vcf.h index e60911ab5..9a36cab05 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -596,7 +596,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). int bcf_hdr_append(bcf_hdr_t *h, const char *line); HTSLIB_EXPORT - int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...); + int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...) + HTS_FORMAT(HTS_PRINTF_FMT, 2, 3); /** VCF version, e.g. 
VCFv4.2 */ HTSLIB_EXPORT diff --git a/sam.c b/sam.c index 1a5519410..7e58da6e7 100644 --- a/sam.c +++ b/sam.c @@ -104,7 +104,7 @@ const int8_t bam_cigar_table[256] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; -sam_hdr_t *sam_hdr_init() +sam_hdr_t *sam_hdr_init(void) { sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t)); if (bh == NULL) return NULL; @@ -421,7 +421,7 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, *** BAM alignment I/O *** *************************/ -bam1_t *bam_init1() +bam1_t *bam_init1(void) { return (bam1_t*)calloc(1, sizeof(bam1_t)); } @@ -431,7 +431,8 @@ int sam_realloc_bam_data(bam1_t *b, size_t desired) uint32_t new_m_data; uint8_t *new_data; new_m_data = desired; - kroundup32(new_m_data); + kroundup32(new_m_data); // next power of 2 + new_m_data += 32; // reduces malloc arena migrations? if (new_m_data < desired) { errno = ENOMEM; // Not strictly true but we can't store the size return -1; @@ -672,25 +673,36 @@ hts_pos_t bam_endpos(const bam1_t *b) static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG { bam1_core_t *c = &b->core; - uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes; - uint8_t *CG; - // test where there is a real CIGAR in the CG tag to move - if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0; - cigar0 = bam_get_cigar(b); - if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0; - fake_bytes = c->n_cigar * 4; + // Bail out as fast as possible for the easy case + uint32_t test_CG = BAM_CSOFT_CLIP | (c->l_qseq << BAM_CIGAR_SHIFT); + if (c->n_cigar == 0 || test_CG != *bam_get_cigar(b)) + return 0; + + // The above isn't fool proof - we may have old CIGAR tags that aren't used, + // but this is much less likely so do as a secondary check. + if (c->tid < 0 || c->pos < 0) + return 0; + + // Do we have a CG tag? 
+ uint8_t *CG = bam_aux_get(b, "CG"); int saved_errno = errno; - CG = bam_aux_get(b, "CG"); if (!CG) { if (errno != ENOENT) return -1; // Bad aux data errno = saved_errno; // restore errno on expected no-CG-tag case return 0; } + + // Now we start with the serious work migrating CG to CIGAR + uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, + *cigar0, CG_len, fake_bytes; + cigar0 = bam_get_cigar(b); + fake_bytes = c->n_cigar * 4; if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i')) return 0; // not of type B,I CG_len = le_to_u32(CG + 2); - if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length + // don't move if the real CIGAR length is shorter than the fake cigar length + if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // move from the CG tag to the right position cigar_st = (uint8_t*)cigar0 - b->data; @@ -699,9 +711,12 @@ static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 CG_st = CG - b->data - 2; CG_en = CG_st + 8 + n_cigar4; if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1; - b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place - memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room - memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR + // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place + b->l_data = b->l_data - fake_bytes + n_cigar4; + // insert c->n_cigar-fake_bytes empty space to make room + memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); + // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR + memcpy(b->data + cigar_st, b->data + 
(n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); if (ori_len > CG_en) // move data after the CG tag memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en); b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) @@ -763,27 +778,41 @@ int bam_read1(BGZF *fp, bam1_t *b) { bam1_core_t *c = &b->core; int32_t block_len, ret, i; - uint32_t x[8], new_l_data; + uint32_t new_l_data; + uint8_t tmp[32], *x; b->l_data = 0; - if ((ret = bgzf_read(fp, &block_len, 4)) != 4) { + if ((ret = bgzf_read_small(fp, &block_len, 4)) != 4) { if (ret == 0) return -1; // normal end-of-file else return -2; // truncated } if (fp->is_be) ed_swap_4p(&block_len); if (block_len < 32) return -4; // block_len includes core data - if (bgzf_read(fp, x, 32) != 32) return -3; - if (fp->is_be) { - for (i = 0; i < 8; ++i) ed_swap_4p(x + i); + if (fp->block_length - fp->block_offset > 32) { + // Avoid bgzf_read and a temporary copy to a local buffer + x = (uint8_t *)fp->uncompressed_block + fp->block_offset; + fp->block_offset += 32; + } else { + x = tmp; + if (bgzf_read(fp, x, 32) != 32) return -3; } - c->tid = x[0]; c->pos = (int32_t)x[1]; - c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; + + c->tid = le_to_u32(x); + c->pos = le_to_i32(x+4); + uint32_t x2 = le_to_u32(x+8); + c->bin = x2>>16; + c->qual = x2>>8&0xff; + c->l_qname = x2&0xff; c->l_extranul = (c->l_qname%4 != 0)? 
(4 - c->l_qname%4) : 0; - c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; - c->l_qseq = x[4]; - c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7]; + uint32_t x3 = le_to_u32(x+12); + c->flag = x3>>16; + c->n_cigar = x3&0xffff; + c->l_qseq = le_to_u32(x+16); + c->mtid = le_to_u32(x+20); + c->mpos = le_to_i32(x+24); + c->isize = le_to_i32(x+28); new_l_data = block_len - 32 + c->l_extranul; if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4; @@ -793,19 +822,20 @@ int bam_read1(BGZF *fp, bam1_t *b) if (realloc_bam_data(b, new_l_data) < 0) return -4; b->l_data = new_l_data; - if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4; - if (b->data[c->l_qname - 1] != '\0') { // Try to fix missing NUL termination + if (bgzf_read_small(fp, b->data, c->l_qname) != c->l_qname) return -4; + if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination if (fixup_missing_qname_nul(b) < 0) return -4; } for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0'; c->l_qname += c->l_extranul; if (b->l_data < c->l_qname || - bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) + bgzf_read_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) return -4; if (fp->is_be) swap_data(c, b->l_data, b->data, 0); if (bam_tag2cigar(b, 0, 0) < 0) return -4; + // TODO: consider making this conditional if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency hts_pos_t rlen, qlen; bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen); @@ -852,15 +882,15 @@ int bam_write1(BGZF *fp, const bam1_t *b) if (fp->is_be) { for (i = 0; i < 8; ++i) ed_swap_4p(x + i); y = block_len; - if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0); + if (ok) ok = (bgzf_write_small(fp, ed_swap_4p(&y), 4) >= 0); swap_data(c, b->l_data, b->data, 1); } else { - if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0); + if (ok) ok = (bgzf_write_small(fp, &block_len, 4) >= 0); } - if 
(ok) ok = (bgzf_write(fp, x, 32) >= 0); - if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0); + if (ok) ok = (bgzf_write_small(fp, x, 32) >= 0); + if (ok) ok = (bgzf_write_small(fp, b->data, c->l_qname - c->l_extranul) >= 0); if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally - if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); + if (ok) ok = (bgzf_write_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag uint8_t buf[8]; uint32_t cigar_st, cigar_en, cigar[2]; @@ -879,12 +909,12 @@ int bam_write1(BGZF *fp, const bam1_t *b) cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP; u32_to_le(cigar[0], buf); u32_to_le(cigar[1], buf + 4); - if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: SN - if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR - if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I + if (ok) ok = (bgzf_write_small(fp, buf, 8) >= 0); // write cigar: SN + if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR + if (ok) ok = (bgzf_write_small(fp, "CGBI", 4) >= 0); // write CG:B,I u32_to_le(c->n_cigar, buf); - if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length - if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR + if (ok) ok = (bgzf_write_small(fp, buf, 4) >= 0); // write the true CIGAR length + if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR } if (fp->is_be) swap_data(c, b->l_data, b->data, 0); return ok? 
4 + block_len : -1; @@ -2917,7 +2947,7 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) } else c->tid = -1; // pos - c->pos = hts_str2uint(p, &p, 63, &overflow) - 1; + c->pos = hts_str2uint(p, &p, 62, &overflow) - 1; if (*p++ != '\t') goto err_ret; if (c->pos < 0 && c->tid >= 0) { _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped"); @@ -2960,15 +2990,16 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX)); } // mpos - c->mpos = hts_str2uint(p, &p, 63, &overflow) - 1; + c->mpos = hts_str2uint(p, &p, 62, &overflow) - 1; if (*p++ != '\t') goto err_ret; if (c->mpos < 0 && c->mtid >= 0) { _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped"); c->mtid = -1; } // tlen - c->isize = hts_str2int(p, &p, 64, &overflow); + c->isize = hts_str2int(p, &p, 63, &overflow); if (*p++ != '\t') goto err_ret; + _parse_err(overflow, "number outside allowed range"); // seq q = _read_token(p); if (strcmp(q, "*")) { @@ -4297,6 +4328,9 @@ static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { fd->curr_bam = NULL; fd->curr_idx = 0; + // Consider prefetching next record? I.e. + // } else { + // __builtin_prefetch(&b_array[fd->curr_idx], 0, 3); } ret = 0; diff --git a/sam_internal.h b/sam_internal.h index b1fce9fe4..750c597b2 100644 --- a/sam_internal.h +++ b/sam_internal.h @@ -1,6 +1,6 @@ /* sam_internal.h -- internal functions; not part of the public API. - Copyright (C) 2019-2020 Genome Research Ltd. + Copyright (C) 2019-2020, 2023-2024 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -25,6 +25,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include + #include "htslib/sam.h" #ifdef __cplusplus @@ -68,7 +69,7 @@ static inline int possibly_expand_bam_data(bam1_t *b, size_t bytes) { * for (i = 0; i < len; i++) * seq[i] = seq_nt16_str[bam_seqi(nib, i)]; */ -static inline void nibble2base(uint8_t *nib, char *seq, int len) { +static inline void nibble2base_default(uint8_t *nib, char *seq, int len) { static const char code2base[512] = "===A=C=M=G=R=S=V=T=W=Y=H=K=D=B=N" "A=AAACAMAGARASAVATAWAYAHAKADABAN" @@ -98,6 +99,21 @@ static inline void nibble2base(uint8_t *nib, char *seq, int len) { seq[i] = seq_nt16_str[bam_seqi(nib, i)]; } +#if defined HAVE_ATTRIBUTE_CONSTRUCTOR && \ + ((defined __x86_64__ && defined HAVE_ATTRIBUTE_TARGET && defined HAVE_BUILTIN_CPU_SUPPORT_SSSE3) || \ + (defined __ARM_NEON)) +#define BUILDING_SIMD_NIBBLE2BASE +#endif + +static inline void nibble2base(uint8_t *nib, char *seq, int len) { +#ifdef BUILDING_SIMD_NIBBLE2BASE + extern void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len); + htslib_nibble2base(nib, seq, len); +#else + nibble2base_default(nib, seq, len); +#endif +} + #ifdef __cplusplus } #endif diff --git a/samples/DEMO.md b/samples/DEMO.md index 911792899..98c9981b8 100644 --- a/samples/DEMO.md +++ b/samples/DEMO.md @@ -88,18 +88,24 @@ alignment. It adds count of ATCGN base as an array in auxiliary data, BA:I. Modified data is written on standard output. Write_fast - This application showcases the fasta/fastq data write. It appends -a dummy data to given file. +data to given file. Index_write - This application showcases the creation of index along with output creation. Based on file type and shift, it creates bai, csi or crai files. +Index_fast - This application showcases the index creation on fasta/fastq +reference files. + Read_reg - This application showcases the usage of region specification in alignment read. 
-Read_multireg - This application showcases the usage of mulitple regionn +Read_multireg - This application showcases the usage of multiple region specification in alignment read. +Read_fast_index - This application showcases the fasta/fastq data read using +index. + Pileup - This application showcases the pileup api, where all alignments covering a reference position are accessed together. It displays the bases covering each position on standard output. @@ -131,6 +137,15 @@ handling. It saves the read1 and read2 as separate files in given directory, one as sam and other as bam. A pool of 4 threads is created and shared for both read and write. +Qtask_ordered - This application showcases the use of queues and threads for +custom processing. Alignments in input file are updated with their GC ratio +on a custom aux tag. The processing may occur in any order but the result is +retrieved in same order as it was queued and saved to disk. + +Qtask_unordered - This application showcases the use of queues and threads +for custom processing. The count of bases and GC ratio are calculated and +displayed. The order of counting is irrelevant and hence ordered retrieval is +not used. ## Building the sample apps @@ -173,7 +188,7 @@ sam_read1 api. samFile pointer, header and bam storage are to be passed as argument and it returns 0 on success, -1 on end of file and < -1 in case of errors. -The bam storage has to be initialised using bam_init1 api before the call and +The bam storage has to be initialized using bam_init1 api before the call and can be reused for successive reads. Once done, it needs to be destroyed using bam_destroy1. The member field named core - bam1_core_t - in bam storage, bam1_t, has the sequence data in an easily accessible way. Using the fields @@ -185,30 +200,31 @@ and macros, data can easily be read from it. { ... //initialize - if (!(bamdata = bam_init1())) { - ... + if (!(bamdata = bam_init1())) + ... 
// error //open input files - r reading - if (!(infile = sam_open(inname, "r"))) { - ... + if (!(infile = sam_open(inname, "r"))) + ... // error //read header - if (!(in_samhdr = sam_hdr_read(infile))) { - ... + if (!(in_samhdr = sam_hdr_read(infile))) + ... // error + //read data, check flags and update count while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { - if (bamdata->core.flag & BAM_FREAD1) { + if (bamdata->core.flag & BAM_FREAD1) cntread1++; - } - ... + ... + //clean up - if (in_samhdr) { + if (in_samhdr) sam_hdr_destroy(in_samhdr); - } - if (infile) { + + if (infile) sam_close(infile); - } - if (bamdata) { + + if (bamdata) bam_destroy1(bamdata); - } + return ret; } Refer: flags_demo.c @@ -255,21 +271,23 @@ set the reference name in the alignment. It returns -ve value on error. int main(int argc, char *argv[]) { ... - if (!(infile = sam_open(inname, "r"))) { - ... + if (!(infile = sam_open(inname, "r"))) + ... // error outfile1 = sam_open(file1, "w"); //as SAM outfile2 = sam_open(file2, "wb"); //as BAM ... - if (!(in_samhdr = sam_hdr_read(infile))) { - ... + if (!(in_samhdr = sam_hdr_read(infile))) + ... // error + //write header if ((sam_hdr_write(outfile1, in_samhdr) == -1) || - (sam_hdr_write(outfile2, in_samhdr) == -1)) { - ... + (sam_hdr_write(outfile2, in_samhdr) == -1)) + ... // error + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { if (bamdata->core.flag & BAM_FREAD1) { if (sam_write1(outfile1, in_samhdr, bamdata) < 0) { - ... + ... // error } Refer: split.c @@ -284,10 +302,11 @@ Below code excerpt shows sam_open_mode api usage. ... //set file open mode based on file name for 1st and as explicit for 2nd if ((sam_open_mode(mode1+1, file1, NULL) == -1) || - (sam_open_mode(mode2+1, file2, "sam.gz") == -1)) { - ... - if (!(infile = sam_open(inname, "r"))) { - ... + (sam_open_mode(mode2+1, file2, "sam.gz") == -1)) + ... // error + if (!(infile = sam_open(inname, "r"))) + ... 
// error + //open output files outfile1 = sam_open(file1, mode1); //as compressed SAM through sam_open outfile2 = sam_open_format(file2, mode2, NULL); //as compressed SAM through sam_open_format @@ -321,7 +340,7 @@ api and used with sam_open_format api to create appropriate CRAM file. hts_parse_format(&fmt2, reffmt2) == -1 || //embed the reference internally hts_parse_format(&fmt3, "cram,embed_ref=2") == -1 || //embed autogenerated reference hts_parse_format(&fmt4, "cram,no_ref=1") == -1) { //no reference data encoding at all - ... + ... // error outfile1 = sam_open_format(file1, "wc", &fmt1); outfile2 = sam_open_format(file2, "wc", &fmt2); ... Refer: cram.c @@ -337,16 +356,20 @@ or explicit format text. This mode buffer can be used with sam_open or can be used with sam_open_format with explicit format information in htsFormat structure. +It is the FASTA format which is mainly in use to store the reference data. + ... - if (!(bamdata = bam_init1())) { - ... - if (!(infile = sam_open(inname, "r"))) { - ... - if (infile->format.format != fasta_format && infile->format.format != fastq_format) { - ... - if (!(in_samhdr = sam_hdr_read(infile))) { - ... - while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (!(bamdata = bam_init1())) + ... // error + if (!(infile = sam_open(inname, "r"))) + ... // error + if (infile->format.format != fasta_format && infile->format.format != fastq_format) + ... // error + if (!(in_samhdr = sam_hdr_read(infile))) + ... // error + + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) + ... // error printf("\nsequence: "); for (c = 0; c < bamdata->core.l_qseq; ++c) { printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(bamdata), c)]); @@ -354,23 +377,22 @@ structure. if (infile->format.format == fastq_format) { printf("\nquality: "); for (c = 0; c < bamdata->core.l_qseq; ++c) { - printf("%c", bam_get_qual(bamdata)[c]); + printf("%c", bam_get_qual(bamdata)[c] + 33); ... Refer: read_fast.c ... char mode[4] = "a"; ... 
- if (sam_open_mode(mode + 1, outname, NULL) < 0) { - ... - if (!(outfile = sam_open(outname, mode))) { - ... - if (bam_set1(bamdata, sizeof("test"), "test", BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, 10, "AACTGACTGA", "1234567890", 0) - < 0) { - ... + if (sam_open_mode(mode + 1, outname, NULL) < 0) + ... // error + if (!(outfile = sam_open(outname, mode))) + ... // error + if (bam_set1(bamdata, strlen(name), name, BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, strlen(data), data, qual, 0) < 0) + ... // error if (sam_write1(outfile, out_samhdr, bamdata) < 0) { printf("Failed to write data\n"); - ... + ... Refer: write_fast.c @@ -388,18 +410,21 @@ line can be retrieved using sam_hdr_find_line_pos or sam_hdr_line_id with position and unique identifier values respectively. ... - if (!(in_samhdr = sam_hdr_read(infile))) { - ... - ret = sam_hdr_find_tag_id(in_samhdr, header, id, idval, tag, &data); + if (!(in_samhdr = sam_hdr_read(infile))) + ... // error ... - ret = sam_hdr_find_line_id(in_samhdr, header, id, idval, &data); + if (tag) + ret = sam_hdr_find_tag_id(in_samhdr, header, id, idval, tag, &data); + else + ret = sam_hdr_find_line_id(in_samhdr, header, id, idval, &data); ... linecnt = sam_hdr_count_lines(in_samhdr, header); - ... - ret = sam_hdr_find_tag_pos(in_samhdr, header, c, tag, &data); - ... - ret = sam_hdr_find_line_pos(in_samhdr, header, c, &data); - ... + ... + if (tag) + ret = sam_hdr_find_tag_pos(in_samhdr, header, c, tag, &data); + else + ret = sam_hdr_find_line_pos(in_samhdr, header, c, &data); + ... Refer: read_header.c This will show the VN tag's value from HD header. @@ -417,16 +442,19 @@ Below code excerpt shows the reference names which has length above given value. ... //iterate and check each reference's length for (pos = 1, c = 0; c < linecnt; ++c) { - if ((ret = sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "LN", &data) == -2)) { - ... + if ((ret = sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "LN", &data)) == -2) + ... 
// error + size = atoll(data.s); if (size < minsize) { //not required continue; } - if (!(id = sam_hdr_line_name(in_samhdr, "SQ", c))) { - //sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "SN", &data) can also do the same! - ... + + //sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "SN", &data) can also do the same! + if (!(id = sam_hdr_line_name(in_samhdr, "SQ", c))) + ... // error + printf("%d,%s,%s\n", pos, id, data.s); ... Refer: read_refname.c @@ -465,8 +493,8 @@ indexing the seq_nt16_str array. printf("MQUAL: %d\n", bamdata->core.qual); //map quality value cigar = bam_get_cigar(bamdata); //retrieves the cigar data for (i = 0; i < bamdata->core.n_cigar; ++i) { //no. of cigar data entries - printf("%d%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i])); //the macros gives the count of operation - and the symbol of operation for given cigar entry + printf("%d%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i])); + //the macros gives the count of operation and the symbol of operation for given cigar entry } printf("\nTLEN/ISIZE: %"PRIhts_pos"\n", bamdata->core.isize); data = bam_get_seq(bamdata); @@ -475,8 +503,8 @@ indexing the seq_nt16_str array. ... for (i = 0; i < bamdata->core.l_qseq ; ++i) { //sequence length printf("%c", seq_nt16_str[bam_seqi(data, i)]); //retrieves the base from (internal compressed) sequence data - ... - printf("%c", bam_get_qual(bamdata)[i]+33); //retrives the quality value + ... + printf("%c", bam_get_qual(bamdata)[i]+33); //retrieves the quality value ... Refer: read_bam.c @@ -516,15 +544,13 @@ given position of the array. ... while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) { - if (i % 2) { //use options alternatively to demonstrate both - //option 1 - get data as string with tag and type - if ((c = bam_aux_get_str(bamdata, tag, &sdata)) == 1) { - printf("%s\n",sdata.s); - ... - //option 2 - get raw data - if (!(data = bam_aux_get(bamdata, tag))) { - ... 
- if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) { + //option 1 - get data as string with tag and type + if ((c = bam_aux_get_str(bamdata, tag, &sdata)) == 1) { + printf("%s\n",sdata.s); + ... + //option 2 - get raw data + if ((data = bam_aux_get(bamdata, tag)) != NULL) { + printauxdata(stdout, bam_aux_type(data), -1, data); ... Refer: read_aux.c @@ -539,8 +565,8 @@ Shows the MD aux tag from alignments. printf("%.2s:%c:", bam_aux_tag(data), NULL != strchr("cCsSiI", bam_aux_type(data)) ? 'i' : bam_aux_type(data)); //macros gets the tag and type of aux data //dump the data - if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) { - ... + printauxdata(stdout, bam_aux_type(data), -1, data); + ... data = bam_aux_next(bamdata, data); //get the next aux data ... Refer: dump_aux.c @@ -563,19 +589,22 @@ sam_hdr_write api does the write of the header data to file. ... //add SQ line with SN as TR1 and TR2 - if (sam_hdr_add_lines(in_samhdr, &sq[0], 0)) { //length as 0 for NULL terminated data - ... + if (sam_hdr_add_lines(in_samhdr, &sq[0], 0)) //length as 0 for NULL terminated data + ... // error + //add RG line with ID as RG1 - if (sam_hdr_add_line(in_samhdr, "RG", "ID", "RG1", "LB", "Test", "SM", "S1", NULL)) { - ... - //add pg line - if (sam_hdr_add_pg(in_samhdr, "add_header", "VN", "Test", "CL", data.s, NULL)) { //NULL is to indicate end of args - ... - if (sam_hdr_add_line(in_samhdr, "CO", "Test data", NULL)) { //NULL is to indicate end of args - ... + if (sam_hdr_add_line(in_samhdr, "RG", "ID", "RG1", "LB", "Test", "SM", "S1", NULL)) + ... // error + + //add PG/CO lines + if (sam_hdr_add_pg(in_samhdr, "add_header", "VN", "Test", "CL", data.s, NULL)) //NULL is to indicate end of args + ... // error + if (sam_hdr_add_line(in_samhdr, "CO", "Test data", NULL)) //NULL is to indicate end of args + ... // error + //write output - if (sam_hdr_write(outfile, in_samhdr) < 0) { - ... + if (sam_hdr_write(outfile, in_samhdr) < 0) + ... 
// error Refer: add_header.c Not all type of header data can be removed but where it is possible, either a @@ -585,14 +614,14 @@ to be used. To remove all lines of a type, header type and unique identifier field tag are to be used. ... - //remove specific line - if (sam_hdr_remove_line_id(in_samhdr, header, id, idval)) { - ... - //remove multiple lines of a header type - if (sam_hdr_remove_lines(in_samhdr, header, id, NULL)) { - ... - if (sam_hdr_write(outfile, in_samhdr) < 0) { - ... + + //remove specific line + if (sam_hdr_remove_line_id(in_samhdr, header, id, idval) < 0) + ... // error + + //remove multiple lines of a header type + if (sam_hdr_remove_lines(in_samhdr, header, id, NULL) < 0) + ... // error Refer: rem_header.c Shows the file content after removing SQ line with SN 2. @@ -640,13 +669,12 @@ be easier than update of existing record. break; case 3:// RNAME case 7:// RNEXT - if ((ret = sam_hdr_name2tid(in_samhdr, val)) < 0) { - ... + if ((ret = sam_hdr_name2tid(in_samhdr, val)) < 0) + ... // error if (field == 3) { //reference bamdata->core.tid = ret; - } - else { + } else { //mate reference bamdata->core.mtid = ret; } @@ -659,20 +687,21 @@ be easier than update of existing record. break; case 6:// CIGAR { - ... + ... //get cigar array and set all data in new bam record - if ((ncigar = sam_parse_cigar(val, NULL, &cigar, &size)) < 0) { - ... + if ((ncigar = sam_parse_cigar(val, NULL, &cigar, &size)) < 0) + ... // error if (bam_set1(newbam, bamdata->core.l_qname, bam_get_qname(bamdata), bamdata->core.flag, bamdata->core.tid, bamdata->core.pos, bamdata->core.qual, ncigar, cigar, bamdata->core.mtid, bamdata->core.mpos, bamdata->core.isize, bamdata->core.l_qseq, (const char*)bam_get_seq(bamdata), - (const char*)bam_get_qual(bamdata), bam_get_l_aux(bamdata)) < 0) { - ... + (const char*)bam_get_qual(bamdata), bam_get_l_aux(bamdata)) < 0) + ... // error + //correct sequence data as input is expected in ascii format and not as compressed inside bam! 
memcpy(bam_get_seq(newbam), bam_get_seq(bamdata), (bamdata->core.l_qseq + 1) / 2); //copy the aux data memcpy(bam_get_aux(newbam), bam_get_aux(bamdata), bam_get_l_aux(bamdata)); - ... + ... break; case 8:// PNEXT bamdata->core.mpos = atoll(val); @@ -681,18 +710,16 @@ be easier than update of existing record. bamdata->core.isize = atoll(val); break; case 10:// SEQ - ... + ... for( c = 0; c < i; ++c) { bam_set_seqi(bam_get_seq(bamdata), c, seq_nt16_table[(unsigned char)val[c]]); } break; case 11:// QUAL - ... - for (c = 0; c < i; ++c) { + ... + for (c = 0; c < i; ++c) val[c] -= 33; //phred score from ascii value - } memcpy(bam_get_qual(bamdata), val, i); - ... Refer: mod_bam.c Shows data with RNAME modified to T2. @@ -707,33 +734,32 @@ present at all, it can be appended using bam_aux_append. //matched to qname, update aux if (!(data = bam_aux_get(bamdata, tag))) { //tag not present append - ... - if (bam_aux_append(bamdata, tag, type, length, (const uint8_t*)val)) { - ... - else { - char auxtype = bam_aux_type(data); + ... // cut: computed length and val based on tag type + if (bam_aux_append(bamdata, tag, type, length, (const uint8_t*)val)) + ... // error + } else { //update the tag with newer value + char auxtype = bam_aux_type(data); switch (type) { case 'f': case 'd': - ... - if (bam_aux_update_float(bamdata, tag, atof(val))) { - ... + ... + if (bam_aux_update_float(bamdata, tag, atof(val))) + ... // error case 'C': case 'S': case 'I': - ... - if (bam_aux_update_int(bamdata, tag, atoll(val))) { - ... + ... + if (bam_aux_update_int(bamdata, tag, atoll(val))) + ... // error case 'Z': - ... - if (bam_aux_update_str(bamdata, tag, length, val)) { - ... + ... + if (bam_aux_update_str(bamdata, tag, length, val)) + ... // error case 'A': - ... + ... //update the char data directly on buffer *(data+1) = val[0]; - ... Refer: mod_aux.c Shows the given record's MD tag set to Test. @@ -743,12 +769,14 @@ Shows the given record's MD tag set to Test. 
The array aux fields can be updated using bam_aux_update_array api. ... - if (bam_aux_update_array(bamdata, "BA", 'I', sizeof(cnt)/sizeof(cnt[0]), cnt)) { - ... + if (bam_aux_update_array(bamdata, "BA", 'I', sizeof(cnt)/sizeof(cnt[0]), cnt)) + ... // error Refer: mod_aux_ba.c Shows the records updated with an array of integers, containing count of ACGT -and N in that order. +and N in that order. The bases are decoded before counting for the sake of +simplicity. Refer to qtask_ordered.c for a better counting method, where decoding +is done outside the loop. ./mod_aux_ba samtools/test/mpileup/mpileup.1.bam @@ -761,14 +789,14 @@ can be read easily. There are different type of indices, BAI, CSI, CRAI, TBI, FAI etc. and are usually used with iterators. Indexing of plain/textual files are not supported, compressed SAM&FASTA/Q, BAM, -and CRAM files can be indexed. CRAM files are indexed as .crai and the other two -can be indexed as .bai or .csi files. Each of these types have different -internal representations of the index information. Bai uses a fixed -configuration values where as csi has them dynamically updated based on the -alignment data. +and CRAM files can be indexed. CRAM files are indexed as .crai and the others +as .bai, .csi, .fai etc. Each of these types has a different internal +representation of the index information. Bai uses fixed configuration values, +whereas csi has them dynamically updated based on the alignment data. Indexes can be created either with save of alignment data or explicitly by -read of existing alignment file. +read of existing alignment file for alignment data (SAM/BAM/CRAM). For reference +data (FASTA) the index has to be created explicitly. To create index along with alignment write, the sam_idx_init api need to be invoked before the start of alignment data write. This api takes the output @@ -777,16 +805,17 @@ index, the min shift has to be 0. At the end of write, sam_idx_save api need to be invoked to save the index. 
- //write header - if (sam_hdr_write(outfile, in_samhdr)) { ... + //write header + if (sam_hdr_write(outfile, in_samhdr)) + ... // error // initialize indexing, before start of write - if (sam_idx_init(outfile, in_samhdr, size, fileidx)) { - ... - if (sam_write1(outfile, in_samhdr, bamdata) < 0) { - ... - if (sam_idx_save(outfile)) { - ... + if (sam_idx_init(outfile, in_samhdr, size, fileidx)) + ... // error + if (sam_write1(outfile, in_samhdr, bamdata) < 0) + ... // error + if (sam_idx_save(outfile)) + ... // error Refer:index_write.c Creates mpileup.1.bam and mpileup.1.bam.bai in /tmp/. @@ -803,6 +832,20 @@ The sam_index_build2 api takes the index file path as well and gives more control than the previous one. The sam_index_build3 api provides an option to configure the number of threads in index creation. +Index for reference data can be created using fai_build3 api. This creates +index file with .fai extension. If the file is bgzip-ped, a .gzi file is +created as well. It takes the path to input file and that of fai and gzi files. +When fai/gzi path are NULL, they are created along with input file. +These index files will be useful for reference data access. + + ... + if (fai_build3(filename, NULL, NULL) == -1) + ... // error +Refer: index_fast.c + +A tabix index can be created for compressed vcf/sam/bed and other data using +tbx_index_build. It is mainly used with vcf and non-sam type files. + ### Read with iterators @@ -849,18 +892,19 @@ sam_itr_destroy and hts_idx_destroy apis does this. ... //load index file - if (!(idx = sam_index_load2(infile, inname, idxfile))) { - ... + if (!(idx = sam_index_load2(infile, inname, idxfile))) + ... // error //create iterator - if (!(iter = sam_itr_querys(idx, in_samhdr, region))) { - ... + if (!(iter = sam_itr_querys(idx, in_samhdr, region))) + ... // error + //read using iterator - while ((c = sam_itr_next(infile, iter, bamdata)) >= 0) { - ... - if (iter) { + while ((c = sam_itr_next(infile, iter, bamdata)) >= 0) + ... 
// error + + if (iter) sam_itr_destroy(iter); - } - if (idx) { + if (idx) hts_idx_destroy(idx); ... Refer:index_reg_read.c @@ -891,19 +935,20 @@ itself. ... //load index file, assume it to be present in same location - if (!(idx = sam_index_load(infile, inname))) { - ... + if (!(idx = sam_index_load(infile, inname))) + ... // error //create iterator - if (!(iter = sam_itr_regarray(idx, in_samhdr, regions, regcnt))) { - ... + if (!(iter = sam_itr_regarray(idx, in_samhdr, regions, regcnt))) + ... // error if (regions) { //can be freed as it is no longer required free(regions); regions = NULL; } + //get required area - while ((c = sam_itr_multi_next(infile, iter, bamdata) >= 0)) { - ... + while ((c = sam_itr_multi_next(infile, iter, bamdata) >= 0)) + ... // process bamdata Refer:index_multireg_read.c With compressed sample.sam and 2 regions from reference T1 (30 to 32) and 1 @@ -921,13 +966,70 @@ hts_idx_destroy. The hts_reglist_t* array passed is destroyed by the library on iterator destroy. The regions array (array of char array/string) needs to be destroyed by the user itself. +For fasta/fastq files, the index has to be loaded using fai_load3_format which +takes the file, index file names and format. With single region specification +fai_fetch64 can be used to get bases, and fai_fetchqual64 for quality in case +of fastq data. With multiple region specification, with comma separation, +faidx_fetch_seq64 and faidx_fetch_qual64 does the job. Regions has to be parsed +using fai_parse_region in case of multiregion specifications. fai_adjust_region +is used to adjust the start-end points based on available data. + +Below excerpt shows fasta/q access with single and multiregions, + + ... + //load index + if (!(idx = fai_load3_format(inname, NULL, NULL, FAI_CREATE, fmt))) + ... // error + + ... + if (!usemulti) { + //get data from single given region + if (!(data = fai_fetch64(idx, region, &len))) + ... 
// region not found + + printf("Data: %"PRId64" %s\n", len, data); + free((void*)data); + //get quality for fastq type + if (fmt == FAI_FASTQ) { + if (!(data = fai_fetchqual64(idx, region, &len))) + ... // region not found + ... + + } else { // usemulti + //parse, get each region and get data for each + while ((remaining = fai_parse_region(idx, region, &tid, &beg, &end, HTS_PARSE_LIST))) { //here expects regions as csv + //parsed the region, correct end points based on actual data + if (fai_adjust_region(idx, tid, &beg, &end) == -1) + ... // error + //get data for given region + if (!(data = faidx_fetch_seq64(idx, faidx_iseq(idx, tid), beg, end, &len))) + ... // region not found + + printf("Data: %"PRIhts_pos" %s\n", len, data); + free((void*)data); + data = NULL; + //get quality data for fastq + if (fmt == FAI_FASTQ) { + if (!(data = faidx_fetch_qual64(idx, faidx_iseq(idx, tid), beg, end, &len))) + ... // error + printf("Qual: %"PRIhts_pos" %s\n", len, data); + free((void*)data); + ... + region = remaining; //parse remaining region defs + + ... + if (idx) { + fai_destroy(idx); + ... +Refer: read_fast_index.c + ### Pileup and MPileup Pileup shows the transposed view of the SAM alignment data, i.e. it shows the -the reference positions and bases which cover that position through different -reads side by side. MPileup facilitates the piling up of multiple sam files -against each other and same reference at the same time. +reference positions and bases which cover that position through different reads +side by side. MPileup facilitates the piling up of multiple sam files against +each other and same reference at the same time. Mpileup has replaced the pileup. The input expects the data to be sorted by position. @@ -978,8 +1080,8 @@ above the cache limit are discarded. Once done, the pileup iterator to be discarded by sam_plp_destroy api. ... - if (!(plpiter = bam_plp_init(readdata, &conf))) { - ... + if (!(plpiter = bam_plp_init(readdata, &conf))) + ... 
// error //set constructor destructor callbacks bam_plp_constructor(plpiter, plpconstructor); bam_plp_destructor(plpiter, plpdestructor); @@ -1011,7 +1113,7 @@ Once done, the pileup iterator to be discarded by sam_plp_destroy api. printf("?"); } ... - if (plpiter) { + if (plpiter) bam_plp_destroy(plpiter); ... Refer:pileup.c @@ -1067,8 +1169,8 @@ above the cache limit are discarded. Once done, the pileup iterator to be discarded by sam_mplp_destroy api. ... - if (!(mplpiter = bam_mplp_init(argc - 1, readdata, (void**) conf))) { - ... + if (!(mplpiter = bam_mplp_init(argc - 1, readdata, (void**) conf))) + ... // error //set constructor destructor callbacks bam_mplp_constructor(mplpiter, plpconstructor); bam_mplp_destructor(mplpiter, plpdestructor); @@ -1134,13 +1236,13 @@ end of processing, the state need to be released using hts_base_mod_state_free api. ... - if (!(ms = hts_base_mod_state_alloc())) { - ... + if (!(ms = hts_base_mod_state_alloc())) + ... // error while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) { - ... - if (bam_parse_basemod(bamdata, ms)) { - ... + ... + if (bam_parse_basemod(bamdata, ms)) + ... // error bm = bam_mods_recorded(ms, &cnt); for (k = 0; k < cnt; ++k) { printf("%c", bm[k]); @@ -1191,7 +1293,7 @@ api. } } ... - if (ms) { + if (ms) hts_base_mod_state_free(ms); ... Refer:modstate.c @@ -1221,7 +1323,7 @@ api. { ... if (!(plpiter = bam_plp_init(readdata, &conf))) { - ... + ... // error //set constructor destructor callbacks bam_plp_constructor(plpiter, plpconstructor); bam_plp_destructor(plpiter, plpdestructor); @@ -1238,11 +1340,11 @@ api. } /*invoke bam mods_mods_at_qpos before bam_plp_insertion_mod that the base modification is retrieved before change in pileup pos thr' plp_insertion_mod call*/ - if ((modlen = bam_mods_at_qpos(plp[j].b, plp[j].qpos, plp[j].cd.p, mods, NMODS)) == -1) { - ... + if ((modlen = bam_mods_at_qpos(plp[j].b, plp[j].qpos, plp[j].cd.p, mods, NMODS)) == -1) + ... 
// error //use plp_insertion/_mod to get insertion and del at the same position - if ((inslen = bam_plp_insertion_mod(&plp[j], (hts_base_mod_state*)plp[j].cd.p, &insdata, &dellen)) == -1) { - ... + if ((inslen = bam_plp_insertion_mod(&plp[j], (hts_base_mod_state*)plp[j].cd.p, &insdata, &dellen)) == -1) + ... // error //start and end are displayed in UPPER and rest on LOWER, only 1st modification considered //base and modification printf("%c%c%c", plp[j].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : @@ -1260,7 +1362,7 @@ api. printf("-%d", dellen); for (k = 0; k < dellen; ++k) { printf("?"); - ... + ... else if (plp[j].indel < 0) { //deletion printf("%d", plp[j].indel); @@ -1285,17 +1387,18 @@ data and a combination of flags for the required fields can be passed with CRAM_OPT_REQUIRED_FIELDS to this api. ... - //select required field alone, this is useful for CRAM alone - if (hts_set_opt(infile, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG) < 0) { - ... - //read header - in_samhdr = sam_hdr_read(infile); + //select required field alone, this is useful for CRAM alone + if (hts_set_opt(infile, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG) < 0) + ... // error + + //read header + in_samhdr = sam_hdr_read(infile); ... //read data, check flags and update count while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { - if (bamdata->core.flag & BAM_FREAD1) { + if (bamdata->core.flag & BAM_FREAD1) cntread1++; - ... + ... Refer: flags_htsopt_field.c @@ -1303,48 +1406,248 @@ Refer: flags_htsopt_field.c The HTSLib api supports thread pooling for better performance. There are a few ways in which this can be used. The pool can be made specific for a file or a -generic pool can be created and shared across multiple files. Another way to -use thread pool is to schedule tasks explicitly to queues which gets executed -using threads in pool. +generic pool can be created and shared across multiple files. Thread pool can +also be used to execute user defined tasks. 
The tasks are to be added to queue, +threads in pool executes them and results can be queued back if required. To have a thread pool specific for a file, hts_set_opt api can be used with the -file pointer, HTS_OPT_NTHREADS and the number of threads to use in the pool. -Closure of file releases the thread pool as well. To have a thread pool which -can be shared across different files, it needs to be initialized using -hts_tpool_init api, passing number of threads as argument. This thread pool can -be associated with a file using hts_set_opt api. The file pointer, -HTS_OPT_THREAD_POOL and the thread pool address are to be passed as arguments -to api. The thread pool has to be released with hts_tpool_destroy. +file pointer, HTS_OPT_NTHREADS and the number of threads to be in the pool. +Thread pool is released on closure of file. To have a thread pool which can be +shared across different files, it needs to be initialized using hts_tpool_init +api, passing number of threads as an argument. This thread pool can be +associated with a file using hts_set_opt api. The file pointer, +HTS_OPT_THREAD_POOL and the thread pool address are to be passed as arguments to +the api. The thread pool has to be released with hts_tpool_destroy. + +The samples are trivial ones to showcase the usage of api. The number of threads +to use for different tasks has to be identified based on complexity and +parallelism of the task. Below excerpt shows file specific thread pool, ... 
//create file specific threads - if (hts_set_opt(infile, HTS_OPT_NTHREADS, 2) < 0 || //2 thread specific for reading + if (hts_set_opt(infile, HTS_OPT_NTHREADS, 1) < 0 || //1 thread specific for reading hts_set_opt(outfile1, HTS_OPT_NTHREADS, 1) < 0 || //1 thread specific for sam write - hts_set_opt(outfile2, HTS_OPT_NTHREADS, 1) < 0) { //1 thread specific for bam write + hts_set_opt(outfile2, HTS_OPT_NTHREADS, 2) < 0) { //2 thread specific for bam write printf("Failed to set thread options\n"); goto end; } Refer: split_thread1.c -Below excerpt shows thread pool shared across files, +Below excerpt shows a thread pool shared across files, ... //create a pool of 4 threads - if (!(tpool.pool = hts_tpool_init(4))) { - ... + if (!(tpool.pool = hts_tpool_init(4))) + ... // error //share the pool with all the 3 files if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0 || hts_set_opt(outfile1, HTS_OPT_THREAD_POOL, &tpool) < 0 || hts_set_opt(outfile2, HTS_OPT_THREAD_POOL, &tpool) < 0) { - ... - if (tpool.pool) { + ... // error + + ... // do something + + //tidy up at end + if (tpool.pool) hts_tpool_destroy(tpool.pool); - } ... Refer: split_thread2.c +Note that it is important to analyze the task in hand to decide the number of +threads to be used. As an example, if the number of threads for reading is set +to 2 and bam write to 1, keeping total number of threads the same, the +performance may decrease as bam decoding is easier than encoding. + +Custom task / user defined functions can be performed on data using thread pool +and for that, the task has to be scheduled to a queue. Thread pool associated +with the queue will perform the task. There can be multiple pools and queues. +The order of execution of threads are decided based on many factors and load on +each task may vary, so the completion of the tasks may not be in the order of +their queueing. 
The queues can be used in two different ways, one where the +result is enqueued to queue again to be read in same order as initial queueing, +second where the results are not enqueued and completed possibly in a different +order than initial queueing. Explicitly created threads can also be used along +with hts thread pool usage. + +hts_tpool_process_init initializes the queue / process, associates a queue with +thread pool and reserves space for given number of tasks on queue. It takes a +parameter indicating whether the result needs to be enqueued for retrieval or +not. If the result is enqueued, it is retrieved in the order of scheduling of +task. Another parameter sets the maximum number of slots for tasks in queue, +usually 2 times the number of threads are used. The input and output have their +own queues and they grow as required up to the max set. hts_tpool_dispatch api +enqueues the task to the queue. The api blocks when there is no space in queue. +This behavior can be controlled with hts_tpool_dispatch2 api. The queue can be +reset using hts_tpool_process_reset api where all tasks are discarded. The api +hts_tpool_dispatch3 supports configuring cleanup routines which are to be run +when reset occurs on the queue. hts_tpool_process_flush api can ensure that +all the piled up tasks are processed, a possible case when the queueing and +processing happen at different speeds. hts_tpool_process_shutdown api stops the +processing of queue. + +There are a few apis which let the user check the status of processing. The +api hts_tpool_process_empty shows whether all the tasks are completed or not. +The api hts_tpool_process_sz gives the number of tasks, at different states of +processing. The api hts_tpool_process_len gives the number of results in output +queue waiting to be collected. + +The order of execution of tasks depends on the number of threads involved and +how the threads are scheduled by operating system. 
When the results are enqueued +back to queue, they are read in same order of enqueueing of task and in that +case the order of execution will not be noticed. When the results are not +enqueued the results are available right away and the order of execution may be +noticeable. Based on the nature of task and the need of order maintenance, users +can select either form of queueing. + +Below excerpts show the usage of queues and threads in both cases. In the 1st, +alignments are updated with an aux tag indicating GC ratio. The order of data +has to be maintained even after update, hence the result queueing is used to +ensure same order as initial. A number of alignments are bunched together and +reuse of allocated memory is made to make it perform better. A sentinel job is +used to identify the completion of all tasks at the result collection side. + ... + void *thread_ordered_proc(void *args) + { + ... + for ( i = 0; i < bamdata->count; ++i) { + ... + for (pos = 0; pos < bamdata->bamarray[i]->core.l_qseq; ++pos) + count[bam_seqi(data,pos)]++; + ... + gcratio = (count[2] /*C*/ + count[4] /*G*/) / (float) (count[1] /*A*/ + count[8] /*T*/ + count[2] + count[4]); + + if (bam_aux_append(bamdata->bamarray[i], "xr", 'f', sizeof(gcratio), (const uint8_t*)&gcratio) < 0) { + + ... + void *threadfn_orderedwrite(void *args) + { + ... + //get result and write; wait if no result is in queue - until shutdown of queue + while (tdata->result == 0 && + (r = hts_tpool_next_result_wait(tdata->queue)) != NULL) { + bamdata = (data*) hts_tpool_result_data(r); + ... + for (i = 0; i < bamdata->count; ++i) { + if (sam_write1(tdata->outfile, tdata->samhdr, bamdata->bamarray[i]) < 0) { + ... // error + ... + hts_tpool_delete_result(r, 0); //release the result memory + ... + + // Shut down the process queue. If we stopped early due to a write failure, + // this will signal to the other end that something has gone wrong. + hts_tpool_process_shutdown(tdata->queue); + + ... 
+ int main(int argc, char *argv[]) + { + ... + if (!(pool = hts_tpool_init(cnt))) //thread pool + ... // error + tpool.pool = pool; //to share the pool for file read and write as well + //queue to use with thread pool, for task and results + if (!(queue = hts_tpool_process_init(pool, cnt * 2, 0))) { + ... + //share the thread pool with i/o files + if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0 || + hts_set_opt(outfile, HTS_OPT_THREAD_POOL, &tpool) < 0) + ... // error + if (pthread_create(&thread, NULL, threadfn_orderedwrite, &twritedata)) + ... // error + while (c >= 0) { + if (!(bamdata = getbamstorage(chunk, &bamcache))) + ... // error + for (cnt = 0; cnt < bamdata->maxsize; ++cnt) { + c = sam_read1(infile, in_samhdr, bamdata->bamarray[cnt]); + ... + if (hts_tpool_dispatch3(pool, queue, thread_ordered_proc, bamdata, + cleanup_bamstorage, cleanup_bamstorage, + 0) == -1) + ... // error + ... + if (queue) { + if (-1 == c) { + // EOF read, send a marker to tell the threadfn_orderedwrite() + // function to shut down. + if (hts_tpool_dispatch(pool, queue, thread_ordered_proc, + NULL) == -1) { + ... // error + hts_tpool_process_shutdown(queue); + + ... + // Wait for threadfn_orderedwrite to finish. + if (started_thread) { + pthread_join(thread, NULL); + + ... + if (queue) { + // Once threadfn_orderedwrite has stopped, the queue can be + // cleaned up. + hts_tpool_process_destroy(queue); + } + ... +Refer: qtask_ordered.c + +In this 2nd, the bases are counted and GC ratio of whole file is calculated. +Order in which bases are counted is not relevant and no result queue required. +The queue is created as input only. + ... + void *thread_unordered_proc(void *args) + { + ... + for ( i = 0; i < bamdata->count; ++i) { + data = bam_get_seq(bamdata->bamarray[i]); + for (pos = 0; pos < bamdata->bamarray[i]->core.l_qseq; ++pos) + counts[bam_seqi(data, pos)]++; + + ... 
+ //update result and add the memory block for reuse + pthread_mutex_lock(&bamdata->cache->lock); + for (i = 0; i < 16; i++) { + bamdata->bases->counts[i] += counts[i]; + } + + bamdata->next = bamdata->cache->list; + bamdata->cache->list = bamdata; + pthread_mutex_unlock(&bamdata->cache->lock); + + ... + int main(int argc, char *argv[]) + { + ... + if (!(queue = hts_tpool_process_init(pool, cnt * 2, 1))) + ... // error + c = 0; + while (c >= 0) { + ... + for (cnt = 0; cnt < bamdata->maxsize; ++cnt) { + c = sam_read1(infile, in_samhdr, bamdata->bamarray[cnt]); + + ... + if (c >= -1 ) { + ... + if (hts_tpool_dispatch3(pool, queue, thread_unordered_proc, bamdata, + cleanup_bamstorage, cleanup_bamstorage, + 0) == -1) + ... // error + ... + if (-1 == c) { + // EOF read, ensure all are processed, waits for all to finish + if (hts_tpool_process_flush(queue) == -1) { + fprintf(stderr, "Failed to flush queue\n"); + } else { //all done + //refer seq_nt16_str to find position of required bases + fprintf(stdout, "GCratio: %f\nBase counts:\n", + (gccount.counts[2] /*C*/ + gccount.counts[4] /*G*/) / (float) + (gccount.counts[1] /*A*/ + gccount.counts[8] /*T*/ + + gccount.counts[2] + gccount.counts[4])); + ... + if (queue) { + hts_tpool_process_destroy(queue); + } +Refer: qtask_unordered.c ## More Information @@ -1421,9 +1724,9 @@ be destroyed as many times with sam_hdr_destroy api. ### Index Indices need the data to be sorted by position. They can be of different -types with extension .bai, .csi or .tbi for compressed SAM/BAM files and .crai -for CRAM files. The index name can be passed along with the alignment file -itself by appending a specific character sequence. The apis can detect this +types with extension .bai, .csi or .tbi for compressed SAM/BAM/VCF files and +.crai for CRAM files. The index name can be passed along with the alignment +file itself by appending a specific character sequence. The apis can detect this sequence and extract the index path. 
##idx## is the sequence which separates the file path and index path. diff --git a/samples/Makefile b/samples/Makefile index 40991d78f..ee632e3ad 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -2,7 +2,7 @@ HTS_DIR = ../ include $(HTS_DIR)/htslib_static.mk CC = gcc -CFLAGS = -Wall -g -O0 +CFLAGS = -Wall -O2 #to statically link to libhts LDFLAGS = $(HTS_DIR)/libhts.a -L$(HTS_DIR) $(HTSLIB_static_LDFLAGS) $(HTSLIB_static_LIBS) @@ -13,94 +13,105 @@ LDFLAGS = $(HTS_DIR)/libhts.a -L$(HTS_DIR) $(HTSLIB_static_LDFLAGS) $(HTSLIB_sta PRGS = flags split split2 cram read_fast read_header read_ref read_bam \ read_aux dump_aux add_header rem_header update_header mod_bam mod_aux \ mod_aux_ba write_fast idx_on_write read_reg read_multireg pileup \ - mpileup modstate pileup_mod flags_field split_t1 split_t2 + mpileup modstate pileup_mod flags_field split_t1 split_t2 \ + read_fast_i qtask_ordered qtask_unordered index_fasta all: $(PRGS) -flags: +flags: flags_demo.c $(CC) $(CFLAGS) -I $(HTS_DIR) flags_demo.c -o $@ $(LDFLAGS) -split: +split: split.c $(CC) $(CFLAGS) -I $(HTS_DIR) split.c -o $@ $(LDFLAGS) -split2: +split2: split2.c $(CC) $(CFLAGS) -I $(HTS_DIR) split2.c -o $@ $(LDFLAGS) -cram: +cram: cram.c $(CC) $(CFLAGS) -I $(HTS_DIR) cram.c -o $@ $(LDFLAGS) -read_fast: +read_fast: read_fast.c $(CC) $(CFLAGS) -I $(HTS_DIR) read_fast.c -o $@ $(LDFLAGS) -read_header: +read_header: read_header.c $(CC) $(CFLAGS) -I $(HTS_DIR) read_header.c -o $@ $(LDFLAGS) -read_ref: +read_ref: read_refname.c $(CC) $(CFLAGS) -I $(HTS_DIR) read_refname.c -o $@ $(LDFLAGS) -read_bam: +read_bam: read_bam.c $(CC) $(CFLAGS) -I $(HTS_DIR) read_bam.c -o $@ $(LDFLAGS) -read_aux: +read_aux: read_aux.c $(CC) $(CFLAGS) -I $(HTS_DIR) read_aux.c -o $@ $(LDFLAGS) -dump_aux: +dump_aux: dump_aux.c $(CC) $(CFLAGS) -I $(HTS_DIR) dump_aux.c -o $@ $(LDFLAGS) -add_header: +add_header: add_header.c $(CC) $(CFLAGS) -I $(HTS_DIR) add_header.c -o $@ $(LDFLAGS) -rem_header: +rem_header: rem_header.c $(CC) $(CFLAGS) -I 
$(HTS_DIR) rem_header.c -o $@ $(LDFLAGS) -update_header: +update_header: update_header.c $(CC) $(CFLAGS) -I $(HTS_DIR) update_header.c -o $@ $(LDFLAGS) -mod_bam: +mod_bam: mod_bam.c $(CC) $(CFLAGS) -I $(HTS_DIR) mod_bam.c -o $@ $(LDFLAGS) -mod_aux: +mod_aux: mod_aux.c $(CC) $(CFLAGS) -I $(HTS_DIR) mod_aux.c -o $@ $(LDFLAGS) -mod_aux_ba: +mod_aux_ba: mod_aux_ba.c $(CC) $(CFLAGS) -I $(HTS_DIR) mod_aux_ba.c -o $@ $(LDFLAGS) -write_fast: +write_fast: write_fast.c $(CC) $(CFLAGS) -I $(HTS_DIR) write_fast.c -o $@ $(LDFLAGS) -idx_on_write: +idx_on_write: index_write.c $(CC) $(CFLAGS) -I $(HTS_DIR) index_write.c -o $@ $(LDFLAGS) -read_reg: +read_reg: index_reg_read.c $(CC) $(CFLAGS) -I $(HTS_DIR) index_reg_read.c -o $@ $(LDFLAGS) -read_multireg: +read_multireg: index_multireg_read.c $(CC) $(CFLAGS) -I $(HTS_DIR) index_multireg_read.c -o $@ $(LDFLAGS) -pileup: +read_fast_i: read_fast_index.c + $(CC) $(CFLAGS) -I $(HTS_DIR) read_fast_index.c -o $@ $(LDFLAGS) + +pileup: pileup.c $(CC) $(CFLAGS) -I $(HTS_DIR) pileup.c -o $@ $(LDFLAGS) -mpileup: +mpileup: mpileup.c $(CC) $(CFLAGS) -I $(HTS_DIR) mpileup.c -o $@ $(LDFLAGS) -modstate: +modstate: modstate.c $(CC) $(CFLAGS) -I $(HTS_DIR) modstate.c -o $@ $(LDFLAGS) -pileup_mod: +pileup_mod: pileup_mod.c $(CC) $(CFLAGS) -I $(HTS_DIR) pileup_mod.c -o $@ $(LDFLAGS) -flags_field: +flags_field: flags_htsopt_field.c $(CC) $(CFLAGS) -I $(HTS_DIR) flags_htsopt_field.c -o $@ $(LDFLAGS) -split_t1: +split_t1: split_thread1.c $(CC) $(CFLAGS) -I $(HTS_DIR) split_thread1.c -o $@ $(LDFLAGS) -split_t2: +split_t2: split_thread2.c $(CC) $(CFLAGS) -I $(HTS_DIR) split_thread2.c -o $@ $(LDFLAGS) +index_fasta: index_fasta.c + $(CC) $(CFLAGS) -I $(HTS_DIR) index_fasta.c -o $@ $(LDFLAGS) + +qtask_ordered: qtask_ordered.c + $(CC) $(CFLAGS) -I $(HTS_DIR) qtask_ordered.c -o $@ $(LDFLAGS) + +qtask_unordered: qtask_unordered.c + $(CC) $(CFLAGS) -I $(HTS_DIR) qtask_unordered.c -o $@ $(LDFLAGS) + clean: find . -name "*.o" | xargs rm -rf find . 
-name "*.dSYM" | xargs rm -rf - rm $(PRGS) - - + -rm -f $(PRGS) diff --git a/samples/README.md b/samples/README.md index ab5481dea..6f90c0c3f 100644 --- a/samples/README.md +++ b/samples/README.md @@ -4,7 +4,7 @@ data, and is the core library used by [samtools][2] and [bcftools][3]. A set of sample programs are available which showcases the usage of APIs in HTSlib. They are based on version 1.17 of HTSLib and are mainly for demonstration of API usage. -Further optimization and error handling might be required for actual usage. +Further optimisation and error handling might be required for actual usage. [1]: http://samtools.github.io/hts-specs/ @@ -61,7 +61,7 @@ indexed. [Read_fast][Read_fast] - This application showcases the fasta/fastq data read. + This application showcases fasta/fastq data read without using index. [Read_header][Read_header] @@ -72,7 +72,7 @@ indexed. [Read_ref][Read_ref] This application showcases the read and access of header data. It shows - all reference names which has length equal or greather to given input. + all reference names which have length greater than or equal to given input. [Read_bam][Read_bam] @@ -129,14 +129,18 @@ indexed. [Write_fast][Write_fast] - This application showcases the fasta/fastq data write. It appends a dummy - data to given file. + This application showcases the fasta/fastq data write. It appends data on + given file. [Index_write][Index_write] This application showcases the creation of index along with output creation. Based on file type and shift, it creates bai, csi or crai files. +[Index_fasta][Index_fasta] + + This application showcases index creation on fasta/fastq reference data. + [Read_reg][Read_reg]: This application showcases the usage of region specification in alignment @@ -144,9 +148,14 @@ indexed. [Read_multireg][Read_multireg]: - This application showcases the usage of mulitple region specification in + This application showcases the usage of multiple region specification in alignment read. 
+[Read_fast_index][Read_fast_index] + + This application showcases the fasta/fastq data read using index. It takes a + region (reference name[:start-end]) and gets data from that region. + [Pileup][Pileup]: This application showcases the pileup api, where all alignments covering a @@ -181,8 +190,7 @@ indexed. This application showcases the use of threads in file handling. It saves the read1 and read2 as separate files in given directory, one as sam and - other as bam. 2 threads are used for read and 1 each dedicated for each - output file. + other as bam. 1 thread is used for read, 1 for sam write and 2 for bam write. [Split_thread2][Split_thread2] @@ -191,6 +199,19 @@ indexed. and other as bam. A pool of 4 threads is created and shared for both read and write. +[Qtask_ordered][Qtask_ordered] + + This application showcases the use of queues and threads for custom + processing. Alignments in input file are updated with their GC ratio on a + custom aux tag. The processing may occur in any order but the results are + retrieved in same order as it was queued and saved to disk. + +[Qtask_unordered][Qtask_unordered] + + This application showcases the use of queues and threads for custom + processing. The count of bases and GC ratio are calculated and displayed. + The order of counting is irrelevant and hence ordered retrieval is not used. + ### More Information More detailed documentation is available in the [DEMO.md][DEMO] with worked @@ -215,8 +236,10 @@ examples per demonstration tool. [Mod_aux_ba]: mod_aux_ba.c [Write_fast]: write_fast.c [Index_write]: index_write.c +[Index_fasta]: index_fasta.c [Read_reg]: index_reg_read.c [Read_multireg]: index_multireg_read.c +[Read_fast_index]: read_fast_index.c [Pileup]: pileup.c [Mpileup]: mpileup.c [Modstate]: modstate.c @@ -224,4 +247,6 @@ examples per demonstration tool. 
[Flags_field]: flags_htsopt_field.c [Split_thread1]: split_thread1.c [Split_thread2]: split_thread2.c +[Qtask_ordered]: qtask_ordered.c +[Qtask_unordered]: qtask_unordered.c [DEMO]: DEMO.md diff --git a/samples/add_header.c b/samples/add_header.c index d1a2fc13c..066b1d438 100644 --- a/samples/add_header.c +++ b/samples/add_header.c @@ -24,20 +24,20 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) { fprintf(fp, "Usage: add_header infile\n\ -Adds new header lines of SQ, RG, PG and CO typs\n"); +Adds new header lines of SQ, RG, PG and CO types\n"); return; } diff --git a/samples/cram.c b/samples/cram.c index 5f55e65d2..7b1342377 100644 --- a/samples/cram.c +++ b/samples/cram.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/dump_aux.c b/samples/dump_aux.c index 49251fe04..3caa16027 100644 --- a/samples/dump_aux.c +++ b/samples/dump_aux.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this 
code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) @@ -92,7 +92,7 @@ int printauxdata(FILE *fp, char type, int32_t idx, const uint8_t *data) fprintf(fp, "%c", auxBType); for (i = 0; i < auxBcnt; ++i) { //iterate the array fprintf(fp, ","); - //calling recurssively with index to reuse a few lines + //calling recursively with index to reuse a few lines if (printauxdata(fp, auxBType, i, data) == EXIT_FAILURE) { return EXIT_FAILURE; } diff --git a/samples/flags_demo.c b/samples/flags_demo.c index e03fc6cd8..ac26be86c 100644 --- a/samples/flags_demo.c +++ b/samples/flags_demo.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - show flags_demo usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - show usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/flags_htsopt_field.c b/samples/flags_htsopt_field.c index 4b64445e3..40a0affc4 100644 --- a/samples/flags_htsopt_field.c +++ b/samples/flags_htsopt_field.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to 
demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - show flags_demo usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - show usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/index_fasta.c b/samples/index_fasta.c new file mode 100644 index 000000000..ba0489094 --- /dev/null +++ b/samples/index_fasta.c @@ -0,0 +1,72 @@ +/* index_fasta.c -- showcases the htslib api usage + + Copyright (C) 2024 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ + +#include +#include +#include +#include +#include + +/// print_usage - show usage +/** @param fp pointer to the file / terminal to which usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: index_fasta \n\ +Indexes a fasta/fastq file and saves along with source.\n"); + return; +} + +/// main - indexes fasta/fastq file +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *filename = NULL; //file name + int ret = EXIT_FAILURE; + + if (argc != 2) { + print_usage(stdout); + goto end; + } + filename = argv[1]; + + // index the file + if (fai_build3(filename, NULL, NULL) == -1) { + printf("Indexing failed with %d\n", errno); + goto end; + } + //this creates an .fai file. 
If the file is bgzipped, a .gzi file will be created along with .fai + ret = EXIT_SUCCESS; +end: + //clean up + return ret; +} diff --git a/samples/index_multireg_read.c b/samples/index_multireg_read.c index dbe8f15f9..7bb864990 100644 --- a/samples/index_multireg_read.c +++ b/samples/index_multireg_read.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - print the print_usage -/** @param fp pointer to the file / terminal to which print_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/index_reg_read.c b/samples/index_reg_read.c index 346d5428f..dec684933 100644 --- a/samples/index_reg_read.c +++ b/samples/index_reg_read.c @@ -24,19 +24,19 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - print the print_usage -/** @param fp pointer to the file / terminal to which print_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) { - fprintf(fp, "Usage: readreg infile idxfile region\n\ + fprintf(fp, "Usage: read_reg infile idxfile region\n\ Reads alignments matching to a specific region\n\ \\. 
from start of file\n\ \\* only unmapped reads\n\ diff --git a/samples/index_write.c b/samples/index_write.c index 8fd2bc968..9ec63d4ad 100644 --- a/samples/index_write.c +++ b/samples/index_write.c @@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/mod_aux.c b/samples/mod_aux.c index d5ed18cde..ae531b985 100644 --- a/samples/mod_aux.c +++ b/samples/mod_aux.c @@ -24,14 +24,15 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include +#include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/mod_aux_ba.c b/samples/mod_aux_ba.c index 8ef90ee1e..836a3d39c 100644 --- a/samples/mod_aux_ba.c +++ b/samples/mod_aux_ba.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - print the 
demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/mod_bam.c b/samples/mod_bam.c index 9f1eb324e..616639610 100644 --- a/samples/mod_bam.c +++ b/samples/mod_bam.c @@ -24,14 +24,15 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include +#include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/modstate.c b/samples/modstate.c index 976391684..4d5f67635 100644 --- a/samples/modstate.c +++ b/samples/modstate.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/mpileup.c b/samples/mpileup.c index fe933748e..ecab70584 100644 --- a/samples/mpileup.c +++ b/samples/mpileup.c @@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ 
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include #include -/// print_usage - show flags_demo usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - show usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/pileup.c b/samples/pileup.c index 11e2fb02f..be7aad801 100644 --- a/samples/pileup.c +++ b/samples/pileup.c @@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include #include -/// print_usage - show flags_demo usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - show usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/pileup_mod.c b/samples/pileup_mod.c index 24d6cf539..81ac5a540 100644 --- a/samples/pileup_mod.c +++ b/samples/pileup_mod.c @@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include #include -/// print_usage - show flags_demo usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - show usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/qtask_ordered.c b/samples/qtask_ordered.c new file mode 100644 index 000000000..a76d59826 --- 
/dev/null +++ b/samples/qtask_ordered.c @@ -0,0 +1,425 @@ +/* qtask_ordered.c -- showcases the htslib api usage + + Copyright (C) 2024 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ + +#include +#include +#include +#include +#include +#include + +typedef struct data { + int count; //used up size + int maxsize; //max size per data chunk + bam1_t **bamarray; //bam1_t array for optimal queueing + struct data *next; //pointer to next one - to reuse earlier allocations +} data; + +typedef struct datacache +{ + pthread_mutex_t lock; //synchronizes the access to cache + data *list; //data storage +} datacache; + +typedef struct orderedwrite { + samFile *outfile; //output file handle + sam_hdr_t *samhdr; //header used to write data + hts_tpool_process *queue; //queue from which results to be retrieved + datacache *cache; //to 
re-use allocated storage + int result; //result code returned by writer thread +} orderedwrite; + +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: qtask_ordered infile threadcount outdir [chunksize]\n\ +Calculates GC ratio - sum(G,C) / sum(A,T,C,G) - and adds to each alignment\n\ +as xr:f aux tag. Output is saved in outdir.\n\ +chunksize [4096] sets the number of alignments clubbed together to process.\n"); + return; +} + +/// getbamstorage - allocates storage for alignments to queue +/** @param chunk number of bam data to allocate + * @param bamcache cached storage +returns already allocated data storage if one is available, otherwise allocates new +*/ +data* getbamstorage(int chunk, datacache *bamcache) +{ + int i = 0; + data *bamdata = NULL; + + if (!bamcache) { + return NULL; + } + //get from cache if there is an already allocated storage + if (pthread_mutex_lock(&bamcache->lock)) { + return NULL; + } + if (bamcache->list) { //available + bamdata = bamcache->list; + bamcache->list = bamdata->next; //remove and set next one as available + bamdata->next = NULL; //remove link + bamdata->count = 0; + goto end; + } + //allocate and use + if (!(bamdata = malloc(sizeof(data)))) { + goto end; + } + bamdata->bamarray = malloc(chunk * sizeof(bam1_t*)); + if (!bamdata->bamarray) { + free(bamdata); + bamdata = NULL; + goto end; + } + for (i = 0; i < chunk; ++i) { + bamdata->bamarray[i] = bam_init1(); + } + bamdata->maxsize = chunk; + bamdata->count = 0; + bamdata->next = NULL; + +end: + pthread_mutex_unlock(&bamcache->lock); + return bamdata; +} + +/// cleanup_bamstorage - frees a bamdata struct plus contents +/** @param arg Pointer to data to free + @p arg has type void * so it can be used as a callback passed + to hts_tpool_dispatch3(). 
+ */ +void cleanup_bamstorage(void *arg) +{ + data *bamdata = (data *) arg; + if (!bamdata) + return; + if (bamdata->bamarray) { + int i; + for (i = 0; i < bamdata->maxsize; i++) { + bam_destroy1(bamdata->bamarray[i]); + } + free(bamdata->bamarray); + } + free(bamdata); +} + +/// thread_ordered_proc - does the processing of task in queue and queues the output back +/** @param args pointer to set of data to be processed +returns the processed data +the processing could be in any order based on the number of threads in use but read of output +from queue will be in order +a null data indicates the end of input and a null is returned to be added back to result queue +*/ +void *thread_ordered_proc(void *args) +{ + int i = 0, pos = 0; + data *bamdata = (data*)args; + float gcratio = 0; + uint8_t *data = NULL; + + if (bamdata == NULL) + return NULL; // Indicates no more input + + for ( i = 0; i < bamdata->count; ++i) { + //add count + uint64_t count[16] = {0}; + data = bam_get_seq(bamdata->bamarray[i]); + for (pos = 0; pos < bamdata->bamarray[i]->core.l_qseq; ++pos) { + count[bam_seqi(data,pos)]++; + } + /*it is faster to count all and use offset to get required counts rather than select + required ones inside the loop*/ + gcratio = (count[2] /*C*/ + count[4] /*G*/) / (float) (count[1] /*A*/ + count[8] /*T*/ + count[2] + count[4]); + + if (bam_aux_append(bamdata->bamarray[i], "xr", 'f', sizeof(gcratio), (const uint8_t*)&gcratio) < 0) { + fprintf(stderr, "Failed to add aux tag xr, errno: %d\n", errno); + break; + } + } + return bamdata; +} + +/// threadfn_orderedwrite - thread that reads the output from queue and writes +/** @param args pointer to data specific for the thread +returns NULL +*/ +void *threadfn_orderedwrite(void *args) +{ + orderedwrite *tdata = (orderedwrite*)args; + hts_tpool_result *r = NULL; + data *bamdata = NULL; + int i = 0; + + tdata->result = 0; + + //get result and write; wait if no result is in queue - until shutdown of queue + while (tdata->result
== 0 && + (r = hts_tpool_next_result_wait(tdata->queue)) != NULL) { + bamdata = (data*) hts_tpool_result_data(r); + + if (bamdata == NULL) { + // Indicator for no more input. Time to stop. + hts_tpool_delete_result(r, 0); + break; + } + + for (i = 0; i < bamdata->count; ++i) { + if (sam_write1(tdata->outfile, tdata->samhdr, bamdata->bamarray[i]) < 0) { + fprintf(stderr, "Failed to write output data\n"); + tdata->result = -1; + break; + } + } + hts_tpool_delete_result(r, 0); //release the result memory + + pthread_mutex_lock(&tdata->cache->lock); + bamdata->next = tdata->cache->list; //make current list as next + tdata->cache->list = bamdata; //set as current to reuse + pthread_mutex_unlock(&tdata->cache->lock); + } + + // Shut down the process queue. If we stopped early due to a write failure, + // this will signal to the other end that something has gone wrong. + hts_tpool_process_shutdown(tdata->queue); + + return NULL; +} + +/// main - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *outdir = NULL; + char *file = NULL; + int c = 0, ret = EXIT_FAILURE, cnt = 0, started_thread = 0, chunk = 0; + size_t size = 0; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + pthread_t thread; + orderedwrite twritedata = {0}; + hts_tpool *pool = NULL; + hts_tpool_process *queue = NULL; + htsThreadPool tpool = {NULL, 0}; + data *bamdata = NULL; + datacache bamcache = {PTHREAD_MUTEX_INITIALIZER, NULL}; + + //qtask infile threadcount outdir [chunksize] + if (argc != 4 && argc != 5) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + cnt = atoi(argv[2]); + outdir = argv[3]; + if (argc == 5) { //chunk size present + chunk = atoi(argv[4]); + } + if (cnt < 1) { //set proper thread count + cnt = 1; + } + if (chunk < 1) { //set valid chunk size + chunk = 4096; + } + + //allocate space for
output + size = (strlen(outdir) + sizeof("/out.bam") + 1); //space for output file name and null termination + if (!(file = malloc(size))) { + fprintf(stderr, "Failed to set output path\n"); + goto end; + } + snprintf(file, size, "%s/out.bam", outdir); //output file name + if (!(pool = hts_tpool_init(cnt))) { //thread pool + fprintf(stderr, "Failed to create thread pool\n"); + goto end; + } + tpool.pool = pool; //to share the pool for file read and write as well + //queue to use with thread pool, for task and results + if (!(queue = hts_tpool_process_init(pool, cnt * 2, 0))) { + fprintf(stderr, "Failed to create queue\n"); + goto end; + } + //open input file - r reading + if (!(infile = sam_open(inname, "r"))) { + fprintf(stderr, "Could not open %s\n", inname); + goto end; + } + //open output files - w write as SAM, wb write as BAM + if (!(outfile = sam_open(file, "wb"))) { + fprintf(stderr, "Could not open output file\n"); + goto end; + } + //share the thread pool with i/o files + if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0 || + hts_set_opt(outfile, HTS_OPT_THREAD_POOL, &tpool) < 0) { + fprintf(stderr, "Failed to set threads to i/o files\n"); + goto end; + } + //read header, required to resolve the target names to proper ids + if (!(in_samhdr = sam_hdr_read(infile))) { + fprintf(stderr, "Failed to read header from file!\n"); + goto end; + } + //write header + if ((sam_hdr_write(outfile, in_samhdr) == -1)) { + fprintf(stderr, "Failed to write header\n"); + goto end; + } + + /* tasks are queued, worker threads get them and process in parallel; + the results are queued and they are to be removed in parallel as well */ + + // start output writer thread for ordered processing + twritedata.outfile = outfile; + twritedata.samhdr = in_samhdr; + twritedata.result = 0; + twritedata.queue = queue; + twritedata.cache = &bamcache; + if (pthread_create(&thread, NULL, threadfn_orderedwrite, &twritedata)) { + fprintf(stderr, "Failed to create writer thread\n"); + 
goto end; + } + started_thread = 1; + + c = 0; + while (c >= 0) { + if (!(bamdata = getbamstorage(chunk, &bamcache))) { + fprintf(stderr, "Failed to allocate memory\n"); + break; + } + //read alignments, upto max size for this lot + for (cnt = 0; cnt < bamdata->maxsize; ++cnt) { + c = sam_read1(infile, in_samhdr, bamdata->bamarray[cnt]); + if (c < 0) { + break; // EOF or failure + } + } + if (c >= -1 ) { + //max size data or reached EOF + bamdata->count = cnt; + // Queue the data for processing. hts_tpool_dispatch3() is + // used here as it allows in-flight data to be cleaned up + // properly when stopping early due to errors. + if (hts_tpool_dispatch3(pool, queue, thread_ordered_proc, bamdata, + cleanup_bamstorage, cleanup_bamstorage, + 0) == -1) { + fprintf(stderr, "Failed to schedule processing\n"); + goto end; + } + bamdata = NULL; + } else { + fprintf(stderr, "Error in reading data\n"); + break; + } + } + + ret = EXIT_SUCCESS; + + end: + // Tidy up after having dispatched all of the data. + + // Note that the order here is important. In particular, we need + // to join the thread that was started earlier before freeing anything + // to avoid any use-after-free errors. + + // It's also possible to get here early due to various error conditions, + // so we need to carefully check which parts of the program state have + // been created before trying to clean them up. + + if (queue) { + if (-1 == c) { + // EOF read, send a marker to tell the threadfn_orderedwrite() + // function to shut down. + if (hts_tpool_dispatch(pool, queue, thread_ordered_proc, + NULL) == -1) { + fprintf(stderr, "Failed to schedule sentinel job\n"); + ret = EXIT_FAILURE; + } + } else { + // Error or we never wrote anything. Shut down the queue to + // ensure threadfn_orderedwrite() wakes up and terminates. + hts_tpool_process_shutdown(queue); + } + } + + // Wait for threadfn_orderedwrite to finish. 
+ if (started_thread) { + pthread_join(thread, NULL); + + // Once the writer thread has finished, check the result it sent back + if (twritedata.result != 0) { + ret = EXIT_FAILURE; + } + } + + if (queue) { + // Once threadfn_orderedwrite has stopped, the queue can be + // cleaned up. + hts_tpool_process_destroy(queue); + } + + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + if (sam_close(infile) != 0) { + ret = EXIT_FAILURE; + } + } + if (outfile) { + if (sam_close(outfile) != 0) { + ret = EXIT_FAILURE; + } + } + + pthread_mutex_lock(&bamcache.lock); + if (bamcache.list) { + struct data *tmp = NULL; + while (bamcache.list) { + tmp = bamcache.list; + bamcache.list = bamcache.list->next; + cleanup_bamstorage(tmp); + } + } + pthread_mutex_unlock(&bamcache.lock); + + if (file) { + free(file); + } + if (pool) { + hts_tpool_destroy(pool); + } + return ret; +} diff --git a/samples/qtask_unordered.c b/samples/qtask_unordered.c new file mode 100644 index 000000000..05fe50346 --- /dev/null +++ b/samples/qtask_unordered.c @@ -0,0 +1,320 @@ +/* qtask_unordered.c -- showcases the htslib api usage + + Copyright (C) 2024 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ + +#include +#include +#include +#include +#include +#include + +struct datacache; + +typedef struct basecount { + uint64_t counts[16]; //count of all bases +} basecount; + +typedef struct data { + int count; //used up size + int maxsize; //max size per data chunk + bam1_t **bamarray; //bam1_t array for optimal queueing + + struct datacache *cache; + basecount *bases; //count of all possible bases + struct data *next; //pointer to next one - to reuse earlier allocations +} data; + +typedef struct datacache +{ + pthread_mutex_t lock; //synchronizes the access to cache + data *list; //data storage +} datacache; + +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: qtask_unordered infile threadcount [chunksize]\n\ +Shows the base counts and calculates GC ratio - sum(G,C) / sum(A,T,C,G)\n\ +chunksize [4096] sets the number of alignments clubbed together to process.\n"); + return; +} + +/// getbamstorage - allocates storage for alignments to queue +/** @param chunk number of bam data to allocate + * @param bases storage of result + * @param bamcache cached storage +returns already allocated data storage if one is available, otherwise allocates new +*/ +data* getbamstorage(int chunk, basecount *bases, datacache *bamcache) +{ + int i = 0; + data *bamdata = NULL; + + if (!bamcache || !bases) { + return NULL; + } + //get from cache if there is an already allocated storage + if (pthread_mutex_lock(&bamcache->lock)) { + return NULL; + } + if (bamcache->list) { 
//available + bamdata = bamcache->list; + bamcache->list = bamdata->next; //remove and set next one as available + bamdata->next = NULL; //remove link + bamdata->count = 0; + + bamdata->bases = bases; + bamdata->cache = bamcache; + goto end; + } + //allocate and use + if (!(bamdata = malloc(sizeof(data)))) { + goto end; + } + bamdata->bamarray = malloc(chunk * sizeof(bam1_t*)); + if (!bamdata->bamarray) { + free(bamdata); + bamdata = NULL; + goto end; + } + for (i = 0; i < chunk; ++i) { + bamdata->bamarray[i] = bam_init1(); + } + bamdata->maxsize = chunk; + bamdata->count = 0; + bamdata->next = NULL; + + bamdata->bases = bases; + bamdata->cache = bamcache; + +end: + pthread_mutex_unlock(&bamcache->lock); + return bamdata; +} + +/// cleanup_bamstorage - frees a bamdata struct plus contents +/** @param arg Pointer to data to free + @p arg has type void * so it can be used as a callback passed + to hts_tpool_dispatch3(). + */ +void cleanup_bamstorage(void *arg) +{ + data *bamdata = (data *) arg; + if (!bamdata) + return; + if (bamdata->bamarray) { + int i; + for (i = 0; i < bamdata->maxsize; i++) { + bam_destroy1(bamdata->bamarray[i]); + } + free(bamdata->bamarray); + } + free(bamdata); +} + +/// thread_unordered_proc - does the processing of task in queue and updates result +/** @param args pointer to set of data to be processed +returns NULL +the processing could be in any order based on the number of threads in use +*/ +void *thread_unordered_proc(void *args) +{ + int i = 0; + data *bamdata = (data*)args; + uint64_t pos = 0; + uint8_t *data = NULL; + uint64_t counts[16] = {0}; + for ( i = 0; i < bamdata->count; ++i) { + data = bam_get_seq(bamdata->bamarray[i]); + for (pos = 0; pos < bamdata->bamarray[i]->core.l_qseq; ++pos) { + /* it is faster to count all bases and select required ones later + compared to select and count here */ + counts[bam_seqi(data, pos)]++; + } + } + //update result and add the memory block for reuse + 
pthread_mutex_lock(&bamdata->cache->lock); + for (i = 0; i < 16; i++) { + bamdata->bases->counts[i] += counts[i]; + } + + bamdata->next = bamdata->cache->list; + bamdata->cache->list = bamdata; + pthread_mutex_unlock(&bamdata->cache->lock); + + return NULL; +} + +/// main - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL; + int c = 0, ret = EXIT_FAILURE, cnt = 0, chunk = 0; + samFile *infile = NULL; + sam_hdr_t *in_samhdr = NULL; + hts_tpool *pool = NULL; + hts_tpool_process *queue = NULL; + htsThreadPool tpool = {NULL, 0}; + data *bamdata = NULL; + basecount gccount = {{0}}; + datacache bamcache = {PTHREAD_MUTEX_INITIALIZER, NULL}; + + //qtask infile threadcount [chunksize] + if (argc != 3 && argc != 4) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + cnt = atoi(argv[2]); + if (argc == 4) { + chunk = atoi(argv[3]); + } + if (cnt < 1) { + cnt = 1; + } + if (chunk < 1) { + chunk = 4096; + } + + if (!(pool = hts_tpool_init(cnt))) { + fprintf(stderr, "Failed to create thread pool\n"); + goto end; + } + tpool.pool = pool; //to share the pool for file read and write as well + //queue to use with thread pool, for tasks + if (!(queue = hts_tpool_process_init(pool, cnt * 2, 1))) { + fprintf(stderr, "Failed to create queue\n"); + goto end; + } + //open input file - r reading + if (!(infile = sam_open(inname, "r"))) { + fprintf(stderr, "Could not open %s\n", inname); + goto end; + } + //share the thread pool with i/o files + if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0) { + fprintf(stderr, "Failed to set threads to i/o files\n"); + goto end; + } + //read header, required to resolve the target names to proper ids + if (!(in_samhdr = sam_hdr_read(infile))) { + fprintf(stderr, "Failed to read header from file!\n"); + goto end; + } + + /*tasks are queued, worker threads get them and process in 
parallel; + all bases are counted instead of counting atcg alone as it is faster*/ + + c = 0; + while (c >= 0) { + //use cached storage to avoid allocate/deallocate overheads + if (!(bamdata = getbamstorage(chunk, &gccount, &bamcache))) { + fprintf(stderr, "Failed to allocate memory\n"); + break; + } + //read alignments, upto max size for this lot + for (cnt = 0; cnt < bamdata->maxsize; ++cnt) { + c = sam_read1(infile, in_samhdr, bamdata->bamarray[cnt]); + if (c < 0) { + break; // EOF or failure + } + } + if (c >= -1 ) { + //max size data or reached EOF + bamdata->count = cnt; + // Queue the data for processing. hts_tpool_dispatch3() is + // used here as it allows in-flight data to be cleaned up + // properly when stopping early due to errors. + if (hts_tpool_dispatch3(pool, queue, thread_unordered_proc, bamdata, + cleanup_bamstorage, cleanup_bamstorage, + 0) == -1) { + fprintf(stderr, "Failed to schedule processing\n"); + goto end; + } + bamdata = NULL; + } else { + fprintf(stderr, "Error in reading data\n"); + break; + } + } + + if (-1 == c) { + // EOF read, ensure all are processed, waits for all to finish + if (hts_tpool_process_flush(queue) == -1) { + fprintf(stderr, "Failed to flush queue\n"); + } else { //all done + //refer seq_nt16_str to find position of required bases + fprintf(stdout, "GCratio: %f\nBase counts:\n", + (gccount.counts[2] /*C*/ + gccount.counts[4] /*G*/) / (float) + (gccount.counts[1] /*A*/ + gccount.counts[8] /*T*/ + + gccount.counts[2] + gccount.counts[4])); + + for (cnt = 0; cnt < 16; ++cnt) { + fprintf(stdout, "%c: %"PRIu64"\n", seq_nt16_str[cnt], gccount.counts[cnt]); + } + + ret = EXIT_SUCCESS; + } + } + end: + if (queue) { + hts_tpool_process_destroy(queue); + } + + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + if (sam_close(infile) != 0) { + ret = EXIT_FAILURE; + } + } + + pthread_mutex_lock(&bamcache.lock); + if (bamcache.list) { + struct data *tmp = NULL; + while (bamcache.list) { + tmp = bamcache.list; + 
bamcache.list = bamcache.list->next; + cleanup_bamstorage(tmp); + } + } + pthread_mutex_unlock(&bamcache.lock); + + if (pool) { + hts_tpool_destroy(pool); + } + return ret; +} diff --git a/samples/read_aux.c b/samples/read_aux.c index cbf972b98..efd6f3651 100644 --- a/samples/read_aux.c +++ b/samples/read_aux.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) @@ -92,7 +92,7 @@ int printauxdata(FILE *fp, char type, int32_t idx, const uint8_t *data) fprintf(fp, "%c", auxBType); for (i = 0; i < auxBcnt; ++i) { //iterate the array fprintf(fp, ","); - //calling recurssively with index to reuse a few lines + //calling recursively with index to reuse a few lines if (printauxdata(fp, auxBType, i, data) == EXIT_FAILURE) { return EXIT_FAILURE; } @@ -166,7 +166,7 @@ int main(int argc, char *argv[]) else { //option 2 - get raw data if (!(data = bam_aux_get(bamdata, tag))) { - //tag data not returned, errono gives the reason + //tag data not returned, errno gives the reason if (errno == ENOENT) { printf("Tag not present\n"); } diff --git a/samples/read_bam.c b/samples/read_bam.c index 7fca8c55d..30bedf81c 100644 --- a/samples/read_bam.c +++ b/samples/read_bam.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include 
#include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/read_fast.c b/samples/read_fast.c index f74b25515..10f807b69 100644 --- a/samples/read_fast.c +++ b/samples/read_fast.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - show flags_demo usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - show usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) @@ -83,6 +83,8 @@ int main(int argc, char *argv[]) //read data while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + printf("\nname: "); + printf("%s", bam_get_qname(bamdata)); printf("\nsequence: "); for (c = 0; c < bamdata->core.l_qseq; ++c) { printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(bamdata), c)]); @@ -90,10 +92,11 @@ int main(int argc, char *argv[]) if (infile->format.format == fastq_format) { printf("\nquality: "); for (c = 0; c < bamdata->core.l_qseq; ++c) { - printf("%c", bam_get_qual(bamdata)[c]); + printf("%c", bam_get_qual(bamdata)[c] + 33); } } } + printf("\n"); if (c != -1) { //error printf("Failed to get data\n"); diff --git a/samples/read_fast_index.c b/samples/read_fast_index.c new file mode 100644 index 000000000..97076630a --- /dev/null +++ b/samples/read_fast_index.c @@ -0,0 +1,163 @@ +/* read_fast_index.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ + +#include +#include +#include +#include + +/// print_usage - show usage +/** @param fp pointer to the file / terminal to which usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_fast_i A/Q 0/1 regiondef\n\ +Reads the fasta/fastq file using index and shows the content.\n\ +For fasta files use A and Q for fastq files.\n\ +Region can be 1 or more of [:start-end] entries separated by comma.\n\ +For single region, give regcount as 0 and non 0 for multi-regions.\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *region = NULL, *data = NULL, *remaining = 
NULL; + int ret = EXIT_FAILURE, tid = -1, usemulti = 0; + faidx_t *idx = NULL; + enum fai_format_options fmt = FAI_FASTA; + hts_pos_t len = 0, beg = 0, end = 0; + + //read_fast_i infile A/Q regcount region + if (argc != 5) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + if (argv[2][0] == 'Q') { + fmt = FAI_FASTQ; + } + usemulti = atoi(argv[3]); + region = argv[4]; + + //load index + if (!(idx = fai_load3_format(inname, NULL, NULL, FAI_CREATE, fmt))) { + printf("Failed to load index\n"); + goto end; + } + + if (!usemulti) { + //get data from given region + if (!(data = fai_fetch64(idx, region, &len))) { + if (-1 == len) { + printf("Failed to get data\n"); //failure + goto end; + } + else { + printf("Data not found for given region\n"); //no data + } + } + else { + printf("Data: %"PRId64" %s\n", len, data); + free((void*)data); + //get quality for fastq type + if (fmt == FAI_FASTQ) { + if (!(data = fai_fetchqual64(idx, region, &len))) { + if (len == -1) { + printf("Failed to get data\n"); + goto end; + } + else { + printf("Data not found for given region\n"); + } + } + else { + printf("Qual: %"PRId64" %s\n", len, data); + free((void*)data); + } + } + } + } + else { + //parse, get each region and get data for each + while ((remaining = fai_parse_region(idx, region, &tid, &beg, &end, HTS_PARSE_LIST))) { //here expects regions as csv + //parsed the region, correct end points based on actual data + if (fai_adjust_region(idx, tid, &beg, &end) == -1) { + printf("Error in adjusting region for tid %d\n", tid); + goto end; + } + //get data for given region + if (!(data = faidx_fetch_seq64(idx, faidx_iseq(idx, tid), beg, end, &len))) { + if (len == -1) { + printf("Failed to get data\n"); //failure + goto end; + } + else { + printf("No data found for given region\n"); //no data + } + } + else { + printf("Data: %"PRIhts_pos" %s\n", len, data); + free((void*)data); + data = NULL; + + //get quality data for fastq + if (fmt == FAI_FASTQ) { + if (!(data = 
faidx_fetch_qual64(idx, faidx_iseq(idx, tid), beg, end, &len))) { + if (len == -1) { + printf("Failed to get qual data\n"); + goto end; + } + else { + printf("No data found for given region\n"); + } + } + else { + printf("Qual: %"PRIhts_pos" %s\n", len, data); + free((void*)data); + data = NULL; + } + } + } + region = remaining; //parse remaining region defs + } + } + + ret = EXIT_SUCCESS; +end: + //clean up + if (idx) { + fai_destroy(idx); + } + return ret; +} diff --git a/samples/read_header.c b/samples/read_header.c index eb14daea5..54b07e736 100644 --- a/samples/read_header.c +++ b/samples/read_header.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which susage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/read_refname.c b/samples/read_refname.c index adbc71183..9b4918ded 100644 --- a/samples/read_refname.c +++ b/samples/read_refname.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/rem_header.c b/samples/rem_header.c 
index a0b6510fb..852d5f055 100644 --- a/samples/rem_header.c +++ b/samples/rem_header.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) @@ -124,7 +124,7 @@ int main(int argc, char *argv[]) ret = EXIT_SUCCESS; //bam data write to follow.... end: - //cleanupq + //cleanup if (in_samhdr) { sam_hdr_destroy(in_samhdr); } diff --git a/samples/sample.bed b/samples/sample.bed new file mode 100644 index 000000000..2ae458fd5 --- /dev/null +++ b/samples/sample.bed @@ -0,0 +1,4 @@ +T1 1 2 +T1 30 35 +T2 10 15 +T2 30 40 diff --git a/samples/sample.ref.fq b/samples/sample.ref.fq new file mode 100644 index 000000000..18b2b9617 --- /dev/null +++ b/samples/sample.ref.fq @@ -0,0 +1,16 @@ +@T1 +AAAAACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTT ++ +AAAAACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTT +@T2 +TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGT ++ +TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGT +@T3 +TTTTGGGGACTGTTAACAGT ++ +TTTTGGGGACTGTTAACAGT +@T4 +TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTTGGGGACTGTTAACAGT ++ +TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTTGGGGACTGTTAACAGT diff --git a/samples/sample.sam b/samples/sample.sam index e56efd69f..58515c976 100644 --- a/samples/sample.sam +++ b/samples/sample.sam @@ -9,7 +9,7 @@ @CO 1234567890123456789012345678901234567890 @CO AAAAACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTT T1 @CO TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGT T2 -@CO ITR1-ITR2M, ITR2-ITR2M are proper 
pairs in T1 and T2, UNMP1 is partly mapped and pair is unmapped, UNMP2 & 3 are unmappped +@CO ITR1-ITR2M, ITR2-ITR2M are proper pairs in T1 and T2, UNMP1 is partly mapped and pair is unmapped, UNMP2 & 3 are unmapped @CO A1-A2, A4-A3 are proper pairs with A4-A3 in different read order. A5 is secondary alignment ITR1 99 T1 5 40 4M = 33 10 ACTG ()() ITR2 147 T2 23 49 2M = 35 -10 TT ** diff --git a/samples/split.c b/samples/split.c index 2eb9e6b79..c51dbd385 100644 --- a/samples/split.c +++ b/samples/split.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/split2.c b/samples/split2.c index 2354abfe3..33fabbd67 100644 --- a/samples/split2.c +++ b/samples/split2.c @@ -24,19 +24,19 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) { - fprintf(fp, "Usage: split infile outdir\n\ + fprintf(fp, "Usage: split2 infile outdir\n\ Splits the input file alignments to read1 and read2 and saves as 1.sam and 2.bam in given 
directory\n\ Shows file type selection through name and format api\n"); return; diff --git a/samples/split_thread1.c b/samples/split_thread1.c index 40d2dfdc2..551c7f093 100644 --- a/samples/split_thread1.c +++ b/samples/split_thread1.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) @@ -94,9 +94,9 @@ int main(int argc, char *argv[]) } //create file specific threads - if (hts_set_opt(infile, HTS_OPT_NTHREADS, 2) < 0 || //2 thread specific for reading + if (hts_set_opt(infile, HTS_OPT_NTHREADS, 1) < 0 || //1 thread specific for reading hts_set_opt(outfile1, HTS_OPT_NTHREADS, 1) < 0 || //1 thread specific for sam write - hts_set_opt(outfile2, HTS_OPT_NTHREADS, 1) < 0) { //1 thread specific for bam write + hts_set_opt(outfile2, HTS_OPT_NTHREADS, 2) < 0) { //2 thread specific for bam write printf("Failed to set thread options\n"); goto end; } diff --git a/samples/split_thread2.c b/samples/split_thread2.c index dab897b5f..dc8bc9f31 100644 --- a/samples/split_thread2.c +++ b/samples/split_thread2.c @@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - 
print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/update_header.c b/samples/update_header.c index f6b1680cd..237d5c4df 100644 --- a/samples/update_header.c +++ b/samples/update_header.c @@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include #include -/// print_usage - print the demo_usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) diff --git a/samples/write_fast.c b/samples/write_fast.c index ef7817683..95d919fd0 100644 --- a/samples/write_fast.c +++ b/samples/write_fast.c @@ -24,19 +24,21 @@ DEALINGS IN THE SOFTWARE */ -/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */ #include #include +#include #include +#include -/// print_usage - show flags_demo usage -/** @param fp pointer to the file / terminal to which demo_usage to be dumped +/// print_usage - show usage +/** @param fp pointer to the file / terminal to which usage to be dumped returns nothing */ static void print_usage(FILE *fp) { - fprintf(fp, "Usage: write_fast \n\ + fprintf(fp, "Usage: write_fast [ 4 || argc < 3) { print_usage(stdout); goto end; } outname = argv[1]; + data = argv[2]; + if (argc == 4) { //fastq data + qual = argv[3]; + if (strlen(data) != strlen(qual)) { //check for proper length of data and quality values + printf("Incorrect reference and quality data\n"); + goto end; + 
} + } //initialize if (!(bamdata = bam_init1())) { @@ -71,26 +82,30 @@ int main(int argc, char *argv[]) goto end; } //open output file - if (!(outfile = sam_open(outname, mode))) { + if (!(outfile = sam_open(outname, mode))) { //expects the name to have correct extension! printf("Could not open %s\n", outname); goto end; } - //dummy data - if (bam_set1(bamdata, sizeof("test"), "test", BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, 10, "AACTGACTGA", "1234567890", 0) < 0) { + /* if the file name extension is not appropriate to the content, inconsistent data will be present in output. + if required, htsFormat and sam_open_format can be explicitly used to ensure appropriateness of content. + htsFormat fmt = {sequence_data, fastq_format / fasta_format}; + sam_open_format(outname, mode, fmt); + */ + + snprintf(name, sizeof(name), "Test_%ld", (long) time(NULL)); + //data + if (bam_set1(bamdata, strlen(name), name, BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, strlen(data), data, qual, 0) < 0) { printf("Failed to set data\n"); goto end; } - if (sam_write1(outfile, out_samhdr, bamdata) < 0) { + //as we write only FASTA/FASTQ, we can get away without providing headers + if (sam_write1(outfile, NULL, bamdata) < 0) { printf("Failed to write data\n"); goto end; } - ret = EXIT_SUCCESS; end: //clean up - if (out_samhdr) { - sam_hdr_destroy(out_samhdr); - } if (outfile) { sam_close(outfile); } diff --git a/simd.c b/simd.c new file mode 100644 index 000000000..865dd887e --- /dev/null +++ b/simd.c @@ -0,0 +1,222 @@ +/* simd.c -- SIMD optimised versions of various internal functions. + + Copyright (C) 2024 Genome Research Ltd. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h +#include + +// These must be defined before the first system include to ensure that legacy +// BSD types needed by remain defined when _XOPEN_SOURCE is set. 
+#if defined __APPLE__ +#define _DARWIN_C_SOURCE +#elif defined __NetBSD__ +#define _NETBSD_SOURCE +#endif + +#include "htslib/sam.h" +#include "sam_internal.h" + +#if defined __x86_64__ +#include +#elif defined __ARM_NEON +#include +#endif + +#if defined __arm__ || defined __aarch64__ + +#if defined __linux__ || defined __FreeBSD__ +#include +#elif defined __APPLE__ +#include +#include +#elif defined __NetBSD__ +#include +#include +#include +#ifdef __aarch64__ +#include +#else +#include +#endif +#elif defined _WIN32 +#include +#endif + +static inline int cpu_supports_neon(void) { +#if defined __linux__ && defined __arm__ && defined HWCAP_NEON + return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0; +#elif defined __linux__ && defined __arm__ && defined HWCAP_ARM_NEON + return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0; +#elif defined __linux__ && defined __aarch64__ && defined HWCAP_ASIMD + return (getauxval(AT_HWCAP) & HWCAP_ASIMD) != 0; +#elif defined __APPLE__ && defined __aarch64__ + int32_t ctl; + size_t ctlsize = sizeof ctl; + if (sysctlbyname("hw.optional.AdvSIMD", &ctl, &ctlsize, NULL, 0) != 0) return 0; + if (ctlsize != sizeof ctl) return 0; + return ctl; +#elif defined __FreeBSD__ && defined __arm__ && defined HWCAP_NEON + unsigned long cap; + if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0; + return (cap & HWCAP_NEON) != 0; +#elif defined __FreeBSD__ && defined __aarch64__ && defined HWCAP_ASIMD + unsigned long cap; + if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0; + return (cap & HWCAP_ASIMD) != 0; +#elif defined __NetBSD__ && defined __arm__ && defined ARM_MVFR0_ASIMD_MASK + uint32_t buf[16]; + size_t buflen = sizeof buf; + if (sysctlbyname("machdep.id_mvfr", buf, &buflen, NULL, 0) != 0) return 0; + if (buflen < sizeof(uint32_t)) return 0; + return (buf[0] & ARM_MVFR0_ASIMD_MASK) == 0x00000002; +#elif defined __NetBSD__ && defined __aarch64__ && defined ID_AA64PFR0_EL1_ADVSIMD + struct aarch64_sysctl_cpu_id buf; + size_t buflen = 
sizeof buf; + if (sysctlbyname("machdep.cpu0.cpu_id", &buf, &buflen, NULL, 0) != 0) return 0; + if (buflen < offsetof(struct aarch64_sysctl_cpu_id, ac_aa64pfr0) + sizeof(uint64_t)) return 0; + return (buf.ac_aa64pfr0 & ID_AA64PFR0_EL1_ADVSIMD & 0x00e00000) == 0; +#elif defined _WIN32 + return IsProcessorFeaturePresent(PF_ARM_V8_INSTRUCTIONS_AVAILABLE) != 0; +#else + return 0; +#endif +} + +#endif + +#ifdef BUILDING_SIMD_NIBBLE2BASE + +void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default; + +#if defined __x86_64__ + +/* + * Convert a nibble encoded BAM sequence to a string of bases. + * + * Using SSSE3 instructions, 16 codepoints that hold 2 bases each can be + * unpacked into 32 indexes from 0-15. Using the pshufb instruction these can + * be converted to the IUPAC characters. + * It falls back on the nibble2base_default function for the remainder. + */ + +__attribute__((target("ssse3"))) +static void nibble2base_ssse3(uint8_t *nib, char *seq, int len) { + const char *seq_end_ptr = seq + len; + char *seq_cursor = seq; + uint8_t *nibble_cursor = nib; + const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i) - 1); + __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str); + /* Nucleotides are encoded 4-bits per nucleotide and stored in 8-bit bytes + as follows: |AB|CD|EF|GH|. The 4-bit codes (going from 0-15) can be used + together with the pshufb instruction as a lookup table. The most efficient + way is to use bitwise AND and shift to create two vectors. One with all + the upper codes (|A|C|E|G|) and one with the lower codes (|B|D|F|H|). + The lookup can then be performed and the resulting vectors can be + interleaved again using the unpack instructions. 
*/ + while (seq_cursor < seq_vec_end_ptr) { + __m128i encoded = _mm_lddqu_si128((__m128i *)nibble_cursor); + __m128i encoded_upper = _mm_srli_epi64(encoded, 4); + encoded_upper = _mm_and_si128(encoded_upper, _mm_set1_epi8(15)); + __m128i encoded_lower = _mm_and_si128(encoded, _mm_set1_epi8(15)); + __m128i nucs_upper = _mm_shuffle_epi8(nuc_lookup_vec, encoded_upper); + __m128i nucs_lower = _mm_shuffle_epi8(nuc_lookup_vec, encoded_lower); + __m128i first_nucleotides = _mm_unpacklo_epi8(nucs_upper, nucs_lower); + __m128i second_nucleotides = _mm_unpackhi_epi8(nucs_upper, nucs_lower); + _mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides); + _mm_storeu_si128((__m128i *)(seq_cursor + sizeof(__m128i)), + second_nucleotides); + nibble_cursor += sizeof(__m128i); + seq_cursor += 2 * sizeof(__m128i); + } + nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor); +} + +__attribute__((constructor)) +static void nibble2base_resolve(void) { + if (__builtin_cpu_supports("ssse3")) { + htslib_nibble2base = nibble2base_ssse3; + } +} + +#elif defined __ARM_NEON + +static void nibble2base_neon(uint8_t *nib, char *seq0, int len) { + uint8x16_t low_nibbles_mask = vdupq_n_u8(0x0f); + uint8x16_t nuc_lookup_vec = vld1q_u8((const uint8_t *) seq_nt16_str); +#ifndef __aarch64__ + uint8x8x2_t nuc_lookup_vec2 = {{ vget_low_u8(nuc_lookup_vec), vget_high_u8(nuc_lookup_vec) }}; +#endif + + uint8_t *seq = (uint8_t *) seq0; + int blocks; + + for (blocks = len / 32; blocks > 0; --blocks) { + uint8x16_t encoded = vld1q_u8(nib); + nib += 16; + + /* Translate the high and low nibbles to nucleotide letters separately, + then interleave them back together via vzipq for writing. 
*/ + + uint8x16_t high_nibbles = vshrq_n_u8(encoded, 4); + uint8x16_t low_nibbles = vandq_u8(encoded, low_nibbles_mask); + +#ifdef __aarch64__ + uint8x16_t high_nucleotides = vqtbl1q_u8(nuc_lookup_vec, high_nibbles); + uint8x16_t low_nucleotides = vqtbl1q_u8(nuc_lookup_vec, low_nibbles); +#else + uint8x8_t high_low = vtbl2_u8(nuc_lookup_vec2, vget_low_u8(high_nibbles)); + uint8x8_t high_high = vtbl2_u8(nuc_lookup_vec2, vget_high_u8(high_nibbles)); + uint8x16_t high_nucleotides = vcombine_u8(high_low, high_high); + + uint8x8_t low_low = vtbl2_u8(nuc_lookup_vec2, vget_low_u8(low_nibbles)); + uint8x8_t low_high = vtbl2_u8(nuc_lookup_vec2, vget_high_u8(low_nibbles)); + uint8x16_t low_nucleotides = vcombine_u8(low_low, low_high); +#endif + +#ifdef __aarch64__ + vst1q_u8_x2(seq, vzipq_u8(high_nucleotides, low_nucleotides)); +#else + // Avoid vst1q_u8_x2 as GCC erroneously omits it on 32-bit ARM + uint8x16x2_t nucleotides = {{ high_nucleotides, low_nucleotides }}; + vst2q_u8(seq, nucleotides); +#endif + seq += 32; + } + + if (len % 32 != 0) + nibble2base_default(nib, (char *) seq, len % 32); +} + +static __attribute__((constructor)) void nibble2base_resolve(void) { + if (cpu_supports_neon()) htslib_nibble2base = nibble2base_neon; +} + +#endif + +#endif // BUILDING_SIMD_NIBBLE2BASE + +// Potentially useful diagnostic, and prevents "empty translation unit" errors +const char htslib_simd[] = + "SIMD functions present:" +#ifdef BUILDING_SIMD_NIBBLE2BASE + " nibble2base" +#endif + "."; diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index a43ab15ae..1835ea2d6 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -25,6 +25,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include +#include #include #include #include @@ -71,6 +72,7 @@ typedef struct } aux_t; +static bcf_sr_regions_t *bcf_sr_regions_alloc(void); static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end); static bcf_sr_regions_t *_regions_init_string(const char *str); static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec); @@ -368,13 +370,22 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) if ( !files->explicit_regs && !files->streaming ) { int n = 0, i; - const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n); - for (i=0; iregions ) { + files->regions = bcf_sr_regions_alloc(); if ( !files->regions ) - files->regions = _regions_init_string(names[i]); - else - _regions_add(files->regions, names[i], -1, -1); + { + hts_log_error("Cannot allocate regions data structure"); + return 0; + } + } + + names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n); + for (i=0; iregions, names[i], -1, -1); } free(names); _regions_sort_and_merge(files->regions); @@ -532,7 +543,7 @@ static int _reader_seek(bcf_sr_t *reader, const char *seq, hts_pos_t start, hts_ } if (!reader->itr) { hts_log_error("Could not seek: %s:%"PRIhts_pos"-%"PRIhts_pos, seq, start + 1, end + 1); - assert(0); + abort(); } return 0; } @@ -956,6 +967,17 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file) return 1; } +// Allocate a new region list structure. +static bcf_sr_regions_t *bcf_sr_regions_alloc(void) +{ + bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); + if ( !reg ) return NULL; + + reg->start = reg->end = -1; + reg->prev_start = reg->prev_end = reg->prev_seq = -1; + return reg; +} + // Add a new region into a list. On input the coordinates are 1-based, inclusive, then stored 0-based, // inclusive. 
Sorting and merging step needed afterwards: qsort(..,cmp_regions) and merge_regions(). static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end) @@ -1037,9 +1059,8 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg) // wouldn't learn the chromosome name. static bcf_sr_regions_t *_regions_init_string(const char *str) { - bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); - reg->start = reg->end = -1; - reg->prev_start = reg->prev_end = reg->prev_seq = -1; + bcf_sr_regions_t *reg = bcf_sr_regions_alloc(); + if ( !reg ) return NULL; kstring_t tmp = {0,0,0}; const char *sp = str, *ep = str; @@ -1189,9 +1210,8 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr return reg; } - reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); - reg->start = reg->end = -1; - reg->prev_start = reg->prev_end = reg->prev_seq = -1; + reg = bcf_sr_regions_alloc(); + if ( !reg ) return NULL; reg->file = hts_open(regions, "rb"); if ( !reg->file ) diff --git a/tabix.1 b/tabix.1 index 9bf1d6891..f0dc7b519 100644 --- a/tabix.1 +++ b/tabix.1 @@ -1,4 +1,4 @@ -.TH tabix 1 "15 April 2024" "htslib-1.20" "Bioinformatics tools" +.TH tabix 1 "12 September 2024" "htslib-1.21" "Bioinformatics tools" .SH NAME .PP tabix \- Generic indexer for TAB-delimited genome position files diff --git a/tbx.c b/tbx.c index 5f861299a..662500549 100644 --- a/tbx.c +++ b/tbx.c @@ -229,8 +229,11 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_ case TBX_UCSC: type = "TBX_UCSC"; break; default: type = "TBX_GENERIC"; break; } - hts_log_error("Failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"", - type, str->s); + if (hts_is_utf16_text(str)) + hts_log_error("Failed to parse %s: offending line appears to be encoded as UTF-16", type); + else + hts_log_error("Failed to parse %s: was wrong -p [type] used?\nThe offending line was: \"%s\"", + type, str->s); 
return -1; } } @@ -321,7 +324,7 @@ static void adjust_max_ref_len_sam(const char *str, int64_t *max_ref_len) // files with very large contigs. static int adjust_n_lvls(int min_shift, int n_lvls, int64_t max_len) { - int64_t s = 1LL << (min_shift + n_lvls * 3); + int64_t s = hts_bin_maxpos(min_shift, n_lvls); max_len += 256; for (; max_len > s; ++n_lvls, s <<= 3) {} return n_lvls; diff --git a/test/annot-tsv/dst.11.txt b/test/annot-tsv/dst.11.txt new file mode 100644 index 000000000..c54ad153a --- /dev/null +++ b/test/annot-tsv/dst.11.txt @@ -0,0 +1,5 @@ +#ignore me +#chr beg end smpl +1 10 20 A +1 30 40 A +1 50 60 A diff --git a/test/annot-tsv/dst.12.txt b/test/annot-tsv/dst.12.txt new file mode 100644 index 000000000..9b26b79af --- /dev/null +++ b/test/annot-tsv/dst.12.txt @@ -0,0 +1,5 @@ +#ignore me +#chr,beg,end,smpl +1,10,20,A +1,30,40,A +1,50,60,A diff --git a/test/annot-tsv/out.11.1.txt b/test/annot-tsv/out.11.1.txt new file mode 100644 index 000000000..3de1f68ee --- /dev/null +++ b/test/annot-tsv/out.11.1.txt @@ -0,0 +1,3 @@ +1 10 20 A A +1 30 40 A B +1 50 60 A . diff --git a/test/annot-tsv/out.11.2.txt b/test/annot-tsv/out.11.2.txt new file mode 100644 index 000000000..a863f4e61 --- /dev/null +++ b/test/annot-tsv/out.11.2.txt @@ -0,0 +1,4 @@ +#[1]chr [2]beg [3]end [4]smpl [5]src_smpl +1 10 20 A A +1 30 40 A B +1 50 60 A . diff --git a/test/annot-tsv/out.11.3.txt b/test/annot-tsv/out.11.3.txt new file mode 100644 index 000000000..7a37130db --- /dev/null +++ b/test/annot-tsv/out.11.3.txt @@ -0,0 +1,4 @@ +#chr beg end smpl src_smpl +1 10 20 A A +1 30 40 A B +1 50 60 A . diff --git a/test/annot-tsv/out.12.1.txt b/test/annot-tsv/out.12.1.txt new file mode 100644 index 000000000..7b6d0e994 --- /dev/null +++ b/test/annot-tsv/out.12.1.txt @@ -0,0 +1,3 @@ +1,10,20,A,A +1,30,40,A,B +1,50,60,A,. 
diff --git a/test/annot-tsv/out.13.1.txt b/test/annot-tsv/out.13.1.txt new file mode 100644 index 000000000..a1bf0be68 --- /dev/null +++ b/test/annot-tsv/out.13.1.txt @@ -0,0 +1,2 @@ +1 10 20 long long,short +1 15 15 short long,short diff --git a/test/annot-tsv/out.13.2.txt b/test/annot-tsv/out.13.2.txt new file mode 100644 index 000000000..7c543b134 --- /dev/null +++ b/test/annot-tsv/out.13.2.txt @@ -0,0 +1,2 @@ +1 10 20 long long +1 15 15 short short diff --git a/test/annot-tsv/out.13.3.txt b/test/annot-tsv/out.13.3.txt new file mode 100644 index 000000000..8911afad8 --- /dev/null +++ b/test/annot-tsv/out.13.3.txt @@ -0,0 +1,2 @@ +1 10 20 long long +1 15 15 short long,short diff --git a/test/annot-tsv/out.13.4.txt b/test/annot-tsv/out.13.4.txt new file mode 100644 index 000000000..f7a0e4d88 --- /dev/null +++ b/test/annot-tsv/out.13.4.txt @@ -0,0 +1,2 @@ +1 10 20 long long,short +1 15 15 short short diff --git a/test/annot-tsv/src.11.txt b/test/annot-tsv/src.11.txt new file mode 100644 index 000000000..26eb20be6 --- /dev/null +++ b/test/annot-tsv/src.11.txt @@ -0,0 +1,5 @@ +#ignore me +#chr1 beg1 end1 smpl1 +#chr2 beg2 end2 smpl2 +1 10 20 A +1 30 40 B diff --git a/test/annot-tsv/src.12.txt b/test/annot-tsv/src.12.txt new file mode 100644 index 000000000..9b7ac367c --- /dev/null +++ b/test/annot-tsv/src.12.txt @@ -0,0 +1,5 @@ +#ignore me +#chr1,beg1,end1,smpl1 +#chr2,beg2,end2,smpl2 +1,10,20,A +1,30,40,B diff --git a/test/annot-tsv/src.13.txt b/test/annot-tsv/src.13.txt new file mode 100644 index 000000000..de3338de1 --- /dev/null +++ b/test/annot-tsv/src.13.txt @@ -0,0 +1,2 @@ +1 10 20 long +1 15 15 short diff --git a/test/base_mods/MM-explicit.sam b/test/base_mods/MM-explicit.sam index e85afa293..c230a9d82 100644 --- a/test/base_mods/MM-explicit.sam +++ b/test/base_mods/MM-explicit.sam @@ -19,7 +19,7 @@ @CO ATCATCATTCCTACCGCTATAGCCT r3; mixture @CO - - . -. - -- @CO M M -@CO - - ?? ?? ? -- +@CO - - ?? ?? ? 
-- @CO hH hh h -- @CO r1 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+mh,2,0,1; Ml:B:C,200,10,50,170,160,20 diff --git a/test/bgzf_boundaries/bgzf_boundaries1.bam b/test/bgzf_boundaries/bgzf_boundaries1.bam new file mode 100644 index 000000000..264e22fad Binary files /dev/null and b/test/bgzf_boundaries/bgzf_boundaries1.bam differ diff --git a/test/bgzf_boundaries/bgzf_boundaries2.bam b/test/bgzf_boundaries/bgzf_boundaries2.bam new file mode 100644 index 000000000..704804eaf Binary files /dev/null and b/test/bgzf_boundaries/bgzf_boundaries2.bam differ diff --git a/test/bgzf_boundaries/bgzf_boundaries3.bam b/test/bgzf_boundaries/bgzf_boundaries3.bam new file mode 100644 index 000000000..328a27451 Binary files /dev/null and b/test/bgzf_boundaries/bgzf_boundaries3.bam differ diff --git a/test/header_syms.pl b/test/header_syms.pl index fe5128a78..a8d4a885c 100755 --- a/test/header_syms.pl +++ b/test/header_syms.pl @@ -60,6 +60,7 @@ sub extract_symbols { open(my $f, '<', $file) || die "Couldn't open $file : $!\n"; my $text = <$f>; + $text =~ tr/\r//d; close($f) || die "Error reading $file : $!\n"; # Get rid of comments diff --git a/test/hfile.c b/test/hfile.c index 8f06a971f..741cf7a8d 100644 --- a/test/hfile.c +++ b/test/hfile.c @@ -35,7 +35,8 @@ DEALINGS IN THE SOFTWARE. */ #include "../htslib/hts_defs.h" #include "../htslib/kstring.h" -void HTS_NORETURN fail(const char *format, ...) +void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN +fail(const char *format, ...) 
{ int err = errno; va_list args; diff --git a/test/plugins-dlhts.c b/test/plugins-dlhts.c index f90e3bd74..33f432fbd 100644 --- a/test/plugins-dlhts.c +++ b/test/plugins-dlhts.c @@ -177,7 +177,7 @@ int main(int argc, char **argv) #else -int main() +int main(void) { printf("Tests skipped due to " SKIP "\n"); return EXIT_SUCCESS; diff --git a/test/range.out2 b/test/range.out2 new file mode 100644 index 000000000..22e6fd542 --- /dev/null +++ b/test/range.out2 @@ -0,0 +1,21 @@ +@HD VN:1.4 SO:coordinate +@RG ID:1 PL:ILLUMINA PU:130410_HS18_09653_A_C1JT2ACXX_4 LB:7053878 DT:2013-04-10T00:00:00+0100 SM:ERS225193 CN:SC +@SQ SN:CHROMOSOME_I LN:1009800 M5:8ede36131e0dbf3417807e48f77f3ebd UR:/ +@SQ SN:CHROMOSOME_II LN:5000 M5:8e7993f7a93158587ee897d7287948ec UR:/ +@SQ SN:CHROMOSOME_III LN:5000 M5:3adcb065e1cf74fafdbba1e8c352b323 UR:/ +@SQ SN:CHROMOSOME_IV LN:5000 M5:251af66a69ee589c9f3757340ec2de6f UR:/ +@SQ SN:CHROMOSOME_V LN:5000 M5:cf200a65fb754836dcc56b24b3170ee8 UR:/ +@SQ SN:CHROMOSOME_X LN:5000 M5:6f9368fd2192c89c613718399d2d31fc UR:/ +@SQ SN:CHROMOSOME_MtDNA LN:5000 M5:cd05857ece6411f40257a565ccfe15bb UR:/ +@PG ID:scramble PN:scramble VN:1.14.7 CL:scramble -M -I sam -s 50 -r /tmp/ce.fa - /tmp/ERR304769_subset.cram +HS18_09653:4:2108:14085:93656 147 CHROMOSOME_I 1122 60 100M = 756 -466 AATTTGCAAGAAAATTCGCAAGAAATTTGTATTAAAAACTGTTCAAAATTTTTGGAAATTAGTTTAAAAATCTCACATTTTTTTTAGAAAAATTATTTTT GEFGHHFHEGGIFEFHFHFECDDE? 
X0:i:1 X1:i:0 XG:i:0 AM:i:37 SM:i:37 XM:i:0 XO:i:0 XT:A:U MD:Z:100 NM:i:0 RG:Z:1 +HS18_09653:4:2314:21094:58611 99 CHROMOSOME_II 1353 60 100M = 1775 522 ATTTTTCTATTCTTGTGAGCTCAGGACACCTCATACAACTCCAGAGAAAATGTGTCTCATTATTCTTGTCTTTTTTCAAGATCTAATCAATTTTCTACAT D;?FBD9CDBGBGG?GF8DFGFFHDACDGFGGD/HGHHGFFEFGD=FGIG0D.GH7HHFFGFDGGFF:HFDGGHGGGGE;F:@GGEGGCFGFGHHB@FHG X0:i:1 X1:i:0 BC:Z:GTNTGCCG XG:i:0 AM:i:37 SM:i:37 XM:i:0 XO:i:0 QT:Z:=?!4AD22 XT:A:U MD:Z:100 NM:i:0 RG:Z:1 +HS18_09653:4:2108:10782:59721 83 CHROMOSOME_II 1366 60 100M = 1241 -225 TGTGAGCTCAGGACACCTCATACAACTCCAGAGAAAATGTGTCTCATTATTCTTGTCTTTTTTCAAGATCTAATCAATTTTCTACATTAACGACGTTTTT IFGHDHHFFGHIIEGGGHEHHJGGGFGGHFHHGFGGGGGGHHDHFHGIF=IFIFHIGIHGHF=HGJGGGFGGGHEEHGFGGFGEGGGGEGFFGGGFEBCD X0:i:1 X1:i:0 BC:Z:GTNTGCCG XG:i:0 AM:i:37 SM:i:37 XM:i:0 XO:i:0 QT:Z:=?!4AD+2 XT:A:U MD:Z:100 NM:i:0 RG:Z:1 +HS18_09653:4:2111:5602:28724 99 CHROMOSOME_II 1416 60 100M = 1881 565 TCTTGTCTTTTTTCAAGATCTAATCAATTTTCTACATTAACGACGTTTTTGTCGTTCTGCTTCTTTTTTTCGTTCGTTTGTCTCGTCCATCAGCTGTCCA ECE>EGGGGFGGGGDGFEFGGGFHEGHGIIFGFEJGHHFGGGHFGEFHIHGFFGGECGFHHGGFGHIHHHGEGGHBGBGHHEHGEBGGFFGFFHHGCGFF X0:i:1 X1:i:0 BC:Z:GTNTGCCG XG:i:0 AM:i:37 SM:i:37 XM:i:0 XO:i:0 QT:Z:=@!4AD24 XT:A:U MD:Z:100 NM:i:0 RG:Z:1 +HS18_09653:4:2103:6720:15025 99 CHROMOSOME_II 1459 60 100M = 1617 258 CGTTTTTGTCGTTCTGCTTCTTTTTTTCGTTCGTTTGTCTCGTCCATCAGCTGTCCACTCATTTCTCTCCCACTCACTAGGCAGTGCTTTGTTTGGTTCC ECEFFGGGEHGEGGGGGGHFGGGHIGHIGGGG?HFGHGEGFBFGGGFGIHGDGGDEDFFFGGAEFGGGDDGEGGFGGEEEGEIFGFG@E>EFGDG?HCFCF>DGGHDFFCHF>=G;CFBEHG @@ -1408,16 +1408,16 @@ static void check_big_ref(int parse_header) "@HD\tVN:1.4\n" "@SQ\tSN:large#1\tLN:5000000000\n" "@SQ\tSN:small#1\tLN:100\n" - "@SQ\tSN:large#2\tLN:9223372034707292158\n" + "@SQ\tSN:large#2\tLN:4611686018427387904\n" "@SQ\tSN:small#2\tLN:1\n" "r1\t0\tlarge#1\t4999999000\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n" "r2\t0\tsmall#1\t1\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n" - "r3\t0\tlarge#2\t9223372034707292000\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n" - 
"p1\t99\tlarge#2\t1\t50\t8M\t=\t9223372034707292150\t9223372034707292158\tACGTACGT\tabcdefgh\n" - "p1\t147\tlarge#2\t9223372034707292150\t50\t8M\t=\t1\t-9223372034707292158\tACGTACGT\tabcdefgh\n" + "r3\t0\tlarge#2\t4611686018427387000\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n" + "p1\t99\tlarge#2\t1\t50\t8M\t=\t4611686018427387895\t4611686018427387903\tACGTACGT\tabcdefgh\n" + "p1\t147\tlarge#2\t4611686018427387895\t50\t8M\t=\t1\t-4611686018427387903\tACGTACGT\tabcdefgh\n" "r4\t0\tsmall#2\t2\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n"; const hts_pos_t expected_lengths[] = { - 5000000000LL, 100LL, 9223372034707292158LL, 1LL + 5000000000LL, 100LL, 4611686018427387904LL, 1LL }; const int expected_tids[] = { 0, 1, 2, 2, 2, 3 @@ -1426,11 +1426,11 @@ static void check_big_ref(int parse_header) -1, -1, -1, 2, 2, -1 }; const hts_pos_t expected_positions[] = { - 4999999000LL - 1, 1LL - 1, 9223372034707292000LL - 1, 1LL - 1, - 9223372034707292150LL - 1, 2LL - 1 + 4999999000LL - 1, 1LL - 1, 4611686018427387000LL - 1, 1LL - 1, + 4611686018427387895LL - 1, 2LL - 1 }; const hts_pos_t expected_mpos[] = { - -1, -1, -1, 9223372034707292150LL - 1, 1LL - 1, -1 + -1, -1, -1, 4611686018427387895LL - 1, 1LL - 1, -1 }; samFile *in = NULL, *out = NULL; sam_hdr_t *header = NULL, *dup_header = NULL; @@ -1997,7 +1997,7 @@ static void test_mempolicy(void) } } -static void test_bam_set1_minimal() +static void test_bam_set1_minimal(void) { int r; bam1_t *bam = NULL; @@ -2028,7 +2028,7 @@ static void test_bam_set1_minimal() if (bam != NULL) bam_destroy1(bam); } -static void test_bam_set1_full() +static void test_bam_set1_full(void) { const char *qname = "!??AAA~~~~"; const uint32_t cigar[] = { 6 << BAM_CIGAR_SHIFT | BAM_CMATCH, 2 << BAM_CIGAR_SHIFT | BAM_CINS, 2 << BAM_CIGAR_SHIFT | BAM_CMATCH }; @@ -2075,7 +2075,7 @@ static void test_bam_set1_full() if (bam != NULL) bam_destroy1(bam); } -static void test_bam_set1_even_and_odd_seq_len() +static void test_bam_set1_even_and_odd_seq_len(void) { const char 
*seq_even = "TGGACTACGA"; const char *seq_odd = "TGGACTACGAC"; @@ -2105,7 +2105,7 @@ static void test_bam_set1_even_and_odd_seq_len() if (bam != NULL) bam_destroy1(bam); } -static void test_bam_set1_with_seq_but_no_qual() +static void test_bam_set1_with_seq_but_no_qual(void) { const char *seq = "TGGACTACGA"; @@ -2129,7 +2129,7 @@ static void test_bam_set1_with_seq_but_no_qual() if (bam != NULL) bam_destroy1(bam); } -static void test_bam_set1_validate_qname() +static void test_bam_set1_validate_qname(void) { int r; bam1_t *bam = NULL; @@ -2146,7 +2146,7 @@ static void test_bam_set1_validate_qname() if (bam != NULL) bam_destroy1(bam); } -static void test_bam_set1_validate_seq() +static void test_bam_set1_validate_seq(void) { int r; bam1_t *bam = NULL; @@ -2163,7 +2163,7 @@ static void test_bam_set1_validate_seq() if (bam != NULL) bam_destroy1(bam); } -static void test_bam_set1_validate_cigar() +static void test_bam_set1_validate_cigar(void) { const uint32_t cigar[] = { 20 << BAM_CIGAR_SHIFT | BAM_CMATCH }; const char *seq = "TGGACTACGA"; @@ -2192,7 +2192,7 @@ static void test_bam_set1_validate_cigar() if (bam != NULL) bam_destroy1(bam); } -static void test_bam_set1_validate_size_limits() +static void test_bam_set1_validate_size_limits(void) { const uint32_t cigar[] = { 20 << BAM_CIGAR_SHIFT | BAM_CMATCH }; const char *seq = "TGGACTACGA"; @@ -2224,7 +2224,7 @@ static void test_bam_set1_validate_size_limits() if (bam != NULL) bam_destroy1(bam); } -static void test_bam_set1_write_and_read_back() +static void test_bam_set1_write_and_read_back(void) { const char *qname = "q1"; const uint32_t cigar[] = { 6 << BAM_CIGAR_SHIFT | BAM_CMATCH, 2 << BAM_CIGAR_SHIFT | BAM_CINS, 2 << BAM_CIGAR_SHIFT | BAM_CMATCH }; diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c index 80daf0423..0fb59905c 100644 --- a/test/test-bcf-sr.c +++ b/test/test-bcf-sr.c @@ -36,11 +36,13 @@ #include #include +#include "../htslib/hts_defs.h" #include "../htslib/synced_bcf_reader.h" #include 
"../htslib/hts.h" #include "../htslib/vcf.h" -void error(const char *format, ...) +void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN +error(const char *format, ...) { va_list ap; va_start(ap, format); @@ -49,7 +51,7 @@ void error(const char *format, ...) exit(EXIT_FAILURE); } -void usage(int exit_code) +void HTS_NORETURN usage(int exit_code) { fprintf(stderr, "Usage: test-bcf-sr [OPTIONS] vcf-list.txt\n"); fprintf(stderr, " test-bcf-sr [OPTIONS] -args file1.bcf [...]\n"); diff --git a/test/test-bcf-translate.c b/test/test-bcf-translate.c index c2f069e39..263e71eb8 100644 --- a/test/test-bcf-translate.c +++ b/test/test-bcf-translate.c @@ -29,7 +29,7 @@ #include "../htslib/vcf.h" -void error(const char *format, ...) +void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...) { va_list ap; va_start(ap, format); diff --git a/test/test-bcf_set_variant_type.c b/test/test-bcf_set_variant_type.c index e5092084e..eb12ecde3 100644 --- a/test/test-bcf_set_variant_type.c +++ b/test/test-bcf_set_variant_type.c @@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include "../htslib/hts.h" #include "../vcf.c" -void error(const char *format, ...) +void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...) { va_list ap; va_start(ap, format); @@ -39,7 +39,7 @@ void error(const char *format, ...) exit(-1); } -static void test_bcf_set_variant_type() +static void test_bcf_set_variant_type(void) { // Test SNVs bcf_variant_t var1; diff --git a/test/test-vcf-api.c b/test/test-vcf-api.c index eff653686..b86b71d99 100644 --- a/test/test-vcf-api.c +++ b/test/test-vcf-api.c @@ -33,7 +33,7 @@ DEALINGS IN THE SOFTWARE. */ #include "../htslib/kstring.h" #include "../htslib/kseq.h" -void error(const char *format, ...) +void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...) 
{ va_list ap; va_start(ap, format); @@ -625,7 +625,7 @@ void test_invalid_end_tag(void) hts_set_log_level(logging); } -void test_open_format() { +void test_open_format(void) { char mode[5]; int ret; strcpy(mode, "r"); diff --git a/test/test.pl b/test/test.pl index 03eca1129..b5f52bdfb 100755 --- a/test/test.pl +++ b/test/test.pl @@ -819,6 +819,43 @@ sub test_view } } + # BAM files with alignment records that span BGZF blocks + # HTSlib starts a new block if an alignment is likely to overflow the + # current one, so for its own data this will only happen for records + # longer than 64kbytes. As other implementations may not do this, + # check that reading works correctly on some BAM files where records + # have been deliberately split between BGZF blocks. + print "test_view testing BAM records in multiple BGZF blocks:\n"; + $test_view_failures = 0; + my $src_sam = "ce#1.sam"; + foreach my $test_bam (qw(bgzf_boundaries/bgzf_boundaries1.bam + bgzf_boundaries/bgzf_boundaries2.bam + bgzf_boundaries/bgzf_boundaries3.bam)) { + testv $opts, "./test_view $tv_args -p $test_bam.tmp.sam $test_bam"; + testv $opts, "./compare_sam.pl $test_bam.tmp.sam $src_sam"; + } + + # Test a file with a long alignment record. Boundaries hit in the middle of + # the CIGAR data, and in the sequence. Generate the test file here as it's + # big, but with fairly simple contents. 
+ $src_sam = "bgzf_boundaries/large_rec.tmp.sam"; + open(my $test_sam, '>', $src_sam) || die "Couldn't open $src_sam : $!\n"; + print $test_sam "\@HD\tVN:1.6\tSO:coordinate\n"; + print $test_sam "\@SQ\tSN:ref\tLN:100000\n"; + print $test_sam "read\t0\tref\t1\t60\t", "1M1I" x 16000, "\t*\t0\t0\t", "A" x 32000, "\t", "Q" x 32000, "\n"; + close($test_sam) || die "Error on closing $src_sam : $!\n"; + + testv $opts, "./test_view $tv_args -b -l 0 -p $src_sam.bam $src_sam"; + testv $opts, "./test_view $tv_args -p $src_sam.bam.sam $src_sam.bam"; + testv $opts, "./compare_sam.pl $src_sam $src_sam.bam.sam"; + + if ($test_view_failures == 0) { + passed($opts, "BAM records spanning multiple BGZF block tests"); + } else { + failed($opts, "BAM records spanning multiple BGZF block tests", + "$test_view_failures subtests failed"); + } + # embed_ref=2 mode print "test_view testing embed_ref=2:\n"; $test_view_failures = 0; @@ -850,6 +887,18 @@ sub test_view testv $opts, "./test_view $tv_args range.bam $regions > range.tmp"; testv $opts, "./compare_sam.pl range.tmp range.out"; + # Regression check for out-of-bounds read on regions list (see + # samtools#2063). As reg_insert() allocates at least four slots + # for chromosome regions, we need more than that many in the second + # chr. requested to ensure it has a bigger array. 
+ + $regions = "CHROMOSOME_I:1122-1122 CHROMOSOME_II:1136-1136 CHROMOSOME_II:1241-1241 CHROMOSOME_II:1267-1267 CHROMOSOME_II:1326-1326 CHROMOSOME_II:1345-1345 CHROMOSOME_II:1353-1353 CHROMOSOME_II:1366-1366 CHROMOSOME_II:1416-1416 CHROMOSOME_II:1459-1459 CHROMOSOME_II:1536-1536"; + testv $opts, "./test_view $tv_args -i reference=ce.fa -M range.cram $regions > range.tmp"; + testv $opts, "./compare_sam.pl range.tmp range.out2"; + + testv $opts, "./test_view $tv_args -M range.bam $regions > range.tmp"; + testv $opts, "./compare_sam.pl range.tmp range.out2"; + if ($test_view_failures == 0) { passed($opts, "range.cram tests"); } else { @@ -1414,4 +1463,19 @@ sub test_annot_tsv run_annot_tsv($opts,src=>'src.10.txt',dst=>'dst.10.txt',out=>'out.10.4.txt',args=>'-m smpl -f smpl'); run_annot_tsv($opts,src=>'src.10.txt',dst=>'dst.10.txt',out=>'out.10.5.txt',args=>'-m smpl '); run_annot_tsv($opts,src=>'src.10.txt',dst=>'dst.10.txt',out=>'out.10.6.txt',args=>'-m smpl -x'); + run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>'-c 1,2,3:1,2,3 -f 4:5 -h 0:0'); + run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>'-c chr1,beg1,end1:chr,beg,end -f smpl1:src_smpl -h 2:2 -II'); + run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>'-c chr1,beg1,end1:chr,beg,end -f smpl1:src_smpl -h 2:-1 -II'); + run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.2.txt',args=>'-c chr1,beg1,end1:chr,beg,end -f smpl1:src_smpl -h 2:2'); + run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.2.txt',args=>'-c chr2,beg2,end2:chr,beg,end -f smpl2:src_smpl -h 3:2'); + run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.3.txt',args=>'-c chr1,beg1,end1:chr,beg,end -f smpl1:src_smpl -h 2:2 -I'); + run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.3.txt',args=>'-c chr2,beg2,end2:chr,beg,end -f smpl2:src_smpl -h 3:2 -I'); + 
run_annot_tsv($opts,src=>'src.12.txt',dst=>'dst.12.txt',out=>'out.12.1.txt',args=>'-c 1,2,3:1,2,3 -f 4:5 -h 0:0 -d ,'); + run_annot_tsv($opts,src=>'src.12.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>q[-c 1,2,3:1,2,3 -f 4:5 -h 0:0 -d $',:\t']); + run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.1.txt',args=>q[-c 1,2,3 -f 4:5]); + run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.1.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5]); + run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.2.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5 -r]); + run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.2.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5,0.5]); + run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.3.txt',args=>q[-c 1,2,3 -f 4:5 -O 0,1]); + run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.4.txt',args=>q[-c 1,2,3 -f 4:5 -O 1,0]); } diff --git a/test/test_bgzf.c b/test/test_bgzf.c index 6cb6db902..bda68d1e9 100644 --- a/test/test_bgzf.c +++ b/test/test_bgzf.c @@ -1,6 +1,6 @@ /* test/test_bgzf.c -- bgzf unit tests - Copyright (C) 2017, 2019, 2022-2023 Genome Research Ltd + Copyright (C) 2017, 2019, 2022-2024 Genome Research Ltd Author: Robert Davies @@ -179,7 +179,7 @@ static int try_bgzf_close(BGZF **bgz, const char *name, const char *func, int ex static ssize_t try_bgzf_read(BGZF *fp, void *data, size_t length, const char *name, const char *func) { - ssize_t got = bgzf_read(fp, data, length); + ssize_t got = bgzf_read_small(fp, data, length); if (got < 0) { fprintf(stderr, "%s : Error from bgzf_read %s : %s\n", func, name, strerror(errno)); @@ -189,7 +189,7 @@ static ssize_t try_bgzf_read(BGZF *fp, void *data, size_t length, static ssize_t try_bgzf_write(BGZF *fp, const void *data, size_t length, const char *name, const char *func) { - ssize_t put = bgzf_write(fp, data, length); + ssize_t put = bgzf_write_small(fp, data, length); if (put < (ssize_t) length) { fprintf(stderr, "%s : %s %s 
: %s\n", func, put < 0 ? "Error writing to" : "Short write on", @@ -878,6 +878,49 @@ static int test_tell_read(Files *f, const char *mode) { return -1; } +static int test_useek_read_small(Files *f, const char *mode) { + + BGZF* bgz = NULL; + char bg_buf[99]; + + bgz = try_bgzf_open(f->tmp_bgzf, mode, __func__); + if (!bgz) goto fail; + + + if (try_bgzf_write(bgz, "#>Hello, World!\n", 16, + f->tmp_bgzf, __func__) != 16) + goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; + + bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); + if (!bgz) goto fail; + + if (try_bgzf_getc(bgz, 0, '#', f->tmp_bgzf, __func__) < 0 || + try_bgzf_getc(bgz, 1, '>', f->tmp_bgzf, __func__) < 0) + goto fail; + + if (try_bgzf_read(bgz, bg_buf, 5, f->tmp_bgzf, __func__) != 5) + goto fail; + if (memcmp(bg_buf, "Hello", 5) != 0) + goto fail; + + if (try_bgzf_useek(bgz, 9, SEEK_SET, f->tmp_bgzf, __func__) < 0) + goto fail; + + if (try_bgzf_read(bgz, bg_buf, 5, f->tmp_bgzf, __func__) != 5) + goto fail; + if (memcmp(bg_buf, "World", 5) != 0) + goto fail; + + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; + return 0; + + fail: + fprintf(stderr, "%s: failed\n", __func__); + if (bgz) bgzf_close(bgz); + return -1; +} + static int test_bgzf_getline(Files *f, const char *mode, int nthreads) { BGZF* bgz = NULL; ssize_t bg_put; @@ -1098,6 +1141,10 @@ int main(int argc, char **argv) { if (test_tell_read(&f, "w") != 0) goto out; if (test_tell_read(&f, "wu") != 0) goto out; + // bgzf_useek and bgzf_read_small + if (test_useek_read_small(&f, "w") != 0) goto out; + if (test_useek_read_small(&f, "wu") != 0) goto out; + // getline if (test_bgzf_getline(&f, "w", 0) != 0) goto out; if (test_bgzf_getline(&f, "w", 1) != 0) goto out; diff --git a/test/test_faidx.c b/test/test_faidx.c index 566149071..f73f973a0 100644 --- a/test/test_faidx.c +++ b/test/test_faidx.c @@ -26,7 +26,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include -#include +#include #include "../htslib/faidx.h" diff --git a/test/test_khash.c b/test/test_khash.c new file mode 100644 index 000000000..a2e80b581 --- /dev/null +++ b/test/test_khash.c @@ -0,0 +1,502 @@ +/* test_khash.c -- khash unit tests + + Copyright (C) 2024 Genome Research Ltd. + Copyright (C) 2024 Centre for Population Genomics. + + Author: Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#include + +#include +#include +#include +#include +#include +#include +#ifdef HAVE_CLOCK_GETTIME_CPUTIME +#include +#else +#include +#endif +#include +#include +#include + +#include +#include + +#define MAX_ENTRIES 99999999 + +KHASH_MAP_INIT_STR(str2int, int) + +static void write_stats_str2int(khash_t(str2int) *h) { + khint_t empty = 0, deleted = 0, hist_size = 0, *hist = NULL; + + if (kh_stats(str2int, h, &empty, &deleted, &hist_size, &hist) == 0) { + khint_t i; + printf("n_buckets = %u\n", + kh_n_buckets(h)); + printf("empty = %u\n", empty); + printf("deleted = %u\n", deleted); + for (i = 0; i < hist_size; i++) { + printf("dist[ %8u ] = %u\n", i, hist[i]); + } + free(hist); + } +} + +char * make_keys(size_t num, size_t kl) { + size_t i; + char *keys; + + if (num > MAX_ENTRIES) return NULL; + keys = malloc(kl * num); + if (!keys) { + perror(NULL); + return NULL; + } + for (i = 0; i < num; i++) { + if (snprintf(keys + kl * i, kl, "test%zu", i) >= kl) { + free(keys); + return NULL; + } + } + + return keys; +} + +static int add_str2int_entry(khash_t(str2int) *h, char *key, khint_t val) { + int ret = 0; + khint_t k = kh_put(str2int, h, key, &ret); + + if (ret != 1 && ret != 2) { + fprintf(stderr, "Unexpected return from kh_put(%s) : %d\n", key, ret); + return -1; + } + kh_val(h, k) = val; + return 0; +} + +static int check_str2int_entry(khash_t(str2int) *h, char *key, khint_t val, + uint8_t is_deleted) { + khint_t k = kh_get(str2int, h, key); + if (is_deleted) { + if (k < kh_end(h)) { + fprintf(stderr, "Found deleted entry %s in hash table\n", key); + return -1; + } else { + return 0; + } + } + + if (k >= kh_end(h)) { + fprintf(stderr, "Couldn't find %s in hash table\n", key); + return -1; + } + if (strcmp(kh_key(h, k), key) != 0) { + fprintf(stderr, "Wrong key in hash table, expected %s got %s\n", + key, kh_key(h, k)); + return -1; + } + if (kh_val(h, k) != val) { + fprintf(stderr, "Wrong value in hash table, expected %u got %u\n", + val, kh_val(h, k)); + 
return -1; + } + return 0; +} + +static int del_str2int_entry(khash_t(str2int) *h, char *key) { + khint_t k = kh_get(str2int, h, key); + if (k >= kh_end(h)) { + fprintf(stderr, "Couldn't find %s to delete from hash table\n", key); + return -1; + } + kh_del(str2int, h, k); + return 0; +} + +static int test_str2int(size_t max, size_t to_del, int show_stats) { + const size_t kl = 16; + size_t mask = max; + char *keys = make_keys(max, kl); + uint8_t *flags = NULL; + khash_t(str2int) *h; + khint_t i; + uint32_t r = 0x533d; + + if (!keys) return -1; + + h = kh_init(str2int); + if (!h) goto memfail; + + // Add some entries + for (i = 0; i < max; i++) { + if (add_str2int_entry(h, keys + i * kl, i) != 0) + goto fail; + } + + // Check they exist + for (i = 0; i < max; i++) { + if (check_str2int_entry(h, keys + i * kl, i, 0) != 0) + goto fail; + } + + if (show_stats) { + printf("Initial fill:\n"); + write_stats_str2int(h); + } + + // Delete a random selection + flags = calloc(max, sizeof(*flags)); + if (!flags) { + perror(""); + goto fail; + } + + kroundup_size_t(mask); + --mask; + + // Note that this method may become slow for a high %age removed + // as it searches for the last available entries. Despite this, it + // seems to be acceptable for the number of entries allowed. 
+ for (i = 0; i < to_del; i++) { + khint_t victim; + // LFSR, see http://users.ece.cmu.edu/~koopman/lfsr/index.html + do { + r = (r >> 1) ^ ((r & 1) * 0x80000057U); + victim = (r & mask) - 1; + } while (victim >= max || flags[victim]); + if (del_str2int_entry(h, keys + victim * kl) != 0) + goto fail; + flags[victim] = 1; + } + + // Check correct entries are present + for (i = 0; i < max; i++) { + if (check_str2int_entry(h, keys + i * kl, i, flags[i]) != 0) + goto fail; + } + + if (show_stats) { + printf("\nAfter deletion:\n"); + write_stats_str2int(h); + } + + // Re-insert deleted entries + for (i = 0; i < max; i++) { + if (flags[i] && add_str2int_entry(h, keys + i * kl, i) != 0) + goto fail; + } + + // Ensure they're all back + for (i = 0; i < max; i++) { + if (check_str2int_entry(h, keys + i * kl, i, 0) != 0) + goto fail; + } + + if (show_stats) { + printf("\nAfter re-insert:\n"); + write_stats_str2int(h); + } + + kh_destroy(str2int, h); + free(keys); + free(flags); + + return 0; + + memfail: + perror(NULL); + fail: + kh_destroy(str2int, h); + free(keys); + free(flags); + return -1; +} + +static size_t read_keys(const char *keys_file, char **keys_out, + char ***key_locations_out) { + FILE *in = fopen(keys_file, "r"); + char *keys = NULL, *key, *end; + size_t keys_size = 1000000; + size_t keys_used = 0; + size_t avail, got, nkeys = 0; + char **key_locations = NULL; + struct stat fileinfo = { 0 }; + + if (!in) + return 0; + + // Slurp entire file + if (fstat(fileno(in), &fileinfo) < 0) { + if (fileinfo.st_size > keys_size) + keys_size = (size_t) fileinfo.st_size; + } + + keys = malloc(keys_size + 1); + if (!keys) + goto fail; + + do { + avail = keys_size - keys_used; + if (avail == 0) { + size_t new_size = keys_size + 1000000; + char *new_keys = realloc(keys, new_size + 1); + if (!new_keys) + goto fail; + keys = new_keys; + keys_size = new_size; + avail = keys_size - keys_used; + } + got = fread(keys + keys_used, 1, avail, in); + keys_used += got; + } while (got == 
avail); + keys[keys_used] = '\0'; + + if (ferror(in)) + goto fail; + if (fclose(in) < 0) + goto fail; + in = NULL; + + // Split by line + end = keys + keys_used; + for (key = keys; key != NULL; key = memchr(key, '\n', end - key)) { + while (*key == '\n') key++; + if (key < end) nkeys++; + } + + key_locations = malloc(nkeys * sizeof(*key_locations)); + if (!key_locations) + goto fail; + + nkeys = 0; + for (key = keys; key != NULL; key = memchr(key, '\n', end - key)) { + while (*key == '\n') *key++ = '\0'; + if (key < end) { + key_locations[nkeys++] = key; + } + } + *keys_out = keys; + *key_locations_out = key_locations; + return nkeys; + + fail: + if (in) + fclose(in); + free(keys); + *keys_out = NULL; + *key_locations_out = NULL; + return 0; +} + +static long long get_time(void) { +#ifdef HAVE_CLOCK_GETTIME_CPUTIME + struct timespec ts; + if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts) < 0) { + perror("clock_gettime"); + return -1; + } + return ts.tv_sec * 1000000000LL + ts.tv_nsec; +#else + struct timeval tv; + if (gettimeofday(&tv, NULL) < 0) { + perror("gettimeofday"); + return -1; + } + return tv.tv_sec * 1000000LL + tv.tv_usec; +#endif +} + +static char * fmt_time(long long elapsed) { + static char buf[64]; +#ifdef HAVE_CLOCK_GETTIME_CPUTIME + long long sec = elapsed / 1000000000; + long long nsec = elapsed % 1000000000; + snprintf(buf, sizeof(buf), "%lld.%09lld processor seconds", sec, nsec); +#else + long long sec = elapsed / 1000000; + long long usec = elapsed % 1000000; + snprintf(buf, sizeof(buf), "%lld.%06lld wall-time seconds", sec, usec); +#endif + return buf; +} + +static int benchmark(const char *keys_file) { + const size_t kl = 16; + size_t max = 50000000; + size_t i; + char *keys = NULL; + char **key_locations = NULL; + khash_t(str2int) *h; + long long start, end; + + if (keys_file) { + max = read_keys(keys_file, &keys, &key_locations); + } else { + keys = make_keys(max, kl); + } + + if (!keys) return -1; + + h = kh_init(str2int); + if (!h) goto 
fail; + + if ((start = get_time()) < 0) + goto fail; + + if (keys_file) { + for (i = 0; i < max; i++) { + int ret; + khint_t k = kh_put(str2int, h, key_locations[i], &ret); + if (ret < 0) { + fprintf(stderr, "Unexpected return from kh_put(%s) : %d\n", + key_locations[i], ret); + goto fail; + } + kh_val(h, k) = i; + } + } else { + for (i = 0; i < max; i++) { + int ret; + khint_t k = kh_put(str2int, h, keys + i * kl, &ret); + if (ret <= 0) { + fprintf(stderr, "Unexpected return from kh_put(%s) : %d\n", + keys + i * kl, ret); + goto fail; + } + kh_val(h, k) = i; + } + } + + if ((end = get_time()) < 0) + goto fail; + + printf("Insert %zu %s\n", max, fmt_time(end - start)); + + if ((start = get_time()) < 0) + goto fail; + + if (keys_file) { + for (i = 0; i < max; i++) { + khint_t k = kh_get(str2int, h, key_locations[i]); + if (k >= kh_end(h)) { + fprintf(stderr, "Couldn't find %s in hash table\n", + key_locations[i]); + goto fail; + } + } + } else { + for (i = 0; i < max; i++) { + khint_t k = kh_get(str2int, h, keys + i * kl); + if (k >= kh_end(h)) { + fprintf(stderr, "Couldn't find %s in hash table\n", + keys + i * kl); + goto fail; + } + } + } + + if ((end = get_time()) < 0) + goto fail; + + printf("Lookup %zu %s\n", max, fmt_time(end - start)); + + write_stats_str2int(h); + + kh_destroy(str2int, h); + free(keys); + free(key_locations); + + return 0; + fail: + kh_destroy(str2int, h); + free(keys); + return -1; +} + +static void show_usage(FILE *out, char *prog) { + fprintf(out, "Usage : %s [-t ] [-i ]\n", prog); + fprintf(out, " Options:\n"); + fprintf(out, " -t Test to run (str2int, benchmark)\n"); + fprintf(out, " -i Optional input file for benchmark\n"); + fprintf(out, " -n Number of items to add\n"); + fprintf(out, " -f Fraction to delete and re-insert\n"); + fprintf(out, " -d Dump hash table stats\n"); + fprintf(out, " -h Show this help\n"); +} + +int main(int argc, char **argv) { + int opt, res = EXIT_SUCCESS; + char *test = NULL; + char *input_file = NULL; + 
size_t max = 1000; + double del_frac = 0.25; + int show_stats = 0; + + while ((opt = getopt(argc, argv, "df:hi:n:t:")) != -1) { + switch (opt) { + case 'd': + show_stats = 1; + break; + case 'f': + del_frac = strtod(optarg, NULL); + if (del_frac < 0 || del_frac > 1.0) { + fprintf(stderr, "Error: -d must be between 0.0 and 1.0\n"); + return EXIT_FAILURE; + } + break; + case 'h': + show_usage(stdout, argv[0]); + return EXIT_SUCCESS; + case 'i': + input_file = optarg; + break; + case 'n': + max = strtoul(optarg, NULL, 0); + if (max == 0 || max > 99999999) { + fprintf(stderr, "Error: -n must be between 1 and %u\n", + MAX_ENTRIES); + return EXIT_FAILURE; + } + break; + case 't': + test = optarg; + break; + default: + show_usage(stderr, argv[0]); + return EXIT_FAILURE; + } + } + + if (!test || strcmp(test, "str2int") == 0) { + if (test_str2int(max, (size_t) (max * del_frac), show_stats) != 0) + res = EXIT_FAILURE; + } + + if (test && strcmp(test, "benchmark") == 0) { + if (benchmark(input_file) != 0) + res = EXIT_FAILURE; + } + + return res; +} diff --git a/test/test_kstring.c b/test/test_kstring.c index ee913a2e3..8b6188b6e 100644 --- a/test/test_kstring.c +++ b/test/test_kstring.c @@ -1,6 +1,6 @@ /* test_kstring.c -- kstring unit tests - Copyright (C) 2018, 2020 Genome Research Ltd. + Copyright (C) 2018, 2020, 2024 Genome Research Ltd. 
Author: Rob Davies @@ -261,6 +261,84 @@ static int test_kputw(int64_t start, int64_t end) { return 0; } +static int test_kputll_from_to(kstring_t *str, long long s, long long e) { + long long i = s; + + for (;;) { + str->l = 0; + memset(str->s, 0xff, str->m); + if (kputll(i, str) < 0 || !str->s) { + perror("kputll"); + return -1; + } + if (str->l >= str->m || str->s[str->l] != '\0') { + fprintf(stderr, "No NUL termination on string from kputll\n"); + return -1; + } + if (i != strtoll(str->s, NULL, 10)) { + fprintf(stderr, + "kputll wrote the wrong value, expected %lld, got %s\n", + i, str->s); + return -1; + } + if (i >= e) break; + i++; + } + return 0; +} + +static int test_kputll(long long start, long long end) { + kstring_t str = { 0, 0, NULL }; + unsigned long long val; + + str.s = malloc(2); + if (!str.s) { + perror("malloc"); + return -1; + } + str.m = 2; + + for (val = 1; val < INT64_MAX-5; val *= 10) { + if (test_kputll_from_to(&str, val >= 5 ? val - 5 : val, val) < 0) { + free(ks_release(&str)); + return -1; + } + } + + for (val = 1; val < INT64_MAX-5; val *= 10) { + long long valm = -val; + if (test_kputll_from_to(&str, valm >= 5 ? 
valm - 5 : valm, valm) < 0) { + free(ks_release(&str)); + return -1; + } + } + + if (test_kputll_from_to(&str, INT64_MAX - 5, INT64_MAX) < 0) { + free(ks_release(&str)); + return -1; + } + + if (test_kputll_from_to(&str, INT64_MIN, INT64_MIN + 5) < 0) { + free(ks_release(&str)); + return -1; + } + + str.m = 1; // Force a resize + int64_t start2 = (int64_t)start; // no larger on our platforms + int64_t end2 = (int64_t)end; + clamp(&start2, INT64_MIN, INT64_MAX); + clamp(&end2, INT64_MIN, INT64_MAX); + + if (test_kputll_from_to(&str, start, end) < 0) { + free(ks_release(&str)); + return -1; + } + + free(ks_release(&str)); + + return 0; +} + // callback used by test_kgetline static char *mock_fgets(char *str, int num, void *p) { int *mock_state = (int*)p; @@ -290,7 +368,7 @@ static char *mock_fgets(char *str, int num, void *p) { return str; } -static int test_kgetline() { +static int test_kgetline(void) { kstring_t s = KS_INITIALIZE; int mock_state = 0; @@ -346,7 +424,7 @@ static ssize_t mock_fgets2(char *str, size_t num, void *p) { return strlen(str); } -static int test_kgetline2() { +static int test_kgetline2(void) { kstring_t s = KS_INITIALIZE; int mock_state = 0; @@ -413,6 +491,9 @@ int main(int argc, char **argv) { if (!test || strcmp(test, "kputw") == 0) if (test_kputw(start, end) != 0) res = EXIT_FAILURE; + if (!test || strcmp(test, "kputll") == 0) + if (test_kputll(start, end) != 0) res = EXIT_FAILURE; + if (!test || strcmp(test, "kgetline") == 0) if (test_kgetline() != 0) res = EXIT_FAILURE; diff --git a/test/test_nibbles.c b/test/test_nibbles.c new file mode 100644 index 000000000..1ef3456ea --- /dev/null +++ b/test/test_nibbles.c @@ -0,0 +1,164 @@ +/* test/test_nibbles.c -- Test SIMD optimised function implementations. + + Copyright (C) 2024 Centre for Population Genomics. 
+ + Author: John Marshall + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#include + +#include +#include +#include +#include + +#ifdef HAVE_CLOCK_GETTIME_CPUTIME +#include +#else +#include +#endif + +#include "../htslib/sam.h" +#include "../sam_internal.h" + +long long gettime(void) { +#ifdef HAVE_CLOCK_GETTIME_CPUTIME + struct timespec ts; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + return ts.tv_sec * 1000000000LL + ts.tv_nsec; +#else + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000000LL + tv.tv_usec; +#endif +} + +char *fmttime(long long elapsed) { + static char buf[64]; + +#ifdef HAVE_CLOCK_GETTIME_CPUTIME + long long sec = elapsed / 1000000000; + long long nsec = elapsed % 1000000000; + sprintf(buf, "%lld.%09lld processor seconds", sec, nsec); +#else + long long sec = elapsed / 1000000; + long long usec = elapsed % 1000000; + sprintf(buf, "%lld.%06lld wall-time seconds", sec, usec); +#endif + + return buf; +} + +void nibble2base_single(uint8_t *nib, char *seq, int len) { + int i; + for (i = 0; i < len; i++) + seq[i] = seq_nt16_str[bam_seqi(nib, i)]; +} + +unsigned char nibble[5000]; +char buf[10000]; + +int validate_nibble2base(void) { + char defbuf[500]; + int i, start, len; + unsigned long long total = 0, failed = 0; + + for (i = 0; i < sizeof nibble; i++) + nibble[i] = i % 256; + + for (start = 0; start < 80; start++) + for (len = 0; len < 400; len++) { + memset(defbuf, '\0', sizeof defbuf); + nibble2base_single(&nibble[start], defbuf, len); + + memset(buf, '\0', sizeof defbuf); + nibble2base(&nibble[start], buf, len); + + total++; + if (strcmp(defbuf, buf) != 0) { + printf("%s expected\n%s FAIL\n\n", defbuf, buf); + failed++; + } + } + + if (failed > 0) { + fprintf(stderr, "Failures: %llu (out of %llu tests)\n", failed, total); + return 1; + } + + return 0; +} + +int time_nibble2base(int length, unsigned long count) { + unsigned long i, total = 0; + + for (i = 0; i < length; i++) + nibble[i] = i % 256; + + printf("Timing %lu nibble2base iterations with read length %d...\n", count, length); + 
long long start = gettime(); + + for (i = 0; i < count; i++) { + nibble2base(nibble, buf, length); + total += buf[i % length]; + } + + long long stop = gettime(); + printf("%s (summing to %lu)\n", fmttime(stop - start), total); + return 0; +} + +int main(int argc, char **argv) { + int readlen = 5000; + unsigned long count = 1000000; + int status = 0; + int c; + + if (argc == 1) + printf( +"Usage: test_nibbles [-c NUM] [-r NUM] [-n|-v]...\n" +"Options:\n" +" -c NUM Specify number of iterations [%lu]\n" +" -n Run nibble2base speed tests\n" +" -r NUM Specify read length [%d]\n" +" -v Run all validation tests\n" +"", count, readlen); + + while ((c = getopt(argc, argv, "c:nr:v")) >= 0) + switch (c) { + case 'c': + count = strtoul(optarg, NULL, 0); + break; + + case 'n': + status += time_nibble2base(readlen, count); + break; + + case 'r': + readlen = atoi(optarg); + break; + + case 'v': + status += validate_nibble2base(); + break; + } + + return status; +} diff --git a/textutils.c b/textutils.c index 0cc2af818..b2c29a893 100644 --- a/textutils.c +++ b/textutils.c @@ -220,7 +220,7 @@ static char token_type(hts_json_token *token) } HTSLIB_EXPORT -hts_json_token * hts_json_alloc_token() { +hts_json_token * hts_json_alloc_token(void) { return calloc(1, sizeof(hts_json_token)); } diff --git a/vcf.c b/vcf.c index 9dec8481b..105c7539d 100644 --- a/vcf.c +++ b/vcf.c @@ -1567,7 +1567,7 @@ int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h) *** BCF site I/O *** ********************/ -bcf1_t *bcf_init() +bcf1_t *bcf_init(void) { bcf1_t *v; v = (bcf1_t*)calloc(1, sizeof(bcf1_t)); @@ -3703,7 +3703,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) overflow = 0; char *tmp = p; - v->pos = hts_str2uint(p, &p, 63, &overflow); + v->pos = hts_str2uint(p, &p, 62, &overflow); if (overflow) { hts_log_error("Position value '%s' is too large", tmp); goto err; @@ -4020,7 +4020,10 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) kputc_('\t', s); // INFO if (v->n_info) { 
- uint8_t *ptr = (uint8_t *)v->shared.s + v->unpack_size[0] + v->unpack_size[1] + v->unpack_size[2]; + uint8_t *ptr = v->shared.s + ? (uint8_t *)v->shared.s + v->unpack_size[0] + + v->unpack_size[1] + v->unpack_size[2] + : NULL; int first = 1; bcf_info_t *info = v->d.info; @@ -4235,6 +4238,8 @@ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) if ( fp->format.compression!=no_compression ) { if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) return -1; + if (fp->idx && !fp->fp.bgzf->mt) + hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf)); ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); } else { ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); @@ -4288,7 +4293,7 @@ static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift, } if ( !max_len ) max_len = (1LL<<31) - 1; // In case contig line is broken. max_len += 256; - s = 1LL << (min_shift + starting_n_lvls * 3); + s = hts_bin_maxpos(min_shift, starting_n_lvls); for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3); if (nids_out) *nids_out = nids; diff --git a/version.sh b/version.sh index 98ae48ec0..f35234c2d 100755 --- a/version.sh +++ b/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.20 +VERSION=1.21 # If we have a git clone, then check against the current tag srcdir=${0%/version.sh}