diff --git a/.appveyor.yml b/.appveyor.yml
deleted file mode 100644
index 8fe288094..000000000
--- a/.appveyor.yml
+++ /dev/null
@@ -1,43 +0,0 @@
-# version format.
-# you can use {branch} name in version format too
-# version: 1.0.{build}-{branch}
-version: 'vers.{build}'
-
-# branches to build
-branches:
-    # Blacklist
-    except:
-      - gh-pages
-
-# Do not build on tags (GitHub and BitBucket)
-skip_tags: true
-
-# Skipping commits affecting specific files (GitHub only). More details here: /docs/appveyor-yml
-#skip_commits:
-#  files:
-#    - docs/*
-#    - '**/*.html'
-
-# Appveyor Windows images are based on Visual studio version
-image: Visual Studio 2019
-
-# We use Mingw/Msys, so use pacman for installs
-install:
-  - set HOME=.
-  - set MSYSTEM=MINGW64
-  - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH%
-  - set MINGWPREFIX=x86_64-w64-mingw32
-  - "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-autotools mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-tools-git\""
-
-build_script:
-  - set HOME=.
-  - set MSYSTEM=MINGW64
-  - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH%
-  - git submodule update --init --recursive
-  - "sh -lc \"autoreconf -i && ./configure --enable-werror CFLAGS='-g -O3' && make -j2\""
-
-#build_script:
-#  - make
-
-test_script:
-  - "sh -lc \"make test-shlib-exports && make test\""
diff --git a/.cirrus.yml b/.cirrus.yml
index fc4405b08..6da99dde0 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -35,21 +35,22 @@ compile_template: &COMPILE
     if test "$USE_CONFIG" = "yes"; then
       MAKE_OPTS=
       autoreconf -i
-      eval ./configure --enable-plugins --enable-werror $CONFIG_OPTS CFLAGS=\"-g -O3 $CFLAGS\" || \
+      eval ./configure --enable-plugins --enable-werror $CONFIG_OPTS CFLAGS=\"$CFLAGS\" || \
         ( cat config.log; false )
     else
       MAKE_OPTS=-e
     fi
+    make cc-version $MAKE_OPTS
     if test "x$DO_MAINTAINER_CHECKS" = "xyes"; then
-      make maintainer-check
+      make maintainer-check $MAKE_OPTS
     fi
     make -j 4 $MAKE_OPTS
 
 test_template: &TEST
   test_script: |
-    make test-shlib-exports
-    make test
-    if test "x$DO_UNTRACKED_FILE_CHECK" = "xyes"; then make check-untracked ; fi
+    make test-shlib-exports $MAKE_OPTS
+    make test $MAKE_OPTS
+    if test "x$DO_UNTRACKED_FILE_CHECK" = "xyes"; then make check-untracked $MAKE_OPTS ; fi
 
 #--------------------------------------------------
 # Task: linux builds.
@@ -71,10 +72,14 @@ gcc_task:
        DO_MAINTAINER_CHECKS: yes
        DO_UNTRACKED_FILE_CHECK: yes
        USE_CONFIG: no
+       CFLAGS: -g -O2 -Wall -Werror -fvisibility=hidden
     - environment:
        USE_CONFIG: yes
-       CFLAGS: -std=c99 -pedantic -Wformat=2
+       # ubsan is incompatible with some -Wformat opts, so the -Wformat checks are done in the clang build.
+       CFLAGS: -g -O3 -fsanitize=address,undefined -DHTS_ALLOW_UNALIGNED=0 -Wno-format-truncation -Wno-format-overflow
+       LDFLAGS: -fsanitize=address,undefined
        USE_LIBDEFLATE: yes
+       UBSAN_OPTIONS: print_stacktrace=1:halt_on_error=1
 
   install_script: |
     apt-get update
@@ -105,11 +110,12 @@ ubuntu_task:
   matrix:
     - environment:
        USE_CONFIG: yes
+       CFLAGS: -g -O3
        DO_UNTRACKED_FILE_CHECK: yes
     - environment:
+       # Cirrus-CI's clang isn't installed with ubsan, so ubsan is run in the gcc build instead.
        USE_CONFIG: yes
-       CFLAGS: -g -Wall -O3 -fsanitize=address
-       LDFLAGS: -fsanitize=address
+       CFLAGS: -g -O3 -std=c99 -pedantic -Wall -Wformat -Wformat=2
        USE_LIBDEFLATE: yes
 
   # NB: we could consider building a docker image with these
@@ -137,7 +143,7 @@ rocky_task:
     LC_ALL: C
     CIRRUS_CLONE_DEPTH: 1
     USE_CONFIG: yes
-    CFLAGS: -std=gnu90
+    CFLAGS: -g -O3 -std=gnu90 -Wall -Wformat -Wformat=2 -Wextra -Wno-sign-compare -Wno-unused-parameter -Wno-missing-field-initializers
 
   # NB: we could consider building a docker image with these
   # preinstalled and specifying that instead, to speed up testing.
@@ -182,11 +188,10 @@ arm_ubuntu_task:
 macosx_task:
   name: macosx + clang
   macos_instance:
-    image: ghcr.io/cirruslabs/macos-ventura-base:latest
+    image: ghcr.io/cirruslabs/macos-runner:sonoma
 
   environment:
     CC: clang
-    CFLAGS: "-Wall -arch arm64 -arch x86_64"
     LDFLAGS: "-arch arm64 -arch x86_64"
     LIBDEFLATE_CFLAGS: "-arch arm64 -arch x86_64"
     LC_ALL: C
@@ -195,9 +200,11 @@ macosx_task:
   matrix:
     - environment:
        USE_CONFIG: no
+       CFLAGS: "-g -O3 -Wall -Werror -arch arm64 -arch x86_64"
     - environment:
        USE_CONFIG: yes
        USE_LIBDEFLATE: yes
+       CFLAGS: "-g -O3 -Wall -arch arm64 -arch x86_64"
 
   package_install_script: |
     HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz git \
diff --git a/.gitattributes b/.gitattributes
index 5d9850bc7..2d5a80e04 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -24,3 +24,14 @@ test/index_dos.sam -text
 # Remove the text attribute from various faidx test files
 test/faidx/faidx*.fa* -text
 test/faidx/fastqs*.fq* -text
+test/fastq/*.fa -text
+test/fastq/*.fq -text
+*.tst -text
+*.out -text
+*.crai    -text
+*.bai     -text
+*.csi     -text
+*.gzi     -text
+*.bcf     -text
+*.sam     -text
+*.sam.gz  -text
diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml
new file mode 100644
index 000000000..bf6f5ae53
--- /dev/null
+++ b/.github/workflows/windows-build.yml
@@ -0,0 +1,41 @@
+name: Windows/MinGW-W64 CI  # Windows CI on GitHub Actions; the AppVeyor config is removed in this change
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: windows-latest
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+      with:
+        submodules: true  # the build needs the htscodecs git submodule
+    - name: Set up MSYS2 MinGW-W64
+      uses: msys2/setup-msys2@v2
+      with:
+        msystem: mingw64
+        update: false  # NOTE(review): presumably skips the initial MSYS2 package upgrade - confirm
+        install: >-
+          mingw-w64-x86_64-autotools
+          mingw-w64-x86_64-bzip2
+          mingw-w64-x86_64-curl
+          mingw-w64-x86_64-libdeflate
+          mingw-w64-x86_64-toolchain
+          mingw-w64-x86_64-tools-git
+          mingw-w64-x86_64-xz
+          mingw-w64-x86_64-zlib
+    - name: Compile htslib
+      shell: msys2 {0}  # run the script below in the MSYS2 shell rather than the runner default
+      run: |
+        export PATH="/mingw64/bin:$PATH:/c/Program Files/Git/bin"
+        export MSYSTEM=MINGW64
+        autoreconf -i
+        ./configure --enable-werror
+        make cc-version
+        make -j6
+    - name: Check Htslib
+      shell: msys2 {0}  # same MSYS2 shell as the compile step
+      run: |
+        export PATH="/mingw64/bin:$PATH:/c/Program Files/Git/bin"
+        export MSYSTEM=MINGW64
+        make test-shlib-exports && make check
+
diff --git a/.gitignore b/.gitignore
index 8b4d74ca1..817b123d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,8 +45,9 @@ shlib-exports-*.txt
 /bgzip
 /htsfile
 /tabix
+/test/*/FAIL*
+/test/bgzf_boundaries/*.tmp.*
 /test/faidx/*.tmp*
-/test/faidx/FAIL*
 /test/fieldarith
 /test/hfile
 /test/hts_endian
@@ -56,7 +57,6 @@ shlib-exports-*.txt
 /test/plugins-dlhts
 /test/sam
 /test/tabix/*.tmp.*
-/test/tabix/FAIL*
 /test/test-bcf-sr
 /test/test-bcf-translate
 /test/test-bcf_set_variant_type
@@ -66,8 +66,10 @@ shlib-exports-*.txt
 /test/test_index
 /test/test_introspection
 /test/test_kfunc
+/test/test_khash
 /test/test_kstring
 /test/test_mod
+/test/test_nibbles
 /test/test-parse-reg
 /test/test_realn
 /test/test-regidx
diff --git a/Makefile b/Makefile
index 99142c865..ef9b5a9a4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 # Makefile for htslib, a C library for high-throughput sequencing data formats.
 #
-#    Copyright (C) 2013-2023 Genome Research Ltd.
+#    Copyright (C) 2013-2024 Genome Research Ltd.
 #
 #    Author: John Marshall <jm18@sanger.ac.uk>
 #
@@ -85,8 +85,10 @@ BUILT_TEST_PROGRAMS = \
 	test/test_expr \
 	test/test_faidx \
 	test/test_kfunc \
+	test/test_khash \
 	test/test_kstring \
 	test/test_mod \
+	test/test_nibbles \
 	test/test_realn \
 	test/test-regidx \
 	test/test_str2int \
@@ -111,8 +113,14 @@ BUILT_THRASH_PROGRAMS = \
 	test/thrash_threads6 \
 	test/thrash_threads7
 
-all: lib-static lib-shared $(BUILT_PROGRAMS) plugins $(BUILT_TEST_PROGRAMS) \
-     htslib_static.mk htslib-uninstalled.pc
+all: lib-static lib-shared $(BUILT_PROGRAMS) plugins \
+	$(BUILT_TEST_PROGRAMS) htslib_static.mk htslib-uninstalled.pc
+
+# Report compiler and version
+cc-version:
+	-@$(CC) --version  2>/dev/null || true
+	-@$(CC) --qversion 2>/dev/null || true
+	-@$(CC) -V         2>/dev/null || true
 
 ALL_CPPFLAGS = -I. $(CPPFLAGS)
 
@@ -150,8 +158,8 @@ LIBHTS_SOVERSION = 3
 # is not strictly necessary and should be removed the next time
 # LIBHTS_SOVERSION is bumped (see #1144 and
 # https://developer.apple.com/library/archive/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html#//apple_ref/doc/uid/TP40002013-SW23)
-MACH_O_COMPATIBILITY_VERSION = 3.1.20
-MACH_O_CURRENT_VERSION = 3.1.20
+MACH_O_COMPATIBILITY_VERSION = 3.1.21
+MACH_O_CURRENT_VERSION = 3.1.21
 
 # Force version.h to be remade if $(PACKAGE_VERSION) has changed.
 version.h: $(if $(wildcard version.h),$(if $(findstring "$(PACKAGE_VERSION)",$(shell cat version.h)),,force))
@@ -209,6 +217,7 @@ LIBHTS_OBJS = \
 	region.o \
 	sam.o \
 	sam_mods.o \
+	simd.o \
 	synced_bcf_reader.o \
 	vcf_sweep.o \
 	tbx.o \
@@ -278,6 +287,10 @@ config.h:
 	echo '#endif' >> $@
 	echo '#define HAVE_DRAND48 1' >> $@
 	echo '#define HAVE_LIBCURL 1' >> $@
+	if [ "x$(HTS_HAVE_CPUID)" != "x" ]; then \
+	    echo '#define HAVE_DECL___CPUID_COUNT 1' >> $@ ; \
+	    echo '#define HAVE_DECL___GET_CPUID_MAX 1' >> $@ ; \
+	fi
 	if [ "x$(HTS_BUILD_SSE4)" != "x" ]; then \
 	    echo '#define HAVE_POPCNT 1' >> $@ ; \
 	    echo '#define HAVE_SSE4_1 1' >> $@ ; \
@@ -292,6 +305,13 @@ config.h:
 	if [ "x$(HTS_BUILD_AVX512)" != "x" ] ; then \
 	    echo '#define HAVE_AVX512 1' >> $@ ; \
 	fi
+	echo '#if defined __x86_64__ || defined __arm__ || defined __aarch64__' >> $@
+	echo '#define HAVE_ATTRIBUTE_CONSTRUCTOR 1' >> $@
+	echo '#endif' >> $@
+	echo '#if (defined(__x86_64__) || defined(_M_X64))' >> $@
+	echo '#define HAVE_ATTRIBUTE_TARGET 1' >> $@
+	echo '#define HAVE_BUILTIN_CPU_SUPPORT_SSSE3 1' >> $@
+	echo '#endif' >> $@
 
 # And similarly for htslib.pc.tmp ("pkg-config template").  No dependency
 # on htslib.pc.in listed, as if that file is newer the usual way to regenerate
@@ -451,6 +471,7 @@ hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c
 vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h)
 sam.o sam.pico: sam.c config.h $(fuzz_settings_h) $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h)
 sam_mods.o sam_mods.pico: sam_mods.c config.h $(htslib_sam_h) $(textutils_internal_h)
+simd.o simd.pico: simd.c config.h $(htslib_sam_h) $(sam_internal_h)
 tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h)
 faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h) $(hts_internal_h)
 bcf_sr_sort.o bcf_sr_sort.pico: bcf_sr_sort.c config.h $(bcf_sr_sort_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h)
@@ -512,10 +533,10 @@ htsfile: htsfile.o libhts.a
 tabix: tabix.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ tabix.o libhts.a $(LIBS) -lpthread
 
-annot-tsv.o: annot-tsv.c config.h $(htslib_hts_h) $(htslib_hts_defs_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_regidx_h)
+annot-tsv.o: annot-tsv.c config.h $(htslib_hts_h) $(htslib_hts_defs_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_regidx_h) $(textutils_internal_h)
 bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_hfile_h)
 htsfile.o: htsfile.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h)
-tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h)
+tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h) $(htslib_thread_pool_h)
 
 # Runes to check that the htscodecs submodule is present
 ifdef HTSCODECS_SOURCES
@@ -552,7 +573,7 @@ htscodecs/htscodecs/version.h: force
 	  vers=`cd $(srcdir)/htscodecs && git describe --always --dirty --match 'v[0-9]\.[0-9]*'` && \
 	  case "$$vers" in \
 	    v*) vers=$${vers#v} ;; \
-	    *) iv=`awk '/^AC_INIT/ { match($$0, /^AC_INIT\(htscodecs, *([0-9](\.[0-9])*)\)/, m); print substr($$0, m[1, "start"], m[1, "length"]) }' $(srcdir)/htscodecs/configure.ac` ; vers="$$iv$${vers:+-g$$vers}" ;; \
+	    *) iv=`awk '/^AC_INIT\(htscodecs,/ { match($$0, /[0-9]+(\.[0-9]+)*/); print substr($$0, RSTART, RLENGTH) }' $(srcdir)/htscodecs/configure.ac` ; vers="$$iv$${vers:+-g$$vers}" ;; \
 	  esac ; \
 	  if ! grep -s -q '"'"$$vers"'"' $@ ; then \
 	    echo 'Updating $@ : #define HTSCODECS_VERSION_TEXT "'"$$vers"'"' ; \
@@ -591,7 +612,9 @@ check test: all $(HTSCODECS_TEST_TARGETS)
 	test/hts_endian
 	test/test_expr
 	test/test_kfunc
+	test/test_khash
 	test/test_kstring
+	test/test_nibbles -v
 	test/test_str2int
 	test/test_time_funcs
 	test/fieldarith test/fieldarith.sam
@@ -643,23 +666,29 @@ test/sam: test/sam.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/sam.o libhts.a $(LIBS) -lpthread
 
 test/test_bgzf: test/test_bgzf.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_bgzf.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_bgzf.o libhts.a $(LIBS) -lpthread
 
 test/test_expr: test/test_expr.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_expr.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_expr.o libhts.a $(LIBS) -lpthread
 
 test/test_faidx: test/test_faidx.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_faidx.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_faidx.o libhts.a $(LIBS) -lpthread
 
 test/test_kfunc: test/test_kfunc.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a $(LIBS) -lpthread
+
+test/test_khash: test/test_khash.o libhts.a
+	$(CC) $(LDFLAGS) -o $@ test/test_khash.o libhts.a $(LIBS) -lpthread
 
 test/test_kstring: test/test_kstring.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_kstring.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_kstring.o libhts.a $(LIBS) -lpthread
 
 test/test_mod: test/test_mod.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_mod.o libhts.a $(LIBS) -lpthread
 
+test/test_nibbles: test/test_nibbles.o libhts.a
+	$(CC) $(LDFLAGS) -o $@ test/test_nibbles.o libhts.a $(LIBS) -lpthread
+
 test/test_realn: test/test_realn.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_realn.o libhts.a $(LIBS) -lpthread
 
@@ -688,10 +717,10 @@ test/test-vcf-sweep: test/test-vcf-sweep.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test-vcf-sweep.o libhts.a $(LIBS) -lpthread
 
 test/test-bcf-sr: test/test-bcf-sr.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test-bcf-sr.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test-bcf-sr.o libhts.a $(LIBS) -lpthread
 
 test/test-bcf-translate: test/test-bcf-translate.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test-bcf-translate.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test-bcf-translate.o libhts.a $(LIBS) -lpthread
 
 test/test_introspection: test/test_introspection.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_introspection.o libhts.a $(LIBS) -lpthread
@@ -760,8 +789,10 @@ test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_fa
 test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(hfile_internal_h)
 test/test_expr.o: test/test_expr.c config.h $(htslib_hts_expr_h)
 test/test_kfunc.o: test/test_kfunc.c config.h $(htslib_kfunc_h)
+test/test_khash.o: test/test_khash.c config.h $(htslib_khash_h) $(htslib_kroundup_h)
 test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h)
 test/test_mod.o: test/test_mod.c config.h $(htslib_sam_h)
+test/test_nibbles.o: test/test_nibbles.c config.h $(htslib_sam_h) $(sam_internal_h)
 test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_sam_h)
 test/test_realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h)
 test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(textutils_internal_h)
@@ -784,25 +815,25 @@ test/usepublic.o: test/usepublic.cpp config.h $(htslib_bgzf_h) $(htslib_cram_h)
 
 
 test/thrash_threads1: test/thrash_threads1.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads1.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads1.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads2: test/thrash_threads2.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads2.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads2.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads3: test/thrash_threads3.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads3.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads3.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads4: test/thrash_threads4.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads4.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads4.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads5: test/thrash_threads5.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads5.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads5.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads6: test/thrash_threads6.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads6.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads6.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads7: test/thrash_threads7.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads7.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads7.o libhts.a $(LIBS) -lpthread
 
 test_thrash: $(BUILT_THRASH_PROGRAMS)
 
@@ -905,8 +936,9 @@ htslib-uninstalled.pc: htslib.pc.tmp
 
 
 testclean:
-	-rm -f test/*.tmp test/*.tmp.* test/faidx/*.tmp* test/faidx/FAIL* \
-               test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* \
+	-rm -f test/*.tmp test/*.tmp.* test/faidx/*.tmp* \
+               test/longrefs/*.tmp.* test/tabix/*.tmp.* \
+               test/bgzf_boundaries/*.tmp.* test/*/FAIL* \
                header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt
 	-rm -rf htscodecs/tests/test.out
 
@@ -970,3 +1002,4 @@ force:
 .PHONY: clean-dylib install-dylib
 .PHONY: test_htscodecs_rans4x8 test_htscodecs_rans4x16 test_htscodecs_arith
 .PHONY: test_htscodecs_tok3 test_htscodecs_fqzcomp test_htscodecs_varint
+.PHONY: cc-version
diff --git a/NEWS b/NEWS
index 83dcaa5b9..8825c30d1 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,135 @@
+Noteworthy changes in release 1.21 (12th September 2024)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The primary user-visible changes in this release are updates to the
+annot-tsv tool and some speed improvements.  Full details of other
+changes and bugs fixed are below.
+
+Notice: this is the last SAMtools / HTSlib release where CRAM 3.0 will be
+the default CRAM version.  From the next release we will change to CRAM 3.1
+unless the version is explicitly specified, for example using
+"samtools view -O cram,version=3.0".
+
+
+Updates
+-------
+
+* Extend annot-tsv with several new command line options.
+    --delim permits use of other delimiters.
+    --headers for selection of other header formats.
+    --no-header-idx to suppress column index numbers in header.
+  Also removed -h as it is now short for --headers.  Note --help
+  still works. (PR #1779)
+
+* Allow annot-tsv -a to rename annotations. (PR #1709)
+
+* Extend annot-tsv --overlap to be able to specify the overlap
+  fraction separately for source and target. (PR #1811)
+
+* Added new APIs to facilitate low-level CRAM container manipulations,
+  used by the new "samtools cat" region filtering code. Functions are:
+    cram_container_get_coords()
+    cram_filter_container()
+    cram_index_extents()
+    cram_container_num2offset()
+    cram_container_offset2num()
+    cram_num_containers()
+    cram_num_containers_between()
+  Also improved cram_index_query() to cope with HTS_IDX_NOCOOR regions.
+  (PR #1771)
+
+* Bgzip now retains file modification and access times when
+  compressing and decompressing. (PR #1727, fixes #1718.  Requested by
+  Gert Hulselmans.)
+
+* Use FNV1a for string hashing in khash.  The old algorithm was
+  particularly weak with base-64 style strings and led to a large
+  number of collisions.  (PR #1806.  Fixes samtools/samtools#2066,
+  reported by Hans-Joachim Ruscheweyh)
+
+* Improve the speed of the nibble2base() function on Intel (PR
+  #1667, PR #1764, PR #1786, PR #1802, thanks to Ruben Vorderman) and
+  ARM (PR #1795, thanks to John Marshall).
+
+* bgzf_getline() will now warn if it encounters UTF-16 data.
+  (PR #1487, thanks to John Marshall)
+
+* Speed up bgzf_read().  While this does not reduce CPU significantly,
+  it does increase the maximum parallelism available permitting 10-15%
+  faster decoding. (PR #1772, PR #1800, Issue #1798)
+
+* Speed up faidx by use of better isgraph methods (PR #1797) and
+  whole-line reading (PR #1799, thanks to John Marshall).
+
+* Speed up kputll() function, speeding up BAM -> SAM conversion by
+  about 5% and also samtools depth.  (PR #1805)
+
+* Added more example code, covering fasta/fastq indexing, tabix
+  indexing and use of the thread pool. (PR #1666)
+
+Build Changes
+-------------
+
+* Code warning fixes for pedantic compilers (PR #1777) and avoid
+  some undefined behaviour (PR #1810, PR #1816, PR #1828).
+
+* Windows based CI has been migrated from AppVeyor to GitHub Actions.
+  (PR #1796, PR #1803, PR #1808)
+
+* Miscellaneous minor build infrastructure and code fixes.
+  (PR #1807, PR #1829, both thanks to John Marshall)
+
+* Updated htscodecs submodule to version 1.6.1 (PR #1828)
+
+* Fixed an awk script in the Makefile that only worked with gawk. (PR #1831)
+
+Bug fixes
+---------
+
+* Fix small OSS-Fuzz reported issues with CRAM encoding and long
+  CIGARs and/or illegal positions. (PR #1775, PR #1801, PR #1817)
+
+* Fix issues with on-the-fly indexing of VCF/BCF (bcftools --write-index)
+  when not using multiple threads. (PR #1837. Fixes samtools/bcftools#2267,
+  reported by Giulio Genovese)
+
+* Stricter limits on POS / MPOS / TLEN in sam_parse1().  This fixes
+  a signed overflow reported by OSS-Fuzz and should help prevent other
+  as-yet undetected bugs. (PR #1812)
+
+* Check that the underlying file open worked for preload: URLs.  Fixes
+  a NULL pointer dereference reported by OSS-Fuzz. (PR #1821)
+
+* Fix an infinite loop in hts_itr_query() when given extremely large
+  positions which cause integer overflow.  Also adds hts_bin_maxpos()
+  and hts_idx_maxpos() functions.
+  (PR #1774, thanks to John Marshall and reported by Jesus Alberto
+  Munoz Mesa)
+
+* Fix an out of bounds read in hts_itr_multi_next() when switching
+  chromosomes.  This bug is present in releases 1.11 to 1.20.
+  (PR #1788. Fixes samtools/samtools#2063, reported by acorvelo)
+
+* Work around parsing problems with colons in CHROM names.
+  Fixes samtools/bcftools#2139.  (PR #1781, John Marshall / James Bonfield)
+
+* Correct the CPU detection for Mac OS X 10.7.  cpuid is used by
+  htscodecs (see samtools/htscodecs#116), and the corresponding
+  changes in htslib are PR #1785.  Reported by Ryan Carsten Schmidt.
+
+* Make BAM zero-length intervals work the same as CRAM: they are
+  permitted and return overlapping records. (PR #1787.  Fixes
+  samtools/samtools#2060, reported by acorvelo)
+
+* Replace assert() with abort() in BCF synced reader.  This is not an
+  ideal solution, but it gives consistent behaviour when compiling
+  with or without NDEBUG.  (PR #1791, thanks to Martin Pollard)
+
+* Fixed failure to change the write block size on compressed SAM or VCF
+  files due to an internal type confusion.  (PR #1826)
+
+* Fixed an out-of-bounds read in cram_codec_iter_next() (PR #1832)
+
 Noteworthy changes in release 1.20 (15th April 2024)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/README.md b/README.md
index 47afdba2a..2906855ba 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
-[![Build Status](https://api.cirrus-ci.com/github/samtools/htslib.svg?branch=develop)](https://api.cirrus-ci.com/github/samtools/htslib)
-[![Build status](https://ci.appveyor.com/api/projects/status/v46hkwyfjp3l8nd3/branch/develop?svg=true)](https://ci.appveyor.com/project/samtools/htslib/branch/develop)
+[![Build Status](https://api.cirrus-ci.com/github/samtools/htslib.svg?branch=develop)](https://cirrus-ci.com/github/samtools/htslib)
+[![Build status](https://github.com/samtools/htslib/actions/workflows/windows-build.yml/badge.svg)](https://github.com/samtools/htslib/actions/workflows/windows-build.yml?query=branch%3Adevelop)
 [![Github All Releases](https://img.shields.io/github/downloads/samtools/htslib/total.svg)](https://github.com/samtools/htslib)
 
 HTSlib is an implementation of a unified C library for accessing common file
diff --git a/annot-tsv.1 b/annot-tsv.1
index df3b06e91..3a6034b11 100644
--- a/annot-tsv.1
+++ b/annot-tsv.1
@@ -1,7 +1,7 @@
 '\" t
-.TH annot-tsv 1 "15 April 2024" "htslib-1.20" "Bioinformatics tools"
+.TH annot-tsv 1 "12 September 2024" "htslib-1.21" "Bioinformatics tools"
 .\"
-.\" Copyright (C) 2015, 2017-2018, 2023 Genome Research Ltd.
+.\" Copyright (C) 2015, 2017-2018, 2023-2024 Genome Research Ltd.
 .\"
 .\" Author: Petr Danecek
 .\"
@@ -108,6 +108,11 @@ Target file to be extend with annotations from
 Add the same annotations multiple times if multiple overlaps are found
 .RE
 .PP
+.B \-\-help
+.RS 4
+This help message
+.RE
+.PP
 .BR \-\-max\-annots " INT"
 .RS 4
 Add at most INT annotations per column to save time when many overlaps are found with a single region
@@ -138,18 +143,42 @@ number of source base pairs in the overlap
 .RE
 .RE
 .PP
+.BR \-d ", " \-\-delim " SRC:TGT"
+.RS 4
+Column delimiter in the source and the target file. For example, if both files are comma-delimited, run with
+"--delim ,:," or simply "--delim ,". If the source file is comma-delimited and the target file is tab-delimited,
+run with "-d $',:\\t'".
+.RE
+.PP
+.BR \-h ", " \-\-headers " SRC:TGT"
+.RS 4
+Line number of the header row with column names. By default the first line is interpreted as a header if it starts with the comment
+character ("#"), otherwise numeric indices are expected. However, if the first line does not start with "#" but still
+contains the column names, use "--headers 1:1". To ignore an existing header (skip comment lines) and use numeric indices,
+use "--headers 0:0" which is equivalent to "--ignore-headers". When a negative value is given, it is interpreted as the number of
+lines from the end of the comment block. Specifically, "--headers -1" takes the column names from the last line of
+the comment block (e.g., the "#CHROM" line in the VCF format).
+.RE
+.PP
 .BR \-H ", " \-\-ignore\-headers
 .RS 4
 Ignore the headers completely and use numeric indexes even when a header exists
 .RE
 .PP
-.BR \-O ", " \-\-overlap " FLOAT"
+.BR \-I ", " \-\-no\-hdr\-idx
+.RS 4
+Suppress index numbers in the printed header. If given twice, drop the entire header.
+.RE
+.PP
+.BR \-O ", " \-\-overlap " FLOAT[,FLOAT]"
 .RS 4
-Minimum overlap as a fraction of region length in at least one of the overlapping regions. If also
+Minimum overlap as a fraction of region length in SRC and TGT, respectively (with two numbers), or in
+at least one of the overlapping regions (with a single number). If also
 .BR \-r ", " \-\-reciprocal
 is given, require at least
 .I FLOAT
-overlap with respect to both regions
+overlap with respect to both regions. Two identical numbers are equivalent to running with
+.BR \-r ", " \-\-reciprocal .
 .RE
 .PP
 .BR \-r ", " \-\-reciprocal
diff --git a/annot-tsv.c b/annot-tsv.c
index 4661e6e0f..494c43744 100644
--- a/annot-tsv.c
+++ b/annot-tsv.c
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2018-2023 Genome Research Ltd.
+    Copyright (C) 2018-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -44,6 +44,7 @@
 #include "htslib/kseq.h"
 #include "htslib/bgzf.h"
 #include "htslib/regidx.h"
+#include "textutils_internal.h"
 
 #define ANN_NBP     1
 #define ANN_FRAC    2
@@ -71,6 +72,7 @@ typedef struct
     cols_t *core, *match, *transfer, *annots;
     int *core_idx, *match_idx, *transfer_idx, *annots_idx;
     int *nannots_added; // for --max-annots: the number of annotations added
+    char delim;
     int grow_n;
     kstring_t line;     // one buffered line, a byproduct of reading the header
     htsFile *fp;
@@ -100,11 +102,11 @@ typedef struct
 {
     nbp_t *nbp;
     dat_t dst, src;
-    char *core_str, *match_str, *transfer_str, *annots_str;
+    char *core_str, *match_str, *transfer_str, *annots_str, *headers_str, *delim_str;
     char *temp_dir, *out_fname;
     BGZF *out_fp;
-    int allow_dups, reciprocal, ignore_headers, max_annots, mode;
-    double overlap;
+    int allow_dups, max_annots, mode, no_write_hdr, overlap_either;
+    double overlap_src, overlap_dst;
     regidx_t *idx;
     regitr_t *itr;
     kstring_t tmp_kstr;
@@ -282,7 +284,7 @@ int parse_tab_with_payload(const char *line, char **chr_beg, char **chr_end, hts
 
     dat_t *dat = (dat_t*) usr;
 
-    cols_t *cols = cols_split(line, NULL, '\t');
+    cols_t *cols = cols_split(line, NULL, dat->delim);
     *((cols_t**)payload) = cols;
 
     if ( cols->n < dat->core_idx[0] ) error("Expected at least %d columns, found %d: %s\n",dat->core_idx[0]+1,cols->n,line);
@@ -315,86 +317,136 @@ void free_payload(void *payload)
     cols_destroy(cols);
 }
 
-// Parse header if present (first line has a leading #) or create a dummy header with
-// numeric column names. If dummy is set, read first data line (without a leading #)
-// and create a dummy header.
-void parse_header(dat_t *dat, char *fname, int dummy)
+// Parse header if present, the parameter nth_row indicates the header row line number:
+//      0   .. ignore headers, create numeric fields names, 1-based indices
+//      N>0 .. N-th line, all previous lines are discarded
+//      N<0 .. N-th line from the end of the comment block (comment lines are prefixed with #),
+//             all preceding lines are discarded.
+// When autodetect is set, the argument nth_row is ignored.
+// Note this makes no attempt to preserve comment lines on output
+void parse_header(dat_t *dat, char *fname, int nth_row, int autodetect)
 {
     dat->fp = hts_open(fname,"r");
     if ( !dat->fp ) error("Failed to open: %s\n", fname);
 
+    // buffer comment lines when N<0
+    int nbuf = 0;
+    char **buf = NULL;
+    if ( nth_row < 0 )
+    {
+        buf = calloc(-nth_row,sizeof(*buf));
+        if ( !buf ) error("Out of memory, failed to allocate %zu bytes\n",(-nth_row)*sizeof(*buf));
+    }
+
+    int irow = 0;
     cols_t *cols = NULL;
     while ( hts_getline(dat->fp, KS_SEP_LINE, &dat->line) > 0 )
     {
-        if ( dat->line.s[0]=='#' )
+        if ( autodetect )
+        {
+            // if the first line is comment line, use it as a header. Otherwise go
+            // with numeric indices
+            nth_row = dat->line.s[0]=='#' ? 1 : 0;
+            break;
+        }
+        if ( nth_row==0 )
+        {
+            // N=0 .. comment lines to be ignored, read until we get to the first data line
+            if ( dat->line.s[0]=='#' ) continue;
+            break;
+        }
+        if ( nth_row>0 )
         {
-            // this is a header or comment line
-            if ( dummy ) continue;
-            cols = cols_split(dat->line.s, NULL, '\t');
+            // N>0 .. regardless of this being a comment or data line, read until Nth line
+            if ( ++irow < nth_row ) continue;
             break;
         }
+        // N<0 .. keep abs(N) comment lines in a sliding buffer
+        if ( dat->line.s[0]!='#' ) break;   // data line
+        if ( nbuf == -nth_row )
+        {
+            // one more comment line and the buffer is full. We could use round buffer
+            // for efficiency, but the assumption is abs(nth_row) is small
+            free(buf[0]);
+            memmove(buf, &buf[1], (nbuf-1)*sizeof(*buf));
+            nbuf--;
+        }
+        buf[nbuf++] = strdup(dat->line.s);
+    }
+
+    int keep_line = 0;
+    if ( nth_row < 0 )
+    {
+        if ( nbuf!=-nth_row )
+            error("Found %d header lines in %s, cannot fetch N=%d from the end\n",nbuf,fname,-nth_row);
+        cols = cols_split(buf[0], NULL, dat->delim);
+        keep_line = 1;
+    }
+    else if ( dat->line.l )
+        cols = cols_split(dat->line.s, NULL, dat->delim);
 
-        // this a data line, we must be in a dummy mode
-        cols = cols_split(dat->line.s, NULL, '\t');
-        assert(cols && cols->n);
-        assert(cols->off[0][0] != '#');
+    if ( !dat->line.l ) error("Failed to read: %s\n", fname);
+    assert(cols && cols->n);
 
+    if ( nth_row == 0 ) // create numeric indices
+    {
         // create a dummy header with numeric field names
         kstring_t str = {0,0,0};
         int i, n = cols->n;
         for (i=0; i<n; i++)
         {
-            if ( i>0 ) kputc('\t', &str);
+            if ( i>0 ) kputc(dat->delim, &str);
             kputw(i+1, &str);
         }
         cols_destroy(cols);
-        cols = cols_split(str.s, NULL, '\t');
+        cols = cols_split(str.s, NULL, dat->delim);
         free(str.s);
         dat->hdr.dummy = 1;
-
-        break;
+        keep_line = 1;
     }
-    if ( !dat->line.l ) error("Failed to read: %s\n", fname);
-    assert(cols && cols->n);
 
     dat->hdr.name2idx = khash_str2int_init();
     int i;
     for (i=0; i<cols->n; i++)
     {
         char *ss = cols->off[i];
-        while ( *ss && (*ss=='#' || isspace(*ss)) ) ss++;
+        while ( *ss && (*ss=='#' || isspace_c(*ss)) ) ss++;
         if ( !*ss ) error("Could not parse the header field \"%s\": %s\n", cols->off[i],dat->line.s);
         if ( *ss=='[' )
         {
             char *se = ss+1;
-            while ( *se && isdigit(*se) ) se++;
+            while ( *se && isdigit_c(*se) ) se++;
             if ( *se==']' ) ss = se + 1;
         }
-        while ( *ss && (*ss=='#' || isspace(*ss)) ) ss++;
+        while ( *ss && (*ss=='#' || isspace_c(*ss)) ) ss++;
         if ( !*ss ) error("Could not parse the header field \"%s\": %s\n", cols->off[i],dat->line.s);
         cols->off[i] = ss;
         khash_str2int_set(dat->hdr.name2idx, cols->off[i], i);
     }
     dat->hdr.cols = cols;
-    if ( !dat->hdr.dummy ) dat->line.l = 0;
+    if ( !keep_line ) dat->line.l = 0;
+
+    for (i=0; i<nbuf; i++) free(buf[i]);
+    free(buf);
 }
 void write_header(args_t *args, dat_t *dat)
 {
     if ( dat->hdr.dummy ) return;
+    if ( args->no_write_hdr>1 ) return;
     int i;
     kstring_t str = {0,0,0};
     kputc('#', &str);
     for (i=0; i<dat->hdr.cols->n; i++)
     {
-        if ( i>0 ) kputc('\t', &str);
-        ksprintf(&str,"[%d]", i+1);
+        if ( i>0 ) kputc(dat->delim, &str);
+        if ( !args->no_write_hdr ) ksprintf(&str,"[%d]", i+1);
         kputs(dat->hdr.cols->off[i], &str);
     }
     if ( dat->hdr.annots )
     {
         for (i=0; i<dat->hdr.annots->n; i++)
         {
-            if ( str.l > 1 ) kputc('\t', &str);
+            if ( str.l > 1 ) kputc(dat->delim, &str);
             kputs(dat->hdr.annots->off[i], &str);
         }
     }
@@ -434,8 +486,30 @@ void sanity_check_columns(char *fname, hdr_t *hdr, cols_t *cols, int **col2idx,
 }
 void init_data(args_t *args)
 {
-    parse_header(&args->dst, args->dst.fname, args->ignore_headers);
-    parse_header(&args->src, args->src.fname, args->ignore_headers);
+    if ( !args->delim_str )
+        args->dst.delim = args->src.delim = '\t';
+    else if ( strlen(args->delim_str)==1 )
+        args->dst.delim = args->src.delim = *args->delim_str;
+    else if ( strlen(args->delim_str)==3 && args->delim_str[1]==':' )
+        args->src.delim = args->delim_str[0], args->dst.delim = args->delim_str[2];
+    else
+        error("Could not parse the option --delim %s\n",args->delim_str);
+
+    // --headers, determine header row index
+    int isrc = 0, idst = 0, autodetect = 1;
+    if ( args->headers_str )
+    {
+        cols_t *tmp = cols_split(args->headers_str, NULL, ':');
+        char *rmme;
+        isrc = strtol(tmp->off[0],&rmme,10);
+        if ( *rmme || tmp->off[0]==rmme ) error("Could not parse the option --headers %s\n",args->headers_str);
+        idst = strtol(tmp->n==2 ? tmp->off[1] : tmp->off[0],&rmme,10);
+        if ( *rmme || (tmp->n==2 ? tmp->off[1] : tmp->off[0])==rmme ) error("Could not parse the option --headers %s\n",args->headers_str);
+        cols_destroy(tmp);
+        autodetect = 0;
+    }
+    parse_header(&args->dst, args->dst.fname, idst, autodetect);
+    parse_header(&args->src, args->src.fname, isrc, autodetect);
 
     // -c, core columns
     if ( !args->core_str ) args->core_str = "chr,beg,end:chr,beg,end";
@@ -608,17 +682,17 @@ static void write_annots(args_t *args)
     {
         if ( args->dst.annots_idx[i]==ANN_NBP )
         {
-            kputc('\t',&args->tmp_kstr);
+            kputc(args->dst.delim,&args->tmp_kstr);
             kputw(len,&args->tmp_kstr);
         }
         else if ( args->dst.annots_idx[i]==ANN_FRAC )
         {
-            kputc('\t',&args->tmp_kstr);
+            kputc(args->dst.delim,&args->tmp_kstr);
             kputd((double)len/(args->nbp->end - args->nbp->beg + 1),&args->tmp_kstr);
         }
         else if ( args->dst.annots_idx[i]==ANN_CNT )
         {
-            kputc('\t',&args->tmp_kstr);
+            kputc(args->dst.delim,&args->tmp_kstr);
             kputw(args->nbp->n/2,&args->tmp_kstr);
         }
     }
@@ -662,18 +736,20 @@ void process_line(args_t *args, char *line, size_t size)
     int has_match = 0, annot_len = 0;
     while ( regitr_overlap(args->itr) )
     {
-        if ( args->overlap )
+        if ( args->overlap_src || args->overlap_dst )
         {
-            double len1 = end - beg + 1;
-            double len2 = args->itr->end - args->itr->beg + 1;
+            double len_dst = end - beg + 1;
+            double len_src = args->itr->end - args->itr->beg + 1;
             double isec = (args->itr->end < end ? args->itr->end : end) - (args->itr->beg > beg ? args->itr->beg : beg) + 1;
-            if ( args->reciprocal )
+            int pass_dst = isec/len_dst < args->overlap_dst ? 0 : 1;
+            int pass_src = isec/len_src < args->overlap_src ? 0 : 1;
+            if ( args->overlap_either )
             {
-                if ( isec/len1 < args->overlap || isec/len2 < args->overlap ) continue;
+                if ( !pass_dst && !pass_src ) continue;
             }
             else
             {
-                if ( isec/len1 < args->overlap && isec/len2 < args->overlap ) continue;
+                if ( !pass_dst || !pass_src ) continue;
             }
         }
         cols_t *src_cols = regitr_payload(args->itr,cols_t*);
@@ -758,7 +834,7 @@ void process_line(args_t *args, char *line, size_t size)
     write_string(args, dst_cols->off[0], 0);
     for (i=1; i<dst_cols->n; i++)
     {
-        write_string(args, "\t", 1);
+        write_string(args, &args->dst.delim, 1);
         write_string(args, dst_cols->off[i], 0);
     }
     write_annots(args);
@@ -796,6 +872,7 @@ static const char *usage_text(void)
         "\n"
         "Other options:\n"
         "       --allow-dups        Add annotations multiple times\n"
+        "       --help              This help message\n"
         "       --max-annots INT    Adding at most INT annotations per column to save\n"
         "                           time in big regions\n"
         "       --version           Print version string and exit\n"
@@ -804,9 +881,15 @@ static const char *usage_text(void)
         "                             frac .. fraction of the target region with an\n"
         "                                       overlap\n"
         "                             nbp  .. number of source base pairs in the overlap\n"
-        "   -H, --ignore-headers    Use numeric indexes, ignore the headers completely\n"
-        "   -O, --overlap FLOAT     Minimum required overlap (non-reciprocal, unless -r\n"
-        "                           is given)\n"
+        "   -d, --delim SRC:TGT     Column delimiter in SRC and TGT file\n"
+        "   -h, --headers SRC:TGT   Header row line number, 0:0 is equivalent to -H, negative\n"
+        "                             value counts from the end of comment line block [1:1]\n"
+        "   -H, --ignore-headers    Use numeric indices, ignore the headers completely\n"
+        "   -I, --no-header-idx     Suppress index numbers in the printed header. If given\n"
+        "                           twice, drop the entire header\n"
+        "   -O, --overlap FLOAT[,FLOAT]     Minimum required overlap with respect to SRC,TGT.\n"
+        "                           If single value, the bigger overlap is considered.\n"
+        "                           Identical values are equivalent to running with -r.\n"
         "   -r, --reciprocal        Apply the -O requirement to both overlapping\n"
         "                           intervals\n"
         "   -x, --drop-overlaps     Drop overlapping regions (precludes -f)\n"
@@ -847,18 +930,22 @@ int main(int argc, char **argv)
         {"target-file",required_argument,NULL,'t'},
         {"allow-dups",no_argument,NULL,0},
         {"max-annots",required_argument,NULL,2},
+        {"no-header-idx",no_argument,NULL,'I'},
         {"version",no_argument,NULL,1},
         {"annotate",required_argument,NULL,'a'},
+        {"headers",required_argument,NULL,'h'},
         {"ignore-headers",no_argument,NULL,'H'},
         {"overlap",required_argument,NULL,'O'},
         {"reciprocal",no_argument,NULL,'r'},
         {"drop-overlaps",no_argument,NULL,'x'},
-        {"help",no_argument,NULL,'h'},
+        {"delim",required_argument,NULL,'d'},
+        {"help",no_argument,NULL,4},
         {NULL,0,NULL,0}
     };
     char *tmp = NULL;
     int c;
-    while ((c = getopt_long(argc, argv, "hc:f:m:o:s:t:a:HO:rx",loptions,NULL)) >= 0)
+    int reciprocal = 0;
+    while ((c = getopt_long(argc, argv, "c:f:m:o:s:t:a:HO:rxh:Id:",loptions,NULL)) >= 0)
     {
         switch (c)
         {
@@ -873,22 +960,33 @@ int main(int argc, char **argv)
                 args->max_annots = strtod(optarg, &tmp);
                 if ( tmp==optarg || *tmp ) error("Could not parse --max-annots  %s\n", optarg);
                 break;
-            case 'H': args->ignore_headers = 1; break;
-            case 'r': args->reciprocal = 1; break;
+            case 'I': args->no_write_hdr++; break;
+            case 'd': args->delim_str = optarg; break;
+            case 'h': args->headers_str = optarg; break;
+            case 'H': args->headers_str = "0:0"; break;
+            case 'r': reciprocal = 1; break;
             case 'c': args->core_str  = optarg; break;
             case 't': args->dst.fname = optarg; break;
             case 'm': args->match_str = optarg; break;
             case 'a': args->annots_str = optarg; break;
             case 'o': args->out_fname = optarg; break;
             case 'O':
-                args->overlap = strtod(optarg, &tmp);
-                if ( tmp==optarg || *tmp ) error("Could not parse --overlap %s\n", optarg);
-                if ( args->overlap<0 || args->overlap>1 ) error("Expected value from the interval [0,1]: --overlap %s\n", optarg);
+                args->overlap_src = strtod(optarg, &tmp);
+                if ( tmp==optarg || (*tmp && *tmp!=',') ) error("Could not parse --overlap %s\n", optarg);
+                if ( args->overlap_src<0 || args->overlap_src>1 ) error("Expected value(s) from the interval [0,1]: --overlap %s\n", optarg);
+                if ( *tmp )
+                {
+                    args->overlap_dst = strtod(tmp+1, &tmp);
+                    if ( *tmp ) error("Could not parse --overlap %s\n", optarg);
+                    if ( args->overlap_dst<0 || args->overlap_dst>1 ) error("Expected value(s) from the interval [0,1]: --overlap %s\n", optarg);
+                }
+                else
+                    args->overlap_dst = args->overlap_src, args->overlap_either = 1;
                 break;
             case 's': args->src.fname = optarg; break;
             case 'f': args->transfer_str = optarg; break;
             case 'x': args->mode = PRINT_NONMATCHING; break;
-            case 'h': printf("\nVersion: %s\n%s\n",hts_version(),usage_text()); exit(EXIT_SUCCESS); break;
+            case  4 : printf("\nVersion: %s\n%s\n",hts_version(),usage_text()); exit(EXIT_SUCCESS); break;
             case '?': // fall through
             default: error("\nVersion: %s\n%s\n",hts_version(),usage_text()); break;
         }
@@ -908,13 +1006,27 @@ int main(int argc, char **argv)
         else args->mode = PRINT_MATCHING|PRINT_NONMATCHING;
     }
     if ( (args->transfer_str || args->annots_str) && !(args->mode & PRINT_MATCHING) ) error("The option -x cannot be combined with -f and -a\n");
+    if ( reciprocal )
+    {
+        if ( args->overlap_dst && args->overlap_src && args->overlap_dst!=args->overlap_src )
+            error("The combination of --reciprocal with --overlap %f,%f makes no sense: expected single value or identical values\n",args->overlap_src,args->overlap_dst);
+        if ( !args->overlap_src )
+            args->overlap_src = args->overlap_dst;
+        else
+            args->overlap_dst = args->overlap_src;
+        args->overlap_either = 0;
+    }
 
     init_data(args);
     write_header(args, &args->dst);
     while ( read_next_line(&args->dst) )
     {
         int i;
-        for (i=0; i<args->dst.grow_n; i++) kputs("\t.", &args->dst.line);
+        for (i=0; i<args->dst.grow_n; i++)
+        {
+            kputc(args->dst.delim, &args->dst.line);
+            kputc('.', &args->dst.line);
+        }
         process_line(args, args->dst.line.s, args->dst.line.l);
         args->dst.line.l = 0;
     }
diff --git a/bgzip.1 b/bgzip.1
index fe4225b43..1e115d044 100644
--- a/bgzip.1
+++ b/bgzip.1
@@ -1,4 +1,4 @@
-.TH bgzip 1 "15 April 2024" "htslib-1.20" "Bioinformatics tools"
+.TH bgzip 1 "12 September 2024" "htslib-1.21" "Bioinformatics tools"
 .SH NAME
 .PP
 bgzip \- Block compression/decompression utility
diff --git a/bgzip.c b/bgzip.c
index 129343fb5..687b29d47 100644
--- a/bgzip.c
+++ b/bgzip.c
@@ -48,7 +48,7 @@
 
 static const int WINDOW_SIZE = BGZF_BLOCK_SIZE;
 
-static void error(const char *format, ...)
+static void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...)
 {
     va_list ap;
     va_start(ap, format);
@@ -57,7 +57,7 @@ static void error(const char *format, ...)
     exit(EXIT_FAILURE);
 }
 
-static int ask_yn()
+static int ask_yn(void)
 {
     char line[1024];
     if (fgets(line, sizeof line, stdin) == NULL)
@@ -362,8 +362,7 @@ int main(int argc, char **argv)
                         }
                         else {
                             ret = 2;                        //explicit N - no overwrite, continue and return 2
-                            if (hclose(f_src) < 0)
-                                ;                           //ignoring return value
+                            hclose_abruptly(f_src);
                             free(name);
                             continue;
                         }
@@ -689,7 +688,7 @@ int main(int argc, char **argv)
                     if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 )
                         error("Could not load index: %s.gzi\n", argv[optind]);
                 }
-                if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start);
+                if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %ld-th (uncompressed) byte\n", start);
             }
 
             if (threads > 1)
diff --git a/configure.ac b/configure.ac
index 49f2cbc70..87e928d47 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,6 +1,6 @@
 # Configure script for htslib, a C library for high-throughput sequencing data.
 #
-#    Copyright (C) 2015-2023 Genome Research Ltd.
+#    Copyright (C) 2015-2024 Genome Research Ltd.
 #
 #    Author: John Marshall <jm18@sanger.ac.uk>
 #
@@ -35,7 +35,7 @@ m4_include([m4/hts_hide_dynamic_syms.m4])
 m4_include([m4/pkg.m4])
 
 dnl Copyright notice to be copied into the generated configure script
-AC_COPYRIGHT([Portions copyright (C) 2020-2023 Genome Research Ltd.
+AC_COPYRIGHT([Portions copyright (C) 2020-2024 Genome Research Ltd.
 
 This configure script is free software: you are free to change and
 redistribute it.  There is NO WARRANTY, to the extent permitted by law.])
@@ -82,6 +82,14 @@ AC_CHECK_DECL([_XOPEN_SOURCE], [],
   [AC_DEFINE([_XOPEN_SOURCE], [600], [Specify X/Open requirements])],
   [])
 
+dnl Check that we have cpuid, and if so run the x86 SIMD checks
+AC_CHECK_DECLS([__get_cpuid_max, __cpuid_count], [
+   hts_have_cpuid=yes
+], [
+   hts_have_cpuid=no
+], [[#include <cpuid.h>]])
+
+AS_IF(test "x$hts_have_cpuid" = "xyes", [
 dnl Options for rANS32x16 sse4.1 version - sse4.1
 HTS_CHECK_COMPILE_FLAGS_NEEDED([sse4.1], [-msse4.1 -mssse3 -mpopcnt],
  [AC_LANG_PROGRAM([[
@@ -100,6 +108,7 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([sse4.1], [-msse4.1 -mssse3 -mpopcnt],
   AC_DEFINE([HAVE_POPCNT],1,[Defined to 1 if rANS source using popcnt can be compiled.])
   AC_DEFINE([HAVE_SSE4_1],1,[Defined to 1 if rANS source using SSE4.1 can be compiled.
 ])
+
 dnl Propagate HTSlib's unaligned access preference to htscodecs
   AH_VERBATIM([UBSAN],[
 /* Prevent unaligned access in htscodecs SSE4 rANS codec */
@@ -139,7 +148,9 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f -mpopcnt],
     #ifdef __x86_64__
     __m512i a = _mm512_set1_epi32(1);
     __m512i b = _mm512_add_epi32(a, a);
-    return _mm_popcnt_u32(*((char *) &b));
+    __m256i c = _mm512_castsi512_si256(b);
+    __m256i d = _mm512_extracti64x4_epi64(a, 1);
+    return _mm_popcnt_u32(*((char *) &c)) + (*(char *) &d);
     #endif
   ]])], [
   hts_cflags_avx512="$flags_needed"
@@ -148,6 +159,37 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f -mpopcnt],
   AC_DEFINE([HAVE_AVX512],1,[Defined to 1 if rANS source using AVX512F can be compiled.])
 ])
 
+dnl Check for working __builtin_cpu_supports (ssse3 is broken on some clangs)
+AC_MSG_CHECKING([for working __builtin_cpu_supports("ssse3")])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([],[
+  if (__builtin_cpu_supports("ssse3")) {
+    return 0;
+  }
+])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE_BUILTIN_CPU_SUPPORT_SSSE3], 1,
+            [Defined to 1 if __builtin_cpu_supports("ssse3") works])
+], [
+  AC_MSG_RESULT([no])
+])
+
+dnl Check for function attribute used in conjunction with __builtin_cpu_supports
+AC_MSG_CHECKING([for __attribute__((target))])
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+  __attribute__((target("ssse3")))
+  int zero(void) {
+    return 0;
+  }
+]], [[zero();]])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE_ATTRIBUTE_TARGET], 1,
+            [Define if __attribute__((target(...))) is available.])
+], [
+  AC_MSG_RESULT([no])
+])
+
+]) dnl End of AS_IF(hts_have_cpuid)
+
 dnl Avoid chicken-and-egg problem where pkg-config supplies the
 dnl PKG_PROG_PKG_CONFIG macro, but we want to use it to check
 dnl for pkg-config...
@@ -289,6 +331,25 @@ AC_CHECK_FUNCS([gmtime_r fsync drand48 srand48_deterministic])
 # Darwin has a dubious fdatasync() symbol, but no declaration in <unistd.h>
 AC_CHECK_DECL([fdatasync(int)], [AC_CHECK_FUNCS(fdatasync)])
 
+AC_MSG_CHECKING([for __attribute__((constructor))])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[
+  static __attribute__((constructor)) void noop(void) {}
+]], [])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE_ATTRIBUTE_CONSTRUCTOR], 1,
+            [Define if __attribute__((constructor)) is available.])
+], [AC_MSG_RESULT([no])])
+
+AC_MSG_CHECKING([for clock_gettime with CLOCK_PROCESS_CPUTIME_ID])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <time.h>]], [[
+  struct timespec ts;
+  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+]])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE_CLOCK_GETTIME_CPUTIME], 1,
+            [Define if clock_gettime exists and accepts CLOCK_PROCESS_CPUTIME_ID.])
+], [AC_MSG_RESULT([no])])
+
 if test $enable_plugins != no; then
   AC_SEARCH_LIBS([dlsym], [dl], [],
     [MSG_ERROR([dlsym() not found
diff --git a/cram/cram_decode.c b/cram/cram_decode.c
index 86e2ef96e..2b2ad6029 100644
--- a/cram/cram_decode.c
+++ b/cram/cram_decode.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2012-2020, 2022-2023 Genome Research Ltd.
+Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -3004,8 +3004,8 @@ int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
  * Returns the used size of the bam record on success
  *         -1 on failure.
  */
-static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
-                       cram_record *cr, int rec, bam_seq_t **bam) {
+int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
+                cram_record *cr, int rec, bam_seq_t **bam) {
     int ret, rg_len;
     char name_a[1024], *name;
     int name_len;
@@ -3172,7 +3172,7 @@ static cram_container *cram_first_slice(cram_fd *fd) {
     return c;
 }
 
-static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
+cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
     cram_container *c_curr;  // container being consumed via cram_get_seq()
     cram_slice *s_curr = NULL;
 
diff --git a/cram/cram_decode.h b/cram/cram_decode.h
index 400eb6beb..16d87a073 100644
--- a/cram/cram_decode.h
+++ b/cram/cram_decode.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2012-2013, 2018 Genome Research Ltd.
+Copyright (c) 2012-2013, 2018, 2024 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -94,6 +94,15 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
 cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b);
 
 
+/*! INTERNAL:
+ * Loads and decodes the next slice worth of data.
+ *
+ * @return
+ * Returns cram slice pointer on success;
+ *         NULL on failure
+ */
+cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp);
+
 /*! INTERNAL:
  * Decode an entire slice from container blocks. Fills out s->crecs[] array.
  *
@@ -105,6 +114,22 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
                       sam_hdr_t *hdr);
 
 
+/*! INTERNAL:
+ * Converts a cram in-memory record into a bam in-memory record. We
+ * pass a pointer to a bam_seq_t pointer; the allocated size is kept in
+ * the bam record itself. The bam_seq_t pointer can initially be NULL.
+ *
+ * This function will reallocate the bam buffer as required and update
+ * (*bam)->alloc accordingly, allowing it to be used within a loop
+ * efficiently without needing to allocate new bam objects over and
+ * over again.
+ *
+ * Returns the used size of the bam record on success
+ *         -1 on failure.
+ */
+int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
+                cram_record *cr, int rec, bam_seq_t **bam);
+
 /*
  * Drains and frees the decode read-queue for a multi-threaded reader.
  */
diff --git a/cram/cram_encode.c b/cram/cram_encode.c
index 4a762f7b0..5d22db54d 100644
--- a/cram/cram_encode.c
+++ b/cram/cram_encode.c
@@ -3401,6 +3401,8 @@ static int process_one_read(cram_fd *fd, cram_container *c,
 
     c->num_bases   += cr->len;
     cr->apos        = bam_pos(b)+1;
+    if (cr->apos < 0 || cr->apos > INT64_MAX/2)
+        goto err;
     if (c->pos_sorted) {
         if (cr->apos < s->last_apos && !fd->ap_delta) {
             c->pos_sorted = 0;
@@ -3439,6 +3441,11 @@ static int process_one_read(cram_fd *fd, cram_container *c,
         int64_t apos = cr->apos-1, spos = 0;
         int64_t MD_last = apos; // last position of edit in MD tag
 
+        if (apos < 0) {
+            hts_log_error("Mapped read with position <= 0 is disallowed");
+            return -1;
+        }
+
         cr->cigar       = s->ncigar;
         cr->ncigar      = bam_cigar_len(b);
         while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
diff --git a/cram/cram_external.c b/cram/cram_external.c
index 7455185ad..4943750dd 100644
--- a/cram/cram_external.c
+++ b/cram/cram_external.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015, 2018-2020, 2022-2023 Genome Research Ltd.
+Copyright (c) 2015, 2018-2020, 2022-2024 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -121,6 +121,16 @@ int cram_container_is_empty(cram_fd *fd) {
     return fd->empty_container;
 }
 
+void cram_container_get_coords(cram_container *c,
+                               int *refid, hts_pos_t *start, hts_pos_t *span) {
+    if (refid)
+        *refid = c->ref_seq_id;
+    if (start)
+        *start = c->ref_seq_start;
+    if (span)
+        *span  = c->ref_seq_span;
+}
+
 
 /*
  *-----------------------------------------------------------------------------
@@ -281,7 +291,7 @@ static cram_codec *cram_codec_iter_next(cram_codec_iter *iter,
             iter->curr_map = iter->curr_map->next;
             return cc;
         }
-    } while (iter->idx <= CRAM_MAP_HASH);
+    } while (iter->idx < CRAM_MAP_HASH);
 
     // End of codecs
     return NULL;
@@ -683,6 +693,7 @@ int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
             cram_free_block(blk);
             return -1;
         }
+
         if (cram_write_block(out, blk) != 0) {
             cram_free_block(blk);
             return -1;
@@ -704,6 +715,192 @@ int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
     return 0;
 }
 
+/*
+ * Discards the next container's worth of data.
+ * Only the cram structure has been read so far.
+ *
+ * Returns 0 on success,
+ *        -1 on failure
+ */
+static int cram_skip_container(cram_fd *in, cram_container *c) {
+    // Compression header
+    cram_block *blk;
+    if (!(blk = cram_read_block(in)))
+        return -1;
+    cram_free_block(blk);
+
+    int i;
+    for (i = 0; i < c->num_landmarks; i++) {
+        cram_block_slice_hdr *hdr;
+
+        if (!(blk = cram_read_block(in)))
+            return -1;
+        if (!(hdr = cram_decode_slice_header(in, blk))) {
+            cram_free_block(blk);
+            return -1;
+        }
+        cram_free_block(blk);
+
+        int num_blocks = cram_slice_hdr_get_num_blocks(hdr), j;
+        for (j = 0; j < num_blocks; j++) {
+            blk = cram_read_block(in);
+            if (!blk) {
+                cram_free_slice_header(hdr);
+                return -1;
+            }
+            cram_free_block(blk);
+        }
+        cram_free_slice_header(hdr);
+    }
+
+    return 0;
+}
+
+
+/*
+ * Copies a container, but filtering it down to a specific region,
+ * which has already been set on the 'in' fd.
+ *
+ * This is used in e.g. samtools cat where we specified a region and discover
+ * that a region doesn't entirely span the container, so we have to select
+ * which reads we need to copy out of it.
+ *
+ * If ref_id is non-NULL we also return the last ref_id we filtered.
+ * This can be -2 if it's multi-ref and we observe more than one reference,
+ * and actual ref_id >= -1 if it's multi-ref and we observe just one ref or
+ * it's fixed reference.
+ *
+ * Returns 0 on success
+ *        -1 on error
+ */
+int cram_filter_container(cram_fd *in, cram_fd *out, cram_container *c,
+                          int *ref_id) {
+    int err = 0, fixed_ref = -3;
+    bam1_t *b = NULL;
+
+    if (ref_id)
+        *ref_id = c->ref_seq_id;
+
+    // Non-overlapping containers are skipped, except for multi-ref (-2)
+    cram_range rng_copy = in->range;
+    int rid = rng_copy.refid == -2 ? -1 : rng_copy.refid;
+    if (c->ref_seq_id != -2 &&
+        (rid != c->ref_seq_id ||
+         rng_copy.start > c->ref_seq_start + c->ref_seq_span-1))
+        return cram_skip_container(in, c);
+
+    // Container compression header
+    cram_block *blk = cram_read_block(in);
+    if (!blk)
+        return -1;
+    c->comp_hdr = cram_decode_compression_header(in, blk);
+    if (!c->comp_hdr) {
+        cram_free_block(blk);
+        return -1;
+    }
+    in->ctr = c;
+
+    // If it's multi-ref but a constant ref-id, then we can still do
+    // basic level chromosome filtering.  Similarly multi-ref where we're
+    // _already_ in ref "*" (unmapped) means we can just copy the container
+    // as there are no positions to filter on and "*" sorts to the end.
+    // TODO: how to tell "already in" though?
+    if (c->ref_seq_id == -2) {
+        cram_codec *cd = c->comp_hdr->codecs[DS_RI];
+        if (cd && cd->codec == E_HUFFMAN && cd->u.huffman.ncodes == 1 &&
+            // this check should be always true anyway
+            rid == cd->u.huffman.codes[0].symbol &&
+            // Whole-chromosome mode and one ref throughout: just copy.
+            rng_copy.start <= 1 &&
+            rng_copy.end >= (INT64_MAX&(0xffffffffULL<<32))) {
+            if (ref_id)
+                *ref_id = rid;
+            err |= cram_write_container(out, c) < 0;
+            err |= cram_write_block(out, blk) != 0;
+            err |= cram_copy_slice(in, out, c->num_landmarks) != 0;
+            goto cleanup;
+        }
+    }
+
+    // Read-write loop; the range was set by an earlier CRAM_OPT_RANGE request,
+    // but we must still check we don't read beyond this single container.
+    b = bam_init1();
+    if (!b) {
+        err = 1;
+        goto cleanup;
+    }
+    in->range.start = INT64_MIN;
+    in->range.end = INT64_MAX;
+
+    while (c->curr_slice < c->max_slice ||
+           (c->slice && c->slice->curr_rec < c->slice->max_rec)) {
+        cram_slice *s = c->slice;
+        if (!s || s->curr_rec >= s->max_rec) {
+            // Slice exhausted; loop condition says another one remains
+            if (!(s = cram_next_slice(in, &c))) {
+                err = 1;
+                break;
+            }
+            c->slice = s;
+        }
+
+        // This is more efficient if we check as a cram record instead of a
+        // bam record as we don't have to parse CIGAR end.
+        cram_record *cr = &s->crecs[s->curr_rec];
+        if (fixed_ref == -3)
+            fixed_ref = cr->ref_id;
+        else if (fixed_ref != cr->ref_id)
+            fixed_ref = -2;
+
+        if (rng_copy.refid != cr->ref_id) {
+            if (rng_copy.refid == -2) {
+                if (cr->ref_id > -1) {
+                    // Want unmapped, but have mapped
+                    s->curr_rec++;
+                    continue;
+                }
+            } else if (rng_copy.refid > cr->ref_id || rng_copy.refid == -1) {
+                // multi-ref and not at the correct ref yet
+                s->curr_rec++;
+                continue;
+            } else {
+                // multi-ref and beyond the desired ref
+                break;
+            }
+        }
+
+        // Correct ref, but check the desired region
+        if (cr->aend < rng_copy.start) {
+            s->curr_rec++;
+            continue;
+        }
+        if (cr->apos > rng_copy.end)
+            break;
+
+        // Broadly equivalent to cram_get_bam_seq, but starting from 'cr'
+        if (cram_to_bam(in->header, in, s, cr, s->curr_rec++, &b) < 0 ||
+            cram_put_bam_seq(out, b) < 0) {
+            err = 1;
+            break;
+        }
+    }
+
+    if (ref_id)
+        *ref_id = fixed_ref;
+    err |= cram_flush(out) != 0;
+
+ cleanup:
+    if (b)
+        bam_destroy1(b);
+    in->range = rng_copy;
+    // Avoids double frees: the container was stolen from another fd.
+    in->ctr    = NULL;
+    in->ctr_mt = NULL;
+    cram_free_block(blk);
+    return -err;
+}
+
+
 /*
  * Renumbers RG numbers in a cram compression header.
  *
diff --git a/cram/cram_index.c b/cram/cram_index.c
index 0908736ab..77c953d6c 100644
--- a/cram/cram_index.c
+++ b/cram/cram_index.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2013-2020, 2023 Genome Research Ltd.
+Copyright (c) 2013-2020, 2023-2024 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -410,6 +410,9 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos,
         // Continue from a previous search.
         // We switch to just scanning the linked list, as the nested
         // lists are typically short.
+        if (refid == HTS_IDX_NOCOOR)
+            refid = -1;
+
         e = from->e_next;
         if (e && e->refid == refid && e->start <= pos)
             return e;
@@ -423,6 +426,7 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos,
         // fail, or already there, dealt with elsewhere.
         return NULL;
 
+    case -1:
     case HTS_IDX_NOCOOR:
         refid = -1;
         pos = 0;
@@ -844,3 +848,193 @@ int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
 
     return (bgzf_close(fp) >= 0)? 0 : -4;
 }
+
+// Recursive step: counts unseen container offsets under e (cend 0 = no limit)
+static int64_t cram_num_containers_between_(cram_index *e, int64_t *last_pos,
+                                            int64_t nct, // count before e
+                                            off_t cstart, off_t cend,
+                                            int64_t *first, int64_t *last) {
+    int64_t nc = 0, i; // nc: containers newly counted within this subtree
+
+    if (e->offset) { // entries with a zero offset are not counted themselves
+        if (e->offset != *last_pos) { // differs from previous entry's offset
+            if (e->offset >= cstart && (!cend || e->offset <= cend)) {
+                if (first && *first < 0) // only the first match sets *first
+                    *first = nct;
+                if (last) // every match overwrites *last
+                    *last = nct;
+            }
+            nc++;
+        }
+        // else a new multi-ref in same container
+        *last_pos = e->offset;
+    }
+
+    for (i = 0; i < e->nslice; i++) // nc + nct: containers counted so far
+        nc += cram_num_containers_between_(&e->e[i], last_pos, nc + nct,
+                                           cstart, cend, first, last);
+
+    return nc;
+}
+
+/*! Returns the number of indexed containers lying within given file offsets.
+ *
+ * The cstart and cend offsets are the locations of the start of containers
+ * as returned by index_container_offset; a cend of zero means no upper bound.
+ *
+ * If non-NULL, first and last will hold the inclusive range of container
+ * numbers, counting from zero, or -1 each when no container matched.
+ *
+ * @return
+ * Returns the number of containers, *last-*first+1, or 0 if none matched.
+ */
+int64_t cram_num_containers_between(cram_fd *fd,
+                                    off_t cstart, off_t cend,
+                                    int64_t *first, int64_t *last) {
+    int64_t nc = 0, i;
+    int64_t last_pos = -99; // sentinel: no container offset seen yet
+    int64_t l_first = -1, l_last = -1;
+
+    for (i = 0; i < fd->index_sz; i++) {
+        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
+        nc += cram_num_containers_between_(&fd->index[j], &last_pos, nc,
+                                           cstart, cend, &l_first, &l_last);
+    }
+
+    if (first)
+        *first = l_first;
+    if (last)
+        *last = l_last;
+
+    return l_first < 0 ? 0 : l_last - l_first + 1; // no match: 0, not 1
+}
+
+/*
+ * Queries the total number of distinct containers in the index.
+ * Note there may be more containers in the file than in the index, as we
+ * are not required to have an index entry for every one.
+ */
+int64_t cram_num_containers(cram_fd *fd) {
+    return cram_num_containers_between(fd, 0, 0, NULL, NULL); // no cend limit
+}
+
+
+// Recursive step: find the num'th distinct container; *nc = number passed.
+static cram_index *cram_container_num2offset_(cram_index *e, int64_t num,
+                                              int64_t *last_pos, int64_t *nc) {
+    if (e->offset) {
+        if (e->offset != *last_pos) {
+            if (*nc == num)
+                return e;
+            (*nc)++;
+        }
+        // else a new multi-ref in same container
+        *last_pos = e->offset;
+    }
+
+    int i;
+    for (i = 0; i < e->nslice; i++) {
+        cram_index *tmp = cram_container_num2offset_(&e->e[i], num,
+                                                     last_pos, nc);
+        if (tmp)
+            return tmp;
+    }
+
+    return NULL;
+}
+
+/*! Returns the byte offset for the start of the n^th container (from zero).
+ *
+ * The index must have previously been loaded, otherwise <0 is returned.
+ */
+off_t cram_container_num2offset(cram_fd *fd, int64_t num) {
+    int64_t nc = 0, i; // 64-bit count, matching num, so nothing is truncated
+    int64_t last_pos = -9;
+    cram_index *e = NULL;
+
+    for (i = 0; i < fd->index_sz; i++) {
+        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
+        if (!fd->index[j].nslice)
+            continue;
+        if ((e = cram_container_num2offset_(&fd->index[j], num,
+                                            &last_pos, &nc)))
+            break;
+    }
+
+    return e ? e->offset : -1;
+}
+
+
+// Recursive step: find first container at offset >= pos; *nc = number passed.
+static cram_index *cram_container_offset2num_(cram_index *e, off_t pos,
+                                              int64_t *last_pos, int64_t *nc) {
+    if (e->offset) {
+        if (e->offset != *last_pos) {
+            if (e->offset >= pos)
+                return e;
+            (*nc)++;
+        }
+        // else a new multi-ref in same container
+        *last_pos = e->offset;
+    }
+
+    int i;
+    for (i = 0; i < e->nslice; i++) {
+        cram_index *tmp = cram_container_offset2num_(&e->e[i], pos,
+                                                     last_pos, nc);
+        if (tmp)
+            return tmp;
+    }
+
+    return NULL;
+}
+
+/*! Returns the container number for the first container at offset >= pos.
+ *
+ * The index must have previously been loaded, otherwise <0 is returned.
+ */
+int64_t cram_container_offset2num(cram_fd *fd, off_t pos) {
+    int64_t nc = 0, i; // 64-bit count, matching the int64_t return type
+    int64_t last_pos = -9;
+    cram_index *e = NULL;
+
+    for (i = 0; i < fd->index_sz; i++) {
+        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
+        if (!fd->index[j].nslice)
+            continue;
+        if ((e = cram_container_offset2num_(&fd->index[j], pos,
+                                            &last_pos, &nc)))
+            break;
+    }
+
+    return e ? nc : -1;
+}
+
+/*!
+ * Returns the file offsets of CRAM containers covering a specific region
+ * query.  Note both offsets are the START of the container.
+ *
+ * first will point to the start of the first overlapping container
+ * last will point to the start of the last overlapping container
+ *
+ * Returns 0 on success
+ *        <0 on failure (*first may already have been written)
+ */
+int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end,
+                       off_t *first, off_t *last) {
+    cram_index *ci;
+
+    if (first) { // optional: a NULL pointer skips this lookup
+        if (!(ci = cram_index_query(fd, refid, start, NULL)))
+            return -1;
+        *first = ci->offset;
+    }
+
+    if (last) { // optional: a NULL pointer skips this lookup
+        if (!(ci = cram_index_query_last(fd, refid, end)))
+            return -1;
+        *last = ci->offset;
+    }
+
+    return 0;
+}
diff --git a/cram/cram_io.c b/cram/cram_io.c
index 247423354..7f7ffca49 100644
--- a/cram/cram_io.c
+++ b/cram/cram_io.c
@@ -1984,11 +1984,15 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s,
         // We also get large fluctuations based on genome coordinate for
         // e.g. SA:Z and SC series, but we consider the typical scale of
         // delta between blocks and use this to look for abnormality.
+
+        // Equivalent to (but minus possible integer overflow)
+        //   (b->uncomp_size + 1000)/4 > metrics->input_avg_sz+1000 ||
+        //    b->uncomp_size + 1000    < (metrics->input_avg_sz+1000)/4)
         if (metrics->input_avg_sz &&
-            (b->uncomp_size + 1000 > 4*(metrics->input_avg_sz+1000) ||
-             b->uncomp_size + 1000 < (metrics->input_avg_sz+1000)/4) &&
-            ABS(b->uncomp_size-metrics->input_avg_sz)
-                > 10*metrics->input_avg_delta) {
+            (b->uncomp_size/4 - 750 > metrics->input_avg_sz ||
+             b->uncomp_size         < metrics->input_avg_sz/4 - 750) &&
+            ABS(b->uncomp_size-metrics->input_avg_sz)/10
+                > metrics->input_avg_delta) {
             metrics->next_trial = 0;
         }
 
diff --git a/faidx.c b/faidx.c
index ce8fe5d9f..ed39c0ca0 100644
--- a/faidx.c
+++ b/faidx.c
@@ -1,6 +1,6 @@
 /*  faidx.c -- FASTA and FASTQ random access.
 
-    Copyright (C) 2008, 2009, 2013-2020, 2022 Genome Research Ltd.
+    Copyright (C) 2008, 2009, 2013-2020, 2022, 2024 Genome Research Ltd.
     Portions copyright (C) 2011 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
@@ -43,6 +43,29 @@ DEALINGS IN THE SOFTWARE.  */
 #include "htslib/kstring.h"
 #include "hts_internal.h"
 
+// Faster isgraph; assumes ASCII
+static inline int isgraph_(unsigned char c) {
+    return c > ' ' && c <= '~';
+}
+
+#ifdef isgraph
+#  undef isgraph
+#endif
+#define isgraph isgraph_
+
+// An optimised bgzf_getc: reads straight from the uncompressed block buffer.
+// We could consider moving this to bgzf.h, but our own code uses it here only.
+static inline int bgzf_getc_(BGZF *fp) {
+    if (fp->block_offset+1 < fp->block_length) { // 2+ bytes left in block
+        int c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
+        fp->uncompressed_address++;
+        return c;
+    }
+
+    return bgzf_getc(fp); // otherwise defer to the library function
+}
+#define bgzf_getc bgzf_getc_ // calls below this point use the inline version
+
 typedef struct {
     int id; // faidx_t->name[id] is for this struct.
     uint32_t line_len, line_blen;
@@ -692,9 +715,8 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) {
 
 static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
                           uint64_t offset, hts_pos_t beg, hts_pos_t end, hts_pos_t *len) {
-    char *s;
-    size_t l;
-    int c = 0;
+    char *buffer, *s;
+    ssize_t nread, remaining, firstline_len, firstline_blen;
     int ret;
 
     if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) {
@@ -720,26 +742,57 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
         return NULL;
     }
 
-    l = 0;
-    s = (char*)malloc((size_t) end - beg + 2);
-    if (!s) {
+    // Over-allocate so there is extra space for one end-of-line sequence
+    buffer = (char*)malloc((size_t) end - beg + val->line_len - val->line_blen + 1);
+    if (!buffer) {
         *len = -1;
         return NULL;
     }
 
-    while ( l < end - beg && (c=bgzf_getc(fai->bgzf))>=0 )
-        if (isgraph(c)) s[l++] = c;
-    if (c < 0) {
-        hts_log_error("Failed to retrieve block: %s",
-            c == -1 ? "unexpected end of file" : "error reading file");
-        free(s);
-        *len = -1;
-        return NULL;
+    remaining = *len = end - beg;
+    firstline_blen = val->line_blen - beg % val->line_blen;
+
+    // Special case when the entire interval requested is within a single FASTA/Q line
+    if (remaining <= firstline_blen) {
+        nread = bgzf_read_small(fai->bgzf, buffer, remaining);
+        if (nread < remaining) goto error;
+        buffer[nread] = '\0';
+        return buffer;
+    }
+
+    s = buffer;
+    firstline_len = val->line_len - beg % val->line_blen;
+
+    // Read the (partial) first line and its line terminator, but increment  s  past the
+    // line contents only, so the terminator characters will be overwritten by the next line.
+    nread = bgzf_read_small(fai->bgzf, s, firstline_len);
+    if (nread < firstline_len) goto error;
+    s += firstline_blen;
+    remaining -= firstline_blen;
+
+    // Similarly read complete lines and their line terminator characters, but overwrite the latter.
+    while (remaining > val->line_blen) {
+        nread = bgzf_read_small(fai->bgzf, s, val->line_len);
+        if (nread < (ssize_t) val->line_len) goto error;
+        s += val->line_blen;
+        remaining -= val->line_blen;
     }
 
-    s[l] = '\0';
-    *len = l;
-    return s;
+    if (remaining > 0) {
+        nread = bgzf_read_small(fai->bgzf, s, remaining);
+        if (nread < remaining) goto error;
+        s += remaining;
+    }
+
+    *s = '\0';
+    return buffer;
+
+error:
+    hts_log_error("Failed to retrieve block: %s",
+                  (nread == 0)? "unexpected end of file" : "error reading file");
+    free(buffer);
+    *len = -1;
+    return NULL;
 }
 
 static int fai_get_val(const faidx_t *fai, const char *str,
diff --git a/header.c b/header.c
index 5161034f4..7f62074f0 100644
--- a/header.c
+++ b/header.c
@@ -2358,7 +2358,7 @@ void sam_hdr_incr_ref(sam_hdr_t *bh) {
  * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
  *         NULL on failure
  */
-sam_hrecs_t *sam_hrecs_new() {
+sam_hrecs_t *sam_hrecs_new(void) {
     sam_hrecs_t *hrecs = calloc(1, sizeof(*hrecs));
 
     if (!hrecs)
diff --git a/hfile.c b/hfile.c
index fc87049ca..552b71774 100644
--- a/hfile.c
+++ b/hfile.c
@@ -703,7 +703,7 @@ static int is_preload_url_remote(const char *url){
 
 static hFILE *hopen_preload(const char *url, const char *mode){
     hFILE* fp = hopen(url + 8, mode);
-    return hpreload(fp);
+    return fp ? hpreload(fp) : NULL;
 }
 
 hFILE *hdopen(int fd, const char *mode)
@@ -976,7 +976,7 @@ void hfile_shutdown(int do_close_plugin)
     pthread_mutex_unlock(&plugins_lock);
 }
 
-static void hfile_exit()
+static void hfile_exit(void)
 {
     hfile_shutdown(0);
     pthread_mutex_destroy(&plugins_lock);
@@ -1082,7 +1082,7 @@ static int init_add_plugin(void *obj, int (*init)(struct hFILE_plugin *),
  * Returns 0 on success,
  *        <0 on failure
  */
-static int load_hfile_plugins()
+static int load_hfile_plugins(void)
 {
     static const struct hFILE_scheme_handler
         data = { hopen_mem, hfile_always_local, "built-in", 80 },
diff --git a/hfile_libcurl.c b/hfile_libcurl.c
index e70550eab..3463acf43 100644
--- a/hfile_libcurl.c
+++ b/hfile_libcurl.c
@@ -277,7 +277,7 @@ static void free_auth(auth_token *tok) {
     free(tok);
 }
 
-static void libcurl_exit()
+static void libcurl_exit(void)
 {
     if (curl_share_cleanup(curl.share) == CURLSHE_OK)
         curl.share = NULL;
diff --git a/hfile_s3_write.c b/hfile_s3_write.c
index d54945839..a501645ca 100644
--- a/hfile_s3_write.c
+++ b/hfile_s3_write.c
@@ -822,7 +822,7 @@ static hFILE *vhopen_s3_write(const char *url, const char *mode, va_list args) {
 }
 
 
-static void s3_write_exit() {
+static void s3_write_exit(void) {
     if (curl_share_cleanup(curl.share) == CURLSHE_OK)
         curl.share = NULL;
 
diff --git a/hts.c b/hts.c
index cf0a07d9f..a8a8bead2 100644
--- a/hts.c
+++ b/hts.c
@@ -81,7 +81,7 @@ KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
 HTSLIB_EXPORT
 int hts_verbose = HTS_LOG_WARNING;
 
-const char *hts_version()
+const char *hts_version(void)
 {
     return HTS_VERSION_TEXT;
 }
@@ -431,6 +431,27 @@ static int is_text_only(const unsigned char *u, const unsigned char *ulim)
     return 1;
 }
 
+static inline int // 1 if bytes u[0], u[2], u[4], ... below ulim are all zero
+alternate_zeros(const unsigned char *u, const unsigned char *ulim)
+{
+    for (; u < ulim; u += 2) // inspects every second byte only
+        if (*u != '\0') return 0;
+    return 1;
+}
+
+static int is_utf16_text(const unsigned char *u, const unsigned char *ulim)
+{ // Returns 2 for BOM + alternating zeros, 1 for alternating zeros, else 0
+    if (ulim - u >= 6 &&
+        ((u[0] == 0xfe && u[1] == 0xff && alternate_zeros(u+2, ulim)) ||
+         (u[0] == 0xff && u[1] == 0xfe && alternate_zeros(u+3, ulim))))
+        return 2; // FE FF or FF FE prefix, then every other byte is zero
+    else if (ulim - u >= 8 &&
+             (alternate_zeros(u, ulim) || alternate_zeros(u+1, ulim)))
+        return 1; // even-indexed or odd-indexed bytes are all zero
+    else
+        return 0;
+}
+
 static int is_fastaq(const unsigned char *u, const unsigned char *ulim)
 {
     const unsigned char *eol = memchr(u, '\n', ulim - u);
@@ -1301,7 +1322,7 @@ int hts_parse_opt_list(htsFormat *fmt, const char *str) {
  *        -1 on failure.
  */
 int hts_parse_format(htsFormat *format, const char *str) {
-    char fmt[8];
+    char fmt[9];
     const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
 
     format->version.minor = 0; // unknown
@@ -1743,7 +1764,7 @@ static hFILE *hts_hfile(htsFile *fp) {
     case bcf:          // fall through
     case bam:          return bgzf_hfile(fp->fp.bgzf);
     case cram:         return cram_hfile(fp->fp.cram);
-    case text_format:  return fp->fp.hfile;
+    case text_format:  // fall through
     case vcf:          // fall through
     case fastq_format: // fall through
     case fasta_format: // fall through
@@ -1961,6 +1982,12 @@ hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname)
     return fp;
 }
 
+int hts_is_utf16_text(const kstring_t *str)
+{
+    const unsigned char *u = (const unsigned char *) (str->s);
+    return (str->l > 0 && str->s)? is_utf16_text(u, u + str->l) : 0;
+}
+
 // For VCF/BCF backward sweeper. Not exposing these functions because their
 // future is uncertain. Things will probably have to change with hFILE...
 BGZF *hts_get_bgzfp(htsFile *fp)
@@ -2030,6 +2057,8 @@ char **hts_readlist(const char *string, int is_file, int *_n)
         while ((ret = bgzf_getline(fp, '\n', &str)) >= 0)
         {
             if (str.l == 0) continue;
+            if (n == 0 && hts_is_utf16_text(&str))
+                hts_log_warning("'%s' appears to be encoded as UTF-16", string);
             if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
                 goto err;
             s[n] = strdup(str.s);
@@ -2089,6 +2118,8 @@ char **hts_readlines(const char *fn, int *_n)
         str.s = 0; str.l = str.m = 0;
         while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
             if (str.l == 0) continue;
+            if (n == 0 && hts_is_utf16_text(&str))
+                hts_log_warning("'%s' appears to be encoded as UTF-16", fn);
             if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
                 goto err;
             s[n] = strdup(str.s);
@@ -2446,9 +2477,14 @@ int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
     return ret;
 }
 
+static inline hts_pos_t hts_idx_maxpos(const hts_idx_t *idx)
+{
+    return hts_bin_maxpos(idx->min_shift, idx->n_lvls);
+}
+
 int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
 {
-    int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3);
+    hts_pos_t maxpos = hts_idx_maxpos(idx);
     if (tid < 0 || (beg <= maxpos && end <= maxpos))
         return 0;
 
@@ -3222,7 +3258,7 @@ static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid,
     size_t reg_bin_count = 0, hash_bin_count;
     int res;
 
-    if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg >= end)
+    if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg > end)
         return -1;
 
     hash_bin_count = kh_n_buckets(bidx);
@@ -3341,6 +3377,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
     khint_t k;
     bidx_t *bidx;
     uint64_t min_off, max_off;
+    hts_pos_t idx_maxpos;
     hts_itr_t *iter;
     uint32_t unmapped = 0, rel_off;
 
@@ -3385,6 +3422,9 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
 
             if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
 
+            idx_maxpos = hts_idx_maxpos(idx);
+            if (beg >= idx_maxpos) { iter->finished = 1; return iter; }
+
             rel_off = beg>>idx->min_shift;
             // compute min_off
             bin = hts_bin_first(idx->n_lvls) + rel_off;
@@ -3427,7 +3467,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
             // compute max_off: a virtual offset from a bin to the right of end
             // First check if end lies within the range of the index (it won't
             // if it's HTS_POS_MAX)
-            if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) {
+            if (end <= idx_maxpos) {
                 bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
                 if (bin >= idx->n_bins) bin = 0;
                 while (1) {
@@ -3513,7 +3553,7 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
     bidx_t *bidx;
     uint64_t min_off, max_off, t_off = (uint64_t)-1;
     int tid;
-    hts_pos_t beg, end;
+    hts_pos_t beg, end, idx_maxpos;
     hts_reglist_t *curr_reg;
     uint32_t unmapped = 0, rel_off;
 
@@ -3555,6 +3595,8 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
             else
                 unmapped = 1;
 
+            idx_maxpos = hts_idx_maxpos(idx);
+
             for(j=0; j<curr_reg->count; j++) {
                 hts_pair32_t *curr_intv = &curr_reg->intervals[j];
                 if (curr_intv->end < curr_intv->beg)
@@ -3562,6 +3604,8 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
 
                 beg = curr_intv->beg;
                 end = curr_intv->end;
+                if (beg >= idx_maxpos)
+                    continue;
                 rel_off = beg>>idx->min_shift;
 
                 /* Compute 'min_off' by searching the lowest level bin containing 'beg'.
@@ -3606,7 +3650,7 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
                 // compute max_off: a virtual offset from a bin to the right of end
                 // First check if end lies within the range of the index (it
                 // won't if it's HTS_POS_MAX)
-                if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) {
+                if (end <= idx_maxpos) {
                     bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
                     if (bin >= idx->n_bins) bin = 0;
                     while (1) {
@@ -3782,7 +3826,7 @@ void hts_itr_destroy(hts_itr_t *iter)
     }
 }
 
-static inline long long push_digit(long long i, char c)
+static inline unsigned long long push_digit(unsigned long long i, char c)
 {
     // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
     int digit = c - '0';
@@ -3791,7 +3835,7 @@ static inline long long push_digit(long long i, char c)
 
 long long hts_parse_decimal(const char *str, char **strend, int flags)
 {
-    long long n = 0;
+    unsigned long long n = 0;
     int digits = 0, decimals = 0, e = 0, lost = 0;
     char sign = '+', esign = '+';
     const char *s, *str_orig = str;
@@ -4405,11 +4449,12 @@ int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r)
                                     break;
 
                                 uint64_t max = iter->off[j].max;
-                                if ((max>>32) != tid)
+                                if ((max>>32) != tid) {
                                     tid = HTS_IDX_START; // => no range limit
-
-                                if (end < rl->intervals[max & 0xffffffff].end)
-                                    end = rl->intervals[max & 0xffffffff].end;
+                                } else {
+                                    if (end < rl->intervals[max & 0xffffffff].end)
+                                        end = rl->intervals[max & 0xffffffff].end;
+                                }
                                 if (v < iter->off[j].v)
                                     v = iter->off[j].v;
                                 j++;
@@ -5050,7 +5095,7 @@ int hts_resize_array_(size_t item_size, size_t num, size_t size_sz,
     return 0;
 }
 
-void hts_lib_shutdown()
+void hts_lib_shutdown(void)
 {
     hfile_shutdown(1);
 }
@@ -5064,7 +5109,7 @@ void hts_set_log_level(enum htsLogLevel level)
     hts_verbose = level;
 }
 
-enum htsLogLevel hts_get_log_level()
+enum htsLogLevel hts_get_log_level(void)
 {
     return hts_verbose;
 }
diff --git a/hts_expr.c b/hts_expr.c
index 5e5a132ea..dfd15b151 100644
--- a/hts_expr.c
+++ b/hts_expr.c
@@ -1,6 +1,6 @@
 /*  hts_expr.c -- filter expression parsing and processing.
 
-    Copyright (C) 2020-2022 Genome Research Ltd.
+    Copyright (C) 2020-2022, 2024 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
@@ -527,8 +527,10 @@ static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
             } else if (res->is_str || val.is_str) {
                 hts_expr_val_free(&val);
                 return -1;
+            } else {
+                res->is_true =
+                    (res->d = ((int64_t)res->d & (int64_t)val.d)) != 0;
             }
-            res->is_true = (res->d = ((int64_t)res->d & (int64_t)val.d)) != 0;
         } else {
             break;
         }
@@ -560,8 +562,10 @@ static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
             } else if (res->is_str || val.is_str) {
                 hts_expr_val_free(&val);
                 return -1;
+            } else {
+                res->is_true =
+                    (res->d = ((int64_t)res->d ^ (int64_t)val.d)) != 0;
             }
-            res->is_true = (res->d = ((int64_t)res->d ^ (int64_t)val.d)) != 0;
         } else {
             break;
         }
@@ -593,8 +597,10 @@ static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
             } else if (res->is_str || val.is_str) {
                 hts_expr_val_free(&val);
                 return -1;
+            } else {
+                res->is_true =
+                    (res->d = ((int64_t)res->d | (int64_t)val.d)) != 0;
             }
-            res->is_true = (res->d = ((int64_t)res->d | (int64_t)val.d)) != 0;
         } else {
             break;
         }
diff --git a/hts_internal.h b/hts_internal.h
index 191a55d16..52f29e6c1 100644
--- a/hts_internal.h
+++ b/hts_internal.h
@@ -87,6 +87,9 @@ typedef struct hts_cram_idx_t {
     struct cram_fd *cram;
 } hts_cram_idx_t;
 
+// Determine whether the string's contents appear to be UTF-16-encoded text.
+// Returns 1 if they are, 2 if there is also a BOM, or 0 otherwise.
+int hts_is_utf16_text(const kstring_t *str);
 
 // Entry point to hFILE_multipart backend.
 struct hFILE *hopen_htsget_redirect(struct hFILE *hfile, const char *mode);
@@ -120,18 +123,6 @@ const char *hts_plugin_path(void);
  */
 int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped);
 
-/*
- * bgzf analogue to hts_idx_amend_last.
- *
- * This is needed when multi-threading and writing indices on the fly.
- * At the point of writing a record we know the virtual offset for start
- * and end, but that end virtual offset may be the end of the current
- * block.  In standard indexing our end virtual offset becomes the start
- * of the next block.  Thus to ensure bit for bit compatibility we
- * detect this boundary case and fix it up here.
- */
-void bgzf_idx_amend_last(BGZF *fp, hts_idx_t *hidx, uint64_t offset);
-
 static inline int find_file_extension(const char *fn, char ext_out[static HTS_MAX_EXT_LEN])
 {
     const char *delim = fn ? strstr(fn, HTS_IDX_DELIM) : NULL, *ext;
diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh
index 48d0159c6..c9fc0a821 100755
--- a/hts_probe_cc.sh
+++ b/hts_probe_cc.sh
@@ -2,7 +2,7 @@
 
 # Check compiler options for non-configure builds and create Makefile fragment
 #
-#    Copyright (C) 2022-2023 Genome Research Ltd.
+#    Copyright (C) 2022-2024 Genome Research Ltd.
 #
 #    Author: Rob Davies <rmd@sanger.ac.uk>
 #
@@ -51,6 +51,12 @@ run_compiler ()
 # again with it to see if the flag is needed.
 run_test ()
 {
+    if [ $have_cpuid -ne 1 ] ; then
+        # Only test for and build SSE / AVX code if cpuid works as
+        # otherwise it won't be executed, even if present
+        echo "$3 ="
+        return
+    fi
     rm -f conftest conftest.err conftest.c
     cat - > conftest.c
     if run_compiler ; then
@@ -66,6 +72,27 @@ run_test ()
 
 echo "# Compiler probe results, generated by $0"
 
+# Check for cpuid
+rm -f conftest conftest.err conftest.c
+cat > conftest.c <<'EOF'
+#include <cpuid.h>
+#include <stddef.h>
+int main(int argc, char **argv) {
+    unsigned int a, b, c, d;
+    int level = __get_cpuid_max(0, NULL);
+    if (level > 0)
+        __cpuid_count(1, 0, a, b, c, d);
+    return 0;
+}
+EOF
+if run_compiler ; then
+    echo "HTS_HAVE_CPUID = 1"
+    have_cpuid=1
+else
+    echo "HTS_HAVE_CPUID ="
+    have_cpuid=0
+fi
+
 # Check for sse4.1 etc. support
 run_test "-msse4.1 -mpopcnt -mssse3" HTS_CFLAGS_SSE4 HTS_BUILD_SSE4 <<'EOF'
 #ifdef __x86_64__
@@ -104,7 +131,9 @@ run_test "-mavx512f -mpopcnt" HTS_CFLAGS_AVX512 HTS_BUILD_AVX512 <<'EOF'
 int main(int argc, char **argv) {
     __m512i a = _mm512_set1_epi32(1);
     __m512i b = _mm512_add_epi32(a, a);
-    return _mm_popcnt_u32(*((char *) &b));
+    __m256i c = _mm512_castsi512_si256(b);
+    __m256i d = _mm512_extracti64x4_epi64(a, 1);
+    return _mm_popcnt_u32(*((char *) &c)) + (*(char *) &d);
 }
 #else
 int main(int argc, char **argv) { return 0; }
diff --git a/htscodecs b/htscodecs
index ffda7310c..51794289a 160000
--- a/htscodecs
+++ b/htscodecs
@@ -1 +1 @@
-Subproject commit ffda7310c4b3292955561d6c3b1743cb82bfe26b
+Subproject commit 51794289ac47455209c333182b6768f99a613948
diff --git a/htsfile.1 b/htsfile.1
index 89a2fe446..e22fdbcda 100644
--- a/htsfile.1
+++ b/htsfile.1
@@ -1,4 +1,4 @@
-.TH htsfile 1 "15 April 2024" "htslib-1.20" "Bioinformatics tools"
+.TH htsfile 1 "12 September 2024" "htslib-1.21" "Bioinformatics tools"
 .SH NAME
 htsfile \- identify high-throughput sequencing data files
 .\"
diff --git a/htsfile.c b/htsfile.c
index 9af4ae31b..25af3f584 100644
--- a/htsfile.c
+++ b/htsfile.c
@@ -46,7 +46,7 @@ int show_headers = 1;
 int verbose = 0;
 int status = EXIT_SUCCESS;  /* Exit status from main */
 
-void error(const char *format, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...)
 {
     int err = errno;
     va_list args;
diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7
index 3bd868c71..44de65771 100644
--- a/htslib-s3-plugin.7
+++ b/htslib-s3-plugin.7
@@ -1,4 +1,4 @@
-.TH htslib-s3-plugin 7 "15 April 2024" "htslib-1.20" "Bioinformatics tools"
+.TH htslib-s3-plugin 7 "12 September 2024" "htslib-1.21" "Bioinformatics tools"
 .SH NAME
 htslib-s3-plugin \- htslib AWS S3 plugin
 .\"
diff --git a/htslib.map b/htslib.map
index e342f55b5..52ad738bb 100644
--- a/htslib.map
+++ b/htslib.map
@@ -640,3 +640,13 @@ HTSLIB_1.18 {
 HTSLIB_1.20 {
     tbx_conf_gaf;
 } HTSLIB_1.18;
+
+HTSLIB_1.21 {
+    cram_container_get_coords;
+    cram_container_num2offset;
+    cram_container_offset2num;
+    cram_filter_container;
+    cram_index_extents;
+    cram_num_containers;
+    cram_num_containers_between;
+} HTSLIB_1.20;
diff --git a/htslib/bgzf.h b/htslib/bgzf.h
index ea4ec3ece..87d4c6a3b 100644
--- a/htslib/bgzf.h
+++ b/htslib/bgzf.h
@@ -3,7 +3,7 @@
 /*
    Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
                  2011, 2012 Attractive Chaos <attractor@live.co.uk>
-   Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022-2023 Genome Research Ltd
+   Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022-2024 Genome Research Ltd
 
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
@@ -31,6 +31,7 @@
 #define HTSLIB_BGZF_H
 
 #include <stdint.h>
+#include <string.h>
 #include <sys/types.h>
 
 #include "hts_defs.h"
@@ -143,6 +144,26 @@ typedef struct BGZF BGZF;
     HTSLIB_EXPORT
     ssize_t bgzf_read(BGZF *fp, void *data, size_t length) HTS_RESULT_USED;
 
+/**
+ * bgzf_read optimised for small quantities, as a static inline
+ * See bgzf_read() normal function for return values.
+ */
+static inline ssize_t bgzf_read_small(BGZF *fp, void *data, size_t length) {
+    // A block length of 0 implies current block isn't loaded (see
+    // bgzf_seek_common).  That gives negative available so careful on types
+    if ((ssize_t)length < fp->block_length - fp->block_offset) {
+        // Short cut the common and easy mode
+        memcpy((uint8_t *)data,
+               (uint8_t *)fp->uncompressed_block + fp->block_offset,
+               length);
+        fp->block_offset += length;
+        fp->uncompressed_address += length;
+        return length;
+    } else {
+        return bgzf_read(fp, data, length);
+    }
+}
+
     /**
      * Write _length_ bytes from _data_ to the file.  If no I/O errors occur,
      * the complete _length_ bytes will be written (or queued for writing).
@@ -155,6 +176,24 @@ typedef struct BGZF BGZF;
     HTSLIB_EXPORT
     ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) HTS_RESULT_USED;
 
+/**
+ * bgzf_write optimised for small quantities, as a static inline
+ * See bgzf_write() normal function for return values.
+ */
+static inline
+ssize_t bgzf_write_small(BGZF *fp, const void *data, size_t length) {
+    if (fp->is_compressed
+        && (size_t) (BGZF_BLOCK_SIZE - fp->block_offset) > length) {
+        // Short cut the common and easy mode
+        memcpy((uint8_t *)fp->uncompressed_block + fp->block_offset,
+               data, length);
+        fp->block_offset += length;
+        return length;
+    } else {
+        return bgzf_write(fp, data, length);
+    }
+}
+
     /**
      * Write _length_ bytes from _data_ to the file, the index will be used to
      * decide the amount of uncompressed data to be written to each bgzip block.
diff --git a/htslib/cram.h b/htslib/cram.h
index e0b51839c..ddc44bbba 100644
--- a/htslib/cram.h
+++ b/htslib/cram.h
@@ -1,7 +1,7 @@
 /// @file htslib/cram.h
 /// CRAM format-specific API functions.
 /*
-    Copyright (C) 2015, 2016, 2018-2020, 2022-2023 Genome Research Ltd.
+    Copyright (C) 2015, 2016, 2018-2020, 2022-2024 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
@@ -209,6 +209,11 @@ HTSLIB_EXPORT
 int cram_container_is_empty(cram_fd *fd);
 
 
+/* Returns chromosome and start/span from container struct */
+HTSLIB_EXPORT
+void cram_container_get_coords(cram_container *c,
+                               int *refid, hts_pos_t *start, hts_pos_t *span);
+
 /*
  *-----------------------------------------------------------------------------
  * cram_block
@@ -329,6 +334,18 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out,
 HTSLIB_EXPORT
 int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice);
 
+/*
+ * Copies a container, but filtering it down to a specific region (as
+ * already specified in 'in').
+ *
+ * Returns 0 on success
+ *        -1 on EOF
+ *        -2 on error
+ */
+HTSLIB_EXPORT
+int cram_filter_container(cram_fd *in, cram_fd *out, cram_container *c,
+                          int *ref_id);
+
 /*
  * Decodes a CRAM block compression header.
  * Returns header ptr on success
@@ -744,6 +761,62 @@ static inline void sam_hdr_free(SAM_hdr *hdr) { sam_hdr_destroy(hdr); }
 HTSLIB_EXPORT
 refs_t *cram_get_refs(htsFile *fd);
 
+/*!
+ * Returns the file offsets of CRAM slices covering a specific region
+ * query.  Note both offsets are the START of the slice.
+ *
+ * first will point to the start of the first overlapping slice
+ * last will point to the start of the last overlapping slice
+ *
+ * @return
+ * Returns 0 on success
+ *        <0 on failure
+ */
+HTSLIB_EXPORT
+int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end,
+                       off_t *first, off_t *last);
+
+/*! Returns the total number of containers in the CRAM index.
+ *
+ * Note the index is not required to have an entry for every container, but it
+ * will always have an index entry for the start of each chromosome.
+ * (Although in practice our indices do contain one entry per container.)
+ *
+ * This is equivalent to cram_num_containers_between(fd, 0, 0, NULL, NULL)
+ */
+HTSLIB_EXPORT
+int64_t cram_num_containers(cram_fd *fd);
+
+/*! Returns the number of containers in the CRAM index within given offsets.
+ *
+ * The cstart and cend offsets are the locations of the start of containers
+ * as returned by index_container_offset.
+ *
+ * If non-NULL, first and last will hold the inclusive range of container
+ * numbers, counting from zero.
+ *
+ * @return
+ * Returns the number of containers, equivalent to *last-*first+1.
+ */
+HTSLIB_EXPORT
+int64_t cram_num_containers_between(cram_fd *fd,
+                                    off_t cstart, off_t cend,
+                                    int64_t *first, int64_t *last);
+
+/*! Returns the byte offset for the start of the n^th container.
+ *
+ * The index must have previously been loaded, otherwise <0 is returned.
+ */
+HTSLIB_EXPORT
+off_t cram_container_num2offset(cram_fd *fd, int64_t n);
+
+/*! Returns the container number for the first container at offset >= pos.
+ *
+ * The index must have previously been loaded, otherwise <0 is returned.
+ */
+HTSLIB_EXPORT
+int64_t cram_container_offset2num(cram_fd *fd, off_t pos);
+
 /**@}*/
 
 #ifdef __cplusplus
diff --git a/htslib/hts.h b/htslib/hts.h
index c5d99aba1..4f85424cf 100644
--- a/htslib/hts.h
+++ b/htslib/hts.h
@@ -489,7 +489,7 @@ const char *hts_version(void);
 // Immediately after release, bump ZZ to 90 to distinguish in-development
 // Git repository builds from the release; you may wish to increment this
 // further when significant features are merged.
-#define HTS_VERSION 102000
+#define HTS_VERSION 102100
 
 /*! @abstract Introspection on the features enabled in htslib
  *
@@ -1534,6 +1534,13 @@ static inline int hts_bin_bot(int bin, int n_lvls)
     return (bin - hts_bin_first(l)) << (n_lvls - l) * 3;
 }
 
+/// Compute the (0-based exclusive) maximum position covered by a binning index
+static inline hts_pos_t hts_bin_maxpos(int min_shift, int n_lvls)
+{
+    hts_pos_t one = 1;
+    return one << (min_shift + n_lvls * 3);
+}
+
 /**************
  * Endianness *
  **************/
diff --git a/htslib/hts_defs.h b/htslib/hts_defs.h
index e714e8fda..b5cded341 100644
--- a/htslib/hts_defs.h
+++ b/htslib/hts_defs.h
@@ -1,6 +1,6 @@
 /*  hts_defs.h -- Miscellaneous definitions.
 
-    Copyright (C) 2013-2015,2017, 2019-2020 Genome Research Ltd.
+    Copyright (C) 2013-2015,2017, 2019-2020, 2024 Genome Research Ltd.
 
     Author: John Marshall <jm18@sanger.ac.uk>
 
diff --git a/htslib/khash.h b/htslib/khash.h
index 4cea91020..02e4917c8 100644
--- a/htslib/khash.h
+++ b/htslib/khash.h
@@ -1,7 +1,7 @@
 /* The MIT License
 
    Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
-   Copyright (C) 2014-2015, 2018 Genome Research Ltd.
+   Copyright (C) 2014-2015, 2018, 2024 Genome Research Ltd.
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -356,7 +356,39 @@ static const double __ac_HASH_UPPER = 0.77;
 			__ac_set_isdel_true(h->flags, x);							\
 			--h->size;													\
 		}																\
-	}
+	}                                                                   \
+    SCOPE int kh_stats_##name(kh_##name##_t *h, khint_t *empty,         \
+                              khint_t *deleted, khint_t *hist_size,     \
+                              khint_t **hist_out)                       \
+    {                                                                   \
+        khint_t i, *hist = NULL, dist_max = 0, k, dist, step;           \
+        khint_t mask = h->n_buckets - 1;                                \
+        *empty = *deleted = *hist_size = 0;                             \
+        hist = (khint_t *) calloc(1, sizeof(*hist));                    \
+        if (!hist) { return -1; }                                       \
+        for (i = kh_begin(h); i < kh_end(h); ++i) {                     \
+            if (__ac_isempty(h->flags, i)) { (*empty)++; continue; }      \
+            if (__ac_isdel(h->flags, i)) { (*deleted)++; continue; }      \
+            k = __hash_func(h->keys[i]) & (h->n_buckets - 1);           \
+            dist = 0;                                                   \
+            step = 0;                                                   \
+            while (k != i) {                                            \
+                dist++;                                                 \
+                k = (k + (++step)) & mask;                              \
+            }                                                           \
+            if (dist_max <= dist) {                                     \
+                khint_t *new_hist = (khint_t *) realloc(hist, sizeof(*new_hist) * (dist + 1)); \
+                if (!new_hist) { free(hist); return -1; }               \
+                for (k = dist_max + 1; k <= dist; k++) new_hist[k] = 0; \
+                hist = new_hist;                                        \
+                dist_max = dist;                                        \
+            }                                                           \
+            hist[dist]++;                                               \
+        }                                                               \
+        *hist_out = hist;                                               \
+        *hist_size = dist_max + 1;                                      \
+        return 0;                                                       \
+    }
 
 #define KHASH_DECLARE(name, khkey_t, khval_t)		 					\
 	__KHASH_TYPE(name, khkey_t, khval_t) 								\
@@ -391,6 +423,7 @@ static const double __ac_HASH_UPPER = 0.77;
   @abstract     64-bit integer comparison function
  */
 #define kh_int64_hash_equal(a, b) ((a) == (b))
+
 /*! @function
   @abstract     const char* hash function
   @param  s     Pointer to a null terminated string
@@ -402,12 +435,28 @@ static kh_inline khint_t __ac_X31_hash_string(const char *s)
 	if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
 	return h;
 }
+
+/*! @function
+  @abstract     const char* FNV1a hash function
+  @param  s     Pointer to a null terminated string
+  @return       The hash value
+ */
+static kh_inline khint_t __ac_FNV1a_hash_string(const char *s)
+{
+	const khint_t offset_basis = 2166136261;
+	const khint_t FNV_prime = 16777619;
+	khint_t h = offset_basis;
+	for (; *s; ++s) h = (h ^ (uint8_t) *s) * FNV_prime;
+	return h;
+}
+
 /*! @function
   @abstract     Another interface to const char* hash function
   @param  key   Pointer to a nul terminated string [const char*]
   @return       The hash value [khint_t]
  */
-#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+#define kh_str_hash_func(key) __ac_FNV1a_hash_string(key)
+
 /*! @function
   @abstract     Const char* comparison function
  */
@@ -426,12 +475,29 @@ static kh_inline khint_t __ac_X31_hash_kstring(const kstring_t ks)
 		h = (h << 5) - h + (khint_t)ks.s[i];
 	return h;
 }
+
+/*! @function
+  @abstract     Kstring FNV1a hash function
+  @param  ks    The kstring to hash (passed by value)
+  @return       The hash value
+ */
+static kh_inline khint_t __ac_FNV1a_hash_kstring(const kstring_t ks)
+{
+	const khint_t offset_basis = 2166136261;
+	const khint_t FNV_prime = 16777619;
+	khint_t h = offset_basis;
+	size_t i;
+	for (i = 0; i < ks.l; i++)
+		h = (h ^ (uint8_t) ks.s[i]) * FNV_prime;
+	return h;
+}
+
 /*! @function
   @abstract     Interface to kstring hash function.
   @param  key   Pointer to a khash; permits hashing on non-nul terminated strings.
   @return       The hash value [khint_t]
  */
-#define kh_kstr_hash_func(key) __ac_X31_hash_kstring(key)
+#define kh_kstr_hash_func(key) __ac_FNV1a_hash_kstring(key)
 /*! @function
   @abstract     kstring comparison function
  */
@@ -604,6 +670,19 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key)
 		code;												\
 	} }
 
+/*! @function
+  @abstract  Gather hash table statistics
+  @param  name            Name of the hash table [symbol]
+  @param  h               Pointer to the hash table [khash_t(name)*]
+  @param  empty[out]      Number of empty hash bins
+  @param  deleted[out]    Number of hash bins with the deleted flag
+  @param  hist_size[out]  Size of @p hist array
+  @param  hist[out]       Probe count histogram
+  @return 0 on success; -1 on failure
+ */
+#define kh_stats(name, h, empty, deleted, hist_size, hist) \
+    kh_stats_##name(h, empty, deleted, hist_size, hist)
+
 /* More convenient interfaces */
 
 /*! @function
diff --git a/htslib/kstring.h b/htslib/kstring.h
index 53a19806d..ebb2f9363 100644
--- a/htslib/kstring.h
+++ b/htslib/kstring.h
@@ -1,7 +1,7 @@
 /* The MIT License
 
    Copyright (C) 2011 by Attractive Chaos <attractor@live.co.uk>
-   Copyright (C) 2013-2014, 2016, 2018-2020, 2022 Genome Research Ltd.
+   Copyright (C) 2013-2014, 2016, 2018-2020, 2022, 2024 Genome Research Ltd.
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -375,17 +375,63 @@ static inline int kputw(int c, kstring_t *s)
 
 static inline int kputll(long long c, kstring_t *s)
 {
-	char buf[32];
-	int i, l = 0;
-	unsigned long long x = c;
-	if (c < 0) x = -x;
-	do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
-	if (c < 0) buf[l++] = '-';
-	if (ks_resize(s, s->l + l + 2) < 0)
-		return EOF;
-	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
-	s->s[s->l] = 0;
-	return 0;
+    // Worst case expansion.  One check reduces function size
+    // and aids inlining chance.  Memory overhead is minimal.
+    if (ks_resize(s, s->l + 23) < 0)
+	return EOF;
+
+    unsigned long long x = c;
+    if (c < 0) {
+	x = -x;
+        s->s[s->l++] = '-';
+    }
+
+    if (x <= UINT32_MAX)
+	return kputuw(x, s);
+
+    static const char kputull_dig2r[] =
+        "00010203040506070809"
+        "10111213141516171819"
+        "20212223242526272829"
+        "30313233343536373839"
+        "40414243444546474849"
+        "50515253545556575859"
+        "60616263646566676869"
+        "70717273747576777879"
+        "80818283848586878889"
+        "90919293949596979899";
+    unsigned int l, j;
+    char *cp;
+
+    // Find out how long the number is (could consider clzll)
+    uint64_t m = 1;
+    l = 0;
+    if (sizeof(long long)==sizeof(uint64_t) && x >= 10000000000000000000ULL) {
+	// avoids overflow below
+	l = 20;
+    } else {
+	do {
+	    l++;
+	    m *= 10;
+	} while (x >= m);
+    }
+
+    // Add digits two at a time
+    j = l;
+    cp = s->s + s->l;
+    while (x >= 10) {
+        const char *d = &kputull_dig2r[2*(x%100)];
+        x /= 100;
+        memcpy(&cp[j-=2], d, 2);
+    }
+
+    // Last one (if necessary).  We know that x < 10 by now.
+    if (j == 1)
+        cp[0] = x + '0';
+
+    s->l += l;
+    s->s[s->l] = 0;
+    return 0;
 }
 
 static inline int kputl(long c, kstring_t *s) {
diff --git a/htslib/vcf.h b/htslib/vcf.h
index e60911ab5..9a36cab05 100644
--- a/htslib/vcf.h
+++ b/htslib/vcf.h
@@ -596,7 +596,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write().
     int bcf_hdr_append(bcf_hdr_t *h, const char *line);
 
     HTSLIB_EXPORT
-    int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...);
+    int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...)
+    HTS_FORMAT(HTS_PRINTF_FMT, 2, 3);
 
     /** VCF version, e.g. VCFv4.2 */
     HTSLIB_EXPORT
diff --git a/sam.c b/sam.c
index 1a5519410..7e58da6e7 100644
--- a/sam.c
+++ b/sam.c
@@ -104,7 +104,7 @@ const int8_t bam_cigar_table[256] = {
     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
 };
 
-sam_hdr_t *sam_hdr_init()
+sam_hdr_t *sam_hdr_init(void)
 {
     sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
     if (bh == NULL) return NULL;
@@ -421,7 +421,7 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
  *** BAM alignment I/O ***
  *************************/
 
-bam1_t *bam_init1()
+bam1_t *bam_init1(void)
 {
     return (bam1_t*)calloc(1, sizeof(bam1_t));
 }
@@ -431,7 +431,8 @@ int sam_realloc_bam_data(bam1_t *b, size_t desired)
     uint32_t new_m_data;
     uint8_t *new_data;
     new_m_data = desired;
-    kroundup32(new_m_data);
+    kroundup32(new_m_data); // next power of 2
+    new_m_data += 32; // reduces malloc arena migrations?
     if (new_m_data < desired) {
         errno = ENOMEM; // Not strictly true but we can't store the size
         return -1;
@@ -672,25 +673,36 @@ hts_pos_t bam_endpos(const bam1_t *b)
 static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
 {
     bam1_core_t *c = &b->core;
-    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
-    uint8_t *CG;
 
-    // test where there is a real CIGAR in the CG tag to move
-    if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
-    cigar0 = bam_get_cigar(b);
-    if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
-    fake_bytes = c->n_cigar * 4;
+    // Bail out as fast as possible for the easy case
+    uint32_t test_CG = BAM_CSOFT_CLIP | (c->l_qseq << BAM_CIGAR_SHIFT);
+    if (c->n_cigar == 0 || test_CG != *bam_get_cigar(b))
+        return 0;
+
+    // The above isn't foolproof - we may have old CIGAR tags that aren't used,
+    // but this is much less likely so do as a secondary check.
+    if (c->tid < 0 || c->pos < 0)
+        return 0;
+
+    // Do we have a CG tag?
+    uint8_t *CG = bam_aux_get(b, "CG");
     int saved_errno = errno;
-    CG = bam_aux_get(b, "CG");
     if (!CG) {
         if (errno != ENOENT) return -1;  // Bad aux data
         errno = saved_errno; // restore errno on expected no-CG-tag case
         return 0;
     }
+
+    // Now we start with the serious work migrating CG to CIGAR
+    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data,
+        *cigar0, CG_len, fake_bytes;
+    cigar0 = bam_get_cigar(b);
+    fake_bytes = c->n_cigar * 4;
     if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
         return 0; // not of type B,I
     CG_len = le_to_u32(CG + 2);
-    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
+    // don't move if the real CIGAR length is shorter than the fake cigar length
+    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0;
 
     // move from the CG tag to the right position
     cigar_st = (uint8_t*)cigar0 - b->data;
@@ -699,9 +711,12 @@ static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0
     CG_st = CG - b->data - 2;
     CG_en = CG_st + 8 + n_cigar4;
     if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
-    b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
-    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
-    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
+    // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
+    b->l_data = b->l_data - fake_bytes + n_cigar4;
+    // insert c->n_cigar-fake_bytes empty space to make room
+    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes));
+    // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
+    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4);
     if (ori_len > CG_en) // move data after the CG tag
         memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
     b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
@@ -763,27 +778,41 @@ int bam_read1(BGZF *fp, bam1_t *b)
 {
     bam1_core_t *c = &b->core;
     int32_t block_len, ret, i;
-    uint32_t x[8], new_l_data;
+    uint32_t new_l_data;
+    uint8_t tmp[32], *x;
 
     b->l_data = 0;
 
-    if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
+    if ((ret = bgzf_read_small(fp, &block_len, 4)) != 4) {
         if (ret == 0) return -1; // normal end-of-file
         else return -2; // truncated
     }
     if (fp->is_be)
         ed_swap_4p(&block_len);
     if (block_len < 32) return -4;  // block_len includes core data
-    if (bgzf_read(fp, x, 32) != 32) return -3;
-    if (fp->is_be) {
-        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
+    if (fp->block_length - fp->block_offset > 32) {
+        // Avoid bgzf_read and a temporary copy to a local buffer
+        x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
+        fp->block_offset += 32;
+    } else {
+        x = tmp;
+        if (bgzf_read(fp, x, 32) != 32) return -3;
     }
-    c->tid = x[0]; c->pos = (int32_t)x[1];
-    c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
+
+    c->tid        = le_to_u32(x);
+    c->pos        = le_to_i32(x+4);
+    uint32_t x2   = le_to_u32(x+8);
+    c->bin        = x2>>16;
+    c->qual       = x2>>8&0xff;
+    c->l_qname    = x2&0xff;
     c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
-    c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
-    c->l_qseq = x[4];
-    c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7];
+    uint32_t x3   = le_to_u32(x+12);
+    c->flag       = x3>>16;
+    c->n_cigar    = x3&0xffff;
+    c->l_qseq     = le_to_u32(x+16);
+    c->mtid       = le_to_u32(x+20);
+    c->mpos       = le_to_i32(x+24);
+    c->isize      = le_to_i32(x+28);
 
     new_l_data = block_len - 32 + c->l_extranul;
     if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
@@ -793,19 +822,20 @@ int bam_read1(BGZF *fp, bam1_t *b)
     if (realloc_bam_data(b, new_l_data) < 0) return -4;
     b->l_data = new_l_data;
 
-    if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
-    if (b->data[c->l_qname - 1] != '\0') { // Try to fix missing NUL termination
+    if (bgzf_read_small(fp, b->data, c->l_qname) != c->l_qname) return -4;
+    if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination
         if (fixup_missing_qname_nul(b) < 0) return -4;
     }
     for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
     c->l_qname += c->l_extranul;
     if (b->l_data < c->l_qname ||
-        bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
+        bgzf_read_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
         return -4;
     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
     if (bam_tag2cigar(b, 0, 0) < 0)
         return -4;
 
+    // TODO: consider making this conditional
     if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
         hts_pos_t rlen, qlen;
         bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
@@ -852,15 +882,15 @@ int bam_write1(BGZF *fp, const bam1_t *b)
     if (fp->is_be) {
         for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
         y = block_len;
-        if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
+        if (ok) ok = (bgzf_write_small(fp, ed_swap_4p(&y), 4) >= 0);
         swap_data(c, b->l_data, b->data, 1);
     } else {
-        if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
+        if (ok) ok = (bgzf_write_small(fp, &block_len, 4) >= 0);
     }
-    if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
-    if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
+    if (ok) ok = (bgzf_write_small(fp, x, 32) >= 0);
+    if (ok) ok = (bgzf_write_small(fp, b->data, c->l_qname - c->l_extranul) >= 0);
     if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
-        if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
+        if (ok) ok = (bgzf_write_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
     } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
         uint8_t buf[8];
         uint32_t cigar_st, cigar_en, cigar[2];
@@ -879,12 +909,12 @@ int bam_write1(BGZF *fp, const bam1_t *b)
         cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
         u32_to_le(cigar[0], buf);
         u32_to_le(cigar[1], buf + 4);
-        if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
-        if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
-        if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
+        if (ok) ok = (bgzf_write_small(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
+        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
+        if (ok) ok = (bgzf_write_small(fp, "CGBI", 4) >= 0); // write CG:B,I
         u32_to_le(c->n_cigar, buf);
-        if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
-        if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
+        if (ok) ok = (bgzf_write_small(fp, buf, 4) >= 0); // write the true CIGAR length
+        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
     }
     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
     return ok? 4 + block_len : -1;
@@ -2917,7 +2947,7 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
     } else c->tid = -1;
 
     // pos
-    c->pos = hts_str2uint(p, &p, 63, &overflow) - 1;
+    c->pos = hts_str2uint(p, &p, 62, &overflow) - 1;
     if (*p++ != '\t') goto err_ret;
     if (c->pos < 0 && c->tid >= 0) {
         _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
@@ -2960,15 +2990,16 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
         _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
     }
     // mpos
-    c->mpos = hts_str2uint(p, &p, 63, &overflow) - 1;
+    c->mpos = hts_str2uint(p, &p, 62, &overflow) - 1;
     if (*p++ != '\t') goto err_ret;
     if (c->mpos < 0 && c->mtid >= 0) {
         _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
         c->mtid = -1;
     }
     // tlen
-    c->isize = hts_str2int(p, &p, 64, &overflow);
+    c->isize = hts_str2int(p, &p, 63, &overflow);
     if (*p++ != '\t') goto err_ret;
+    _parse_err(overflow, "number outside allowed range");
     // seq
     q = _read_token(p);
     if (strcmp(q, "*")) {
@@ -4297,6 +4328,9 @@ static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
 
             fd->curr_bam = NULL;
             fd->curr_idx = 0;
+        // Consider prefetching next record?  I.e.
+        // } else {
+        //     __builtin_prefetch(&b_array[fd->curr_idx], 0, 3);
         }
 
         ret = 0;
diff --git a/sam_internal.h b/sam_internal.h
index b1fce9fe4..750c597b2 100644
--- a/sam_internal.h
+++ b/sam_internal.h
@@ -1,6 +1,6 @@
 /*  sam_internal.h -- internal functions; not part of the public API.
 
-    Copyright (C) 2019-2020 Genome Research Ltd.
+    Copyright (C) 2019-2020, 2023-2024 Genome Research Ltd.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,6 +25,7 @@ DEALINGS IN THE SOFTWARE.  */
 
 #include <errno.h>
 #include <stdint.h>
+
 #include "htslib/sam.h"
 
 #ifdef __cplusplus
@@ -68,7 +69,7 @@ static inline int possibly_expand_bam_data(bam1_t *b, size_t bytes) {
  * for (i = 0; i < len; i++)
  *    seq[i] = seq_nt16_str[bam_seqi(nib, i)];
  */
-static inline void nibble2base(uint8_t *nib, char *seq, int len) {
+static inline void nibble2base_default(uint8_t *nib, char *seq, int len) {
     static const char code2base[512] =
         "===A=C=M=G=R=S=V=T=W=Y=H=K=D=B=N"
         "A=AAACAMAGARASAVATAWAYAHAKADABAN"
@@ -98,6 +99,21 @@ static inline void nibble2base(uint8_t *nib, char *seq, int len) {
         seq[i] = seq_nt16_str[bam_seqi(nib, i)];
 }
 
+#if defined HAVE_ATTRIBUTE_CONSTRUCTOR && \
+    ((defined __x86_64__ && defined HAVE_ATTRIBUTE_TARGET && defined HAVE_BUILTIN_CPU_SUPPORT_SSSE3) || \
+     (defined __ARM_NEON))
+#define BUILDING_SIMD_NIBBLE2BASE
+#endif
+
+static inline void nibble2base(uint8_t *nib, char *seq, int len) {
+#ifdef BUILDING_SIMD_NIBBLE2BASE
+    extern void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len);
+    htslib_nibble2base(nib, seq, len);
+#else
+    nibble2base_default(nib, seq, len);
+#endif
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/samples/DEMO.md b/samples/DEMO.md
index 911792899..98c9981b8 100644
--- a/samples/DEMO.md
+++ b/samples/DEMO.md
@@ -88,18 +88,24 @@ alignment. It adds count of ATCGN base as an array in auxiliary data, BA:I.
 Modified data is written on standard output.
 
 Write_fast - This application showcases the fasta/fastq data write. It appends
-a dummy data to given file.
+data to given file.
 
 Index_write - This application showcases the creation of index along with
 output creation. Based on file type and shift, it creates bai, csi or crai
 files.
 
+Index_fast - This application showcases the index creation on fasta/fastq
+reference files.
+
 Read_reg - This application showcases the usage of region specification in
 alignment read.
 
-Read_multireg - This application showcases the usage of mulitple regionn
+Read_multireg - This application showcases the usage of multiple region
 specification in alignment read.
 
+Read_fast_index - This application showcases the fasta/fastq data read using
+index.
+
 Pileup - This application showcases the pileup api, where all alignments
 covering a reference position are accessed together. It displays the bases
 covering each position on standard output.
@@ -131,6 +137,15 @@ handling. It saves the read1 and read2 as separate files in given directory,
 one as sam and other as bam. A pool of 4 threads is created and shared for both
 read and write.
 
+Qtask_ordered - This application showcases the use of queues and threads for
+custom processing. Alignments in input file are updated with their GC ratio
+on a custom aux tag. The processing may occur in any order but the result is
+retrieved in same order as it was queued and saved to disk.
+
+Qtask_unordered - This application showcases the use of queues and threads
+for custom processing. The count of bases and GC ratio are calculated and
+displayed.  The order of counting is irrelevant and hence ordered retrieval is
+not used.
 
 ## Building the sample apps
 
@@ -173,7 +188,7 @@ sam_read1 api. samFile pointer, header and bam storage are to be passed as
 argument and it returns 0 on success, -1 on end of file and < -1 in case of
 errors.
 
-The bam storage has to be initialised using bam_init1 api before the call and
+The bam storage has to be initialized using bam_init1 api before the call and
 can be reused for successive reads. Once done, it needs to be destroyed using
 bam_destroy1.  The member field named core - bam1_core_t - in bam storage,
 bam1_t, has the sequence data in an easily accessible way. Using the fields
@@ -185,30 +200,31 @@ and macros, data can easily be read from it.
     {
         ...
         //initialize
-        if (!(bamdata = bam_init1())) {
-        ...
+        if (!(bamdata = bam_init1()))
+           ... // error
         //open input files - r reading
-        if (!(infile = sam_open(inname, "r"))) {
-        ...
+        if (!(infile = sam_open(inname, "r")))
+           ... // error
         //read header
-        if (!(in_samhdr = sam_hdr_read(infile))) {
-        ...
+        if (!(in_samhdr = sam_hdr_read(infile)))
+           ... // error
+
         //read data, check flags and update count
         while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
-            if (bamdata->core.flag & BAM_FREAD1) {
+            if (bamdata->core.flag & BAM_FREAD1)
                 cntread1++;
-            }
-        ...
+            ...
+
         //clean up
-        if (in_samhdr) {
+        if (in_samhdr)
             sam_hdr_destroy(in_samhdr);
-        }
-        if (infile) {
+
+        if (infile)
             sam_close(infile);
-        }
-        if (bamdata) {
+
+        if (bamdata)
             bam_destroy1(bamdata);
-        }
+
         return ret;
     }
 Refer: flags_demo.c
@@ -255,21 +271,23 @@ set the reference name in the alignment. It returns -ve value on error.
     int main(int argc, char *argv[])
     {
         ...
-        if (!(infile = sam_open(inname, "r"))) {
-        ...
+        if (!(infile = sam_open(inname, "r")))
+           ... // error
         outfile1 = sam_open(file1, "w");            //as SAM
         outfile2 = sam_open(file2, "wb");           //as BAM
         ...
-        if (!(in_samhdr = sam_hdr_read(infile))) {
-        ...
+        if (!(in_samhdr = sam_hdr_read(infile)))
+           ... // error
+
         //write header
         if ((sam_hdr_write(outfile1, in_samhdr) == -1) ||
-         (sam_hdr_write(outfile2, in_samhdr) == -1)) {
-        ...
+         (sam_hdr_write(outfile2, in_samhdr) == -1))
+           ... // error
+
         while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
             if (bamdata->core.flag & BAM_FREAD1) {
                 if (sam_write1(outfile1, in_samhdr, bamdata) < 0) {
-        ...
+                    ... // error
     }
 Refer: split.c
 
@@ -284,10 +302,11 @@ Below code excerpt shows sam_open_mode api usage.
         ...
         //set file open mode based on file name for 1st and as explicit for 2nd
         if ((sam_open_mode(mode1+1, file1, NULL) == -1) ||
-         (sam_open_mode(mode2+1, file2, "sam.gz") == -1)) {
-        ...
-        if (!(infile = sam_open(inname, "r"))) {
-        ...
+         (sam_open_mode(mode2+1, file2, "sam.gz") == -1))
+           ... // error
+        if (!(infile = sam_open(inname, "r")))
+           ... // error
+
         //open output files
         outfile1 = sam_open(file1, mode1);                          //as compressed SAM through sam_open
         outfile2 = sam_open_format(file2, mode2, NULL);             //as compressed SAM through sam_open_format
@@ -321,7 +340,7 @@ api and used with sam_open_format api to create appropriate CRAM file.
             hts_parse_format(&fmt2, reffmt2) == -1 ||               //embed the reference internally
             hts_parse_format(&fmt3, "cram,embed_ref=2") == -1 ||    //embed autogenerated reference
             hts_parse_format(&fmt4, "cram,no_ref=1") == -1) {       //no reference data encoding at all
-    ...
+       ... // error
     outfile1 = sam_open_format(file1, "wc", &fmt1); outfile2 = sam_open_format(file2, "wc", &fmt2);
     ...
 Refer: cram.c
@@ -337,16 +356,20 @@ or explicit format text. This mode buffer can be used with sam_open or can be
 used with sam_open_format with explicit format information in htsFormat
 structure.
 
+The FASTA format is mainly used to store the reference data.
+
     ...
-    if (!(bamdata = bam_init1())) {
-    ...
-    if (!(infile = sam_open(inname, "r"))) {
-    ...
-    if (infile->format.format != fasta_format && infile->format.format != fastq_format) {
-    ...
-    if (!(in_samhdr = sam_hdr_read(infile))) {
-    ...
-    while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+    if (!(bamdata = bam_init1()))
+      ... // error
+    if (!(infile = sam_open(inname, "r")))
+       ... // error
+    if (infile->format.format != fasta_format && infile->format.format != fastq_format)
+       ... // error
+    if (!(in_samhdr = sam_hdr_read(infile)))
+       ... // error
+
+    while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+        ...
         printf("\nsequence: ");
         for (c = 0; c < bamdata->core.l_qseq; ++c) {
             printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(bamdata), c)]);
@@ -354,23 +377,22 @@ structure.
         if (infile->format.format == fastq_format) {
             printf("\nquality: ");
             for (c = 0; c < bamdata->core.l_qseq; ++c) {
-                printf("%c", bam_get_qual(bamdata)[c]);
+                printf("%c", bam_get_qual(bamdata)[c] + 33);
     ...
 Refer: read_fast.c
 
     ...
     char mode[4] = "a";
     ...
-    if (sam_open_mode(mode + 1, outname, NULL) < 0) {
-    ...
-    if (!(outfile = sam_open(outname, mode))) {
-    ...
-    if (bam_set1(bamdata, sizeof("test"), "test", BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, 10, "AACTGACTGA", "1234567890", 0)
-     < 0) {
-    ...
+    if (sam_open_mode(mode + 1, outname, NULL) < 0)
+       ... // error
+    if (!(outfile = sam_open(outname, mode)))
+       ... // error
+    if (bam_set1(bamdata, strlen(name), name, BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, strlen(data), data, qual, 0) < 0)
+       ... // error
     if (sam_write1(outfile, out_samhdr, bamdata) < 0) {
         printf("Failed to write data\n");
-    ...
+        ...
 Refer: write_fast.c
 
 
@@ -388,18 +410,21 @@ line can be retrieved using sam_hdr_find_line_pos or sam_hdr_line_id with
 position and unique identifier values respectively.
 
     ...
-    if (!(in_samhdr = sam_hdr_read(infile))) {
-    ...
-            ret = sam_hdr_find_tag_id(in_samhdr, header, id, idval, tag, &data);
+    if (!(in_samhdr = sam_hdr_read(infile)))
+        ... // error
     ...
-            ret = sam_hdr_find_line_id(in_samhdr, header, id, idval, &data);
+      if (tag)
+          ret = sam_hdr_find_tag_id(in_samhdr, header, id, idval, tag, &data);
+      else
+          ret = sam_hdr_find_line_id(in_samhdr, header, id, idval, &data);
     ...
         linecnt = sam_hdr_count_lines(in_samhdr, header);
-    ...
-            ret = sam_hdr_find_tag_pos(in_samhdr, header, c, tag, &data);
-    ...
-            ret = sam_hdr_find_line_pos(in_samhdr, header, c, &data);
-    ...
+        ...
+            if (tag)
+                ret = sam_hdr_find_tag_pos(in_samhdr, header, c, tag, &data);
+            else
+                ret = sam_hdr_find_line_pos(in_samhdr, header, c, &data);
+        ...
 Refer: read_header.c
 
 This will show the VN tag's value from HD header.
@@ -417,16 +442,19 @@ Below code excerpt shows the reference names which has length above given value.
     ...
     //iterate and check each reference's length
     for (pos = 1, c = 0; c < linecnt; ++c) {
-        if ((ret = sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "LN", &data) == -2)) {
-    ...
+        if ((ret = sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "LN", &data)) == -2)
+            ... // error
+
         size = atoll(data.s);
         if (size < minsize) {
             //not required
             continue;
         }
-        if (!(id = sam_hdr_line_name(in_samhdr, "SQ", c))) {
-            //sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "SN", &data) can also do the same!
-    ...
+
+        //sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "SN", &data) can also do the same!
+        if (!(id = sam_hdr_line_name(in_samhdr, "SQ", c)))
+            ... // error
+
         printf("%d,%s,%s\n", pos, id, data.s);
     ...
 Refer: read_refname.c
@@ -465,8 +493,8 @@ indexing the seq_nt16_str array.
         printf("MQUAL: %d\n", bamdata->core.qual);                              //map quality value
         cigar = bam_get_cigar(bamdata);                                         //retrieves the cigar data
         for (i = 0; i < bamdata->core.n_cigar; ++i) {                           //no. of cigar data entries
-            printf("%d%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i]));   //the macros gives the count of operation
-             and the symbol of operation for given cigar entry
+            printf("%d%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i]));
+            //the macros give the count of operation and the symbol of operation for given cigar entry
         }
         printf("\nTLEN/ISIZE: %"PRIhts_pos"\n", bamdata->core.isize);
         data = bam_get_seq(bamdata);
@@ -475,8 +503,8 @@ indexing the seq_nt16_str array.
         ...
         for (i = 0; i < bamdata->core.l_qseq ; ++i) {       //sequence length
             printf("%c", seq_nt16_str[bam_seqi(data, i)]);  //retrieves the base from (internal compressed) sequence data
-        ...
-            printf("%c", bam_get_qual(bamdata)[i]+33);      //retrives the quality value
+            ...
+            printf("%c", bam_get_qual(bamdata)[i]+33);      //retrieves the quality value
         ...
 Refer: read_bam.c
 
@@ -516,15 +544,13 @@ given position of the array.
 
     ...
     while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
-        if (i % 2) {    //use options alternatively to demonstrate both
-            //option 1 - get data as string with tag and type
-            if ((c = bam_aux_get_str(bamdata, tag, &sdata)) == 1) {
-                printf("%s\n",sdata.s);
-    ...
-            //option 2 - get raw data
-            if (!(data = bam_aux_get(bamdata, tag))) {
-    ...
-                if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) {
+        //option 1 - get data as string with tag and type
+        if ((c = bam_aux_get_str(bamdata, tag, &sdata)) == 1) {
+            printf("%s\n",sdata.s);
+        ...
+        //option 2 - get raw data
+        if ((data = bam_aux_get(bamdata, tag)) != NULL) {
+            printauxdata(stdout, bam_aux_type(data), -1, data);
     ...
 Refer: read_aux.c
 
@@ -539,8 +565,8 @@ Shows the MD aux tag from alignments.
             printf("%.2s:%c:", bam_aux_tag(data), NULL != strchr("cCsSiI", bam_aux_type(data)) ? 'i' : bam_aux_type(data));
               //macros gets the tag and type of aux data
             //dump the data
-            if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) {
-    ...
+            printauxdata(stdout, bam_aux_type(data), -1, data);
+            ...
             data = bam_aux_next(bamdata, data);                                     //get the next aux data
     ...
 Refer: dump_aux.c
@@ -563,19 +589,22 @@ sam_hdr_write api does the write of the header data to file.
 
     ...
     //add SQ line with SN as TR1 and TR2
-    if (sam_hdr_add_lines(in_samhdr, &sq[0], 0)) {                                      //length as 0 for NULL terminated data
-    ...
+    if (sam_hdr_add_lines(in_samhdr, &sq[0], 0))                                        //length as 0 for NULL terminated data
+        ... // error
+
     //add RG line with ID as RG1
-    if (sam_hdr_add_line(in_samhdr, "RG", "ID", "RG1", "LB", "Test", "SM", "S1", NULL)) {
-    ...
-    //add pg line
-    if (sam_hdr_add_pg(in_samhdr, "add_header", "VN", "Test", "CL", data.s, NULL)) {    //NULL is to indicate end of args
-    ...
-    if (sam_hdr_add_line(in_samhdr, "CO", "Test data", NULL)) {                         //NULL is to indicate end of args
-    ...
+    if (sam_hdr_add_line(in_samhdr, "RG", "ID", "RG1", "LB", "Test", "SM", "S1", NULL))
+        ... // error
+
+    //add PG/CO lines
+    if (sam_hdr_add_pg(in_samhdr, "add_header", "VN", "Test", "CL", data.s, NULL))      //NULL is to indicate end of args
+        ... // error
+    if (sam_hdr_add_line(in_samhdr, "CO", "Test data", NULL))                           //NULL is to indicate end of args
+        ... // error
+
     //write output
-    if (sam_hdr_write(outfile, in_samhdr) < 0) {
-    ...
+    if (sam_hdr_write(outfile, in_samhdr) < 0)
+        ... // error
 Refer: add_header.c
 
 Not all type of header data can be removed but where it is possible, either a
@@ -585,14 +614,14 @@ to be used. To remove all lines of a type, header type and unique identifier
 field tag are to be used.
 
     ...
-        //remove specific line
-        if (sam_hdr_remove_line_id(in_samhdr, header, id, idval)) {
-    ...
-        //remove multiple lines of a header type
-        if (sam_hdr_remove_lines(in_samhdr, header, id, NULL)) {
-    ...
-    if (sam_hdr_write(outfile, in_samhdr) < 0) {
-    ...
+
+    //remove specific line
+    if (sam_hdr_remove_line_id(in_samhdr, header, id, idval) < 0)
+        ... // error
+
+    //remove multiple lines of a header type
+    if (sam_hdr_remove_lines(in_samhdr, header, id, NULL) < 0)
+        ... // error
 Refer: rem_header.c
 
 Shows the file content after removing SQ line with SN 2.
@@ -640,13 +669,12 @@ be easier than update of existing record.
             break;
             case 3:// RNAME
             case 7:// RNEXT
-                if ((ret = sam_hdr_name2tid(in_samhdr, val)) < 0) {
-    ...
+                if ((ret = sam_hdr_name2tid(in_samhdr, val)) < 0)
+                    ... // error
                 if (field == 3) {
                     //reference
                     bamdata->core.tid = ret;
-                }
-                else {
+                } else {
                     //mate reference
                     bamdata->core.mtid = ret;
                 }
@@ -659,20 +687,21 @@ be easier than update of existing record.
             break;
             case 6:// CIGAR
             {
-    ...
+                ...
                 //get cigar array and set all data in new bam record
-                if ((ncigar = sam_parse_cigar(val, NULL, &cigar, &size)) < 0) {
-    ...
+                if ((ncigar = sam_parse_cigar(val, NULL, &cigar, &size)) < 0)
+                    ... // error
                 if (bam_set1(newbam, bamdata->core.l_qname, bam_get_qname(bamdata), bamdata->core.flag, bamdata->core.tid,
                  bamdata->core.pos, bamdata->core.qual, ncigar, cigar, bamdata->core.mtid, bamdata->core.mpos,
                   bamdata->core.isize, bamdata->core.l_qseq, (const char*)bam_get_seq(bamdata),
-                   (const char*)bam_get_qual(bamdata), bam_get_l_aux(bamdata)) < 0) {
-    ...
+                   (const char*)bam_get_qual(bamdata), bam_get_l_aux(bamdata)) < 0)
+                    ... // error
+
                 //correct sequence data as input is expected in ascii format and not as compressed inside bam!
                 memcpy(bam_get_seq(newbam), bam_get_seq(bamdata), (bamdata->core.l_qseq + 1) / 2);
                 //copy the aux data
                 memcpy(bam_get_aux(newbam), bam_get_aux(bamdata), bam_get_l_aux(bamdata));
-    ...
+            ...
             break;
             case 8:// PNEXT
                 bamdata->core.mpos = atoll(val);
@@ -681,18 +710,16 @@ be easier than update of existing record.
                 bamdata->core.isize = atoll(val);
             break;
             case 10:// SEQ
-    ...
+                ...
                 for( c = 0; c < i; ++c) {
                     bam_set_seqi(bam_get_seq(bamdata), c, seq_nt16_table[(unsigned char)val[c]]);
                 }
             break;
             case 11:// QUAL
-    ...
-                for (c = 0; c < i; ++c) {
+                ...
+                for (c = 0; c < i; ++c)
                     val[c] -= 33;               //phred score from ascii value
-                }
                 memcpy(bam_get_qual(bamdata), val, i);
-    ...
 Refer: mod_bam.c
 
 Shows data with RNAME modified to T2.
@@ -707,33 +734,32 @@ present at all, it can be appended using bam_aux_append.
     //matched to qname, update aux
     if (!(data = bam_aux_get(bamdata, tag))) {
         //tag not present append
-    ...
-        if (bam_aux_append(bamdata, tag, type, length, (const uint8_t*)val)) {
-    ...
-    else {
-        char auxtype = bam_aux_type(data);
+        ... // cut: computed length and val based on tag type
+        if (bam_aux_append(bamdata, tag, type, length, (const uint8_t*)val))
+            ... // error
+    } else {
         //update the tag with newer value
+        char auxtype = bam_aux_type(data);
         switch (type) {
             case 'f':
             case 'd':
-    ...
-                if (bam_aux_update_float(bamdata, tag, atof(val))) {
-    ...
+                ...
+                if (bam_aux_update_float(bamdata, tag, atof(val)))
+                    ... // error
             case 'C':
             case 'S':
             case 'I':
-    ...
-                if (bam_aux_update_int(bamdata, tag, atoll(val))) {
-    ...
+                ...
+                if (bam_aux_update_int(bamdata, tag, atoll(val)))
+                    ... // error
             case 'Z':
-    ...
-                if (bam_aux_update_str(bamdata, tag, length, val)) {
-    ...
+                ...
+                if (bam_aux_update_str(bamdata, tag, length, val))
+                    ... // error
             case 'A':
-    ...
+                ...
                 //update the char data directly on buffer
                 *(data+1) = val[0];
-    ...
 Refer: mod_aux.c
 
 Shows the given record's MD tag set to Test.
@@ -743,12 +769,14 @@ Shows the given record's MD tag set to Test.
 The array aux fields can be updated using bam_aux_update_array api.
 
     ...
-    if (bam_aux_update_array(bamdata, "BA", 'I', sizeof(cnt)/sizeof(cnt[0]), cnt)) {
-    ...
+    if (bam_aux_update_array(bamdata, "BA", 'I', sizeof(cnt)/sizeof(cnt[0]), cnt))
+        ... // error
 Refer: mod_aux_ba.c
 
 Shows the records updated with an array of integers, containing count of ACGT
-and N in that order.
+and N in that order. The bases are decoded before counting for the sake of
+simplicity. Refer to qtask_ordered.c for a better counting approach where
+decoding is done outside the loop.
 
     ./mod_aux_ba samtools/test/mpileup/mpileup.1.bam
 
@@ -761,14 +789,14 @@ can be read easily. There are different type of indices, BAI, CSI, CRAI, TBI,
 FAI etc. and are usually used with iterators.
 
 Indexing of plain/textual files are not supported, compressed SAM&FASTA/Q, BAM,
-and CRAM files can be indexed. CRAM files are indexed as .crai and the other two
-can be indexed as .bai or .csi files. Each of these types have different
-internal representations of the index information. Bai uses a fixed
-configuration values where as csi has them dynamically updated based on the
-alignment data.
+and CRAM files can be indexed. CRAM files are indexed as .crai and the others
+as .bai, .csi, .fai etc. Each of these types has a different internal
+representation of the index information. Bai uses fixed configuration values
+whereas csi has them dynamically updated based on the alignment data.
 
 Indexes can be created either with save of alignment data or explicitly by
-read of existing alignment file.
+read of existing alignment file for alignment data (SAM/BAM/CRAM). For reference
+data (FASTA), the index has to be created explicitly.
 
 To create index along with alignment write, the sam_idx_init api need to be
 invoked before the start of alignment data write. This api takes the output
@@ -777,16 +805,17 @@ index, the min shift has to be 0.
 
 At the end of write, sam_idx_save api need to be invoked to save the index.
 
-    //write header
-    if (sam_hdr_write(outfile, in_samhdr)) {
     ...
+    //write header
+    if (sam_hdr_write(outfile, in_samhdr))
+        ... // error
     // initialize indexing, before start of write
-    if (sam_idx_init(outfile, in_samhdr, size, fileidx)) {
-    ...
-        if (sam_write1(outfile, in_samhdr, bamdata) < 0) {
-    ...
-    if (sam_idx_save(outfile)) {
-    ...
+    if (sam_idx_init(outfile, in_samhdr, size, fileidx))
+        ... // error
+        if (sam_write1(outfile, in_samhdr, bamdata) < 0)
+            ... // error
+    if (sam_idx_save(outfile))
+        ... // error
 Refer:index_write.c
 
 Creates mpileup.1.bam and mpileup.1.bam.bai in /tmp/.
@@ -803,6 +832,20 @@ The sam_index_build2 api takes the index file path as well and gives more
 control than the previous one.  The sam_index_build3 api provides an option to
 configure the number of threads in index creation.
 
+Index for reference data can be created using fai_build3 api. This creates an
+index file with .fai extension. If the file is bgzip-ped, a .gzi file is
+created as well. It takes the path to the input file and those of the fai and
+gzi files. When the fai/gzi paths are NULL, they are created alongside the
+input file. These index files will be useful for reference data access.
+
+    ...
+    if (fai_build3(filename, NULL, NULL) == -1)
+        ... // error
+Refer: index_fast.c
+
+A tabix index can be created for compressed vcf/sam/bed and other data using
+tbx_index_build. It is mainly used with vcf and non-sam type files.
+
 
 ### Read with iterators
 
@@ -849,18 +892,19 @@ sam_itr_destroy and hts_idx_destroy apis does this.
 
     ...
     //load index file
-    if (!(idx = sam_index_load2(infile, inname, idxfile))) {
-    ...
+    if (!(idx = sam_index_load2(infile, inname, idxfile)))
+        ... // error
     //create iterator
-    if (!(iter = sam_itr_querys(idx, in_samhdr, region))) {
-    ...
+    if (!(iter = sam_itr_querys(idx, in_samhdr, region)))
+        ... // error
+
     //read using iterator
-    while ((c = sam_itr_next(infile, iter, bamdata)) >= 0) {
-    ...
-    if (iter) {
+    while ((c = sam_itr_next(infile, iter, bamdata)) >= 0)
+        ... // process bamdata
+
+    if (iter)
         sam_itr_destroy(iter);
-    }
-    if (idx) {
+    if (idx)
         hts_idx_destroy(idx);
     ...
 Refer:index_reg_read.c
@@ -891,19 +935,20 @@ itself.
 
     ...
     //load index file, assume it to be present in same location
-    if (!(idx = sam_index_load(infile, inname))) {
-    ...
+    if (!(idx = sam_index_load(infile, inname)))
+        ... // error
     //create iterator
-    if (!(iter = sam_itr_regarray(idx, in_samhdr, regions, regcnt))) {
-    ...
+    if (!(iter = sam_itr_regarray(idx, in_samhdr, regions, regcnt)))
+        ... // error
     if (regions) {
         //can be freed as it is no longer required
         free(regions);
         regions = NULL;
     }
+
     //get required area
-    while ((c = sam_itr_multi_next(infile, iter, bamdata) >= 0)) {
-    ...
+    while ((c = sam_itr_multi_next(infile, iter, bamdata)) >= 0)
+        ... // process bamdata
 Refer:index_multireg_read.c
 
 With compressed sample.sam and 2 regions from reference T1 (30 to 32) and 1
@@ -921,13 +966,70 @@ hts_idx_destroy. The hts_reglist_t* array passed is destroyed by the library
 on iterator destroy. The regions array (array of char array/string) needs to be
 destroyed by the user itself.
 
+For fasta/fastq files, the index has to be loaded using fai_load3_format which
+takes the file, index file names and format. With single region specification
+fai_fetch64 can be used to get bases, and fai_fetchqual64 for quality in case
+of fastq data. With multiple region specification, with comma separation,
+faidx_fetch_seq64 and faidx_fetch_qual64 do the job. Regions have to be parsed
+using fai_parse_region in case of multiregion specifications. fai_adjust_region
+is used to adjust the start-end points based on available data.
+
+Below excerpt shows fasta/q access with single and multiregions,
+
+    ...
+    //load index
+    if (!(idx = fai_load3_format(inname, NULL, NULL, FAI_CREATE, fmt)))
+        ... // error
+
+    ...
+    if (!usemulti) {
+        //get data from single given region
+        if (!(data = fai_fetch64(idx, region, &len)))
+            ... // region not found
+
+        printf("Data: %"PRId64" %s\n", len, data);
+        free((void*)data);
+        //get quality for fastq type
+        if (fmt == FAI_FASTQ) {
+            if (!(data = fai_fetchqual64(idx, region, &len)))
+                ... // region not found
+        ...
+
+    } else { // usemulti
+        //parse, get each region and get data for each
+        while ((remaining = fai_parse_region(idx, region, &tid, &beg, &end, HTS_PARSE_LIST))) {     //here expects regions as csv
+            //parsed the region, correct end points based on actual data
+            if (fai_adjust_region(idx, tid, &beg, &end) == -1)
+                ... // error
+            //get data for given region
+            if (!(data = faidx_fetch_seq64(idx, faidx_iseq(idx, tid), beg, end, &len)))
+                ... // region not found
+
+            printf("Data: %"PRIhts_pos" %s\n", len, data);
+            free((void*)data);
+            data = NULL;
+            //get quality data for fastq
+            if (fmt == FAI_FASTQ) {
+                if (!(data = faidx_fetch_qual64(idx, faidx_iseq(idx, tid), beg, end, &len)))
+                    ... // error
+                printf("Qual: %"PRIhts_pos" %s\n", len, data);
+                free((void*)data);
+            ...
+            region = remaining;                                     //parse remaining region defs
+
+    ...
+    if (idx) {
+        fai_destroy(idx);
+    ...
+Refer: read_fast_index.c
+
 
 ### Pileup and MPileup
 
 Pileup shows the transposed view of the SAM alignment data, i.e. it shows the
-the reference positions and bases which cover that position through different
-reads side by side. MPileup facilitates the piling up of multiple sam files
-against each other and same reference at the same time.
+reference positions and bases which cover that position through different reads
+side by side. MPileup facilitates the piling up of multiple sam files against
+each other and same reference at the same time.
 
 Mpileup has replaced the pileup. The input expects the data to be sorted by
 position.
@@ -978,8 +1080,8 @@ above the cache limit are discarded.
 Once done, the pileup iterator to be discarded by sam_plp_destroy api.
 
     ...
-    if (!(plpiter = bam_plp_init(readdata, &conf))) {
-    ...
+    if (!(plpiter = bam_plp_init(readdata, &conf)))
+        ... // error
     //set constructor destructor callbacks
     bam_plp_constructor(plpiter, plpconstructor);
     bam_plp_destructor(plpiter, plpdestructor);
@@ -1011,7 +1113,7 @@ Once done, the pileup iterator to be discarded by sam_plp_destroy api.
                     printf("?");
                 }
     ...
-    if (plpiter) {
+    if (plpiter)
         bam_plp_destroy(plpiter);
     ...
 Refer:pileup.c
@@ -1067,8 +1169,8 @@ above the cache limit are discarded.
 Once done, the pileup iterator to be discarded by sam_mplp_destroy api.
 
     ...
-    if (!(mplpiter = bam_mplp_init(argc - 1, readdata, (void**) conf))) {
-    ...
+    if (!(mplpiter = bam_mplp_init(argc - 1, readdata, (void**) conf)))
+        ... // error
     //set constructor destructor callbacks
     bam_mplp_constructor(mplpiter, plpconstructor);
     bam_mplp_destructor(mplpiter, plpdestructor);
@@ -1134,13 +1236,13 @@ end of processing, the state need to be released using hts_base_mod_state_free
 api.
 
     ...
-    if (!(ms = hts_base_mod_state_alloc())) {
-    ...
+    if (!(ms = hts_base_mod_state_alloc()))
+        ... // error
     while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0)
     {
-    ...
-        if (bam_parse_basemod(bamdata, ms)) {
-    ...
+        ...
+        if (bam_parse_basemod(bamdata, ms))
+            ... // error
         bm = bam_mods_recorded(ms, &cnt);
         for (k = 0; k < cnt; ++k) {
             printf("%c", bm[k]);
@@ -1191,7 +1293,7 @@ api.
             }
         }
     ...
-    if (ms) {
+    if (ms)
         hts_base_mod_state_free(ms);
     ...
 Refer:modstate.c
@@ -1221,7 +1323,7 @@ api.
     {
     ...
     if (!(plpiter = bam_plp_init(readdata, &conf))) {
-    ...
+        ... // error
     //set constructor destructor callbacks
     bam_plp_constructor(plpiter, plpconstructor);
     bam_plp_destructor(plpiter, plpdestructor);
@@ -1238,11 +1340,11 @@ api.
             }
             /*invoke bam mods_mods_at_qpos before bam_plp_insertion_mod that the base modification
             is retrieved before change in pileup pos thr' plp_insertion_mod call*/
-            if ((modlen = bam_mods_at_qpos(plp[j].b, plp[j].qpos, plp[j].cd.p, mods, NMODS)) == -1) {
-    ...
+            if ((modlen = bam_mods_at_qpos(plp[j].b, plp[j].qpos, plp[j].cd.p, mods, NMODS)) == -1)
+                ... // error
             //use plp_insertion/_mod to get insertion and del at the same position
-            if ((inslen = bam_plp_insertion_mod(&plp[j], (hts_base_mod_state*)plp[j].cd.p, &insdata, &dellen)) == -1) {
-    ...
+            if ((inslen = bam_plp_insertion_mod(&plp[j], (hts_base_mod_state*)plp[j].cd.p, &insdata, &dellen)) == -1)
+                ... // error
             //start and end are displayed in UPPER and rest on LOWER, only 1st modification considered
             //base and modification
             printf("%c%c%c", plp[j].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) :
@@ -1260,7 +1362,7 @@ api.
                     printf("-%d", dellen);
                     for (k = 0; k < dellen; ++k) {
                         printf("?");
-    ...
+                ...
             else if (plp[j].indel < 0) {
                 //deletion
                 printf("%d", plp[j].indel);
@@ -1285,17 +1387,18 @@ data and a combination of flags for the required fields can be passed with
 CRAM_OPT_REQUIRED_FIELDS to this api.
 
     ...
-       //select required field alone, this is useful for CRAM alone
-       if (hts_set_opt(infile, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG) < 0) {
-    ...
-       //read header
-       in_samhdr = sam_hdr_read(infile);
+    //select required field alone, this is useful for CRAM alone
+    if (hts_set_opt(infile, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG) < 0)
+        ... // error
+
+    //read header
+    in_samhdr = sam_hdr_read(infile);
     ...
     //read data, check flags and update count
     while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
-        if (bamdata->core.flag & BAM_FREAD1) {
+        if (bamdata->core.flag & BAM_FREAD1)
             cntread1++;
-    ...
+        ...
 Refer: flags_htsopt_field.c
 
 
@@ -1303,48 +1406,248 @@ Refer: flags_htsopt_field.c
 
 The HTSLib api supports thread pooling for better performance. There are a few
 ways in which this can be used. The pool can be made specific for a file or a
-generic pool can be created and shared across multiple files. Another way to
-use thread pool is to schedule tasks explicitly to queues which gets executed
-using threads in pool.
+generic pool can be created and shared across multiple files. Thread pool can
+also be used to execute user defined tasks. The tasks are added to a queue, the
+threads in the pool execute them and results can be queued back if required.
 
 To have a thread pool specific for a file, hts_set_opt api can be used with the
-file pointer, HTS_OPT_NTHREADS and the number of threads to use in the pool.
-Closure of file releases the thread pool as well. To have a thread pool which
-can be shared across different files, it needs to be initialized using
-hts_tpool_init api, passing number of threads as argument. This thread pool can
-be associated with a file using hts_set_opt api. The file pointer,
-HTS_OPT_THREAD_POOL and the thread pool address are to be passed as arguments
-to api. The thread pool has to be released with hts_tpool_destroy.
+file pointer, HTS_OPT_NTHREADS and the number of threads to be in the pool.
+Thread pool is released on closure of file. To have a thread pool which can be
+shared across different files, it needs to be initialized using hts_tpool_init
+api, passing number of threads as an argument. This thread pool can be
+associated with a file using hts_set_opt api. The file pointer,
+HTS_OPT_THREAD_POOL and the thread pool address are to be passed as arguments to
+the api. The thread pool has to be released with hts_tpool_destroy.
+
+The samples are trivial ones to showcase the usage of api. The number of threads
+to use for different tasks has to be identified based on complexity and
+parallelism of the task.
 
 Below excerpt shows file specific thread pool,
 
     ...
     //create file specific threads
-    if (hts_set_opt(infile, HTS_OPT_NTHREADS, 2) < 0 ||     //2 thread specific for reading
+    if (hts_set_opt(infile, HTS_OPT_NTHREADS, 1) < 0 ||     //1 thread specific for reading
     hts_set_opt(outfile1, HTS_OPT_NTHREADS, 1) < 0 ||       //1 thread specific for sam write
-    hts_set_opt(outfile2, HTS_OPT_NTHREADS, 1) < 0) {       //1 thread specific for bam write
+    hts_set_opt(outfile2, HTS_OPT_NTHREADS, 2) < 0) {       //2 thread specific for bam write
         printf("Failed to set thread options\n");
         goto end;
     }
 Refer: split_thread1.c
 
-Below excerpt shows thread pool shared across files,
+Below excerpt shows a thread pool shared across files,
 
     ...
     //create a pool of 4 threads
-    if (!(tpool.pool = hts_tpool_init(4))) {
-    ...
+    if (!(tpool.pool = hts_tpool_init(4)))
+        ... // error
     //share the pool with all the 3 files
     if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0 ||
     hts_set_opt(outfile1, HTS_OPT_THREAD_POOL, &tpool) < 0 ||
     hts_set_opt(outfile2, HTS_OPT_THREAD_POOL, &tpool) < 0) {
-    ...
-    if (tpool.pool) {
+        ... // error
+
+    ... // do something
+
+    //tidy up at end
+    if (tpool.pool)
         hts_tpool_destroy(tpool.pool);
-    }
     ...
 Refer: split_thread2.c
 
+Note that it is important to analyze the task in hand to decide the number of
+threads to be used. As an example, if the number of threads for reading is set
+to 2 and bam write to 1, keeping total number of threads the same, the
+performance may decrease as bam decoding is easier than encoding.
+
+Custom task / user defined functions can be performed on data using thread pool
+and for that, the task has to be scheduled to a queue. Thread pool associated
+with the queue will perform the task. There can be multiple pools and queues.
+The order of execution of threads is decided based on many factors and load on
+each task may vary, so the completion of the tasks may not be in the order of
+their queueing. The queues can be used in two different ways, one where the
+result is enqueued to queue again to be read in same order as initial queueing,
+second where the results are not enqueued and completed possibly in a different
+order than initial queueing. Explicitly created threads can also be used along
+with hts thread pool usage.
+
+hts_tpool_process_init initializes the queue / process, associates a queue with
+thread pool and reserves space for given number of tasks on queue. It takes a
+parameter indicating whether the result needs to be enqueued for retrieval or
+not. If the result is enqueued, it is retrieved in the order of scheduling of
+task. Another parameter sets the maximum number of slots for tasks in queue,
+usually 2 times the number of threads are used. The input and output have their
+own queues and they grow as required up to the max set. hts_tpool_dispatch api
+enqueues the task to the queue. The api blocks when there is no space in queue.
+This behavior can be controlled with hts_tpool_dispatch2 api. The queue can be
+reset using hts_tpool_process_reset api where all tasks are discarded. The api
+hts_tpool_dispatch3 supports configuring cleanup routines which are to be run
+when reset occurs on the queue. hts_tpool_process_flush api can ensure that
+all the piled up tasks are processed, a possible case when the queueing and
+processing happen at different speeds. hts_tpool_process_shutdown api stops the
+processing of queue.
+
+There are a few apis which let the user check the status of processing. The
+api hts_tpool_process_empty shows whether all the tasks are completed or not.
+The api hts_tpool_process_sz gives the number of tasks, at different states of
+processing. The api hts_tpool_process_len gives the number of results in output
+queue waiting to be collected.
+
+The order of execution of tasks depends on the number of threads involved and
+how the threads are scheduled by operating system. When the results are enqueued
+back to queue, they are read in same order of enqueueing of task and in that
+case the order of execution will not be noticed. When the results are not
+enqueued the results are available right away and the order of execution may be
+noticeable. Based on the nature of task and the need of order maintenance, users
+can select either of the queueing.
+
+Below excerpts show the usage of queues and threads in both cases. In the 1st,
+alignments are updated with an aux tag indicating GC ratio. The order of data
+has to be maintained even after update, hence the result queueing is used to
+ensure same order as initial. A number of alignments are bunched together and
+reuse of allocated memory is made to make it perform better. A sentinel job is
+used to identify the completion of all tasks at the result collection side.
+    ...
+    void *thread_ordered_proc(void *args)
+    {
+        ...
+        for ( i = 0; i < bamdata->count; ++i) {
+            ...
+            for (pos = 0; pos < bamdata->bamarray[i]->core.l_qseq; ++pos)
+                count[bam_seqi(data,pos)]++;
+            ...
+            gcratio = (count[2] /*C*/ + count[4] /*G*/) / (float) (count[1] /*A*/ + count[8] /*T*/ + count[2] + count[4]);
+
+            if (bam_aux_append(bamdata->bamarray[i], "xr", 'f', sizeof(gcratio), (const uint8_t*)&gcratio) < 0) {
+
+    ...
+    void *threadfn_orderedwrite(void *args)
+    {
+        ...
+        //get result and write; wait if no result is in queue - until shutdown of queue
+        while (tdata->result == 0 &&
+            (r = hts_tpool_next_result_wait(tdata->queue)) != NULL) {
+            bamdata = (data*) hts_tpool_result_data(r);
+            ...
+            for (i = 0; i < bamdata->count; ++i) {
+                if (sam_write1(tdata->outfile, tdata->samhdr, bamdata->bamarray[i]) < 0) {
+                    ... // error
+            ...
+            hts_tpool_delete_result(r, 0);              //release the result memory
+            ...
+
+        // Shut down the process queue.  If we stopped early due to a write failure,
+        // this will signal to the other end that something has gone wrong.
+        hts_tpool_process_shutdown(tdata->queue);
+
+    ...
+    int main(int argc, char *argv[])
+    {
+        ...
+        if (!(pool = hts_tpool_init(cnt)))                  //thread pool
+            ... // error
+        tpool.pool = pool;      //to share the pool for file read and write as well
+        //queue to use with thread pool, for task and results
+        if (!(queue = hts_tpool_process_init(pool, cnt * 2, 0))) {
+    ...
+        //share the thread pool with i/o files
+        if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0 ||
+            hts_set_opt(outfile, HTS_OPT_THREAD_POOL, &tpool) < 0)
+            ... // error
+        if (pthread_create(&thread, NULL, threadfn_orderedwrite, &twritedata))
+            ... // error
+        while (c >= 0) {
+            if (!(bamdata = getbamstorage(chunk, &bamcache)))
+                ... // error
+            for (cnt = 0; cnt < bamdata->maxsize; ++cnt) {
+                c = sam_read1(infile, in_samhdr, bamdata->bamarray[cnt]);
+                ...
+                if (hts_tpool_dispatch3(pool, queue, thread_ordered_proc, bamdata,
+                                        cleanup_bamstorage, cleanup_bamstorage,
+                                        0) == -1)
+                    ... // error
+        ...
+        if (queue) {
+            if (-1 == c) {
+                // EOF read, send a marker to tell the threadfn_orderedwrite()
+                // function to shut down.
+                if (hts_tpool_dispatch(pool, queue, thread_ordered_proc,
+                                    NULL) == -1) {
+                    ... // error
+                hts_tpool_process_shutdown(queue);
+
+        ...
+        // Wait for threadfn_orderedwrite to finish.
+        if (started_thread) {
+            pthread_join(thread, NULL);
+
+        ...
+        if (queue) {
+            // Once threadfn_orderedwrite has stopped, the queue can be
+            // cleaned up.
+            hts_tpool_process_destroy(queue);
+        }
+    ...
+Refer: qtask_ordered.c
+
+In the 2nd, the bases are counted and GC ratio of whole file is calculated. The
+order in which bases are counted is not relevant, so no result queue is needed.
+The queue is created as input only.
+    ...
+    void *thread_unordered_proc(void *args)
+    {
+        ...
+        for ( i = 0; i < bamdata->count; ++i) {
+            data = bam_get_seq(bamdata->bamarray[i]);
+            for (pos = 0; pos < bamdata->bamarray[i]->core.l_qseq; ++pos)
+                counts[bam_seqi(data, pos)]++;
+
+        ...
+        //update result and add the memory block for reuse
+        pthread_mutex_lock(&bamdata->cache->lock);
+        for (i = 0; i < 16; i++) {
+            bamdata->bases->counts[i] += counts[i];
+        }
+
+        bamdata->next = bamdata->cache->list;
+        bamdata->cache->list = bamdata;
+        pthread_mutex_unlock(&bamdata->cache->lock);
+
+    ...
+    int main(int argc, char *argv[])
+    {
+        ...
+        if (!(queue = hts_tpool_process_init(pool, cnt * 2, 1)))
+            ... // error
+        c = 0;
+        while (c >= 0) {
+            ...
+            for (cnt = 0; cnt < bamdata->maxsize; ++cnt) {
+                c = sam_read1(infile, in_samhdr, bamdata->bamarray[cnt]);
+
+            ...
+            if (c >= -1 ) {
+                ...
+                if (hts_tpool_dispatch3(pool, queue, thread_unordered_proc, bamdata,
+                                        cleanup_bamstorage, cleanup_bamstorage,
+                                        0) == -1)
+                    ... // error
+        ...
+        if (-1 == c) {
+            // EOF read, ensure all are processed, waits for all to finish
+            if (hts_tpool_process_flush(queue) == -1) {
+                fprintf(stderr, "Failed to flush queue\n");
+            } else { //all done
+                //refer seq_nt16_str to find position of required bases
+                fprintf(stdout, "GCratio: %f\nBase counts:\n",
+                    (gccount.counts[2] /*C*/ + gccount.counts[4] /*G*/) / (float)
+                        (gccount.counts[1] /*A*/ + gccount.counts[8] /*T*/ +
+                            gccount.counts[2] + gccount.counts[4]));
+        ...
+        if (queue) {
+            hts_tpool_process_destroy(queue);
+        }
+Refer: qtask_unordered.c
 
 ## More Information
 
@@ -1421,9 +1724,9 @@ be destroyed as many times with sam_hdr_destroy api.
 ### Index
 
 Indices need the data to be sorted by position.  They can be of different
-types with extension .bai, .csi or .tbi for compressed SAM/BAM files and .crai
-for CRAM files.  The index name can be passed along with the alignment file
-itself by appending a specific character sequence. The apis can detect this
+types with extension .bai, .csi or .tbi for compressed SAM/BAM/VCF files and
+.crai for CRAM files.  The index name can be passed along with the alignment
+file itself by appending a specific character sequence. The apis can detect this
 sequence and extract the index path. ##idx## is the sequence which separates
 the file path and index path.
 
diff --git a/samples/Makefile b/samples/Makefile
index 40991d78f..ee632e3ad 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -2,7 +2,7 @@ HTS_DIR = ../
 include $(HTS_DIR)/htslib_static.mk
 
 CC = gcc
-CFLAGS = -Wall -g -O0
+CFLAGS = -Wall -O2
 
 #to statically link to libhts
 LDFLAGS = $(HTS_DIR)/libhts.a -L$(HTS_DIR) $(HTSLIB_static_LDFLAGS) $(HTSLIB_static_LIBS)
@@ -13,94 +13,105 @@ LDFLAGS = $(HTS_DIR)/libhts.a -L$(HTS_DIR) $(HTSLIB_static_LDFLAGS) $(HTSLIB_sta
 PRGS = flags split split2 cram read_fast read_header read_ref read_bam \
 	read_aux dump_aux add_header rem_header update_header mod_bam mod_aux \
 	mod_aux_ba write_fast idx_on_write read_reg read_multireg pileup \
-	mpileup modstate pileup_mod flags_field split_t1 split_t2
+	mpileup modstate pileup_mod flags_field split_t1 split_t2 \
+	read_fast_i qtask_ordered qtask_unordered index_fasta
 
 all: $(PRGS)
 
-flags:
+flags: flags_demo.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) flags_demo.c -o $@  $(LDFLAGS)
 
-split:
+split: split.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) split.c -o $@  $(LDFLAGS)
 
-split2:
+split2: split2.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) split2.c -o $@  $(LDFLAGS)
 
-cram:
+cram: cram.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) cram.c -o $@  $(LDFLAGS)
 
-read_fast:
+read_fast: read_fast.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) read_fast.c -o $@  $(LDFLAGS)
 
-read_header:
+read_header: read_header.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) read_header.c -o $@  $(LDFLAGS)
 
-read_ref:
+read_ref: read_refname.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) read_refname.c -o $@  $(LDFLAGS)
 
-read_bam:
+read_bam: read_bam.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) read_bam.c -o $@  $(LDFLAGS)
 
-read_aux:
+read_aux: read_aux.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) read_aux.c -o $@  $(LDFLAGS)
 
-dump_aux:
+dump_aux: dump_aux.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) dump_aux.c -o $@  $(LDFLAGS)
 
-add_header:
+add_header: add_header.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) add_header.c -o $@  $(LDFLAGS)
 
-rem_header:
+rem_header: rem_header.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) rem_header.c -o $@  $(LDFLAGS)
 
-update_header:
+update_header: update_header.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) update_header.c -o $@  $(LDFLAGS)
 
-mod_bam:
+mod_bam: mod_bam.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) mod_bam.c -o $@  $(LDFLAGS)
 
-mod_aux:
+mod_aux: mod_aux.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) mod_aux.c -o $@  $(LDFLAGS)
 
-mod_aux_ba:
+mod_aux_ba: mod_aux_ba.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) mod_aux_ba.c -o $@  $(LDFLAGS)
 
-write_fast:
+write_fast: write_fast.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) write_fast.c -o $@  $(LDFLAGS)
 
-idx_on_write:
+idx_on_write: index_write.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) index_write.c -o $@  $(LDFLAGS)
 
-read_reg:
+read_reg: index_reg_read.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) index_reg_read.c -o $@  $(LDFLAGS)
 
-read_multireg:
+read_multireg: index_multireg_read.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) index_multireg_read.c -o $@  $(LDFLAGS)
 
-pileup:
+read_fast_i: read_fast_index.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) read_fast_index.c -o $@  $(LDFLAGS)
+
+pileup: pileup.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) pileup.c -o $@  $(LDFLAGS)
 
-mpileup:
+mpileup: mpileup.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) mpileup.c -o $@  $(LDFLAGS)
 
-modstate:
+modstate: modstate.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) modstate.c -o $@  $(LDFLAGS)
 
-pileup_mod:
+pileup_mod: pileup_mod.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) pileup_mod.c -o $@  $(LDFLAGS)
 
-flags_field:
+flags_field: flags_htsopt_field.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) flags_htsopt_field.c -o $@  $(LDFLAGS)
 
-split_t1:
+split_t1: split_thread1.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) split_thread1.c -o $@  $(LDFLAGS)
 
-split_t2:
+split_t2: split_thread2.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) split_thread2.c -o $@  $(LDFLAGS)
 
+index_fasta: index_fasta.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) index_fasta.c -o $@  $(LDFLAGS)
+
+qtask_ordered: qtask_ordered.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) qtask_ordered.c -o $@  $(LDFLAGS)
+
+qtask_unordered: qtask_unordered.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) qtask_unordered.c -o $@  $(LDFLAGS)
+
 clean:
 	find . -name "*.o" | xargs rm -rf
 	find . -name "*.dSYM" | xargs rm -rf
-	rm $(PRGS)
-
-
+	-rm -f $(PRGS)
diff --git a/samples/README.md b/samples/README.md
index ab5481dea..6f90c0c3f 100644
--- a/samples/README.md
+++ b/samples/README.md
@@ -4,7 +4,7 @@ data, and is the core library used by [samtools][2] and [bcftools][3].
 
 A set of sample programs are available which showcases the usage of APIs in HTSlib.
 They are based on version 1.17 of HTSLib and are mainly for demonstration of API usage.
-Further optimization and error handling might be required for actual usage.
+Further optimisation and error handling might be required for actual usage.
 
 
 [1]: http://samtools.github.io/hts-specs/
@@ -61,7 +61,7 @@ indexed.
 
 [Read_fast][Read_fast]
 
-  This application showcases the fasta/fastq data read.
+  This application showcases fasta/fastq data read without using index.
 
 [Read_header][Read_header]
 
@@ -72,7 +72,7 @@ indexed.
 [Read_ref][Read_ref]
 
   This application showcases the read and access of header data. It shows
-  all reference names which has length equal or greather to given input.
+  all reference names which have length greater than or equal to given input.
 
 [Read_bam][Read_bam]
 
@@ -129,14 +129,18 @@ indexed.
 
 [Write_fast][Write_fast]
 
-  This application showcases the fasta/fastq data write. It appends a dummy
-  data to given file.
+  This application showcases the fasta/fastq data write. It appends data to
+  the given file.
 
 [Index_write][Index_write]
 
   This application showcases the creation of index along with output
   creation. Based on file type and shift, it creates bai, csi or crai files.
 
+[Index_fasta][Index_fasta]
+
+  This application showcases index creation on fasta/fastq reference data.
+
 [Read_reg][Read_reg]:
 
   This application showcases the usage of region specification in alignment
@@ -144,9 +148,14 @@ indexed.
 
 [Read_multireg][Read_multireg]:
 
-  This application showcases the usage of mulitple region specification in
+  This application showcases the usage of multiple region specification in
   alignment read.
 
+[Read_fast_index][Read_fast_index]
+
+  This application showcases the fasta/fastq data read using index. It takes a
+  region (reference name[:start-end]) and gets data from that region.
+
 [Pileup][Pileup]:
 
   This application showcases the pileup api, where all alignments covering a
@@ -181,8 +190,7 @@ indexed.
 
   This application showcases the use of threads in file handling. It saves
   the read1 and read2 as separate files in given directory, one as sam and
-  other as bam. 2 threads are used for read and 1 each dedicated for each
-  output file.
+  other as bam. 1 thread is used for read, 1 for sam write and 2 for bam write.
 
 [Split_thread2][Split_thread2]
 
@@ -191,6 +199,19 @@ indexed.
   and other as bam. A pool of 4 threads is created and shared for both read
   and write.
 
+[Qtask_ordered][Qtask_ordered]
+
+  This application showcases the use of queues and threads for custom
+  processing. Alignments in input file are updated with their GC ratio on a
+  custom aux tag. The processing may occur in any order but the results are
+  retrieved in same order as it was queued and saved to disk.
+
+[Qtask_unordered][Qtask_unordered]
+
+  This application showcases the use of queues and threads for custom
+  processing. The count of bases and GC ratio are calculated and displayed.
+  The order of counting is irrelevant and hence ordered retrieval is not used.
+
 ### More Information
 
 More detailed documentation is available in the [DEMO.md][DEMO] with worked
@@ -215,8 +236,10 @@ examples per demonstration tool.
 [Mod_aux_ba]: mod_aux_ba.c
 [Write_fast]: write_fast.c
 [Index_write]: index_write.c
+[Index_fasta]: index_fasta.c
 [Read_reg]: index_reg_read.c
 [Read_multireg]: index_multireg_read.c
+[Read_fast_index]: read_fast_index.c
 [Pileup]: pileup.c
 [Mpileup]: mpileup.c
 [Modstate]: modstate.c
@@ -224,4 +247,6 @@ examples per demonstration tool.
 [Flags_field]: flags_htsopt_field.c
 [Split_thread1]: split_thread1.c
 [Split_thread2]: split_thread2.c
+[Qtask_ordered]: qtask_ordered.c
+[Qtask_unordered]: qtask_unordered.c
 [DEMO]: DEMO.md
diff --git a/samples/add_header.c b/samples/add_header.c
index d1a2fc13c..066b1d438 100644
--- a/samples/add_header.c
+++ b/samples/add_header.c
@@ -24,20 +24,20 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
 {
     fprintf(fp, "Usage: add_header infile\n\
-Adds new header lines of SQ, RG, PG and CO typs\n");
+Adds new header lines of SQ, RG, PG and CO types\n");
     return;
 }
 
diff --git a/samples/cram.c b/samples/cram.c
index 5f55e65d2..7b1342377 100644
--- a/samples/cram.c
+++ b/samples/cram.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/dump_aux.c b/samples/dump_aux.c
index 49251fe04..3caa16027 100644
--- a/samples/dump_aux.c
+++ b/samples/dump_aux.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
@@ -92,7 +92,7 @@ int printauxdata(FILE *fp, char type, int32_t idx, const uint8_t *data)
         fprintf(fp, "%c", auxBType);
         for (i = 0; i < auxBcnt; ++i) {                                                     //iterate the array
             fprintf(fp, ",");
-            //calling recurssively  with index to reuse a few lines
+            //calling recursively  with index to reuse a few lines
             if (printauxdata(fp, auxBType, i, data) == EXIT_FAILURE) {
                 return EXIT_FAILURE;
             }
diff --git a/samples/flags_demo.c b/samples/flags_demo.c
index e03fc6cd8..ac26be86c 100644
--- a/samples/flags_demo.c
+++ b/samples/flags_demo.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - show flags_demo usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - show usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/flags_htsopt_field.c b/samples/flags_htsopt_field.c
index 4b64445e3..40a0affc4 100644
--- a/samples/flags_htsopt_field.c
+++ b/samples/flags_htsopt_field.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - show flags_demo usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - show usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/index_fasta.c b/samples/index_fasta.c
new file mode 100644
index 000000000..ba0489094
--- /dev/null
+++ b/samples/index_fasta.c
@@ -0,0 +1,72 @@
+/*  index_fasta.c --  showcases the htslib api usage
+
+    Copyright (C) 2024 Genome Research Ltd.
+
+    Author: Vasudeva Sarma <vasudeva.sarma@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
+
+#include <getopt.h>
+#include <unistd.h>
+#include <time.h>
+#include <htslib/sam.h>
+#include <htslib/faidx.h>
+
+/// print_usage - show usage of the index_fasta sample
+/** @param fp pointer to the file / terminal to which the usage text is written
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: index_fasta <file>\n\
+Indexes a fasta/fastq file and saves along with source.\n");
+    return;
+}
+
+/// main - builds an index for the fasta/fastq file given as the only argument
+/** @param argc - count of arguments, expected to be 2 (program name and file)
+ *  @param argv - pointer to array of arguments, argv[1] is the file to index
+returns EXIT_FAILURE on wrong usage or indexing failure, EXIT_SUCCESS otherwise
+*/
+int main(int argc, char *argv[])
+{
+    const char *filename = NULL;             //path of the fasta/fastq file to index
+    int ret = EXIT_FAILURE;                  //stays as failure until indexing succeeds
+
+    if (argc != 2) {                         //exactly one file argument is required
+        print_usage(stdout);
+        goto end;
+    }
+    filename = argv[1];
+
+    // index the file; NULL index names - presumably defaults next to the source are used, TODO confirm
+    if (fai_build3(filename, NULL, NULL) == -1) {
+        printf("Indexing failed with %d\n", errno);    //report errno as the failure code
+        goto end;
+    }
+    //this creates an .fai file. If the file is bgzipped, a .gzi file will be created along with .fai
+    ret = EXIT_SUCCESS;
+end:
+    //nothing is allocated in this sample, so only the status is returned
+    return ret;
+}
diff --git a/samples/index_multireg_read.c b/samples/index_multireg_read.c
index dbe8f15f9..7bb864990 100644
--- a/samples/index_multireg_read.c
+++ b/samples/index_multireg_read.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the print_usage
-/** @param fp pointer to the file / terminal to which print_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/index_reg_read.c b/samples/index_reg_read.c
index 346d5428f..dec684933 100644
--- a/samples/index_reg_read.c
+++ b/samples/index_reg_read.c
@@ -24,19 +24,19 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the print_usage
-/** @param fp pointer to the file / terminal to which print_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
 {
-    fprintf(fp, "Usage: readreg infile idxfile region\n\
+    fprintf(fp, "Usage: read_reg infile idxfile region\n\
 Reads alignments matching to a specific region\n\
 \\. from start of file\n\
 \\* only unmapped reads\n\
diff --git a/samples/index_write.c b/samples/index_write.c
index 8fd2bc968..9ec63d4ad 100644
--- a/samples/index_write.c
+++ b/samples/index_write.c
@@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <libgen.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/mod_aux.c b/samples/mod_aux.c
index d5ed18cde..ae531b985 100644
--- a/samples/mod_aux.c
+++ b/samples/mod_aux.c
@@ -24,14 +24,15 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
+#include <strings.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/mod_aux_ba.c b/samples/mod_aux_ba.c
index 8ef90ee1e..836a3d39c 100644
--- a/samples/mod_aux_ba.c
+++ b/samples/mod_aux_ba.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/mod_bam.c b/samples/mod_bam.c
index 9f1eb324e..616639610 100644
--- a/samples/mod_bam.c
+++ b/samples/mod_bam.c
@@ -24,14 +24,15 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
+#include <strings.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/modstate.c b/samples/modstate.c
index 976391684..4d5f67635 100644
--- a/samples/modstate.c
+++ b/samples/modstate.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/mpileup.c b/samples/mpileup.c
index fe933748e..ecab70584 100644
--- a/samples/mpileup.c
+++ b/samples/mpileup.c
@@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <ctype.h>
 #include <htslib/sam.h>
 
-/// print_usage - show flags_demo usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - show usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/pileup.c b/samples/pileup.c
index 11e2fb02f..be7aad801 100644
--- a/samples/pileup.c
+++ b/samples/pileup.c
@@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <ctype.h>
 #include <htslib/sam.h>
 
-/// print_usage - show flags_demo usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - show usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/pileup_mod.c b/samples/pileup_mod.c
index 24d6cf539..81ac5a540 100644
--- a/samples/pileup_mod.c
+++ b/samples/pileup_mod.c
@@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <ctype.h>
 #include <htslib/sam.h>
 
-/// print_usage - show flags_demo usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - show usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/qtask_ordered.c b/samples/qtask_ordered.c
new file mode 100644
index 000000000..a76d59826
--- /dev/null
+++ b/samples/qtask_ordered.c
@@ -0,0 +1,425 @@
+/*  qtask_ordered.c --  showcases the htslib api usage
+
+    Copyright (C) 2024 Genome Research Ltd.
+
+    Author: Vasudeva Sarma <vasudeva.sarma@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
+
+#include <getopt.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <htslib/sam.h>
+#include <htslib/thread_pool.h>
+
+typedef struct data {           //one chunk of alignments, queued and processed as a unit
+    int count;                  //number of entries of bamarray that hold read alignments
+    int maxsize;                //number of bam1_t entries allocated in bamarray
+    bam1_t **bamarray;          //bam1_t array, alignments are queued in chunks for optimal queueing
+    struct data *next;          //next chunk in the cache list - to reuse earlier allocations
+} data;
+
+typedef struct datacache       //chunks that are kept for reuse instead of being freed
+{
+    pthread_mutex_t lock;       //synchronizes the access to list, taken by reader and writer side
+    data *list;                 //head of the singly linked list of reusable chunks
+} datacache;
+
+typedef struct orderedwrite {  //input to and result of the writer thread
+    samFile *outfile;           //output file handle
+    sam_hdr_t *samhdr;          //header used to write data
+    hts_tpool_process *queue;   //queue from which results are to be retrieved
+    datacache *cache;           //cache to which written chunks are returned for reuse
+    int result;                 //result code set by writer thread, 0 on success, -1 on write failure
+} orderedwrite;
+
+/// print_usage - writes the command line help of this sample
+/** @param fp stream (file / terminal) that receives the help text
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fputs("Usage: qtask_ordered infile threadcount outdir [chunksize]\n"
+          "Calculates GC ratio - sum(G,C) / sum(A,T,C,G) - and adds to each alignment\n"
+          "as xr:f aux tag. Output is saved in outdir.\n"
+          "chunksize [4096] sets the number of alignments clubbed together to process.\n",
+          fp);
+}
+
+/// getbamstorage - provides storage for a chunk of alignments to queue
+/** @param chunk number of bam data to allocate, used when the cache is empty
+ * @param bamcache cached storage
+returns a cached chunk if one is available, otherwise a newly allocated one; NULL on failure
+*/
+data* getbamstorage(int chunk, datacache *bamcache)
+{
+    int i = 0;
+    data *bamdata = NULL;
+
+    //get from cache if there is an already allocated storage; the cache is used under its lock
+    if (!bamcache || pthread_mutex_lock(&bamcache->lock)) {
+        return NULL;
+    }
+    if (bamcache->list) {                   //available
+        bamdata = bamcache->list;
+        bamcache->list = bamdata->next;     //remove and set next one as available
+        bamdata->next = NULL;               //remove link
+        bamdata->count = 0;
+        goto end;
+    }
+    //nothing cached, allocate and use
+    if (!(bamdata = malloc(sizeof(data))))
+        goto end;
+    bamdata->bamarray = calloc(chunk, sizeof(bam1_t*));     //zeroed, size checked for overflow
+    for (i = 0; bamdata->bamarray && i < chunk; ++i) {
+        if (!(bamdata->bamarray[i] = bam_init1()))
+            break;
+    }
+    if (!bamdata->bamarray || i < chunk) {      //out of memory, release the partial chunk
+        while (i > 0)
+            bam_destroy1(bamdata->bamarray[--i]);
+        free(bamdata->bamarray);
+        free(bamdata);
+        bamdata = NULL;
+        goto end;
+    }
+    bamdata->maxsize = chunk;
+    bamdata->count = 0;
+    bamdata->next = NULL;
+
+end:
+    pthread_mutex_unlock(&bamcache->lock);
+    return bamdata;
+}
+
+/// cleanup_bamstorage - releases a chunk together with the alignments it holds
+/** @param arg Pointer to the chunk to release, NULL is ignored
+    @p arg is a void * so that this function can be passed as a
+    cleanup callback to hts_tpool_dispatch3().
+ */
+void cleanup_bamstorage(void *arg)
+{
+    data *chunk = (data *) arg;
+    int idx = 0;
+
+    if (chunk == NULL) {
+        return;
+    }
+    for (idx = 0; chunk->bamarray && idx < chunk->maxsize; ++idx) {
+        bam_destroy1(chunk->bamarray[idx]);
+    }
+    free(chunk->bamarray);      //free(NULL) is a no-op
+    free(chunk);
+}
+
+/// thread_ordered_proc - does the processing of task in queue and queues the output back
+/** @param args pointer to set of data to be processed
+returns the processed data, each alignment with the GC ratio appended as xr:f aux tag
+the processing could be in any order based on the number of threads in use but read of output
+from queue will be in order
+a null data indicates the end of input and a null is returned to be added back to result queue
+*/
+void *thread_ordered_proc(void *args)
+{
+    int i = 0, pos = 0;
+    data *bamdata = (data*)args;
+    float gcratio = 0;
+    uint8_t *seq = NULL;        //4 bit encoded bases of the current alignment
+
+    if (bamdata == NULL)
+        return NULL; // Indicates no more input
+
+    for ( i = 0; i < bamdata->count; ++i) {
+        uint64_t count[16] = {0}, gc = 0, total = 0;    //count per base code, sums for the ratio
+        seq = bam_get_seq(bamdata->bamarray[i]);
+        for (pos = 0; pos < bamdata->bamarray[i]->core.l_qseq; ++pos) {
+            count[bam_seqi(seq,pos)]++;
+        }
+        /*it is faster to count all and use offset to get required counts rather than select
+        required ones inside the loop*/
+        gc = count[2] /*C*/ + count[4] /*G*/;
+        total = gc + count[1] /*A*/ + count[8] /*T*/;
+        gcratio = total ? gc / (float) total : 0;   //no A/C/G/T at all, avoid 0/0 giving NaN
+        if (bam_aux_append(bamdata->bamarray[i], "xr", 'f', sizeof(gcratio), (const uint8_t*)&gcratio) < 0) {
+            fprintf(stderr, "Failed to add aux tag xr, errno: %d\n", errno);
+            break;
+        }
+    }
+    return bamdata;
+}
+
+/// threadfn_orderedwrite - thread that reads the results from queue and writes them to output
+/** @param args pointer to the orderedwrite data of this thread, result field is set to -1 on write failure
+returns NULL
+*/
+void *threadfn_orderedwrite(void *args)
+{
+    orderedwrite *tdata = (orderedwrite*)args;
+    hts_tpool_result *r = NULL;
+    data *bamdata = NULL;
+    int i = 0;
+
+    tdata->result = 0;
+
+    //get result and write; wait if no result is in queue - stops on write failure or when no result is returned
+    while (tdata->result == 0 &&
+           (r = hts_tpool_next_result_wait(tdata->queue)) != NULL) {
+        bamdata = (data*) hts_tpool_result_data(r);
+
+        if (bamdata == NULL) {
+            // Indicator for no more input. Time to stop.
+            hts_tpool_delete_result(r, 0);
+            break;
+        }
+
+        for (i = 0; i < bamdata->count; ++i) {
+            if (sam_write1(tdata->outfile, tdata->samhdr, bamdata->bamarray[i]) < 0) {
+                fprintf(stderr, "Failed to write output data\n");
+                tdata->result = -1;                 //ends the outer loop after this chunk is recycled
+                break;
+            }
+        }
+        hts_tpool_delete_result(r, 0);              //release the result, bamdata itself is reused below
+
+        pthread_mutex_lock(&tdata->cache->lock);
+        bamdata->next = tdata->cache->list;         //make current list as next
+        tdata->cache->list = bamdata;               //set as current to reuse
+        pthread_mutex_unlock(&tdata->cache->lock);
+    }
+
+    // Shut down the process queue.  If we stopped early due to a write failure,
+    // this will signal to the other end that something has gone wrong.
+    hts_tpool_process_shutdown(tdata->queue);
+
+    return NULL;
+}
+
+/// main - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL, *outdir = NULL;
+    char *file = NULL;
+    int c = 0, ret = EXIT_FAILURE, cnt = 0, started_thread = 0, chunk = 0;
+    size_t size = 0;
+    samFile *infile = NULL, *outfile = NULL;
+    sam_hdr_t *in_samhdr = NULL;
+    pthread_t thread;
+    orderedwrite twritedata = {0};
+    hts_tpool *pool = NULL;
+    hts_tpool_process *queue = NULL;
+    htsThreadPool tpool = {NULL, 0};
+    data *bamdata = NULL;
+    datacache bamcache = {PTHREAD_MUTEX_INITIALIZER, NULL};
+
+    //qtask_ordered infile threadcount outdir [chunksize]
+    if (argc != 4 && argc != 5) {
+        print_usage(stdout);
+        goto end;
+    }
+    inname = argv[1];
+    cnt = atoi(argv[2]);
+    outdir = argv[3];
+    if (argc == 5) {    //chunk size present
+        chunk = atoi(argv[4]);
+    }
+    if (cnt < 1) {      //set proper thread count
+        cnt = 1;
+    }
+    if (chunk < 1) {    //set valid  chunk size
+        chunk = 4096;
+    }
+
+    //allocate space for output
+    size = (strlen(outdir) + sizeof("/out.bam") + 1);   //space for output file name and null termination
+    if (!(file = malloc(size))) {
+        fprintf(stderr, "Failed to set output path\n");
+        goto end;
+    }
+    snprintf(file, size, "%s/out.bam", outdir);         //output file name
+    if (!(pool = hts_tpool_init(cnt))) {                //thread pool
+        fprintf(stderr, "Failed to create thread pool\n");
+        goto end;
+    }
+    tpool.pool = pool;      //to share the pool for file read and write as well
+    //queue to use with thread pool, for task and results
+    if (!(queue = hts_tpool_process_init(pool, cnt * 2, 0))) {
+        fprintf(stderr, "Failed to create queue\n");
+        goto end;
+    }
+    //open input file - r reading
+    if (!(infile = sam_open(inname, "r"))) {
+        fprintf(stderr, "Could not open %s\n", inname);
+        goto end;
+    }
+    //open output files - w write as SAM, wb  write as BAM
+    if (!(outfile = sam_open(file, "wb"))) {
+        fprintf(stderr, "Could not open output file\n");
+        goto end;
+    }
+    //share the thread pool with i/o files
+    if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0 ||
+          hts_set_opt(outfile, HTS_OPT_THREAD_POOL, &tpool) < 0) {
+        fprintf(stderr, "Failed to set threads to i/o files\n");
+        goto end;
+    }
+    //read header, required to resolve the target names to proper ids
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        fprintf(stderr, "Failed to read header from file!\n");
+        goto end;
+    }
+    //write header
+    if ((sam_hdr_write(outfile, in_samhdr) == -1)) {
+        fprintf(stderr, "Failed to write header\n");
+        goto end;
+    }
+
+    /* tasks are queued, worker threads get them and process in parallel;
+    the results are queued and they are to be removed in parallel as well */
+
+    // start output writer thread for ordered processing
+    twritedata.outfile = outfile;
+    twritedata.samhdr  = in_samhdr;
+    twritedata.result  = 0;
+    twritedata.queue   = queue;
+    twritedata.cache   = &bamcache;
+    if (pthread_create(&thread, NULL, threadfn_orderedwrite, &twritedata)) {
+        fprintf(stderr, "Failed to create writer thread\n");
+        goto end;
+    }
+    started_thread = 1;
+
+    c = 0;
+    while (c >= 0) {
+        if (!(bamdata = getbamstorage(chunk, &bamcache))) {
+            fprintf(stderr, "Failed to allocate memory\n");
+            break;
+        }
+        //read alignments, up to max size for this lot
+        for (cnt = 0; cnt < bamdata->maxsize; ++cnt) {
+            c = sam_read1(infile, in_samhdr, bamdata->bamarray[cnt]);
+            if (c < 0) {
+                break;      // EOF or failure
+            }
+        }
+        if (c >= -1 ) {
+            //max size data or reached EOF
+            bamdata->count = cnt;
+            // Queue the data for processing.  hts_tpool_dispatch3() is
+            // used here as it allows in-flight data to be cleaned up
+            // properly when stopping early due to errors.
+            if (hts_tpool_dispatch3(pool, queue, thread_ordered_proc, bamdata,
+                                    cleanup_bamstorage, cleanup_bamstorage,
+                                    0) == -1) {
+                fprintf(stderr, "Failed to schedule processing\n");
+                goto end;
+            }
+            bamdata = NULL;
+        } else {
+            fprintf(stderr, "Error in reading data\n");
+            break;
+        }
+    }
+
+    ret = (-1 == c) ? EXIT_SUCCESS : EXIT_FAILURE;      //success only when input was read up to EOF
+
+ end:
+    // Tidy up after having dispatched all of the data.
+
+    // Note that the order here is important.  In particular, we need
+    // to join the thread that was started earlier before freeing anything
+    // to avoid any use-after-free errors.
+
+    // It's also possible to get here early due to various error conditions,
+    // so we need to carefully check which parts of the program state have
+    // been created before trying to clean them up.
+
+    if (queue) {
+        if (-1 == c) {
+            // EOF read, send a marker to tell the threadfn_orderedwrite()
+            // function to shut down.
+            if (hts_tpool_dispatch(pool, queue, thread_ordered_proc,
+                                   NULL) == -1) {
+                fprintf(stderr, "Failed to schedule sentinel job\n");
+                ret = EXIT_FAILURE;
+            }
+        } else {
+            // Error or we never wrote anything.  Shut down the queue to
+            // ensure threadfn_orderedwrite() wakes up and terminates.
+            hts_tpool_process_shutdown(queue);
+        }
+    }
+
+    // Wait for threadfn_orderedwrite to finish.
+    if (started_thread) {
+        pthread_join(thread, NULL);
+
+        // Once the writer thread has finished, check the result it sent back
+        if (twritedata.result != 0) {
+            ret = EXIT_FAILURE;
+        }
+    }
+
+    if (queue) {
+        // Once threadfn_orderedwrite has stopped, the queue can be
+        // cleaned up.
+        hts_tpool_process_destroy(queue);
+    }
+
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        if (sam_close(infile) != 0) {
+            ret = EXIT_FAILURE;
+        }
+    }
+    if (outfile) {
+        if (sam_close(outfile) != 0) {
+            ret = EXIT_FAILURE;
+        }
+    }
+
+    //free the chunk that was never handed over to the queue, if any (read or dispatch failure)
+    cleanup_bamstorage(bamdata);
+    //free the cached chunks, the writer thread has been joined by now
+    pthread_mutex_lock(&bamcache.lock);
+    while (bamcache.list) {
+        struct data *tmp = bamcache.list;
+        bamcache.list = tmp->next;
+        cleanup_bamstorage(tmp);
+    }
+    pthread_mutex_unlock(&bamcache.lock);
+
+    if (file) {
+        free(file);
+    }
+    if (pool) {
+        hts_tpool_destroy(pool);
+    }
+    return ret;
+}
diff --git a/samples/qtask_unordered.c b/samples/qtask_unordered.c
new file mode 100644
index 000000000..05fe50346
--- /dev/null
+++ b/samples/qtask_unordered.c
@@ -0,0 +1,320 @@
+/*  qtask_unordered.c --  showcases the htslib api usage
+
+    Copyright (C) 2024 Genome Research Ltd.
+
+    Author: Vasudeva Sarma <vasudeva.sarma@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
+
+#include <getopt.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <htslib/sam.h>
+#include <htslib/thread_pool.h>
+
+struct datacache;
+
+typedef struct basecount {     //result shared by all the worker threads
+    uint64_t counts[16];        //count of all bases, indexed by the 4 bit base code (see seq_nt16_str)
+} basecount;
+
+typedef struct data {           //one chunk of alignments, queued and processed as a unit
+    int count;                  //number of entries of bamarray that hold read alignments
+    int maxsize;                //number of bam1_t entries allocated in bamarray
+    bam1_t **bamarray;          //bam1_t array, alignments are queued in chunks for optimal queueing
+
+    struct datacache *cache;    //cache to which the worker returns this chunk after processing
+    basecount *bases;           //shared result to which the counts of this chunk are added
+    struct data *next;          //next chunk in the cache list - to reuse earlier allocations
+} data;
+
+typedef struct datacache       //chunks that are kept for reuse instead of being freed
+{
+    pthread_mutex_t lock;       //synchronizes the access to list, also held while the result is updated
+    data *list;                 //head of the singly linked list of reusable chunks
+} datacache;
+
+/// print_usage - writes the command line help of this sample
+/** @param fp stream (file / terminal) that receives the help text
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fputs("Usage: qtask_unordered infile threadcount [chunksize]\n"
+          "Shows the base counts and calculates GC ratio - sum(G,C) / sum(A,T,C,G)\n"
+          "chunksize [4096] sets the number of alignments clubbed together to process.\n",
+          fp);
+}
+
+/// getbamstorage - provides storage for a chunk of alignments to queue
+/** @param chunk number of bam data to allocate, used when the cache is empty
+ * @param bases storage of result
+ * @param bamcache cached storage
+returns a cached chunk if one is available, otherwise a newly allocated one; NULL on failure
+*/
+data* getbamstorage(int chunk, basecount *bases, datacache *bamcache)
+{
+    int i = 0;
+    data *bamdata = NULL;
+
+    //get from cache if there is an already allocated storage; the cache is used under its lock
+    if (!bamcache || !bases || pthread_mutex_lock(&bamcache->lock)) {
+        return NULL;
+    }
+    if (bamcache->list) {                   //available
+        bamdata = bamcache->list;
+        bamcache->list = bamdata->next;     //remove and set next one as available
+        bamdata->next = NULL;               //remove link
+        bamdata->count = 0;
+
+        bamdata->bases = bases;
+        bamdata->cache = bamcache;
+        goto end;
+    }
+    //nothing cached, allocate and use
+    if (!(bamdata = malloc(sizeof(data))))
+        goto end;
+    bamdata->bamarray = calloc(chunk, sizeof(bam1_t*));     //zeroed, size checked for overflow
+    for (i = 0; bamdata->bamarray && i < chunk; ++i) {
+        if (!(bamdata->bamarray[i] = bam_init1()))
+            break;
+    }
+    if (!bamdata->bamarray || i < chunk) {      //out of memory, release the partial chunk
+        while (i > 0)
+            bam_destroy1(bamdata->bamarray[--i]);
+        free(bamdata->bamarray);
+        free(bamdata);
+        bamdata = NULL;
+        goto end;
+    }
+    bamdata->maxsize = chunk;
+    bamdata->count = 0;
+    bamdata->next = NULL;
+
+    bamdata->bases = bases;
+    bamdata->cache = bamcache;
+
+end:
+    pthread_mutex_unlock(&bamcache->lock);
+    return bamdata;
+}
+
+/// cleanup_bamstorage - releases a chunk together with the alignments it holds
+/** @param arg Pointer to the chunk to release, NULL is ignored
+    @p arg is a void * so that this function can be passed as a
+    cleanup callback to hts_tpool_dispatch3().
+ */
+void cleanup_bamstorage(void *arg)
+{
+    data *chunk = (data *) arg;
+    int idx = 0;
+
+    if (chunk == NULL) {
+        return;
+    }
+    for (idx = 0; chunk->bamarray && idx < chunk->maxsize; ++idx) {
+        bam_destroy1(chunk->bamarray[idx]);
+    }
+    free(chunk->bamarray);      //free(NULL) is a no-op
+    free(chunk);
+}
+
+/// thread_unordered_proc - counts the bases of one chunk and adds them to the shared result
+/** @param args pointer to the chunk of alignments to be processed
+returns NULL
+chunks may be processed in any order, depending on the number of threads in use
+*/
+void *thread_unordered_proc(void *args)
+{
+    data *chunk = (data*)args;
+    datacache *cache = chunk->cache;
+    uint64_t local[16] = {0}, base = 0;
+    int aln = 0, code = 0;
+
+    /* every base code is counted, picking the required ones later is
+    faster than selecting them inside the loop */
+    for (aln = 0; aln < chunk->count; ++aln) {
+        const bam1_t *rec = chunk->bamarray[aln];
+        const uint8_t *seq = bam_get_seq(rec);
+        for (base = 0; base < rec->core.l_qseq; ++base) {
+            local[bam_seqi(seq, base)]++;
+        }
+    }
+    //merge into the result and hand the chunk back for reuse, both under the cache lock
+    pthread_mutex_lock(&cache->lock);
+    for (code = 0; code < 16; code++) {
+        chunk->bases->counts[code] += local[code];
+    }
+    chunk->next = cache->list;
+    cache->list = chunk;
+    pthread_mutex_unlock(&cache->lock);
+
+    return NULL;
+}
+
+/// main - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL;
+    int c = 0, ret = EXIT_FAILURE, cnt = 0, chunk = 0;
+    samFile *infile = NULL;
+    sam_hdr_t *in_samhdr = NULL;
+    hts_tpool *pool = NULL;
+    hts_tpool_process *queue = NULL;
+    htsThreadPool tpool = {NULL, 0};
+    data *bamdata = NULL;
+    basecount gccount = {{0}};
+    datacache bamcache = {PTHREAD_MUTEX_INITIALIZER, NULL};
+
+    //qtask_unordered infile threadcount [chunksize]
+    if (argc != 3 && argc != 4) {
+        print_usage(stdout);
+        goto end;
+    }
+    inname = argv[1];
+    cnt = atoi(argv[2]);
+    if (argc == 4) {
+        chunk = atoi(argv[3]);
+    }
+    if (cnt < 1) {
+        cnt = 1;
+    }
+    if (chunk < 1) {
+        chunk = 4096;
+    }
+
+    if (!(pool = hts_tpool_init(cnt))) {
+        fprintf(stderr, "Failed to create thread pool\n");
+        goto end;
+    }
+    tpool.pool = pool;      //to share the pool with the input file as well
+    //queue to use with thread pool, for tasks
+    if (!(queue = hts_tpool_process_init(pool, cnt * 2, 1))) {
+        fprintf(stderr, "Failed to create queue\n");
+        goto end;
+    }
+    //open input file - r reading
+    if (!(infile = sam_open(inname, "r"))) {
+        fprintf(stderr, "Could not open %s\n", inname);
+        goto end;
+    }
+    //share the thread pool with i/o files
+    if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0) {
+        fprintf(stderr, "Failed to set threads to i/o files\n");
+        goto end;
+    }
+    //read header, required to resolve the target names to proper ids
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        fprintf(stderr, "Failed to read header from file!\n");
+        goto end;
+    }
+
+    /*tasks are queued, worker threads get them and process in parallel;
+    all bases are counted instead of counting atcg alone as it is faster*/
+
+    c = 0;
+    while (c >= 0) {
+        //use cached storage to avoid allocate/deallocate overheads
+        if (!(bamdata = getbamstorage(chunk, &gccount, &bamcache))) {
+            fprintf(stderr, "Failed to allocate memory\n");
+            break;
+        }
+        //read alignments, up to max size for this lot
+        for (cnt = 0; cnt < bamdata->maxsize; ++cnt) {
+            c = sam_read1(infile, in_samhdr, bamdata->bamarray[cnt]);
+            if (c < 0) {
+                break;      // EOF or failure
+            }
+        }
+        if (c >= -1 ) {
+            //max size data or reached EOF
+            bamdata->count = cnt;
+            // Queue the data for processing.  hts_tpool_dispatch3() is
+            // used here as it allows in-flight data to be cleaned up
+            // properly when stopping early due to errors.
+            if (hts_tpool_dispatch3(pool, queue, thread_unordered_proc, bamdata,
+                                    cleanup_bamstorage, cleanup_bamstorage,
+                                    0) == -1) {
+                fprintf(stderr, "Failed to schedule processing\n");
+                goto end;
+            }
+            bamdata = NULL;
+        } else {
+            fprintf(stderr, "Error in reading data\n");
+            break;
+        }
+    }
+
+    if (-1 == c) {
+        // EOF read, ensure all are processed, waits for all to finish
+        if (hts_tpool_process_flush(queue) == -1) {
+            fprintf(stderr, "Failed to flush queue\n");
+        } else { //all done
+            //refer seq_nt16_str to find position of required bases
+            fprintf(stdout, "GCratio: %f\nBase counts:\n",
+                (gccount.counts[2] /*C*/ + gccount.counts[4] /*G*/) / (float)
+                    (gccount.counts[1] /*A*/ + gccount.counts[8] /*T*/ +
+                        gccount.counts[2] + gccount.counts[4]));
+
+            for (cnt = 0; cnt < 16; ++cnt) {
+                fprintf(stdout, "%c: %"PRIu64"\n", seq_nt16_str[cnt], gccount.counts[cnt]);
+            }
+
+            ret = EXIT_SUCCESS;
+        }
+    }
+ end:
+    if (queue) {
+        hts_tpool_process_destroy(queue);
+    }
+
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        if (sam_close(infile) != 0) {
+            ret = EXIT_FAILURE;
+        }
+    }
+
+    //free the chunk that was never handed over to the queue, if any (read or dispatch failure)
+    cleanup_bamstorage(bamdata);
+    //free the cached chunks
+    pthread_mutex_lock(&bamcache.lock);
+    while (bamcache.list) {
+        struct data *tmp = bamcache.list;
+        bamcache.list = tmp->next;
+        cleanup_bamstorage(tmp);
+    }
+    pthread_mutex_unlock(&bamcache.lock);
+
+    if (pool) {
+        hts_tpool_destroy(pool);
+    }
+    return ret;
+}
diff --git a/samples/read_aux.c b/samples/read_aux.c
index cbf972b98..efd6f3651 100644
--- a/samples/read_aux.c
+++ b/samples/read_aux.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
@@ -92,7 +92,7 @@ int printauxdata(FILE *fp, char type, int32_t idx, const uint8_t *data)
         fprintf(fp, "%c", auxBType);
         for (i = 0; i < auxBcnt; ++i) {                                                     //iterate the array
             fprintf(fp, ",");
-            //calling recurssively  with index to reuse a few lines
+            //calling recursively  with index to reuse a few lines
             if (printauxdata(fp, auxBType, i, data) == EXIT_FAILURE) {
                 return EXIT_FAILURE;
             }
@@ -166,7 +166,7 @@ int main(int argc, char *argv[])
         else {
             //option 2 - get raw data
             if (!(data = bam_aux_get(bamdata, tag))) {
-                //tag data not returned, errono gives the reason
+                //tag data not returned, errno gives the reason
                 if (errno == ENOENT) {
                     printf("Tag not present\n");
                 }
diff --git a/samples/read_bam.c b/samples/read_bam.c
index 7fca8c55d..30bedf81c 100644
--- a/samples/read_bam.c
+++ b/samples/read_bam.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/read_fast.c b/samples/read_fast.c
index f74b25515..10f807b69 100644
--- a/samples/read_fast.c
+++ b/samples/read_fast.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - show flags_demo usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - show usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
@@ -83,6 +83,8 @@ int main(int argc, char *argv[])
 
     //read data
     while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+        printf("\nname: ");
+        printf("%s", bam_get_qname(bamdata));
         printf("\nsequence: ");
         for (c = 0; c < bamdata->core.l_qseq; ++c) {
             printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(bamdata), c)]);
@@ -90,10 +92,11 @@ int main(int argc, char *argv[])
         if (infile->format.format == fastq_format) {
             printf("\nquality: ");
             for (c = 0; c < bamdata->core.l_qseq; ++c) {
-                printf("%c", bam_get_qual(bamdata)[c]);
+                printf("%c", bam_get_qual(bamdata)[c] + 33);
             }
         }
     }
+    printf("\n");
     if (c != -1) {
         //error
         printf("Failed to get data\n");
diff --git a/samples/read_fast_index.c b/samples/read_fast_index.c
new file mode 100644
index 000000000..97076630a
--- /dev/null
+++ b/samples/read_fast_index.c
@@ -0,0 +1,163 @@
+/*  read_fast_index.c --  showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma <vasudeva.sarma@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
+
+#include <getopt.h>
+#include <unistd.h>
+#include <htslib/sam.h>
+#include <htslib/faidx.h>
+
+/// print_usage - write the command line help text
+/** @param fp stream (file / terminal) that receives the usage text
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fputs("Usage: read_fast_i <infile> A/Q 0/1 regiondef\n"
+          "Reads the fasta/fastq file using index and shows the content.\n"
+          "For fasta files use A and Q for fastq files.\n"
+          "Region can be 1 or more of <reference name>[:start-end] entries separated by comma.\n"
+          "For single region, give regcount as 0 and non 0 for multi-regions.\n",
+          fp);
+}
+
+/// main - entry point of the indexed fasta/fastq reading demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL, *region = NULL, *data = NULL, *remaining = NULL;
+    int ret = EXIT_FAILURE, tid = -1, usemulti = 0;
+    faidx_t *idx = NULL;
+    enum fai_format_options fmt = FAI_FASTA;
+    hts_pos_t len = 0, beg = 0, end = 0;
+
+    //expected invocation: read_fast_i infile A/Q regcount region
+    if (argc != 5) {
+        print_usage(stdout);
+        goto end;
+    }
+    inname = argv[1];
+    if (argv[2][0] == 'Q') {        //anything other than Q is treated as fasta
+        fmt = FAI_FASTQ;
+    }
+    usemulti = atoi(argv[3]);
+    region = argv[4];
+
+    //load index, FAI_CREATE is passed so a missing index can be built
+    if (!(idx = fai_load3_format(inname, NULL, NULL, FAI_CREATE, fmt))) {
+        printf("Failed to load index\n");
+        goto end;
+    }
+
+    if (!usemulti) {
+        //single region: fetch directly using the region string
+        if (!(data = fai_fetch64(idx, region, &len))) {
+            if (-1 == len) {
+                printf("Failed to get data\n");                 //failure
+                goto end;
+            }
+            else {
+                printf("Data not found for given region\n");    //no data
+            }
+        }
+        else {
+            printf("Data: %"PRIhts_pos" %s\n", len, data);      //len is hts_pos_t, print with its own macro
+            free((void*)data);
+            //get quality for fastq type
+            if (fmt == FAI_FASTQ) {
+                if (!(data = fai_fetchqual64(idx, region, &len))) {
+                    if (len == -1) {
+                        printf("Failed to get data\n");
+                        goto end;
+                    }
+                    else {
+                        printf("Data not found for given region\n");
+                    }
+                }
+                else {
+                    printf("Qual: %"PRIhts_pos" %s\n", len, data);
+                    free((void*)data);
+                }
+            }
+        }
+    }
+    else {
+        //parse, get each region and get data for each
+        while ((remaining = fai_parse_region(idx, region, &tid, &beg, &end, HTS_PARSE_LIST))) {     //here expects regions as csv
+            //parsed the region, correct end points based on actual data
+            if (fai_adjust_region(idx, tid, &beg, &end) == -1) {
+                printf("Error in adjusting region for tid %d\n", tid);
+                goto end;
+            }
+            //get data for given region
+            if (!(data = faidx_fetch_seq64(idx, faidx_iseq(idx, tid), beg, end, &len))) {
+                if (len == -1) {
+                    printf("Failed to get data\n");                 //failure
+                    goto end;
+                }
+                else {
+                    printf("No data found for given region\n");     //no data
+                }
+            }
+            else {
+                printf("Data: %"PRIhts_pos" %s\n", len, data);
+                free((void*)data);
+                data = NULL;
+
+                //get quality data for fastq
+                if (fmt == FAI_FASTQ) {
+                    if (!(data = faidx_fetch_qual64(idx, faidx_iseq(idx, tid), beg, end, &len))) {
+                        if (len == -1) {
+                            printf("Failed to get qual data\n");
+                            goto end;
+                        }
+                        else {
+                            printf("No data found for given region\n");
+                        }
+                    }
+                    else {
+                        printf("Qual: %"PRIhts_pos" %s\n", len, data);
+                        free((void*)data);
+                        data = NULL;
+                    }
+                }
+            }
+            region = remaining;                                     //continue with the rest of the region list
+        }
+    }
+
+    ret = EXIT_SUCCESS;
+end:
+    //clean up
+    if (idx) {
+        fai_destroy(idx);
+    }
+    return ret;
+}
diff --git a/samples/read_header.c b/samples/read_header.c
index eb14daea5..54b07e736 100644
--- a/samples/read_header.c
+++ b/samples/read_header.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/read_refname.c b/samples/read_refname.c
index adbc71183..9b4918ded 100644
--- a/samples/read_refname.c
+++ b/samples/read_refname.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/rem_header.c b/samples/rem_header.c
index a0b6510fb..852d5f055 100644
--- a/samples/rem_header.c
+++ b/samples/rem_header.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
@@ -124,7 +124,7 @@ int main(int argc, char *argv[])
     ret = EXIT_SUCCESS;
     //bam data write to follow....
 end:
-    //cleanupq
+    //cleanup
     if (in_samhdr) {
         sam_hdr_destroy(in_samhdr);
     }
diff --git a/samples/sample.bed b/samples/sample.bed
new file mode 100644
index 000000000..2ae458fd5
--- /dev/null
+++ b/samples/sample.bed
@@ -0,0 +1,4 @@
+T1	1	2
+T1	30	35
+T2	10	15
+T2	30	40
diff --git a/samples/sample.ref.fq b/samples/sample.ref.fq
new file mode 100644
index 000000000..18b2b9617
--- /dev/null
+++ b/samples/sample.ref.fq
@@ -0,0 +1,16 @@
+@T1
+AAAAACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTT
++
+AAAAACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTT
+@T2
+TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGT
++
+TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGT
+@T3
+TTTTGGGGACTGTTAACAGT
++
+TTTTGGGGACTGTTAACAGT
+@T4
+TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTTGGGGACTGTTAACAGT
++
+TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTTGGGGACTGTTAACAGT
diff --git a/samples/sample.sam b/samples/sample.sam
index e56efd69f..58515c976 100644
--- a/samples/sample.sam
+++ b/samples/sample.sam
@@ -9,7 +9,7 @@
 @CO	1234567890123456789012345678901234567890
 @CO	AAAAACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTT	T1
 @CO	TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGT	T2
-@CO	ITR1-ITR2M, ITR2-ITR2M are proper pairs in T1 and T2, UNMP1 is partly mapped and pair is unmapped, UNMP2 & 3 are unmappped
+@CO	ITR1-ITR2M, ITR2-ITR2M are proper pairs in T1 and T2, UNMP1 is partly mapped and pair is unmapped, UNMP2 & 3 are unmapped
 @CO	A1-A2, A4-A3 are proper pairs with A4-A3 in different read order. A5 is secondary alignment
 ITR1	99	T1	5	40	4M	=	33	10	ACTG	()()
 ITR2	147	T2	23	49	2M	=	35	-10	TT	**
diff --git a/samples/split.c b/samples/split.c
index 2eb9e6b79..c51dbd385 100644
--- a/samples/split.c
+++ b/samples/split.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/split2.c b/samples/split2.c
index 2354abfe3..33fabbd67 100644
--- a/samples/split2.c
+++ b/samples/split2.c
@@ -24,19 +24,19 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
 {
-    fprintf(fp, "Usage: split infile outdir\n\
+    fprintf(fp, "Usage: split2 infile outdir\n\
 Splits the input file alignments to read1 and read2 and saves as 1.sam and 2.bam in given directory\n\
 Shows file type selection through name and format api\n");
     return;
diff --git a/samples/split_thread1.c b/samples/split_thread1.c
index 40d2dfdc2..551c7f093 100644
--- a/samples/split_thread1.c
+++ b/samples/split_thread1.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
@@ -94,9 +94,9 @@ int main(int argc, char *argv[])
     }
 
     //create file specific threads
-    if (hts_set_opt(infile, HTS_OPT_NTHREADS, 2) < 0 ||     //2 thread specific for reading
+    if (hts_set_opt(infile, HTS_OPT_NTHREADS, 1) < 0 ||     //1 thread specific for reading
     hts_set_opt(outfile1, HTS_OPT_NTHREADS, 1) < 0 ||       //1 thread specific for sam write
-    hts_set_opt(outfile2, HTS_OPT_NTHREADS, 1) < 0) {       //1 thread specific for bam write
+    hts_set_opt(outfile2, HTS_OPT_NTHREADS, 2) < 0) {       //2 thread specific for bam write
         printf("Failed to set thread options\n");
         goto end;
     }
diff --git a/samples/split_thread2.c b/samples/split_thread2.c
index dab897b5f..dc8bc9f31 100644
--- a/samples/split_thread2.c
+++ b/samples/split_thread2.c
@@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 #include <htslib/thread_pool.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/update_header.c b/samples/update_header.c
index f6b1680cd..237d5c4df 100644
--- a/samples/update_header.c
+++ b/samples/update_header.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/write_fast.c b/samples/write_fast.c
index ef7817683..95d919fd0 100644
--- a/samples/write_fast.c
+++ b/samples/write_fast.c
@@ -24,19 +24,21 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
+#include <time.h>
 #include <htslib/sam.h>
+#include <htslib/faidx.h>
 
-/// print_usage - show flags_demo usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - show usage (output file, sequence and optional qualities)
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
 {
-    fprintf(fp, "Usage: write_fast <file>\n\
+    fprintf(fp, "Usage: write_fast <file> <sequence> [<qualities>]\n\
 Appends a fasta/fastq file.\n");
     return;
 }
@@ -51,15 +53,24 @@ int main(int argc, char *argv[])
     const char *outname = NULL;             //output file name
     int ret = EXIT_FAILURE;
     samFile *outfile = NULL;                //sam file
-    sam_hdr_t *out_samhdr = NULL;           //header of file
     bam1_t *bamdata = NULL;                 //to hold the read data
     char mode[4] = "a";
+    const char *data = NULL, *qual = NULL;  //ref data and quality
+    char name[256] = {0};
 
-    if (argc != 2) {
+    if (argc > 4 || argc < 3) {
         print_usage(stdout);
         goto end;
     }
     outname = argv[1];
+    data = argv[2];
+    if (argc == 4) {    //fastq data
+        qual = argv[3];
+        if (strlen(data) != strlen(qual)) {     //check for proper length of data and quality values
+            printf("Incorrect reference and quality data\n");
+            goto end;
+        }
+    }
 
     //initialize
     if (!(bamdata = bam_init1())) {
@@ -71,26 +82,30 @@ int main(int argc, char *argv[])
         goto end;
     }
     //open output file
-    if (!(outfile = sam_open(outname, mode))) {
+    if (!(outfile = sam_open(outname, mode))) {         //expects the name to have correct extension!
         printf("Could not open %s\n", outname);
         goto end;
     }
-    //dummy data
-    if (bam_set1(bamdata, sizeof("test"), "test", BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, 10, "AACTGACTGA", "1234567890", 0) < 0) {
+    /* if the file name extension is not appropriate to the content, inconsistent data will be present in output.
+    if required, htsFormat and sam_open_format can be explicitly used to ensure appropriateness of content.
+    htsFormat fmt = {sequence_data, fastq_format / fasta_format};
+    sam_open_format(outname, mode, fmt);
+    */
+
+    snprintf(name, sizeof(name), "Test_%ld", (long) time(NULL));
+    //data
+    if (bam_set1(bamdata, strlen(name), name, BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, strlen(data), data, qual, 0) < 0) {
         printf("Failed to set data\n");
         goto end;
     }
-    if (sam_write1(outfile, out_samhdr, bamdata) < 0) {
+    //as we write only FASTA/FASTQ, we can get away without providing headers
+    if (sam_write1(outfile, NULL, bamdata) < 0) {
         printf("Failed to write data\n");
         goto end;
     }
-
     ret = EXIT_SUCCESS;
 end:
     //clean up
-    if (out_samhdr) {
-        sam_hdr_destroy(out_samhdr);
-    }
     if (outfile) {
         sam_close(outfile);
     }
diff --git a/simd.c b/simd.c
new file mode 100644
index 000000000..865dd887e
--- /dev/null
+++ b/simd.c
@@ -0,0 +1,222 @@
+/*  simd.c -- SIMD optimised versions of various internal functions.
+
+    Copyright (C) 2024 Genome Research Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
+#include <config.h>
+
+// These must be defined before the first system include to ensure that legacy
+// BSD types needed by <sys/sysctl.h> remain defined when _XOPEN_SOURCE is set.
+#if defined __APPLE__
+#define _DARWIN_C_SOURCE
+#elif defined __NetBSD__
+#define _NETBSD_SOURCE
+#endif
+
+#include "htslib/sam.h"
+#include "sam_internal.h"
+
+#if defined __x86_64__
+#include <immintrin.h>
+#elif defined __ARM_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined __arm__ || defined __aarch64__
+
+#if defined __linux__ || defined __FreeBSD__
+#include <sys/auxv.h>
+#elif defined __APPLE__
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#elif defined __NetBSD__
+#include <stddef.h>
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#ifdef __aarch64__
+#include <aarch64/armreg.h>
+#else
+#include <arm/armreg.h>
+#endif
+#elif defined _WIN32
+#include <processthreadsapi.h>
+#endif
+
+static inline int cpu_supports_neon(void) { /* Non-zero if the OS reports NEON / Advanced SIMD for this CPU, 0 if not or if it cannot be determined */
+#if defined __linux__ && defined __arm__ && defined HWCAP_NEON
+    return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0;        // Linux, 32-bit ARM: test the NEON bit of AT_HWCAP
+#elif defined __linux__ && defined __arm__ && defined HWCAP_ARM_NEON
+    return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0;    // same check, for headers that name the bit HWCAP_ARM_NEON
+#elif defined __linux__ && defined __aarch64__ && defined HWCAP_ASIMD
+    return (getauxval(AT_HWCAP) & HWCAP_ASIMD) != 0;       // Linux, AArch64: test the ASIMD bit of AT_HWCAP
+#elif defined __APPLE__ && defined __aarch64__
+    int32_t ctl;                                           // receives the sysctl value
+    size_t ctlsize = sizeof ctl;
+    if (sysctlbyname("hw.optional.AdvSIMD", &ctl, &ctlsize, NULL, 0) != 0) return 0;  // query failed: report no support
+    if (ctlsize != sizeof ctl) return 0;                   // unexpected result size: do not trust the value
+    return ctl;
+#elif defined __FreeBSD__ && defined __arm__ && defined HWCAP_NEON
+    unsigned long cap;                                     // FreeBSD, 32-bit ARM: AT_HWCAP read via elf_aux_info()
+    if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0;
+    return (cap & HWCAP_NEON) != 0;
+#elif defined __FreeBSD__ && defined __aarch64__ && defined HWCAP_ASIMD
+    unsigned long cap;                                     // FreeBSD, AArch64: AT_HWCAP read via elf_aux_info()
+    if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0;
+    return (cap & HWCAP_ASIMD) != 0;
+#elif defined __NetBSD__ && defined __arm__ && defined ARM_MVFR0_ASIMD_MASK
+    uint32_t buf[16];                                      // NetBSD, 32-bit ARM: only buf[0] is examined below
+    size_t buflen = sizeof buf;
+    if (sysctlbyname("machdep.id_mvfr", buf, &buflen, NULL, 0) != 0) return 0;
+    if (buflen < sizeof(uint32_t)) return 0;               // need at least the first word
+    return (buf[0] & ARM_MVFR0_ASIMD_MASK) == 0x00000002;  // NOTE(review): presumably the field value meaning "ASIMD present" - confirm against armreg.h
+#elif defined __NetBSD__ && defined __aarch64__ && defined ID_AA64PFR0_EL1_ADVSIMD
+    struct aarch64_sysctl_cpu_id buf;                      // NetBSD, AArch64: ID registers of cpu0
+    size_t buflen = sizeof buf;
+    if (sysctlbyname("machdep.cpu0.cpu_id", &buf, &buflen, NULL, 0) != 0) return 0;
+    if (buflen < offsetof(struct aarch64_sysctl_cpu_id, ac_aa64pfr0) + sizeof(uint64_t)) return 0;  // result must include ac_aa64pfr0
+    return (buf.ac_aa64pfr0 & ID_AA64PFR0_EL1_ADVSIMD & 0x00e00000) == 0;  // support is reported when the masked bits are all zero
+#elif defined _WIN32
+    return IsProcessorFeaturePresent(PF_ARM_V8_INSTRUCTIONS_AVAILABLE) != 0;  // Windows: uses the ARMv8 instructions feature flag
+#else
+    return 0;                                              // no known detection method: report no support
+#endif
+}
+
+#endif
+
+#ifdef BUILDING_SIMD_NIBBLE2BASE
+
+void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default;
+
+#if defined __x86_64__
+
+/*
+ * Convert a nibble encoded BAM sequence to a string of bases.
+ *
+ * Using SSSE3 instructions, 16 codepoints that hold 2 bases each can be
+ * unpacked into 32 indexes from 0-15. Using the pshufb instruction these can
+ * be converted to the IUPAC characters.
+ * It falls back on the nibble2base_default function for the remainder.
+ */
+
+__attribute__((target("ssse3")))
+static void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
+    const char *seq_end_ptr = seq + len;
+    char *seq_cursor = seq;
+    uint8_t *nibble_cursor = nib;
+    const int bases_per_iter = 2 * (int) sizeof(__m128i); // 16 input bytes -> 32 bases
+    __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str);
+    /* Nucleotides are encoded 4-bits per nucleotide and stored in 8-bit bytes
+       as follows: |AB|CD|EF|GH|. The 4-bit codes (0-15) index the pshufb
+       lookup table. Bitwise AND and shift give two vectors, one with the
+       upper codes (|A|C|E|G|) and one with the lower codes (|B|D|F|H|); after
+       the lookup they are interleaved again using the unpack instructions.
+       The loop test compares the remaining length rather than a precomputed
+       "end - 31" pointer, which would point before seq for short inputs. */
+    while (seq_end_ptr - seq_cursor >= bases_per_iter) {
+        __m128i encoded = _mm_lddqu_si128((__m128i *)nibble_cursor);
+        __m128i encoded_upper = _mm_srli_epi64(encoded, 4);
+        encoded_upper = _mm_and_si128(encoded_upper, _mm_set1_epi8(15));
+        __m128i encoded_lower = _mm_and_si128(encoded, _mm_set1_epi8(15));
+        __m128i nucs_upper = _mm_shuffle_epi8(nuc_lookup_vec, encoded_upper);
+        __m128i nucs_lower = _mm_shuffle_epi8(nuc_lookup_vec, encoded_lower);
+        __m128i first_nucleotides = _mm_unpacklo_epi8(nucs_upper, nucs_lower);
+        __m128i second_nucleotides = _mm_unpackhi_epi8(nucs_upper, nucs_lower);
+        _mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides);
+        _mm_storeu_si128((__m128i *)(seq_cursor + sizeof(__m128i)),
+                         second_nucleotides);
+        nibble_cursor += sizeof(__m128i);
+        seq_cursor += 2 * sizeof(__m128i);
+    }
+    nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor);  // scalar fallback for the remaining < 32 bases
+}
+
+__attribute__((constructor))
+static void nibble2base_resolve(void) {
+    // Constructor: leave htslib_nibble2base untouched unless SSSE3 is available.
+    if (!__builtin_cpu_supports("ssse3")) return;
+    htslib_nibble2base = nibble2base_ssse3;
+}
+
+#elif defined __ARM_NEON
+
+static void nibble2base_neon(uint8_t *nib, char *seq0, int len) {  // NEON nibble-to-base decoder: writes len characters to seq0
+    uint8x16_t low_nibbles_mask = vdupq_n_u8(0x0f);
+    uint8x16_t nuc_lookup_vec = vld1q_u8((const uint8_t *) seq_nt16_str);  // 16-entry code -> character table
+#ifndef __aarch64__
+    uint8x8x2_t nuc_lookup_vec2 = {{ vget_low_u8(nuc_lookup_vec), vget_high_u8(nuc_lookup_vec) }};  // same table as two 8-byte halves for vtbl2_u8
+#endif
+    // Each iteration below consumes 16 packed bytes and produces 32 characters.
+    uint8_t *seq = (uint8_t *) seq0;
+    int blocks;
+
+    for (blocks = len / 32; blocks > 0; --blocks) {
+        uint8x16_t encoded = vld1q_u8(nib);
+        nib += 16;
+
+        /* Translate the high and low nibbles to nucleotide letters separately,
+           then interleave them back together (high nibble first) for writing. */
+
+        uint8x16_t high_nibbles = vshrq_n_u8(encoded, 4);
+        uint8x16_t low_nibbles  = vandq_u8(encoded, low_nibbles_mask);
+
+#ifdef __aarch64__
+        uint8x16_t high_nucleotides = vqtbl1q_u8(nuc_lookup_vec, high_nibbles);
+        uint8x16_t low_nucleotides  = vqtbl1q_u8(nuc_lookup_vec, low_nibbles);
+#else
+        uint8x8_t high_low  = vtbl2_u8(nuc_lookup_vec2, vget_low_u8(high_nibbles));   // 32-bit ARM: look up 8 lanes at a time
+        uint8x8_t high_high = vtbl2_u8(nuc_lookup_vec2, vget_high_u8(high_nibbles));
+        uint8x16_t high_nucleotides = vcombine_u8(high_low, high_high);
+
+        uint8x8_t low_low  = vtbl2_u8(nuc_lookup_vec2, vget_low_u8(low_nibbles));
+        uint8x8_t low_high = vtbl2_u8(nuc_lookup_vec2, vget_high_u8(low_nibbles));
+        uint8x16_t low_nucleotides = vcombine_u8(low_low, low_high);
+#endif
+
+#ifdef __aarch64__
+        vst1q_u8_x2(seq, vzipq_u8(high_nucleotides, low_nucleotides));
+#else
+        // Avoid vst1q_u8_x2 as GCC erroneously omits it on 32-bit ARM
+        uint8x16x2_t nucleotides = {{ high_nucleotides, low_nucleotides }};
+        vst2q_u8(seq, nucleotides);   // vst2q stores the two vectors interleaved
+#endif
+        seq += 32;
+    }
+
+    if (len % 32 != 0)   // fewer than 32 bases left: finish with the scalar routine
+        nibble2base_default(nib, (char *) seq, len % 32);
+}
+
+static __attribute__((constructor)) void nibble2base_resolve(void) {  // constructor: selects the decoder once at start-up
+    if (cpu_supports_neon()) htslib_nibble2base = nibble2base_neon;    // otherwise the existing htslib_nibble2base value is kept
+}
+
+#endif
+
+#endif // BUILDING_SIMD_NIBBLE2BASE
+
+// Potentially useful diagnostic, and prevents "empty translation unit" errors
+const char htslib_simd[] =
+    "SIMD functions present:"
+#ifdef BUILDING_SIMD_NIBBLE2BASE
+    " nibble2base"
+#endif
+    ".";
diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c
index a43ab15ae..1835ea2d6 100644
--- a/synced_bcf_reader.c
+++ b/synced_bcf_reader.c
@@ -25,6 +25,7 @@ DEALINGS IN THE SOFTWARE.  */
 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
 #include <config.h>
 
+#include <stdlib.h>
 #include <assert.h>
 #include <stdio.h>
 #include <unistd.h>
@@ -71,6 +72,7 @@ typedef struct
 }
 aux_t;
 
+static bcf_sr_regions_t *bcf_sr_regions_alloc(void);
 static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end);
 static bcf_sr_regions_t *_regions_init_string(const char *str);
 static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec);
@@ -368,13 +370,22 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
     if ( !files->explicit_regs && !files->streaming )
     {
         int n = 0, i;
-        const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
-        for (i=0; i<n; i++)
+        const char **names;
+
+        if ( !files->regions )
         {
+            files->regions = bcf_sr_regions_alloc();
             if ( !files->regions )
-                files->regions = _regions_init_string(names[i]);
-            else
-                _regions_add(files->regions, names[i], -1, -1);
+            {
+                hts_log_error("Cannot allocate regions data structure");
+                return 0;
+            }
+        }
+
+        names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
+        for (i=0; i<n; i++)
+        {
+            _regions_add(files->regions, names[i], -1, -1);
         }
         free(names);
         _regions_sort_and_merge(files->regions);
@@ -532,7 +543,7 @@ static int _reader_seek(bcf_sr_t *reader, const char *seq, hts_pos_t start, hts_
     }
     if (!reader->itr) {
         hts_log_error("Could not seek: %s:%"PRIhts_pos"-%"PRIhts_pos, seq, start + 1, end + 1);
-        assert(0);
+        abort();
     }
     return 0;
 }
@@ -956,6 +967,17 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
     return 1;
 }
 
+// Allocate a new, zero-filled region list structure; returns NULL if calloc() fails.
+static bcf_sr_regions_t *bcf_sr_regions_alloc(void)
+{
+    bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
+    if ( !reg ) return NULL;  // allocation failed, caller must check for NULL
+
+    reg->start = reg->end = -1;  // all other fields keep their calloc() zero value
+    reg->prev_start = reg->prev_end = reg->prev_seq = -1;
+    return reg;
+}
+
 // Add a new region into a list. On input the coordinates are 1-based, inclusive, then stored 0-based,
 // inclusive. Sorting and merging step needed afterwards: qsort(..,cmp_regions) and merge_regions().
 static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end)
@@ -1037,9 +1059,8 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg)
 // wouldn't learn the chromosome name.
 static bcf_sr_regions_t *_regions_init_string(const char *str)
 {
-    bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
-    reg->start = reg->end = -1;
-    reg->prev_start = reg->prev_end = reg->prev_seq = -1;
+    bcf_sr_regions_t *reg = bcf_sr_regions_alloc();
+    if ( !reg ) return NULL;
 
     kstring_t tmp = {0,0,0};
     const char *sp = str, *ep = str;
@@ -1189,9 +1210,8 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr
         return reg;
     }
 
-    reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
-    reg->start = reg->end = -1;
-    reg->prev_start = reg->prev_end = reg->prev_seq = -1;
+    reg = bcf_sr_regions_alloc();
+    if ( !reg ) return NULL;
 
     reg->file = hts_open(regions, "rb");
     if ( !reg->file )
diff --git a/tabix.1 b/tabix.1
index 9bf1d6891..f0dc7b519 100644
--- a/tabix.1
+++ b/tabix.1
@@ -1,4 +1,4 @@
-.TH tabix 1 "15 April 2024" "htslib-1.20" "Bioinformatics tools"
+.TH tabix 1 "12 September 2024" "htslib-1.21" "Bioinformatics tools"
 .SH NAME
 .PP
 tabix \- Generic indexer for TAB-delimited genome position files
diff --git a/tbx.c b/tbx.c
index 5f861299a..662500549 100644
--- a/tbx.c
+++ b/tbx.c
@@ -229,8 +229,11 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_
             case TBX_UCSC: type = "TBX_UCSC"; break;
             default: type = "TBX_GENERIC"; break;
         }
-        hts_log_error("Failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"",
-            type, str->s);
+        if (hts_is_utf16_text(str))
+            hts_log_error("Failed to parse %s: offending line appears to be encoded as UTF-16", type);
+        else
+            hts_log_error("Failed to parse %s: was wrong -p [type] used?\nThe offending line was: \"%s\"",
+                type, str->s);
         return -1;
     }
 }
@@ -321,7 +324,7 @@ static void adjust_max_ref_len_sam(const char *str, int64_t *max_ref_len)
 // files with very large contigs.
 static int adjust_n_lvls(int min_shift, int n_lvls, int64_t max_len)
 {
-    int64_t s = 1LL << (min_shift + n_lvls * 3);
+    int64_t s = hts_bin_maxpos(min_shift, n_lvls);
     max_len += 256;
     for (; max_len > s; ++n_lvls, s <<= 3) {}
     return n_lvls;
diff --git a/test/annot-tsv/dst.11.txt b/test/annot-tsv/dst.11.txt
new file mode 100644
index 000000000..c54ad153a
--- /dev/null
+++ b/test/annot-tsv/dst.11.txt
@@ -0,0 +1,5 @@
+#ignore me
+#chr	beg	end	smpl
+1	10	20	A
+1	30	40	A
+1	50	60	A
diff --git a/test/annot-tsv/dst.12.txt b/test/annot-tsv/dst.12.txt
new file mode 100644
index 000000000..9b26b79af
--- /dev/null
+++ b/test/annot-tsv/dst.12.txt
@@ -0,0 +1,5 @@
+#ignore me
+#chr,beg,end,smpl
+1,10,20,A
+1,30,40,A
+1,50,60,A
diff --git a/test/annot-tsv/out.11.1.txt b/test/annot-tsv/out.11.1.txt
new file mode 100644
index 000000000..3de1f68ee
--- /dev/null
+++ b/test/annot-tsv/out.11.1.txt
@@ -0,0 +1,3 @@
+1	10	20	A	A
+1	30	40	A	B
+1	50	60	A	.
diff --git a/test/annot-tsv/out.11.2.txt b/test/annot-tsv/out.11.2.txt
new file mode 100644
index 000000000..a863f4e61
--- /dev/null
+++ b/test/annot-tsv/out.11.2.txt
@@ -0,0 +1,4 @@
+#[1]chr	[2]beg	[3]end	[4]smpl	[5]src_smpl
+1	10	20	A	A
+1	30	40	A	B
+1	50	60	A	.
diff --git a/test/annot-tsv/out.11.3.txt b/test/annot-tsv/out.11.3.txt
new file mode 100644
index 000000000..7a37130db
--- /dev/null
+++ b/test/annot-tsv/out.11.3.txt
@@ -0,0 +1,4 @@
+#chr	beg	end	smpl	src_smpl
+1	10	20	A	A
+1	30	40	A	B
+1	50	60	A	.
diff --git a/test/annot-tsv/out.12.1.txt b/test/annot-tsv/out.12.1.txt
new file mode 100644
index 000000000..7b6d0e994
--- /dev/null
+++ b/test/annot-tsv/out.12.1.txt
@@ -0,0 +1,3 @@
+1,10,20,A,A
+1,30,40,A,B
+1,50,60,A,.
diff --git a/test/annot-tsv/out.13.1.txt b/test/annot-tsv/out.13.1.txt
new file mode 100644
index 000000000..a1bf0be68
--- /dev/null
+++ b/test/annot-tsv/out.13.1.txt
@@ -0,0 +1,2 @@
+1	10	20	long	long,short
+1	15	15	short	long,short
diff --git a/test/annot-tsv/out.13.2.txt b/test/annot-tsv/out.13.2.txt
new file mode 100644
index 000000000..7c543b134
--- /dev/null
+++ b/test/annot-tsv/out.13.2.txt
@@ -0,0 +1,2 @@
+1	10	20	long	long
+1	15	15	short	short
diff --git a/test/annot-tsv/out.13.3.txt b/test/annot-tsv/out.13.3.txt
new file mode 100644
index 000000000..8911afad8
--- /dev/null
+++ b/test/annot-tsv/out.13.3.txt
@@ -0,0 +1,2 @@
+1	10	20	long	long
+1	15	15	short	long,short
diff --git a/test/annot-tsv/out.13.4.txt b/test/annot-tsv/out.13.4.txt
new file mode 100644
index 000000000..f7a0e4d88
--- /dev/null
+++ b/test/annot-tsv/out.13.4.txt
@@ -0,0 +1,2 @@
+1	10	20	long	long,short
+1	15	15	short	short
diff --git a/test/annot-tsv/src.11.txt b/test/annot-tsv/src.11.txt
new file mode 100644
index 000000000..26eb20be6
--- /dev/null
+++ b/test/annot-tsv/src.11.txt
@@ -0,0 +1,5 @@
+#ignore me
+#chr1	beg1	end1	smpl1
+#chr2	beg2	end2	smpl2
+1	10	20	A
+1	30	40	B
diff --git a/test/annot-tsv/src.12.txt b/test/annot-tsv/src.12.txt
new file mode 100644
index 000000000..9b7ac367c
--- /dev/null
+++ b/test/annot-tsv/src.12.txt
@@ -0,0 +1,5 @@
+#ignore me
+#chr1,beg1,end1,smpl1
+#chr2,beg2,end2,smpl2
+1,10,20,A
+1,30,40,B
diff --git a/test/annot-tsv/src.13.txt b/test/annot-tsv/src.13.txt
new file mode 100644
index 000000000..de3338de1
--- /dev/null
+++ b/test/annot-tsv/src.13.txt
@@ -0,0 +1,2 @@
+1	10	20	long
+1	15	15	short
diff --git a/test/base_mods/MM-explicit.sam b/test/base_mods/MM-explicit.sam
index e85afa293..c230a9d82 100644
--- a/test/base_mods/MM-explicit.sam
+++ b/test/base_mods/MM-explicit.sam
@@ -19,7 +19,7 @@
 @CO	ATCATCATTCCTACCGCTATAGCCT  r3; mixture
 @CO	  -  -   .   -. -     --
 @CO	         M    M
-@CO       -  -   ??  ?? ?     --
+@CO	  -  -   ??  ?? ?     --
 @CO	         hH  hh h     --
 @CO	
 r1	0	*	0	0	*	*	0	0	ATCATCATTCCTACCGCTATAGCCT	*	Mm:Z:C+mh,2,0,1;	Ml:B:C,200,10,50,170,160,20
diff --git a/test/bgzf_boundaries/bgzf_boundaries1.bam b/test/bgzf_boundaries/bgzf_boundaries1.bam
new file mode 100644
index 000000000..264e22fad
Binary files /dev/null and b/test/bgzf_boundaries/bgzf_boundaries1.bam differ
diff --git a/test/bgzf_boundaries/bgzf_boundaries2.bam b/test/bgzf_boundaries/bgzf_boundaries2.bam
new file mode 100644
index 000000000..704804eaf
Binary files /dev/null and b/test/bgzf_boundaries/bgzf_boundaries2.bam differ
diff --git a/test/bgzf_boundaries/bgzf_boundaries3.bam b/test/bgzf_boundaries/bgzf_boundaries3.bam
new file mode 100644
index 000000000..328a27451
Binary files /dev/null and b/test/bgzf_boundaries/bgzf_boundaries3.bam differ
diff --git a/test/header_syms.pl b/test/header_syms.pl
index fe5128a78..a8d4a885c 100755
--- a/test/header_syms.pl
+++ b/test/header_syms.pl
@@ -60,6 +60,7 @@ sub extract_symbols {
 
     open(my $f, '<', $file) || die "Couldn't open $file : $!\n";
     my $text = <$f>;
+    $text =~ tr/\r//d;
     close($f) || die "Error reading $file : $!\n";
 
     # Get rid of comments
diff --git a/test/hfile.c b/test/hfile.c
index 8f06a971f..741cf7a8d 100644
--- a/test/hfile.c
+++ b/test/hfile.c
@@ -35,7 +35,8 @@ DEALINGS IN THE SOFTWARE.  */
 #include "../htslib/hts_defs.h"
 #include "../htslib/kstring.h"
 
-void HTS_NORETURN fail(const char *format, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN
+fail(const char *format, ...)
 {
     int err = errno;
     va_list args;
diff --git a/test/plugins-dlhts.c b/test/plugins-dlhts.c
index f90e3bd74..33f432fbd 100644
--- a/test/plugins-dlhts.c
+++ b/test/plugins-dlhts.c
@@ -177,7 +177,7 @@ int main(int argc, char **argv)
 
 #else
 
-int main()
+int main(void)
 {
     printf("Tests skipped due to " SKIP "\n");
     return EXIT_SUCCESS;
diff --git a/test/range.out2 b/test/range.out2
new file mode 100644
index 000000000..22e6fd542
--- /dev/null
+++ b/test/range.out2
@@ -0,0 +1,21 @@
+@HD	VN:1.4	SO:coordinate
+@RG	ID:1	PL:ILLUMINA	PU:130410_HS18_09653_A_C1JT2ACXX_4	LB:7053878	DT:2013-04-10T00:00:00+0100	SM:ERS225193	CN:SC
+@SQ	SN:CHROMOSOME_I	LN:1009800	M5:8ede36131e0dbf3417807e48f77f3ebd	UR:/
+@SQ	SN:CHROMOSOME_II	LN:5000	M5:8e7993f7a93158587ee897d7287948ec	UR:/
+@SQ	SN:CHROMOSOME_III	LN:5000	M5:3adcb065e1cf74fafdbba1e8c352b323	UR:/
+@SQ	SN:CHROMOSOME_IV	LN:5000	M5:251af66a69ee589c9f3757340ec2de6f	UR:/
+@SQ	SN:CHROMOSOME_V	LN:5000	M5:cf200a65fb754836dcc56b24b3170ee8	UR:/
+@SQ	SN:CHROMOSOME_X	LN:5000	M5:6f9368fd2192c89c613718399d2d31fc	UR:/
+@SQ	SN:CHROMOSOME_MtDNA	LN:5000	M5:cd05857ece6411f40257a565ccfe15bb	UR:/
+@PG	ID:scramble	PN:scramble	VN:1.14.7	CL:scramble -M -I sam -s 50 -r /tmp/ce.fa - /tmp/ERR304769_subset.cram
+HS18_09653:4:2108:14085:93656	147	CHROMOSOME_I	1122	60	100M	=	756	-466	AATTTGCAAGAAAATTCGCAAGAAATTTGTATTAAAAACTGTTCAAAATTTTTGGAAATTAGTTTAAAAATCTCACATTTTTTTTAGAAAAATTATTTTT	GEFGHHFHEGGIFEFHFH<HHGGEFIAHEEFGEHFHFDFGDHG@HGGFFIIHHG8HICFBCEGICHEGIBHEHH;CGGFDGGJFFHGGDGGFFFEGDDE?	X0:i:1	X1:i:0	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2302:10109:87885	147	CHROMOSOME_II	1136	60	100M	=	734	-502	TGATTCATTTTATATTCTATATACTCATGTAATATGCCCATGTAAGGTTTAATTCCAAAAATATGAGCGTGTTCTATTTTATAATATTTTACTAAAATAC	GFGFEEGBEHH8BEFHCGGFEF.G:GB9FBFGEA@FB:<GF<G8FDFCGFCHBEHHDEEFDBGFCFHIEBCDE-ACGGGIFDDFFEFCHA=FDFAGDCC?	X0:i:1	X1:i:0	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2108:10782:59721	163	CHROMOSOME_II	1241	60	100M	=	1366	225	AGTTAATTGCACTCAAATTTGTTGTTCTTCATTCTCTCGTTATGATTTAATCTTATTGCGTCAAGGTCATTATTTTAGGTCCATTAGTTATCGATCTGAA	?EDDGEFFFGGHIIHFGIHGHEHDHGDEHHGHGEHGHGHHGGHIGGGIGGIGIGGHHGHHGHHFFIHJGFHGHHGLGHGHGGHFKEHGIGGH@FHIFGFG	X0:i:1	X1:i:0	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:1107:10640:32305	99	CHROMOSOME_II	1267	60	100M	=	1778	611	CTTCATTCTCTCGTTATGATTTAATCTTATTGCGTCAAGGTCATTATTTTAGGTCCATTAGTTATCGATCTGAAACATGTTGTTGTATTTTTCTATTCTT	D?EFDEGGFFGEGHGGGHIGGGHHEGHIFGGEGFCGHFHGCHGGGHFKIHGGGGHFHFFHFFGGGGHGHHGHGFFHGGGHHGFGHGFGHFGGFHHBHGFF	X0:i:1	X1:i:0	BC:Z:GTNTGCNG	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	QT:Z:<<!2@@!2	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2213:16876:56957	99	CHROMOSOME_II	1326	60	100M	=	1651	425	AGTTATCGATCTGAAACATGTTGTTGTATTTTTCTATTCTTGTGAGCTCAGGACACCTCATACAACTCCAGAGAAAATGTGTCTCATTATTCTTGTCTTT	BCEDDD:AAFGDG<F2DGHHGGGHDECGGG@GGFD/HGFGCB0GEEFGDHGHEGHFGHCFGHGFGHFHHHDFGFGHGEF.F-HGGBEGFGGGFHHGHEEI	X0:i:1	X1:i:0	BC:Z:GTNTGCNG	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	QT:Z:<<!2@@!4	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2211:18838:86627	147	CHROMOSOME_II	1345	60	100M	=	828	-617	GTTGTTGTATTTTTCTATTCTTGTGAGCTCAGGACACCTCATACAACTCCAGAGAAAATGTGTCTCATTATTCTTGTCTTTTTTCAAGATCTAATCAATT	<GFDIEGH@FGDBFFEFFGFGAGHHAFH@HHGFHIHFGHGHG:CH:GCFHGGHGHGFEHGCJEJCGHGIGEEEFGFFGBHGHGEFHFCFGGG>FECDDE?	X0:i:1	X1:i:0	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2314:21094:58611	99	CHROMOSOME_II	1353	60	100M	=	1775	522	ATTTTTCTATTCTTGTGAGCTCAGGACACCTCATACAACTCCAGAGAAAATGTGTCTCATTATTCTTGTCTTTTTTCAAGATCTAATCAATTTTCTACAT	D;?FBD9CDBGBGG?GF8DFGFFHDACDGFGGD/HGHHGFFEFGD=FGIG0D.GH7HHFFGFDGGFF:HFDGGHGGGGE;F:@GGEGGCFGFGHHB@FHG	X0:i:1	X1:i:0	BC:Z:GTNTGCCG	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	QT:Z:=?!4AD22	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2108:10782:59721	83	CHROMOSOME_II	1366	60	100M	=	1241	-225	TGTGAGCTCAGGACACCTCATACAACTCCAGAGAAAATGTGTCTCATTATTCTTGTCTTTTTTCAAGATCTAATCAATTTTCTACATTAACGACGTTTTT	IFGHDHHFFGHIIEGGGHEHHJGGGFGGHFHHGFGGGGGGHHDHFHGIF=IFIFHIGIHGHF=HGJGGGFGGGHEEHGFGGFGEGGGGEGFFGGGFEBCD	X0:i:1	X1:i:0	BC:Z:GTNTGCCG	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	QT:Z:=?!4AD+2	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2111:5602:28724	99	CHROMOSOME_II	1416	60	100M	=	1881	565	TCTTGTCTTTTTTCAAGATCTAATCAATTTTCTACATTAACGACGTTTTTGTCGTTCTGCTTCTTTTTTTCGTTCGTTTGTCTCGTCCATCAGCTGTCCA	ECE>EGGGGFGGGGDGFEFGGGFHEGHGIIFGFEJGHHFGGGHFGEFHIHGFFGGECGFHHGGFGHIHHHGEGGHBGBGHHEHGEBGGFFGFFHHGCGFF	X0:i:1	X1:i:0	BC:Z:GTNTGCCG	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	QT:Z:=@!4AD24	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2103:6720:15025	99	CHROMOSOME_II	1459	60	100M	=	1617	258	CGTTTTTGTCGTTCTGCTTCTTTTTTTCGTTCGTTTGTCTCGTCCATCAGCTGTCCACTCATTTCTCTCCCACTCACTAGGCAGTGCTTTGTTTGGTTCC	ECEFFGGGEHGEGGGGGGHFGGGHIGHIGGGG?HFGHGEGFBFGGGFGIHGDGGDEDFF<FGGGGHFGGFFAGEGBGGCHFEFGGGEHEHGDGF:FFFFC	X0:i:1	X1:i:0	BC:Z:GTNTGCGG	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	QT:Z:=?!4AD22	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:1316:7415:40818	99	CHROMOSOME_II	1536	60	100M	=	1839	403	TAGGCAGTGCTTTGTTTGGTTCCGATTGGCAGCTGGCTGCAGGGCCTGCATCTCTTCTATGTCTCTCATTTACTTGCATTCTTTTCTTCGTTAATTTTTG	AC?>FGGAEFGGGDDGEGGFGGEEEGEIFGFG@E<GH>>EFGDG?HCFCF>DGGHDFFCHF>=G;CFBEHG<GCCGGEEHFDHGGHGGFFGGDDFHHGH?	X0:i:1	X1:i:0	BC:Z:GTNTGCGG	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	QT:Z:=?!4AD+2	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
diff --git a/test/sam.c b/test/sam.c
index f0eadbefe..74591fc2d 100644
--- a/test/sam.c
+++ b/test/sam.c
@@ -1,6 +1,6 @@
 /*  test/sam.c -- SAM/BAM/CRAM API test cases.
 
-    Copyright (C) 2014-2020, 2022-2023 Genome Research Ltd.
+    Copyright (C) 2014-2020, 2022-2024 Genome Research Ltd.
 
     Author: John Marshall <jm18@sanger.ac.uk>
 
@@ -1408,16 +1408,16 @@ static void check_big_ref(int parse_header)
         "@HD\tVN:1.4\n"
         "@SQ\tSN:large#1\tLN:5000000000\n"
         "@SQ\tSN:small#1\tLN:100\n"
-        "@SQ\tSN:large#2\tLN:9223372034707292158\n"
+        "@SQ\tSN:large#2\tLN:4611686018427387904\n"
         "@SQ\tSN:small#2\tLN:1\n"
         "r1\t0\tlarge#1\t4999999000\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n"
         "r2\t0\tsmall#1\t1\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n"
-        "r3\t0\tlarge#2\t9223372034707292000\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n"
-        "p1\t99\tlarge#2\t1\t50\t8M\t=\t9223372034707292150\t9223372034707292158\tACGTACGT\tabcdefgh\n"
-        "p1\t147\tlarge#2\t9223372034707292150\t50\t8M\t=\t1\t-9223372034707292158\tACGTACGT\tabcdefgh\n"
+        "r3\t0\tlarge#2\t4611686018427387000\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n"
+        "p1\t99\tlarge#2\t1\t50\t8M\t=\t4611686018427387895\t4611686018427387903\tACGTACGT\tabcdefgh\n"
+        "p1\t147\tlarge#2\t4611686018427387895\t50\t8M\t=\t1\t-4611686018427387903\tACGTACGT\tabcdefgh\n"
         "r4\t0\tsmall#2\t2\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n";
     const hts_pos_t expected_lengths[] = {
-        5000000000LL, 100LL, 9223372034707292158LL, 1LL
+        5000000000LL, 100LL, 4611686018427387904LL, 1LL
     };
     const int expected_tids[] = {
         0, 1, 2, 2, 2, 3
@@ -1426,11 +1426,11 @@ static void check_big_ref(int parse_header)
         -1, -1, -1, 2, 2, -1
     };
     const hts_pos_t expected_positions[] = {
-        4999999000LL - 1, 1LL - 1, 9223372034707292000LL - 1, 1LL - 1,
-        9223372034707292150LL - 1, 2LL - 1
+        4999999000LL - 1, 1LL - 1, 4611686018427387000LL - 1, 1LL - 1,
+        4611686018427387895LL - 1, 2LL - 1
     };
     const hts_pos_t expected_mpos[] = {
-        -1, -1, -1, 9223372034707292150LL - 1, 1LL - 1, -1
+        -1, -1, -1, 4611686018427387895LL - 1, 1LL - 1, -1
     };
     samFile *in = NULL, *out = NULL;
     sam_hdr_t *header = NULL, *dup_header = NULL;
@@ -1997,7 +1997,7 @@ static void test_mempolicy(void)
     }
 }
 
-static void test_bam_set1_minimal()
+static void test_bam_set1_minimal(void)
 {
     int r;
     bam1_t *bam = NULL;
@@ -2028,7 +2028,7 @@ static void test_bam_set1_minimal()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_full()
+static void test_bam_set1_full(void)
 {
     const char *qname = "!??AAA~~~~";
     const uint32_t cigar[] = { 6 << BAM_CIGAR_SHIFT | BAM_CMATCH, 2 << BAM_CIGAR_SHIFT | BAM_CINS, 2 << BAM_CIGAR_SHIFT | BAM_CMATCH };
@@ -2075,7 +2075,7 @@ static void test_bam_set1_full()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_even_and_odd_seq_len()
+static void test_bam_set1_even_and_odd_seq_len(void)
 {
     const char *seq_even = "TGGACTACGA";
     const char *seq_odd  = "TGGACTACGAC";
@@ -2105,7 +2105,7 @@ static void test_bam_set1_even_and_odd_seq_len()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_with_seq_but_no_qual()
+static void test_bam_set1_with_seq_but_no_qual(void)
 {
     const char *seq = "TGGACTACGA";
 
@@ -2129,7 +2129,7 @@ static void test_bam_set1_with_seq_but_no_qual()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_validate_qname()
+static void test_bam_set1_validate_qname(void)
 {
     int r;
     bam1_t *bam = NULL;
@@ -2146,7 +2146,7 @@ static void test_bam_set1_validate_qname()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_validate_seq()
+static void test_bam_set1_validate_seq(void)
 {
     int r;
     bam1_t *bam = NULL;
@@ -2163,7 +2163,7 @@ static void test_bam_set1_validate_seq()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_validate_cigar()
+static void test_bam_set1_validate_cigar(void)
 {
     const uint32_t cigar[] = { 20 << BAM_CIGAR_SHIFT | BAM_CMATCH };
     const char *seq = "TGGACTACGA";
@@ -2192,7 +2192,7 @@ static void test_bam_set1_validate_cigar()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_validate_size_limits()
+static void test_bam_set1_validate_size_limits(void)
 {
     const uint32_t cigar[] = { 20 << BAM_CIGAR_SHIFT | BAM_CMATCH };
     const char *seq = "TGGACTACGA";
@@ -2224,7 +2224,7 @@ static void test_bam_set1_validate_size_limits()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_write_and_read_back()
+static void test_bam_set1_write_and_read_back(void)
 {
     const char *qname = "q1";
     const uint32_t cigar[] = { 6 << BAM_CIGAR_SHIFT | BAM_CMATCH, 2 << BAM_CIGAR_SHIFT | BAM_CINS, 2 << BAM_CIGAR_SHIFT | BAM_CMATCH };
diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c
index 80daf0423..0fb59905c 100644
--- a/test/test-bcf-sr.c
+++ b/test/test-bcf-sr.c
@@ -36,11 +36,13 @@
 #include <strings.h>
 #include <errno.h>
 
+#include "../htslib/hts_defs.h"
 #include "../htslib/synced_bcf_reader.h"
 #include "../htslib/hts.h"
 #include "../htslib/vcf.h"
 
-void error(const char *format, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN
+error(const char *format, ...)
 {
     va_list ap;
     va_start(ap, format);
@@ -49,7 +51,7 @@ void error(const char *format, ...)
     exit(EXIT_FAILURE);
 }
 
-void usage(int exit_code)
+void HTS_NORETURN usage(int exit_code)
 {
     fprintf(stderr, "Usage: test-bcf-sr [OPTIONS] vcf-list.txt\n");
     fprintf(stderr, "       test-bcf-sr [OPTIONS] -args file1.bcf [...]\n");
diff --git a/test/test-bcf-translate.c b/test/test-bcf-translate.c
index c2f069e39..263e71eb8 100644
--- a/test/test-bcf-translate.c
+++ b/test/test-bcf-translate.c
@@ -29,7 +29,7 @@
 
 #include "../htslib/vcf.h"
 
-void error(const char *format, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...)
 {
     va_list ap;
     va_start(ap, format);
diff --git a/test/test-bcf_set_variant_type.c b/test/test-bcf_set_variant_type.c
index e5092084e..eb12ecde3 100644
--- a/test/test-bcf_set_variant_type.c
+++ b/test/test-bcf_set_variant_type.c
@@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE.  */
 #include "../htslib/hts.h"
 #include "../vcf.c"
 
-void error(const char *format, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...)
 {
     va_list ap;
     va_start(ap, format);
@@ -39,7 +39,7 @@ void error(const char *format, ...)
     exit(-1);
 }
 
-static void test_bcf_set_variant_type()
+static void test_bcf_set_variant_type(void)
 {
     // Test SNVs
     bcf_variant_t var1;
diff --git a/test/test-vcf-api.c b/test/test-vcf-api.c
index eff653686..b86b71d99 100644
--- a/test/test-vcf-api.c
+++ b/test/test-vcf-api.c
@@ -33,7 +33,7 @@ DEALINGS IN THE SOFTWARE.  */
 #include "../htslib/kstring.h"
 #include "../htslib/kseq.h"
 
-void error(const char *format, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...)
 {
     va_list ap;
     va_start(ap, format);
@@ -625,7 +625,7 @@ void test_invalid_end_tag(void)
     hts_set_log_level(logging);
 }
 
-void test_open_format() {
+void test_open_format(void) {
     char mode[5];
     int ret;
     strcpy(mode, "r");
diff --git a/test/test.pl b/test/test.pl
index 03eca1129..b5f52bdfb 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -819,6 +819,43 @@ sub test_view
         }
     }
 
+    # BAM files with alignment records that span BGZF blocks
+    # HTSlib starts a new block if an alignment is likely to overflow the
+    # current one, so for its own data this will only happen for records
+    # longer than 64kbytes.  As other implementations may not do this,
+    # check that reading works correctly on some BAM files where records
+    # have been deliberately split between BGZF blocks.
+    print "test_view testing BAM records in multiple BGZF blocks:\n";
+    $test_view_failures = 0;
+    my $src_sam = "ce#1.sam";
+    foreach my $test_bam (qw(bgzf_boundaries/bgzf_boundaries1.bam
+                          bgzf_boundaries/bgzf_boundaries2.bam
+                          bgzf_boundaries/bgzf_boundaries3.bam)) {
+        testv $opts, "./test_view $tv_args -p $test_bam.tmp.sam $test_bam";
+        testv $opts, "./compare_sam.pl $test_bam.tmp.sam $src_sam";
+    }
+
+    # Test a file with a long alignment record.  Boundaries hit in the middle of
+    # the CIGAR data, and in the sequence.  Generate the test file here as it's
+    # big, but with fairly simple contents.
+    $src_sam = "bgzf_boundaries/large_rec.tmp.sam";
+    open(my $test_sam, '>', $src_sam) || die "Couldn't open $src_sam : $!\n";
+    print $test_sam "\@HD\tVN:1.6\tSO:coordinate\n";
+    print $test_sam "\@SQ\tSN:ref\tLN:100000\n";
+    print $test_sam "read\t0\tref\t1\t60\t", "1M1I" x 16000, "\t*\t0\t0\t", "A" x 32000, "\t", "Q" x 32000, "\n";
+    close($test_sam) || die "Error on closing $src_sam : $!\n";
+
+    testv $opts, "./test_view $tv_args -b -l 0 -p $src_sam.bam $src_sam";
+    testv $opts, "./test_view $tv_args -p $src_sam.bam.sam $src_sam.bam";
+    testv $opts, "./compare_sam.pl $src_sam $src_sam.bam.sam";
+
+    if ($test_view_failures == 0) {
+        passed($opts, "BAM records spanning multiple BGZF block tests");
+    } else {
+        failed($opts, "BAM records spanning multiple BGZF block tests",
+               "$test_view_failures subtests failed");
+    }
+
     # embed_ref=2 mode
     print "test_view testing embed_ref=2:\n";
     $test_view_failures = 0;
@@ -850,6 +887,18 @@ sub test_view
     testv $opts, "./test_view $tv_args range.bam $regions > range.tmp";
     testv $opts, "./compare_sam.pl range.tmp range.out";
 
+    # Regression check for out-of-bounds read on regions list (see
+    # samtools#2063).  As reg_insert() allocates at least four slots
+    # for chromosome regions, we need more than that many in the second
+    # chr. requested to ensure it has a bigger array.
+
+    $regions = "CHROMOSOME_I:1122-1122 CHROMOSOME_II:1136-1136 CHROMOSOME_II:1241-1241 CHROMOSOME_II:1267-1267 CHROMOSOME_II:1326-1326 CHROMOSOME_II:1345-1345 CHROMOSOME_II:1353-1353 CHROMOSOME_II:1366-1366 CHROMOSOME_II:1416-1416 CHROMOSOME_II:1459-1459 CHROMOSOME_II:1536-1536";
+    testv $opts, "./test_view $tv_args -i reference=ce.fa -M range.cram $regions > range.tmp";
+    testv $opts, "./compare_sam.pl range.tmp range.out2";
+
+    testv $opts, "./test_view $tv_args -M range.bam $regions > range.tmp";
+    testv $opts, "./compare_sam.pl range.tmp range.out2";
+
     if ($test_view_failures == 0) {
         passed($opts, "range.cram tests");
     } else {
@@ -1414,4 +1463,19 @@ sub test_annot_tsv
     run_annot_tsv($opts,src=>'src.10.txt',dst=>'dst.10.txt',out=>'out.10.4.txt',args=>'-m smpl -f smpl');
     run_annot_tsv($opts,src=>'src.10.txt',dst=>'dst.10.txt',out=>'out.10.5.txt',args=>'-m smpl ');
     run_annot_tsv($opts,src=>'src.10.txt',dst=>'dst.10.txt',out=>'out.10.6.txt',args=>'-m smpl -x');
+    run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>'-c 1,2,3:1,2,3 -f 4:5 -h 0:0');
+    run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>'-c chr1,beg1,end1:chr,beg,end -f smpl1:src_smpl -h 2:2 -II');
+    run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>'-c chr1,beg1,end1:chr,beg,end -f smpl1:src_smpl -h 2:-1 -II');
+    run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.2.txt',args=>'-c chr1,beg1,end1:chr,beg,end -f smpl1:src_smpl -h 2:2');
+    run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.2.txt',args=>'-c chr2,beg2,end2:chr,beg,end -f smpl2:src_smpl -h 3:2');
+    run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.3.txt',args=>'-c chr1,beg1,end1:chr,beg,end -f smpl1:src_smpl -h 2:2 -I');
+    run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.3.txt',args=>'-c chr2,beg2,end2:chr,beg,end -f smpl2:src_smpl -h 3:2 -I');
+    run_annot_tsv($opts,src=>'src.12.txt',dst=>'dst.12.txt',out=>'out.12.1.txt',args=>'-c 1,2,3:1,2,3 -f 4:5 -h 0:0 -d ,');
+    run_annot_tsv($opts,src=>'src.12.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>q[-c 1,2,3:1,2,3 -f 4:5 -h 0:0 -d $',:\t']);
+    run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.1.txt',args=>q[-c 1,2,3 -f 4:5]);
+    run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.1.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5]);
+    run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.2.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5 -r]);
+    run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.2.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5,0.5]);
+    run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.3.txt',args=>q[-c 1,2,3 -f 4:5 -O 0,1]);
+    run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.4.txt',args=>q[-c 1,2,3 -f 4:5 -O 1,0]);
 }
diff --git a/test/test_bgzf.c b/test/test_bgzf.c
index 6cb6db902..bda68d1e9 100644
--- a/test/test_bgzf.c
+++ b/test/test_bgzf.c
@@ -1,6 +1,6 @@
 /* test/test_bgzf.c -- bgzf unit tests
 
-   Copyright (C) 2017, 2019, 2022-2023 Genome Research Ltd
+   Copyright (C) 2017, 2019, 2022-2024 Genome Research Ltd
 
    Author: Robert Davies <rmd@sanger.ac.uk>
 
@@ -179,7 +179,7 @@ static int try_bgzf_close(BGZF **bgz, const char *name, const char *func, int ex
 
 static ssize_t try_bgzf_read(BGZF *fp, void *data, size_t length,
                              const char *name, const char *func) {
-    ssize_t got = bgzf_read(fp, data, length);
+    ssize_t got = bgzf_read_small(fp, data, length);
     if (got < 0) {
         fprintf(stderr, "%s : Error from bgzf_read %s : %s\n",
                 func, name, strerror(errno));
@@ -189,7 +189,7 @@ static ssize_t try_bgzf_read(BGZF *fp, void *data, size_t length,
 
 static ssize_t try_bgzf_write(BGZF *fp, const void *data, size_t length,
                               const char *name, const char *func) {
-    ssize_t put = bgzf_write(fp, data, length);
+    ssize_t put = bgzf_write_small(fp, data, length);
     if (put < (ssize_t) length) {
         fprintf(stderr, "%s : %s %s : %s\n",
                 func, put < 0 ? "Error writing to" : "Short write on",
@@ -878,6 +878,49 @@ static int test_tell_read(Files *f, const char *mode) {
     return -1;
 }
 
+static int test_useek_read_small(Files *f, const char *mode) {
+    // Write a short string using `mode`, then test getc, reads and useek on it.
+    BGZF* bgz = NULL;
+    char bg_buf[99];
+
+    bgz = try_bgzf_open(f->tmp_bgzf, mode, __func__);
+    if (!bgz) goto fail;
+
+    // The literal below is 16 bytes long, counting its trailing newline.
+    if (try_bgzf_write(bgz, "#>Hello, World!\n", 16,
+                       f->tmp_bgzf, __func__) != 16)
+        goto fail;
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
+    // Reopen the same temporary file for reading.
+    bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__);
+    if (!bgz) goto fail;
+    // Fetch the leading "#>" one character at a time via try_bgzf_getc().
+    if (try_bgzf_getc(bgz, 0, '#', f->tmp_bgzf, __func__) < 0 ||
+        try_bgzf_getc(bgz, 1, '>', f->tmp_bgzf, __func__) < 0)
+        goto fail;
+    // The next five bytes of the written string are "Hello".
+    if (try_bgzf_read(bgz, bg_buf, 5, f->tmp_bgzf, __func__) != 5)
+        goto fail;
+    if (memcmp(bg_buf, "Hello", 5) != 0)
+        goto fail;
+    // Seek to offset 9 (SEEK_SET), where "World" starts in the written string.
+    if (try_bgzf_useek(bgz, 9, SEEK_SET, f->tmp_bgzf, __func__) < 0)
+        goto fail;
+    // A five-byte read after the seek must yield "World".
+    if (try_bgzf_read(bgz, bg_buf, 5, f->tmp_bgzf, __func__) != 5)
+        goto fail;
+    if (memcmp(bg_buf, "World", 5) != 0)
+        goto fail;
+
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
+    return 0;  // every check passed
+
+ fail:  // shared error exit for all failed checks above
+    fprintf(stderr, "%s: failed\n", __func__);
+    if (bgz) bgzf_close(bgz);  // only close when the handle is non-NULL
+    return -1;
+}
+
 static int test_bgzf_getline(Files *f, const char *mode, int nthreads) {
     BGZF* bgz = NULL;
     ssize_t bg_put;
@@ -1098,6 +1141,10 @@ int main(int argc, char **argv) {
     if (test_tell_read(&f, "w") != 0) goto out;
     if (test_tell_read(&f, "wu") != 0) goto out;
 
+    // bgzf_useek and bgzf_read_small
+    if (test_useek_read_small(&f, "w") != 0) goto out;
+    if (test_useek_read_small(&f, "wu") != 0) goto out;
+
     // getline
     if (test_bgzf_getline(&f, "w", 0) != 0) goto out;
     if (test_bgzf_getline(&f, "w", 1) != 0) goto out;
diff --git a/test/test_faidx.c b/test/test_faidx.c
index 566149071..f73f973a0 100644
--- a/test/test_faidx.c
+++ b/test/test_faidx.c
@@ -26,7 +26,7 @@ DEALINGS IN THE SOFTWARE.  */
 
 #include <stdio.h>
 #include <stdlib.h>
-#include <getopt.h>
+#include <unistd.h>
 
 #include "../htslib/faidx.h"
 
diff --git a/test/test_khash.c b/test/test_khash.c
new file mode 100644
index 000000000..a2e80b581
--- /dev/null
+++ b/test/test_khash.c
@@ -0,0 +1,502 @@
+/*  test_khash.c -- khash unit tests
+
+    Copyright (C) 2024 Genome Research Ltd.
+    Copyright (C) 2024 Centre for Population Genomics.
+
+    Author: Rob Davies <rmd@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <getopt.h>
+#ifdef HAVE_CLOCK_GETTIME_CPUTIME
+#include <time.h>
+#else
+#include <sys/time.h>
+#endif
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <htslib/khash.h>
+#include <htslib/kroundup.h>
+
+#define MAX_ENTRIES 99999999
+
+KHASH_MAP_INIT_STR(str2int, int)
+
+// Print bucket counts and the kh_stats() histogram; silent if kh_stats fails.
+static void write_stats_str2int(khash_t(str2int) *h) {
+    khint_t n_empty = 0, n_deleted = 0, n_hist = 0, *dist = NULL;
+    khint_t idx;
+
+    if (kh_stats(str2int, h, &n_empty, &n_deleted, &n_hist, &dist) != 0)
+        return;
+
+    printf("n_buckets = %u\n", kh_n_buckets(h));
+    printf("empty     = %u\n", n_empty);
+    printf("deleted   = %u\n", n_deleted);
+    for (idx = 0; idx < n_hist; idx++)
+        printf("dist[ %8u ] = %u\n", idx, dist[idx]);
+    free(dist);
+}
+
+// Build num fixed-width (kl bytes each) NUL-terminated keys "test<i>" in a
+// single malloc()ed array.  Returns NULL on error; caller frees the result.
+char * make_keys(size_t num, size_t kl) {
+    char *keys, *slot;
+    size_t n;
+
+    if (num > MAX_ENTRIES) return NULL;
+    if ((keys = malloc(kl * num)) == NULL) {
+        perror(NULL);
+        return NULL;
+    }
+    for (n = 0, slot = keys; n < num; n++, slot += kl) {
+        if (snprintf(slot, kl, "test%zu", n) < kl) continue;
+        free(keys);   // key n did not fit in kl bytes
+        return NULL;
+    }
+
+    return keys;
+}
+
+// Store val under key; fails (-1) unless kh_put() reports 1 or 2.
+static int add_str2int_entry(khash_t(str2int) *h, char *key, khint_t val) {
+    int status = 0;
+    khint_t slot = kh_put(str2int, h, key, &status);
+    if (status == 1 || status == 2) {
+        kh_val(h, slot) = val;
+        return 0;
+    }
+    fprintf(stderr, "Unexpected return from kh_put(%s) : %d\n", key, status);
+    return -1;
+}
+
+// Check key's state: absent if is_deleted is set, otherwise present with
+// matching key text and value val.  Returns 0 if so, else -1 with a message.
+static int check_str2int_entry(khash_t(str2int) *h, char *key, khint_t val,
+                               uint8_t is_deleted) {
+    khint_t pos = kh_get(str2int, h, key);
+    int found = pos < kh_end(h);
+
+    if (is_deleted) {
+        if (!found) return 0;
+        fprintf(stderr, "Found deleted entry %s in hash table\n", key);
+        return -1;
+    }
+    if (!found) {
+        fprintf(stderr, "Couldn't find %s in hash table\n", key);
+        return -1;
+    }
+    if (strcmp(kh_key(h, pos), key) != 0) {
+        fprintf(stderr, "Wrong key in hash table, expected %s got %s\n",
+                key, kh_key(h, pos));
+        return -1;
+    }
+    if (kh_val(h, pos) != val) {
+        fprintf(stderr, "Wrong value in hash table, expected %u got %u\n",
+                val, kh_val(h, pos));
+        return -1;
+    }
+    return 0;
+}
+
+static int del_str2int_entry(khash_t(str2int) *h, char *key) {
+    khint_t pos = kh_get(str2int, h, key);  // >= kh_end(h) when absent
+    if (pos < kh_end(h)) {
+        kh_del(str2int, h, pos);
+        return 0;
+    }
+    fprintf(stderr, "Couldn't find %s to delete from hash table\n", key);
+    return -1;  // nothing was removed
+}
+
+/* Insert max keys, delete to_del of them (chosen pseudo-randomly; clamped
+ * to max), put them back, and check the table contents after every step.
+ * Table statistics are printed after each phase if show_stats is non-zero.
+ * Returns 0 on success, -1 on failure (details are written to stderr).
+ */
+static int test_str2int(size_t max, size_t to_del, int show_stats) {
+    const size_t kl = 16;
+    // Start at max + 1 so that (r & mask) - 1 below can give any index in
+    // 0..max-1.  Starting at max left index max-1 unreachable when max is a
+    // power of two, so deleting every entry (to_del == max) never finished.
+    size_t mask = max + 1;
+    char *keys = make_keys(max, kl);
+    uint8_t *flags = NULL;
+    khash_t(str2int) *h;
+    khint_t i;
+    uint32_t r = 0x533d;
+
+    if (!keys) return -1;
+    if (to_del > max) to_del = max; // Can't delete more than was inserted
+
+    h = kh_init(str2int);
+    if (!h) goto memfail;
+
+    // Add some entries
+    for (i = 0; i < max; i++)
+        if (add_str2int_entry(h, keys + i * kl, i) != 0) goto fail;
+
+    // Check they exist
+    for (i = 0; i < max; i++)
+        if (check_str2int_entry(h, keys + i * kl, i, 0) != 0) goto fail;
+
+    if (show_stats) {
+        printf("Initial fill:\n");
+        write_stats_str2int(h);
+    }
+
+    // Delete a random selection; flags[i] records that key i was removed
+    flags = calloc(max, sizeof(*flags));
+    if (!flags) {
+        perror("");
+        goto fail;
+    }
+
+    kroundup_size_t(mask);
+    --mask;
+
+    // Note that this method may become slow for a high %age removed
+    // as it searches for the last available entries.  Despite this, it
+    // seems to be acceptable for the number of entries allowed.
+    for (i = 0; i < to_del; i++) {
+        khint_t victim;
+        // LFSR, see http://users.ece.cmu.edu/~koopman/lfsr/index.html
+        // (r & mask) == 0 wraps victim to a huge value, which is rejected.
+        do {
+            r = (r >> 1) ^ ((r & 1) * 0x80000057U);
+            victim = (r & mask) - 1;
+        } while (victim >= max || flags[victim]);
+        if (del_str2int_entry(h, keys + victim * kl) != 0) goto fail;
+        flags[victim] = 1;
+    }
+
+    // Check correct entries are present
+    for (i = 0; i < max; i++)
+        if (check_str2int_entry(h, keys + i * kl, i, flags[i]) != 0)
+            goto fail;
+
+    if (show_stats) {
+        printf("\nAfter deletion:\n");
+        write_stats_str2int(h);
+    }
+
+    // Re-insert deleted entries
+    for (i = 0; i < max; i++)
+        if (flags[i] && add_str2int_entry(h, keys + i * kl, i) != 0)
+            goto fail;
+
+    // Ensure they're all back
+    for (i = 0; i < max; i++)
+        if (check_str2int_entry(h, keys + i * kl, i, 0) != 0) goto fail;
+
+    if (show_stats) {
+        printf("\nAfter re-insert:\n");
+        write_stats_str2int(h);
+    }
+
+    kh_destroy(str2int, h);
+    free(keys);
+    free(flags);
+    return 0;
+
+ memfail:
+    perror(NULL);
+ fail:
+    kh_destroy(str2int, h);
+    free(keys);
+    free(flags);
+    return -1;
+}
+
+/* Read keys_file into memory and split it into newline-separated keys.
+ * On success *keys_out is the (modified in place) file contents,
+ * *key_locations_out points at the start of each non-empty line, and the
+ * number of keys is returned; the caller frees both arrays.
+ * Returns 0 on failure, with both outputs set to NULL. */
+static size_t read_keys(const char *keys_file, char **keys_out,
+                        char ***key_locations_out) {
+    FILE *in = fopen(keys_file, "r");
+    char *keys = NULL, *key, *end;
+    size_t keys_size = 1000000;
+    size_t keys_used = 0;
+    size_t avail, got, nkeys = 0;
+    char **key_locations = NULL;
+    struct stat fileinfo = { 0 };
+    int close_res;
+
+    if (!in) goto fail;
+
+    // Slurp entire file.  If fstat() works, size the buffer from the file
+    // size so that large files are not grown 1000000 bytes at a time.
+    if (fstat(fileno(in), &fileinfo) == 0
+        && fileinfo.st_size > 0 && (size_t) fileinfo.st_size > keys_size)
+        keys_size = (size_t) fileinfo.st_size;
+
+    keys = malloc(keys_size + 1);
+    if (!keys) goto fail;
+
+    do {
+        avail = keys_size - keys_used;
+        if (avail == 0) {
+            size_t new_size = keys_size + 1000000;
+            char *new_keys = realloc(keys, new_size + 1);
+            if (!new_keys) goto fail;
+            keys = new_keys;
+            keys_size = new_size;
+            avail = keys_size - keys_used;
+        }
+        got = fread(keys + keys_used, 1, avail, in);
+        keys_used += got;
+    } while (got == avail);
+    keys[keys_used] = '\0'; // Room for this was reserved by the "+ 1"s above
+
+    if (ferror(in)) goto fail;
+    // NULL "in" first so a failed fclose() is not repeated at "fail:"
+    close_res = fclose(in);
+    in = NULL;
+    if (close_res < 0) goto fail;
+
+    // Split by line: the first pass counts the non-empty lines ...
+    end = keys + keys_used;
+    for (key = keys; key != NULL; key = memchr(key, '\n', end - key)) {
+        while (*key == '\n') key++;
+        if (key < end) nkeys++;
+    }
+
+    key_locations = malloc(nkeys * sizeof(*key_locations));
+    if (!key_locations) goto fail;
+
+    // ... the second turns newlines into NULs and records each line start
+    nkeys = 0;
+    for (key = keys; key != NULL; key = memchr(key, '\n', end - key)) {
+        while (*key == '\n') *key++ = '\0';
+        if (key < end) key_locations[nkeys++] = key;
+    }
+    *keys_out = keys;
+    *key_locations_out = key_locations;
+    return nkeys;
+
+ fail:
+    if (in) fclose(in);
+    free(keys);
+    *keys_out = NULL;
+    *key_locations_out = NULL;
+    return 0;
+}
+
+// Current time stamp: process CPU time in nanoseconds when
+// HAVE_CLOCK_GETTIME_CPUTIME is defined, otherwise wall-clock time in
+// microseconds.  Returns -1 (after calling perror()) on failure.
+static long long get_time(void) {
+#ifdef HAVE_CLOCK_GETTIME_CPUTIME
+    struct timespec now;
+    if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &now) >= 0)
+        return now.tv_sec * 1000000000LL + now.tv_nsec;
+    perror("clock_gettime");
+#else
+    struct timeval now;
+    if (gettimeofday(&now, NULL) >= 0)
+        return now.tv_sec * 1000000LL + now.tv_usec;
+    perror("gettimeofday");
+#endif
+    return -1;
+}
+
+static char * fmt_time(long long elapsed) {
+    static char text[64];  // returned to the caller; reused by every call
+#ifdef HAVE_CLOCK_GETTIME_CPUTIME
+    const long long per_sec = 1000000000;
+    snprintf(text, sizeof(text), "%lld.%09lld processor seconds",
+             elapsed / per_sec, elapsed % per_sec);
+#else
+    const long long per_sec = 1000000;
+    snprintf(text, sizeof(text), "%lld.%06lld wall-time seconds",
+             elapsed / per_sec, elapsed % per_sec);
+#endif
+    return text;
+}
+
+/* Time insertion then lookup of every key in a str2int hash table, and
+ * print the timings and table statistics.  Keys come one per line from
+ * keys_file if it is non-NULL; otherwise 50000000 keys are generated.
+ * Returns 0 on success, -1 on failure.
+ */
+static int benchmark(const char *keys_file) {
+    const size_t kl = 16;
+    size_t max = 50000000;
+    size_t i;
+    char *keys = NULL;
+    char **key_locations = NULL;
+    khash_t(str2int) *h;
+    long long start, end;
+    int res = -1;
+
+    if (keys_file) {
+        max = read_keys(keys_file, &keys, &key_locations);
+    } else {
+        keys = make_keys(max, kl);
+    }
+
+    if (!keys) return -1;
+
+    h = kh_init(str2int);
+    if (!h) goto out;
+
+    if ((start = get_time()) < 0) goto out;
+
+    if (keys_file) {
+        for (i = 0; i < max; i++) {
+            int ret;
+            khint_t k = kh_put(str2int, h, key_locations[i], &ret);
+            if (ret < 0) { // Only negative results are fatal for file input
+                fprintf(stderr, "Unexpected return from kh_put(%s) : %d\n",
+                        key_locations[i], ret);
+                goto out;
+            }
+            kh_val(h, k) = i;
+        }
+    } else {
+        for (i = 0; i < max; i++) {
+            int ret;
+            khint_t k = kh_put(str2int, h, keys + i * kl, &ret);
+            if (ret <= 0) { // For generated keys 0 is treated as an error too
+                fprintf(stderr, "Unexpected return from kh_put(%s) : %d\n",
+                        keys + i * kl, ret);
+                goto out;
+            }
+            kh_val(h, k) = i;
+        }
+    }
+
+    if ((end = get_time()) < 0) goto out;
+
+    printf("Insert %zu %s\n", max, fmt_time(end - start));
+
+    if ((start = get_time()) < 0) goto out;
+
+    if (keys_file) {
+        for (i = 0; i < max; i++) {
+            khint_t k = kh_get(str2int, h, key_locations[i]);
+            if (k >= kh_end(h)) {
+                fprintf(stderr, "Couldn't find %s in hash table\n",
+                        key_locations[i]);
+                goto out;
+            }
+        }
+    } else {
+        for (i = 0; i < max; i++) {
+            khint_t k = kh_get(str2int, h, keys + i * kl);
+            if (k >= kh_end(h)) {
+                fprintf(stderr, "Couldn't find %s in hash table\n",
+                        keys + i * kl);
+                goto out;
+            }
+        }
+    }
+
+    if ((end = get_time()) < 0) goto out;
+
+    printf("Lookup %zu %s\n", max, fmt_time(end - start));
+
+    write_stats_str2int(h);
+    res = 0;
+
+ out:
+    // Shared exit path, so key_locations is also released after a failure
+    kh_destroy(str2int, h);
+    free(keys);
+    free(key_locations);
+    return res;
+}
+
+static void show_usage(FILE *out, char *prog) { // prog: name shown in usage
+    fprintf(out, "Usage : %s [-t <test>] [-i <file>]\n"
+            " Options:\n"
+            "  -t <TEST>   Test to run (str2int, benchmark)\n"
+            "  -i <FILE>   Optional input file for benchmark\n"
+            "  -n <INT>    Number of items to add\n"
+            "  -f <FRAC>   Fraction to delete and re-insert\n"
+            "  -d          Dump hash table stats\n"
+            "  -h          Show this help\n", prog);
+}
+
+// Parse the command line and run the selected test: str2int by default, or
+// the one named with -t.  Exits with EXIT_FAILURE if an option or test fails.
+int main(int argc, char **argv) {
+    int opt, res = EXIT_SUCCESS;
+    char *test = NULL;
+    char *input_file = NULL;
+    size_t max = 1000;
+    double del_frac = 0.25;
+    int show_stats = 0;
+
+    while ((opt = getopt(argc, argv, "df:hi:n:t:")) != -1) {
+        switch (opt) {
+        case 'd':
+            show_stats = 1;
+            break;
+        case 'f':
+            del_frac = strtod(optarg, NULL);
+            if (!(del_frac >= 0 && del_frac <= 1.0)) { // Also rejects NaN
+                fprintf(stderr, "Error: -f must be between 0.0 and 1.0\n");
+                return EXIT_FAILURE;
+            }
+            break;
+        case 'h':
+            show_usage(stdout, argv[0]);
+            return EXIT_SUCCESS;
+        case 'i':
+            input_file = optarg;
+            break;
+        case 'n':
+            max = strtoul(optarg, NULL, 0);
+            if (max == 0 || max > MAX_ENTRIES) {
+                fprintf(stderr, "Error: -n must be between 1 and %d\n",
+                        MAX_ENTRIES);
+                return EXIT_FAILURE;
+            }
+            break;
+        case 't':
+            test = optarg;
+            break;
+        default:
+            show_usage(stderr, argv[0]);
+            return EXIT_FAILURE;
+        }
+    }
+
+    if ((!test || strcmp(test, "str2int") == 0)
+        && test_str2int(max, (size_t) (max * del_frac), show_stats) != 0)
+        res = EXIT_FAILURE;
+
+    if (test && strcmp(test, "benchmark") == 0
+        && benchmark(input_file) != 0)
+        res = EXIT_FAILURE;
+
+    return res;
+}
diff --git a/test/test_kstring.c b/test/test_kstring.c
index ee913a2e3..8b6188b6e 100644
--- a/test/test_kstring.c
+++ b/test/test_kstring.c
@@ -1,6 +1,6 @@
 /*  test_kstring.c -- kstring unit tests
 
-    Copyright (C) 2018, 2020 Genome Research Ltd.
+    Copyright (C) 2018, 2020, 2024 Genome Research Ltd.
 
     Author: Rob Davies <rmd@sanger.ac.uk>
 
@@ -261,6 +261,84 @@ static int test_kputw(int64_t start, int64_t end) {
     return 0;
 }
 
+// Check kputll() output for every value from s to e inclusive (s itself is
+// always tested).  Returns 0 if all round-trip via strtoll(), otherwise -1.
+static int test_kputll_from_to(kstring_t *str, long long s, long long e) {
+    long long cur;
+    for (cur = s; ; cur++) {
+        // Poison the buffer so a missing terminator cannot go unnoticed
+        str->l = 0;
+        memset(str->s, 0xff, str->m);
+        if (kputll(cur, str) < 0 || !str->s) {
+            perror("kputll");
+            return -1;
+        }
+        if (str->l >= str->m || str->s[str->l] != '\0') {
+            fprintf(stderr, "No NUL termination on string from kputll\n");
+            return -1;
+        }
+        if (strtoll(str->s, NULL, 10) != cur) {
+            fprintf(stderr,
+                    "kputll wrote the wrong value, expected %lld, got %s\n",
+                    cur, str->s);
+            return -1;
+        }
+        if (cur >= e) return 0;  // done; also avoids cur++ past the maximum
+    }
+}
+
+/* Test kputll() on the values around each power of ten of either sign,
+ * at both ends of the 64-bit range, and finally on start..end.
+ * Returns 0 on success, -1 on failure. */
+static int test_kputll(long long start, long long end) {
+    kstring_t str = { 0, 0, NULL };
+    unsigned long long val;
+    int64_t start2 = (int64_t)start; // no larger on our platforms
+    int64_t end2   = (int64_t)end;
+    int res = -1;
+
+    str.s = malloc(2);
+    if (!str.s) {
+        perror("malloc");
+        return -1;
+    }
+    str.m = 2;
+
+    // Digit-count boundaries: 1, 5..10, 95..100, ...
+    for (val = 1; val < INT64_MAX-5; val *= 10) {
+        if (test_kputll_from_to(&str, val >= 5 ? val - 5 : val, val) < 0)
+            goto out;
+    }
+
+    // The same for negative values: -6..-1, -15..-10, -105..-100, ...
+    // (valm is always negative, so the earlier "valm >= 5" test meant
+    // that only the single value valm was ever checked.)
+    for (val = 1; val < INT64_MAX-5; val *= 10) {
+        long long valm = -(long long) val;
+        if (test_kputll_from_to(&str, valm - 5, valm) < 0)
+            goto out;
+    }
+
+    if (test_kputll_from_to(&str, INT64_MAX - 5, INT64_MAX) < 0)
+        goto out;
+
+    if (test_kputll_from_to(&str, INT64_MIN, INT64_MIN + 5) < 0)
+        goto out;
+
+    str.m = 1; // Force a resize
+    clamp(&start2, INT64_MIN, INT64_MAX);
+    clamp(&end2,   INT64_MIN, INT64_MAX);
+
+    if (test_kputll_from_to(&str, start2, end2) < 0)
+        goto out;
+
+    res = 0;
+
+ out:
+    free(ks_release(&str));
+    return res;
+}
+
 // callback used by test_kgetline
 static char *mock_fgets(char *str, int num, void *p) {
     int *mock_state = (int*)p;
@@ -290,7 +368,7 @@ static char *mock_fgets(char *str, int num, void *p) {
     return str;
 }
 
-static int test_kgetline() {
+static int test_kgetline(void) {
     kstring_t s = KS_INITIALIZE;
     int mock_state = 0;
 
@@ -346,7 +424,7 @@ static ssize_t mock_fgets2(char *str, size_t num, void *p) {
     return strlen(str);
 }
 
-static int test_kgetline2() {
+static int test_kgetline2(void) {
     kstring_t s = KS_INITIALIZE;
     int mock_state = 0;
 
@@ -413,6 +491,9 @@ int main(int argc, char **argv) {
     if (!test || strcmp(test, "kputw") == 0)
         if (test_kputw(start, end) != 0) res = EXIT_FAILURE;
 
+    if (!test || strcmp(test, "kputll") == 0)
+        if (test_kputll(start, end) != 0) res = EXIT_FAILURE;
+
     if (!test || strcmp(test, "kgetline") == 0)
         if (test_kgetline() != 0) res = EXIT_FAILURE;
 
diff --git a/test/test_nibbles.c b/test/test_nibbles.c
new file mode 100644
index 000000000..1ef3456ea
--- /dev/null
+++ b/test/test_nibbles.c
@@ -0,0 +1,164 @@
+/*  test/test_nibbles.c -- Test SIMD optimised function implementations.
+
+    Copyright (C) 2024 Centre for Population Genomics.
+
+    Author: John Marshall <jmarshall@hey.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#ifdef HAVE_CLOCK_GETTIME_CPUTIME
+#include <time.h>
+#else
+#include <sys/time.h>
+#endif
+
+#include "../htslib/sam.h"
+#include "../sam_internal.h"
+
+long long gettime(void) {
+#ifdef HAVE_CLOCK_GETTIME_CPUTIME
+    struct timespec now;   // process CPU time, returned in nanoseconds
+    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &now);
+    return now.tv_sec * 1000000000LL + now.tv_nsec;
+#else
+    struct timeval now;    // wall-clock fallback, returned in microseconds
+    gettimeofday(&now, NULL);
+    return now.tv_sec * 1000000LL + now.tv_usec;
+#endif
+}
+
+// Render an elapsed gettime() interval as seconds.  The result lives in a
+// static buffer that is overwritten by the next call.
+char *fmttime(long long elapsed) {
+    static char text[64];
+#ifdef HAVE_CLOCK_GETTIME_CPUTIME
+    const long long per_sec = 1000000000;
+    sprintf(text, "%lld.%09lld processor seconds",
+            elapsed / per_sec, elapsed % per_sec);
+#else
+    const long long per_sec = 1000000;
+    sprintf(text, "%lld.%06lld wall-time seconds",
+            elapsed / per_sec, elapsed % per_sec);
+#endif
+    return text;
+}
+
+// Plain one-base-at-a-time decoder used as the reference for nibble2base()
+void nibble2base_single(uint8_t *nib, char *seq, int len) {
+    for (int pos = 0; pos < len; pos++)
+        seq[pos] = seq_nt16_str[bam_seqi(nib, pos)];
+}
+
+unsigned char nibble[5000];
+char buf[10000];
+
+// Compare nibble2base() with nibble2base_single() for every start offset
+// 0..79 and length 0..399.  Returns 0 if all outputs match, 1 otherwise.
+int validate_nibble2base(void) {
+    char expected[500];
+    int i, offset, nbases;
+    unsigned long long ntests = 0, nfailed = 0;
+
+    for (i = 0; i < sizeof nibble; i++)
+        nibble[i] = i % 256;
+
+    for (offset = 0; offset < 80; offset++) {
+        for (nbases = 0; nbases < 400; nbases++, ntests++) {
+            // Zero both buffers so the results are NUL-terminated strings
+            memset(expected, '\0', sizeof expected);
+            memset(buf, '\0', sizeof expected);
+            nibble2base_single(&nibble[offset], expected, nbases);
+            nibble2base(&nibble[offset], buf, nbases);
+
+            if (strcmp(expected, buf) == 0) continue;
+            printf("%s expected\n%s FAIL\n\n", expected, buf);
+            nfailed++;
+        }
+    }
+
+    if (nfailed == 0)
+        return 0;
+
+    fprintf(stderr, "Failures: %llu (out of %llu tests)\n", nfailed, ntests);
+    return 1;
+}
+
+// Time count nibble2base() calls; returns 1 if length is not 1..sizeof nibble
+int time_nibble2base(int length, unsigned long count) {
+    unsigned long i, total = 0;
+    if (length < 1 || length > (int) sizeof nibble) { // static buffer limit
+        fprintf(stderr, "Read length must be 1 to %d\n", (int) sizeof nibble);
+        return 1;
+    }
+    for (i = 0; i < length; i++) nibble[i] = i % 256;
+    printf("Timing %lu nibble2base iterations with read length %d...\n", count, length);
+    long long start = gettime();
+    for (i = 0; i < count; i++) {
+        nibble2base(nibble, buf, length);
+        total += buf[i % length];  // length >= 1 here, so no divide by zero
+    }
+    long long stop = gettime();
+    printf("%s (summing to %lu)\n", fmttime(stop - start), total);
+    return 0;
+}
+
+/* Usage text is printed when no arguments are given.  Options are acted on
+ * in command-line order, so -c and -r only affect -n runs that follow them.
+ * Returns the sum of the results of the tests that were run.
+ */
+int main(int argc, char **argv) {
+    int readlen = 5000;
+    unsigned long count = 1000000;
+    int status = 0;
+    int c;
+
+    if (argc == 1)
+        printf(
+"Usage: test_nibbles [-c NUM] [-r NUM] [-n|-v]...\n"
+"Options:\n"
+"  -c NUM  Specify number of iterations [%lu]\n"
+"  -n      Run nibble2base speed tests\n"
+"  -r NUM  Specify read length [%d]\n"
+"  -v      Run all validation tests\n"
+"", count, readlen);
+
+    while ((c = getopt(argc, argv, "c:nr:v")) >= 0) {
+        if (c == 'c') {
+            count = strtoul(optarg, NULL, 0);
+        } else if (c == 'r') {
+            readlen = atoi(optarg);
+        } else if (c == 'n') {
+            // Runs immediately, with the count / read length set so far
+            status += time_nibble2base(readlen, count);
+        } else if (c == 'v') {
+            status += validate_nibble2base();
+        }
+        // Anything else, including getopt()'s '?', is ignored
+    }
+
+    return status;
+}
diff --git a/textutils.c b/textutils.c
index 0cc2af818..b2c29a893 100644
--- a/textutils.c
+++ b/textutils.c
@@ -220,7 +220,7 @@ static char token_type(hts_json_token *token)
 }
 
 HTSLIB_EXPORT
-hts_json_token * hts_json_alloc_token() {
+hts_json_token * hts_json_alloc_token(void) {
     return calloc(1, sizeof(hts_json_token));
 }
 
diff --git a/vcf.c b/vcf.c
index 9dec8481b..105c7539d 100644
--- a/vcf.c
+++ b/vcf.c
@@ -1567,7 +1567,7 @@ int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
  *** BCF site I/O ***
  ********************/
 
-bcf1_t *bcf_init()
+bcf1_t *bcf_init(void)
 {
     bcf1_t *v;
     v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
@@ -3703,7 +3703,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
 
     overflow = 0;
     char *tmp = p;
-    v->pos = hts_str2uint(p, &p, 63, &overflow);
+    v->pos = hts_str2uint(p, &p, 62, &overflow);
     if (overflow) {
         hts_log_error("Position value '%s' is too large", tmp);
         goto err;
@@ -4020,7 +4020,10 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
 
     kputc_('\t', s); // INFO
     if (v->n_info) {
-        uint8_t *ptr = (uint8_t *)v->shared.s + v->unpack_size[0] + v->unpack_size[1] + v->unpack_size[2];
+        uint8_t *ptr = v->shared.s
+            ? (uint8_t *)v->shared.s + v->unpack_size[0] +
+               v->unpack_size[1] + v->unpack_size[2]
+            : NULL;
         int first = 1;
         bcf_info_t *info = v->d.info;
 
@@ -4235,6 +4238,8 @@ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
     if ( fp->format.compression!=no_compression ) {
         if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
             return -1;
+        if (fp->idx && !fp->fp.bgzf->mt)
+            hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
         ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
     } else {
         ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
@@ -4288,7 +4293,7 @@ static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
     }
     if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
     max_len += 256;
-    s = 1LL << (min_shift + starting_n_lvls * 3);
+    s = hts_bin_maxpos(min_shift, starting_n_lvls);
     for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3);
 
     if (nids_out) *nids_out = nids;
diff --git a/version.sh b/version.sh
index 98ae48ec0..f35234c2d 100755
--- a/version.sh
+++ b/version.sh
@@ -24,7 +24,7 @@
 # DEALINGS IN THE SOFTWARE.
 
 # Master version, for use in tarballs or non-git source copies
-VERSION=1.20
+VERSION=1.21
 
 # If we have a git clone, then check against the current tag
 srcdir=${0%/version.sh}