diff --git a/.cirrus.yml b/.cirrus.yml
index 28f0112e8..2f0b2da73 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -39,7 +39,7 @@ task:
       CXX: g++-4.9
   compile_script: |
     mkdir build && cd build
-    cmake -DCMAKE_BUILD_TYPE=Release -DHAVE_TESTS=1 -DENABLE_WERROR=1 -DHAVE_SSE4_1=1 -DREQUIRE_OPENMP=0 ..
+    cmake -DCMAKE_BUILD_TYPE=Release -DHAVE_TESTS=1 -DENABLE_WERROR=0 -DHAVE_SSE4_1=1 -DREQUIRE_OPENMP=0 ..
     make -j $(nproc --all)
   test_script: MMSEQS_NUM_THREADS=4 ./util/regression/run_regression.sh ./build/src/mmseqs SCRATCH SEARCH
diff --git a/.github/workflows/Dockerfile.GPU-manylinux2014 b/.github/workflows/Dockerfile.GPU-manylinux2014
new file mode 100644
index 000000000..5559fc470
--- /dev/null
+++ b/.github/workflows/Dockerfile.GPU-manylinux2014
@@ -0,0 +1,77 @@
+FROM quay.io/pypa/manylinux2014_x86_64
+ARG VER="12-6"
+ARG ARCH="x86_64"
+
+# CUDA
+RUN yum install -y yum-utils
+RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
+# libcublas-devel-${VER}.${ARCH}
+RUN yum -y install cuda-nvcc-${VER}.${ARCH} cuda-cudart-devel-${VER}.${ARCH}
+RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/999_nvidia_cuda.conf
+ENV PATH="/usr/local/cuda/bin:${PATH}"
+ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
+ENV CUDA_HOME=/usr/local/cuda
+ENV CUDA_ROOT=/usr/local/cuda
+ENV CUDA_PATH=/usr/local/cuda
+ENV CUDADIR=/usr/local/cuda
+
+# Build system
+RUN yum install -y git wget vim zlib-devel bzip2-devel ninja-build centos-release-scl
+RUN mv /etc/yum.repos.d/CentOS-SCLo-scl.repo /etc/yum.repos.d/CentOS-SCLo-scl.repo.disabled
+RUN yum install -y devtoolset-11-gcc devtoolset-11-gcc-c++ devtoolset-11-libatomic-devel
+ENV CC=/opt/rh/devtoolset-11/root/bin/gcc
+ENV CXX=/opt/rh/devtoolset-11/root/bin/g++
+ENV CUDAHOSTCXX=/opt/rh/devtoolset-11/root/bin/g++
+ENV CUDACXX=/usr/local/cuda/bin/nvcc
+ENV LIBGCC=/opt/rh/devtoolset-11/root/usr/lib/gcc/x86_64-redhat-linux/11
+
+# cmake
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.31.0/cmake-3.31.0-linux-x86_64.sh; \
+    chmod +x cmake-3.31.0-linux-x86_64.sh; \
+    ./cmake-3.31.0-linux-x86_64.sh --skip-license --prefix=/usr/local;
+
+RUN mkdir /deps; \
+    cd /deps; \
+    wget https://sourceware.org/pub/bzip2/bzip2-1.0.8.tar.gz; \
+    tar xzvf bzip2-1.0.8.tar.gz; \
+    cd bzip2-1.0.8; \
+    make install PREFIX=/deps;
+
+RUN cd /deps; \
+    wget https://www.zlib.net/zlib-1.3.1.tar.gz; \
+    tar xzvf zlib-1.3.1.tar.gz; \
+    cd zlib-1.3.1; \
+    ./configure --prefix=/deps --static; \
+    make install;
+
+# compile
+WORKDIR /work
+ADD . .
+RUN mkdir -p /work/build && cd /work/build; \
+    if [ -e "${LIBGCC}/libgomp.so" ]; then \
+        mv -f -- "${LIBGCC}/libgomp.so" "${LIBGCC}/libgomp.so.disabled"; \
+    fi; \
+    /usr/local/bin/cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DHAVE_TESTS=1 -DENABLE_WERROR=1 -DHAVE_AVX2=1 \
+        -DOpenMP_C_FLAGS="-fopenmp -I${LIBGCC} -L${LIBGCC}" -DOpenMP_C_LIB_NAMES=gomp -DOpenMP_CXX_FLAGS="-fopenmp -I${LIBGCC} -L${LIBGCC}" -DOpenMP_CXX_LIB_NAMES=gomp -DOpenMP_gomp_LIBRARY="${LIBGCC}/libgomp.a" \
+        -DATOMIC_LIB_OVERRIDE="${LIBGCC}/libatomic.a" \
+        -DCMAKE_POLICY_DEFAULT_CMP0074=NEW -DCMAKE_POLICY_DEFAULT_CMP0144=NEW \
+        -DZLIB_ROOT=/deps -DBZIP2_ROOT=/deps \
+        -DFORCE_STATIC_DEPS=1 -DENABLE_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES="75-real;80-real;86-real;89-real;90" ..; \
+    cmake --build . -j$(nproc --all) -v;
+
+RUN if ldd /work/build/src/mmseqs | grep -P -v "linux-vdso.so|/lib64/(ld-linux-x86-64|libc|libm|libdl|librt|libpthread).so" | grep -q .; then \
+        echo "Error: unwanted libraries found"; \
+        ldd /work/build/src/mmseqs; \
+        exit 1; \
+    fi; \
+    if readelf -Ws /work/build/src/mmseqs | grep -q GLIBC_PRIVATE; then \
+        echo "Error: binary contains private glibc symbols"; \
+        readelf -Ws /work/build/src/mmseqs; \
+        exit 1; \
+    fi; \
+    LIBC_V=$(readelf -V /work/build/src/mmseqs | awk '$3 ~ /^GLIBC_/ { print $3 }' | sort -V | tail -n1); \
+    if [[ "$LIBC_V" > "GLIBC_2.17" ]]; then \
+        echo "Error: glibc too new"; \
+        readelf -V /work/build/src/mmseqs; \
+        exit 1; \
+    fi;
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index dbad6f469..7c84ef085 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -11,9 +11,11 @@ on:
       tag:
         required: true
         type: string
+        description: "Docker tag"
       latest:
         default: false
         type: boolean
+        description: "Mark as latest"
 
 env:
 
@@ -45,7 +47,7 @@ jobs:
      - name: Extract metadata (tags, labels) for Docker
        id: meta
-       uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
+       uses: docker/metadata-action@8e5442c4ef9f78752691e2d8f8d19755c6f78e81
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
 
@@ -77,4 +79,63 @@ jobs:
            ${{ steps.dispatch_tag.outputs.tag }}
            ${{ steps.dispatch_tag.outputs.latest }}
          labels: ${{ steps.meta.outputs.labels }}
+  build-and-push-gpu-image:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      # - name: Set up QEMU
+      #   uses: docker/setup-qemu-action@v2
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+
+      - name: Log in to the Container registry
+        uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@8e5442c4ef9f78752691e2d8f8d19755c6f78e81
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          flavor: |
+            suffix=-cuda12,onlatest=true
+
+      - name: Tag for workflow_dispatch
+        id: dispatch_tag
+        run: |
+          if [ x"$TAG" != x"" ];then
+            echo "::set-output name=tag::${FULL_TAG}"
+          fi
+          if [ x"$LATEST" = x"true" ]; then
+            echo "::set-output name=latest::${LATEST_TAG}"
+          fi
+        env:
+          FULL_TAG: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.event.inputs.tag }}
+          LATEST_TAG: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest
+          TAG: ${{ github.event.inputs.tag }}
+          LATEST: ${{ github.event.inputs.latest }}
+      - name: Build and push Docker image
+        uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
+        with:
+          context: .
+          platforms: linux/amd64
+          push: true
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          build-args: |
+            GPU=1
+          tags: |
+            ${{ steps.meta.outputs.tags }}
+            ${{ steps.dispatch_tag.outputs.tag }}
+            ${{ steps.dispatch_tag.outputs.latest }}
+          labels: ${{ steps.meta.outputs.labels }}
diff --git a/.github/workflows/mac-arm64.yml b/.github/workflows/mac-arm64.yml
index 508b708d4..b708d233f 100644
--- a/.github/workflows/mac-arm64.yml
+++ b/.github/workflows/mac-arm64.yml
@@ -7,12 +7,16 @@ on:
 
 jobs:
   build:
-    runs-on: [self-hosted, macOS, ARM64]
+    runs-on: macos-latest
     steps:
       - uses: actions/checkout@v3
        with:
          submodules: true
 
+      - name: Dependencies
+        run: |
+          brew install -f --overwrite cmake libomp
+
      - name: Build
        run: |
          mkdir -p build
diff --git a/.gitignore b/.gitignore
index 59ee4a0d1..44a1aa978 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,6 +30,7 @@ src/workflow/time_test
 
 build/
+build-*/
 .idea/
 cmake-build-*/
 BenchmarkingDatas/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index adde42630..d86ed6b11 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,6 +18,25 @@ set(HAVE_ARM8 0 CACHE BOOL "Have ARMv8 CPU")
 set(HAVE_S390X 0 CACHE BOOL "Have s390x architecture")
 set(NATIVE_ARCH 1 CACHE BOOL "Assume native architecture for SIMD. Use one of the HAVE_* options or set CMAKE_CXX_FLAGS to the appropriate flags if you disable this.")
 set(USE_SYSTEM_ZSTD 0 CACHE BOOL "Use zstd provided by system instead of bundled version")
+set(ENABLE_CUDA 0 CACHE BOOL "Enable CUDA")
+set(FORCE_STATIC_DEPS 0 CACHE BOOL "Force static linking of deps")
+
+if(FORCE_STATIC_DEPS)
+    if(ENABLE_CUDA)
+        set(CMAKE_FIND_LIBRARY_SUFFIXES .a .so CACHE INTERNAL "" FORCE)
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++")
+    else()
+        set(CMAKE_FIND_LIBRARY_SUFFIXES .a CACHE INTERNAL "" FORCE)
+        set(CMAKE_LINK_SEARCH_START_STATIC ON CACHE INTERNAL "" FORCE)
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static -static-libgcc -static-libstdc++")
+    endif()
+    set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "" FORCE)
+endif()
+
+if(NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
 
 if (HAVE_SANITIZER)
     include(FindUBSan)
@@ -26,10 +45,6 @@ if (HAVE_SANITIZER)
     include(FindTSan)
 endif ()
 
-if (NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE Release)
-endif ()
-
 # find compiler
 if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
     message("-- Compiler is clang(++)")
@@ -77,7 +92,11 @@ elseif (HAVE_S390X)
     set(ZARCH 1 CACHE INTERNAL "")
 endif ()
 
-if (NATIVE_ARCH AND (MMSEQS_ARCH STREQUAL ""))
+if (MMSEQS_ARCH)
+    set(NATIVE_ARCH 0 CACHE INTERNAL "")
+endif ()
+
+if (NATIVE_ARCH)
     if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*|aarch64.*|AARCH64.*)")
         set(ARM 1 CACHE INTERNAL "")
     elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "PPC64*|ppc64*|powerpc64*")
@@ -205,21 +224,29 @@ add_subdirectory(lib/tinyexpr EXCLUDE_FROM_ALL)
 include_directories(lib/microtar)
 add_subdirectory(lib/microtar)
 
+# tantan
+include_directories(lib/tantan)
+add_subdirectory(lib/tantan)
+
 # simde
 include_directories(lib/simde)
 include_directories(lib)
 include_directories(lib/simd)
-include_directories(lib/gzstream)
 include_directories(lib/alp)
-include_directories(lib/cacode)
 include_directories(lib/ksw2)
 include_directories(lib/xxhash)
 if (NOT DISABLE_IPS4O)
     include_directories(lib/ips4o)
 endif ()
-add_subdirectory(lib/cacode)
+# libmarv
+if (ENABLE_CUDA)
+    set(LIBRARY_ONLY 1 CACHE INTERNAL "" FORCE)
+    include_directories(lib/libmarv/src)
+    add_subdirectory(lib/libmarv/src EXCLUDE_FROM_ALL)
+endif ()
+
 add_subdirectory(lib/alp)
 add_subdirectory(lib/ksw2)
 add_subdirectory(data)
diff --git a/Dockerfile b/Dockerfile
index 1ee5c3b78..c288363a4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,15 +1,21 @@
 ARG APP=mmseqs
-FROM --platform=$BUILDPLATFORM debian:stable-slim as builder
+FROM --platform=$BUILDPLATFORM debian:bookworm-slim AS builder
 ARG TARGETARCH
 ARG APP
+ARG GPU
 
 RUN dpkg --add-architecture $TARGETARCH \
     && apt-get update \
     && apt-get install -y \
-    build-essential cmake xxd git \
+    build-essential cmake xxd git wget \
     zlib1g-dev libbz2-dev libatomic1 \
-    crossbuild-essential-$TARGETARCH zlib1g-dev:$TARGETARCH libbz2-dev:$TARGETARCH \
-    && rm -rf /var/lib/apt/lists/*
+    crossbuild-essential-$TARGETARCH zlib1g-dev:$TARGETARCH libbz2-dev:$TARGETARCH; \
+    if [ "$GPU" = "1" ]; then \
+        wget https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb; \
+        dpkg -i cuda-keyring_1.1-1_all.deb; \
+        apt-get update && apt-get install -y cuda-nvcc-12-6 cuda-cudart-dev-12-6 ninja-build; \
+    fi; \
+    rm -rf /var/lib/apt/lists/*;
 
 WORKDIR /opt/build
 ADD . .
@@ -22,33 +28,51 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
     mv src/${APP} /opt/build/${APP}_arch; \
     touch /opt/build/${APP}_sse2 /opt/build/${APP}_sse41 /opt/build/${APP}_avx2; \
     else \
-    mkdir -p build_sse2/src && mkdir -p build_sse41/src && mkdir -p build_avx2/src; \
-    cd /opt/build/build_sse2; \
-    cmake -DHAVE_SSE2=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
-    make -j $(nproc --all); \
-    mv src/${APP} /opt/build/${APP}_sse2; \
-    cd /opt/build/build_sse41; \
-    cmake -DHAVE_SSE4_1=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
-    make -j $(nproc --all); \
-    mv src/${APP} /opt/build/${APP}_sse41; \
-    cd /opt/build/build_avx2; \
-    cmake -DHAVE_AVX2=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
-    make -j $(nproc --all); \
-    mv src/${APP} /opt/build/${APP}_avx2; \
-    touch /opt/build/${APP}_arch; \
+    if [ "$GPU" = "1" ]; then \
+        export CUDACXX=/usr/local/cuda/bin/nvcc; \
+        mkdir -p build_avx2/src; \
+        cd /opt/build/build_avx2; \
+        LIBGOMP=/usr/lib/gcc/x86_64-linux-gnu/12/; \
+        cmake -GNinja -DHAVE_AVX2=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DFORCE_STATIC_DEPS=1 \
+            -DENABLE_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES="75-real;80-real;86-real;89-real;90" \
+            -DOpenMP_C_FLAGS="-fopenmp -I${LIBGOMP}" -DOpenMP_C_LIB_NAMES=gomp -DOpenMP_CXX_FLAGS="-fopenmp -I${LIBGOMP}" -DOpenMP_CXX_LIB_NAMES=gomp -DOpenMP_gomp_LIBRARY=${LIBGOMP}/libgomp.a \
+            -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
+        cmake --build . -j$(nproc --all); \
+        mv src/${APP} /opt/build/${APP}_avx2; \
+        touch /opt/build/${APP}_arch /opt/build/${APP}_sse41 /opt/build/${APP}_sse2; \
+    else \
+        mkdir -p build_sse2/src && mkdir -p build_sse41/src && mkdir -p build_avx2/src; \
+        cd /opt/build/build_sse2; \
+        cmake -DHAVE_SSE2=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
+        make -j $(nproc --all); \
+        mv src/${APP} /opt/build/${APP}_sse2; \
+        cd /opt/build/build_sse41; \
+        cmake -DHAVE_SSE4_1=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
+        make -j $(nproc --all); \
+        mv src/${APP} /opt/build/${APP}_sse41; \
+        cd /opt/build/build_avx2; \
+        cmake -DHAVE_AVX2=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
+        make -j $(nproc --all); \
+        mv src/${APP} /opt/build/${APP}_avx2; \
+        touch /opt/build/${APP}_arch; \
+    fi; \
     fi
 
-FROM debian:stable-slim
+FROM debian:bookworm-slim
 ARG TARGETARCH
 ARG APP
-
-RUN apt-get update && apt-get install -y \
-    gawk bash grep libstdc++6 libgomp1 libatomic1 zlib1g libbz2-1.0 wget tar \
-    && rm -rf /var/lib/apt/lists/*
+ARG GPU
 
 COPY --from=builder /opt/build/${APP}_arch /opt/build/${APP}_sse2 /opt/build/${APP}_sse41 /opt/build/${APP}_avx2 /usr/local/bin/
 ADD util/${APP}_wrapper.sh /usr/local/bin/entrypoint
-RUN if [ "$TARGETARCH" = "arm64" ]; then rm -f /usr/local/bin/entrypoint; ln -s /usr/local/bin/${APP}_arch /usr/local/bin/entrypoint; fi
-ENTRYPOINT ["/usr/local/bin/entrypoint"]
+RUN apt-get update && apt-get install -y \
+    gawk bash grep libstdc++6 libgomp1 libatomic1 zlib1g libbz2-1.0 wget tar aria2 \
+    && rm -rf /var/lib/apt/lists/*; \
+    if [ "$TARGETARCH" = "arm64" ]; then \
+        rm -f /usr/local/bin/entrypoint; ln -s /usr/local/bin/${APP}_arch /usr/local/bin/entrypoint; \
+    elif [ "$GPU" = "1" ]; then \
+        rm -f /usr/local/bin/entrypoint; ln -s /usr/local/bin/${APP}_avx2 /usr/local/bin/entrypoint; \
+    fi
+ENTRYPOINT ["/usr/local/bin/entrypoint"]
diff --git a/LICENSE.md b/LICENSE.md
index 9cecc1d46..d6b226ffb 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -1,674 +1,26 @@
-                    GNU GENERAL PUBLIC LICENSE
-                       Version 3, 29 June 2007
+The MIT License (MIT)
+=====================
+
+Copyright © 2024 The MMseqs2 Development Team
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the “Software”), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
-
- Copyright (C) 2007 Free Software Foundation, Inc.
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-                            Preamble
-
-  The GNU General Public License is a free, copyleft license for
-software and other kinds of works.
-
-  The licenses for most software and other practical works are designed
-to take away your freedom to share and change the works. By contrast,
-the GNU General Public License is intended to guarantee your freedom to
-share and change all versions of a program--to make sure it remains free
-software for all its users. We, the Free Software Foundation, use the
-GNU General Public License for most of our software; it applies also to
-any other work released this way by its authors. You can apply it to
-your programs, too.
- - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. 
- - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. 
This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. - - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. 
This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. 
- - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. 
- - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. - - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. 
- - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. 
- - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. - - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. 
If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. 
Interpretation of Sections 15 and 16.
-
-  If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
-
-                     END OF TERMS AND CONDITIONS
-
-            How to Apply These Terms to Your New Programs
-
-  If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
-  To do so, attach the following notices to the program. It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
-    {one line to give the program's name and a brief idea of what it does.}
-    Copyright (C) {year} {name of author}
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program. If not, see <https://www.gnu.org/licenses/>.
-
-Also add information on how to contact you by electronic and paper mail.
-
-  If the program does terminal interaction, make it output a short
-notice like this when it starts in an interactive mode:
-
-    {project} Copyright (C) {year} {fullname}
-    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
-    This is free software, and you are welcome to redistribute it
-    under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License. Of course, your program's commands
-might be different; for a GUI interface, you would use an "about box".
-
-  You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU GPL, see
-<https://www.gnu.org/licenses/>.
-
-  The GNU General Public License does not permit incorporating your program
-into proprietary programs. If your program is a subroutine library, you
-may consider it more useful to permit linking proprietary applications with
-the library. If this is what you want to do, use the GNU Lesser General
-Public License instead of this License. But first, please read
-<https://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/README.md b/README.md
index bb63707ac..a601c549b 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # MMseqs2: ultra fast and sensitive sequence search and clustering suite
-MMseqs2 (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets. MMseqs2 is open source GPL-licensed software implemented in C++ for Linux, MacOS, and (as beta version, via cygwin) Windows. The software is designed to run on multiple cores and servers and exhibits very good scalability. MMseqs2 can run 10000 times faster than BLAST. At 100 times its speed it achieves almost the same sensitivity. It can perform profile searches with the same sensitivity as PSI-BLAST at over 400 times its speed.
+MMseqs2 (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets. MMseqs2 is free and open source software implemented in C++ for Linux, MacOS, and (as beta version, via cygwin) Windows. The software is designed to run on multiple cores and servers and exhibits very good scalability. MMseqs2 can run 10000 times faster than BLAST. At 100 times its speed it achieves almost the same sensitivity. It can perform profile searches with the same sensitivity as PSI-BLAST at over 400 times its speed.
 
 ## Publications
 
@@ -11,17 +11,19 @@ MMseqs2 (Many-against-Many sequence searching) is a software suite to search and
 [Mirdita M, Steinegger M, Breitwieser F, Soding J, Levy Karin E: Fast and sensitive taxonomic assignment to metagenomic contigs. Bioinformatics, doi: 10.1093/bioinformatics/btab184 (2021)](https://doi.org/10.1093/bioinformatics/btab184).
 
-[![BioConda Install](https://img.shields.io/conda/dn/bioconda/mmseqs2.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/mmseqs2) [![Github All Releases](https://img.shields.io/github/downloads/soedinglab/mmseqs2/total.svg)](https://github.com/soedinglab/mmseqs2/releases/latest) [![Biocontainer Pulls](https://img.shields.io/endpoint?url=https%3A%2F%2Fmmseqs.com%2Fbiocontainer.php%3Fcontainer%3Dmmseqs2)](https://biocontainers.pro/#/tools/mmseqs2) [![Build Status](https://dev.azure.com/themartinsteinegger/mmseqs2/_apis/build/status/soedinglab.MMseqs2?branchName=master)](https://dev.azure.com/themartinsteinegger/mmseqs2/_build/latest?definitionId=2&branchName=master)
+[Kallenborn F, Chacon A, Hundt C, Sirelkhatim H, Didi K, Dallago C, Mirdita M, Schmidt B, Steinegger M: GPU-accelerated homology search with MMseqs2. bioRxiv, doi: 10.1101/2024.11.13.623350 (2024)](https://www.biorxiv.org/content/10.1101/2024.11.13.623350v1)
+
+[![BioConda Install](https://img.shields.io/conda/dn/bioconda/mmseqs2.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/mmseqs2) [![Github All Releases](https://img.shields.io/github/downloads/soedinglab/mmseqs2/total.svg)](https://github.com/soedinglab/mmseqs2/releases/latest) [![Biocontainer Pulls](https://img.shields.io/endpoint?url=https%3A%2F%2Fmmseqs.com%2Fbiocontainer.php%3Fcontainer%3Dmmseqs2)](https://biocontainers.pro/#/tools/mmseqs2) [![Build Status](https://dev.azure.com/themartinsteinegger/mmseqs2/_apis/build/status/soedinglab.MMseqs2?branchName=master)](https://dev.azure.com/themartinsteinegger/mmseqs2/_build/latest?definitionId=2&branchName=master)
 
 ## Documentation
-The MMseqs2 user guide is available in our [GitHub Wiki](https://github.com/soedinglab/mmseqs2/wiki) or as a [PDF file](https://mmseqs.com/latest/userguide.pdf) (Thanks to [pandoc](https://github.com/jgm/pandoc)!). The wiki also contains [tutorials](https://github.com/soedinglab/MMseqs2/wiki/Tutorials) to learn how to use MMseqs2 with real data. For questions please open an issue on [GitHub](https://github.com/soedinglab/MMseqs2/issues) or ask in our [chat](https://chat.mmseqs.com).
+The MMseqs2 user guide is available in our [GitHub Wiki](https://github.com/soedinglab/mmseqs2/wiki) or as a [PDF file](https://mmseqs.com/latest/userguide.pdf) (Thanks to [pandoc](https://github.com/jgm/pandoc)!). The wiki also contains [tutorials](https://github.com/soedinglab/MMseqs2/wiki/Tutorials) to learn how to use MMseqs2 with real data. For questions please open an issue on [GitHub](https://github.com/soedinglab/MMseqs2/issues).
 
 Keep posted about MMseqs2/Linclust updates by following Martin on [Twitter](https://twitter.com/thesteinegger).
 
 ## Installation
-MMseqs2 can be used by [compiling from source](https://github.com/soedinglab/MMseqs2/wiki#installation), downloading a statically compiled binary, using [Homebrew](https://github.com/Homebrew/brew), [conda](https://github.com/conda/conda) or [Docker](https://github.com/moby/moby).
+MMseqs2 can be used by [compiling from source](https://github.com/soedinglab/MMseqs2/wiki#installation), downloading a statically compiled binary at [mmseqs.com/latest](https://mmseqs.com/latest), using [Homebrew](https://github.com/Homebrew/brew), [conda](https://github.com/conda/conda) or [Docker](https://github.com/moby/moby).
 
     # install by brew
     brew install mmseqs2
@@ -29,6 +31,8 @@ MMseqs2 can be used by [compiling from source](https://github.com/soedinglab/MMs
     conda install -c conda-forge -c bioconda mmseqs2
     # install docker
     docker pull ghcr.io/soedinglab/mmseqs2
+    # MMseqs2-GPU mostly-static AVX2 build requiring glibc >= 2.29 and nvidia driver >=525.60.13 (see below)
+    wget https://mmseqs.com/latest/mmseqs-linux-gpu.tar.gz; tar xvfz mmseqs-linux-gpu.tar.gz; export PATH=$(pwd)/mmseqs/bin/:$PATH
     # static build with AVX2 (fastest)
     wget https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz; tar xvfz mmseqs-linux-avx2.tar.gz; export PATH=$(pwd)/mmseqs/bin/:$PATH
     # static build with SSE4.1
@@ -36,13 +40,13 @@ MMseqs2 can be used by [compiling from source](https://github.com/soedinglab/MMs
     # static build with SSE2 (slowest, for very old systems)
     wget https://mmseqs.com/latest/mmseqs-linux-sse2.tar.gz; tar xvfz mmseqs-linux-sse2.tar.gz; export PATH=$(pwd)/mmseqs/bin/:$PATH
 
-MMseqs2 requires an AMD or Intel 64-bit system (check with `uname -a | grep x86_64`). We recommend using a system with at least the SSE4.1 instruction set (check by executing `cat /proc/cpuinfo | grep sse4_1` on Linux or `sysctl -a | grep machdep.cpu.features | grep SSE4.1` on MacOS). The AVX2 version is faster than SSE4.1, check if AVX2 is supported by executing `cat /proc/cpuinfo | grep avx2` on Linux and `sysctl -a | grep machdep.cpu.leaf7_features | grep AVX2` on MacOS). A SSE2 version is also available for very old systems.
-
-MMseqs2 also works on ARM64 systems and on PPC64LE systems with POWER8 ISA or newer.
+MMseqs2 requires an AMD or Intel 64-bit system (check with `uname -a | grep x86_64`). We recommend using a system with at least the SSE4.1 instruction set (check by executing `cat /proc/cpuinfo | grep sse4_1` on Linux or `sysctl -a | grep machdep.cpu.features | grep SSE4.1` on MacOS). The AVX2 version is faster than SSE4.1, check if AVX2 is supported by executing `cat /proc/cpuinfo | grep avx2` on Linux and `sysctl -a | grep machdep.cpu.leaf7_features | grep AVX2` on MacOS. A SSE2 version is also available for very old systems. MMseqs2 also works on ARM64 systems and on PPC64LE systems with POWER8 ISA or newer.
 
-We provide static binaries for all supported platforms at [mmseqs.com/latest](https://mmseqs.com/latest).
+> [!NOTE]
+> We recently added support for GPU-accelerated protein sequence and profile searches. This requires an NVIDIA GPU of the Ampere generation or newer for full speed, however, also works at reduced speed for Tesla-generation GPUs.
+> Check the [wiki](https://github.com/soedinglab/MMseqs2/wiki#compile-from-source-for-linux-with-gpu-support) for instructions on how to get started.
 
-MMseqs2 comes with a bash command and parameter auto completion, which can be activated by adding the following lines to your $HOME/.bash_profile:
+MMseqs2 comes with a bash command and parameter auto completion, which can be activated by adding the following to your $HOME/.bash_profile:
 if [ -f /Path to MMseqs2/util/bash-completion.sh ]; then
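The line above is only the opening of the completion hook; the remaining lines are not part of this excerpt. A complete hook would look roughly like the sketch below, where `/Path to MMseqs2` is the README's placeholder that you replace with your actual installation directory:

    # hedged sketch of the full bash-completion block for $HOME/.bash_profile
    # "/Path to MMseqs2" is a placeholder; substitute your MMseqs2 checkout path
    if [ -f /Path to MMseqs2/util/bash-completion.sh ]; then
        . /Path to MMseqs2/util/bash-completion.sh
    fi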
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 798b538c9..6123f05c2 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -40,7 +40,7 @@ jobs:
   - job: build_macos
     displayName: macOS
     pool:
-      vmImage: 'macos-12'
+      vmImage: 'macos-latest'
     steps:
       - checkout: self
         submodules: true
@@ -68,22 +68,22 @@ jobs:
           SIMD: 'AVX2'
           STATIC: 1
           MPI: 0
-          BUILD_TYPE: RelWithDebInfo
+          BUILD_TYPE: Release
         sse41:
           SIMD: 'SSE4_1'
           STATIC: 1
           MPI: 0
-          BUILD_TYPE: RelWithDebInfo
+          BUILD_TYPE: Release
         sse2:
           SIMD: 'SSE2'
           STATIC: 1
           MPI: 0
-          BUILD_TYPE: RelWithDebInfo
+          BUILD_TYPE: Release
         avx2_mpi:
           SIMD: 'AVX2'
           STATIC: 0
           MPI: 1
-          BUILD_TYPE: RelWithDebInfo
+          BUILD_TYPE: Release
         asan:
           SIMD: 'AVX2'
           STATIC: 0
@@ -94,10 +94,9 @@ jobs:
       - checkout: self
         submodules: true
       - script: |
-          #sudo add-apt-repository ppa:ubuntu-toolchain-r/test
-          #sudo apt-get update
-          sudo apt-get install -y g++-10
-        condition: eq(variables['BUILD_TYPE'], 'ASanOpt')
+          sudo add-apt-repository ppa:ubuntu-toolchain-r/test
+          sudo apt-get update
+          sudo apt-get install -y gcc-11 g++-11 libgcc-11-dev build-essential
         displayName: Install newer G++
       - script: |
           sudo apt-get update
@@ -106,28 +105,21 @@ jobs:
         condition: eq(variables['MPI'], 1)
       - script: |
           mkdir build && cd build
-          if [ "${BUILD_TYPE}" = "ASanOpt" ]; then
-              export CC=gcc-10 ; export CXX=g++-10
-          fi
+          export CC=gcc-11 ; export CXX=g++-11
           if [ "${STATIC}" -eq "1" ]; then
-              cmake -DHAVE_SANITIZER=1 -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DHAVE_TESTS=1 \
-                -DBUILD_SHARED_LIBS=OFF \
-                -DCMAKE_EXE_LINKER_FLAGS="-static -static-libgcc \
-                -static-libstdc++" -DCMAKE_FIND_LIBRARY_SUFFIXES=".a" \
-                -DENABLE_WERROR=1 -DHAVE_${SIMD}=1 -DHAVE_MPI=${MPI} ..
+            LIBGOMP=/usr/lib/gcc/x86_64-linux-gnu/11
+            cmake -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DFORCE_STATIC_DEPS=1 -DHAVE_TESTS=1 \
+              -DOpenMP_C_FLAGS="-fopenmp -I${LIBGOMP}" -DOpenMP_C_LIB_NAMES=gomp -DOpenMP_CXX_FLAGS="-fopenmp -I${LIBGOMP}" -DOpenMP_CXX_LIB_NAMES=gomp -DOpenMP_gomp_LIBRARY=${LIBGOMP}/libgomp.a \
+              -DENABLE_WERROR=1 -DHAVE_${SIMD}=1 -DHAVE_MPI=${MPI} ..
           else
-              cmake -DHAVE_SANITIZER=1 -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DHAVE_TESTS=1 \
-                -DENABLE_WERROR=1 -DHAVE_${SIMD}=1 -DHAVE_MPI=${MPI} ..
+            cmake -DHAVE_SANITIZER=1 -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DHAVE_TESTS=1 \
+              -DENABLE_WERROR=1 -DHAVE_${SIMD}=1 -DHAVE_MPI=${MPI} ..
           fi
 
-          make -j $(nproc --all)
+          make -j $(nproc --all) VERBOSE=1
         displayName: Build MMseqs2
       - script: |
           export TTY=0
-          if [ "${BUILD_TYPE}" = "ASan" ]; then
-            echo "leak:libgomp1" > ${BUILD_SOURCESDIRECTORY}/ASan.supp
-            export ASAN_OPTIONS=suppressions=${BUILD_SOURCESDIRECTORY}/ASan.supp
-          fi
           ${BUILD_SOURCESDIRECTORY}/util/regression/run_regression.sh ${BUILD_SOURCESDIRECTORY}/build/src/mmseqs ${BUILD_SOURCESDIRECTORY}/regression
         displayName: Run Regression Suite
         condition: eq(variables['regression'], 1)
@@ -137,6 +129,28 @@ jobs:
           targetPath: $(Build.SourcesDirectory)/build/src/mmseqs
           artifactName: mmseqs-linux-$(SIMD)
 
+  - job: build_ubuntu_gpu
+    displayName: Ubuntu MMseqs2 GPU
+    pool:
+      vmImage: 'Ubuntu-20.04'
+    timeoutInMinutes: 120
+    steps:
+      - checkout: self
+        submodules: false
+      - script: |
+          mkdir -p ${BUILD_SOURCESDIRECTORY}/output
+          docker build -t manylinux-builder \
+            -f .github/workflows/Dockerfile.GPU-manylinux2014 .
+          docker run --rm \
+            -v ${BUILD_SOURCESDIRECTORY}/output:/output \
+            manylinux-builder \
+            /bin/bash -c "cp /work/build/src/mmseqs /output/"
+        displayName: Run Docker Container and Copy Binary
+      - task: PublishPipelineArtifact@0
+        inputs:
+          targetPath: $(Build.SourcesDirectory)/output/mmseqs
+          artifactName: mmseqs-linux-gpu
+
   - job: build_ubuntu_cross
     displayName: Ubuntu Cross-Compile
     pool:
@@ -144,10 +158,10 @@ jobs:
     timeoutInMinutes: 120
     strategy:
       matrix:
-        power8:
-          SIMD: POWER8
-          ARCH: ppc64el
-          CPREF: powerpc64le
+        # power8:
+        #   SIMD: POWER8
+        #   ARCH: ppc64el
+        #   CPREF: powerpc64le
         power9:
           SIMD: POWER9
           ARCH: ppc64el
@@ -175,11 +189,10 @@ jobs:
         displayName: Install Toolchain
       - script: |
           mkdir build && cd build
+          LIBGOMP=/usr/lib/gcc-cross/${CPREF}-linux-gnu/9;
           CC=${CPREF}-linux-gnu-gcc CXX=${CPREF}-linux-gnu-g++ \
-           cmake -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DHAVE_TESTS=1 \
-            -DBUILD_SHARED_LIBS=OFF \
-            -DCMAKE_EXE_LINKER_FLAGS="-static -static-libgcc \
-            -static-libstdc++" -DCMAKE_FIND_LIBRARY_SUFFIXES=".a" \
+           cmake -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DHAVE_TESTS=1 -DFORCE_STATIC_DEPS=1 \
+            -DOpenMP_C_FLAGS="-fopenmp -I${LIBGOMP}" -DOpenMP_C_LIB_NAMES=gomp -DOpenMP_CXX_FLAGS="-fopenmp -I${LIBGOMP}" -DOpenMP_CXX_LIB_NAMES=gomp -DOpenMP_gomp_LIBRARY=${LIBGOMP}/libgomp.a \
             -DENABLE_WERROR=1 -DHAVE_${SIMD}=1 ..
           make -j $(nproc --all)
         displayName: Build MMseqs2
@@ -252,6 +265,7 @@ jobs:
       - build_ubuntu_userguide
       - build_macos
       - build_ubuntu
+      - build_ubuntu_gpu
       - build_ubuntu_cross
       - build_windows
     steps:
@@ -319,28 +333,40 @@ jobs:
           archiveType: tar
       - task: DownloadPipelineArtifact@1
         inputs:
-          artifactName: mmseqs-linux-POWER9
+          artifactName: mmseqs-linux-gpu
           targetPath: $(Build.SourcesDirectory)/mmseqs/bin
       - script:
           chmod +x "${BUILD_SOURCESDIRECTORY}/mmseqs/bin/mmseqs"
       - task: ArchiveFiles@2
         inputs:
           rootFolderOrFile: $(Build.SourcesDirectory)/mmseqs
-          archiveFile: $(Build.SourcesDirectory)/mmseqs-linux-ppc64le-power9.tar.gz
+          archiveFile: $(Build.SourcesDirectory)/mmseqs-linux-gpu.tar.gz
           includeRootFolder: true
           archiveType: tar
       - task: DownloadPipelineArtifact@1
         inputs:
-          artifactName: mmseqs-linux-POWER8
+          artifactName: mmseqs-linux-POWER9
           targetPath: $(Build.SourcesDirectory)/mmseqs/bin
       - script:
           chmod +x "${BUILD_SOURCESDIRECTORY}/mmseqs/bin/mmseqs"
       - task: ArchiveFiles@2
         inputs:
           rootFolderOrFile: $(Build.SourcesDirectory)/mmseqs
-          archiveFile: $(Build.SourcesDirectory)/mmseqs-linux-ppc64le-power8.tar.gz
+          archiveFile: $(Build.SourcesDirectory)/mmseqs-linux-ppc64le-power9.tar.gz
           includeRootFolder: true
           archiveType: tar
+      # - task: DownloadPipelineArtifact@1
+      #   inputs:
+      #     artifactName: mmseqs-linux-POWER8
+      #     targetPath: $(Build.SourcesDirectory)/mmseqs/bin
+      # - script:
+      #     chmod +x "${BUILD_SOURCESDIRECTORY}/mmseqs/bin/mmseqs"
+      # - task: ArchiveFiles@2
+      #   inputs:
+      #     rootFolderOrFile: $(Build.SourcesDirectory)/mmseqs
+      #     archiveFile: $(Build.SourcesDirectory)/mmseqs-linux-ppc64le-power8.tar.gz
+      #     includeRootFolder: true
+      #     archiveType: tar
       - task: DownloadPipelineArtifact@1
         inputs:
           artifactName: mmseqs-linux-ARM8
@@ -378,14 +404,16 @@ jobs:
           ssh-keygen -f ~/.ssh/id_rsa -y > ~/.ssh/id_rsa.pub
           cd "${BUILD_SOURCESDIRECTORY}"
           cp mmseqs/userguide.pdf userguide.pdf
+          # disabled: mmseqs-linux-ppc64le-power8.tar.gz
+          # -F file[]=@mmseqs-linux-ppc64le-power8.tar.gz -F signature[]=@mmseqs-linux-ppc64le-power8.tar.gz.sig
           ssh-keygen -Y sign -f ~/.ssh/id_rsa -n file \
             userguide.pdf \
             mmseqs-osx-universal.tar.gz \
             mmseqs-linux-sse2.tar.gz \
             mmseqs-linux-sse41.tar.gz \
             mmseqs-linux-avx2.tar.gz \
+            mmseqs-linux-gpu.tar.gz \
             mmseqs-linux-arm64.tar.gz \
-            mmseqs-linux-ppc64le-power8.tar.gz \
             mmseqs-linux-ppc64le-power9.tar.gz \
             mmseqs-win64.zip
           curl --retry 5 --retry-all-errors -X POST \
@@ -394,8 +422,8 @@ jobs:
             -F file[]=@mmseqs-linux-sse2.tar.gz -F signature[]=@mmseqs-linux-sse2.tar.gz.sig \
             -F file[]=@mmseqs-linux-sse41.tar.gz -F signature[]=@mmseqs-linux-sse41.tar.gz.sig \
             -F file[]=@mmseqs-linux-avx2.tar.gz -F signature[]=@mmseqs-linux-avx2.tar.gz.sig \
+            -F file[]=@mmseqs-linux-gpu.tar.gz -F signature[]=@mmseqs-linux-gpu.tar.gz.sig \
             -F file[]=@mmseqs-linux-arm64.tar.gz -F signature[]=@mmseqs-linux-arm64.tar.gz.sig \
-            -F file[]=@mmseqs-linux-ppc64le-power8.tar.gz -F signature[]=@mmseqs-linux-ppc64le-power8.tar.gz.sig \
             -F file[]=@mmseqs-linux-ppc64le-power9.tar.gz -F signature[]=@mmseqs-linux-ppc64le-power9.tar.gz.sig \
             -F file[]=@mmseqs-win64.zip  -F signature[]=@mmseqs-win64.zip.sig \
             -F identifier="mmseqs" -F directory="${BUILD_SOURCEVERSION}" \
diff --git a/cmake/FindAtomic.cmake b/cmake/FindAtomic.cmake
index c17169296..088f32ee6 100644
--- a/cmake/FindAtomic.cmake
+++ b/cmake/FindAtomic.cmake
@@ -1,48 +1,56 @@
-# From https://github.com/cern-eos/eos/blob/master/cmake/FindAtomic.cmake
-# License: GPL-3-or-later
-# Try to find libatomic
-# Once done, this will define
-#
-# ATOMIC_FOUND        - system has libatomic
-# ATOMIC_LIBRARIES    - libraries needed to use libatomic
-#
+# based on
+# https://raw.githubusercontent.com/eProsima/Fast-DDS/d607eefc91e2623cde8bc71d14f275ac57ba5c4f/cmake/modules/FindAtomic.cmake
+# license: Apache-2.0
 
 include(CheckCXXSourceCompiles)
-
 check_cxx_source_compiles("
-           int main() {
-             volatile unsigned __int128 all_ = 4;
-             __atomic_fetch_add(&all_, 8, __ATOMIC_RELAXED);
-             return 0;
-           }
-        "
-        ATOMIC_LIBRARY_NATIVE)
+    int main() {
+        volatile unsigned __int128 i = 4;
+        __atomic_fetch_add(&i, 8, __ATOMIC_RELAXED);
+        __atomic_fetch_sub(&i, 8, __ATOMIC_RELAXED);
+        return 0;
+    }"
+    ATOMIC_NATIVE
+)
 
-if (ATOMIC_LIBRARY_NATIVE)
-    set(ATOMIC_FOUND 1)
-    set(ATOMIC_LIBRARY)
+if (ATOMIC_NATIVE)
+  set(ATOMIC_FOUND 1)
+  set(ATOMIC_LIBRARY)
 else ()
-    set(CMAKE_REQUIRED_LIBRARIES "-latomic")
+  set(_OLD_CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}")
+  find_library(ATOMIC_LIBRARY_PATH
+    NAMES atomic
+  )
+
+  if (ATOMIC_LIBRARY_PATH)
+    set(ATOMIC_LIBRARY ${ATOMIC_LIBRARY_PATH})
+  else ()
+    set(ATOMIC_LIBRARY "-latomic")
+  endif ()
+
+  set(CMAKE_REQUIRED_LIBRARIES "${ATOMIC_LIBRARY}")
     check_cxx_source_compiles("
-           int main() {
-             volatile unsigned __int128 all_ = 4;
-             __atomic_fetch_add(&all_, 8, __ATOMIC_RELAXED);
-             return 0;
-           }
-        "
-            ATOMIC_LIBRARY_LIB)
-    set(CMAKE_REQUIRED_LIBRARIES)
-    if (ATOMIC_LIBRARY_LIB)
-        set(ATOMIC_FOUND 1)
-        set(ATOMIC_LIBRARY atomic)
+      int main() {
+          volatile unsigned __int128 i = 4;
+          __atomic_fetch_add(&i, 8, __ATOMIC_RELAXED);
+          __atomic_fetch_sub(&i, 8, __ATOMIC_RELAXED);
+          return 0;
+      }"
+      ATOMIC_WITH_LIB
+    )
+    set(CMAKE_REQUIRED_LIBRARIES "${_OLD_CMAKE_REQUIRED_LIBRARIES}")
+    unset(_OLD_CMAKE_REQUIRED_LIBRARIES)
+    if (ATOMIC_WITH_LIB)
+      set(ATOMIC_FOUND 1)
     else ()
-        find_library(ATOMIC_LIBRARY
-                NAMES atomic atomic.so.1 libatomic.so.1 libatomic.dylib libatomic.1.dylib libatomic.a
-                HINTS ${ATOMIC_ROOT}
-                PATH_SUFFIXES ${CMAKE_INSTALL_LIBDIR})
+      set(ATOMIC_FOUND 0)
+      unset(ATOMIC_LIBRARY)
     endif ()
     include(FindPackageHandleStandardArgs)
     find_package_handle_standard_args(Atomic DEFAULT_MSG ATOMIC_LIBRARY)
 endif ()
+
 set(ATOMIC_LIBRARIES ${ATOMIC_LIBRARY})
 unset(ATOMIC_LIBRARY)
+unset(ATOMIC_WITH_LIB)
+unset(ATOMIC_NATIVE)
\ No newline at end of file
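The rewritten FindAtomic module probes whether 128-bit `__atomic_fetch_add`/`__atomic_fetch_sub` link without extra libraries and only falls back to libatomic when they do not. The same probe can be reproduced by hand when debugging a toolchain; a minimal shell sketch (file names arbitrary, `g++` stands in for whatever compiler is under test):

    # emulate FindAtomic.cmake: do 128-bit atomics need -latomic with this toolchain?
    printf 'int main() {\n  volatile unsigned __int128 i = 4;\n  __atomic_fetch_add(&i, 8, __ATOMIC_RELAXED);\n  __atomic_fetch_sub(&i, 8, __ATOMIC_RELAXED);\n  return 0;\n}\n' > atomic_check.cpp
    if g++ atomic_check.cpp -o atomic_check 2>/dev/null; then
        echo "native atomics: ATOMIC_LIBRARIES stays empty"
    elif g++ atomic_check.cpp -o atomic_check -latomic 2>/dev/null; then
        echo "needs libatomic: ATOMIC_LIBRARIES would be -latomic (or the found library path)"
    else
        echo "no usable 128-bit atomics: ATOMIC_FOUND would be 0"
    fi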
diff --git a/cmake/xxdi.pl b/cmake/xxdi.pl
index 8d18358d2..4410fe363 100755
--- a/cmake/xxdi.pl
+++ b/cmake/xxdi.pl
@@ -1,56 +1,18 @@
 #!/usr/bin/env perl
-#
-# xxdi.pl - perl implementation of 'xxd -i' mode
-#
-# Copyright 2013 Greg Kroah-Hartman 
-# Copyright 2013 Linux Foundation
-#
-# Released under the GPLv2.
-#
-# Implements the "basic" functionality of 'xxd -i' in perl to keep build
-# systems from having to build/install/rely on vim-core, which not all
-# distros want to do.  But everyone has perl, so use it instead.
-#
 
 use strict;
 use warnings;
-sub slurp {
-    my $file = shift;
-    open my $fh, '<', $file or die;
-    local $/ = undef;
-    my $cont = <$fh>;
-    close $fh;
-    return $cont;
-}
-my $indata = slurp(@ARGV ? $ARGV[0] : \*STDIN);
-my $len_data = length($indata);
-my $num_digits_per_line = 12;
-my $var_name;
-my $outdata;
 
-# Use the variable name of the file we read from, converting '/' and '.
-# to '_', or, if this is stdin, just use "stdin" as the name.
-if (@ARGV) {
-	$var_name = $ARGV[0];
-	$var_name =~ s/\//_/g;
-	$var_name =~ s/\./_/g;
-} else {
-	$var_name = "stdin";
-}
+my $file = shift;
+open my $input, '<', $file or die "Can't open file for read: $file $!";
+my $text = do { local $/; <$input> };
+close $input;
 
-$outdata .= "unsigned char $var_name\[] = {";
-
-# trailing ',' is acceptable, so instead of duplicating the logic for
-# just the last character, live with the extra ','.
-for (my $key= 0; $key < $len_data; $key++) {
-	if ($key % $num_digits_per_line == 0) {
-		$outdata .= "\n\t";
-	}
-	$outdata .= sprintf("0x%.2x, ", ord(substr($indata, $key, 1)));
-}
-
-$outdata .= "\n};\nunsigned int $var_name\_len = $len_data;\n";
-
-binmode STDOUT;
-print {*STDOUT} $outdata;
+my @hex_values = map { "0x$_" } unpack("(H2)*", $text);
+my $hex_data = join(",", map { ($_ % 16 == 0 ? "\n\t" : "") . $hex_values[$_] } 0 .. $#hex_values);
+my $len_data = length($text);
 
+my $varname = $file;
+$varname =~ s/[\/.]/_/g;
+print "unsigned char $varname\[\] = { $hex_data \n};\n";
+print "unsigned int ${varname}_len = $len_data;\n";
diff --git a/data/PAM10.out b/data/PAM10.out
index a9d4a08d9..b57ba817e 100644
--- a/data/PAM10.out
+++ b/data/PAM10.out
@@ -1,4 +1,6 @@
 # PAM10
+# Background (precomputed optional): 0.07766 0.03092 0.05514 0.05448 0.04223 0.08321 0.02901 0.03984 0.08275 0.08452 0.01395 0.03728 0.05864 0.03950 0.04071 0.07601 0.05556 0.05770 0.01099 0.02991 0.00001
+# Lambda     (precomputed optional): 0.34513
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	7	-10	-6	-5	-12	-4	-11	-8	-10	-9	-8	-7	-4	-7	-10	-3	-3	-5	-20	-11	-6
 C	-10	10	-21	-20	-19	-13	-10	-9	-20	-21	-20	-17	-11	-20	-11	-6	-11	-9	-22	-7	-13
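The two new header lines cache the statistical parameters of each substitution matrix: the background residue frequencies p_i (the final near-zero value belongs to X) and the Karlin-Altschul scale lambda of the scores s_ij. Marking them "precomputed optional" suggests they no longer have to be solved for at load time, which is consistent with the removal of lib/cacode's lambda calculator later in this diff. For reference, lambda is the positive solution of the standard identity

    \sum_{i,j} p_i \, p_j \, e^{\lambda s_{ij}} = 1

so for matrices already expressed in half-bit units one expects \lambda = \ln(2)/2 \approx 0.3466, which is exactly what the BLOSUM files below carry; the integer PAM matrices are not in half-bit units, so their lambda values deviate.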
diff --git a/data/PAM100.out b/data/PAM100.out
index f17f4f3d4..c434e4c90 100644
--- a/data/PAM100.out
+++ b/data/PAM100.out
@@ -1,4 +1,6 @@
 # PAM100
+# Background (precomputed optional): 0.06947 0.03455 0.07300 0.03239 0.03950 0.10068 0.03732 0.04350 0.09240 0.07569 0.01186 0.01757 0.04407 0.04277 0.03835 0.07329 0.05499 0.07859 0.01254 0.02747 0.00001
+# Lambda     (precomputed optional): 0.34508
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	4	-3	-1	0	-5	1	-3	-2	-3	-3	-2	-1	1	-2	-3	1	1	0	-7	-4	-1
 C	-3	9	-7	-8	-7	-5	-4	-3	-8	-8	-7	-5	-4	-8	-5	-1	-4	-3	-9	-1	-5
diff --git a/data/PAM110.out b/data/PAM110.out
index 6e41c28b8..78270beee 100644
--- a/data/PAM110.out
+++ b/data/PAM110.out
@@ -1,4 +1,6 @@
 # PAM110
+# Background (precomputed optional): 0.09255 0.03195 0.05090 0.04371 0.03491 0.08280 0.03571 0.04177 0.08991 0.07037 0.01679 0.02821 0.05518 0.03123 0.03428 0.11003 0.03394 0.06698 0.01185 0.03694 0.00001
+# Lambda     (precomputed optional): 0.34818
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	3	-3	-1	0	-4	1	-3	-1	-3	-3	-2	-1	1	-1	-3	1	1	0	-7	-4	-1
 C	-3	9	-7	-7	-6	-5	-4	-3	-7	-8	-7	-5	-4	-7	-4	-1	-3	-3	-9	-1	-4
diff --git a/data/PAM120.out b/data/PAM120.out
index 3f8ed6a72..abd2a8012 100644
--- a/data/PAM120.out
+++ b/data/PAM120.out
@@ -1,4 +1,6 @@
 # PAM120
+# Background (precomputed optional): 0.09508 0.03207 0.04145 0.06079 0.03360 0.08079 0.03328 0.03390 0.07939 0.10168 0.01162 0.04435 0.05162 0.02639 0.05219 0.04013 0.07668 0.05892 0.01118 0.03489 0.00001
+# Lambda     (precomputed optional): 0.34997
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	3	-3	0	0	-4	1	-3	-1	-2	-3	-2	-1	1	-1	-3	1	1	0	-7	-4	-1
 C	-3	9	-7	-7	-6	-4	-4	-3	-7	-7	-6	-5	-4	-7	-4	0	-3	-3	-8	-1	-4
diff --git a/data/PAM130.out b/data/PAM130.out
index 940e165a7..c4758a38e 100644
--- a/data/PAM130.out
+++ b/data/PAM130.out
@@ -1,4 +1,6 @@
 # PAM130
+# Background (precomputed optional): 0.09279 0.03221 0.04683 0.05288 0.04483 0.08180 0.02479 0.04990 0.07677 0.09507 0.01026 0.03186 0.04969 0.04310 0.05370 0.05572 0.05784 0.05482 0.01138 0.03374 0.00001
+# Lambda     (precomputed optional): 0.34338
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	3	-3	0	0	-4	1	-2	-1	-2	-3	-2	0	1	-1	-3	1	1	0	-6	-4	-1
 C	-3	9	-6	-6	-5	-4	-4	-3	-6	-7	-6	-4	-3	-6	-4	0	-3	-2	-8	-1	-4
diff --git a/data/PAM140.out b/data/PAM140.out
index 6184512e5..42ac4e49a 100644
--- a/data/PAM140.out
+++ b/data/PAM140.out
@@ -1,4 +1,6 @@
 # PAM140
+# Background (precomputed optional): 0.07973 0.03324 0.06785 0.06270 0.04608 0.08204 0.03583 0.05305 0.06861 0.08888 0.01687 0.03047 0.04731 0.03358 0.04621 0.05083 0.05984 0.05092 0.01203 0.03393 0.00001
+# Lambda     (precomputed optional): 0.33861
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	3	-2	0	0	-4	1	-2	-1	-2	-2	-2	0	1	-1	-2	1	1	0	-6	-4	-1
 C	-2	9	-6	-6	-5	-4	-4	-3	-6	-7	-6	-4	-3	-6	-4	0	-3	-2	-8	-1	-4
diff --git a/data/PAM160.out b/data/PAM160.out
index 0de3eb280..bf880a579 100644
--- a/data/PAM160.out
+++ b/data/PAM160.out
@@ -1,4 +1,6 @@
 # PAM160
+# Background (precomputed optional): 0.08625 0.02678 0.06346 0.03951 0.03951 0.09517 0.04313 0.02268 0.10432 0.07943 0.01145 0.00249 0.05627 0.03704 0.02415 0.07011 0.07572 0.08561 0.01067 0.02623 0.00001
+# Lambda     (precomputed optional): 0.35111
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	2	-2	0	0	-3	1	-2	-1	-2	-2	-1	0	1	-1	-2	1	1	0	-5	-3	0
 C	-2	9	-5	-5	-5	-3	-3	-2	-5	-6	-5	-4	-3	-5	-3	0	-2	-2	-7	0	-3
diff --git a/data/PAM170.out b/data/PAM170.out
index 556cbe765..43d9a7964 100644
--- a/data/PAM170.out
+++ b/data/PAM170.out
@@ -1,4 +1,6 @@
 # PAM170
+# Background (precomputed optional): 0.11152 0.03214 0.04835 0.02832 0.04679 0.09725 0.02864 0.03092 0.08752 0.09134 0.00987 0.05342 0.04804 0.05011 0.04017 0.04956 0.04792 0.06299 0.01041 0.02471 0.00001
+# Lambda     (precomputed optional): 0.23122
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	3	-3	0	0	-5	1	-3	-1	-2	-3	-2	0	1	-1	-3	2	2	0	-8	-5	-1
 C	-3	13	-7	-8	-6	-5	-5	-3	-8	-9	-7	-5	-4	-8	-5	0	-3	-3	-10	0	-4
diff --git a/data/PAM180.out b/data/PAM180.out
index f4782d7ee..bb1634ce1 100644
--- a/data/PAM180.out
+++ b/data/PAM180.out
@@ -1,4 +1,6 @@
 # PAM180
+# Background (precomputed optional): 0.11481 0.02690 0.04900 0.07206 0.03731 0.08013 0.04349 0.05223 0.08574 0.08832 0.01252 0.00794 0.03867 0.02285 0.03715 0.11423 0.03321 0.04308 0.00892 0.03143 0.00001
+# Lambda     (precomputed optional): 0.23847
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	3	-3	0	0	-5	1	-2	-1	-2	-3	-2	0	1	-1	-3	1	2	0	-8	-5	-1
 C	-3	13	-7	-7	-6	-5	-4	-3	-7	-8	-7	-5	-4	-7	-5	0	-3	-3	-10	0	-4
diff --git a/data/PAM190.out b/data/PAM190.out
index d94c486b2..6926f1085 100644
--- a/data/PAM190.out
+++ b/data/PAM190.out
@@ -1,4 +1,6 @@
 # PAM190
+# Background (precomputed optional): 0.11046 0.03056 0.05499 0.05495 0.03682 0.08966 0.04111 0.06123 0.08113 0.07986 0.01564 0.02788 0.05570 0.02737 0.03518 0.06281 0.04450 0.04793 0.01032 0.03187 0.00001
+# Lambda     (precomputed optional): 0.23012
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	3	-3	0	0	-5	1	-2	-1	-2	-3	-2	0	1	-1	-2	1	2	0	-7	-4	0
 C	-3	13	-7	-7	-6	-4	-4	-3	-7	-8	-7	-5	-4	-7	-5	0	-3	-3	-9	0	-4
diff --git a/data/PAM20.out b/data/PAM20.out
index 35f921f66..ab0906521 100644
--- a/data/PAM20.out
+++ b/data/PAM20.out
@@ -1,4 +1,6 @@
 # PAM20
+# Background (precomputed optional): 0.09514 0.03010 0.04701 0.04750 0.04017 0.07864 0.03778 0.03389 0.07727 0.07935 0.01742 0.04493 0.05417 0.03521 0.03805 0.06349 0.06739 0.07367 0.01075 0.02804 0.00001
+# Lambda     (precomputed optional): 0.34521
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	6	-8	-4	-3	-9	-3	-8	-6	-8	-7	-6	-5	-2	-5	-8	-1	-1	-3	-16	-9	-4
 C	-8	10	-16	-16	-15	-11	-8	-7	-16	-17	-16	-13	-9	-16	-9	-4	-9	-7	-18	-5	-11
diff --git a/data/PAM40.out b/data/PAM40.out
index 30146a063..1f86e6c15 100644
--- a/data/PAM40.out
+++ b/data/PAM40.out
@@ -1,4 +1,6 @@
 # PAM40
+# Background (precomputed optional): 0.07681 0.03991 0.05302 0.05129 0.03709 0.09747 0.03162 0.04127 0.09232 0.07298 0.01387 0.04179 0.04715 0.03812 0.04357 0.06855 0.05319 0.06399 0.01006 0.02589 0.00001
+# Lambda     (precomputed optional): 0.34607
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	6	-6	-3	-2	-7	-1	-6	-4	-6	-5	-4	-3	-1	-3	-6	0	0	-2	-12	-7	-3
 C	-6	9	-12	-12	-11	-8	-7	-5	-12	-13	-12	-9	-7	-12	-7	-2	-7	-5	-14	-3	-8
diff --git a/data/PAM50.out b/data/PAM50.out
index 1afc149e6..2a32a9a54 100644
--- a/data/PAM50.out
+++ b/data/PAM50.out
@@ -1,4 +1,6 @@
 # PAM50
+# Background (precomputed optional): 0.09840 0.03831 0.04745 0.04700 0.03230 0.09266 0.02808 0.03292 0.08842 0.09667 0.01591 0.03736 0.04235 0.03443 0.04098 0.05936 0.07078 0.05319 0.00961 0.03380 0.00001
+# Lambda     (precomputed optional): 0.34899
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	5	-5	-2	-1	-7	-1	-5	-3	-5	-5	-4	-2	0	-3	-5	0	0	-1	-11	-6	-2
 C	-5	9	-11	-11	-10	-7	-6	-5	-11	-12	-11	-8	-6	-11	-6	-2	-6	-5	-13	-3	-7
diff --git a/data/PAM60.out b/data/PAM60.out
index 0c045b6f5..28ecfb038 100644
--- a/data/PAM60.out
+++ b/data/PAM60.out
@@ -1,4 +1,6 @@
 # PAM60
+# Background (precomputed optional): 0.07921 0.03546 0.04441 0.04110 0.04363 0.08175 0.03590 0.04001 0.07882 0.08930 0.01286 0.04795 0.05384 0.04398 0.03448 0.06773 0.05968 0.07068 0.00884 0.03037 0.00001
+# Lambda     (precomputed optional): 0.35305
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	5	-5	-2	-1	-6	0	-5	-3	-5	-4	-3	-2	0	-3	-5	1	1	-1	-10	-6	-2
 C	-5	9	-10	-10	-9	-7	-6	-4	-10	-11	-10	-7	-6	-10	-6	-1	-5	-4	-12	-2	-6
diff --git a/data/PAM70.out b/data/PAM70.out
index f49a5a709..e332c8405 100644
--- a/data/PAM70.out
+++ b/data/PAM70.out
@@ -1,4 +1,6 @@
 # PAM70
+# Background (precomputed optional): 0.07759 0.03824 0.04909 0.05960 0.04431 0.08293 0.03596 0.03685 0.07970 0.08850 0.01405 0.03989 0.05563 0.03929 0.03580 0.06069 0.05288 0.06934 0.00989 0.02979 0.00001
+# Lambda     (precomputed optional): 0.34210
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	5	-4	-1	-1	-6	0	-4	-2	-4	-4	-3	-2	0	-2	-4	1	1	-1	-9	-5	-2
 C	-4	9	-9	-9	-8	-6	-5	-4	-9	-10	-9	-7	-5	-9	-5	-1	-5	-4	-11	-2	-6
diff --git a/data/PAM80.out b/data/PAM80.out
index 65b1b5d66..79abe7257 100644
--- a/data/PAM80.out
+++ b/data/PAM80.out
@@ -1,4 +1,6 @@
 # PAM80
+# Background (precomputed optional): 0.09999 0.03564 0.03519 0.05417 0.03993 0.07534 0.03150 0.03550 0.07178 0.08343 0.01793 0.04985 0.05074 0.03864 0.04750 0.07369 0.06212 0.05961 0.00896 0.02848 0.00001
+# Lambda     (precomputed optional): 0.34682
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	4	-4	-1	-1	-5	0	-4	-2	-4	-4	-3	-1	0	-2	-4	1	1	0	-8	-5	-1
 C	-4	9	-9	-9	-8	-6	-5	-4	-9	-9	-8	-6	-5	-9	-5	-1	-4	-3	-10	-2	-5
diff --git a/data/PAM90.out b/data/PAM90.out
index 43bb8791c..dba798a15 100644
--- a/data/PAM90.out
+++ b/data/PAM90.out
@@ -1,4 +1,6 @@
 # PAM90
+# Background (precomputed optional): 0.09013 0.03325 0.04419 0.03474 0.03828 0.10537 0.02771 0.04824 0.09570 0.07713 0.01549 0.03356 0.04602 0.05609 0.03959 0.06940 0.05530 0.05461 0.00847 0.02671 0.00001
+# Lambda     (precomputed optional): 0.35176
 	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
 A	4	-3	-1	0	-5	0	-4	-2	-3	-3	-2	-1	0	-2	-4	1	1	0	-8	-5	-1
 C	-3	9	-8	-8	-7	-5	-5	-3	-8	-9	-8	-6	-5	-8	-5	-1	-4	-3	-10	-1	-5
diff --git a/data/blosum100.out b/data/blosum100.out
index 03dee997a..10e753ddf 100644
--- a/data/blosum100.out
+++ b/data/blosum100.out
@@ -1,4 +1,6 @@
 # BLOSUM100 in 1/2 Bit
+# Background (precomputed optional): 0.07232 0.02954 0.05418 0.05682 0.04668 0.08126 0.02542 0.06514 0.05153 0.09522 0.02514 0.04121 0.03836 0.03335 0.04929 0.05778 0.05329 0.07261 0.01532 0.03553 0.00001
+# Lambda     (precomputed optional): 0.34657
    A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y       X
 A  5.4032 -1.3914 -3.0411 -1.7843 -3.5667 -0.9429 -2.7743 -2.6285 -1.5480 -2.6564 -2.2890 -2.3818 -1.4407 -1.3900 -2.2348  0.9287 -0.5187 -1.0016 -4.1469 -3.5292 -1.0000
 C -1.3914  9.0527 -5.4848 -6.1579 -2.8088 -4.8414 -5.4005 -2.0578 -5.0581 -3.0318 -2.9747 -3.5406 -5.0881 -4.7305 -5.2752 -1.9639 -1.9677 -1.8493 -4.7523 -3.8063 -1.0000
diff --git a/data/blosum30.out b/data/blosum30.out
index e65e6d373..80e84a749 100644
--- a/data/blosum30.out
+++ b/data/blosum30.out
@@ -1,4 +1,6 @@
 # BLOSUM30 in 1/2 Bit
+# Background (precomputed optional): 0.07484 0.02580 0.05398 0.06229 0.04472 0.07331 0.02855 0.05678 0.06049 0.09230 0.02177 0.04276 0.04327 0.03627 0.05858 0.06483 0.04742 0.06490 0.01264 0.03448 0.00001
+# Lambda     (precomputed optional): 0.34657
    A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y       X
 A  1.5675 -1.0136  0.1920 -0.1870 -0.6023 -0.1721 -0.7992 -0.1637 -0.0888 -0.5876  0.2152 -0.1114 -0.3952  0.3636 -0.3792  0.3881  0.3304  0.3874 -1.8151 -1.6953 -1.0000
 C -1.0136  6.8039 -1.0074  0.2619 -1.0289 -1.5349 -2.0110 -0.6765 -1.2588 -0.1094 -0.9709 -0.4175 -1.3558 -0.8750 -0.8764 -0.7459 -0.8655 -0.9287 -0.6150 -2.4342 -1.0000
diff --git a/data/blosum35.out b/data/blosum35.out
index ab057abef..2e01cb4e7 100644
--- a/data/blosum35.out
+++ b/data/blosum35.out
@@ -1,4 +1,6 @@
 # BLOSUM35 in 1/2 Bit
+# Background (precomputed optional): 0.07675 0.02859 0.05467 0.06102 0.04403 0.07880 0.02765 0.05617 0.06265 0.08708 0.02235 0.04351 0.04844 0.03517 0.05089 0.06122 0.04947 0.06349 0.01196 0.03608 0.00001
+# Lambda     (precomputed optional): 0.34658
    A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y       X
 A  2.2600 -0.8033 -0.4171 -0.4171 -1.0258  0.1947 -0.8971 -0.4419 -0.2117 -0.8324 -0.0320 -0.3282 -0.8139 -0.0999 -0.4635  0.4745  0.1227  0.1557 -0.9101 -0.6432 -1.0000
 C -0.8033  7.4435 -1.5089 -0.4135 -1.8442 -1.6504 -1.9401 -1.8334 -0.9373 -1.0365 -1.7596 -0.7106 -2.1475 -1.7463 -1.4076 -1.3954 -0.4751 -1.2307 -2.2502 -2.4418 -1.0000
diff --git a/data/blosum40.out b/data/blosum40.out
index 94edbbcbd..8819f6e23 100644
--- a/data/blosum40.out
+++ b/data/blosum40.out
@@ -1,4 +1,6 @@
 # BLOSUM40 in 1/2 Bit
+# Background (precomputed optional): 0.07723 0.02509 0.05299 0.05957 0.04636 0.07778 0.02619 0.05969 0.06007 0.08917 0.02361 0.04253 0.04607 0.03559 0.04980 0.06117 0.05127 0.06743 0.01341 0.03497 0.00001
+# Lambda     (precomputed optional): 0.34657
    A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y       X
 A  2.6312 -0.7811 -0.7041 -0.3993 -1.3685  0.2688 -1.1026 -0.6496 -0.3739 -0.9204 -0.2604 -0.3740 -0.8484 -0.1424 -0.7916  0.6828  0.0141  0.1203 -1.2891 -0.9677 -1.0000
 C -0.7811  7.7717 -1.1804 -1.0990 -1.2364 -1.4173 -2.0336 -1.8091 -1.2614 -1.1538 -1.5593 -1.0410 -2.2570 -2.0233 -1.6445 -0.5475 -0.4663 -1.0304 -2.8737 -2.1814 -1.0000
diff --git a/data/blosum45.out b/data/blosum45.out
index 7eb2fc769..cb0329560 100644
--- a/data/blosum45.out
+++ b/data/blosum45.out
@@ -1,4 +1,6 @@
 # BLOSUM45 in 1/2 Bit
+# Background (precomputed optional): 0.07825 0.02350 0.04971 0.05809 0.04643 0.07498 0.02480 0.06190 0.06000 0.09376 0.02508 0.04187 0.04260 0.03591 0.05229 0.06118 0.05250 0.06998 0.01337 0.03378 0.00001
+# Lambda     (precomputed optional): 0.34657
    A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y       X
 A  3.1219 -0.6424 -1.0751 -0.5545 -1.5354  0.2229 -1.2249 -0.8424 -0.6940 -0.9786 -0.5679 -0.6837 -0.9940 -0.4129 -1.0304  0.7577  0.0017  0.0285 -1.6451 -1.2935 -1.0000
 C -0.6424  8.1902 -1.8164 -1.7495 -1.4660 -1.6894 -2.0545 -1.7630 -1.7422 -1.1441 -1.4561 -1.1118 -2.5619 -2.0807 -2.1675 -0.6549 -0.5664 -0.9689 -3.1675 -2.0648 -1.0000
diff --git a/data/blosum50.out b/data/blosum50.out
index 726c80191..59e29c480 100644
--- a/data/blosum50.out
+++ b/data/blosum50.out
@@ -1,4 +1,6 @@
 # BLOSUM50 in 1/2 Bit
+# Background (precomputed optional): 0.07654 0.02232 0.05139 0.05645 0.04810 0.07385 0.02471 0.06406 0.05787 0.09559 0.02439 0.04364 0.04088 0.03492 0.05325 0.05948 0.05292 0.07243 0.01371 0.03349 0.00001
+# Lambda     (precomputed optional): 0.34657
    A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y       X
 A  3.4217 -0.3440 -1.2002 -0.6534 -1.7445  0.2784 -1.2822 -0.9673 -0.8392 -1.2123 -0.4572 -0.9498 -0.9689 -0.5729 -1.1626  0.8948 -0.0958 -0.0535 -2.2183 -1.4913 -1.0000
 C -0.3440  8.3766 -2.4505 -2.2656 -1.6474 -1.8628 -1.9012 -1.5362 -2.2414 -1.2426 -1.1088 -1.4691 -2.6218 -2.1100 -2.4498 -0.5381 -0.5800 -0.6128 -3.3595 -1.7849 -1.0000
diff --git a/data/blosum55.out b/data/blosum55.out
index bd980be96..abbddb863 100644
--- a/data/blosum55.out
+++ b/data/blosum55.out
@@ -1,4 +1,6 @@
 # BLOSUM55 in 1/2 Bit
+# Background (precomputed optional): 0.07551 0.02329 0.05165 0.05540 0.04753 0.07381 0.02471 0.06597 0.05799 0.09572 0.02476 0.04361 0.04079 0.03452 0.05168 0.05945 0.05349 0.07350 0.01376 0.03283 0.00001
+# Lambda     (precomputed optional): 0.34657
    A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y       X
 A  3.5985 -0.1914 -1.4300 -0.7792 -1.7950  0.1752 -1.4189 -1.0893 -0.8085 -1.3585 -0.6749 -1.1307 -0.9411 -0.6988 -1.2436  1.0043 -0.0148 -0.1057 -2.4213 -1.3825 -1.0000
 C -0.1914  8.4177 -2.7946 -2.8812 -1.8386 -2.1058 -2.3530 -1.3765 -2.6838 -1.3343 -1.3561 -1.9481 -2.2621 -2.4642 -2.8372 -0.6505 -0.4548 -0.6685 -2.6586 -2.1554 -1.0000
diff --git a/data/blosum60.out b/data/blosum60.out
index 9d252d100..ef114fddd 100644
--- a/data/blosum60.out
+++ b/data/blosum60.out
@@ -1,4 +1,6 @@
 # BLOSUM60 in 1/2 Bit
+# Background (precomputed optional): 0.07455 0.02446 0.05250 0.05498 0.04809 0.07334 0.02525 0.06677 0.05759 0.09865 0.02517 0.04341 0.03954 0.03429 0.05202 0.05820 0.05193 0.07318 0.01353 0.03255 0.00001
+# Lambda     (precomputed optional): 0.34658
    A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y       X
 A  3.8330 -0.3290 -1.6674 -0.8762 -2.0947  0.1548 -1.5436 -1.1948 -0.7972 -1.3823 -0.9062 -1.3615 -0.9186 -0.7243 -1.3618  1.0699 -0.0520 -0.1445 -2.5631 -1.6290 -1.5000
 C -0.3290  8.5513 -3.3480 -3.4074 -2.2086 -2.3983 -2.8951 -1.4205 -3.0012 -1.3201 -1.4609 -2.4441 -2.6577 -2.7131 -3.3106 -0.7957 -0.7759 -0.8133 -2.1774 -2.3419 -1.5000
diff --git a/data/blosum65.out b/data/blosum65.out
index a19bfda88..cfebf1557 100644
--- a/data/blosum65.out
+++ b/data/blosum65.out
@@ -1,4 +1,6 @@
 # BLOSUM65 in 1/2 Bit
+# Background (precomputed optional): 0.07384 0.02532 0.05372 0.05357 0.04752 0.07423 0.02619 0.06814 0.05710 0.09988 0.02512 0.04410 0.03805 0.03341 0.05069 0.05769 0.05112 0.07277 0.01413 0.03341 0.00001
+# Lambda     (precomputed optional): 0.34657
    A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y       X
 A  4.0449 -0.4496 -1.8345 -0.9003 -2.3286  0.1273 -1.7820 -1.3920 -0.7773 -1.5866 -1.0481 -1.6172 -0.8096 -0.9027 -1.4894  1.2390 -0.0893 -0.2379 -2.7099 -1.8384 -1.0000
 C -0.4496  8.6075 -3.5409 -3.9428 -2.2776 -2.7275 -3.3989 -1.1238 -3.2979 -1.3839 -1.5403 -2.8901 -3.0874 -3.1847 -3.6123 -0.8547 -1.0355 -0.8436 -2.4679 -2.4603 -1.0000
diff --git a/data/blosum70.out b/data/blosum70.out
index 9220482fb..3091b0ca6 100644
--- a/data/blosum70.out
+++ b/data/blosum70.out
@@ -1,4 +1,6 @@
 # BLOSUM70 in 1/2 Bit
+# Background (precomputed optional): 0.07378 0.02662 0.05370 0.05401 0.04769 0.07521 0.02602 0.06776 0.05619 0.10126 0.02467 0.04386 0.03829 0.03370 0.05086 0.05649 0.04995 0.07258 0.01379 0.03356 0.00001
+# Lambda     (precomputed optional): 0.34657
    A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y       X
 A  4.2364 -0.6025 -2.0030 -0.9681 -2.4888  0.0322 -1.8236 -1.5456 -0.8601 -1.7676 -1.1640 -1.7406 -0.6968 -0.9409 -1.6012  1.2304 -0.0530 -0.2960 -2.9000 -2.1049 -1.0000
 C -0.6025  8.6848 -3.7647 -4.3606 -2.3155 -3.1486 -3.7961 -1.3288 -3.6122 -1.6853 -1.7621 -3.2284 -3.2899 -3.0928 -3.8031 -1.0660 -1.2117 -1.1259 -2.5713 -2.7563 -1.0000
diff --git a/data/blosum75.out b/data/blosum75.out
index b54e7a86a..1ddcc3a4f 100644
--- a/data/blosum75.out
+++ b/data/blosum75.out
@@ -1,4 +1,6 @@
 # BLOSUM75 in 1/2 Bit
+# Background (precomputed optional): 0.07354 0.02718 0.05370 0.05405 0.04792 0.07522 0.02561 0.06770 0.05547 0.10137 0.02473 0.04341 0.03836 0.03350 0.05104 0.05635 0.04953 0.07295 0.01409 0.03427 0.00001
+# Lambda     (precomputed optional): 0.34657
    A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y       X
 A  4.3642 -0.7178 -2.1931 -0.9795 -2.6320 -0.0249 -1.7879 -1.6727 -0.8608 -1.8902 -1.2384 -1.8504 -0.6599 -1.0315 -1.6979  1.2749 -0.0519 -0.3642 -3.2269 -2.2548 -1.0000
 C -0.7178  8.7214 -4.1131 -4.6349 -2.3973 -3.3654 -3.9849 -1.3474 -3.8023 -1.7590 -1.8353 -3.3280 -3.5396 -3.1068 -3.9359 -1.2667 -1.3033 -1.0931 -2.9623 -3.0434 -1.0000
diff --git a/data/blosum80.out b/data/blosum80.out
index 0d86348e5..b7b77dc4d 100644
--- a/data/blosum80.out
+++ b/data/blosum80.out
@@ -1,4 +1,6 @@
 # BLOSUM80 in 1/2 Bit
+# Background (precomputed optional): 0.07265 0.02886 0.05363 0.05456 0.04711 0.07667 0.02542 0.06717 0.05479 0.09952 0.02444 0.04300 0.03821 0.03363 0.05042 0.05724 0.05013 0.07320 0.01449 0.03485 0.00001
+# Lambda     (precomputed optional): 0.34657
    A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y       X
 A  4.5099 -0.9010 -2.2982 -1.0162 -2.6670 -0.1265 -1.9179 -1.7603 -0.9363 -1.9737 -1.3538 -1.9422 -0.7500 -1.0467 -1.6972  1.2358 -0.0594 -0.4155 -3.3860 -2.3937 -1.0000
 C -0.9010  8.7434 -4.4500 -4.9539 -2.6819 -3.7561 -4.3516 -1.5659 -4.1040 -2.0427 -2.0034 -3.4702 -3.7868 -3.5213 -4.2097 -1.5911 -1.4656 -1.3128 -3.4526 -3.3986 -1.0000
diff --git a/data/blosum85.out b/data/blosum85.out
index 4bc6c4c6d..cf96b15b4 100644
--- a/data/blosum85.out
+++ b/data/blosum85.out
@@ -1,4 +1,6 @@
 # BLOSUM85 in 1/2 Bit
+# Background (precomputed optional): 0.07241 0.02955 0.05373 0.05433 0.04762 0.07823 0.02484 0.06684 0.05396 0.09840 0.02444 0.04238 0.03837 0.03327 0.04953 0.05766 0.05099 0.07315 0.01497 0.03531 0.00001
+# Lambda     (precomputed optional): 0.34657
    A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y       X
 A  4.6981 -1.0333 -2.4616 -1.0636 -2.8537 -0.2870 -2.0836 -1.8760 -1.0253 -2.0833 -1.5008 -2.0214 -0.8475 -1.1129 -1.7680  1.1430 -0.0949 -0.5152 -3.4126 -2.5887 -1.0000
 C -1.0333  8.7626 -4.6810 -5.2816 -2.6773 -4.1137 -4.6265 -1.6864 -4.2989 -2.2496 -2.2177 -3.5713 -4.1308 -3.7838 -4.3161 -1.6358 -1.5048 -1.3429 -3.6143 -3.3450 -1.0000
diff --git a/data/blosum90.out b/data/blosum90.out
index d1b40f275..16c21bad6 100644
--- a/data/blosum90.out
+++ b/data/blosum90.out
@@ -1,4 +1,6 @@
 # BLOSUM90 in 1/2 Bit
+# Background (precomputed optional): 0.07255 0.02960 0.05411 0.05517 0.04651 0.07925 0.02489 0.06606 0.05338 0.09697 0.02440 0.04145 0.03866 0.03358 0.04920 0.05809 0.05160 0.07347 0.01537 0.03565 0.00001
+# Lambda     (precomputed optional): 0.34657
    A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y       X
 A  4.9179 -1.1581 -2.5684 -1.1669 -3.0548 -0.4512 -2.2281 -2.1429 -1.1811 -2.3278 -1.7305 -2.1459 -0.9798 -1.3011 -1.8254  1.0777 -0.1948 -0.6666 -3.6358 -2.8667 -1.0000
 C -1.1581  8.8709 -5.0118 -5.5314 -2.6979 -4.3928 -4.7384 -1.8819 -4.4261 -2.4681 -2.4547 -3.5984 -4.2387 -4.0930 -4.5552 -1.8661 -1.7337 -1.5322 -3.9931 -3.5469 -1.0000
diff --git a/data/blosum95.out b/data/blosum95.out
index 6862147ec..32c3bbaa4 100644
--- a/data/blosum95.out
+++ b/data/blosum95.out
@@ -1,4 +1,6 @@
 # BLOSUM95 in 1/2 Bit
+# Background (precomputed optional): 0.07263 0.03008 0.05418 0.05500 0.04624 0.08061 0.02478 0.06573 0.05202 0.09555 0.02478 0.04167 0.03889 0.03344 0.04900 0.05856 0.05268 0.07357 0.01505 0.03552 0.00001
+# Lambda     (precomputed optional): 0.34657
    A       C       D       E       F       G       H       I       K       L       M       N       P       Q       R       S       T       V       W       Y       X
 A  5.1233 -1.2307 -2.7621 -1.4029 -3.2441 -0.6576 -2.5268 -2.3441 -1.3416 -2.4710 -1.9593 -2.2705 -1.2288 -1.2861 -1.9849  1.0225 -0.3233 -0.8255 -3.6874 -3.1432 -1.0000
 C -1.2307  8.9206 -5.2347 -5.9107 -2.7117 -4.5869 -4.9437 -1.9952 -4.8027 -2.7939 -2.6890 -3.5408 -4.7432 -4.4374 -4.8766 -1.9147 -1.9105 -1.7424 -4.3860 -3.7118 -1.0000
diff --git a/data/workflow/blastp.sh b/data/workflow/blastp.sh
index 5d587fd44..2610733bc 100755
--- a/data/workflow/blastp.sh
+++ b/data/workflow/blastp.sh
@@ -63,6 +63,8 @@ while [ "$STEP" -lt "$STEPS" ]; do
           # shellcheck disable=SC2086
           $RUNNER "$MMSEQS" ungappedprefilter "$INPUT" "$TARGET" "$TMP_PATH/pref_$STEP" $UNGAPPEDPREFILTER_PAR \
               || fail "Ungapped prefilter died"
+      elif [ "$PREFMODE" = "UNGAPPED_AND_GAPPED" ]; then
+          :
       else
           # shellcheck disable=SC2086
           $RUNNER "$MMSEQS" prefilter "$INPUT" "$TARGET" "$TMP_PATH/pref_$STEP" $PREFILTER_PAR -s "$SENS" \
@@ -73,9 +75,16 @@ while [ "$STEP" -lt "$STEPS" ]; do
     # 2. alignment module
     if [ "$STEPS" -eq 1 ]; then
         if notExists "$3.dbtype"; then
-            # shellcheck disable=SC2086
-            $RUNNER "$MMSEQS" "${ALIGN_MODULE}" "$INPUT" "$TARGET${ALIGNMENT_DB_EXT}" "$TMP_PATH/pref_$STEP" "$3" $ALIGNMENT_PAR  \
-                || fail "Alignment died"
+            if [ "$PREFMODE" = "UNGAPPED_AND_GAPPED" ]; then
+              # The GPU-based ungapped prefilter also generates alignments
+              # shellcheck disable=SC2086
+              $RUNNER "$MMSEQS" ungappedprefilter "$INPUT" "$TARGET" "$3" $UNGAPPEDPREFILTER_PAR  \
+                  || fail "Alignment died"
+            else
+              # shellcheck disable=SC2086
+              $RUNNER "$MMSEQS" "${ALIGN_MODULE}" "$INPUT" "$TARGET${ALIGNMENT_DB_EXT}" "$TMP_PATH/pref_$STEP" "$3" $ALIGNMENT_PAR  \
+                  || fail "Alignment died"
+            fi
         fi
         break
     else
diff --git a/data/workflow/createindex.sh b/data/workflow/createindex.sh
index d75127381..2b02b1174 100755
--- a/data/workflow/createindex.sh
+++ b/data/workflow/createindex.sh
@@ -18,9 +18,15 @@ INPUT="$1"
 if [ -n "$TRANSLATED" ]; then
     # 1. extract orf
     if notExists "$2/orfs_aa.dbtype"; then
-        # shellcheck disable=SC2086
-        "$MMSEQS" extractorfs "$INPUT" "$2/orfs_aa" ${ORF_PAR} \
-            || fail "extractorfs died"
+        if [ -n "$ORF_SKIP" ]; then
+            # shellcheck disable=SC2086
+            "$MMSEQS" extractframes "$INPUT" "$2/orfs_aa" ${EXTRACT_FRAMES_PAR} \
+                || fail  "extractframes died"
+        else
+            # shellcheck disable=SC2086
+            "$MMSEQS" extractorfs "$INPUT" "$2/orfs_aa" ${ORF_PAR} \
+                || fail "extractorfs died"
+        fi
     fi
 
     # shellcheck disable=SC2086
@@ -33,7 +39,7 @@ if [ -n "$TRANSLATED" ]; then
         rm -f "$2/createindex.sh"
     fi
 elif [ -n "$LIN_NUCL" ] || [ -n "$NUCL" ]; then
-      # 1. extract orf
+    # 1. extract orf
     if notExists "$2/nucl_split_seq.dbtype"; then
         # shellcheck disable=SC2086
         "$MMSEQS" splitsequence "$INPUT" "$2/nucl_split_seq" ${SPLIT_SEQ_PAR} \
diff --git a/data/workflow/easyrbh.sh b/data/workflow/easyrbh.sh
index 016350cea..de75d472e 100755
--- a/data/workflow/easyrbh.sh
+++ b/data/workflow/easyrbh.sh
@@ -15,6 +15,15 @@ if notExists "${TMP_PATH}/query.dbtype"; then
     QUERY="${TMP_PATH}/query"
 fi
 
+if [ -n "${GPU}" ]; then
+    if notExists "${TMP_PATH}/query_pad"; then
+        # shellcheck disable=SC2086
+        "$MMSEQS" makepaddedseqdb "${TMP_PATH}/query" "${TMP_PATH}/query_pad" ${MAKEPADDEDSEQDB_PAR} \
+            || fail "makepaddedseqdb died"
+    fi
+    QUERY="${TMP_PATH}/query_pad"
+fi
+
 if notExists "${TARGET}.dbtype"; then
     if notExists "${TMP_PATH}/target"; then
         # shellcheck disable=SC2086
@@ -22,6 +31,15 @@ if notExists "${TARGET}.dbtype"; then
             || fail "target createdb died"
     fi
     TARGET="${TMP_PATH}/target"
+
+    if [ -n "${GPU}" ]; then
+        if notExists "${TMP_PATH}/target_pad"; then
+            # shellcheck disable=SC2086
+            "$MMSEQS" makepaddedseqdb "${TMP_PATH}/target" "${TMP_PATH}/target_pad" ${MAKEPADDEDSEQDB_PAR} \
+                || fail "makepaddedseqdb died"
+        fi
+        TARGET="${TMP_PATH}/target_pad"
+    fi
 fi
 
 if notExists "${INTERMEDIATE}.dbtype"; then
@@ -46,10 +64,22 @@ if [ -n "${REMOVE_TMP}" ]; then
             # shellcheck disable=SC2086
             "$MMSEQS" rmdb "${TMP_PATH}/target_h" ${VERBOSITY}
         fi
+        if [ -f "${TMP_PATH}/target_pad" ]; then
+            # shellcheck disable=SC2086
+            "$MMSEQS" rmdb "${TMP_PATH}/target_pad" ${VERBOSITY}
+            # shellcheck disable=SC2086
+            "$MMSEQS" rmdb "${TMP_PATH}/target_pad_h" ${VERBOSITY}
+        fi
         # shellcheck disable=SC2086
         "$MMSEQS" rmdb "${TMP_PATH}/query" ${VERBOSITY}
         # shellcheck disable=SC2086
         "$MMSEQS" rmdb "${TMP_PATH}/query_h" ${VERBOSITY}
+        if [ -f "${TMP_PATH}/query_pad" ]; then
+            # shellcheck disable=SC2086
+            "$MMSEQS" rmdb "${TMP_PATH}/query_pad" ${VERBOSITY}
+            # shellcheck disable=SC2086
+            "$MMSEQS" rmdb "${TMP_PATH}/query_pad_h" ${VERBOSITY}
+        fi
     fi
     rm -rf "${TMP_PATH}/rbh_tmp"
     rm -f "${TMP_PATH}/easyrbh.sh"
diff --git a/data/workflow/easysearch.sh b/data/workflow/easysearch.sh
index f6443fc49..778657d15 100755
--- a/data/workflow/easysearch.sh
+++ b/data/workflow/easysearch.sh
@@ -21,6 +21,15 @@ if notExists "${TARGET}.dbtype"; then
             || fail "target createdb died"
     fi
     TARGET="${TMP_PATH}/target"
+
+    if [ -n "${GPU}" ]; then
+        if notExists "${TMP_PATH}/target_pad"; then
+            # shellcheck disable=SC2086
+            "$MMSEQS" makepaddedseqdb "${TMP_PATH}/target" "${TMP_PATH}/target_pad" ${MAKEPADDEDSEQDB_PAR} \
+                || fail "makepaddedseqdb died"
+        fi
+        TARGET="${TMP_PATH}/target_pad"
+    fi
 fi
 
 if [ -n "${LINSEARCH}" ] && notExists "${TARGET}.linidx"; then
@@ -66,6 +75,14 @@ if [ -n "${REMOVE_TMP}" ]; then
             # shellcheck disable=SC2086
             "$MMSEQS" rmdb "${TMP_PATH}/target_h" ${VERBOSITY}
         fi
+
+        if [ -f "${TMP_PATH}/target_pad" ]; then
+            # shellcheck disable=SC2086
+            "$MMSEQS" rmdb "${TMP_PATH}/target_pad" ${VERBOSITY}
+            # shellcheck disable=SC2086
+            "$MMSEQS" rmdb "${TMP_PATH}/target_pad_h" ${VERBOSITY}
+        fi
+        
         # shellcheck disable=SC2086
         "$MMSEQS" rmdb "${TMP_PATH}/query" ${VERBOSITY}
         # shellcheck disable=SC2086
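The GPU branch above pads the target database before searching because the GPU kernels work on the padded layout produced by `makepaddedseqdb`. Outside of the easy-search wrapper, the equivalent manual steps would look roughly like the sketch below; the database names are placeholders and the `--gpu 1` flag is an assumption based on the GPU release, so check `mmseqs search -h` on your build:

    mmseqs createdb query.fasta queryDB
    mmseqs createdb target.fasta targetDB
    # pad the target so the GPU kernels can consume it
    mmseqs makepaddedseqdb targetDB targetDB_pad
    mmseqs search queryDB targetDB_pad resultDB tmp --gpu 1
    mmseqs convertalis queryDB targetDB_pad resultDB result.m8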
diff --git a/data/workflow/translated_search.sh b/data/workflow/translated_search.sh
index b2e4252d4..f39c5978f 100755
--- a/data/workflow/translated_search.sh
+++ b/data/workflow/translated_search.sh
@@ -21,10 +21,18 @@ TMP_PATH="$4"
 QUERY="$1"
 QUERY_ORF="$1"
 if [ -n "$QUERY_NUCL" ]; then
-    if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
-        # shellcheck disable=SC2086
-        "$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \
-            || fail  "extract orfs step died"
+    if [ -n "$ORF_SKIP" ]; then
+        if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
+            # shellcheck disable=SC2086
+            "$MMSEQS" extractframes "$1" "${TMP_PATH}/q_orfs_aa" ${EXTRACT_FRAMES_PAR} \
+                || fail  "extractframes died"
+        fi
+    else
+        if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
+            # shellcheck disable=SC2086
+            "$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \
+                || fail  "extract orfs step died"
+        fi
     fi
     QUERY="${TMP_PATH}/q_orfs_aa"
     QUERY_ORF="${TMP_PATH}/q_orfs_aa"
@@ -34,10 +42,19 @@ TARGET="$2"
 TARGET_ORF="$2"
 if [ -n "$TARGET_NUCL" ]; then
 if [ -n "$NO_TARGET_INDEX" ]; then
-    if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
-        # shellcheck disable=SC2086
-        "$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \
-            || fail  "extract target orfs step died"
+    if [ -n "$ORF_SKIP" ]; then
+        if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
+            # shellcheck disable=SC2086
+            "$MMSEQS" extractframes "$2" "${TMP_PATH}/t_orfs_aa" ${EXTRACT_FRAMES_PAR} \
+                || fail  "extractframes died"
+        fi
+    else
+        if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
+            # as for the query: extract ORFs when frame extraction is not requested
+            # shellcheck disable=SC2086
+            "$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \
+                || fail  "extract target orfs step died"
+        fi
     fi
     TARGET="${TMP_PATH}/t_orfs_aa"
     TARGET_ORF="${TMP_PATH}/t_orfs_aa"
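Both createindex.sh and translated_search.sh now switch from ORF calling to plain six-frame extraction when `ORF_SKIP` is set. As a rough illustration of the two preprocessing modes (database names are placeholders, and the listed options are assumptions about the modules' usual parameters rather than values taken from these workflows):

    # default path: extract open reading frames from the nucleotide DB
    mmseqs extractorfs nucl_db orfs_db --min-length 30
    # ORF_SKIP path: keep all six reading frames, without ORF boundary detection
    mmseqs extractframes nucl_db frames_db --forward-frames 1,2,3 --reverse-frames 1,2,3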
diff --git a/data/workflow/tsv2exprofiledb.sh b/data/workflow/tsv2exprofiledb.sh
index 184315d7c..4524354cb 100644
--- a/data/workflow/tsv2exprofiledb.sh
+++ b/data/workflow/tsv2exprofiledb.sh
@@ -16,44 +16,71 @@ OUT="$2"
 [ ! -f "${IN}_aln.tsv" ] && echo "${IN}_aln.tsv not found!" && exit 1;
 [ -d "${OUT}.tsv" ] && echo "${OUT} is a directory!" && exit 1;
 
-if notExists "${OUT}_h.dbtype"; then
-  MMSEQS_FORCE_MERGE=1 "$MMSEQS" tsv2db "${IN}_h.tsv" "${OUT}_h" --output-dbtype 12 ${VERBOSITY}
+if notExists "${OUT}_seq.dbtype"; then
+  if [ -n "${COMPRESSED}" ]; then
+    "$MMSEQS" tsv2db "${IN}_seq.tsv" "${OUT}_seq_tmp" --output-dbtype 0 ${VERBOSITY}
+    MMSEQS_FORCE_MERGE=1 "$MMSEQS" compress "${OUT}_seq_tmp" "${OUT}_seq" ${THREADS}
+    "$MMSEQS" rmdb "${OUT}_seq_tmp" ${VERBOSITY}
+  else
+    "$MMSEQS" tsv2db "${IN}_seq.tsv" "${OUT}_seq" --output-dbtype 0 ${VERBOSITY}
+  fi
+fi
+
+if notExists "${OUT}_seq_h.dbtype"; then
+  MMSEQS_FORCE_MERGE=1 "$MMSEQS" tsv2db "${IN}_h.tsv" "${OUT}_seq_h" --output-dbtype 12 ${VERBOSITY}
 fi
 
 if notExists "${OUT}.dbtype"; then
   if [ -n "${COMPRESSED}" ]; then
     "$MMSEQS" tsv2db "${IN}.tsv" "${OUT}_tmp" --output-dbtype 0 ${VERBOSITY}
-    MMSEQS_FORCE_MERGE=1 "$MMSEQS" compress "${OUT}_tmp" "${OUT}" ${VERBOSITY}
+    MMSEQS_FORCE_MERGE=1 "$MMSEQS" compress "${OUT}_tmp" "${OUT}" ${THREADS}
     "$MMSEQS" rmdb "${OUT}_tmp" ${VERBOSITY}
   else
     MMSEQS_FORCE_MERGE=1 "$MMSEQS" tsv2db "${IN}.tsv" "${OUT}" --output-dbtype 0 ${VERBOSITY}
   fi
 fi
 
-if notExists "${OUT}_seq.dbtype"; then
-  if [ -n "${COMPRESSED}" ]; then
-    "$MMSEQS" tsv2db "${IN}_seq.tsv" "${OUT}_seq_tmp" --output-dbtype 0 ${VERBOSITY}
-    MMSEQS_FORCE_MERGE=1 "$MMSEQS" compress "${OUT}_seq_tmp" "${OUT}_seq" ${VERBOSITY}
-    "$MMSEQS" rmdb "${OUT}_seq_tmp" ${VERBOSITY}
-  else
-    "$MMSEQS" tsv2db "${IN}_seq.tsv" "${OUT}_seq" --output-dbtype 0 ${VERBOSITY}
+if [ -n "${GPU}" ]; then
+  if notExists "${OUT}.GPU_READY"; then
+    "$MMSEQS" aliasdb "${OUT}_seq_h" "${OUT}_h" ${VERBOSITY}
+    "$MMSEQS" makepaddedseqdb "${OUT}" "${OUT}_pad" ${THREADS}
+    "$MMSEQS" rmdb "${OUT}" ${VERBOSITY}
+    "$MMSEQS" rmdb "${OUT}_h" ${VERBOSITY}
+    "$MMSEQS" mvdb "${OUT}_pad" "${OUT}" ${VERBOSITY}
+    "$MMSEQS" mvdb "${OUT}_pad_h" "${OUT}_h" ${VERBOSITY}
+    touch "${OUT}.GPU_READY"
+  fi
+else
+  if notExists "${OUT}_h.dbtype"; then
+    "$MMSEQS" aliasdb "${OUT}_seq_h" "${OUT}_h" ${VERBOSITY}
   fi
 fi
 
 if notExists "${OUT}_aln.dbtype"; then
+  TSVPATH="${IN}"
+  if [ -n "${GPU}" ]; then
+    if notExists "${OUT}_mapped_aln.tsv"; then
+      awk 'BEGIN { FS = "\t"; OFS = "\t"; sh = ""; entry = ""; } NR == 1 { last = $1; } ($1 != last) { if (sh != "") print sh; if (entry != "") print entry; last = $1; sh = ""; entry = ""; } ($1 != "" && $1 == $2) { sh = $0; next; } { if (entry != "") { entry = entry "\n" $0; } else { entry = $0; } } END { if (sh != "") print sh; if (entry != "") print entry; }' \
+        "${IN}_aln.tsv" > "${OUT}_reorder_aln.tsv"
+      awk 'NR == FNR { f[$3] = $1; next; } { $1 = f[$1]; print }' \
+        "${OUT}.lookup" "${OUT}_reorder_aln.tsv" | sort -s -k1,1n > "${OUT}_mapped_aln.tsv"
+      rm -f -- "${OUT}_reorder_aln.tsv"
+    fi
+    TSVPATH="${OUT}_mapped"
+  fi
+
   if [ -n "${COMPRESSED}" ]; then
-    "$MMSEQS" tsv2db "${IN}_aln.tsv" "${OUT}_aln_tmp" --output-dbtype 5 ${VERBOSITY}
+    "$MMSEQS" tsv2db "${TSVPATH}_aln.tsv" "${OUT}_aln_tmp" --output-dbtype 5 ${VERBOSITY}
     MMSEQS_FORCE_MERGE=1 "$MMSEQS" compress "${OUT}_aln_tmp" "${OUT}_aln" ${VERBOSITY}
     "$MMSEQS" rmdb "${OUT}_aln_tmp" ${VERBOSITY}
   else
-    MMSEQS_FORCE_MERGE=1 "$MMSEQS" tsv2db "${IN}_aln.tsv" "${OUT}_aln" --output-dbtype 5 ${VERBOSITY}
+    MMSEQS_FORCE_MERGE=1 "$MMSEQS" tsv2db "${TSVPATH}_aln.tsv" "${OUT}_aln" --output-dbtype 5 ${VERBOSITY}
   fi
 fi
 
-if notExists "${OUT}_seq_h.dbtype"; then
-  "$MMSEQS" aliasdb "${OUT}_h" "${OUT}_seq_h" ${VERBOSITY}
-fi
-
 if [ -e "${OUT}.sh" ]; then
+  if [ -n "${GPU}" ]; then
+    rm -rf -- "${OUT}_mapped_aln.tsv"
+  fi
   rm -f -- "${OUT}.sh"
 fi
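In the GPU branch of tsv2exprofiledb.sh, the first awk program reorders each alignment block (keyed by column 1) so that the self-hit, i.e. the row where column 1 equals column 2, is emitted first; the second awk then rewrites column 1 via the padded database's `.lookup` and re-sorts numerically. A toy run of the reordering step, with invented two-column input, behaves like this:

    printf 'A\tB\nA\tA\nA\tC\nB\tB\nB\tA\n' > toy_aln.tsv
    awk 'BEGIN { FS = "\t"; OFS = "\t"; sh = ""; entry = ""; } NR == 1 { last = $1; } ($1 != last) { if (sh != "") print sh; if (entry != "") print entry; last = $1; sh = ""; entry = ""; } ($1 != "" && $1 == $2) { sh = $0; next; } { if (entry != "") { entry = entry "\n" $0; } else { entry = $0; } } END { if (sh != "") print sh; if (entry != "") print entry; }' toy_aln.tsv
    # each block now starts with its self-hit:
    # A	A
    # A	B
    # A	C
    # B	B
    # B	A
    rm -f toy_aln.tsv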
diff --git a/lib/cacode/CMakeLists.txt b/lib/cacode/CMakeLists.txt
deleted file mode 100644
index 0d126b5dd..000000000
--- a/lib/cacode/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-add_library(cacode OBJECT
-        lambda_calculator.cpp
-        lambda_calculator.h
-        lubksb.cpp
-        ludcmp.cpp
-        nrutil.cpp
-        nrutil.h
-        )
-set_target_properties(cacode PROPERTIES COMPILE_FLAGS "${MMSEQS_CXX_FLAGS} -w" LINK_FLAGS "${MMSEQS_CXX_FLAGS} -w")
diff --git a/lib/cacode/LICENSE.LAST b/lib/cacode/LICENSE.LAST
deleted file mode 100644
index 94a9ed024..000000000
--- a/lib/cacode/LICENSE.LAST
+++ /dev/null
@@ -1,674 +0,0 @@
-                    GNU GENERAL PUBLIC LICENSE
-                       Version 3, 29 June 2007
-
- Copyright (C) 2007 Free Software Foundation, Inc. 
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-                            Preamble
-
-  The GNU General Public License is a free, copyleft license for
-software and other kinds of works.
-
-  The licenses for most software and other practical works are designed
-to take away your freedom to share and change the works.  By contrast,
-the GNU General Public License is intended to guarantee your freedom to
-share and change all versions of a program--to make sure it remains free
-software for all its users.  We, the Free Software Foundation, use the
-GNU General Public License for most of our software; it applies also to
-any other work released this way by its authors.  You can apply it to
-your programs, too.
-
-  When we speak of free software, we are referring to freedom, not
-price.  Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-them if you wish), that you receive source code or can get it if you
-want it, that you can change the software or use pieces of it in new
-free programs, and that you know you can do these things.
-
-  To protect your rights, we need to prevent others from denying you
-these rights or asking you to surrender the rights.  Therefore, you have
-certain responsibilities if you distribute copies of the software, or if
-you modify it: responsibilities to respect the freedom of others.
-
-  For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must pass on to the recipients the same
-freedoms that you received.  You must make sure that they, too, receive
-or can get the source code.  And you must show them these terms so they
-know their rights.
-
-  Developers that use the GNU GPL protect your rights with two steps:
-(1) assert copyright on the software, and (2) offer you this License
-giving you legal permission to copy, distribute and/or modify it.
-
-  For the developers' and authors' protection, the GPL clearly explains
-that there is no warranty for this free software.  For both users' and
-authors' sake, the GPL requires that modified versions be marked as
-changed, so that their problems will not be attributed erroneously to
-authors of previous versions.
-
-  Some devices are designed to deny users access to install or run
-modified versions of the software inside them, although the manufacturer
-can do so.  This is fundamentally incompatible with the aim of
-protecting users' freedom to change the software.  The systematic
-pattern of such abuse occurs in the area of products for individuals to
-use, which is precisely where it is most unacceptable.  Therefore, we
-have designed this version of the GPL to prohibit the practice for those
-products.  If such problems arise substantially in other domains, we
-stand ready to extend this provision to those domains in future versions
-of the GPL, as needed to protect the freedom of users.
-
-  Finally, every program is threatened constantly by software patents.
-States should not allow patents to restrict development and use of
-software on general-purpose computers, but in those that do, we wish to
-avoid the special danger that patents applied to a free program could
-make it effectively proprietary.  To prevent this, the GPL assures that
-patents cannot be used to render the program non-free.
-
-  The precise terms and conditions for copying, distribution and
-modification follow.
-
-                       TERMS AND CONDITIONS
-
-  0. Definitions.
-
-  "This License" refers to version 3 of the GNU General Public License.
-
-  "Copyright" also means copyright-like laws that apply to other kinds of
-works, such as semiconductor masks.
-
-  "The Program" refers to any copyrightable work licensed under this
-License.  Each licensee is addressed as "you".  "Licensees" and
-"recipients" may be individuals or organizations.
-
-  To "modify" a work means to copy from or adapt all or part of the work
-in a fashion requiring copyright permission, other than the making of an
-exact copy.  The resulting work is called a "modified version" of the
-earlier work or a work "based on" the earlier work.
-
-  A "covered work" means either the unmodified Program or a work based
-on the Program.
-
-  To "propagate" a work means to do anything with it that, without
-permission, would make you directly or secondarily liable for
-infringement under applicable copyright law, except executing it on a
-computer or modifying a private copy.  Propagation includes copying,
-distribution (with or without modification), making available to the
-public, and in some countries other activities as well.
-
-  To "convey" a work means any kind of propagation that enables other
-parties to make or receive copies.  Mere interaction with a user through
-a computer network, with no transfer of a copy, is not conveying.
-
-  An interactive user interface displays "Appropriate Legal Notices"
-to the extent that it includes a convenient and prominently visible
-feature that (1) displays an appropriate copyright notice, and (2)
-tells the user that there is no warranty for the work (except to the
-extent that warranties are provided), that licensees may convey the
-work under this License, and how to view a copy of this License.  If
-the interface presents a list of user commands or options, such as a
-menu, a prominent item in the list meets this criterion.
-
-  1. Source Code.
-
-  The "source code" for a work means the preferred form of the work
-for making modifications to it.  "Object code" means any non-source
-form of a work.
-
-  A "Standard Interface" means an interface that either is an official
-standard defined by a recognized standards body, or, in the case of
-interfaces specified for a particular programming language, one that
-is widely used among developers working in that language.
-
-  The "System Libraries" of an executable work include anything, other
-than the work as a whole, that (a) is included in the normal form of
-packaging a Major Component, but which is not part of that Major
-Component, and (b) serves only to enable use of the work with that
-Major Component, or to implement a Standard Interface for which an
-implementation is available to the public in source code form.  A
-"Major Component", in this context, means a major essential component
-(kernel, window system, and so on) of the specific operating system
-(if any) on which the executable work runs, or a compiler used to
-produce the work, or an object code interpreter used to run it.
-
-  The "Corresponding Source" for a work in object code form means all
-the source code needed to generate, install, and (for an executable
-work) run the object code and to modify the work, including scripts to
-control those activities.  However, it does not include the work's
-System Libraries, or general-purpose tools or generally available free
-programs which are used unmodified in performing those activities but
-which are not part of the work.  For example, Corresponding Source
-includes interface definition files associated with source files for
-the work, and the source code for shared libraries and dynamically
-linked subprograms that the work is specifically designed to require,
-such as by intimate data communication or control flow between those
-subprograms and other parts of the work.
-
-  The Corresponding Source need not include anything that users
-can regenerate automatically from other parts of the Corresponding
-Source.
-
-  The Corresponding Source for a work in source code form is that
-same work.
-
-  2. Basic Permissions.
-
-  All rights granted under this License are granted for the term of
-copyright on the Program, and are irrevocable provided the stated
-conditions are met.  This License explicitly affirms your unlimited
-permission to run the unmodified Program.  The output from running a
-covered work is covered by this License only if the output, given its
-content, constitutes a covered work.  This License acknowledges your
-rights of fair use or other equivalent, as provided by copyright law.
-
-  You may make, run and propagate covered works that you do not
-convey, without conditions so long as your license otherwise remains
-in force.  You may convey covered works to others for the sole purpose
-of having them make modifications exclusively for you, or provide you
-with facilities for running those works, provided that you comply with
-the terms of this License in conveying all material for which you do
-not control copyright.  Those thus making or running the covered works
-for you must do so exclusively on your behalf, under your direction
-and control, on terms that prohibit them from making any copies of
-your copyrighted material outside their relationship with you.
-
-  Conveying under any other circumstances is permitted solely under
-the conditions stated below.  Sublicensing is not allowed; section 10
-makes it unnecessary.
-
-  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
-
-  No covered work shall be deemed part of an effective technological
-measure under any applicable law fulfilling obligations under article
-11 of the WIPO copyright treaty adopted on 20 December 1996, or
-similar laws prohibiting or restricting circumvention of such
-measures.
-
-  When you convey a covered work, you waive any legal power to forbid
-circumvention of technological measures to the extent such circumvention
-is effected by exercising rights under this License with respect to
-the covered work, and you disclaim any intention to limit operation or
-modification of the work as a means of enforcing, against the work's
-users, your or third parties' legal rights to forbid circumvention of
-technological measures.
-
-  4. Conveying Verbatim Copies.
-
-  You may convey verbatim copies of the Program's source code as you
-receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice;
-keep intact all notices stating that this License and any
-non-permissive terms added in accord with section 7 apply to the code;
-keep intact all notices of the absence of any warranty; and give all
-recipients a copy of this License along with the Program.
-
-  You may charge any price or no price for each copy that you convey,
-and you may offer support or warranty protection for a fee.
-
-  5. Conveying Modified Source Versions.
-
-  You may convey a work based on the Program, or the modifications to
-produce it from the Program, in the form of source code under the
-terms of section 4, provided that you also meet all of these conditions:
-
-    a) The work must carry prominent notices stating that you modified
-    it, and giving a relevant date.
-
-    b) The work must carry prominent notices stating that it is
-    released under this License and any conditions added under section
-    7.  This requirement modifies the requirement in section 4 to
-    "keep intact all notices".
-
-    c) You must license the entire work, as a whole, under this
-    License to anyone who comes into possession of a copy.  This
-    License will therefore apply, along with any applicable section 7
-    additional terms, to the whole of the work, and all its parts,
-    regardless of how they are packaged.  This License gives no
-    permission to license the work in any other way, but it does not
-    invalidate such permission if you have separately received it.
-
-    d) If the work has interactive user interfaces, each must display
-    Appropriate Legal Notices; however, if the Program has interactive
-    interfaces that do not display Appropriate Legal Notices, your
-    work need not make them do so.
-
-  A compilation of a covered work with other separate and independent
-works, which are not by their nature extensions of the covered work,
-and which are not combined with it such as to form a larger program,
-in or on a volume of a storage or distribution medium, is called an
-"aggregate" if the compilation and its resulting copyright are not
-used to limit the access or legal rights of the compilation's users
-beyond what the individual works permit.  Inclusion of a covered work
-in an aggregate does not cause this License to apply to the other
-parts of the aggregate.
-
-  6. Conveying Non-Source Forms.
-
-  You may convey a covered work in object code form under the terms
-of sections 4 and 5, provided that you also convey the
-machine-readable Corresponding Source under the terms of this License,
-in one of these ways:
-
-    a) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by the
-    Corresponding Source fixed on a durable physical medium
-    customarily used for software interchange.
-
-    b) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by a
-    written offer, valid for at least three years and valid for as
-    long as you offer spare parts or customer support for that product
-    model, to give anyone who possesses the object code either (1) a
-    copy of the Corresponding Source for all the software in the
-    product that is covered by this License, on a durable physical
-    medium customarily used for software interchange, for a price no
-    more than your reasonable cost of physically performing this
-    conveying of source, or (2) access to copy the
-    Corresponding Source from a network server at no charge.
-
-    c) Convey individual copies of the object code with a copy of the
-    written offer to provide the Corresponding Source.  This
-    alternative is allowed only occasionally and noncommercially, and
-    only if you received the object code with such an offer, in accord
-    with subsection 6b.
-
-    d) Convey the object code by offering access from a designated
-    place (gratis or for a charge), and offer equivalent access to the
-    Corresponding Source in the same way through the same place at no
-    further charge.  You need not require recipients to copy the
-    Corresponding Source along with the object code.  If the place to
-    copy the object code is a network server, the Corresponding Source
-    may be on a different server (operated by you or a third party)
-    that supports equivalent copying facilities, provided you maintain
-    clear directions next to the object code saying where to find the
-    Corresponding Source.  Regardless of what server hosts the
-    Corresponding Source, you remain obligated to ensure that it is
-    available for as long as needed to satisfy these requirements.
-
-    e) Convey the object code using peer-to-peer transmission, provided
-    you inform other peers where the object code and Corresponding
-    Source of the work are being offered to the general public at no
-    charge under subsection 6d.
-
-  A separable portion of the object code, whose source code is excluded
-from the Corresponding Source as a System Library, need not be
-included in conveying the object code work.
-
-  A "User Product" is either (1) a "consumer product", which means any
-tangible personal property which is normally used for personal, family,
-or household purposes, or (2) anything designed or sold for incorporation
-into a dwelling.  In determining whether a product is a consumer product,
-doubtful cases shall be resolved in favor of coverage.  For a particular
-product received by a particular user, "normally used" refers to a
-typical or common use of that class of product, regardless of the status
-of the particular user or of the way in which the particular user
-actually uses, or expects or is expected to use, the product.  A product
-is a consumer product regardless of whether the product has substantial
-commercial, industrial or non-consumer uses, unless such uses represent
-the only significant mode of use of the product.
-
-  "Installation Information" for a User Product means any methods,
-procedures, authorization keys, or other information required to install
-and execute modified versions of a covered work in that User Product from
-a modified version of its Corresponding Source.  The information must
-suffice to ensure that the continued functioning of the modified object
-code is in no case prevented or interfered with solely because
-modification has been made.
-
-  If you convey an object code work under this section in, or with, or
-specifically for use in, a User Product, and the conveying occurs as
-part of a transaction in which the right of possession and use of the
-User Product is transferred to the recipient in perpetuity or for a
-fixed term (regardless of how the transaction is characterized), the
-Corresponding Source conveyed under this section must be accompanied
-by the Installation Information.  But this requirement does not apply
-if neither you nor any third party retains the ability to install
-modified object code on the User Product (for example, the work has
-been installed in ROM).
-
-  The requirement to provide Installation Information does not include a
-requirement to continue to provide support service, warranty, or updates
-for a work that has been modified or installed by the recipient, or for
-the User Product in which it has been modified or installed.  Access to a
-network may be denied when the modification itself materially and
-adversely affects the operation of the network or violates the rules and
-protocols for communication across the network.
-
-  Corresponding Source conveyed, and Installation Information provided,
-in accord with this section must be in a format that is publicly
-documented (and with an implementation available to the public in
-source code form), and must require no special password or key for
-unpacking, reading or copying.
-
-  7. Additional Terms.
-
-  "Additional permissions" are terms that supplement the terms of this
-License by making exceptions from one or more of its conditions.
-Additional permissions that are applicable to the entire Program shall
-be treated as though they were included in this License, to the extent
-that they are valid under applicable law.  If additional permissions
-apply only to part of the Program, that part may be used separately
-under those permissions, but the entire Program remains governed by
-this License without regard to the additional permissions.
-
-  When you convey a copy of a covered work, you may at your option
-remove any additional permissions from that copy, or from any part of
-it.  (Additional permissions may be written to require their own
-removal in certain cases when you modify the work.)  You may place
-additional permissions on material, added by you to a covered work,
-for which you have or can give appropriate copyright permission.
-
-  Notwithstanding any other provision of this License, for material you
-add to a covered work, you may (if authorized by the copyright holders of
-that material) supplement the terms of this License with terms:
-
-    a) Disclaiming warranty or limiting liability differently from the
-    terms of sections 15 and 16 of this License; or
-
-    b) Requiring preservation of specified reasonable legal notices or
-    author attributions in that material or in the Appropriate Legal
-    Notices displayed by works containing it; or
-
-    c) Prohibiting misrepresentation of the origin of that material, or
-    requiring that modified versions of such material be marked in
-    reasonable ways as different from the original version; or
-
-    d) Limiting the use for publicity purposes of names of licensors or
-    authors of the material; or
-
-    e) Declining to grant rights under trademark law for use of some
-    trade names, trademarks, or service marks; or
-
-    f) Requiring indemnification of licensors and authors of that
-    material by anyone who conveys the material (or modified versions of
-    it) with contractual assumptions of liability to the recipient, for
-    any liability that these contractual assumptions directly impose on
-    those licensors and authors.
-
-  All other non-permissive additional terms are considered "further
-restrictions" within the meaning of section 10.  If the Program as you
-received it, or any part of it, contains a notice stating that it is
-governed by this License along with a term that is a further
-restriction, you may remove that term.  If a license document contains
-a further restriction but permits relicensing or conveying under this
-License, you may add to a covered work material governed by the terms
-of that license document, provided that the further restriction does
-not survive such relicensing or conveying.
-
-  If you add terms to a covered work in accord with this section, you
-must place, in the relevant source files, a statement of the
-additional terms that apply to those files, or a notice indicating
-where to find the applicable terms.
-
-  Additional terms, permissive or non-permissive, may be stated in the
-form of a separately written license, or stated as exceptions;
-the above requirements apply either way.
-
-  8. Termination.
-
-  You may not propagate or modify a covered work except as expressly
-provided under this License.  Any attempt otherwise to propagate or
-modify it is void, and will automatically terminate your rights under
-this License (including any patent licenses granted under the third
-paragraph of section 11).
-
-  However, if you cease all violation of this License, then your
-license from a particular copyright holder is reinstated (a)
-provisionally, unless and until the copyright holder explicitly and
-finally terminates your license, and (b) permanently, if the copyright
-holder fails to notify you of the violation by some reasonable means
-prior to 60 days after the cessation.
-
-  Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-
-  Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License.  If your rights have been terminated and not permanently
-reinstated, you do not qualify to receive new licenses for the same
-material under section 10.
-
-  9. Acceptance Not Required for Having Copies.
-
-  You are not required to accept this License in order to receive or
-run a copy of the Program.  Ancillary propagation of a covered work
-occurring solely as a consequence of using peer-to-peer transmission
-to receive a copy likewise does not require acceptance.  However,
-nothing other than this License grants you permission to propagate or
-modify any covered work.  These actions infringe copyright if you do
-not accept this License.  Therefore, by modifying or propagating a
-covered work, you indicate your acceptance of this License to do so.
-
-  10. Automatic Licensing of Downstream Recipients.
-
-  Each time you convey a covered work, the recipient automatically
-receives a license from the original licensors, to run, modify and
-propagate that work, subject to this License.  You are not responsible
-for enforcing compliance by third parties with this License.
-
-  An "entity transaction" is a transaction transferring control of an
-organization, or substantially all assets of one, or subdividing an
-organization, or merging organizations.  If propagation of a covered
-work results from an entity transaction, each party to that
-transaction who receives a copy of the work also receives whatever
-licenses to the work the party's predecessor in interest had or could
-give under the previous paragraph, plus a right to possession of the
-Corresponding Source of the work from the predecessor in interest, if
-the predecessor has it or can get it with reasonable efforts.
-
-  You may not impose any further restrictions on the exercise of the
-rights granted or affirmed under this License.  For example, you may
-not impose a license fee, royalty, or other charge for exercise of
-rights granted under this License, and you may not initiate litigation
-(including a cross-claim or counterclaim in a lawsuit) alleging that
-any patent claim is infringed by making, using, selling, offering for
-sale, or importing the Program or any portion of it.
-
-  11. Patents.
-
-  A "contributor" is a copyright holder who authorizes use under this
-License of the Program or a work on which the Program is based.  The
-work thus licensed is called the contributor's "contributor version".
-
-  A contributor's "essential patent claims" are all patent claims
-owned or controlled by the contributor, whether already acquired or
-hereafter acquired, that would be infringed by some manner, permitted
-by this License, of making, using, or selling its contributor version,
-but do not include claims that would be infringed only as a
-consequence of further modification of the contributor version.  For
-purposes of this definition, "control" includes the right to grant
-patent sublicenses in a manner consistent with the requirements of
-this License.
-
-  Each contributor grants you a non-exclusive, worldwide, royalty-free
-patent license under the contributor's essential patent claims, to
-make, use, sell, offer for sale, import and otherwise run, modify and
-propagate the contents of its contributor version.
-
-  In the following three paragraphs, a "patent license" is any express
-agreement or commitment, however denominated, not to enforce a patent
-(such as an express permission to practice a patent or covenant not to
-sue for patent infringement).  To "grant" such a patent license to a
-party means to make such an agreement or commitment not to enforce a
-patent against the party.
-
-  If you convey a covered work, knowingly relying on a patent license,
-and the Corresponding Source of the work is not available for anyone
-to copy, free of charge and under the terms of this License, through a
-publicly available network server or other readily accessible means,
-then you must either (1) cause the Corresponding Source to be so
-available, or (2) arrange to deprive yourself of the benefit of the
-patent license for this particular work, or (3) arrange, in a manner
-consistent with the requirements of this License, to extend the patent
-license to downstream recipients.  "Knowingly relying" means you have
-actual knowledge that, but for the patent license, your conveying the
-covered work in a country, or your recipient's use of the covered work
-in a country, would infringe one or more identifiable patents in that
-country that you have reason to believe are valid.
-
-  If, pursuant to or in connection with a single transaction or
-arrangement, you convey, or propagate by procuring conveyance of, a
-covered work, and grant a patent license to some of the parties
-receiving the covered work authorizing them to use, propagate, modify
-or convey a specific copy of the covered work, then the patent license
-you grant is automatically extended to all recipients of the covered
-work and works based on it.
-
-  A patent license is "discriminatory" if it does not include within
-the scope of its coverage, prohibits the exercise of, or is
-conditioned on the non-exercise of one or more of the rights that are
-specifically granted under this License.  You may not convey a covered
-work if you are a party to an arrangement with a third party that is
-in the business of distributing software, under which you make payment
-to the third party based on the extent of your activity of conveying
-the work, and under which the third party grants, to any of the
-parties who would receive the covered work from you, a discriminatory
-patent license (a) in connection with copies of the covered work
-conveyed by you (or copies made from those copies), or (b) primarily
-for and in connection with specific products or compilations that
-contain the covered work, unless you entered into that arrangement,
-or that patent license was granted, prior to 28 March 2007.
-
-  Nothing in this License shall be construed as excluding or limiting
-any implied license or other defenses to infringement that may
-otherwise be available to you under applicable patent law.
-
-  12. No Surrender of Others' Freedom.
-
-  If conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot convey a
-covered work so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
-not convey it at all.  For example, if you agree to terms that obligate you
-to collect a royalty for further conveying from those to whom you convey
-the Program, the only way you could satisfy both those terms and this
-License would be to refrain entirely from conveying the Program.
-
-  13. Use with the GNU Affero General Public License.
-
-  Notwithstanding any other provision of this License, you have
-permission to link or combine any covered work with a work licensed
-under version 3 of the GNU Affero General Public License into a single
-combined work, and to convey the resulting work.  The terms of this
-License will continue to apply to the part which is the covered work,
-but the special requirements of the GNU Affero General Public License,
-section 13, concerning interaction through a network will apply to the
-combination as such.
-
-  14. Revised Versions of this License.
-
-  The Free Software Foundation may publish revised and/or new versions of
-the GNU General Public License from time to time.  Such new versions will
-be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
-  Each version is given a distinguishing version number.  If the
-Program specifies that a certain numbered version of the GNU General
-Public License "or any later version" applies to it, you have the
-option of following the terms and conditions either of that numbered
-version or of any later version published by the Free Software
-Foundation.  If the Program does not specify a version number of the
-GNU General Public License, you may choose any version ever published
-by the Free Software Foundation.
-
-  If the Program specifies that a proxy can decide which future
-versions of the GNU General Public License can be used, that proxy's
-public statement of acceptance of a version permanently authorizes you
-to choose that version for the Program.
-
-  Later license versions may give you additional or different
-permissions.  However, no additional obligations are imposed on any
-author or copyright holder as a result of your choosing to follow a
-later version.
-
-  15. Disclaimer of Warranty.
-
-  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
-APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
-IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
-  16. Limitation of Liability.
-
-  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGES.
-
-  17. Interpretation of Sections 15 and 16.
-
-  If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
-
-                     END OF TERMS AND CONDITIONS
-
-            How to Apply These Terms to Your New Programs
-
-  If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
-  To do so, attach the following notices to the program.  It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
-    <one line to give the program's name and a brief idea of what it does.>
-    Copyright (C) <year>  <name of author>
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-Also add information on how to contact you by electronic and paper mail.
-
-  If the program does terminal interaction, make it output a short
-notice like this when it starts in an interactive mode:
-
-    <program>  Copyright (C) <year>  <name of author>
-    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
-    This is free software, and you are welcome to redistribute it
-    under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License.  Of course, your program's commands
-might be different; for a GUI interface, you would use an "about box".
-
-  You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU GPL, see
-<https://www.gnu.org/licenses/>.
-
-  The GNU General Public License does not permit incorporating your program
-into proprietary programs.  If your program is a subroutine library, you
-may consider it more useful to permit linking proprietary applications with
-the library.  If this is what you want to do, use the GNU Lesser General
-Public License instead of this License.  But first, please read
-<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/lib/cacode/LICENSE.NCBI b/lib/cacode/LICENSE.NCBI
deleted file mode 100644
index f2b2df75b..000000000
--- a/lib/cacode/LICENSE.NCBI
+++ /dev/null
@@ -1,26 +0,0 @@
-https://github.com/superwills/NibbleAndAHalf
-base64.h -- Fast base64 encoding and decoding.
-version 1.0.0, April 17, 2013 143a
-
-Copyright (C) 2013 William Sherif
-
-This software is provided 'as-is', without any express or implied
-warranty.  In no event will the authors be held liable for any damages
-arising from the use of this software.
-
-Permission is granted to anyone to use this software for any purpose,
-including commercial applications, and to alter it and redistribute it
-freely, subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not
- claim that you wrote the original software. If you use this software
- in a product, an acknowledgment in the product documentation would be
- appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-
-William Sherif
-will.sherif@gmail.com
-
-YWxsIHlvdXIgYmFzZSBhcmUgYmVsb25nIHRvIHVz
diff --git a/lib/cacode/README b/lib/cacode/README
deleted file mode 100644
index 45d7b8470..000000000
--- a/lib/cacode/README
+++ /dev/null
@@ -1,2 +0,0 @@
-CA_code was extracted from LAST (http://last.cbrc.jp) which is licensed under GPLv3-or-later (see LICENSE.LAST).
-CA_code itself is public domain developed by members of the NCBI (see LICENSE.NCBI).
diff --git a/lib/cacode/lambda_calculator.cpp b/lib/cacode/lambda_calculator.cpp
deleted file mode 100644
index 560d6d6f9..000000000
--- a/lib/cacode/lambda_calculator.cpp
+++ /dev/null
@@ -1,478 +0,0 @@
-// Copyright 2008 Michiaki Hamada
-// Adapted from public domain code by Yi-Kuo Yu, NCBI
-
-/**
- * See lambda_calculator.h
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include "nrutil.h"
-
-int Alphsize;
-
-#include "nrutil.cpp"
-#include "ludcmp.cpp"
-#include "lubksb.cpp"
-#include "lambda_calculator.h"
-
-#define Epsilon 1.0e-36
-#define E_bound 1.0e-12
-#define Infty   1000000.0
-#define min(A, B) ((A) > (B) ? (B) : (A) )
-#define max(A, B) ((A) > (B) ? (A) : (B) )
-#define bool int
-#define true 1
-#define false 0
-double Lambda_UB; //lambda upper bound
-double r_max_m, c_max_m; //min of each row's (column's) max
-
-void makematrix(const double **mat_b, double **a, double lambda);
-
-typedef struct Lambda {
-    double min;
-    double max;
-    int flag;    // 1 means there is a range, -1 means no solution possible.
-} Lambda;
-typedef struct Sum {
-    double value;
-    int flag;   // 1 means no negative bg_freq, -1 means there is negative bg_freq
-} Sum;
-
-Lambda Find_JP(const double **mat_b, double la_min, double la_max, double **JP, double *p_in, double *q_in);
-
-Sum Check_root(const double **mat_b, double **a, double lambda, double *p, double *q);
-
-double Check_det(const double **mat_b, double **a, double lambda);
-
-Sum Nail_lambda(const double **mat_b, int flag_sign, double lambda_min, double lambda_max, double *p, double *q,
-                double *la_add);
-
-double Nail_det(const double **mat_b, int flag_sign, double lambda_min, double lambda_max);
-
-bool Check_range(const double **mat_b);
-
-double *Locate_det_zero(const double **mat_b, int *); //pointer to root list are returned with how many of them by int
-
-
-double calculate_lambda(const double **mat_b, int alpha_size,
-                        double *p, double *q) {
-    double **JP/*, *q, *p*/;
-    int k;
-    double *root_location;
-    int N_root;
-    Lambda Lambda_local;
-
-    Alphsize = alpha_size;
-
-    if (!Check_range(mat_b)) return -1.0;
-
-    root_location = Locate_det_zero(mat_b, &N_root);
-    if (root_location == NULL && N_root > 0) return -1.0;
-
-    //q=dvector(1,Alphsize);
-    //p=dvector(1,Alphsize);
-    JP = dmatrix(1, Alphsize, 1, Alphsize);
-
-    if (N_root == 0) {
-        Lambda_local = Find_JP(mat_b, 0, Lambda_UB, JP, p, q);
-        if (1 == Lambda_local.flag) { // sensible solution found
-            // Remember to find the right place to free the vectors
-            //free_dvector(p, 1,Alphsize);
-            //free_dvector(q, 1,Alphsize);
-            free(root_location);
-            free_dmatrix(JP, 1, Alphsize, 1, Alphsize);
-            return (Lambda_local.min + Lambda_local.max) / 2.0;
-        } else if (-1 == Lambda_local.flag) {
-            //printf("matrix pass first screening but no sensible solution found. :-( \n");
-        }
-    } else if (N_root > 0) {
-        //printf("N_root = %d for this matirx \n", N_root);
-        //for (i=0;i 0) {
-                if (mat_b[i][j] > r_max) r_max = mat_b[i][j];
-                pos_flag_r = 1;
-            } else if (mat_b[i][j] < 0) neg_flag_r = 1;
-            if (mat_b[j][i] > 0) {
-                if (mat_b[j][i] > c_max) c_max = mat_b[j][i];
-                pos_flag_c = 1;
-            } else if (mat_b[j][i] < 0) neg_flag_c = 1;
-        }
-        if ((pos_flag_r == -1) || (neg_flag_r == -1) || (pos_flag_c == -1) || (neg_flag_c == -1)) {
-            if ((pos_flag_r == -1) && (neg_flag_r == -1)) {
-                printf("only zero score at row  %d\n", i);
-                L_r++;
-            } else if ((pos_flag_c == -1) && (neg_flag_c == -1)) {
-                printf("only zero score at column %d\n", i);
-                L_c++;
-            } else {
-                //printf("all positive or all negative at row or column %d\n", i);
-                //printf("therefore invalid matrix. exit now. \n");
-                return false;
-                //exit(1);
-            }
-        }
-        if ((r_max < r_max_m) && (r_max > 0)) r_max_m = r_max;
-        if ((c_max < c_max_m) && (c_max > 0)) c_max_m = c_max;
-    }
-
-
-    // Find the upper bound for lambda
-    if (r_max_m > c_max_m) {
-        Lambda_UB = 1.1 * log(1.0 * Alphsize - L_r) / r_max_m;
-    } else {
-        Lambda_UB = 1.1 * log(1.0 * Alphsize - L_c) / c_max_m;
-    }
-    //printf("the upper bound for lambda is %lf\n", Lambda_UB);
-    return true;
-}
-
-
-double Check_det(const double **mat_b, double **a, double lambda) {
-    double d;
-    int i, /*j,*/ *indx;
-
-    indx = ivector(1, Alphsize);
-    makematrix(mat_b, a, lambda);
-    ludcmp(a, Alphsize, indx, &d);
-    for (i = 1; i <= Alphsize; i++) d *= a[i][i];
-    free_ivector(indx, 1, Alphsize);
-    return d;  //returning the determinant
-}
-
-
-Sum Check_root(const double **mat_b, double **a, double lambda, double *p, double *q) {
-    double **y, /* *col,*/ d;
-    //double sum = 0.0;
-    int i, j;//, *indx;
-    Sum Sum_here;
-
-    y = dmatrix(1, Alphsize, 1, Alphsize);
-    //indx = ivector(1,Alphsize);
-    int indx[Alphsize + 1];
-    //col = dvector(1,Alphsize);
-    double col[Alphsize + 1];
-
-    makematrix(mat_b, a, lambda);
-    ludcmp(a, Alphsize, indx, &d);
-    Sum_here.value = 0.0;
-    for (i = 1; i <= Alphsize; i++) q[i] = 0.0;
-    for (j = 1; j <= Alphsize; j++) {
-        for (i = 1; i <= Alphsize; i++){
-            col[i] = 0.0;
-        }
-        col[j] = 1.0;
-        lubksb(a, Alphsize, indx, col);
-        p[j] = 0.0;
-        for (i = 1; i <= Alphsize; i++) {
-            y[i][j] = col[i];
-            Sum_here.value += y[i][j];
-            p[j] += y[i][j];
-            q[i] += y[i][j];
-        }
-    }
-
-    Sum_here.flag = 1;
-    for (i = 1; i < Alphsize; i++) {
-        if ((p[i] < 0) || (q[i] < 0)) {
-            Sum_here.flag = -1;
-            //printf("problematic freq. p[%d] = %.4f q[%d]=%.4f\n",i,p[i],i,q[i]);
-        }
-    }
-    free_dmatrix(y, 1, Alphsize, 1, Alphsize);
-    return Sum_here;
-}
-
-
-double *Locate_det_zero(const double **mat_b, int *N_root_add) {
-    double **a/*,  *q, *p */; // a is the exponentiated matrix of socres, p and q are bg_freqs
-    int i/*,j,k*/;
-    int N;  // number of points for first round
-    int flag_sign;
-    double lambda/*, l_tmp, sum,  sum_min, sum_max */;
-    double lambda_root, dlambda /*, dsum=0.5 */;
-    //double *l_here, *s_here;
-    double root[5000];
-    double *root_temp;
-    //double error=0.000000000001;
-    int zero_monitor = 0;  // record number of zeros found in the range
-    //int flag;
-
-    a = dmatrix(1, Alphsize, 1, Alphsize);
-    //Sum_local = (Sum *)malloc(sizeof(Sum));
-    //Lambda_local = (Lambda *)malloc(sizeof(Lambda));
-
-    N = 2 + max(400, ((int) (Lambda_UB - 0) / 0.005));
-    //printf("N = %d in Locate_det_zero\n", N);
-    dlambda = (Lambda_UB) / (N * 1.0);
-    //l_here = (double *)malloc((N+1)*sizeof(double));
-    //s_here = (double *)malloc((N+1)*sizeof(double));
-    double l_here[N + 1];
-    double s_here[N + 1];
-
-    for (i = 0; i < N; i++) {
-        lambda = (i + 1) * dlambda;
-        s_here[i] = Check_det(mat_b, a, lambda);
-        l_here[i] = lambda;
-    }
-
-    if (s_here[0] < 0.0) flag_sign = -1;
-    if (s_here[0] > 0.0) flag_sign = 1;
-    if (fabs(s_here[0]) / exp(l_here[0] * (r_max_m + c_max_m) / 2.0) <= Epsilon) {
-        root[zero_monitor++] = l_here[0];
-        flag_sign = 0;
-    }
-
-    for (i = 1; i < N; i++) {
-        if ((flag_sign != 0) && (fabs(s_here[i]) > Epsilon)) {
-            if (s_here[i - 1] * s_here[i] < 0) {
-                //printf("occurring at regular places\n");
-                lambda_root = Nail_det(mat_b, flag_sign, l_here[i - 1], l_here[i]);
-                root[zero_monitor++] = lambda_root;
-                flag_sign = -flag_sign;  // the flag switch sign after one sol found
-                //printf("a (regular) root of det found at %12.10f, i= %d\n", lambda_root,i);
-            }
-        } else {
-            if (s_here[i] < 0.0) flag_sign = -1;
-            if (s_here[i] > 0.0) flag_sign = 1;
-            if (fabs(s_here[i]) / exp(l_here[i] * (r_max_m + c_max_m) / 2.0) <= Epsilon) {
-                root[zero_monitor++] = l_here[i];
-            }
-        }
-    }
-    //printf("total number of solution found in range is %d\n", i_monitor);
-    root_temp = (double *) malloc(zero_monitor * sizeof(double));
-    *N_root_add = zero_monitor;
-    if (zero_monitor > 0) {
-        if (zero_monitor >= N / 4) {
-            //printf("It is likely that uniform zero determinant is occurring.\n");
-            //printf("number of small det points = %d out of %d, exit now....\n",zero_monitor, N);
-            free(root_temp);
-            return NULL;
-            //exit(1);
-        }
-        for (i = 0; i < zero_monitor; i++) {
-            root_temp[i] = root[i];
-            //printf("root_location[%d] = %lf\n",i,root_temp[i]);
-        }
-    }
-    free_dmatrix(a, 1, Alphsize, 1, Alphsize);
-    return root_temp;
-
-}
-
-
-Lambda Find_JP(const double **mat_b, double la_min, double la_max, double **JP, double *p_in, double *q_in) {
-    double **a, *q, *p; // a is the exponentiated matrix of socres, p and q are bg_freqs
-    int i, j/*,k*/;
-    int N;  // number of points for first round
-    double lambda/*, l_tmp, sum, sum_min, sum_max*/;
-    double lambda_max, lambda_min, lambda_final, dlambda/*, dsum=0.5*/;
-    //double *l_here, *s_here;
-    //double error=0.000000000000001;
-    //int validity_flag; // 1 means valid, -1 means not valid.
-    int flag_sign;       // 1 means small lambda sum > 1, -1 means otherwise
-    int flag_done = -1;       // 1 means find sensible solution, -1 means sensible not found
-    int i_monitor = 0;          // record number of solution found in the range, including nonsense ones
-    int j_monitor;
-
-    Lambda Lambda_local;
-    //Sum *Sum_local;
-    Sum Sum_local;
-
-    lambda_min = la_min;
-    lambda_max = la_max;
-    q = q_in;
-    p = p_in;
-    a = dmatrix(1, Alphsize, 1, Alphsize);
-    //Sum_local = (Sum *)malloc(sizeof(Sum));
-    //Lambda_local = (Lambda *)malloc(sizeof(Lambda));
-
-    N = 2 + max(400, ((int) (lambda_max - lambda_min) / 0.005));
-    //printf("N = %d in Find_JP\n", N);
-    dlambda = (lambda_max - lambda_min) / (N * 1.0);
-    //l_here = (double *)malloc((N+1)*sizeof(double));
-    //s_here = (double *)malloc((N+1)*sizeof(double));
-    double l_here[N + 1];
-    double s_here[N + 1];
-    //printf("lambda_min enter = %12.10e, lambda_max = %12.10f\n", lambda_min, lambda_max);
-    for (i = 0; i < N - 1; i++) {
-        lambda = lambda_min + (i + 1) * dlambda;
-        makematrix(mat_b, a, lambda);
-        Sum_local = Check_root(mat_b, a, lambda, p, q);
-        l_here[i] = lambda;
-        s_here[i] = Sum_local.value - 1.0;
-        //printf("scan %d th time in Find_JP\n",i );
-    }
-    //printf("finish first time scanining in Find_JP\n");
-    if (s_here[0] < 0.0) flag_sign = -1;
-    else if (s_here[0] > 0.0) flag_sign = 1;
-    else if (s_here[0] == 0.0) {  //needs refined definition on flag_sign
-        printf("enter the exact hit, rarely occurs other than when lambda = 0 \n");
-        j_monitor = 1;
-        flag_sign = 0;
-        while ((flag_sign == 0) && (j_monitor < N)) {
-            Sum_local = Check_root(mat_b, a, l_here[0] + j_monitor * dlambda / N, p, q);
-            if (Sum_local.value > 1.0) {
-                flag_sign = 1;
-            } else if (Sum_local.value < 1.0) {
-                flag_sign = -1;
-            }
-            j_monitor++;
-        }
-    }
-
-    for (i = 1; i < N; i++) {  // should be N-1 ???
-        if (flag_sign == 0) {
-            printf("flag_sign = 0 \n");
-            exit(1);
-        }
-        if (s_here[i - 1] * s_here[i] < 0) {
-            lambda_min = l_here[i - 1];
-            lambda_max = l_here[i];
-            Sum_local = Nail_lambda(mat_b, flag_sign, lambda_min, lambda_max, p, q, &lambda_final);
-            if (Sum_local.flag == 1) {
-                i = N;
-                flag_done = 1;
-                Lambda_local.flag = 1;
-                Lambda_local.min = lambda_final, Lambda_local.max = lambda_final;
-            }
-            flag_sign = -flag_sign;  // the flag switch sign after one sol found
-            i_monitor++;
-        }
-    }
-
-    if (flag_done == 1) {
-        // Write correct JP to the matrix
-        makematrix(mat_b, a, lambda_final);
-        for (i = 1; i <= Alphsize; i++) {
-            for (j = 1; j <= Alphsize; j++) {
-                JP[i][j] = a[i][j] * p[i] * q[j];
-            }
-        }
-        free_dmatrix(a, 1, Alphsize, 1, Alphsize);
-        return Lambda_local;
-    } else if (flag_done == -1) {
-        //printf("no sensible solution in the plausible x range: (%lf,%lf)\n", la_min, la_max);
-        Lambda_local.flag = -1;
-        Lambda_local.min = 0;
-        Lambda_local.max = Infty;
-        return Lambda_local;
-    }
-    // never come here
-    return Lambda_local;
-}
-
-
-Sum Nail_lambda(const double **mat_b, int flag_sign, double lambda_min, double lambda_max, double *p, double *q,
-                double *lam_add) {
-    double **a;
-    double lambda;
-
-    //Sum *Sum_local;
-    Sum Sum_local;
-    a = dmatrix(1, Alphsize, 1, Alphsize);
-    //Sum_local = (Sum *)malloc(sizeof(Sum));
-
-    lambda = (lambda_min + lambda_max) / 2.0;
-    Sum_local = Check_root(mat_b, a, lambda, p, q);
-    while (fabs(Sum_local.value - 1.0) > E_bound) {
-        if (flag_sign * (Sum_local.value - 1.0) < 0) lambda_max = lambda;
-        else if (flag_sign * (Sum_local.value - 1.0) > 0) lambda_min = lambda;
-
-        // Added by MCF to avoid infinite loop:
-        if (lambda == (lambda_min + lambda_max) / 2.0) {
-            Sum_local.flag = -1;
-            break;
-        }
-
-        lambda = (lambda_min + lambda_max) / 2.0;
-        Sum_local = Check_root(mat_b, a, lambda, p, q);
-    }
-    free_dmatrix(a, 1, Alphsize, 1, Alphsize);
-    *lam_add = lambda;
-    return Sum_local;
-}
-
-
-double Nail_det(const double **mat_b, int flag_sign, double lambda_min, double lambda_max) {
-    double **a;
-    double lambda;
-    double value;
-
-    a = dmatrix(1, Alphsize, 1, Alphsize);
-
-    lambda = (lambda_min + lambda_max) / 2.0;
-    value = Check_det(mat_b, a, lambda);
-    while ((fabs(value) > E_bound) && (lambda > 0)) {
-        if (flag_sign * (value) < 0) lambda_max = lambda;
-        else if (flag_sign * (value) > 0) lambda_min = lambda;
-        lambda = (lambda_min + lambda_max) / 2.0;
-        value = Check_det(mat_b, a, lambda);
-    }
-    free_dmatrix(a, 1, Alphsize, 1, Alphsize);
-    return lambda;
-}
-
-void makematrix(const double **mat_b, double **a, double lambda) {
-
-    int i, j;
-
-    for (i = 1; i <= Alphsize; i++)
-        for (j = 1; j <= Alphsize; j++) {
-            *(*(a + i) + j) = exp(lambda * mat_b[i][j]);
-        }
-}
-
-
-
diff --git a/lib/cacode/lambda_calculator.h b/lib/cacode/lambda_calculator.h
deleted file mode 100644
index 70f2f1800..000000000
--- a/lib/cacode/lambda_calculator.h
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright 2008 Michiaki Hamada
-
-#ifndef __H_INCLUDE_LAMBDA_CALCULATOR_HH
-#define __H_INCLUDE_LAMBDA_CALCULATOR_HH
-
-// These pointers are 1-based!
-double calculate_lambda( const double** mat_b, int alpha_size,
-                         double* p, double* q );
-
-#endif // __H_INCLUDE_LAMBDA_CALCULATOR_HH
diff --git a/lib/cacode/lubksb.cpp b/lib/cacode/lubksb.cpp
deleted file mode 100644
index 7725d5668..000000000
--- a/lib/cacode/lubksb.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// Public domain code from Yi-Kuo Yu & Stephen Altschul, NCBI
-
-void lubksb(double **a, int n, int *indx, double b[]) {
-    int i, ii = 0, ip, j;
-    double sum;
-
-    for (i = 1; i <= n; i++) {
-        ip = indx[i];
-        sum = b[ip];
-        b[ip] = b[i];
-        if (ii) {
-            for (j = ii; j <= i - 1; j++) {
-                sum -= a[i][j] * b[j];
-            }
-        } else if (sum) {
-            ii = i;
-        }
-        b[i] = sum;
-    }
-    for (i = n; i >= 1; i--) {
-        sum = b[i];
-        for (j = i + 1; j <= n; j++) {
-            sum -= a[i][j] * b[j];
-        }
-        b[i] = sum / a[i][i];
-    }
-}
diff --git a/lib/cacode/ludcmp.cpp b/lib/cacode/ludcmp.cpp
deleted file mode 100644
index 8d401b9e0..000000000
--- a/lib/cacode/ludcmp.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// Public domain code from Yi-Kuo Yu & Stephen Altschul, NCBI
-
-#include <math.h>
-
-#define TINY 1.0e-20;
-
-double *dvector(int, int);
-
-void nrerror(const char *);
-
-void free_dvector(double *, int, int);
-
-void ludcmp(double **a, int n, int *indx, double *d) {
-    int i, imax, j, k;
-    double big, dum, sum, temp;
-    double *vv;
-
-    vv = dvector(1, n);
-    *d = 1.0;
-    for (i = 1; i <= n; i++) {
-        big = 0.0;
-        for (j = 1; j <= n; j++)
-            if ((temp = fabs(a[i][j])) > big) big = temp;
-        if (big == 0.0) nrerror("Singular matrix in routine LUDCMP");
-        vv[i] = 1.0 / big;
-    }
-    for (j = 1; j <= n; j++) {
-        for (i = 1; i < j; i++) {
-            sum = a[i][j];
-            for (k = 1; k < i; k++) sum -= a[i][k] * a[k][j];
-            a[i][j] = sum;
-        }
-        big = 0.0;
-        for (i = j; i <= n; i++) {
-            sum = a[i][j];
-            for (k = 1; k < j; k++)
-                sum -= a[i][k] * a[k][j];
-            a[i][j] = sum;
-            if ((dum = vv[i] * fabs(sum)) >= big) {
-                big = dum;
-                imax = i;
-            }
-        }
-        if (j != imax) {
-            for (k = 1; k <= n; k++) {
-                dum = a[imax][k];
-                a[imax][k] = a[j][k];
-                a[j][k] = dum;
-            }
-            *d = -(*d);
-            vv[imax] = vv[j];
-        }
-        indx[j] = imax;
-        if (a[j][j] == 0.0) a[j][j] = TINY;
-        if (j != n) {
-            dum = 1.0 / (a[j][j]);
-            for (i = j + 1; i <= n; i++) a[i][j] *= dum;
-        }
-    }
-    free_dvector(vv, 1, n);
-}
-
-#undef TINY
diff --git a/lib/cacode/nrutil.cpp b/lib/cacode/nrutil.cpp
deleted file mode 100644
index dcb8fb217..000000000
--- a/lib/cacode/nrutil.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-// Public domain code from Yi-Kuo Yu & Stephen Altschul, NCBI
-
-//#include 
-#include <stdio.h>
-#include <stdlib.h>
-
-void nrerror(const char *error_text) {
-
-    fprintf(stderr, "Numerical Recipes run-time error...\n");
-    fprintf(stderr, "%s\n", error_text);
-    fprintf(stderr, "...now exiting to system...\n");
-    exit(1);
-}
-
-float *vector(int nl, int nh) {
-    float *v;
-
-    v = (float *) malloc((unsigned) (nh - nl + 1) * sizeof(float));
-    if (!v) nrerror("allocation failure in vector()");
-    return v - nl;
-}
-
-int *ivector(int nl, int nh) {
-    int *v;
-
-    v = (int *) malloc((unsigned) (nh - nl + 1) * sizeof(int));
-    if (!v) nrerror("allocation failure in ivector()");
-    return v - nl;
-}
-
-double *dvector(int nl, int nh) {
-    double *v;
-
-    v = (double *) malloc((unsigned) (nh - nl + 1) * sizeof(double));
-    if (!v) nrerror("allocation failure in dvector()");
-    return v - nl;
-}
-
-float **matrix(int nrl, int nrh, int ncl, int nch) {
-    int i;
-    float **m;
-
-    m = (float **) malloc((unsigned) (nrh - nrl + 1) * sizeof(float *));
-    if (!m) nrerror("allocation failure 1 in matrix()");
-    m -= nrl;
-
-    for (i = nrl; i <= nrh; i++) {
-        m[i] = (float *) malloc((unsigned) (nch - ncl + 1) * sizeof(float));
-        if (!m[i]) nrerror("allocation failure 2 in matrix()");
-        m[i] -= ncl;
-    }
-    return m;
-}
-
-double **dmatrix(int nrl, int nrh, int ncl, int nch) {
-    int i;
-    double **m;
-
-    m = (double **) malloc((unsigned) (nrh - nrl + 1) * sizeof(double *));
-    if (!m) nrerror("allocation failure 1 in dmatrix()");
-    m -= nrl;
-
-    for (i = nrl; i <= nrh; i++) {
-        m[i] = (double *) malloc((unsigned) (nch - ncl + 1) * sizeof(double));
-        if (!m[i]) nrerror("allocation failure 2 in dmatrix()");
-        m[i] -= ncl;
-    }
-    return m;
-}
-
-float **submatrix(float **a, int oldrl, int oldrh, int oldcl, int oldch, int newrl, int newcl) {
-    int i, j;
-    float **m;
-
-    m = (float **) malloc((unsigned) (oldrh - oldrl + 1) * sizeof(float *));
-    if (!m) nrerror("allocation failure in submatrix()");
-    m -= newrl;
-
-    for (i = oldrl, j = newrl; i <= oldrh; i++, j++) m[j] = a[i] + oldcl - newcl;
-
-    return m;
-}
-
-void free_ivector(int *v, int nl, int nh) {
-    free((char *) (v + nl));
-}
-
-void free_dvector(double *v, int nl, int nh) {
-    free((char *) (v + nl));
-}
-
-void free_dmatrix(double **m, int nrl, int nrh, int ncl, int nch) {
-    int i;
-
-    for (i = nrh; i >= nrl; i--) free((char *) (m[i] + ncl));
-    free((char *) (m + nrl));
-}
\ No newline at end of file
diff --git a/lib/cacode/nrutil.h b/lib/cacode/nrutil.h
deleted file mode 100644
index 0de5f5f9b..000000000
--- a/lib/cacode/nrutil.h
+++ /dev/null
@@ -1,19 +0,0 @@
-// Public domain code from Yi-Kuo Yu & Stephen Altschul, NCBI
-
-float *vector();
-float **matrix();
-float **convert_matrix();
-double *dvector();
-double **dmatrix();
-int *ivector();
-int **imatrix();
-float **submatrix();
-void free_vector();
-void free_dvector();
-void free_ivector();
-void free_matrix();
-void free_dmatrix();
-void free_imatrix();
-void free_submatrix();
-void free_convert_matrix();
-void nrerror();
diff --git a/lib/fast_float/VERSION b/lib/fast_float/VERSION
new file mode 100644
index 000000000..f8c5c2ccd
--- /dev/null
+++ b/lib/fast_float/VERSION
@@ -0,0 +1 @@
+6.1.5
\ No newline at end of file
diff --git a/lib/fast_float/fast_float.h b/lib/fast_float/fast_float.h
new file mode 100644
index 000000000..60ced8190
--- /dev/null
+++ b/lib/fast_float/fast_float.h
@@ -0,0 +1,3911 @@
+// fast_float by Daniel Lemire
+// fast_float by João Paulo Magalhaes
+//
+//
+// with contributions from Eugene Golushkov
+// with contributions from Maksim Kita
+// with contributions from Marcin Wojdyr
+// with contributions from Neal Richardson
+// with contributions from Tim Paine
+// with contributions from Fabio Pellacini
+// with contributions from Lénárd Szolnoki
+// with contributions from Jan Pharago
+// with contributions from Maya Warrier
+// with contributions from Taha Khokhar
+//
+//
+// Licensed under the Apache License, Version 2.0, or the
+// MIT License or the Boost License. This file may not be copied,
+// modified, or distributed except according to those terms.
+//
+// MIT License Notice
+//
+//    MIT License
+//    
+//    Copyright (c) 2021 The fast_float authors
+//    
+//    Permission is hereby granted, free of charge, to any
+//    person obtaining a copy of this software and associated
+//    documentation files (the "Software"), to deal in the
+//    Software without restriction, including without
+//    limitation the rights to use, copy, modify, merge,
+//    publish, distribute, sublicense, and/or sell copies of
+//    the Software, and to permit persons to whom the Software
+//    is furnished to do so, subject to the following
+//    conditions:
+//    
+//    The above copyright notice and this permission notice
+//    shall be included in all copies or substantial portions
+//    of the Software.
+//    
+//    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+//    ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+//    TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+//    PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+//    SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+//    CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+//    OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+//    IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+//    DEALINGS IN THE SOFTWARE.
+//
+// Apache License (Version 2.0) Notice
+//
+//    Copyright 2021 The fast_float authors
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+//    
+//    http://www.apache.org/licenses/LICENSE-2.0
+//    
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+//
+// BOOST License Notice
+//
+//    Boost Software License - Version 1.0 - August 17th, 2003
+//    
+//    Permission is hereby granted, free of charge, to any person or organization
+//    obtaining a copy of the software and accompanying documentation covered by
+//    this license (the "Software") to use, reproduce, display, distribute,
+//    execute, and transmit the Software, and to prepare derivative works of the
+//    Software, and to permit third-parties to whom the Software is furnished to
+//    do so, all subject to the following:
+//    
+//    The copyright notices in the Software and this entire statement, including
+//    the above license grant, this restriction and the following disclaimer,
+//    must be included in all copies of the Software, in whole or in part, and
+//    all derivative works of the Software, unless such copies or derivative
+//    works are solely in the form of machine-executable object code generated by
+//    a source language processor.
+//    
+//    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//    FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+//    SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+//    FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+//    ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+//    DEALINGS IN THE SOFTWARE.
+//
+
+#ifndef FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H
+#define FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H
+
+#ifdef __has_include
+#if __has_include(<version>)
+#include <version>
+#endif
+#endif
+
+// Testing for https://wg21.link/N3652, adopted in C++14
+#if __cpp_constexpr >= 201304
+#define FASTFLOAT_CONSTEXPR14 constexpr
+#else
+#define FASTFLOAT_CONSTEXPR14
+#endif
+
+#if defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L
+#define FASTFLOAT_HAS_BIT_CAST 1
+#else
+#define FASTFLOAT_HAS_BIT_CAST 0
+#endif
+
+#if defined(__cpp_lib_is_constant_evaluated) &&                                \
+    __cpp_lib_is_constant_evaluated >= 201811L
+#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 1
+#else
+#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 0
+#endif
+
+// Testing for relevant C++20 constexpr library features
+#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED && FASTFLOAT_HAS_BIT_CAST &&           \
+    __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/
+#define FASTFLOAT_CONSTEXPR20 constexpr
+#define FASTFLOAT_IS_CONSTEXPR 1
+#else
+#define FASTFLOAT_CONSTEXPR20
+#define FASTFLOAT_IS_CONSTEXPR 0
+#endif
+
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#define FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE 0
+#else
+#define FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE 1
+#endif
+
+#endif // FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H
+
+#ifndef FASTFLOAT_FLOAT_COMMON_H
+#define FASTFLOAT_FLOAT_COMMON_H
+
+#include <cfloat>
+#include <cstdint>
+#include <cassert>
+#include <cstring>
+#include <type_traits>
+#include <system_error>
+#ifdef __has_include
+#if __has_include(<stdfloat>) && (__cplusplus > 202002L || _MSVC_LANG > 202002L)
+#include <stdfloat>
+#endif
+#endif
+
+namespace fast_float {
+
+#define FASTFLOAT_JSONFMT (1 << 5)
+#define FASTFLOAT_FORTRANFMT (1 << 6)
+
+enum chars_format {
+  scientific = 1 << 0,
+  fixed = 1 << 2,
+  hex = 1 << 3,
+  no_infnan = 1 << 4,
+  // RFC 8259: https://datatracker.ietf.org/doc/html/rfc8259#section-6
+  json = FASTFLOAT_JSONFMT | fixed | scientific | no_infnan,
+  // Extension of RFC 8259 where, e.g., "inf" and "nan" are allowed.
+  json_or_infnan = FASTFLOAT_JSONFMT | fixed | scientific,
+  fortran = FASTFLOAT_FORTRANFMT | fixed | scientific,
+  general = fixed | scientific
+};
+
+template <typename UC> struct from_chars_result_t {
+  UC const *ptr;
+  std::errc ec;
+};
+using from_chars_result = from_chars_result_t<char>;
+
+template <typename UC> struct parse_options_t {
+  constexpr explicit parse_options_t(chars_format fmt = chars_format::general,
+                                     UC dot = UC('.'))
+      : format(fmt), decimal_point(dot) {}
+
+  /** Which number formats are accepted */
+  chars_format format;
+  /** The character used as decimal point */
+  UC decimal_point;
+};
+using parse_options = parse_options_t<char>;
+
+} // namespace fast_float
+
+#if FASTFLOAT_HAS_BIT_CAST
+#include <bit>
+#endif
+
+#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) ||            \
+     defined(__amd64) || defined(__aarch64__) || defined(_M_ARM64) ||          \
+     defined(__MINGW64__) || defined(__s390x__) ||                             \
+     (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) ||      \
+      defined(__PPC64LE__)) ||                                                 \
+     defined(__loongarch64))
+#define FASTFLOAT_64BIT 1
+#elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) ||             \
+       defined(__arm__) || defined(_M_ARM) || defined(__ppc__) ||              \
+       defined(__MINGW32__) || defined(__EMSCRIPTEN__))
+#define FASTFLOAT_32BIT 1
+#else
+  // Need to check incrementally, since SIZE_MAX is a size_t, avoid overflow.
+// We can never tell the register width, but the SIZE_MAX is a good
+// approximation. UINTPTR_MAX and INTPTR_MAX are optional, so avoid them for max
+// portability.
+#if SIZE_MAX == 0xffff
+#error Unknown platform (16-bit, unsupported)
+#elif SIZE_MAX == 0xffffffff
+#define FASTFLOAT_32BIT 1
+#elif SIZE_MAX == 0xffffffffffffffff
+#define FASTFLOAT_64BIT 1
+#else
+#error Unknown platform (not 32-bit, not 64-bit?)
+#endif
+#endif
+
+#if ((defined(_WIN32) || defined(_WIN64)) && !defined(__clang__)) ||           \
+    (defined(_M_ARM64) && !defined(__MINGW32__))
+#include <intrin.h>
+#endif
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define FASTFLOAT_VISUAL_STUDIO 1
+#endif
+
+#if defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__
+#define FASTFLOAT_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#elif defined _WIN32
+#define FASTFLOAT_IS_BIG_ENDIAN 0
+#else
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <machine/endian.h>
+#elif defined(sun) || defined(__sun)
+#include <sys/byteorder.h>
+#elif defined(__MVS__)
+#include <sys/endian.h>
+#else
+#ifdef __has_include
+#if __has_include(<endian.h>)
+#include <endian.h>
+#endif //__has_include(<endian.h>)
+#endif //__has_include
+#endif
+#
+#ifndef __BYTE_ORDER__
+// safe choice
+#define FASTFLOAT_IS_BIG_ENDIAN 0
+#endif
+#
+#ifndef __ORDER_LITTLE_ENDIAN__
+// safe choice
+#define FASTFLOAT_IS_BIG_ENDIAN 0
+#endif
+#
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define FASTFLOAT_IS_BIG_ENDIAN 0
+#else
+#define FASTFLOAT_IS_BIG_ENDIAN 1
+#endif
+#endif
+
+#if defined(__SSE2__) || (defined(FASTFLOAT_VISUAL_STUDIO) &&                  \
+                          (defined(_M_AMD64) || defined(_M_X64) ||             \
+                           (defined(_M_IX86_FP) && _M_IX86_FP == 2)))
+#define FASTFLOAT_SSE2 1
+#endif
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define FASTFLOAT_NEON 1
+#endif
+
+#if defined(FASTFLOAT_SSE2) || defined(FASTFLOAT_NEON)
+#define FASTFLOAT_HAS_SIMD 1
+#endif
+
+#if defined(__GNUC__)
+// disable -Wcast-align=strict (GCC only)
+#define FASTFLOAT_SIMD_DISABLE_WARNINGS                                        \
+  _Pragma("GCC diagnostic push")                                               \
+      _Pragma("GCC diagnostic ignored \"-Wcast-align\"")
+#else
+#define FASTFLOAT_SIMD_DISABLE_WARNINGS
+#endif
+
+#if defined(__GNUC__)
+#define FASTFLOAT_SIMD_RESTORE_WARNINGS _Pragma("GCC diagnostic pop")
+#else
+#define FASTFLOAT_SIMD_RESTORE_WARNINGS
+#endif
+
+#ifdef FASTFLOAT_VISUAL_STUDIO
+#define fastfloat_really_inline __forceinline
+#else
+#define fastfloat_really_inline inline __attribute__((always_inline))
+#endif
+
+#ifndef FASTFLOAT_ASSERT
+#define FASTFLOAT_ASSERT(x)                                                    \
+  { ((void)(x)); }
+#endif
+
+#ifndef FASTFLOAT_DEBUG_ASSERT
+#define FASTFLOAT_DEBUG_ASSERT(x)                                              \
+  { ((void)(x)); }
+#endif
+
+// rust style `try!()` macro, or `?` operator
+#define FASTFLOAT_TRY(x)                                                       \
+  {                                                                            \
+    if (!(x))                                                                  \
+      return false;                                                            \
+  }
+
+#define FASTFLOAT_ENABLE_IF(...)                                               \
+  typename std::enable_if<(__VA_ARGS__), int>::type
+
+namespace fast_float {
+
+fastfloat_really_inline constexpr bool cpp20_and_in_constexpr() {
+#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED
+  return std::is_constant_evaluated();
+#else
+  return false;
+#endif
+}
+
+template <typename T>
+fastfloat_really_inline constexpr bool is_supported_float_type() {
+  return std::is_same<T, float>::value || std::is_same<T, double>::value
+#if __STDCPP_FLOAT32_T__
+         || std::is_same<T, std::float32_t>::value
+#endif
+#if __STDCPP_FLOAT64_T__
+         || std::is_same<T, std::float64_t>::value
+#endif
+      ;
+}
+
+template <typename UC>
+fastfloat_really_inline constexpr bool is_supported_char_type() {
+  return std::is_same<UC, char>::value || std::is_same<UC, wchar_t>::value ||
+         std::is_same<UC, char16_t>::value || std::is_same<UC, char32_t>::value;
+}
+
+// Compares two ASCII strings in a case insensitive manner.
+template <typename UC>
+inline FASTFLOAT_CONSTEXPR14 bool
+fastfloat_strncasecmp(UC const *input1, UC const *input2, size_t length) {
+  char running_diff{0};
+  for (size_t i = 0; i < length; ++i) {
+    running_diff |= (char(input1[i]) ^ char(input2[i]));
+  }
+  return (running_diff == 0) || (running_diff == 32);
+}
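+// Editorial note (not part of upstream fast_float): ASCII letters differ from
+// their other-case form only in bit 0x20 (e.g. 'n' ^ 'N' == 32), so each
+// per-character XOR above is 0 for an exact match and 32 for a case-only
+// mismatch. OR-ing them all together therefore yields 0 or 32 exactly when
+// the inputs match ignoring case, which is sufficient for the "nan" and
+// "infinity" literals this helper is used against.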
+
+#ifndef FLT_EVAL_METHOD
+#error "FLT_EVAL_METHOD should be defined, please include cfloat."
+#endif
+
+// a pointer and a length to a contiguous block of memory
+template <typename T> struct span {
+  const T *ptr;
+  size_t length;
+  constexpr span(const T *_ptr, size_t _length) : ptr(_ptr), length(_length) {}
+  constexpr span() : ptr(nullptr), length(0) {}
+
+  constexpr size_t len() const noexcept { return length; }
+
+  FASTFLOAT_CONSTEXPR14 const T &operator[](size_t index) const noexcept {
+    FASTFLOAT_DEBUG_ASSERT(index < length);
+    return ptr[index];
+  }
+};
+
+struct value128 {
+  uint64_t low;
+  uint64_t high;
+  constexpr value128(uint64_t _low, uint64_t _high) : low(_low), high(_high) {}
+  constexpr value128() : low(0), high(0) {}
+};
+
+/* Helper C++14 constexpr generic implementation of leading_zeroes */
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int
+leading_zeroes_generic(uint64_t input_num, int last_bit = 0) {
+  if (input_num & uint64_t(0xffffffff00000000)) {
+    input_num >>= 32;
+    last_bit |= 32;
+  }
+  if (input_num & uint64_t(0xffff0000)) {
+    input_num >>= 16;
+    last_bit |= 16;
+  }
+  if (input_num & uint64_t(0xff00)) {
+    input_num >>= 8;
+    last_bit |= 8;
+  }
+  if (input_num & uint64_t(0xf0)) {
+    input_num >>= 4;
+    last_bit |= 4;
+  }
+  if (input_num & uint64_t(0xc)) {
+    input_num >>= 2;
+    last_bit |= 2;
+  }
+  if (input_num & uint64_t(0x2)) { /* input_num >>=  1; */
+    last_bit |= 1;
+  }
+  return 63 - last_bit;
+}
+
+/* result might be undefined when input_num is zero */
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 int
+leading_zeroes(uint64_t input_num) {
+  assert(input_num > 0);
+  if (cpp20_and_in_constexpr()) {
+    return leading_zeroes_generic(input_num);
+  }
+#ifdef FASTFLOAT_VISUAL_STUDIO
+#if defined(_M_X64) || defined(_M_ARM64)
+  unsigned long leading_zero = 0;
+  // Search the mask data from most significant bit (MSB)
+  // to least significant bit (LSB) for a set bit (1).
+  _BitScanReverse64(&leading_zero, input_num);
+  return (int)(63 - leading_zero);
+#else
+  return leading_zeroes_generic(input_num);
+#endif
+#else
+  return __builtin_clzll(input_num);
+#endif
+}
+
+// slow emulation routine for 32-bit
+fastfloat_really_inline constexpr uint64_t emulu(uint32_t x, uint32_t y) {
+  return x * (uint64_t)y;
+}
+
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t
+umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) {
+  uint64_t ad = emulu((uint32_t)(ab >> 32), (uint32_t)cd);
+  uint64_t bd = emulu((uint32_t)ab, (uint32_t)cd);
+  uint64_t adbc = ad + emulu((uint32_t)ab, (uint32_t)(cd >> 32));
+  uint64_t adbc_carry = (uint64_t)(adbc < ad);
+  uint64_t lo = bd + (adbc << 32);
+  *hi = emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) +
+        (adbc_carry << 32) + (uint64_t)(lo < bd);
+  return lo;
+}
+
+#ifdef FASTFLOAT_32BIT
+
+// slow emulation routine for 32-bit
+#if !defined(__MINGW64__)
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t _umul128(uint64_t ab,
+                                                                uint64_t cd,
+                                                                uint64_t *hi) {
+  return umul128_generic(ab, cd, hi);
+}
+#endif // !__MINGW64__
+
+#endif // FASTFLOAT_32BIT
+
+// compute 64-bit a*b
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 value128
+full_multiplication(uint64_t a, uint64_t b) {
+  if (cpp20_and_in_constexpr()) {
+    value128 answer;
+    answer.low = umul128_generic(a, b, &answer.high);
+    return answer;
+  }
+  value128 answer;
+#if defined(_M_ARM64) && !defined(__MINGW32__)
+  // ARM64 has native support for 64-bit multiplications, no need to emulate
+  // But MinGW on ARM64 doesn't have native support for 64-bit multiplications
+  answer.high = __umulh(a, b);
+  answer.low = a * b;
+#elif defined(FASTFLOAT_32BIT) ||                                              \
+    (defined(_WIN64) && !defined(__clang__) && !defined(_M_ARM64))
+  answer.low = _umul128(a, b, &answer.high); // _umul128 not available on ARM64
+#elif defined(FASTFLOAT_64BIT) && defined(__SIZEOF_INT128__)
+  __uint128_t r = ((__uint128_t)a) * b;
+  answer.low = uint64_t(r);
+  answer.high = uint64_t(r >> 64);
+#else
+  answer.low = umul128_generic(a, b, &answer.high);
+#endif
+  return answer;
+}
+
+struct adjusted_mantissa {
+  uint64_t mantissa{0};
+  int32_t power2{0}; // a negative value indicates an invalid result
+  adjusted_mantissa() = default;
+  constexpr bool operator==(const adjusted_mantissa &o) const {
+    return mantissa == o.mantissa && power2 == o.power2;
+  }
+  constexpr bool operator!=(const adjusted_mantissa &o) const {
+    return mantissa != o.mantissa || power2 != o.power2;
+  }
+};
+
+// Bias so we can get the real exponent with an invalid adjusted_mantissa.
+constexpr static int32_t invalid_am_bias = -0x8000;
+
+// used for binary_format_lookup_tables::max_mantissa
+constexpr uint64_t constant_55555 = 5 * 5 * 5 * 5 * 5;
+
+template <typename T, typename U = void> struct binary_format_lookup_tables;
+
+template <typename T> struct binary_format : binary_format_lookup_tables<T> {
+  using equiv_uint =
+      typename std::conditional<sizeof(T) == 4, uint32_t, uint64_t>::type;
+
+  static inline constexpr int mantissa_explicit_bits();
+  static inline constexpr int minimum_exponent();
+  static inline constexpr int infinite_power();
+  static inline constexpr int sign_index();
+  static inline constexpr int
+  min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST
+  static inline constexpr int max_exponent_fast_path();
+  static inline constexpr int max_exponent_round_to_even();
+  static inline constexpr int min_exponent_round_to_even();
+  static inline constexpr uint64_t max_mantissa_fast_path(int64_t power);
+  static inline constexpr uint64_t
+  max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST
+  static inline constexpr int largest_power_of_ten();
+  static inline constexpr int smallest_power_of_ten();
+  static inline constexpr T exact_power_of_ten(int64_t power);
+  static inline constexpr size_t max_digits();
+  static inline constexpr equiv_uint exponent_mask();
+  static inline constexpr equiv_uint mantissa_mask();
+  static inline constexpr equiv_uint hidden_bit_mask();
+};
+
+template <typename U> struct binary_format_lookup_tables<double, U> {
+  static constexpr double powers_of_ten[] = {
+      1e0,  1e1,  1e2,  1e3,  1e4,  1e5,  1e6,  1e7,  1e8,  1e9,  1e10, 1e11,
+      1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22};
+
+  // Largest integer value v so that (5**index * v) <= 1<<53.
+  // 0x20000000000000 == 1 << 53
+  static constexpr uint64_t max_mantissa[] = {
+      0x20000000000000,
+      0x20000000000000 / 5,
+      0x20000000000000 / (5 * 5),
+      0x20000000000000 / (5 * 5 * 5),
+      0x20000000000000 / (5 * 5 * 5 * 5),
+      0x20000000000000 / (constant_55555),
+      0x20000000000000 / (constant_55555 * 5),
+      0x20000000000000 / (constant_55555 * 5 * 5),
+      0x20000000000000 / (constant_55555 * 5 * 5 * 5),
+      0x20000000000000 / (constant_55555 * 5 * 5 * 5 * 5),
+      0x20000000000000 / (constant_55555 * constant_55555),
+      0x20000000000000 / (constant_55555 * constant_55555 * 5),
+      0x20000000000000 / (constant_55555 * constant_55555 * 5 * 5),
+      0x20000000000000 / (constant_55555 * constant_55555 * 5 * 5 * 5),
+      0x20000000000000 / (constant_55555 * constant_55555 * constant_55555),
+      0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5),
+      0x20000000000000 /
+          (constant_55555 * constant_55555 * constant_55555 * 5 * 5),
+      0x20000000000000 /
+          (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5),
+      0x20000000000000 /
+          (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5 * 5),
+      0x20000000000000 /
+          (constant_55555 * constant_55555 * constant_55555 * constant_55555),
+      0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 *
+                          constant_55555 * 5),
+      0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 *
+                          constant_55555 * 5 * 5),
+      0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 *
+                          constant_55555 * 5 * 5 * 5),
+      0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 *
+                          constant_55555 * 5 * 5 * 5 * 5)};
+};
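+// Editorial note (not part of upstream fast_float): these bounds drive the
+// "fast path". If a parsed significand w satisfies w <= max_mantissa[q], then
+// w * 5^q still fits in the 53-bit double significand, so w * 10^q can be
+// computed exactly as w * powers_of_ten[q] with a single rounding.
+// Worked check for q = 1: 2^53 / 5 = 1801439850948198, and
+// 1801439850948198 * 5 = 9007199254740990 <= 2^53 = 9007199254740992.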
+
+#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE
+
+template <typename U>
+constexpr double binary_format_lookup_tables<double, U>::powers_of_ten[];
+
+template <typename U>
+constexpr uint64_t binary_format_lookup_tables<double, U>::max_mantissa[];
+
+#endif
+
+template <typename U> struct binary_format_lookup_tables<float, U> {
+  static constexpr float powers_of_ten[] = {1e0f, 1e1f, 1e2f, 1e3f, 1e4f, 1e5f,
+                                            1e6f, 1e7f, 1e8f, 1e9f, 1e10f};
+
+  // Largest integer value v so that (5**index * v) <= 1<<24.
+  // 0x1000000 == 1<<24
+  static constexpr uint64_t max_mantissa[] = {
+      0x1000000,
+      0x1000000 / 5,
+      0x1000000 / (5 * 5),
+      0x1000000 / (5 * 5 * 5),
+      0x1000000 / (5 * 5 * 5 * 5),
+      0x1000000 / (constant_55555),
+      0x1000000 / (constant_55555 * 5),
+      0x1000000 / (constant_55555 * 5 * 5),
+      0x1000000 / (constant_55555 * 5 * 5 * 5),
+      0x1000000 / (constant_55555 * 5 * 5 * 5 * 5),
+      0x1000000 / (constant_55555 * constant_55555),
+      0x1000000 / (constant_55555 * constant_55555 * 5)};
+};
+
+#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE
+
+template <typename U>
+constexpr float binary_format_lookup_tables<float, U>::powers_of_ten[];
+
+template <typename U>
+constexpr uint64_t binary_format_lookup_tables<float, U>::max_mantissa[];
+
+#endif
+
+template <>
+inline constexpr int binary_format<double>::min_exponent_fast_path() {
+#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
+  return 0;
+#else
+  return -22;
+#endif
+}
+
+template <>
+inline constexpr int binary_format<float>::min_exponent_fast_path() {
+#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
+  return 0;
+#else
+  return -10;
+#endif
+}
+
+template <>
+inline constexpr int binary_format<double>::mantissa_explicit_bits() {
+  return 52;
+}
+template <>
+inline constexpr int binary_format<float>::mantissa_explicit_bits() {
+  return 23;
+}
+
+template <>
+inline constexpr int binary_format<double>::max_exponent_round_to_even() {
+  return 23;
+}
+
+template <>
+inline constexpr int binary_format<float>::max_exponent_round_to_even() {
+  return 10;
+}
+
+template <>
+inline constexpr int binary_format<double>::min_exponent_round_to_even() {
+  return -4;
+}
+
+template <>
+inline constexpr int binary_format<float>::min_exponent_round_to_even() {
+  return -17;
+}
+
+template <> inline constexpr int binary_format<double>::minimum_exponent() {
+  return -1023;
+}
+template <> inline constexpr int binary_format<float>::minimum_exponent() {
+  return -127;
+}
+
+template <> inline constexpr int binary_format<double>::infinite_power() {
+  return 0x7FF;
+}
+template <> inline constexpr int binary_format<float>::infinite_power() {
+  return 0xFF;
+}
+
+template <> inline constexpr int binary_format<double>::sign_index() {
+  return 63;
+}
+template <> inline constexpr int binary_format<float>::sign_index() {
+  return 31;
+}
+
+template <>
+inline constexpr int binary_format<double>::max_exponent_fast_path() {
+  return 22;
+}
+template <>
+inline constexpr int binary_format<float>::max_exponent_fast_path() {
+  return 10;
+}
+
+template <>
+inline constexpr uint64_t binary_format<double>::max_mantissa_fast_path() {
+  return uint64_t(2) << mantissa_explicit_bits();
+}
+template <>
+inline constexpr uint64_t
+binary_format<double>::max_mantissa_fast_path(int64_t power) {
+  // caller is responsible to ensure that
+  // power >= 0 && power <= 22
+  //
+  // Work around clang bug https://godbolt.org/z/zedh7rrhc
+  return (void)max_mantissa[0], max_mantissa[power];
+}
+template <>
+inline constexpr uint64_t binary_format<float>::max_mantissa_fast_path() {
+  return uint64_t(2) << mantissa_explicit_bits();
+}
+template <>
+inline constexpr uint64_t
+binary_format<float>::max_mantissa_fast_path(int64_t power) {
+  // caller is responsible to ensure that
+  // power >= 0 && power <= 10
+  //
+  // Work around clang bug https://godbolt.org/z/zedh7rrhc
+  return (void)max_mantissa[0], max_mantissa[power];
+}
+
+template <>
+inline constexpr double
+binary_format<double>::exact_power_of_ten(int64_t power) {
+  // Work around clang bug https://godbolt.org/z/zedh7rrhc
+  return (void)powers_of_ten[0], powers_of_ten[power];
+}
+template <>
+inline constexpr float binary_format<float>::exact_power_of_ten(int64_t power) {
+  // Work around clang bug https://godbolt.org/z/zedh7rrhc
+  return (void)powers_of_ten[0], powers_of_ten[power];
+}
+
+template <> inline constexpr int binary_format<double>::largest_power_of_ten() {
+  return 308;
+}
+template <> inline constexpr int binary_format<float>::largest_power_of_ten() {
+  return 38;
+}
+
+template <>
+inline constexpr int binary_format<double>::smallest_power_of_ten() {
+  return -342;
+}
+template <> inline constexpr int binary_format<float>::smallest_power_of_ten() {
+  return -64;
+}
+
+template <> inline constexpr size_t binary_format<double>::max_digits() {
+  return 769;
+}
+template <> inline constexpr size_t binary_format<float>::max_digits() {
+  return 114;
+}
+
+template <>
+inline constexpr binary_format<float>::equiv_uint
+binary_format<float>::exponent_mask() {
+  return 0x7F800000;
+}
+template <>
+inline constexpr binary_format<double>::equiv_uint
+binary_format<double>::exponent_mask() {
+  return 0x7FF0000000000000;
+}
+
+template <>
+inline constexpr binary_format<float>::equiv_uint
+binary_format<float>::mantissa_mask() {
+  return 0x007FFFFF;
+}
+template <>
+inline constexpr binary_format<double>::equiv_uint
+binary_format<double>::mantissa_mask() {
+  return 0x000FFFFFFFFFFFFF;
+}
+
+template <>
+inline constexpr binary_format<float>::equiv_uint
+binary_format<float>::hidden_bit_mask() {
+  return 0x00800000;
+}
+template <>
+inline constexpr binary_format<double>::equiv_uint
+binary_format<double>::hidden_bit_mask() {
+  return 0x0010000000000000;
+}
+
+template <typename T>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void
+to_float(bool negative, adjusted_mantissa am, T &value) {
+  using fastfloat_uint = typename binary_format<T>::equiv_uint;
+  fastfloat_uint word = (fastfloat_uint)am.mantissa;
+  word |= fastfloat_uint(am.power2)
+          << binary_format<T>::mantissa_explicit_bits();
+  word |= fastfloat_uint(negative) << binary_format<T>::sign_index();
+#if FASTFLOAT_HAS_BIT_CAST
+  value = std::bit_cast<T>(word);
+#else
+  ::memcpy(&value, &word, sizeof(T));
+#endif
+}
+
+#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default
+template  struct space_lut {
+  static constexpr bool value[] = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+};
+
+#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE
+
+template  constexpr bool space_lut::value[];
+
+#endif
+
+inline constexpr bool is_space(uint8_t c) { return space_lut<>::value[c]; }
+#endif
+
+template <typename UC> static constexpr uint64_t int_cmp_zeros() {
+  static_assert((sizeof(UC) == 1) || (sizeof(UC) == 2) || (sizeof(UC) == 4),
+                "Unsupported character size");
+  return (sizeof(UC) == 1) ? 0x3030303030303030
+         : (sizeof(UC) == 2)
+             ? (uint64_t(UC('0')) << 48 | uint64_t(UC('0')) << 32 |
+                uint64_t(UC('0')) << 16 | UC('0'))
+             : (uint64_t(UC('0')) << 32 | UC('0'));
+}
+template <typename UC> static constexpr int int_cmp_len() {
+  return sizeof(uint64_t) / sizeof(UC);
+}
+template <typename UC> static constexpr UC const *str_const_nan() {
+  return nullptr;
+}
+template <> constexpr char const *str_const_nan<char>() { return "nan"; }
+template <> constexpr wchar_t const *str_const_nan<wchar_t>() { return L"nan"; }
+template <> constexpr char16_t const *str_const_nan<char16_t>() {
+  return u"nan";
+}
+template <> constexpr char32_t const *str_const_nan<char32_t>() {
+  return U"nan";
+}
+template <typename UC> static constexpr UC const *str_const_inf() {
+  return nullptr;
+}
+template <> constexpr char const *str_const_inf<char>() { return "infinity"; }
+template <> constexpr wchar_t const *str_const_inf<wchar_t>() {
+  return L"infinity";
+}
+template <> constexpr char16_t const *str_const_inf<char16_t>() {
+  return u"infinity";
+}
+template <> constexpr char32_t const *str_const_inf<char32_t>() {
+  return U"infinity";
+}
+
+template  struct int_luts {
+  static constexpr uint8_t chdigit[] = {
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   255, 255,
+      255, 255, 255, 255, 255, 10,  11,  12,  13,  14,  15,  16,  17,  18,  19,
+      20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,
+      35,  255, 255, 255, 255, 255, 255, 10,  11,  12,  13,  14,  15,  16,  17,
+      18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,
+      33,  34,  35,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255};
+
+  static constexpr size_t maxdigits_u64[] = {
+      64, 41, 32, 28, 25, 23, 22, 21, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16,
+      15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13};
+
+  static constexpr uint64_t min_safe_u64[] = {
+      9223372036854775808ull,  12157665459056928801ull, 4611686018427387904,
+      7450580596923828125,     4738381338321616896,     3909821048582988049,
+      9223372036854775808ull,  12157665459056928801ull, 10000000000000000000ull,
+      5559917313492231481,     2218611106740436992,     8650415919381337933,
+      2177953337809371136,     6568408355712890625,     1152921504606846976,
+      2862423051509815793,     6746640616477458432,     15181127029874798299ull,
+      1638400000000000000,     3243919932521508681,     6221821273427820544,
+      11592836324538749809ull, 876488338465357824,      1490116119384765625,
+      2481152873203736576,     4052555153018976267,     6502111422497947648,
+      10260628712958602189ull, 15943230000000000000ull, 787662783788549761,
+      1152921504606846976,     1667889514952984961,     2386420683693101056,
+      3379220508056640625,     4738381338321616896};
+};
+
+#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE
+
+template  constexpr uint8_t int_luts::chdigit[];
+
+template  constexpr size_t int_luts::maxdigits_u64[];
+
+template  constexpr uint64_t int_luts::min_safe_u64[];
+
+#endif
+
+template 
+fastfloat_really_inline constexpr uint8_t ch_to_digit(UC c) {
+  return int_luts<>::chdigit[static_cast(c)];
+}
+
+fastfloat_really_inline constexpr size_t max_digits_u64(int base) {
+  return int_luts<>::maxdigits_u64[base - 2];
+}
+
+// If a u64 is exactly max_digits_u64() in length, this is
+// the value below which it has definitely overflowed.
+fastfloat_really_inline constexpr uint64_t min_safe_u64(int base) {
+  return int_luts<>::min_safe_u64[base - 2];
+}
+
+} // namespace fast_float
+
+#endif
+
+
+#ifndef FASTFLOAT_FAST_FLOAT_H
+#define FASTFLOAT_FAST_FLOAT_H
+
+
+namespace fast_float {
+/**
+ * This function parses the character sequence [first,last) for a number. It
+ * parses floating-point numbers expecting a locale-indepent format equivalent
+ * to what is used by std::strtod in the default ("C") locale. The resulting
+ * floating-point value is the closest floating-point values (using either float
+ * or double), using the "round to even" convention for values that would
+ * otherwise fall right in-between two values. That is, we provide exact parsing
+ * according to the IEEE standard.
+ *
+ * Given a successful parse, the pointer (`ptr`) in the returned value is set to
+ * point right after the parsed number, and the `value` referenced is set to the
+ * parsed value. In case of error, the returned `ec` contains a representative
+ * error, otherwise the default (`std::errc()`) value is stored.
+ *
+ * The implementation does not throw and does not allocate memory (e.g., with
+ * `new` or `malloc`).
+ *
+ * Like the C++17 standard, the `fast_float::from_chars` functions take an
+ * optional last argument of the type `fast_float::chars_format`. It is a bitset
+ * value: we check whether `fmt & fast_float::chars_format::fixed` and `fmt &
+ * fast_float::chars_format::scientific` are set to determine whether we allow
+ * the fixed point and scientific notation respectively. The default is
+ * `fast_float::chars_format::general` which allows both `fixed` and
+ * `scientific`.
+ */
+template <typename T, typename UC = char,
+          typename = FASTFLOAT_ENABLE_IF(is_supported_float_type<T>())>
+FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
+from_chars(UC const *first, UC const *last, T &value,
+           chars_format fmt = chars_format::general) noexcept;
+
+/**
+ * Like from_chars, but accepts an `options` argument to govern number parsing.
+ */
+template <typename T, typename UC = char>
+FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
+from_chars_advanced(UC const *first, UC const *last, T &value,
+                    parse_options_t<UC> options) noexcept;
+/**
+ * from_chars for integer types.
+ */
+template ())>
+FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
+from_chars(UC const *first, UC const *last, T &value, int base = 10) noexcept;
+
+} // namespace fast_float
+#endif // FASTFLOAT_FAST_FLOAT_H
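+// Editorial note (not part of upstream fast_float): a minimal usage sketch of
+// the declarations above; names and values below are illustrative only.
+//
+//   const char str[] = "3.1416 remainder";
+//   double value = 0;
+//   fast_float::from_chars_result res =
+//       fast_float::from_chars(str, str + sizeof(str) - 1, value);
+//   // On success, res.ec == std::errc() and res.ptr points at " remainder".
+//   // Pass fast_float::chars_format::json to restrict parsing to RFC 8259.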
+
+#ifndef FASTFLOAT_ASCII_NUMBER_H
+#define FASTFLOAT_ASCII_NUMBER_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+#ifdef FASTFLOAT_SSE2
+#include 
+#endif
+
+#ifdef FASTFLOAT_NEON
+#include 
+#endif
+
+namespace fast_float {
+
+template <typename UC> fastfloat_really_inline constexpr bool has_simd_opt() {
+#ifdef FASTFLOAT_HAS_SIMD
+  return std::is_same<UC, char16_t>::value;
+#else
+  return false;
+#endif
+}
+
+// Next function can be micro-optimized, but compilers are entirely
+// able to optimize it well.
+template <typename UC>
+fastfloat_really_inline constexpr bool is_integer(UC c) noexcept {
+  return !(c > UC('9') || c < UC('0'));
+}
+
+fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) {
+  return (val & 0xFF00000000000000) >> 56 | (val & 0x00FF000000000000) >> 40 |
+         (val & 0x0000FF0000000000) >> 24 | (val & 0x000000FF00000000) >> 8 |
+         (val & 0x00000000FF000000) << 8 | (val & 0x0000000000FF0000) << 24 |
+         (val & 0x000000000000FF00) << 40 | (val & 0x00000000000000FF) << 56;
+}
+
+// Read 8 UC into a u64. Truncates UC if not char.
+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+read8_to_u64(const UC *chars) {
+  if (cpp20_and_in_constexpr() || !std::is_same<UC, char>::value) {
+    uint64_t val = 0;
+    for (int i = 0; i < 8; ++i) {
+      val |= uint64_t(uint8_t(*chars)) << (i * 8);
+      ++chars;
+    }
+    return val;
+  }
+  uint64_t val;
+  ::memcpy(&val, chars, sizeof(uint64_t));
+#if FASTFLOAT_IS_BIG_ENDIAN == 1
+  // Need to read as-if the number was in little-endian order.
+  val = byteswap(val);
+#endif
+  return val;
+}
+
+#ifdef FASTFLOAT_SSE2
+
+fastfloat_really_inline uint64_t simd_read8_to_u64(const __m128i data) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  const __m128i packed = _mm_packus_epi16(data, data);
+#ifdef FASTFLOAT_64BIT
+  return uint64_t(_mm_cvtsi128_si64(packed));
+#else
+  uint64_t value;
+  // Visual Studio + older versions of GCC don't support _mm_storeu_si64
+  _mm_storel_epi64(reinterpret_cast<__m128i *>(&value), packed);
+  return value;
+#endif
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
+}
+
+fastfloat_really_inline uint64_t simd_read8_to_u64(const char16_t *chars) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  return simd_read8_to_u64(
+      _mm_loadu_si128(reinterpret_cast(chars)));
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
+}
+
+#elif defined(FASTFLOAT_NEON)
+
+fastfloat_really_inline uint64_t simd_read8_to_u64(const uint16x8_t data) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  uint8x8_t utf8_packed = vmovn_u16(data);
+  return vget_lane_u64(vreinterpret_u64_u8(utf8_packed), 0);
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
+}
+
+fastfloat_really_inline uint64_t simd_read8_to_u64(const char16_t *chars) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  return simd_read8_to_u64(
+      vld1q_u16(reinterpret_cast(chars)));
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
+}
+
+#endif // FASTFLOAT_SSE2
+
+// MSVC SFINAE is broken pre-VS2017
+#if defined(_MSC_VER) && _MSC_VER <= 1900
+template 
+#else
+template ()) = 0>
+#endif
+// dummy for compile
+uint64_t simd_read8_to_u64(UC const *) {
+  return 0;
+}
+
+// credit  @aqrit
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint32_t
+parse_eight_digits_unrolled(uint64_t val) {
+  const uint64_t mask = 0x000000FF000000FF;
+  const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
+  const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
+  val -= 0x3030303030303030;
+  val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
+  val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32;
+  return uint32_t(val);
+}
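+// Editorial note (not part of upstream fast_float): sketch of the SWAR trick
+// above. After subtracting 0x3030303030303030, every byte holds a digit value
+// 0-9 (byte 0 = most significant digit, since the chars were read
+// little-endian). "(val * 10) + (val >> 8)" then leaves 10*d[i] + d[i+1] in
+// byte i, i.e. the two-digit pairs. The two multiply-accumulates weight the
+// pairs at bytes 0, 2, 4 and 6 by 10^6, 10^4, 10^2 and 10^0, and the final
+// ">> 32" extracts the 8-digit result, e.g. "12345678" gives
+// 12*10^6 + 34*10^4 + 56*10^2 + 78 = 12345678.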
+
+// Call this if chars are definitely 8 digits.
+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint32_t
+parse_eight_digits_unrolled(UC const *chars) noexcept {
+  if (cpp20_and_in_constexpr() || !has_simd_opt()) {
+    return parse_eight_digits_unrolled(read8_to_u64(chars)); // truncation okay
+  }
+  return parse_eight_digits_unrolled(simd_read8_to_u64(chars));
+}
+
+// credit @aqrit
+fastfloat_really_inline constexpr bool
+is_made_of_eight_digits_fast(uint64_t val) noexcept {
+  return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) &
+            0x8080808080808080));
+}
+
+#ifdef FASTFLOAT_HAS_SIMD
+
+// Call this if chars might not be 8 digits.
+// Using this style (instead of is_made_of_eight_digits_fast() then
+// parse_eight_digits_unrolled()) ensures we don't load SIMD registers twice.
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
+simd_parse_if_eight_digits_unrolled(const char16_t *chars,
+                                    uint64_t &i) noexcept {
+  if (cpp20_and_in_constexpr()) {
+    return false;
+  }
+#ifdef FASTFLOAT_SSE2
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  const __m128i data =
+      _mm_loadu_si128(reinterpret_cast(chars));
+
+  // (x - '0') <= 9
+  // http://0x80.pl/articles/simd-parsing-int-sequences.html
+  const __m128i t0 = _mm_add_epi16(data, _mm_set1_epi16(32720));
+  const __m128i t1 = _mm_cmpgt_epi16(t0, _mm_set1_epi16(-32759));
+
+  if (_mm_movemask_epi8(t1) == 0) {
+    i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data));
+    return true;
+  } else
+    return false;
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
+#elif defined(FASTFLOAT_NEON)
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  const uint16x8_t data = vld1q_u16(reinterpret_cast(chars));
+
+  // (x - '0') <= 9
+  // http://0x80.pl/articles/simd-parsing-int-sequences.html
+  const uint16x8_t t0 = vsubq_u16(data, vmovq_n_u16('0'));
+  const uint16x8_t mask = vcltq_u16(t0, vmovq_n_u16('9' - '0' + 1));
+
+  if (vminvq_u16(mask) == 0xFFFF) {
+    i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data));
+    return true;
+  } else
+    return false;
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
+#else
+  (void)chars;
+  (void)i;
+  return false;
+#endif // FASTFLOAT_SSE2
+}
+
+#endif // FASTFLOAT_HAS_SIMD
+
+// MSVC SFINAE is broken pre-VS2017
+#if defined(_MSC_VER) && _MSC_VER <= 1900
+template 
+#else
+template ()) = 0>
+#endif
+// dummy for compile
+bool simd_parse_if_eight_digits_unrolled(UC const *, uint64_t &) {
+  return 0;
+}
+
+template ::value) = 0>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void
+loop_parse_if_eight_digits(const UC *&p, const UC *const pend, uint64_t &i) {
+  if (!has_simd_opt()) {
+    return;
+  }
+  while ((std::distance(p, pend) >= 8) &&
+         simd_parse_if_eight_digits_unrolled(
+             p, i)) { // in rare cases, this will overflow, but that's ok
+    p += 8;
+  }
+}
+
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void
+loop_parse_if_eight_digits(const char *&p, const char *const pend,
+                           uint64_t &i) {
+  // optimizes better than parse_if_eight_digits_unrolled() for UC = char.
+  while ((std::distance(p, pend) >= 8) &&
+         is_made_of_eight_digits_fast(read8_to_u64(p))) {
+    i = i * 100000000 +
+        parse_eight_digits_unrolled(read8_to_u64(
+            p)); // in rare cases, this will overflow, but that's ok
+    p += 8;
+  }
+}
+
+enum class parse_error {
+  no_error,
+  // [JSON-only] The minus sign must be followed by an integer.
+  missing_integer_after_sign,
+  // A sign must be followed by an integer or dot.
+  missing_integer_or_dot_after_sign,
+  // [JSON-only] The integer part must not have leading zeros.
+  leading_zeros_in_integer_part,
+  // [JSON-only] The integer part must have at least one digit.
+  no_digits_in_integer_part,
+  // [JSON-only] If there is a decimal point, there must be digits in the
+  // fractional part.
+  no_digits_in_fractional_part,
+  // The mantissa must have at least one digit.
+  no_digits_in_mantissa,
+  // Scientific notation requires an exponential part.
+  missing_exponential_part,
+};
+
+template <typename UC> struct parsed_number_string_t {
+  int64_t exponent{0};
+  uint64_t mantissa{0};
+  UC const *lastmatch{nullptr};
+  bool negative{false};
+  bool valid{false};
+  bool too_many_digits{false};
+  // contains the range of the significant digits
+  span<const UC> integer{};  // non-nullable
+  span<const UC> fraction{}; // nullable
+  parse_error error{parse_error::no_error};
+};
+
+using byte_span = span<const char>;
+using parsed_number_string = parsed_number_string_t<char>;
+
+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t<UC>
+report_parse_error(UC const *p, parse_error error) {
+  parsed_number_string_t<UC> answer;
+  answer.valid = false;
+  answer.lastmatch = p;
+  answer.error = error;
+  return answer;
+}
+
+// Assuming that you use no more than 19 digits, this will
+// parse an ASCII string.
+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t<UC>
+parse_number_string(UC const *p, UC const *pend,
+                    parse_options_t<UC> options) noexcept {
+  chars_format const fmt = options.format;
+  UC const decimal_point = options.decimal_point;
+
+  parsed_number_string_t<UC> answer;
+  answer.valid = false;
+  answer.too_many_digits = false;
+  answer.negative = (*p == UC('-'));
+#ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default
+  if ((*p == UC('-')) || (!(fmt & FASTFLOAT_JSONFMT) && *p == UC('+'))) {
+#else
+  if (*p == UC('-')) { // C++17 20.19.3.(7.1) explicitly forbids '+' sign here
+#endif
+    ++p;
+    if (p == pend) {
+      return report_parse_error(
+          p, parse_error::missing_integer_or_dot_after_sign);
+    }
+    if (fmt & FASTFLOAT_JSONFMT) {
+      if (!is_integer(*p)) { // a sign must be followed by an integer
+        return report_parse_error(p,
+                                      parse_error::missing_integer_after_sign);
+      }
+    } else {
+      if (!is_integer(*p) &&
+          (*p !=
+           decimal_point)) { // a sign must be followed by an integer or the dot
+        return report_parse_error(
+            p, parse_error::missing_integer_or_dot_after_sign);
+      }
+    }
+  }
+  UC const *const start_digits = p;
+
+  uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
+
+  while ((p != pend) && is_integer(*p)) {
+    // a multiplication by 10 is cheaper than an arbitrary integer
+    // multiplication
+    i = 10 * i +
+        uint64_t(*p -
+                 UC('0')); // might overflow, we will handle the overflow later
+    ++p;
+  }
+  UC const *const end_of_integer_part = p;
+  int64_t digit_count = int64_t(end_of_integer_part - start_digits);
+  answer.integer = span<const UC>(start_digits, size_t(digit_count));
+  if (fmt & FASTFLOAT_JSONFMT) {
+    // at least 1 digit in integer part, without leading zeros
+    if (digit_count == 0) {
+      return report_parse_error(p, parse_error::no_digits_in_integer_part);
+    }
+    if ((start_digits[0] == UC('0') && digit_count > 1)) {
+      return report_parse_error(start_digits,
+                                    parse_error::leading_zeros_in_integer_part);
+    }
+  }
+
+  int64_t exponent = 0;
+  const bool has_decimal_point = (p != pend) && (*p == decimal_point);
+  if (has_decimal_point) {
+    ++p;
+    UC const *before = p;
+    // can occur at most twice without overflowing, but let it occur more, since
+    // for integers with many digits, digit parsing is the primary bottleneck.
+    loop_parse_if_eight_digits(p, pend, i);
+
+    while ((p != pend) && is_integer(*p)) {
+      uint8_t digit = uint8_t(*p - UC('0'));
+      ++p;
+      i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
+    }
+    exponent = before - p;
+    answer.fraction = span<const UC>(before, size_t(p - before));
+    digit_count -= exponent;
+  }
+  if (fmt & FASTFLOAT_JSONFMT) {
+    // at least 1 digit in fractional part
+    if (has_decimal_point && exponent == 0) {
+      return report_parse_error(p,
+                                    parse_error::no_digits_in_fractional_part);
+    }
+  } else if (digit_count ==
+             0) { // we must have encountered at least one integer!
+    return report_parse_error(p, parse_error::no_digits_in_mantissa);
+  }
+  int64_t exp_number = 0; // explicit exponential part
+  if (((fmt & chars_format::scientific) && (p != pend) &&
+       ((UC('e') == *p) || (UC('E') == *p))) ||
+      ((fmt & FASTFLOAT_FORTRANFMT) && (p != pend) &&
+       ((UC('+') == *p) || (UC('-') == *p) || (UC('d') == *p) ||
+        (UC('D') == *p)))) {
+    UC const *location_of_e = p;
+    if ((UC('e') == *p) || (UC('E') == *p) || (UC('d') == *p) ||
+        (UC('D') == *p)) {
+      ++p;
+    }
+    bool neg_exp = false;
+    if ((p != pend) && (UC('-') == *p)) {
+      neg_exp = true;
+      ++p;
+    } else if ((p != pend) &&
+               (UC('+') ==
+                *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1)
+      ++p;
+    }
+    if ((p == pend) || !is_integer(*p)) {
+      if (!(fmt & chars_format::fixed)) {
+        // The exponential part is invalid for scientific notation, so it must
+        // be a trailing token for fixed notation. However, fixed notation is
+        // disabled, so report a scientific notation error.
+        return report_parse_error(p, parse_error::missing_exponential_part);
+      }
+      // Otherwise, we will be ignoring the 'e'.
+      p = location_of_e;
+    } else {
+      while ((p != pend) && is_integer(*p)) {
+        uint8_t digit = uint8_t(*p - UC('0'));
+        if (exp_number < 0x10000000) {
+          exp_number = 10 * exp_number + digit;
+        }
+        ++p;
+      }
+      if (neg_exp) {
+        exp_number = -exp_number;
+      }
+      exponent += exp_number;
+    }
+  } else {
+    // If it is scientific and not fixed, we have to bail out.
+    if ((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) {
+      return report_parse_error(p, parse_error::missing_exponential_part);
+    }
+  }
+  answer.lastmatch = p;
+  answer.valid = true;
+
+  // If we frequently had to deal with long strings of digits,
+  // we could extend our code by using a 128-bit integer instead
+  // of a 64-bit integer. However, this is uncommon.
+  //
+  // We can deal with up to 19 digits.
+  if (digit_count > 19) { // this is uncommon
+    // It is possible that the integer had an overflow.
+    // We have to handle the case where we have 0.0000somenumber.
+    // We need to be mindful of the case where we only have zeroes...
+    // E.g., 0.000000000...000.
+    UC const *start = start_digits;
+    while ((start != pend) && (*start == UC('0') || *start == decimal_point)) {
+      if (*start == UC('0')) {
+        digit_count--;
+      }
+      start++;
+    }
+
+    if (digit_count > 19) {
+      answer.too_many_digits = true;
+      // Let us start again, this time, avoiding overflows.
+      // We don't need to check if is_integer, since we use the
+      // pre-tokenized spans from above.
+      i = 0;
+      p = answer.integer.ptr;
+      UC const *int_end = p + answer.integer.len();
+      const uint64_t minimal_nineteen_digit_integer{1000000000000000000};
+      while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
+        i = i * 10 + uint64_t(*p - UC('0'));
+        ++p;
+      }
+      if (i >= minimal_nineteen_digit_integer) { // We have a big integers
+        exponent = end_of_integer_part - p + exp_number;
+      } else { // We have a value with a fractional component.
+        p = answer.fraction.ptr;
+        UC const *frac_end = p + answer.fraction.len();
+        while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
+          i = i * 10 + uint64_t(*p - UC('0'));
+          ++p;
+        }
+        exponent = answer.fraction.ptr - p + exp_number;
+      }
+      // We have now corrected both exponent and i, to a truncated value
+    }
+  }
+  answer.exponent = exponent;
+  answer.mantissa = i;
+  return answer;
+}
+
+template <typename T, typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
+parse_int_string(UC const *p, UC const *pend, T &value, int base) {
+  from_chars_result_t<UC> answer;
+
+  UC const *const first = p;
+
+  bool negative = (*p == UC('-'));
+  if (!std::is_signed<T>::value && negative) {
+    answer.ec = std::errc::invalid_argument;
+    answer.ptr = first;
+    return answer;
+  }
+#ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default
+  if ((*p == UC('-')) || (*p == UC('+'))) {
+#else
+  if (*p == UC('-')) {
+#endif
+    ++p;
+  }
+
+  UC const *const start_num = p;
+
+  while (p != pend && *p == UC('0')) {
+    ++p;
+  }
+
+  const bool has_leading_zeros = p > start_num;
+
+  UC const *const start_digits = p;
+
+  uint64_t i = 0;
+  if (base == 10) {
+    loop_parse_if_eight_digits(p, pend, i); // use SIMD if possible
+  }
+  while (p != pend) {
+    uint8_t digit = ch_to_digit(*p);
+    if (digit >= base) {
+      break;
+    }
+    i = uint64_t(base) * i + digit; // might overflow, check this later
+    p++;
+  }
+
+  size_t digit_count = size_t(p - start_digits);
+
+  if (digit_count == 0) {
+    if (has_leading_zeros) {
+      value = 0;
+      answer.ec = std::errc();
+      answer.ptr = p;
+    } else {
+      answer.ec = std::errc::invalid_argument;
+      answer.ptr = first;
+    }
+    return answer;
+  }
+
+  answer.ptr = p;
+
+  // check u64 overflow
+  size_t max_digits = max_digits_u64(base);
+  if (digit_count > max_digits) {
+    answer.ec = std::errc::result_out_of_range;
+    return answer;
+  }
+  // this check can be eliminated for all other types, but they will all require
+  // a max_digits(base) equivalent
+  if (digit_count == max_digits && i < min_safe_u64(base)) {
+    answer.ec = std::errc::result_out_of_range;
+    return answer;
+  }
+
+  // check other types overflow
+  if (!std::is_same<T, uint64_t>::value) {
+    if (i > uint64_t(std::numeric_limits<T>::max()) + uint64_t(negative)) {
+      answer.ec = std::errc::result_out_of_range;
+      return answer;
+    }
+  }
+
+  if (negative) {
+#ifdef FASTFLOAT_VISUAL_STUDIO
+#pragma warning(push)
+#pragma warning(disable : 4146)
+#endif
+    // this weird workaround is required because:
+    // - converting unsigned to signed when its value is greater than signed max
+    // is UB pre-C++23.
+    // - reinterpret_casting (~i + 1) would work, but it is not constexpr
+    // this is always optimized into a neg instruction (note: T is an integer
+    // type)
+    value = T(-std::numeric_limits<T>::max() -
+              T(i - uint64_t(std::numeric_limits<T>::max())));
+#ifdef FASTFLOAT_VISUAL_STUDIO
+#pragma warning(pop)
+#endif
+  } else {
+    value = T(i);
+  }
+
+  answer.ec = std::errc();
+  return answer;
+}
+
+} // namespace fast_float
+
+#endif
+
+#ifndef FASTFLOAT_FAST_TABLE_H
+#define FASTFLOAT_FAST_TABLE_H
+
+#include 
+
+namespace fast_float {
+
+/**
+ * When mapping numbers from decimal to binary,
+ * we go from w * 10^q to m * 2^p but we have
+ * 10^q = 5^q * 2^q, so effectively
+ * we are trying to match
+ * w * 2^q * 5^q to m * 2^p. Thus the powers of two
+ * are not a concern since they can be represented
+ * exactly using the binary notation, only the powers of five
+ * affect the binary significand.
+ */
+
+/**
+ * The smallest non-zero float (binary64) is 2^-1074.
+ * We take as input numbers of the form w x 10^q where w < 2^64.
+ * We have that w * 10^-343  <  2^(64-344) 5^-343 < 2^-1076.
+ * However, we have that
+ * (2^64-1) * 10^-342 =  (2^64-1) * 2^-342 * 5^-342 > 2^-1074.
+ * Thus it is possible for a number of the form w * 10^-342 where
+ * w is a 64-bit value to be a non-zero floating-point number.
+ *********
+ * Any number of form w * 10^309 where w>= 1 is going to be
+ * infinite in binary64 so we never need to worry about powers
+ * of 5 greater than 308.
+ */
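+/*
+ * Editorial note (not part of upstream fast_float): a worked instance of the
+ * identity above: w * 10^q = w * 2^q * 5^q, e.g. 7 * 10^-3 = (7 * 5^-3) * 2^-3.
+ * The table below stores, for each q in [-342, 308], a 128-bit significand of
+ * 5^q, so multiplying the 64-bit decimal significand w by the table entry
+ * recovers the leading bits of w * 5^q; the 2^q factor only shifts the binary
+ * exponent.
+ */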
+template <class unused = void> struct powers_template {
+
+  constexpr static int smallest_power_of_five =
+      binary_format::smallest_power_of_ten();
+  constexpr static int largest_power_of_five =
+      binary_format::largest_power_of_ten();
+  constexpr static int number_of_entries =
+      2 * (largest_power_of_five - smallest_power_of_five + 1);
+  // Powers of five from 5^-342 all the way to 5^308 rounded toward one.
+  constexpr static uint64_t power_of_five_128[number_of_entries] = {
+      0xeef453d6923bd65a, 0x113faa2906a13b3f,
+      0x9558b4661b6565f8, 0x4ac7ca59a424c507,
+      0xbaaee17fa23ebf76, 0x5d79bcf00d2df649,
+      0xe95a99df8ace6f53, 0xf4d82c2c107973dc,
+      0x91d8a02bb6c10594, 0x79071b9b8a4be869,
+      0xb64ec836a47146f9, 0x9748e2826cdee284,
+      0xe3e27a444d8d98b7, 0xfd1b1b2308169b25,
+      0x8e6d8c6ab0787f72, 0xfe30f0f5e50e20f7,
+      0xb208ef855c969f4f, 0xbdbd2d335e51a935,
+      0xde8b2b66b3bc4723, 0xad2c788035e61382,
+      0x8b16fb203055ac76, 0x4c3bcb5021afcc31,
+      0xaddcb9e83c6b1793, 0xdf4abe242a1bbf3d,
+      0xd953e8624b85dd78, 0xd71d6dad34a2af0d,
+      0x87d4713d6f33aa6b, 0x8672648c40e5ad68,
+      0xa9c98d8ccb009506, 0x680efdaf511f18c2,
+      0xd43bf0effdc0ba48, 0x212bd1b2566def2,
+      0x84a57695fe98746d, 0x14bb630f7604b57,
+      0xa5ced43b7e3e9188, 0x419ea3bd35385e2d,
+      0xcf42894a5dce35ea, 0x52064cac828675b9,
+      0x818995ce7aa0e1b2, 0x7343efebd1940993,
+      0xa1ebfb4219491a1f, 0x1014ebe6c5f90bf8,
+      0xca66fa129f9b60a6, 0xd41a26e077774ef6,
+      0xfd00b897478238d0, 0x8920b098955522b4,
+      0x9e20735e8cb16382, 0x55b46e5f5d5535b0,
+      0xc5a890362fddbc62, 0xeb2189f734aa831d,
+      0xf712b443bbd52b7b, 0xa5e9ec7501d523e4,
+      0x9a6bb0aa55653b2d, 0x47b233c92125366e,
+      0xc1069cd4eabe89f8, 0x999ec0bb696e840a,
+      0xf148440a256e2c76, 0xc00670ea43ca250d,
+      0x96cd2a865764dbca, 0x380406926a5e5728,
+      0xbc807527ed3e12bc, 0xc605083704f5ecf2,
+      0xeba09271e88d976b, 0xf7864a44c633682e,
+      0x93445b8731587ea3, 0x7ab3ee6afbe0211d,
+      0xb8157268fdae9e4c, 0x5960ea05bad82964,
+      0xe61acf033d1a45df, 0x6fb92487298e33bd,
+      0x8fd0c16206306bab, 0xa5d3b6d479f8e056,
+      0xb3c4f1ba87bc8696, 0x8f48a4899877186c,
+      0xe0b62e2929aba83c, 0x331acdabfe94de87,
+      0x8c71dcd9ba0b4925, 0x9ff0c08b7f1d0b14,
+      0xaf8e5410288e1b6f, 0x7ecf0ae5ee44dd9,
+      0xdb71e91432b1a24a, 0xc9e82cd9f69d6150,
+      0x892731ac9faf056e, 0xbe311c083a225cd2,
+      0xab70fe17c79ac6ca, 0x6dbd630a48aaf406,
+      0xd64d3d9db981787d, 0x92cbbccdad5b108,
+      0x85f0468293f0eb4e, 0x25bbf56008c58ea5,
+      0xa76c582338ed2621, 0xaf2af2b80af6f24e,
+      0xd1476e2c07286faa, 0x1af5af660db4aee1,
+      0x82cca4db847945ca, 0x50d98d9fc890ed4d,
+      0xa37fce126597973c, 0xe50ff107bab528a0,
+      0xcc5fc196fefd7d0c, 0x1e53ed49a96272c8,
+      0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7a,
+      0x9faacf3df73609b1, 0x77b191618c54e9ac,
+      0xc795830d75038c1d, 0xd59df5b9ef6a2417,
+      0xf97ae3d0d2446f25, 0x4b0573286b44ad1d,
+      0x9becce62836ac577, 0x4ee367f9430aec32,
+      0xc2e801fb244576d5, 0x229c41f793cda73f,
+      0xf3a20279ed56d48a, 0x6b43527578c1110f,
+      0x9845418c345644d6, 0x830a13896b78aaa9,
+      0xbe5691ef416bd60c, 0x23cc986bc656d553,
+      0xedec366b11c6cb8f, 0x2cbfbe86b7ec8aa8,
+      0x94b3a202eb1c3f39, 0x7bf7d71432f3d6a9,
+      0xb9e08a83a5e34f07, 0xdaf5ccd93fb0cc53,
+      0xe858ad248f5c22c9, 0xd1b3400f8f9cff68,
+      0x91376c36d99995be, 0x23100809b9c21fa1,
+      0xb58547448ffffb2d, 0xabd40a0c2832a78a,
+      0xe2e69915b3fff9f9, 0x16c90c8f323f516c,
+      0x8dd01fad907ffc3b, 0xae3da7d97f6792e3,
+      0xb1442798f49ffb4a, 0x99cd11cfdf41779c,
+      0xdd95317f31c7fa1d, 0x40405643d711d583,
+      0x8a7d3eef7f1cfc52, 0x482835ea666b2572,
+      0xad1c8eab5ee43b66, 0xda3243650005eecf,
+      0xd863b256369d4a40, 0x90bed43e40076a82,
+      0x873e4f75e2224e68, 0x5a7744a6e804a291,
+      0xa90de3535aaae202, 0x711515d0a205cb36,
+      0xd3515c2831559a83, 0xd5a5b44ca873e03,
+      0x8412d9991ed58091, 0xe858790afe9486c2,
+      0xa5178fff668ae0b6, 0x626e974dbe39a872,
+      0xce5d73ff402d98e3, 0xfb0a3d212dc8128f,
+      0x80fa687f881c7f8e, 0x7ce66634bc9d0b99,
+      0xa139029f6a239f72, 0x1c1fffc1ebc44e80,
+      0xc987434744ac874e, 0xa327ffb266b56220,
+      0xfbe9141915d7a922, 0x4bf1ff9f0062baa8,
+      0x9d71ac8fada6c9b5, 0x6f773fc3603db4a9,
+      0xc4ce17b399107c22, 0xcb550fb4384d21d3,
+      0xf6019da07f549b2b, 0x7e2a53a146606a48,
+      0x99c102844f94e0fb, 0x2eda7444cbfc426d,
+      0xc0314325637a1939, 0xfa911155fefb5308,
+      0xf03d93eebc589f88, 0x793555ab7eba27ca,
+      0x96267c7535b763b5, 0x4bc1558b2f3458de,
+      0xbbb01b9283253ca2, 0x9eb1aaedfb016f16,
+      0xea9c227723ee8bcb, 0x465e15a979c1cadc,
+      0x92a1958a7675175f, 0xbfacd89ec191ec9,
+      0xb749faed14125d36, 0xcef980ec671f667b,
+      0xe51c79a85916f484, 0x82b7e12780e7401a,
+      0x8f31cc0937ae58d2, 0xd1b2ecb8b0908810,
+      0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa15,
+      0xdfbdcece67006ac9, 0x67a791e093e1d49a,
+      0x8bd6a141006042bd, 0xe0c8bb2c5c6d24e0,
+      0xaecc49914078536d, 0x58fae9f773886e18,
+      0xda7f5bf590966848, 0xaf39a475506a899e,
+      0x888f99797a5e012d, 0x6d8406c952429603,
+      0xaab37fd7d8f58178, 0xc8e5087ba6d33b83,
+      0xd5605fcdcf32e1d6, 0xfb1e4a9a90880a64,
+      0x855c3be0a17fcd26, 0x5cf2eea09a55067f,
+      0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481e,
+      0xd0601d8efc57b08b, 0xf13b94daf124da26,
+      0x823c12795db6ce57, 0x76c53d08d6b70858,
+      0xa2cb1717b52481ed, 0x54768c4b0c64ca6e,
+      0xcb7ddcdda26da268, 0xa9942f5dcf7dfd09,
+      0xfe5d54150b090b02, 0xd3f93b35435d7c4c,
+      0x9efa548d26e5a6e1, 0xc47bc5014a1a6daf,
+      0xc6b8e9b0709f109a, 0x359ab6419ca1091b,
+      0xf867241c8cc6d4c0, 0xc30163d203c94b62,
+      0x9b407691d7fc44f8, 0x79e0de63425dcf1d,
+      0xc21094364dfb5636, 0x985915fc12f542e4,
+      0xf294b943e17a2bc4, 0x3e6f5b7b17b2939d,
+      0x979cf3ca6cec5b5a, 0xa705992ceecf9c42,
+      0xbd8430bd08277231, 0x50c6ff782a838353,
+      0xece53cec4a314ebd, 0xa4f8bf5635246428,
+      0x940f4613ae5ed136, 0x871b7795e136be99,
+      0xb913179899f68584, 0x28e2557b59846e3f,
+      0xe757dd7ec07426e5, 0x331aeada2fe589cf,
+      0x9096ea6f3848984f, 0x3ff0d2c85def7621,
+      0xb4bca50b065abe63, 0xfed077a756b53a9,
+      0xe1ebce4dc7f16dfb, 0xd3e8495912c62894,
+      0x8d3360f09cf6e4bd, 0x64712dd7abbbd95c,
+      0xb080392cc4349dec, 0xbd8d794d96aacfb3,
+      0xdca04777f541c567, 0xecf0d7a0fc5583a0,
+      0x89e42caaf9491b60, 0xf41686c49db57244,
+      0xac5d37d5b79b6239, 0x311c2875c522ced5,
+      0xd77485cb25823ac7, 0x7d633293366b828b,
+      0x86a8d39ef77164bc, 0xae5dff9c02033197,
+      0xa8530886b54dbdeb, 0xd9f57f830283fdfc,
+      0xd267caa862a12d66, 0xd072df63c324fd7b,
+      0x8380dea93da4bc60, 0x4247cb9e59f71e6d,
+      0xa46116538d0deb78, 0x52d9be85f074e608,
+      0xcd795be870516656, 0x67902e276c921f8b,
+      0x806bd9714632dff6, 0xba1cd8a3db53b6,
+      0xa086cfcd97bf97f3, 0x80e8a40eccd228a4,
+      0xc8a883c0fdaf7df0, 0x6122cd128006b2cd,
+      0xfad2a4b13d1b5d6c, 0x796b805720085f81,
+      0x9cc3a6eec6311a63, 0xcbe3303674053bb0,
+      0xc3f490aa77bd60fc, 0xbedbfc4411068a9c,
+      0xf4f1b4d515acb93b, 0xee92fb5515482d44,
+      0x991711052d8bf3c5, 0x751bdd152d4d1c4a,
+      0xbf5cd54678eef0b6, 0xd262d45a78a0635d,
+      0xef340a98172aace4, 0x86fb897116c87c34,
+      0x9580869f0e7aac0e, 0xd45d35e6ae3d4da0,
+      0xbae0a846d2195712, 0x8974836059cca109,
+      0xe998d258869facd7, 0x2bd1a438703fc94b,
+      0x91ff83775423cc06, 0x7b6306a34627ddcf,
+      0xb67f6455292cbf08, 0x1a3bc84c17b1d542,
+      0xe41f3d6a7377eeca, 0x20caba5f1d9e4a93,
+      0x8e938662882af53e, 0x547eb47b7282ee9c,
+      0xb23867fb2a35b28d, 0xe99e619a4f23aa43,
+      0xdec681f9f4c31f31, 0x6405fa00e2ec94d4,
+      0x8b3c113c38f9f37e, 0xde83bc408dd3dd04,
+      0xae0b158b4738705e, 0x9624ab50b148d445,
+      0xd98ddaee19068c76, 0x3badd624dd9b0957,
+      0x87f8a8d4cfa417c9, 0xe54ca5d70a80e5d6,
+      0xa9f6d30a038d1dbc, 0x5e9fcf4ccd211f4c,
+      0xd47487cc8470652b, 0x7647c3200069671f,
+      0x84c8d4dfd2c63f3b, 0x29ecd9f40041e073,
+      0xa5fb0a17c777cf09, 0xf468107100525890,
+      0xcf79cc9db955c2cc, 0x7182148d4066eeb4,
+      0x81ac1fe293d599bf, 0xc6f14cd848405530,
+      0xa21727db38cb002f, 0xb8ada00e5a506a7c,
+      0xca9cf1d206fdc03b, 0xa6d90811f0e4851c,
+      0xfd442e4688bd304a, 0x908f4a166d1da663,
+      0x9e4a9cec15763e2e, 0x9a598e4e043287fe,
+      0xc5dd44271ad3cdba, 0x40eff1e1853f29fd,
+      0xf7549530e188c128, 0xd12bee59e68ef47c,
+      0x9a94dd3e8cf578b9, 0x82bb74f8301958ce,
+      0xc13a148e3032d6e7, 0xe36a52363c1faf01,
+      0xf18899b1bc3f8ca1, 0xdc44e6c3cb279ac1,
+      0x96f5600f15a7b7e5, 0x29ab103a5ef8c0b9,
+      0xbcb2b812db11a5de, 0x7415d448f6b6f0e7,
+      0xebdf661791d60f56, 0x111b495b3464ad21,
+      0x936b9fcebb25c995, 0xcab10dd900beec34,
+      0xb84687c269ef3bfb, 0x3d5d514f40eea742,
+      0xe65829b3046b0afa, 0xcb4a5a3112a5112,
+      0x8ff71a0fe2c2e6dc, 0x47f0e785eaba72ab,
+      0xb3f4e093db73a093, 0x59ed216765690f56,
+      0xe0f218b8d25088b8, 0x306869c13ec3532c,
+      0x8c974f7383725573, 0x1e414218c73a13fb,
+      0xafbd2350644eeacf, 0xe5d1929ef90898fa,
+      0xdbac6c247d62a583, 0xdf45f746b74abf39,
+      0x894bc396ce5da772, 0x6b8bba8c328eb783,
+      0xab9eb47c81f5114f, 0x66ea92f3f326564,
+      0xd686619ba27255a2, 0xc80a537b0efefebd,
+      0x8613fd0145877585, 0xbd06742ce95f5f36,
+      0xa798fc4196e952e7, 0x2c48113823b73704,
+      0xd17f3b51fca3a7a0, 0xf75a15862ca504c5,
+      0x82ef85133de648c4, 0x9a984d73dbe722fb,
+      0xa3ab66580d5fdaf5, 0xc13e60d0d2e0ebba,
+      0xcc963fee10b7d1b3, 0x318df905079926a8,
+      0xffbbcfe994e5c61f, 0xfdf17746497f7052,
+      0x9fd561f1fd0f9bd3, 0xfeb6ea8bedefa633,
+      0xc7caba6e7c5382c8, 0xfe64a52ee96b8fc0,
+      0xf9bd690a1b68637b, 0x3dfdce7aa3c673b0,
+      0x9c1661a651213e2d, 0x6bea10ca65c084e,
+      0xc31bfa0fe5698db8, 0x486e494fcff30a62,
+      0xf3e2f893dec3f126, 0x5a89dba3c3efccfa,
+      0x986ddb5c6b3a76b7, 0xf89629465a75e01c,
+      0xbe89523386091465, 0xf6bbb397f1135823,
+      0xee2ba6c0678b597f, 0x746aa07ded582e2c,
+      0x94db483840b717ef, 0xa8c2a44eb4571cdc,
+      0xba121a4650e4ddeb, 0x92f34d62616ce413,
+      0xe896a0d7e51e1566, 0x77b020baf9c81d17,
+      0x915e2486ef32cd60, 0xace1474dc1d122e,
+      0xb5b5ada8aaff80b8, 0xd819992132456ba,
+      0xe3231912d5bf60e6, 0x10e1fff697ed6c69,
+      0x8df5efabc5979c8f, 0xca8d3ffa1ef463c1,
+      0xb1736b96b6fd83b3, 0xbd308ff8a6b17cb2,
+      0xddd0467c64bce4a0, 0xac7cb3f6d05ddbde,
+      0x8aa22c0dbef60ee4, 0x6bcdf07a423aa96b,
+      0xad4ab7112eb3929d, 0x86c16c98d2c953c6,
+      0xd89d64d57a607744, 0xe871c7bf077ba8b7,
+      0x87625f056c7c4a8b, 0x11471cd764ad4972,
+      0xa93af6c6c79b5d2d, 0xd598e40d3dd89bcf,
+      0xd389b47879823479, 0x4aff1d108d4ec2c3,
+      0x843610cb4bf160cb, 0xcedf722a585139ba,
+      0xa54394fe1eedb8fe, 0xc2974eb4ee658828,
+      0xce947a3da6a9273e, 0x733d226229feea32,
+      0x811ccc668829b887, 0x806357d5a3f525f,
+      0xa163ff802a3426a8, 0xca07c2dcb0cf26f7,
+      0xc9bcff6034c13052, 0xfc89b393dd02f0b5,
+      0xfc2c3f3841f17c67, 0xbbac2078d443ace2,
+      0x9d9ba7832936edc0, 0xd54b944b84aa4c0d,
+      0xc5029163f384a931, 0xa9e795e65d4df11,
+      0xf64335bcf065d37d, 0x4d4617b5ff4a16d5,
+      0x99ea0196163fa42e, 0x504bced1bf8e4e45,
+      0xc06481fb9bcf8d39, 0xe45ec2862f71e1d6,
+      0xf07da27a82c37088, 0x5d767327bb4e5a4c,
+      0x964e858c91ba2655, 0x3a6a07f8d510f86f,
+      0xbbe226efb628afea, 0x890489f70a55368b,
+      0xeadab0aba3b2dbe5, 0x2b45ac74ccea842e,
+      0x92c8ae6b464fc96f, 0x3b0b8bc90012929d,
+      0xb77ada0617e3bbcb, 0x9ce6ebb40173744,
+      0xe55990879ddcaabd, 0xcc420a6a101d0515,
+      0x8f57fa54c2a9eab6, 0x9fa946824a12232d,
+      0xb32df8e9f3546564, 0x47939822dc96abf9,
+      0xdff9772470297ebd, 0x59787e2b93bc56f7,
+      0x8bfbea76c619ef36, 0x57eb4edb3c55b65a,
+      0xaefae51477a06b03, 0xede622920b6b23f1,
+      0xdab99e59958885c4, 0xe95fab368e45eced,
+      0x88b402f7fd75539b, 0x11dbcb0218ebb414,
+      0xaae103b5fcd2a881, 0xd652bdc29f26a119,
+      0xd59944a37c0752a2, 0x4be76d3346f0495f,
+      0x857fcae62d8493a5, 0x6f70a4400c562ddb,
+      0xa6dfbd9fb8e5b88e, 0xcb4ccd500f6bb952,
+      0xd097ad07a71f26b2, 0x7e2000a41346a7a7,
+      0x825ecc24c873782f, 0x8ed400668c0c28c8,
+      0xa2f67f2dfa90563b, 0x728900802f0f32fa,
+      0xcbb41ef979346bca, 0x4f2b40a03ad2ffb9,
+      0xfea126b7d78186bc, 0xe2f610c84987bfa8,
+      0x9f24b832e6b0f436, 0xdd9ca7d2df4d7c9,
+      0xc6ede63fa05d3143, 0x91503d1c79720dbb,
+      0xf8a95fcf88747d94, 0x75a44c6397ce912a,
+      0x9b69dbe1b548ce7c, 0xc986afbe3ee11aba,
+      0xc24452da229b021b, 0xfbe85badce996168,
+      0xf2d56790ab41c2a2, 0xfae27299423fb9c3,
+      0x97c560ba6b0919a5, 0xdccd879fc967d41a,
+      0xbdb6b8e905cb600f, 0x5400e987bbc1c920,
+      0xed246723473e3813, 0x290123e9aab23b68,
+      0x9436c0760c86e30b, 0xf9a0b6720aaf6521,
+      0xb94470938fa89bce, 0xf808e40e8d5b3e69,
+      0xe7958cb87392c2c2, 0xb60b1d1230b20e04,
+      0x90bd77f3483bb9b9, 0xb1c6f22b5e6f48c2,
+      0xb4ecd5f01a4aa828, 0x1e38aeb6360b1af3,
+      0xe2280b6c20dd5232, 0x25c6da63c38de1b0,
+      0x8d590723948a535f, 0x579c487e5a38ad0e,
+      0xb0af48ec79ace837, 0x2d835a9df0c6d851,
+      0xdcdb1b2798182244, 0xf8e431456cf88e65,
+      0x8a08f0f8bf0f156b, 0x1b8e9ecb641b58ff,
+      0xac8b2d36eed2dac5, 0xe272467e3d222f3f,
+      0xd7adf884aa879177, 0x5b0ed81dcc6abb0f,
+      0x86ccbb52ea94baea, 0x98e947129fc2b4e9,
+      0xa87fea27a539e9a5, 0x3f2398d747b36224,
+      0xd29fe4b18e88640e, 0x8eec7f0d19a03aad,
+      0x83a3eeeef9153e89, 0x1953cf68300424ac,
+      0xa48ceaaab75a8e2b, 0x5fa8c3423c052dd7,
+      0xcdb02555653131b6, 0x3792f412cb06794d,
+      0x808e17555f3ebf11, 0xe2bbd88bbee40bd0,
+      0xa0b19d2ab70e6ed6, 0x5b6aceaeae9d0ec4,
+      0xc8de047564d20a8b, 0xf245825a5a445275,
+      0xfb158592be068d2e, 0xeed6e2f0f0d56712,
+      0x9ced737bb6c4183d, 0x55464dd69685606b,
+      0xc428d05aa4751e4c, 0xaa97e14c3c26b886,
+      0xf53304714d9265df, 0xd53dd99f4b3066a8,
+      0x993fe2c6d07b7fab, 0xe546a8038efe4029,
+      0xbf8fdb78849a5f96, 0xde98520472bdd033,
+      0xef73d256a5c0f77c, 0x963e66858f6d4440,
+      0x95a8637627989aad, 0xdde7001379a44aa8,
+      0xbb127c53b17ec159, 0x5560c018580d5d52,
+      0xe9d71b689dde71af, 0xaab8f01e6e10b4a6,
+      0x9226712162ab070d, 0xcab3961304ca70e8,
+      0xb6b00d69bb55c8d1, 0x3d607b97c5fd0d22,
+      0xe45c10c42a2b3b05, 0x8cb89a7db77c506a,
+      0x8eb98a7a9a5b04e3, 0x77f3608e92adb242,
+      0xb267ed1940f1c61c, 0x55f038b237591ed3,
+      0xdf01e85f912e37a3, 0x6b6c46dec52f6688,
+      0x8b61313bbabce2c6, 0x2323ac4b3b3da015,
+      0xae397d8aa96c1b77, 0xabec975e0a0d081a,
+      0xd9c7dced53c72255, 0x96e7bd358c904a21,
+      0x881cea14545c7575, 0x7e50d64177da2e54,
+      0xaa242499697392d2, 0xdde50bd1d5d0b9e9,
+      0xd4ad2dbfc3d07787, 0x955e4ec64b44e864,
+      0x84ec3c97da624ab4, 0xbd5af13bef0b113e,
+      0xa6274bbdd0fadd61, 0xecb1ad8aeacdd58e,
+      0xcfb11ead453994ba, 0x67de18eda5814af2,
+      0x81ceb32c4b43fcf4, 0x80eacf948770ced7,
+      0xa2425ff75e14fc31, 0xa1258379a94d028d,
+      0xcad2f7f5359a3b3e, 0x96ee45813a04330,
+      0xfd87b5f28300ca0d, 0x8bca9d6e188853fc,
+      0x9e74d1b791e07e48, 0x775ea264cf55347e,
+      0xc612062576589dda, 0x95364afe032a819e,
+      0xf79687aed3eec551, 0x3a83ddbd83f52205,
+      0x9abe14cd44753b52, 0xc4926a9672793543,
+      0xc16d9a0095928a27, 0x75b7053c0f178294,
+      0xf1c90080baf72cb1, 0x5324c68b12dd6339,
+      0x971da05074da7bee, 0xd3f6fc16ebca5e04,
+      0xbce5086492111aea, 0x88f4bb1ca6bcf585,
+      0xec1e4a7db69561a5, 0x2b31e9e3d06c32e6,
+      0x9392ee8e921d5d07, 0x3aff322e62439fd0,
+      0xb877aa3236a4b449, 0x9befeb9fad487c3,
+      0xe69594bec44de15b, 0x4c2ebe687989a9b4,
+      0x901d7cf73ab0acd9, 0xf9d37014bf60a11,
+      0xb424dc35095cd80f, 0x538484c19ef38c95,
+      0xe12e13424bb40e13, 0x2865a5f206b06fba,
+      0x8cbccc096f5088cb, 0xf93f87b7442e45d4,
+      0xafebff0bcb24aafe, 0xf78f69a51539d749,
+      0xdbe6fecebdedd5be, 0xb573440e5a884d1c,
+      0x89705f4136b4a597, 0x31680a88f8953031,
+      0xabcc77118461cefc, 0xfdc20d2b36ba7c3e,
+      0xd6bf94d5e57a42bc, 0x3d32907604691b4d,
+      0x8637bd05af6c69b5, 0xa63f9a49c2c1b110,
+      0xa7c5ac471b478423, 0xfcf80dc33721d54,
+      0xd1b71758e219652b, 0xd3c36113404ea4a9,
+      0x83126e978d4fdf3b, 0x645a1cac083126ea,
+      0xa3d70a3d70a3d70a, 0x3d70a3d70a3d70a4,
+      0xcccccccccccccccc, 0xcccccccccccccccd,
+      0x8000000000000000, 0x0,
+      0xa000000000000000, 0x0,
+      0xc800000000000000, 0x0,
+      0xfa00000000000000, 0x0,
+      0x9c40000000000000, 0x0,
+      0xc350000000000000, 0x0,
+      0xf424000000000000, 0x0,
+      0x9896800000000000, 0x0,
+      0xbebc200000000000, 0x0,
+      0xee6b280000000000, 0x0,
+      0x9502f90000000000, 0x0,
+      0xba43b74000000000, 0x0,
+      0xe8d4a51000000000, 0x0,
+      0x9184e72a00000000, 0x0,
+      0xb5e620f480000000, 0x0,
+      0xe35fa931a0000000, 0x0,
+      0x8e1bc9bf04000000, 0x0,
+      0xb1a2bc2ec5000000, 0x0,
+      0xde0b6b3a76400000, 0x0,
+      0x8ac7230489e80000, 0x0,
+      0xad78ebc5ac620000, 0x0,
+      0xd8d726b7177a8000, 0x0,
+      0x878678326eac9000, 0x0,
+      0xa968163f0a57b400, 0x0,
+      0xd3c21bcecceda100, 0x0,
+      0x84595161401484a0, 0x0,
+      0xa56fa5b99019a5c8, 0x0,
+      0xcecb8f27f4200f3a, 0x0,
+      0x813f3978f8940984, 0x4000000000000000,
+      0xa18f07d736b90be5, 0x5000000000000000,
+      0xc9f2c9cd04674ede, 0xa400000000000000,
+      0xfc6f7c4045812296, 0x4d00000000000000,
+      0x9dc5ada82b70b59d, 0xf020000000000000,
+      0xc5371912364ce305, 0x6c28000000000000,
+      0xf684df56c3e01bc6, 0xc732000000000000,
+      0x9a130b963a6c115c, 0x3c7f400000000000,
+      0xc097ce7bc90715b3, 0x4b9f100000000000,
+      0xf0bdc21abb48db20, 0x1e86d40000000000,
+      0x96769950b50d88f4, 0x1314448000000000,
+      0xbc143fa4e250eb31, 0x17d955a000000000,
+      0xeb194f8e1ae525fd, 0x5dcfab0800000000,
+      0x92efd1b8d0cf37be, 0x5aa1cae500000000,
+      0xb7abc627050305ad, 0xf14a3d9e40000000,
+      0xe596b7b0c643c719, 0x6d9ccd05d0000000,
+      0x8f7e32ce7bea5c6f, 0xe4820023a2000000,
+      0xb35dbf821ae4f38b, 0xdda2802c8a800000,
+      0xe0352f62a19e306e, 0xd50b2037ad200000,
+      0x8c213d9da502de45, 0x4526f422cc340000,
+      0xaf298d050e4395d6, 0x9670b12b7f410000,
+      0xdaf3f04651d47b4c, 0x3c0cdd765f114000,
+      0x88d8762bf324cd0f, 0xa5880a69fb6ac800,
+      0xab0e93b6efee0053, 0x8eea0d047a457a00,
+      0xd5d238a4abe98068, 0x72a4904598d6d880,
+      0x85a36366eb71f041, 0x47a6da2b7f864750,
+      0xa70c3c40a64e6c51, 0x999090b65f67d924,
+      0xd0cf4b50cfe20765, 0xfff4b4e3f741cf6d,
+      0x82818f1281ed449f, 0xbff8f10e7a8921a4,
+      0xa321f2d7226895c7, 0xaff72d52192b6a0d,
+      0xcbea6f8ceb02bb39, 0x9bf4f8a69f764490,
+      0xfee50b7025c36a08, 0x2f236d04753d5b4,
+      0x9f4f2726179a2245, 0x1d762422c946590,
+      0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef5,
+      0xf8ebad2b84e0d58b, 0xd2e0898765a7deb2,
+      0x9b934c3b330c8577, 0x63cc55f49f88eb2f,
+      0xc2781f49ffcfa6d5, 0x3cbf6b71c76b25fb,
+      0xf316271c7fc3908a, 0x8bef464e3945ef7a,
+      0x97edd871cfda3a56, 0x97758bf0e3cbb5ac,
+      0xbde94e8e43d0c8ec, 0x3d52eeed1cbea317,
+      0xed63a231d4c4fb27, 0x4ca7aaa863ee4bdd,
+      0x945e455f24fb1cf8, 0x8fe8caa93e74ef6a,
+      0xb975d6b6ee39e436, 0xb3e2fd538e122b44,
+      0xe7d34c64a9c85d44, 0x60dbbca87196b616,
+      0x90e40fbeea1d3a4a, 0xbc8955e946fe31cd,
+      0xb51d13aea4a488dd, 0x6babab6398bdbe41,
+      0xe264589a4dcdab14, 0xc696963c7eed2dd1,
+      0x8d7eb76070a08aec, 0xfc1e1de5cf543ca2,
+      0xb0de65388cc8ada8, 0x3b25a55f43294bcb,
+      0xdd15fe86affad912, 0x49ef0eb713f39ebe,
+      0x8a2dbf142dfcc7ab, 0x6e3569326c784337,
+      0xacb92ed9397bf996, 0x49c2c37f07965404,
+      0xd7e77a8f87daf7fb, 0xdc33745ec97be906,
+      0x86f0ac99b4e8dafd, 0x69a028bb3ded71a3,
+      0xa8acd7c0222311bc, 0xc40832ea0d68ce0c,
+      0xd2d80db02aabd62b, 0xf50a3fa490c30190,
+      0x83c7088e1aab65db, 0x792667c6da79e0fa,
+      0xa4b8cab1a1563f52, 0x577001b891185938,
+      0xcde6fd5e09abcf26, 0xed4c0226b55e6f86,
+      0x80b05e5ac60b6178, 0x544f8158315b05b4,
+      0xa0dc75f1778e39d6, 0x696361ae3db1c721,
+      0xc913936dd571c84c, 0x3bc3a19cd1e38e9,
+      0xfb5878494ace3a5f, 0x4ab48a04065c723,
+      0x9d174b2dcec0e47b, 0x62eb0d64283f9c76,
+      0xc45d1df942711d9a, 0x3ba5d0bd324f8394,
+      0xf5746577930d6500, 0xca8f44ec7ee36479,
+      0x9968bf6abbe85f20, 0x7e998b13cf4e1ecb,
+      0xbfc2ef456ae276e8, 0x9e3fedd8c321a67e,
+      0xefb3ab16c59b14a2, 0xc5cfe94ef3ea101e,
+      0x95d04aee3b80ece5, 0xbba1f1d158724a12,
+      0xbb445da9ca61281f, 0x2a8a6e45ae8edc97,
+      0xea1575143cf97226, 0xf52d09d71a3293bd,
+      0x924d692ca61be758, 0x593c2626705f9c56,
+      0xb6e0c377cfa2e12e, 0x6f8b2fb00c77836c,
+      0xe498f455c38b997a, 0xb6dfb9c0f956447,
+      0x8edf98b59a373fec, 0x4724bd4189bd5eac,
+      0xb2977ee300c50fe7, 0x58edec91ec2cb657,
+      0xdf3d5e9bc0f653e1, 0x2f2967b66737e3ed,
+      0x8b865b215899f46c, 0xbd79e0d20082ee74,
+      0xae67f1e9aec07187, 0xecd8590680a3aa11,
+      0xda01ee641a708de9, 0xe80e6f4820cc9495,
+      0x884134fe908658b2, 0x3109058d147fdcdd,
+      0xaa51823e34a7eede, 0xbd4b46f0599fd415,
+      0xd4e5e2cdc1d1ea96, 0x6c9e18ac7007c91a,
+      0x850fadc09923329e, 0x3e2cf6bc604ddb0,
+      0xa6539930bf6bff45, 0x84db8346b786151c,
+      0xcfe87f7cef46ff16, 0xe612641865679a63,
+      0x81f14fae158c5f6e, 0x4fcb7e8f3f60c07e,
+      0xa26da3999aef7749, 0xe3be5e330f38f09d,
+      0xcb090c8001ab551c, 0x5cadf5bfd3072cc5,
+      0xfdcb4fa002162a63, 0x73d9732fc7c8f7f6,
+      0x9e9f11c4014dda7e, 0x2867e7fddcdd9afa,
+      0xc646d63501a1511d, 0xb281e1fd541501b8,
+      0xf7d88bc24209a565, 0x1f225a7ca91a4226,
+      0x9ae757596946075f, 0x3375788de9b06958,
+      0xc1a12d2fc3978937, 0x52d6b1641c83ae,
+      0xf209787bb47d6b84, 0xc0678c5dbd23a49a,
+      0x9745eb4d50ce6332, 0xf840b7ba963646e0,
+      0xbd176620a501fbff, 0xb650e5a93bc3d898,
+      0xec5d3fa8ce427aff, 0xa3e51f138ab4cebe,
+      0x93ba47c980e98cdf, 0xc66f336c36b10137,
+      0xb8a8d9bbe123f017, 0xb80b0047445d4184,
+      0xe6d3102ad96cec1d, 0xa60dc059157491e5,
+      0x9043ea1ac7e41392, 0x87c89837ad68db2f,
+      0xb454e4a179dd1877, 0x29babe4598c311fb,
+      0xe16a1dc9d8545e94, 0xf4296dd6fef3d67a,
+      0x8ce2529e2734bb1d, 0x1899e4a65f58660c,
+      0xb01ae745b101e9e4, 0x5ec05dcff72e7f8f,
+      0xdc21a1171d42645d, 0x76707543f4fa1f73,
+      0x899504ae72497eba, 0x6a06494a791c53a8,
+      0xabfa45da0edbde69, 0x487db9d17636892,
+      0xd6f8d7509292d603, 0x45a9d2845d3c42b6,
+      0x865b86925b9bc5c2, 0xb8a2392ba45a9b2,
+      0xa7f26836f282b732, 0x8e6cac7768d7141e,
+      0xd1ef0244af2364ff, 0x3207d795430cd926,
+      0x8335616aed761f1f, 0x7f44e6bd49e807b8,
+      0xa402b9c5a8d3a6e7, 0x5f16206c9c6209a6,
+      0xcd036837130890a1, 0x36dba887c37a8c0f,
+      0x802221226be55a64, 0xc2494954da2c9789,
+      0xa02aa96b06deb0fd, 0xf2db9baa10b7bd6c,
+      0xc83553c5c8965d3d, 0x6f92829494e5acc7,
+      0xfa42a8b73abbf48c, 0xcb772339ba1f17f9,
+      0x9c69a97284b578d7, 0xff2a760414536efb,
+      0xc38413cf25e2d70d, 0xfef5138519684aba,
+      0xf46518c2ef5b8cd1, 0x7eb258665fc25d69,
+      0x98bf2f79d5993802, 0xef2f773ffbd97a61,
+      0xbeeefb584aff8603, 0xaafb550ffacfd8fa,
+      0xeeaaba2e5dbf6784, 0x95ba2a53f983cf38,
+      0x952ab45cfa97a0b2, 0xdd945a747bf26183,
+      0xba756174393d88df, 0x94f971119aeef9e4,
+      0xe912b9d1478ceb17, 0x7a37cd5601aab85d,
+      0x91abb422ccb812ee, 0xac62e055c10ab33a,
+      0xb616a12b7fe617aa, 0x577b986b314d6009,
+      0xe39c49765fdf9d94, 0xed5a7e85fda0b80b,
+      0x8e41ade9fbebc27d, 0x14588f13be847307,
+      0xb1d219647ae6b31c, 0x596eb2d8ae258fc8,
+      0xde469fbd99a05fe3, 0x6fca5f8ed9aef3bb,
+      0x8aec23d680043bee, 0x25de7bb9480d5854,
+      0xada72ccc20054ae9, 0xaf561aa79a10ae6a,
+      0xd910f7ff28069da4, 0x1b2ba1518094da04,
+      0x87aa9aff79042286, 0x90fb44d2f05d0842,
+      0xa99541bf57452b28, 0x353a1607ac744a53,
+      0xd3fa922f2d1675f2, 0x42889b8997915ce8,
+      0x847c9b5d7c2e09b7, 0x69956135febada11,
+      0xa59bc234db398c25, 0x43fab9837e699095,
+      0xcf02b2c21207ef2e, 0x94f967e45e03f4bb,
+      0x8161afb94b44f57d, 0x1d1be0eebac278f5,
+      0xa1ba1ba79e1632dc, 0x6462d92a69731732,
+      0xca28a291859bbf93, 0x7d7b8f7503cfdcfe,
+      0xfcb2cb35e702af78, 0x5cda735244c3d43e,
+      0x9defbf01b061adab, 0x3a0888136afa64a7,
+      0xc56baec21c7a1916, 0x88aaa1845b8fdd0,
+      0xf6c69a72a3989f5b, 0x8aad549e57273d45,
+      0x9a3c2087a63f6399, 0x36ac54e2f678864b,
+      0xc0cb28a98fcf3c7f, 0x84576a1bb416a7dd,
+      0xf0fdf2d3f3c30b9f, 0x656d44a2a11c51d5,
+      0x969eb7c47859e743, 0x9f644ae5a4b1b325,
+      0xbc4665b596706114, 0x873d5d9f0dde1fee,
+      0xeb57ff22fc0c7959, 0xa90cb506d155a7ea,
+      0x9316ff75dd87cbd8, 0x9a7f12442d588f2,
+      0xb7dcbf5354e9bece, 0xc11ed6d538aeb2f,
+      0xe5d3ef282a242e81, 0x8f1668c8a86da5fa,
+      0x8fa475791a569d10, 0xf96e017d694487bc,
+      0xb38d92d760ec4455, 0x37c981dcc395a9ac,
+      0xe070f78d3927556a, 0x85bbe253f47b1417,
+      0x8c469ab843b89562, 0x93956d7478ccec8e,
+      0xaf58416654a6babb, 0x387ac8d1970027b2,
+      0xdb2e51bfe9d0696a, 0x6997b05fcc0319e,
+      0x88fcf317f22241e2, 0x441fece3bdf81f03,
+      0xab3c2fddeeaad25a, 0xd527e81cad7626c3,
+      0xd60b3bd56a5586f1, 0x8a71e223d8d3b074,
+      0x85c7056562757456, 0xf6872d5667844e49,
+      0xa738c6bebb12d16c, 0xb428f8ac016561db,
+      0xd106f86e69d785c7, 0xe13336d701beba52,
+      0x82a45b450226b39c, 0xecc0024661173473,
+      0xa34d721642b06084, 0x27f002d7f95d0190,
+      0xcc20ce9bd35c78a5, 0x31ec038df7b441f4,
+      0xff290242c83396ce, 0x7e67047175a15271,
+      0x9f79a169bd203e41, 0xf0062c6e984d386,
+      0xc75809c42c684dd1, 0x52c07b78a3e60868,
+      0xf92e0c3537826145, 0xa7709a56ccdf8a82,
+      0x9bbcc7a142b17ccb, 0x88a66076400bb691,
+      0xc2abf989935ddbfe, 0x6acff893d00ea435,
+      0xf356f7ebf83552fe, 0x583f6b8c4124d43,
+      0x98165af37b2153de, 0xc3727a337a8b704a,
+      0xbe1bf1b059e9a8d6, 0x744f18c0592e4c5c,
+      0xeda2ee1c7064130c, 0x1162def06f79df73,
+      0x9485d4d1c63e8be7, 0x8addcb5645ac2ba8,
+      0xb9a74a0637ce2ee1, 0x6d953e2bd7173692,
+      0xe8111c87c5c1ba99, 0xc8fa8db6ccdd0437,
+      0x910ab1d4db9914a0, 0x1d9c9892400a22a2,
+      0xb54d5e4a127f59c8, 0x2503beb6d00cab4b,
+      0xe2a0b5dc971f303a, 0x2e44ae64840fd61d,
+      0x8da471a9de737e24, 0x5ceaecfed289e5d2,
+      0xb10d8e1456105dad, 0x7425a83e872c5f47,
+      0xdd50f1996b947518, 0xd12f124e28f77719,
+      0x8a5296ffe33cc92f, 0x82bd6b70d99aaa6f,
+      0xace73cbfdc0bfb7b, 0x636cc64d1001550b,
+      0xd8210befd30efa5a, 0x3c47f7e05401aa4e,
+      0x8714a775e3e95c78, 0x65acfaec34810a71,
+      0xa8d9d1535ce3b396, 0x7f1839a741a14d0d,
+      0xd31045a8341ca07c, 0x1ede48111209a050,
+      0x83ea2b892091e44d, 0x934aed0aab460432,
+      0xa4e4b66b68b65d60, 0xf81da84d5617853f,
+      0xce1de40642e3f4b9, 0x36251260ab9d668e,
+      0x80d2ae83e9ce78f3, 0xc1d72b7c6b426019,
+      0xa1075a24e4421730, 0xb24cf65b8612f81f,
+      0xc94930ae1d529cfc, 0xdee033f26797b627,
+      0xfb9b7cd9a4a7443c, 0x169840ef017da3b1,
+      0x9d412e0806e88aa5, 0x8e1f289560ee864e,
+      0xc491798a08a2ad4e, 0xf1a6f2bab92a27e2,
+      0xf5b5d7ec8acb58a2, 0xae10af696774b1db,
+      0x9991a6f3d6bf1765, 0xacca6da1e0a8ef29,
+      0xbff610b0cc6edd3f, 0x17fd090a58d32af3,
+      0xeff394dcff8a948e, 0xddfc4b4cef07f5b0,
+      0x95f83d0a1fb69cd9, 0x4abdaf101564f98e,
+      0xbb764c4ca7a4440f, 0x9d6d1ad41abe37f1,
+      0xea53df5fd18d5513, 0x84c86189216dc5ed,
+      0x92746b9be2f8552c, 0x32fd3cf5b4e49bb4,
+      0xb7118682dbb66a77, 0x3fbc8c33221dc2a1,
+      0xe4d5e82392a40515, 0xfabaf3feaa5334a,
+      0x8f05b1163ba6832d, 0x29cb4d87f2a7400e,
+      0xb2c71d5bca9023f8, 0x743e20e9ef511012,
+      0xdf78e4b2bd342cf6, 0x914da9246b255416,
+      0x8bab8eefb6409c1a, 0x1ad089b6c2f7548e,
+      0xae9672aba3d0c320, 0xa184ac2473b529b1,
+      0xda3c0f568cc4f3e8, 0xc9e5d72d90a2741e,
+      0x8865899617fb1871, 0x7e2fa67c7a658892,
+      0xaa7eebfb9df9de8d, 0xddbb901b98feeab7,
+      0xd51ea6fa85785631, 0x552a74227f3ea565,
+      0x8533285c936b35de, 0xd53a88958f87275f,
+      0xa67ff273b8460356, 0x8a892abaf368f137,
+      0xd01fef10a657842c, 0x2d2b7569b0432d85,
+      0x8213f56a67f6b29b, 0x9c3b29620e29fc73,
+      0xa298f2c501f45f42, 0x8349f3ba91b47b8f,
+      0xcb3f2f7642717713, 0x241c70a936219a73,
+      0xfe0efb53d30dd4d7, 0xed238cd383aa0110,
+      0x9ec95d1463e8a506, 0xf4363804324a40aa,
+      0xc67bb4597ce2ce48, 0xb143c6053edcd0d5,
+      0xf81aa16fdc1b81da, 0xdd94b7868e94050a,
+      0x9b10a4e5e9913128, 0xca7cf2b4191c8326,
+      0xc1d4ce1f63f57d72, 0xfd1c2f611f63a3f0,
+      0xf24a01a73cf2dccf, 0xbc633b39673c8cec,
+      0x976e41088617ca01, 0xd5be0503e085d813,
+      0xbd49d14aa79dbc82, 0x4b2d8644d8a74e18,
+      0xec9c459d51852ba2, 0xddf8e7d60ed1219e,
+      0x93e1ab8252f33b45, 0xcabb90e5c942b503,
+      0xb8da1662e7b00a17, 0x3d6a751f3b936243,
+      0xe7109bfba19c0c9d, 0xcc512670a783ad4,
+      0x906a617d450187e2, 0x27fb2b80668b24c5,
+      0xb484f9dc9641e9da, 0xb1f9f660802dedf6,
+      0xe1a63853bbd26451, 0x5e7873f8a0396973,
+      0x8d07e33455637eb2, 0xdb0b487b6423e1e8,
+      0xb049dc016abc5e5f, 0x91ce1a9a3d2cda62,
+      0xdc5c5301c56b75f7, 0x7641a140cc7810fb,
+      0x89b9b3e11b6329ba, 0xa9e904c87fcb0a9d,
+      0xac2820d9623bf429, 0x546345fa9fbdcd44,
+      0xd732290fbacaf133, 0xa97c177947ad4095,
+      0x867f59a9d4bed6c0, 0x49ed8eabcccc485d,
+      0xa81f301449ee8c70, 0x5c68f256bfff5a74,
+      0xd226fc195c6a2f8c, 0x73832eec6fff3111,
+      0x83585d8fd9c25db7, 0xc831fd53c5ff7eab,
+      0xa42e74f3d032f525, 0xba3e7ca8b77f5e55,
+      0xcd3a1230c43fb26f, 0x28ce1bd2e55f35eb,
+      0x80444b5e7aa7cf85, 0x7980d163cf5b81b3,
+      0xa0555e361951c366, 0xd7e105bcc332621f,
+      0xc86ab5c39fa63440, 0x8dd9472bf3fefaa7,
+      0xfa856334878fc150, 0xb14f98f6f0feb951,
+      0x9c935e00d4b9d8d2, 0x6ed1bf9a569f33d3,
+      0xc3b8358109e84f07, 0xa862f80ec4700c8,
+      0xf4a642e14c6262c8, 0xcd27bb612758c0fa,
+      0x98e7e9cccfbd7dbd, 0x8038d51cb897789c,
+      0xbf21e44003acdd2c, 0xe0470a63e6bd56c3,
+      0xeeea5d5004981478, 0x1858ccfce06cac74,
+      0x95527a5202df0ccb, 0xf37801e0c43ebc8,
+      0xbaa718e68396cffd, 0xd30560258f54e6ba,
+      0xe950df20247c83fd, 0x47c6b82ef32a2069,
+      0x91d28b7416cdd27e, 0x4cdc331d57fa5441,
+      0xb6472e511c81471d, 0xe0133fe4adf8e952,
+      0xe3d8f9e563a198e5, 0x58180fddd97723a6,
+      0x8e679c2f5e44ff8f, 0x570f09eaa7ea7648,
+  };
+};
+
+#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE
+
+template <class unused>
+constexpr uint64_t
+    powers_template<unused>::power_of_five_128[number_of_entries];
+
+#endif
+
+using powers = powers_template<>;
+
+} // namespace fast_float
+
+#endif
+
+#ifndef FASTFLOAT_DECIMAL_TO_BINARY_H
+#define FASTFLOAT_DECIMAL_TO_BINARY_H
+
+#include <cfloat>
+#include <cinttypes>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+namespace fast_float {
+
+// This will compute or rather approximate w * 5**q and return a pair of 64-bit
+// words approximating the result, with the "high" part corresponding to the
+// most significant bits and the low part corresponding to the least significant
+// bits.
+//
+template <int bit_precision>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 value128
+compute_product_approximation(int64_t q, uint64_t w) {
+  const int index = 2 * int(q - powers::smallest_power_of_five);
+  // For small values of q, e.g., q in [0,27], the answer is always exact
+  // because the line value128 firstproduct = full_multiplication(w,
+  // power_of_five_128[index]); gives the exact answer.
+  value128 firstproduct =
+      full_multiplication(w, powers::power_of_five_128[index]);
+  static_assert((bit_precision >= 0) && (bit_precision <= 64),
+                " precision should  be in (0,64]");
+  constexpr uint64_t precision_mask =
+      (bit_precision < 64) ? (uint64_t(0xFFFFFFFFFFFFFFFF) >> bit_precision)
+                           : uint64_t(0xFFFFFFFFFFFFFFFF);
+  if ((firstproduct.high & precision_mask) ==
+      precision_mask) { // could further guard with  (lower + w < lower)
+    // regarding the second product, we only need secondproduct.high, but our
+    // expectation is that the compiler will optimize this extra work away if
+    // needed.
+    value128 secondproduct =
+        full_multiplication(w, powers::power_of_five_128[index + 1]);
+    firstproduct.low += secondproduct.high;
+    if (secondproduct.high > firstproduct.low) {
+      firstproduct.high++;
+    }
+  }
+  return firstproduct;
+}
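+// Illustrative worked example (unnormalized w, for clarity only): with w = 3
+// and q = 1, the table entry for 5^1 is the pair {0xa000000000000000, 0x0},
+// i.e. 5 * 2^61, and full_multiplication(3, 0xa000000000000000) yields
+// {high = 0x1, low = 0xe000000000000000}, which is exactly 15 * 2^61 (the
+// product 3 * 5 under the same scaling), so the second product is not needed.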
+
+namespace detail {
+/**
+ * For q in (0,350), we have that
+ *  f = (((152170 + 65536) * q ) >> 16);
+ * is equal to
+ *   floor(p) + q
+ * where
+ *   p = log(5**q)/log(2) = q * log(5)/log(2)
+ *
+ * For negative values of q in (-400,0), we have that
+ *  f = (((152170 + 65536) * q ) >> 16);
+ * is equal to
+ *   -ceil(p) + q
+ * where
+ *   p = log(5**-q)/log(2) = -q * log(5)/log(2)
+ */
+constexpr fastfloat_really_inline int32_t power(int32_t q) noexcept {
+  return (((152170 + 65536) * q) >> 16) + 63;
+}
+} // namespace detail
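+// Worked example of the approximation above: for q = 27,
+//   ((152170 + 65536) * 27) >> 16 = 5878062 >> 16 = 89,
+// and floor(27 * log(5)/log(2)) + 27 = 62 + 27 = 89, so detail::power(27)
+// returns 89 + 63 = 152.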
+
+// create an adjusted mantissa, biased by the invalid power2
+// for significant digits already multiplied by 10 ** q.
+template <typename binary>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 adjusted_mantissa
+compute_error_scaled(int64_t q, uint64_t w, int lz) noexcept {
+  int hilz = int(w >> 63) ^ 1;
+  adjusted_mantissa answer;
+  answer.mantissa = w << hilz;
+  int bias = binary::mantissa_explicit_bits() - binary::minimum_exponent();
+  answer.power2 = int32_t(detail::power(int32_t(q)) + bias - hilz - lz - 62 +
+                          invalid_am_bias);
+  return answer;
+}
+
+// w * 10 ** q, without rounding the representation up.
+// the power2 in the exponent will be adjusted by invalid_am_bias.
+template <typename binary>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
+compute_error(int64_t q, uint64_t w) noexcept {
+  int lz = leading_zeroes(w);
+  w <<= lz;
+  value128 product =
+      compute_product_approximation<binary::mantissa_explicit_bits() + 3>(q, w);
+  return compute_error_scaled<binary>(q, product.high, lz);
+}
+
+// w * 10 ** q
+// The returned value should be a valid ieee64 number that simply needs to be
+// packed. However, in some very rare cases, the computation will fail. In such
+// cases, we return an adjusted_mantissa with a negative power of 2: the caller
+// should recompute in such cases.
+template <typename binary>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
+compute_float(int64_t q, uint64_t w) noexcept {
+  adjusted_mantissa answer;
+  if ((w == 0) || (q < binary::smallest_power_of_ten())) {
+    answer.power2 = 0;
+    answer.mantissa = 0;
+    // result should be zero
+    return answer;
+  }
+  if (q > binary::largest_power_of_ten()) {
+    // we want to get infinity:
+    answer.power2 = binary::infinite_power();
+    answer.mantissa = 0;
+    return answer;
+  }
+  // At this point in time q is in [powers::smallest_power_of_five,
+  // powers::largest_power_of_five].
+
+  // We want the most significant bit of i to be 1. Shift if needed.
+  int lz = leading_zeroes(w);
+  w <<= lz;
+
+  // The required precision is binary::mantissa_explicit_bits() + 3 because
+  // 1. We need the implicit bit
+  // 2. We need an extra bit for rounding purposes
+  // 3. We might lose a bit due to the "upperbit" routine (result too small,
+  // requiring a shift)
+
+  value128 product =
+      compute_product_approximation<binary::mantissa_explicit_bits() + 3>(q, w);
+  // The computed 'product' is always sufficient.
+  // Mathematical proof:
+  // Noble Mushtak and Daniel Lemire, Fast Number Parsing Without Fallback (to
+  // appear) See script/mushtak_lemire.py
+
+  // The "compute_product_approximation" function can be slightly slower than a
+  // branchless approach: value128 product = compute_product(q, w); but in
+  // practice, we can win big with the compute_product_approximation if its
+  // additional branch is easily predicted. Which is best is data specific.
+  int upperbit = int(product.high >> 63);
+  int shift = upperbit + 64 - binary::mantissa_explicit_bits() - 3;
+
+  answer.mantissa = product.high >> shift;
+
+  answer.power2 = int32_t(detail::power(int32_t(q)) + upperbit - lz -
+                          binary::minimum_exponent());
+  if (answer.power2 <= 0) { // we have a subnormal?
+    // Here we have that answer.power2 <= 0 so -answer.power2 >= 0
+    if (-answer.power2 + 1 >=
+        64) { // if we have more than 64 bits below the minimum exponent, you
+              // have a zero for sure.
+      answer.power2 = 0;
+      answer.mantissa = 0;
+      // result should be zero
+      return answer;
+    }
+    // next line is safe because -answer.power2 + 1 < 64
+    answer.mantissa >>= -answer.power2 + 1;
+    // Thankfully, we can't have both "round-to-even" and subnormals because
+    // "round-to-even" only occurs for powers close to 0.
+    answer.mantissa += (answer.mantissa & 1); // round up
+    answer.mantissa >>= 1;
+    // There is a weird scenario where we don't have a subnormal but just
+    // barely. Suppose we start with 2.2250738585072013e-308, we end up
+    // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
+    // whereas 0x40000000000000 x 2^-1023-53  is normal. Now, we need to round
+    // up 0x3fffffffffffff x 2^-1023-53  and once we do, we are no longer
+    // subnormal, but we can only know this after rounding.
+    // So we only declare a subnormal if we are smaller than the threshold.
+    answer.power2 =
+        (answer.mantissa < (uint64_t(1) << binary::mantissa_explicit_bits()))
+            ? 0
+            : 1;
+    return answer;
+  }
+
+  // usually, we round *up*, but if we fall right in between and we have an
+  // even basis, we need to round down
+  // We are only concerned with the cases where 5**q fits in a single 64-bit
+  // word.
+  if ((product.low <= 1) && (q >= binary::min_exponent_round_to_even()) &&
+      (q <= binary::max_exponent_round_to_even()) &&
+      ((answer.mantissa & 3) == 1)) { // we may fall between two floats!
+    // To be in-between two floats we need that in doing
+    //   answer.mantissa = product.high >> (upperbit + 64 -
+    //   binary::mantissa_explicit_bits() - 3);
+    // ... we dropped out only zeroes. But if this happened, then we can go
+    // back!!!
+    if ((answer.mantissa << shift) == product.high) {
+      answer.mantissa &= ~uint64_t(1); // flip it so that we do not round up
+    }
+  }
+
+  answer.mantissa += (answer.mantissa & 1); // round up
+  answer.mantissa >>= 1;
+  if (answer.mantissa >= (uint64_t(2) << binary::mantissa_explicit_bits())) {
+    answer.mantissa = (uint64_t(1) << binary::mantissa_explicit_bits());
+    answer.power2++; // undo previous addition
+  }
+
+  answer.mantissa &= ~(uint64_t(1) << binary::mantissa_explicit_bits());
+  if (answer.power2 >= binary::infinite_power()) { // infinity
+    answer.power2 = binary::infinite_power();
+    answer.mantissa = 0;
+  }
+  return answer;
+}
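+// Worked example (a sketch assuming binary = binary_format<double>, i.e.
+// mantissa_explicit_bits() == 52 and minimum_exponent() == -1023): for q = 0
+// and w = 3, the normalized w is 3 << 62, the product high word is
+// 0x6000000000000000, shift = 9, and after rounding the function returns
+// mantissa = 0x8000000000000 with power2 = 1024, which packs to the IEEE-754
+// bit pattern 0x4008000000000000, i.e. the double 3.0.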
+
+} // namespace fast_float
+
+#endif
+
+#ifndef FASTFLOAT_BIGINT_H
+#define FASTFLOAT_BIGINT_H
+
+#include <algorithm>
+#include <cstdint>
+#include <climits>
+#include <cstring>
+
+
+namespace fast_float {
+
+// the limb width: we want efficient multiplication of double the bits in
+// limb, or for 64-bit limbs, at least 64-bit multiplication where we can
+// extract the high and low parts efficiently. this is every 64-bit
+// architecture except for sparc, which emulates 128-bit multiplication.
+// we might have platforms where `CHAR_BIT` is not 8, so let's avoid
+// doing `8 * sizeof(limb)`.
+#if defined(FASTFLOAT_64BIT) && !defined(__sparc)
+#define FASTFLOAT_64BIT_LIMB 1
+typedef uint64_t limb;
+constexpr size_t limb_bits = 64;
+#else
+#define FASTFLOAT_32BIT_LIMB
+typedef uint32_t limb;
+constexpr size_t limb_bits = 32;
+#endif
+
+typedef span<limb> limb_span;
+
+// number of bits in a bigint. this needs to be at least the number
+// of bits required to store the largest bigint, which is
+// `log2(10**(digits + max_exp))`, or `log2(10**(767 + 342))`, or
+// ~3600 bits, so we round to 4000.
+constexpr size_t bigint_bits = 4000;
+constexpr size_t bigint_limbs = bigint_bits / limb_bits;
+
+// vector-like type that is allocated on the stack. the entire
+// buffer is pre-allocated, and only the length changes.
+template <uint16_t size> struct stackvec {
+  limb data[size];
+  // we never need more than 150 limbs
+  uint16_t length{0};
+
+  stackvec() = default;
+  stackvec(const stackvec &) = delete;
+  stackvec &operator=(const stackvec &) = delete;
+  stackvec(stackvec &&) = delete;
+  stackvec &operator=(stackvec &&other) = delete;
+
+  // create stack vector from existing limb span.
+  FASTFLOAT_CONSTEXPR20 stackvec(limb_span s) {
+    FASTFLOAT_ASSERT(try_extend(s));
+  }
+
+  FASTFLOAT_CONSTEXPR14 limb &operator[](size_t index) noexcept {
+    FASTFLOAT_DEBUG_ASSERT(index < length);
+    return data[index];
+  }
+  FASTFLOAT_CONSTEXPR14 const limb &operator[](size_t index) const noexcept {
+    FASTFLOAT_DEBUG_ASSERT(index < length);
+    return data[index];
+  }
+  // index from the end of the container
+  FASTFLOAT_CONSTEXPR14 const limb &rindex(size_t index) const noexcept {
+    FASTFLOAT_DEBUG_ASSERT(index < length);
+    size_t rindex = length - index - 1;
+    return data[rindex];
+  }
+
+  // set the length, without bounds checking.
+  FASTFLOAT_CONSTEXPR14 void set_len(size_t len) noexcept {
+    length = uint16_t(len);
+  }
+  constexpr size_t len() const noexcept { return length; }
+  constexpr bool is_empty() const noexcept { return length == 0; }
+  constexpr size_t capacity() const noexcept { return size; }
+  // append item to vector, without bounds checking
+  FASTFLOAT_CONSTEXPR14 void push_unchecked(limb value) noexcept {
+    data[length] = value;
+    length++;
+  }
+  // append item to vector, returning if item was added
+  FASTFLOAT_CONSTEXPR14 bool try_push(limb value) noexcept {
+    if (len() < capacity()) {
+      push_unchecked(value);
+      return true;
+    } else {
+      return false;
+    }
+  }
+  // add items to the vector, from a span, without bounds checking
+  FASTFLOAT_CONSTEXPR20 void extend_unchecked(limb_span s) noexcept {
+    limb *ptr = data + length;
+    std::copy_n(s.ptr, s.len(), ptr);
+    set_len(len() + s.len());
+  }
+  // try to add items to the vector, returning if items were added
+  FASTFLOAT_CONSTEXPR20 bool try_extend(limb_span s) noexcept {
+    if (len() + s.len() <= capacity()) {
+      extend_unchecked(s);
+      return true;
+    } else {
+      return false;
+    }
+  }
+  // resize the vector, without bounds checking
+  // if the new size is longer than the vector, assign value to each
+  // appended item.
+  FASTFLOAT_CONSTEXPR20
+  void resize_unchecked(size_t new_len, limb value) noexcept {
+    if (new_len > len()) {
+      size_t count = new_len - len();
+      limb *first = data + len();
+      limb *last = first + count;
+      ::std::fill(first, last, value);
+      set_len(new_len);
+    } else {
+      set_len(new_len);
+    }
+  }
+  // try to resize the vector, returning if the vector was resized.
+  FASTFLOAT_CONSTEXPR20 bool try_resize(size_t new_len, limb value) noexcept {
+    if (new_len > capacity()) {
+      return false;
+    } else {
+      resize_unchecked(new_len, value);
+      return true;
+    }
+  }
+  // check if any limbs are non-zero after the given index.
+  // this needs to be done in reverse order, since the index
+  // is relative to the most significant limbs.
+  FASTFLOAT_CONSTEXPR14 bool nonzero(size_t index) const noexcept {
+    while (index < len()) {
+      if (rindex(index) != 0) {
+        return true;
+      }
+      index++;
+    }
+    return false;
+  }
+  // normalize the big integer, so most-significant zero limbs are removed.
+  FASTFLOAT_CONSTEXPR14 void normalize() noexcept {
+    while (len() > 0 && rindex(0) == 0) {
+      length--;
+    }
+  }
+};
+
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t
+empty_hi64(bool &truncated) noexcept {
+  truncated = false;
+  return 0;
+}
+
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+uint64_hi64(uint64_t r0, bool &truncated) noexcept {
+  truncated = false;
+  int shl = leading_zeroes(r0);
+  return r0 << shl;
+}
+
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+uint64_hi64(uint64_t r0, uint64_t r1, bool &truncated) noexcept {
+  int shl = leading_zeroes(r0);
+  if (shl == 0) {
+    truncated = r1 != 0;
+    return r0;
+  } else {
+    int shr = 64 - shl;
+    truncated = (r1 << shl) != 0;
+    return (r0 << shl) | (r1 >> shr);
+  }
+}
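+// Example: uint64_hi64(r0 = 1, r1 = 0, truncated) shifts by shl = 63 and
+// returns 0x8000000000000000 with truncated = false; any non-zero bits of r1
+// that would fall below the returned top 64 bits set truncated = true instead.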
+
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+uint32_hi64(uint32_t r0, bool &truncated) noexcept {
+  return uint64_hi64(r0, truncated);
+}
+
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+uint32_hi64(uint32_t r0, uint32_t r1, bool &truncated) noexcept {
+  uint64_t x0 = r0;
+  uint64_t x1 = r1;
+  return uint64_hi64((x0 << 32) | x1, truncated);
+}
+
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool &truncated) noexcept {
+  uint64_t x0 = r0;
+  uint64_t x1 = r1;
+  uint64_t x2 = r2;
+  return uint64_hi64(x0, (x1 << 32) | x2, truncated);
+}
+
+// add two small integers, checking for overflow.
+// we want an efficient operation. for msvc, where
+// we don't have built-in intrinsics, this is still
+// pretty fast.
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb
+scalar_add(limb x, limb y, bool &overflow) noexcept {
+  limb z;
+// gcc and clang
+#if defined(__has_builtin)
+#if __has_builtin(__builtin_add_overflow)
+  if (!cpp20_and_in_constexpr()) {
+    overflow = __builtin_add_overflow(x, y, &z);
+    return z;
+  }
+#endif
+#endif
+
+  // generic, this still optimizes correctly on MSVC.
+  z = x + y;
+  overflow = z < x;
+  return z;
+}
+
+// multiply two small integers, getting both the high and low bits.
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb
+scalar_mul(limb x, limb y, limb &carry) noexcept {
+#ifdef FASTFLOAT_64BIT_LIMB
+#if defined(__SIZEOF_INT128__)
+  // GCC and clang both define it as an extension.
+  __uint128_t z = __uint128_t(x) * __uint128_t(y) + __uint128_t(carry);
+  carry = limb(z >> limb_bits);
+  return limb(z);
+#else
+  // fallback, no native 128-bit integer multiplication with carry.
+  // on msvc, this optimizes identically, somehow.
+  value128 z = full_multiplication(x, y);
+  bool overflow;
+  z.low = scalar_add(z.low, carry, overflow);
+  z.high += uint64_t(overflow); // cannot overflow
+  carry = z.high;
+  return z.low;
+#endif
+#else
+  uint64_t z = uint64_t(x) * uint64_t(y) + uint64_t(carry);
+  carry = limb(z >> limb_bits);
+  return limb(z);
+#endif
+}
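+// Illustrative example (64-bit limbs): scalar_mul(1ULL << 63, 4, carry) with
+// carry == 1 computes z = 2^65 + 1, so it returns 1 (the low limb) and sets
+// carry = 2 (the high limb).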
+
+// add scalar value to bigint starting from offset.
+// used in grade school multiplication
+template <uint16_t size>
+inline FASTFLOAT_CONSTEXPR20 bool small_add_from(stackvec<size> &vec, limb y,
+                                                 size_t start) noexcept {
+  size_t index = start;
+  limb carry = y;
+  bool overflow;
+  while (carry != 0 && index < vec.len()) {
+    vec[index] = scalar_add(vec[index], carry, overflow);
+    carry = limb(overflow);
+    index += 1;
+  }
+  if (carry != 0) {
+    FASTFLOAT_TRY(vec.try_push(carry));
+  }
+  return true;
+}
+
+// add scalar value to bigint.
+template <uint16_t size>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
+small_add(stackvec<size> &vec, limb y) noexcept {
+  return small_add_from(vec, y, 0);
+}
+
+// multiply bigint by scalar value.
+template <uint16_t size>
+inline FASTFLOAT_CONSTEXPR20 bool small_mul(stackvec<size> &vec,
+                                            limb y) noexcept {
+  limb carry = 0;
+  for (size_t index = 0; index < vec.len(); index++) {
+    vec[index] = scalar_mul(vec[index], y, carry);
+  }
+  if (carry != 0) {
+    FASTFLOAT_TRY(vec.try_push(carry));
+  }
+  return true;
+}
+
+// add bigint to bigint starting from index.
+// used in grade school multiplication
+template <uint16_t size>
+FASTFLOAT_CONSTEXPR20 bool large_add_from(stackvec<size> &x, limb_span y,
+                                          size_t start) noexcept {
+  // the effective x buffer is from `start..x.len()`, so exit early
+  // if we can't get that current range.
+  if (x.len() < start || y.len() > x.len() - start) {
+    FASTFLOAT_TRY(x.try_resize(y.len() + start, 0));
+  }
+
+  bool carry = false;
+  for (size_t index = 0; index < y.len(); index++) {
+    limb xi = x[index + start];
+    limb yi = y[index];
+    bool c1 = false;
+    bool c2 = false;
+    xi = scalar_add(xi, yi, c1);
+    if (carry) {
+      xi = scalar_add(xi, 1, c2);
+    }
+    x[index + start] = xi;
+    carry = c1 | c2;
+  }
+
+  // handle overflow
+  if (carry) {
+    FASTFLOAT_TRY(small_add_from(x, 1, y.len() + start));
+  }
+  return true;
+}
+
+// add bigint to bigint.
+template <uint16_t size>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
+large_add_from(stackvec<size> &x, limb_span y) noexcept {
+  return large_add_from(x, y, 0);
+}
+
+// grade-school multiplication algorithm
+template <uint16_t size>
+FASTFLOAT_CONSTEXPR20 bool long_mul(stackvec<size> &x, limb_span y) noexcept {
+  limb_span xs = limb_span(x.data, x.len());
+  stackvec<size> z(xs);
+  limb_span zs = limb_span(z.data, z.len());
+
+  if (y.len() != 0) {
+    limb y0 = y[0];
+    FASTFLOAT_TRY(small_mul(x, y0));
+    for (size_t index = 1; index < y.len(); index++) {
+      limb yi = y[index];
+      stackvec<size> zi;
+      if (yi != 0) {
+        // re-use the same buffer throughout
+        zi.set_len(0);
+        FASTFLOAT_TRY(zi.try_extend(zs));
+        FASTFLOAT_TRY(small_mul(zi, yi));
+        limb_span zis = limb_span(zi.data, zi.len());
+        FASTFLOAT_TRY(large_add_from(x, zis, index));
+      }
+    }
+  }
+
+  x.normalize();
+  return true;
+}
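+// Sketch of the scheme implemented by long_mul above, with B = 2^limb_bits:
+//   x * y = y[0]*x + y[1]*x*B + y[2]*x*B^2 + ...
+// The y[0]*x term is computed in place by small_mul; every later term scales
+// the saved copy z of the original x by y[i] and adds it in at limb offset i
+// via large_add_from.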
+
+// grade-school multiplication algorithm
+template <uint16_t size>
+FASTFLOAT_CONSTEXPR20 bool large_mul(stackvec<size> &x, limb_span y) noexcept {
+  if (y.len() == 1) {
+    FASTFLOAT_TRY(small_mul(x, y[0]));
+  } else {
+    FASTFLOAT_TRY(long_mul(x, y));
+  }
+  return true;
+}
+
+template <typename = void> struct pow5_tables {
+  static constexpr uint32_t large_step = 135;
+  static constexpr uint64_t small_power_of_5[] = {
+      1UL,
+      5UL,
+      25UL,
+      125UL,
+      625UL,
+      3125UL,
+      15625UL,
+      78125UL,
+      390625UL,
+      1953125UL,
+      9765625UL,
+      48828125UL,
+      244140625UL,
+      1220703125UL,
+      6103515625UL,
+      30517578125UL,
+      152587890625UL,
+      762939453125UL,
+      3814697265625UL,
+      19073486328125UL,
+      95367431640625UL,
+      476837158203125UL,
+      2384185791015625UL,
+      11920928955078125UL,
+      59604644775390625UL,
+      298023223876953125UL,
+      1490116119384765625UL,
+      7450580596923828125UL,
+  };
+#ifdef FASTFLOAT_64BIT_LIMB
+  constexpr static limb large_power_of_5[] = {
+      1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL,
+      10482974169319127550UL, 198276706040285095UL};
+#else
+  constexpr static limb large_power_of_5[] = {
+      4279965485U, 329373468U,  4020270615U, 2137533757U, 4287402176U,
+      1057042919U, 1071430142U, 2440757623U, 381945767U,  46164893U};
+#endif
+};
+
+#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE
+
+template <typename T> constexpr uint32_t pow5_tables<T>::large_step;
+
+template <typename T> constexpr uint64_t pow5_tables<T>::small_power_of_5[];
+
+template <typename T> constexpr limb pow5_tables<T>::large_power_of_5[];
+
+#endif
+
+// big integer type. implements a small subset of big integer
+// arithmetic, using simple algorithms since asymptotically
+// faster algorithms are slower for a small number of limbs.
+// all operations assume the big-integer is normalized.
+struct bigint : pow5_tables<> {
+  // storage of the limbs, in little-endian order.
+  stackvec<bigint_limbs> vec;
+
+  FASTFLOAT_CONSTEXPR20 bigint() : vec() {}
+  bigint(const bigint &) = delete;
+  bigint &operator=(const bigint &) = delete;
+  bigint(bigint &&) = delete;
+  bigint &operator=(bigint &&other) = delete;
+
+  FASTFLOAT_CONSTEXPR20 bigint(uint64_t value) : vec() {
+#ifdef FASTFLOAT_64BIT_LIMB
+    vec.push_unchecked(value);
+#else
+    vec.push_unchecked(uint32_t(value));
+    vec.push_unchecked(uint32_t(value >> 32));
+#endif
+    vec.normalize();
+  }
+
+  // get the high 64 bits from the vector, and if bits were truncated.
+  // this is to get the significant digits for the float.
+  FASTFLOAT_CONSTEXPR20 uint64_t hi64(bool &truncated) const noexcept {
+#ifdef FASTFLOAT_64BIT_LIMB
+    if (vec.len() == 0) {
+      return empty_hi64(truncated);
+    } else if (vec.len() == 1) {
+      return uint64_hi64(vec.rindex(0), truncated);
+    } else {
+      uint64_t result = uint64_hi64(vec.rindex(0), vec.rindex(1), truncated);
+      truncated |= vec.nonzero(2);
+      return result;
+    }
+#else
+    if (vec.len() == 0) {
+      return empty_hi64(truncated);
+    } else if (vec.len() == 1) {
+      return uint32_hi64(vec.rindex(0), truncated);
+    } else if (vec.len() == 2) {
+      return uint32_hi64(vec.rindex(0), vec.rindex(1), truncated);
+    } else {
+      uint64_t result =
+          uint32_hi64(vec.rindex(0), vec.rindex(1), vec.rindex(2), truncated);
+      truncated |= vec.nonzero(3);
+      return result;
+    }
+#endif
+  }
+
+  // compare two big integers, returning the large value.
+  // assumes both are normalized. if the return value is
+  // negative, other is larger, if the return value is
+  // positive, this is larger, otherwise they are equal.
+  // the limbs are stored in little-endian order, so we
+  // must compare the limbs in reverse order.
+  FASTFLOAT_CONSTEXPR20 int compare(const bigint &other) const noexcept {
+    if (vec.len() > other.vec.len()) {
+      return 1;
+    } else if (vec.len() < other.vec.len()) {
+      return -1;
+    } else {
+      for (size_t index = vec.len(); index > 0; index--) {
+        limb xi = vec[index - 1];
+        limb yi = other.vec[index - 1];
+        if (xi > yi) {
+          return 1;
+        } else if (xi < yi) {
+          return -1;
+        }
+      }
+      return 0;
+    }
+  }
+
+  // shift left each limb n bits, carrying over to the new limb
+  // returns true if we were able to shift all the digits.
+  FASTFLOAT_CONSTEXPR20 bool shl_bits(size_t n) noexcept {
+    // Internally, for each item, we shift left by n, and add the previous
+    // right shifted limb-bits.
+    // For example, we transform (for u8) shifted left 2, to:
+    //      b10100100 b01000010
+    //      b10 b10010001 b00001000
+    FASTFLOAT_DEBUG_ASSERT(n != 0);
+    FASTFLOAT_DEBUG_ASSERT(n < sizeof(limb) * 8);
+
+    size_t shl = n;
+    size_t shr = limb_bits - shl;
+    limb prev = 0;
+    for (size_t index = 0; index < vec.len(); index++) {
+      limb xi = vec[index];
+      vec[index] = (xi << shl) | (prev >> shr);
+      prev = xi;
+    }
+
+    limb carry = prev >> shr;
+    if (carry != 0) {
+      return vec.try_push(carry);
+    }
+    return true;
+  }
+
+  // move the limbs left by `n` limbs.
+  FASTFLOAT_CONSTEXPR20 bool shl_limbs(size_t n) noexcept {
+    FASTFLOAT_DEBUG_ASSERT(n != 0);
+    if (n + vec.len() > vec.capacity()) {
+      return false;
+    } else if (!vec.is_empty()) {
+      // move limbs
+      limb *dst = vec.data + n;
+      const limb *src = vec.data;
+      std::copy_backward(src, src + vec.len(), dst + vec.len());
+      // fill in empty limbs
+      limb *first = vec.data;
+      limb *last = first + n;
+      ::std::fill(first, last, 0);
+      vec.set_len(n + vec.len());
+      return true;
+    } else {
+      return true;
+    }
+  }
+
+  // move the limbs left by `n` bits.
+  FASTFLOAT_CONSTEXPR20 bool shl(size_t n) noexcept {
+    size_t rem = n % limb_bits;
+    size_t div = n / limb_bits;
+    if (rem != 0) {
+      FASTFLOAT_TRY(shl_bits(rem));
+    }
+    if (div != 0) {
+      FASTFLOAT_TRY(shl_limbs(div));
+    }
+    return true;
+  }
+
+  // get the number of leading zeros in the bigint.
+  FASTFLOAT_CONSTEXPR20 int ctlz() const noexcept {
+    if (vec.is_empty()) {
+      return 0;
+    } else {
+#ifdef FASTFLOAT_64BIT_LIMB
+      return leading_zeroes(vec.rindex(0));
+#else
+      // no use defining a specialized leading_zeroes for a 32-bit type.
+      uint64_t r0 = vec.rindex(0);
+      return leading_zeroes(r0 << 32);
+#endif
+    }
+  }
+
+  // get the number of bits in the bigint.
+  FASTFLOAT_CONSTEXPR20 int bit_length() const noexcept {
+    int lz = ctlz();
+    return int(limb_bits * vec.len()) - lz;
+  }
+
+  FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept { return small_mul(vec, y); }
+
+  FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept { return small_add(vec, y); }
+
+  // multiply as if by 2 raised to a power.
+  FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept { return shl(exp); }
+
+  // multiply as if by 5 raised to a power.
+  FASTFLOAT_CONSTEXPR20 bool pow5(uint32_t exp) noexcept {
+    // multiply by a power of 5
+    size_t large_length = sizeof(large_power_of_5) / sizeof(limb);
+    limb_span large = limb_span(large_power_of_5, large_length);
+    while (exp >= large_step) {
+      FASTFLOAT_TRY(large_mul(vec, large));
+      exp -= large_step;
+    }
+#ifdef FASTFLOAT_64BIT_LIMB
+    uint32_t small_step = 27;
+    limb max_native = 7450580596923828125UL;
+#else
+    uint32_t small_step = 13;
+    limb max_native = 1220703125U;
+#endif
+    while (exp >= small_step) {
+      FASTFLOAT_TRY(small_mul(vec, max_native));
+      exp -= small_step;
+    }
+    if (exp != 0) {
+      // Work around clang bug https://godbolt.org/z/zedh7rrhc
+      // This is similar to https://github.com/llvm/llvm-project/issues/47746,
+      // except the workaround described there doesn't work here
+      FASTFLOAT_TRY(small_mul(
+          vec, limb(((void)small_power_of_5[0], small_power_of_5[exp]))));
+    }
+
+    return true;
+  }
+
+  // multiply as if by 10 raised to a power.
+  FASTFLOAT_CONSTEXPR20 bool pow10(uint32_t exp) noexcept {
+    FASTFLOAT_TRY(pow5(exp));
+    return pow2(exp);
+  }
+};
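+// Usage sketch (illustrative only): computing 123 * 10^30 exactly and
+// reading back its top 64 bits could look like
+//   bigint b(123);
+//   if (b.pow10(30)) {
+//     bool truncated;
+//     uint64_t hi = b.hi64(truncated);
+//   }
+// where `truncated` reports whether any bits below the top 64 were non-zero.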
+
+} // namespace fast_float
+
+#endif
+
+#ifndef FASTFLOAT_DIGIT_COMPARISON_H
+#define FASTFLOAT_DIGIT_COMPARISON_H
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+
+
+namespace fast_float {
+
+// 1e0 to 1e19
+constexpr static uint64_t powers_of_ten_uint64[] = {1UL,
+                                                    10UL,
+                                                    100UL,
+                                                    1000UL,
+                                                    10000UL,
+                                                    100000UL,
+                                                    1000000UL,
+                                                    10000000UL,
+                                                    100000000UL,
+                                                    1000000000UL,
+                                                    10000000000UL,
+                                                    100000000000UL,
+                                                    1000000000000UL,
+                                                    10000000000000UL,
+                                                    100000000000000UL,
+                                                    1000000000000000UL,
+                                                    10000000000000000UL,
+                                                    100000000000000000UL,
+                                                    1000000000000000000UL,
+                                                    10000000000000000000UL};
+
+// calculate the exponent, in scientific notation, of the number.
+// this algorithm is not even close to optimized, but it has no practical
+// effect on performance: making it faster here would require slowing down the
+// code paths that are already fast, and this path is still fast enough.
+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int32_t
+scientific_exponent(parsed_number_string_t<UC> &num) noexcept {
+  uint64_t mantissa = num.mantissa;
+  int32_t exponent = int32_t(num.exponent);
+  while (mantissa >= 10000) {
+    mantissa /= 10000;
+    exponent += 4;
+  }
+  while (mantissa >= 100) {
+    mantissa /= 100;
+    exponent += 2;
+  }
+  while (mantissa >= 10) {
+    mantissa /= 10;
+    exponent += 1;
+  }
+  return exponent;
+}
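+// Worked example: for a parsed number with mantissa = 123456 and
+// exponent = -3 (i.e. 123.456), the loop reduces 123456 -> 12 -> 1 while
+// adding 4 and then 1 to the exponent, returning 2, consistent with
+// 123.456 = 1.23456e2.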
+
+// this converts a native floating-point number to an extended-precision float.
+template <typename T>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
+to_extended(T value) noexcept {
+  using equiv_uint = typename binary_format<T>::equiv_uint;
+  constexpr equiv_uint exponent_mask = binary_format<T>::exponent_mask();
+  constexpr equiv_uint mantissa_mask = binary_format<T>::mantissa_mask();
+  constexpr equiv_uint hidden_bit_mask = binary_format<T>::hidden_bit_mask();
+
+  adjusted_mantissa am;
+  int32_t bias = binary_format<T>::mantissa_explicit_bits() -
+                 binary_format<T>::minimum_exponent();
+  equiv_uint bits;
+#if FASTFLOAT_HAS_BIT_CAST
+  bits = std::bit_cast<equiv_uint>(value);
+#else
+  ::memcpy(&bits, &value, sizeof(T));
+#endif
+  if ((bits & exponent_mask) == 0) {
+    // denormal
+    am.power2 = 1 - bias;
+    am.mantissa = bits & mantissa_mask;
+  } else {
+    // normal
+    am.power2 = int32_t((bits & exponent_mask) >>
+                        binary_format::mantissa_explicit_bits());
+    am.power2 -= bias;
+    am.mantissa = (bits & mantissa_mask) | hidden_bit_mask;
+  }
+
+  return am;
+}
+
+// get the extended precision value of the halfway point between b and b+u.
+// we are given a native float that represents b, so we need to adjust it
+// halfway between b and b+u.
+template 
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
+to_extended_halfway(T value) noexcept {
+  adjusted_mantissa am = to_extended(value);
+  am.mantissa <<= 1;
+  am.mantissa += 1;
+  am.power2 -= 1;
+  return am;
+}
+
+// round an extended-precision float to the nearest machine float.
+template 
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void round(adjusted_mantissa &am,
+                                                         callback cb) noexcept {
+  int32_t mantissa_shift = 64 - binary_format::mantissa_explicit_bits() - 1;
+  if (-am.power2 >= mantissa_shift) {
+    // have a denormal float
+    int32_t shift = -am.power2 + 1;
+    cb(am, std::min(shift, 64));
+    // check for round-up: if rounding-nearest carried us to the hidden bit.
+    am.power2 = (am.mantissa <
+                 (uint64_t(1) << binary_format::mantissa_explicit_bits()))
+                    ? 0
+                    : 1;
+    return;
+  }
+
+  // have a normal float, use the default shift.
+  cb(am, mantissa_shift);
+
+  // check for carry
+  if (am.mantissa >=
+      (uint64_t(2) << binary_format::mantissa_explicit_bits())) {
+    am.mantissa = (uint64_t(1) << binary_format::mantissa_explicit_bits());
+    am.power2++;
+  }
+
+  // check for infinite: we could have carried to an infinite power
+  am.mantissa &= ~(uint64_t(1) << binary_format::mantissa_explicit_bits());
+  if (am.power2 >= binary_format::infinite_power()) {
+    am.power2 = binary_format::infinite_power();
+    am.mantissa = 0;
+  }
+}
+
+template 
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void
+round_nearest_tie_even(adjusted_mantissa &am, int32_t shift,
+                       callback cb) noexcept {
+  const uint64_t mask = (shift == 64) ? UINT64_MAX : (uint64_t(1) << shift) - 1;
+  const uint64_t halfway = (shift == 0) ? 0 : uint64_t(1) << (shift - 1);
+  uint64_t truncated_bits = am.mantissa & mask;
+  bool is_above = truncated_bits > halfway;
+  bool is_halfway = truncated_bits == halfway;
+
+  // shift digits into position
+  if (shift == 64) {
+    am.mantissa = 0;
+  } else {
+    am.mantissa >>= shift;
+  }
+  am.power2 += shift;
+
+  bool is_odd = (am.mantissa & 1) == 1;
+  am.mantissa += uint64_t(cb(is_odd, is_halfway, is_above));
+}
+
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void
+round_down(adjusted_mantissa &am, int32_t shift) noexcept {
+  if (shift == 64) {
+    am.mantissa = 0;
+  } else {
+    am.mantissa >>= shift;
+  }
+  am.power2 += shift;
+}
+template 
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void
+skip_zeros(UC const *&first, UC const *last) noexcept {
+  uint64_t val;
+  while (!cpp20_and_in_constexpr() &&
+         std::distance(first, last) >= int_cmp_len()) {
+    ::memcpy(&val, first, sizeof(uint64_t));
+    if (val != int_cmp_zeros()) {
+      break;
+    }
+    first += int_cmp_len();
+  }
+  while (first != last) {
+    if (*first != UC('0')) {
+      break;
+    }
+    first++;
+  }
+}
+
+// determine if any non-zero digits were truncated.
+// all characters must be valid digits.
+template 
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
+is_truncated(UC const *first, UC const *last) noexcept {
+  // do 8-bit optimizations, can just compare to 8 literal 0s.
+  uint64_t val;
+  while (!cpp20_and_in_constexpr() &&
+         std::distance(first, last) >= int_cmp_len()) {
+    ::memcpy(&val, first, sizeof(uint64_t));
+    if (val != int_cmp_zeros()) {
+      return true;
+    }
+    first += int_cmp_len();
+  }
+  while (first != last) {
+    if (*first != UC('0')) {
+      return true;
+    }
+    ++first;
+  }
+  return false;
+}
+template 
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
+is_truncated(span s) noexcept {
+  return is_truncated(s.ptr, s.ptr + s.len());
+}
+
+template 
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void
+parse_eight_digits(const UC *&p, limb &value, size_t &counter,
+                   size_t &count) noexcept {
+  value = value * 100000000 + parse_eight_digits_unrolled(p);
+  p += 8;
+  counter += 8;
+  count += 8;
+}
+
+template 
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void
+parse_one_digit(UC const *&p, limb &value, size_t &counter,
+                size_t &count) noexcept {
+  value = value * 10 + limb(*p - UC('0'));
+  p++;
+  counter++;
+  count++;
+}
+
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void
+add_native(bigint &big, limb power, limb value) noexcept {
+  big.mul(power);
+  big.add(value);
+}
+
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void
+round_up_bigint(bigint &big, size_t &count) noexcept {
+  // need to round-up the digits, but need to avoid rounding
+  // ....9999 to ...10000, which could cause a false halfway point.
+  add_native(big, 10, 1);
+  count++;
+}
+
+// parse the significant digits into a big integer
+template 
+inline FASTFLOAT_CONSTEXPR20 void
+parse_mantissa(bigint &result, parsed_number_string_t &num,
+               size_t max_digits, size_t &digits) noexcept {
+  // try to minimize the number of big integer and scalar multiplications.
+  // therefore, try to parse 8 digits at a time, and multiply by the largest
+  // scalar value (9 or 19 digits) for each step.
+  size_t counter = 0;
+  digits = 0;
+  limb value = 0;
+#ifdef FASTFLOAT_64BIT_LIMB
+  size_t step = 19;
+#else
+  size_t step = 9;
+#endif
+
+  // process all integer digits.
+  UC const *p = num.integer.ptr;
+  UC const *pend = p + num.integer.len();
+  skip_zeros(p, pend);
+  // process all digits, in increments of step per loop
+  while (p != pend) {
+    while ((std::distance(p, pend) >= 8) && (step - counter >= 8) &&
+           (max_digits - digits >= 8)) {
+      parse_eight_digits(p, value, counter, digits);
+    }
+    while (counter < step && p != pend && digits < max_digits) {
+      parse_one_digit(p, value, counter, digits);
+    }
+    if (digits == max_digits) {
+      // add the temporary value, then check if we've truncated any digits
+      add_native(result, limb(powers_of_ten_uint64[counter]), value);
+      bool truncated = is_truncated(p, pend);
+      if (num.fraction.ptr != nullptr) {
+        truncated |= is_truncated(num.fraction);
+      }
+      if (truncated) {
+        round_up_bigint(result, digits);
+      }
+      return;
+    } else {
+      add_native(result, limb(powers_of_ten_uint64[counter]), value);
+      counter = 0;
+      value = 0;
+    }
+  }
+
+  // add our fraction digits, if they're available.
+  if (num.fraction.ptr != nullptr) {
+    p = num.fraction.ptr;
+    pend = p + num.fraction.len();
+    if (digits == 0) {
+      skip_zeros(p, pend);
+    }
+    // process all digits, in increments of step per loop
+    while (p != pend) {
+      while ((std::distance(p, pend) >= 8) && (step - counter >= 8) &&
+             (max_digits - digits >= 8)) {
+        parse_eight_digits(p, value, counter, digits);
+      }
+      while (counter < step && p != pend && digits < max_digits) {
+        parse_one_digit(p, value, counter, digits);
+      }
+      if (digits == max_digits) {
+        // add the temporary value, then check if we've truncated any digits
+        add_native(result, limb(powers_of_ten_uint64[counter]), value);
+        bool truncated = is_truncated(p, pend);
+        if (truncated) {
+          round_up_bigint(result, digits);
+        }
+        return;
+      } else {
+        add_native(result, limb(powers_of_ten_uint64[counter]), value);
+        counter = 0;
+        value = 0;
+      }
+    }
+  }
+
+  if (counter != 0) {
+    add_native(result, limb(powers_of_ten_uint64[counter]), value);
+  }
+}
+
+template 
+inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
+positive_digit_comp(bigint &bigmant, int32_t exponent) noexcept {
+  FASTFLOAT_ASSERT(bigmant.pow10(uint32_t(exponent)));
+  adjusted_mantissa answer;
+  bool truncated;
+  answer.mantissa = bigmant.hi64(truncated);
+  int bias = binary_format::mantissa_explicit_bits() -
+             binary_format::minimum_exponent();
+  answer.power2 = bigmant.bit_length() - 64 + bias;
+
+  round(answer, [truncated](adjusted_mantissa &a, int32_t shift) {
+    round_nearest_tie_even(
+        a, shift,
+        [truncated](bool is_odd, bool is_halfway, bool is_above) -> bool {
+          return is_above || (is_halfway && truncated) ||
+                 (is_odd && is_halfway);
+        });
+  });
+
+  return answer;
+}
+
+// the scaling here is quite simple: we have, for the real digits `m * 10^e`,
+// and for the theoretical digits `n * 2^f`. Since `e` is always negative,
+// to scale them identically, we do `n * 2^f * 5^-e`, so we now have `m * 2^e`.
+// we then need to scale by `2^(f - e)`, and then the two significant digits
+// are of the same magnitude.
+template 
+inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa negative_digit_comp(
+    bigint &bigmant, adjusted_mantissa am, int32_t exponent) noexcept {
+  bigint &real_digits = bigmant;
+  int32_t real_exp = exponent;
+
+  // get the value of `b`, rounded down, and get a bigint representation of b+h
+  adjusted_mantissa am_b = am;
+  // gcc7 bug: use a lambda to work around the noexcept qualifier bug with
+  // -Wnoexcept-type.
+  round(am_b,
+           [](adjusted_mantissa &a, int32_t shift) { round_down(a, shift); });
+  T b;
+  to_float(false, am_b, b);
+  adjusted_mantissa theor = to_extended_halfway(b);
+  bigint theor_digits(theor.mantissa);
+  int32_t theor_exp = theor.power2;
+
+  // scale real digits and theor digits to be same power.
+  int32_t pow2_exp = theor_exp - real_exp;
+  uint32_t pow5_exp = uint32_t(-real_exp);
+  if (pow5_exp != 0) {
+    FASTFLOAT_ASSERT(theor_digits.pow5(pow5_exp));
+  }
+  if (pow2_exp > 0) {
+    FASTFLOAT_ASSERT(theor_digits.pow2(uint32_t(pow2_exp)));
+  } else if (pow2_exp < 0) {
+    FASTFLOAT_ASSERT(real_digits.pow2(uint32_t(-pow2_exp)));
+  }
+
+  // compare digits, and use that to direct rounding
+  int ord = real_digits.compare(theor_digits);
+  adjusted_mantissa answer = am;
+  round(answer, [ord](adjusted_mantissa &a, int32_t shift) {
+    round_nearest_tie_even(
+        a, shift, [ord](bool is_odd, bool _, bool __) -> bool {
+          (void)_;  // not needed, since we've done our comparison
+          (void)__; // not needed, since we've done our comparison
+          if (ord > 0) {
+            return true;
+          } else if (ord < 0) {
+            return false;
+          } else {
+            return is_odd;
+          }
+        });
+  });
+
+  return answer;
+}
+
+// parse the significant digits as a big integer to unambiguously round
+// the significant digits. here, we are trying to determine how to round
+// an extended float representation close to `b+h`, halfway between `b`
+// (the float rounded-down) and `b+u`, the next positive float. this
+// algorithm is always correct, and uses one of two approaches. when
+// the exponent is positive relative to the significant digits (such as
+// 1234), we create a big-integer representation, get the high 64-bits,
+// determine if any lower bits are truncated, and use that to direct
+// rounding. in case of a negative exponent relative to the significant
+// digits (such as 1.2345), we create a theoretical representation of
+// `b` as a big-integer type, scaled to the same binary exponent as
+// the actual digits. we then compare the big integer representations
+// of both, and use that to direct rounding.
+template 
+inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
+digit_comp(parsed_number_string_t &num, adjusted_mantissa am) noexcept {
+  // remove the invalid exponent bias
+  am.power2 -= invalid_am_bias;
+
+  int32_t sci_exp = scientific_exponent(num);
+  size_t max_digits = binary_format::max_digits();
+  size_t digits = 0;
+  bigint bigmant;
+  parse_mantissa(bigmant, num, max_digits, digits);
+  // can't underflow, since digits is at most max_digits.
+  int32_t exponent = sci_exp + 1 - int32_t(digits);
+  if (exponent >= 0) {
+    return positive_digit_comp(bigmant, exponent);
+  } else {
+    return negative_digit_comp(bigmant, am, exponent);
+  }
+}
+
+} // namespace fast_float
+
+#endif
+
+#ifndef FASTFLOAT_PARSE_NUMBER_H
+#define FASTFLOAT_PARSE_NUMBER_H
+
+
+#include <cmath>
+#include <cstring>
+#include <limits>
+#include <system_error>
+namespace fast_float {
+
+namespace detail {
+/**
+ * Special case +inf, -inf, nan, infinity, -infinity.
+ * The case comparisons could be made much faster given that we know that the
+ * strings are null-free and fixed.
+ **/
+template 
+from_chars_result_t FASTFLOAT_CONSTEXPR14 parse_infnan(UC const *first,
+                                                           UC const *last,
+                                                           T &value) noexcept {
+  from_chars_result_t answer{first, std::errc()}; // be optimistic
+  bool minusSign = false;
+  if (*first ==
+      UC('-')) { // assume first < last, so dereference without checks;
+                 // C++17 20.19.3.(7.1) explicitly forbids '+' here
+    minusSign = true;
+    ++first;
+  }
+#ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default
+  if (*first == UC('+')) {
+    ++first;
+  }
+#endif
+  if (last - first >= 3) {
+    if (fastfloat_strncasecmp(first, str_const_nan(), 3)) {
+      answer.ptr = (first += 3);
+      value = minusSign ? -std::numeric_limits<T>::quiet_NaN()
+                        : std::numeric_limits<T>::quiet_NaN();
+      // Check for possible nan(n-char-seq-opt), C++17 20.19.3.7,
+      // C11 7.20.1.3.3. At least MSVC produces nan(ind) and nan(snan).
+      if (first != last && *first == UC('(')) {
+        for (UC const *ptr = first + 1; ptr != last; ++ptr) {
+          if (*ptr == UC(')')) {
+            answer.ptr = ptr + 1; // valid nan(n-char-seq-opt)
+            break;
+          } else if (!((UC('a') <= *ptr && *ptr <= UC('z')) ||
+                       (UC('A') <= *ptr && *ptr <= UC('Z')) ||
+                       (UC('0') <= *ptr && *ptr <= UC('9')) || *ptr == UC('_')))
+            break; // forbidden char, not nan(n-char-seq-opt)
+        }
+      }
+      return answer;
+    }
+    if (fastfloat_strncasecmp(first, str_const_inf(), 3)) {
+      if ((last - first >= 8) &&
+          fastfloat_strncasecmp(first + 3, str_const_inf() + 3, 5)) {
+        answer.ptr = first + 8;
+      } else {
+        answer.ptr = first + 3;
+      }
+      value = minusSign ? -std::numeric_limits<T>::infinity()
+                        : std::numeric_limits<T>::infinity();
+      return answer;
+    }
+  }
+  answer.ec = std::errc::invalid_argument;
+  return answer;
+}
+
+/**
+ * Returns true if the floating-point rounding mode is to 'nearest'.
+ * It is the default on most systems. This function is meant to be inexpensive.
+ * Credit : @mwalcott3
+ */
+fastfloat_really_inline bool rounds_to_nearest() noexcept {
+  // https://lemire.me/blog/2020/06/26/gcc-not-nearest/
+#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
+  return false;
+#endif
+  // See
+  // A fast function to check your floating-point rounding mode
+  // https://lemire.me/blog/2022/11/16/a-fast-function-to-check-your-floating-point-rounding-mode/
+  //
+  // This function is meant to be equivalent to :
+  // prior: #include <cfenv>
+  //  return fegetround() == FE_TONEAREST;
+  // However, it is expected to be much faster than the fegetround()
+  // function call.
+  //
+  // The volatile keyword prevents the compiler from computing the function
+  // at compile-time.
+  // There might be other ways to prevent compile-time optimizations (e.g.,
+  // asm). The value does not need to be std::numeric_limits<float>::min(), any
+  // small value so that 1 + x should round to 1 would do (after accounting for
+  // excess precision, as in 387 instructions).
+  static volatile float fmin = std::numeric_limits<float>::min();
+  float fmini = fmin; // we copy it so that it gets loaded at most once.
+//
+// Explanation:
+// Only when fegetround() == FE_TONEAREST do we have that
+// fmin + 1.0f == 1.0f - fmin.
+//
+// FE_UPWARD:
+//  fmin + 1.0f > 1
+//  1.0f - fmin == 1
+//
+// FE_DOWNWARD or  FE_TOWARDZERO:
+//  fmin + 1.0f == 1
+//  1.0f - fmin < 1
+//
+// Note: This may fail to be accurate if fast-math has been
+// enabled, as rounding conventions may not apply.
+#ifdef FASTFLOAT_VISUAL_STUDIO
+#pragma warning(push)
+//  todo: is there a VS warning?
+//  see
+//  https://stackoverflow.com/questions/46079446/is-there-a-warning-for-floating-point-equality-checking-in-visual-studio-2013
+#elif defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wfloat-equal"
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+  return (fmini + 1.0f == 1.0f - fmini);
+#ifdef FASTFLOAT_VISUAL_STUDIO
+#pragma warning(pop)
+#elif defined(__clang__)
+#pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+} // namespace detail
+
+template  struct from_chars_caller {
+  template 
+  FASTFLOAT_CONSTEXPR20 static from_chars_result_t
+  call(UC const *first, UC const *last, T &value,
+       parse_options_t options) noexcept {
+    return from_chars_advanced(first, last, value, options);
+  }
+};
+
+#if __STDCPP_FLOAT32_T__ == 1
+template <> struct from_chars_caller {
+  template 
+  FASTFLOAT_CONSTEXPR20 static from_chars_result_t
+  call(UC const *first, UC const *last, std::float32_t &value,
+       parse_options_t options) noexcept {
+    // if std::float32_t is defined, and we are in C++23 mode; macro set for
+    // float32; set value to float due to equivalence between float and
+    // float32_t
+    float val;
+    auto ret = from_chars_advanced(first, last, val, options);
+    value = val;
+    return ret;
+  }
+};
+#endif
+
+#if __STDCPP_FLOAT64_T__ == 1
+template <> struct from_chars_caller {
+  template 
+  FASTFLOAT_CONSTEXPR20 static from_chars_result_t
+  call(UC const *first, UC const *last, std::float64_t &value,
+       parse_options_t options) noexcept {
+    // if std::float64_t is defined, and we are in C++23 mode; macro set for
+    // float64; set value as double due to equivalence between double and
+    // float64_t
+    double val;
+    auto ret = from_chars_advanced(first, last, val, options);
+    value = val;
+    return ret;
+  }
+};
+#endif
+
+template 
+FASTFLOAT_CONSTEXPR20 from_chars_result_t
+from_chars(UC const *first, UC const *last, T &value,
+           chars_format fmt /*= chars_format::general*/) noexcept {
+  return from_chars_caller::call(first, last, value,
+                                    parse_options_t(fmt));
+}
+
+/**
+ * This function overload takes a parsed_number_string_t structure that is
+ * created and populated either by the from_chars_advanced function taking a
+ * chars range and parsing options, or by a custom parsing function implemented
+ * by the user.
+ */
+template 
+FASTFLOAT_CONSTEXPR20 from_chars_result_t
+from_chars_advanced(parsed_number_string_t &pns, T &value) noexcept {
+
+  static_assert(is_supported_float_type(),
+                "only some floating-point types are supported");
+  static_assert(is_supported_char_type(),
+                "only char, wchar_t, char16_t and char32_t are supported");
+
+  from_chars_result_t answer;
+
+  answer.ec = std::errc(); // be optimistic
+  answer.ptr = pns.lastmatch;
+  // The implementation of the Clinger's fast path is convoluted because
+  // we want round-to-nearest in all cases, irrespective of the rounding mode
+  // selected on the thread.
+  // We proceed optimistically, assuming that detail::rounds_to_nearest()
+  // returns true.
+  if (binary_format::min_exponent_fast_path() <= pns.exponent &&
+      pns.exponent <= binary_format::max_exponent_fast_path() &&
+      !pns.too_many_digits) {
+    // Unfortunately, the conventional Clinger's fast path is only possible
+    // when the system rounds to the nearest float.
+    //
+    // We expect the next branch to almost always be selected.
+    // We could check it first (before the previous branch), but
+    // there might be performance advantages at having the check
+    // be last.
+    if (!cpp20_and_in_constexpr() && detail::rounds_to_nearest()) {
+      // We have that fegetround() == FE_TONEAREST.
+      // Next is Clinger's fast path.
+      if (pns.mantissa <= binary_format::max_mantissa_fast_path()) {
+        value = T(pns.mantissa);
+        if (pns.exponent < 0) {
+          value = value / binary_format::exact_power_of_ten(-pns.exponent);
+        } else {
+          value = value * binary_format::exact_power_of_ten(pns.exponent);
+        }
+        if (pns.negative) {
+          value = -value;
+        }
+        return answer;
+      }
+    } else {
+      // We do not have that fegetround() == FE_TONEAREST.
+      // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's
+      // proposal
+      if (pns.exponent >= 0 &&
+          pns.mantissa <=
+              binary_format::max_mantissa_fast_path(pns.exponent)) {
+#if defined(__clang__) || defined(FASTFLOAT_32BIT)
+        // Clang may map 0 to -0.0 when fegetround() == FE_DOWNWARD
+        if (pns.mantissa == 0) {
+          value = pns.negative ? T(-0.) : T(0.);
+          return answer;
+        }
+#endif
+        value = T(pns.mantissa) *
+                binary_format::exact_power_of_ten(pns.exponent);
+        if (pns.negative) {
+          value = -value;
+        }
+        return answer;
+      }
+    }
+  }
+  adjusted_mantissa am =
+      compute_float>(pns.exponent, pns.mantissa);
+  if (pns.too_many_digits && am.power2 >= 0) {
+    if (am != compute_float>(pns.exponent, pns.mantissa + 1)) {
+      am = compute_error>(pns.exponent, pns.mantissa);
+    }
+  }
+  // If we called compute_float>(pns.exponent, pns.mantissa)
+  // and we have an invalid power (am.power2 < 0), then we need to go the long
+  // way around again. This is very uncommon.
+  if (am.power2 < 0) {
+    am = digit_comp(pns, am);
+  }
+  to_float(pns.negative, am, value);
+  // Test for over/underflow.
+  if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) ||
+      am.power2 == binary_format::infinite_power()) {
+    answer.ec = std::errc::result_out_of_range;
+  }
+  return answer;
+}
+
+template 
+FASTFLOAT_CONSTEXPR20 from_chars_result_t
+from_chars_advanced(UC const *first, UC const *last, T &value,
+                    parse_options_t options) noexcept {
+
+  static_assert(is_supported_float_type(),
+                "only some floating-point types are supported");
+  static_assert(is_supported_char_type(),
+                "only char, wchar_t, char16_t and char32_t are supported");
+
+  from_chars_result_t answer;
+#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default
+  while ((first != last) && fast_float::is_space(uint8_t(*first))) {
+    first++;
+  }
+#endif
+  if (first == last) {
+    answer.ec = std::errc::invalid_argument;
+    answer.ptr = first;
+    return answer;
+  }
+  parsed_number_string_t pns =
+      parse_number_string(first, last, options);
+  if (!pns.valid) {
+    if (options.format & chars_format::no_infnan) {
+      answer.ec = std::errc::invalid_argument;
+      answer.ptr = first;
+      return answer;
+    } else {
+      return detail::parse_infnan(first, last, value);
+    }
+  }
+
+  // call overload that takes parsed_number_string_t directly.
+  return from_chars_advanced(pns, value);
+}
+
+template 
+FASTFLOAT_CONSTEXPR20 from_chars_result_t
+from_chars(UC const *first, UC const *last, T &value, int base) noexcept {
+  static_assert(is_supported_char_type(),
+                "only char, wchar_t, char16_t and char32_t are supported");
+
+  from_chars_result_t answer;
+#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default
+  while ((first != last) && fast_float::is_space(uint8_t(*first))) {
+    first++;
+  }
+#endif
+  if (first == last || base < 2 || base > 36) {
+    answer.ec = std::errc::invalid_argument;
+    answer.ptr = first;
+    return answer;
+  }
+  return parse_int_string(first, last, value, base);
+}
+
+} // namespace fast_float
+
+#endif
+
diff --git a/lib/fmt/LICENSE b/lib/fmt/LICENSE
new file mode 100644
index 000000000..1cd1ef926
--- /dev/null
+++ b/lib/fmt/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2012 - present, Victor Zverovich and {fmt} contributors
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+--- Optional exception to the license ---
+
+As an exception, if, as a result of your compiling your source code, portions
+of this Software are embedded into a machine-executable object form of such
+source code, you may redistribute such embedded portions in such object form
+without including the above copyright and permission notices.
diff --git a/lib/fmt/README.md b/lib/fmt/README.md
new file mode 100644
index 000000000..5f9249d4b
--- /dev/null
+++ b/lib/fmt/README.md
@@ -0,0 +1,484 @@
+# {fmt}
+
+[![image](https://github.com/fmtlib/fmt/workflows/linux/badge.svg)](https://github.com/fmtlib/fmt/actions?query=workflow%3Alinux)
+[![image](https://github.com/fmtlib/fmt/workflows/macos/badge.svg)](https://github.com/fmtlib/fmt/actions?query=workflow%3Amacos)
+[![image](https://github.com/fmtlib/fmt/workflows/windows/badge.svg)](https://github.com/fmtlib/fmt/actions?query=workflow%3Awindows)
+[![fmt is continuously fuzzed at oss-fuzz](https://oss-fuzz-build-logs.storage.googleapis.com/badges/fmt.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?colspec=ID%20Type%20Component%20Status%20Proj%20Reported%20Owner%20Summary&q=proj%3Dfmt&can=1)
+[![Ask questions at StackOverflow with the tag fmt](https://img.shields.io/badge/stackoverflow-fmt-blue.svg)](https://stackoverflow.com/questions/tagged/fmt)
+[![image](https://api.securityscorecards.dev/projects/github.com/fmtlib/fmt/badge)](https://securityscorecards.dev/viewer/?uri=github.com/fmtlib/fmt)
+
+**{fmt}** is an open-source formatting library providing a fast and safe
+alternative to C stdio and C++ iostreams.
+
+If you like this project, please consider donating to one of the funds
+that help victims of the war in Ukraine: <https://www.stand-with-ukraine.pp.ua>.
+
+[Documentation](https://fmt.dev)
+
+[Cheat Sheets](https://hackingcpp.com/cpp/libs/fmt.html)
+
+Q&A: ask questions on [StackOverflow with the tag
+fmt](https://stackoverflow.com/questions/tagged/fmt).
+
+Try {fmt} in [Compiler Explorer](https://godbolt.org/z/8Mx1EW73v).
+
+# Features
+
+- Simple [format API](https://fmt.dev/latest/api/) with positional
+  arguments for localization
+- Implementation of [C++20
+  std::format](https://en.cppreference.com/w/cpp/utility/format) and
+  [C++23 std::print](https://en.cppreference.com/w/cpp/io/print)
+- [Format string syntax](https://fmt.dev/latest/syntax/) similar
+  to Python\'s
+  [format](https://docs.python.org/3/library/stdtypes.html#str.format)
+- Fast IEEE 754 floating-point formatter with correct rounding,
+  shortness and round-trip guarantees using the
+  [Dragonbox](https://github.com/jk-jeon/dragonbox) algorithm
+- Portable Unicode support
+- Safe [printf
+  implementation](https://fmt.dev/latest/api/#printf-formatting)
+  including the POSIX extension for positional arguments
+- Extensibility: [support for user-defined
+  types](https://fmt.dev/latest/api/#formatting-user-defined-types)
+- High performance: faster than common standard library
+  implementations of `(s)printf`, iostreams, `to_string` and
+  `to_chars`, see [Speed tests](#speed-tests) and [Converting a
+  hundred million integers to strings per
+  second](http://www.zverovich.net/2020/06/13/fast-int-to-string-revisited.html)
+- Small code size both in terms of source code with the minimum
+  configuration consisting of just three files, `core.h`, `format.h`
+  and `format-inl.h`, and compiled code; see [Compile time and code
+  bloat](#compile-time-and-code-bloat)
+- Reliability: the library has an extensive set of
+  [tests](https://github.com/fmtlib/fmt/tree/master/test) and is
+  [continuously fuzzed](https://bugs.chromium.org/p/oss-fuzz/issues/list?colspec=ID%20Type%20Component%20Status%20Proj%20Reported%20Owner%20Summary&q=proj%3Dfmt&can=1)
+- Safety: the library is fully type-safe, errors in format strings can
+  be reported at compile time, automatic memory management prevents
+  buffer overflow errors
+- Ease of use: small self-contained code base, no external
+  dependencies, permissive MIT
+  [license](https://github.com/fmtlib/fmt/blob/master/LICENSE)
+- [Portability](https://fmt.dev/latest/#portability) with
+  consistent output across platforms and support for older compilers
+- Clean warning-free codebase even on high warning levels such as
+  `-Wall -Wextra -pedantic`
+- Locale independence by default
+- Optional header-only configuration enabled with the
+  `FMT_HEADER_ONLY` macro
+
+See the [documentation](https://fmt.dev) for more details.
+
+# Examples
+
+**Print to stdout** ([run](https://godbolt.org/z/Tevcjh))
+
+``` c++
+#include <fmt/core.h>
+
+int main() {
+  fmt::print("Hello, world!\n");
+}
+```
+
+**Format a string** ([run](https://godbolt.org/z/oK8h33))
+
+``` c++
+std::string s = fmt::format("The answer is {}.", 42);
+// s == "The answer is 42."
+```
+
+**Format a string using positional arguments**
+([run](https://godbolt.org/z/Yn7Txe))
+
+``` c++
+std::string s = fmt::format("I'd rather be {1} than {0}.", "right", "happy");
+// s == "I'd rather be happy than right."
+```
+
+**Print dates and times** ([run](https://godbolt.org/z/c31ExdY3W))
+
+``` c++
+#include <fmt/chrono.h>
+
+int main() {
+  auto now = std::chrono::system_clock::now();
+  fmt::print("Date and time: {}\n", now);
+  fmt::print("Time: {:%H:%M}\n", now);
+}
+```
+
+Output:
+
+    Date and time: 2023-12-26 19:10:31.557195597
+    Time: 19:10
+
+**Print a container** ([run](https://godbolt.org/z/MxM1YqjE7))
+
+``` c++
+#include <fmt/ranges.h>
+#include <vector>
+
+int main() {
+  std::vector<int> v = {1, 2, 3};
+  fmt::print("{}\n", v);
+}
+```
+
+Output:
+
+    [1, 2, 3]
+
+**Check a format string at compile time**
+
+``` c++
+std::string s = fmt::format("{:d}", "I am not a number");
+```
+
+This gives a compile-time error in C++20 because `d` is an invalid
+format specifier for a string.
+
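+If the format string is only known at run time, it can be wrapped in
+`fmt::runtime` to opt out of the compile-time check (a brief sketch, not from
+the upstream examples):
+
+``` c++
+#include <fmt/core.h>
+#include <string>
+
+int main() {
+  std::string fmt_str = "{}";  // chosen at run time
+  std::string s = fmt::format(fmt::runtime(fmt_str), 42);
+  // s == "42"
+}
+```
+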
+**Write a file from a single thread**
+
+``` c++
+#include <fmt/os.h>
+
+int main() {
+  auto out = fmt::output_file("guide.txt");
+  out.print("Don't {}", "Panic");
+}
+```
+
+This can be [5 to 9 times faster than
+fprintf](http://www.zverovich.net/2020/08/04/optimal-file-buffer-size.html).
+
+**Print with colors and text styles**
+
+``` c++
+#include <fmt/color.h>
+
+int main() {
+  fmt::print(fg(fmt::color::crimson) | fmt::emphasis::bold,
+             "Hello, {}!\n", "world");
+  fmt::print(fg(fmt::color::floral_white) | bg(fmt::color::slate_gray) |
+             fmt::emphasis::underline, "Olá, {}!\n", "Mundo");
+  fmt::print(fg(fmt::color::steel_blue) | fmt::emphasis::italic,
+             "你好{}!\n", "世界");
+}
+```
+
+Output on a modern terminal with Unicode support:
+
+![image](https://github.com/fmtlib/fmt/assets/576385/2a93c904-d6fa-4aa6-b453-2618e1c327d7)
+
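+**Format a user-defined type** (a minimal sketch added for illustration; the
+`point` struct is hypothetical, while `fmt::formatter` is the documented
+extension point for [user-defined
+types](https://fmt.dev/latest/api/#formatting-user-defined-types))
+
+``` c++
+#include <fmt/format.h>
+
+struct point {
+  double x, y;
+};
+
+// Make point formattable with "{}" by specializing fmt::formatter.
+template <> struct fmt::formatter<point> {
+  constexpr auto parse(fmt::format_parse_context& ctx) { return ctx.begin(); }
+  auto format(const point& p, fmt::format_context& ctx) const {
+    return fmt::format_to(ctx.out(), "({}, {})", p.x, p.y);
+  }
+};
+
+int main() {
+  fmt::print("{}\n", point{1, 2});  // prints "(1, 2)"
+}
+```
+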
+# Benchmarks
+
+## Speed tests
+
+| Library           | Method        | Run Time, s |
+|-------------------|---------------|-------------|
+| libc              | printf        |   0.91      |
+| libc++            | std::ostream  |   2.49      |
+| {fmt} 9.1         | fmt::print    |   0.74      |
+| Boost Format 1.80 | boost::format |   6.26      |
+| Folly Format      | folly::format |   1.87      |
+
+{fmt} is the fastest of the benchmarked methods, \~20% faster than
+`printf`.
+
+The above results were generated by building `tinyformat_test.cpp` on
+macOS 12.6.1 with `clang++ -O3 -DNDEBUG -DSPEED_TEST -DHAVE_FORMAT`, and
+taking the best of three runs. In the test, the format string
+`"%0.10f:%04d:%+g:%s:%p:%c:%%\n"` or equivalent is filled 2,000,000
+times with output sent to `/dev/null`; for further details refer to the
+[source](https://github.com/fmtlib/format-benchmark/blob/master/src/tinyformat-test.cc).
+
+{fmt} is up to 20-30x faster than `std::ostringstream` and `sprintf` on
+IEEE754 `float` and `double` formatting
+([dtoa-benchmark](https://github.com/fmtlib/dtoa-benchmark)) and faster
+than [double-conversion](https://github.com/google/double-conversion)
+and [ryu](https://github.com/ulfjack/ryu):
+
+[![image](https://user-images.githubusercontent.com/576385/95684665-11719600-0ba8-11eb-8e5b-972ff4e49428.png)](https://fmt.dev/unknown_mac64_clang12.0.html)
+
+## Compile time and code bloat
+
+The script [bloat-test.py][test] from [format-benchmark][bench] tests compile
+time and code bloat for nontrivial projects. It generates 100 translation units
+and uses `printf()` or its alternative five times in each to simulate a
+medium-sized project. The resulting executable size and compile time (Apple
+clang version 15.0.0 (clang-1500.1.0.2.5), macOS Sonoma, best of three) is shown
+in the following tables.
+
+[test]: https://github.com/fmtlib/format-benchmark/blob/master/bloat-test.py
+[bench]: https://github.com/fmtlib/format-benchmark
+
+**Optimized build (-O3)**
+
+| Method        | Compile Time, s | Executable size, KiB | Stripped size, KiB |
+|---------------|-----------------|----------------------|--------------------|
+| printf        |             1.6 |                   54 |                 50 |
+| IOStreams     |            25.9 |                   98 |                 84 |
+| fmt 83652df   |             4.8 |                   54 |                 50 |
+| tinyformat    |            29.1 |                  161 |                136 |
+| Boost Format  |            55.0 |                  530 |                317 |
+
+{fmt} is fast to compile and is comparable to `printf` in terms of per-call
+binary size (within a rounding error on this system).
+
+**Non-optimized build**
+
+| Method        | Compile Time, s | Executable size, KiB | Stripped size, KiB |
+|---------------|-----------------|----------------------|--------------------|
+| printf        |             1.4 |                   54 |                 50 |
+| IOStreams     |            23.4 |                   92 |                 68 |
+| {fmt} 83652df |             4.4 |                   89 |                 85 |
+| tinyformat    |            24.5 |                  204 |                161 |
+| Boost Format  |            36.4 |                  831 |                462 |
+
+`libc`, `lib(std)c++`, and `libfmt` are all linked as shared libraries
+to compare formatting function overhead only. Boost Format is a
+header-only library so it doesn\'t provide any linkage options.
+
+## Running the tests
+
+Please refer to [Building the
+library](https://fmt.dev/latest/get-started/#building-from-source) for
+instructions on how to build the library and run the unit tests.
+
+Benchmarks reside in a separate repository,
+[format-benchmarks](https://github.com/fmtlib/format-benchmark), so to
+run the benchmarks you first need to clone this repository and generate
+Makefiles with CMake:
+
+    $ git clone --recursive https://github.com/fmtlib/format-benchmark.git
+    $ cd format-benchmark
+    $ cmake .
+
+Then you can run the speed test:
+
+    $ make speed-test
+
+or the bloat test:
+
+    $ make bloat-test
+
+# Migrating code
+
+[clang-tidy](https://clang.llvm.org/extra/clang-tidy/) v18 provides the
+[modernize-use-std-print](https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-std-print.html)
+check that is capable of converting occurrences of `printf` and
+`fprintf` to `fmt::print` if configured to do so. (By default it
+converts to `std::print`.)
+
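+As a rough illustration (a hypothetical before/after sketch, assuming the check
+is configured to emit `fmt::print` rather than the default `std::print`):
+
+``` c++
+#include <fmt/core.h>
+#include <cstdio>
+
+int main() {
+  const char* name = "jobs";
+  int count = 3;
+  std::fprintf(stderr, "%s: %d\n", name, count);  // before the rewrite
+  fmt::print(stderr, "{}: {}\n", name, count);    // after the rewrite
+}
+```
+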
+# Notable projects using this library
+
+- [0 A.D.](https://play0ad.com/): a free, open-source, cross-platform
+  real-time strategy game
+- [AMPL/MP](https://github.com/ampl/mp): an open-source library for
+  mathematical programming
+- [Apple's FoundationDB](https://github.com/apple/foundationdb): an open-source,
+  distributed, transactional key-value store
+- [Aseprite](https://github.com/aseprite/aseprite): animated sprite
+  editor & pixel art tool
+- [AvioBook](https://www.aviobook.aero/en): a comprehensive aircraft
+  operations suite
+- [Blizzard Battle.net](https://battle.net/): an online gaming
+  platform
+- [Celestia](https://celestia.space/): real-time 3D visualization of
+  space
+- [Ceph](https://ceph.com/): a scalable distributed storage system
+- [ccache](https://ccache.dev/): a compiler cache
+- [ClickHouse](https://github.com/ClickHouse/ClickHouse): an
+  analytical database management system
+- [Contour](https://github.com/contour-terminal/contour/): a modern
+  terminal emulator
+- [CUAUV](https://cuauv.org/): Cornell University\'s autonomous
+  underwater vehicle
+- [Drake](https://drake.mit.edu/): a planning, control, and analysis
+  toolbox for nonlinear dynamical systems (MIT)
+- [Envoy](https://github.com/envoyproxy/envoy): C++ L7 proxy and
+  communication bus (Lyft)
+- [FiveM](https://fivem.net/): a modification framework for GTA V
+- [fmtlog](https://github.com/MengRao/fmtlog): a performant
+  fmtlib-style logging library with latency in nanoseconds
+- [Folly](https://github.com/facebook/folly): Facebook open-source
+  library
+- [GemRB](https://gemrb.org/): a portable open-source implementation
+  of Bioware's Infinity Engine
+- [Grand Mountain
+  Adventure](https://store.steampowered.com/app/1247360/Grand_Mountain_Adventure/):
+  a beautiful open-world ski & snowboarding game
+- [HarpyWar/pvpgn](https://github.com/pvpgn/pvpgn-server): Player vs
+  Player Gaming Network with tweaks
+- [KBEngine](https://github.com/kbengine/kbengine): an open-source
+  MMOG server engine
+- [Keypirinha](https://keypirinha.com/): a semantic launcher for
+  Windows
+- [Kodi](https://kodi.tv/) (formerly xbmc): home theater software
+- [Knuth](https://kth.cash/): high-performance Bitcoin full-node
+- [libunicode](https://github.com/contour-terminal/libunicode/): a
+  modern C++17 Unicode library
+- [MariaDB](https://mariadb.org/): relational database management
+  system
+- [Microsoft Verona](https://github.com/microsoft/verona): research
+  programming language for concurrent ownership
+- [MongoDB](https://mongodb.com/): distributed document database
+- [MongoDB Smasher](https://github.com/duckie/mongo_smasher): a small
+  tool to generate randomized datasets
+- [OpenSpace](https://openspaceproject.com/): an open-source
+  astrovisualization framework
+- [PenUltima Online (POL)](https://www.polserver.com/): an MMO server,
+  compatible with most Ultima Online clients
+- [PyTorch](https://github.com/pytorch/pytorch): an open-source
+  machine learning library
+- [quasardb](https://www.quasardb.net/): a distributed,
+  high-performance, associative database
+- [Quill](https://github.com/odygrd/quill): asynchronous low-latency
+  logging library
+- [QKW](https://github.com/ravijanjam/qkw): generalizing aliasing to
+  simplify navigation, and execute complex multi-line terminal
+  command sequences
+- [redis-cerberus](https://github.com/HunanTV/redis-cerberus): a Redis
+  cluster proxy
+- [redpanda](https://vectorized.io/redpanda): a 10x faster Kafka®
+  replacement for mission-critical systems written in C++
+- [rpclib](http://rpclib.net/): a modern C++ msgpack-RPC server and
+  client library
+- [Salesforce Analytics
+  Cloud](https://www.salesforce.com/analytics-cloud/overview/):
+  business intelligence software
+- [Scylla](https://www.scylladb.com/): a Cassandra-compatible NoSQL
+  data store that can handle 1 million transactions per second on a
+  single server
+- [Seastar](http://www.seastar-project.org/): an advanced, open-source
+  C++ framework for high-performance server applications on modern
+  hardware
+- [spdlog](https://github.com/gabime/spdlog): super fast C++ logging
+  library
+- [Stellar](https://www.stellar.org/): financial platform
+- [Touch Surgery](https://www.touchsurgery.com/): surgery simulator
+- [TrinityCore](https://github.com/TrinityCore/TrinityCore):
+  open-source MMORPG framework
+- [🐙 userver framework](https://userver.tech/): open-source
+  asynchronous framework with a rich set of abstractions and database
+  drivers
+- [Windows Terminal](https://github.com/microsoft/terminal): the new
+  Windows terminal
+
+[More\...](https://github.com/search?q=fmtlib&type=Code)
+
+If you are aware of other projects using this library, please let me
+know by [email](mailto:victor.zverovich@gmail.com) or by submitting an
+[issue](https://github.com/fmtlib/fmt/issues).
+
+# Motivation
+
+So why yet another formatting library?
+
+There are plenty of methods for doing this task, from standard ones like
+the printf family of functions and iostreams to Boost Format and
+FastFormat libraries. The reason for creating a new library is that
+every existing solution that I found either had serious issues or
+didn\'t provide all the features I needed.
+
+## printf
+
+The good thing about `printf` is that it is pretty fast and readily
+available being a part of the C standard library. The main drawback is
+that it doesn\'t support user-defined types. `printf` also has safety
+issues although they are somewhat mitigated with [\_\_attribute\_\_
+((format (printf,
+\...))](https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html) in
+GCC. There is a POSIX extension that adds positional arguments required
+for
+[i18n](https://en.wikipedia.org/wiki/Internationalization_and_localization)
+to `printf` but it is not a part of C99 and may not be available on some
+platforms.
+
+## iostreams
+
+The main issue with iostreams is best illustrated with an example:
+
+``` c++
+std::cout << std::setprecision(2) << std::fixed << 1.23456 << "\n";
+```
+
+which is a lot of typing compared to printf:
+
+``` c++
+printf("%.2f\n", 1.23456);
+```
+
+Matthew Wilson, the author of FastFormat, called this \"chevron hell\".
+iostreams don\'t support positional arguments by design.
+
+The good part is that iostreams support user-defined types and are safe
+although error handling is awkward.
+
+## Boost Format
+
+This is a very powerful library that supports both `printf`-like format
+strings and positional arguments. Its main drawback is performance.
+According to various benchmarks, it is much slower than other methods
+considered here. Boost Format also has excessive build times and severe
+code bloat issues (see [Benchmarks](#benchmarks)).
+
+## FastFormat
+
+This is an interesting library that is fast, safe and has positional
+arguments. However, it has significant limitations, citing its author:
+
+> Three features that have no hope of being accommodated within the
+> current design are:
+>
+> - Leading zeros (or any other non-space padding)
+> - Octal/hexadecimal encoding
+> - Runtime width/alignment specification
+
+It is also quite big and has a heavy dependency, on STLSoft, which might be
+too restrictive for use in some projects.
+
+## Boost Spirit.Karma
+
+This is not a formatting library but I decided to include it here for
+completeness. Like iostreams, it suffers from the problem of mixing
+verbatim text with arguments. The library is pretty fast, but slower on
+integer formatting than `fmt::format_to` with format string compilation
+on Karma\'s own benchmark, see [Converting a hundred million integers to
+strings per
+second](http://www.zverovich.net/2020/06/13/fast-int-to-string-revisited.html).
+
+# License
+
+{fmt} is distributed under the MIT
+[license](https://github.com/fmtlib/fmt/blob/master/LICENSE).
+
+# Documentation License
+
+The [Format String Syntax](https://fmt.dev/latest/syntax/) section
+in the documentation is based on the one from Python [string module
+documentation](https://docs.python.org/3/library/string.html#module-string).
+For this reason, the documentation is distributed under the Python
+Software Foundation license available in
+[doc/python-license.txt](https://raw.github.com/fmtlib/fmt/master/doc/python-license.txt).
+It only applies if you distribute the documentation of {fmt}.
+
+# Maintainers
+
+The {fmt} library is maintained by Victor Zverovich
+([vitaut](https://github.com/vitaut)) with contributions from many other
+people. See
+[Contributors](https://github.com/fmtlib/fmt/graphs/contributors) and
+[Releases](https://github.com/fmtlib/fmt/releases) for some of the
+names. Let us know if your contribution is not listed or mentioned
+incorrectly and we\'ll make it right.
+
+# Security Policy
+
+To report a security issue, please disclose it at [security
+advisory](https://github.com/fmtlib/fmt/security/advisories/new).
+
+This project is maintained by a team of volunteers on a
+reasonable-effort basis. As such, please give us at least *90* days to
+work on a fix before public exposure.
diff --git a/lib/fmt/VERSION b/lib/fmt/VERSION
new file mode 100644
index 000000000..6552bc745
--- /dev/null
+++ b/lib/fmt/VERSION
@@ -0,0 +1 @@
+11.0.2
\ No newline at end of file
diff --git a/lib/fmt/fmt/args.h b/lib/fmt/fmt/args.h
new file mode 100644
index 000000000..31a60e8fa
--- /dev/null
+++ b/lib/fmt/fmt/args.h
@@ -0,0 +1,228 @@
+// Formatting library for C++ - dynamic argument lists
+//
+// Copyright (c) 2012 - present, Victor Zverovich
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_ARGS_H_
+#define FMT_ARGS_H_
+
+#ifndef FMT_MODULE
+#  include <functional>  // std::reference_wrapper
+#  include <memory>      // std::unique_ptr
+#  include <vector>
+#endif
+
+#include "format.h"  // std_string_view
+
+FMT_BEGIN_NAMESPACE
+
+namespace detail {
+
+template  struct is_reference_wrapper : std::false_type {};
+template 
+struct is_reference_wrapper> : std::true_type {};
+
+template  auto unwrap(const T& v) -> const T& { return v; }
+template 
+auto unwrap(const std::reference_wrapper& v) -> const T& {
+  return static_cast(v);
+}
+
+// node is defined outside dynamic_arg_list to workaround a C2504 bug in MSVC
+// 2022 (v17.10.0).
+//
+// Workaround for clang's -Wweak-vtables. Unlike for regular classes, for
+// templates it doesn't complain about inability to deduce single translation
+// unit for placing vtable. So node is made a fake template.
+template  struct node {
+  virtual ~node() = default;
+  std::unique_ptr> next;
+};
+
+class dynamic_arg_list {
+  template  struct typed_node : node<> {
+    T value;
+
+    template 
+    FMT_CONSTEXPR typed_node(const Arg& arg) : value(arg) {}
+
+    template 
+    FMT_CONSTEXPR typed_node(const basic_string_view& arg)
+        : value(arg.data(), arg.size()) {}
+  };
+
+  std::unique_ptr> head_;
+
+ public:
+  template  auto push(const Arg& arg) -> const T& {
+    auto new_node = std::unique_ptr>(new typed_node(arg));
+    auto& value = new_node->value;
+    new_node->next = std::move(head_);
+    head_ = std::move(new_node);
+    return value;
+  }
+};
+}  // namespace detail
+
+/**
+ * A dynamic list of formatting arguments with storage.
+ *
+ * It can be implicitly converted into `fmt::basic_format_args` for passing
+ * into type-erased formatting functions such as `fmt::vformat`.
+ */
+template 
+class dynamic_format_arg_store
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+    // Workaround a GCC template argument substitution bug.
+    : public basic_format_args
+#endif
+{
+ private:
+  using char_type = typename Context::char_type;
+
+  template  struct need_copy {
+    static constexpr detail::type mapped_type =
+        detail::mapped_type_constant::value;
+
+    enum {
+      value = !(detail::is_reference_wrapper::value ||
+                std::is_same>::value ||
+                std::is_same>::value ||
+                (mapped_type != detail::type::cstring_type &&
+                 mapped_type != detail::type::string_type &&
+                 mapped_type != detail::type::custom_type))
+    };
+  };
+
+  template 
+  using stored_type = conditional_t<
+      std::is_convertible>::value &&
+          !detail::is_reference_wrapper::value,
+      std::basic_string, T>;
+
+  // Storage of basic_format_arg must be contiguous.
+  std::vector> data_;
+  std::vector> named_info_;
+
+  // Storage of arguments not fitting into basic_format_arg must grow
+  // without relocation because items in data_ refer to it.
+  detail::dynamic_arg_list dynamic_args_;
+
+  friend class basic_format_args;
+
+  auto get_types() const -> unsigned long long {
+    return detail::is_unpacked_bit | data_.size() |
+           (named_info_.empty()
+                ? 0ULL
+                : static_cast(detail::has_named_args_bit));
+  }
+
+  auto data() const -> const basic_format_arg* {
+    return named_info_.empty() ? data_.data() : data_.data() + 1;
+  }
+
+  template  void emplace_arg(const T& arg) {
+    data_.emplace_back(detail::make_arg(arg));
+  }
+
+  template 
+  void emplace_arg(const detail::named_arg& arg) {
+    if (named_info_.empty()) {
+      constexpr const detail::named_arg_info* zero_ptr{nullptr};
+      data_.insert(data_.begin(), {zero_ptr, 0});
+    }
+    data_.emplace_back(detail::make_arg(detail::unwrap(arg.value)));
+    auto pop_one = [](std::vector>* data) {
+      data->pop_back();
+    };
+    std::unique_ptr>, decltype(pop_one)>
+        guard{&data_, pop_one};
+    named_info_.push_back({arg.name, static_cast(data_.size() - 2u)});
+    data_[0].value_.named_args = {named_info_.data(), named_info_.size()};
+    guard.release();
+  }
+
+ public:
+  constexpr dynamic_format_arg_store() = default;
+
+  /**
+   * Adds an argument into the dynamic store for later passing to a formatting
+   * function.
+   *
+   * Note that custom types and string types (but not string views) are copied
+   * into the store dynamically allocating memory if necessary.
+   *
+   * **Example**:
+   *
+   *     fmt::dynamic_format_arg_store<fmt::format_context> store;
+   *     store.push_back(42);
+   *     store.push_back("abc");
+   *     store.push_back(1.5f);
+   *     std::string result = fmt::vformat("{} and {} and {}", store);
+   */
+  template  void push_back(const T& arg) {
+    if (detail::const_check(need_copy::value))
+      emplace_arg(dynamic_args_.push>(arg));
+    else
+      emplace_arg(detail::unwrap(arg));
+  }
+
+  /**
+   * Adds a reference to the argument into the dynamic store for later passing
+   * to a formatting function.
+   *
+   * **Example**:
+   *
+   *     fmt::dynamic_format_arg_store<fmt::format_context> store;
+   *     char band[] = "Rolling Stones";
+   *     store.push_back(std::cref(band));
+   *     band[9] = 'c'; // Changing band affects the output.
+   *     std::string result = fmt::vformat("{}", store);
+   *     // result == "Rolling Scones"
+   */
+  template  void push_back(std::reference_wrapper arg) {
+    static_assert(
+        need_copy::value,
+        "objects of built-in types and string views are always copied");
+    emplace_arg(arg.get());
+  }
+
+  /**
+   * Adds named argument into the dynamic store for later passing to a
+   * formatting function. `std::reference_wrapper` is supported to avoid
+   * copying of the argument. The name is always copied into the store.
+   */
+  template 
+  void push_back(const detail::named_arg& arg) {
+    const char_type* arg_name =
+        dynamic_args_.push>(arg.name).c_str();
+    if (detail::const_check(need_copy::value)) {
+      emplace_arg(
+          fmt::arg(arg_name, dynamic_args_.push>(arg.value)));
+    } else {
+      emplace_arg(fmt::arg(arg_name, arg.value));
+    }
+  }
+
+  /// Erase all elements from the store.
+  void clear() {
+    data_.clear();
+    named_info_.clear();
+    dynamic_args_ = detail::dynamic_arg_list();
+  }
+
+  /// Reserves space to store at least `new_cap` arguments including
+  /// `new_cap_named` named arguments.
+  void reserve(size_t new_cap, size_t new_cap_named) {
+    FMT_ASSERT(new_cap >= new_cap_named,
+               "Set of arguments includes set of named arguments");
+    data_.reserve(new_cap);
+    named_info_.reserve(new_cap_named);
+  }
+};
+
+FMT_END_NAMESPACE
+
+#endif  // FMT_ARGS_H_
diff --git a/lib/fmt/fmt/base.h b/lib/fmt/fmt/base.h
new file mode 100644
index 000000000..627649425
--- /dev/null
+++ b/lib/fmt/fmt/base.h
@@ -0,0 +1,3077 @@
+// Formatting library for C++ - the base API for char/UTF-8
+//
+// Copyright (c) 2012 - present, Victor Zverovich
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_BASE_H_
+#define FMT_BASE_H_
+
+#if defined(FMT_IMPORT_STD) && !defined(FMT_MODULE)
+#  define FMT_MODULE
+#endif
+
+#ifndef FMT_MODULE
+#  include <limits.h>  // CHAR_BIT
+#  include <stdio.h>   // FILE
+#  include <string.h>  // strlen
+
+// <cstddef> is also included transitively from <type_traits>.
+#  include <cstddef>      // std::byte
+#  include <type_traits>  // std::enable_if
+#endif
+
+// The fmt library version in the form major * 10000 + minor * 100 + patch.
+#define FMT_VERSION 110002
+
+// Detect compiler versions.
+#if defined(__clang__) && !defined(__ibmxl__)
+#  define FMT_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
+#else
+#  define FMT_CLANG_VERSION 0
+#endif
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+#  define FMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#else
+#  define FMT_GCC_VERSION 0
+#endif
+#if defined(__ICL)
+#  define FMT_ICC_VERSION __ICL
+#elif defined(__INTEL_COMPILER)
+#  define FMT_ICC_VERSION __INTEL_COMPILER
+#else
+#  define FMT_ICC_VERSION 0
+#endif
+#if defined(_MSC_VER)
+#  define FMT_MSC_VERSION _MSC_VER
+#else
+#  define FMT_MSC_VERSION 0
+#endif
+
+// Detect standard library versions.
+#ifdef _GLIBCXX_RELEASE
+#  define FMT_GLIBCXX_RELEASE _GLIBCXX_RELEASE
+#else
+#  define FMT_GLIBCXX_RELEASE 0
+#endif
+#ifdef _LIBCPP_VERSION
+#  define FMT_LIBCPP_VERSION _LIBCPP_VERSION
+#else
+#  define FMT_LIBCPP_VERSION 0
+#endif
+
+#ifdef _MSVC_LANG
+#  define FMT_CPLUSPLUS _MSVC_LANG
+#else
+#  define FMT_CPLUSPLUS __cplusplus
+#endif
+
+// Detect __has_*.
+#ifdef __has_feature
+#  define FMT_HAS_FEATURE(x) __has_feature(x)
+#else
+#  define FMT_HAS_FEATURE(x) 0
+#endif
+#ifdef __has_include
+#  define FMT_HAS_INCLUDE(x) __has_include(x)
+#else
+#  define FMT_HAS_INCLUDE(x) 0
+#endif
+#ifdef __has_cpp_attribute
+#  define FMT_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+#  define FMT_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+
+#define FMT_HAS_CPP14_ATTRIBUTE(attribute) \
+  (FMT_CPLUSPLUS >= 201402L && FMT_HAS_CPP_ATTRIBUTE(attribute))
+
+#define FMT_HAS_CPP17_ATTRIBUTE(attribute) \
+  (FMT_CPLUSPLUS >= 201703L && FMT_HAS_CPP_ATTRIBUTE(attribute))
+
+// Detect C++14 relaxed constexpr.
+#ifdef FMT_USE_CONSTEXPR
+// Use the provided definition.
+#elif FMT_GCC_VERSION >= 600 && FMT_CPLUSPLUS >= 201402L
+// GCC only allows throw in constexpr since version 6:
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67371.
+#  define FMT_USE_CONSTEXPR 1
+#elif FMT_ICC_VERSION
+#  define FMT_USE_CONSTEXPR 0  // https://github.com/fmtlib/fmt/issues/1628
+#elif FMT_HAS_FEATURE(cxx_relaxed_constexpr) || FMT_MSC_VERSION >= 1912
+#  define FMT_USE_CONSTEXPR 1
+#else
+#  define FMT_USE_CONSTEXPR 0
+#endif
+#if FMT_USE_CONSTEXPR
+#  define FMT_CONSTEXPR constexpr
+#else
+#  define FMT_CONSTEXPR
+#endif
+
+// Detect consteval, C++20 constexpr extensions and std::is_constant_evaluated.
+#if !defined(__cpp_lib_is_constant_evaluated)
+#  define FMT_USE_CONSTEVAL 0
+#elif FMT_CPLUSPLUS < 201709L
+#  define FMT_USE_CONSTEVAL 0
+#elif FMT_GLIBCXX_RELEASE && FMT_GLIBCXX_RELEASE < 10
+#  define FMT_USE_CONSTEVAL 0
+#elif FMT_LIBCPP_VERSION && FMT_LIBCPP_VERSION < 10000
+#  define FMT_USE_CONSTEVAL 0
+#elif defined(__apple_build_version__) && __apple_build_version__ < 14000029L
+#  define FMT_USE_CONSTEVAL 0  // consteval is broken in Apple clang < 14.
+#elif FMT_MSC_VERSION && FMT_MSC_VERSION < 1929
+#  define FMT_USE_CONSTEVAL 0  // consteval is broken in MSVC VS2019 < 16.10.
+#elif defined(__cpp_consteval)
+#  define FMT_USE_CONSTEVAL 1
+#elif FMT_GCC_VERSION >= 1002 || FMT_CLANG_VERSION >= 1101
+#  define FMT_USE_CONSTEVAL 1
+#else
+#  define FMT_USE_CONSTEVAL 0
+#endif
+#if FMT_USE_CONSTEVAL
+#  define FMT_CONSTEVAL consteval
+#  define FMT_CONSTEXPR20 constexpr
+#else
+#  define FMT_CONSTEVAL
+#  define FMT_CONSTEXPR20
+#endif
+
+#if defined(FMT_USE_NONTYPE_TEMPLATE_ARGS)
+// Use the provided definition.
+#elif defined(__NVCOMPILER)
+#  define FMT_USE_NONTYPE_TEMPLATE_ARGS 0
+#elif FMT_GCC_VERSION >= 903 && FMT_CPLUSPLUS >= 201709L
+#  define FMT_USE_NONTYPE_TEMPLATE_ARGS 1
+#elif defined(__cpp_nontype_template_args) && \
+    __cpp_nontype_template_args >= 201911L
+#  define FMT_USE_NONTYPE_TEMPLATE_ARGS 1
+#elif FMT_CLANG_VERSION >= 1200 && FMT_CPLUSPLUS >= 202002L
+#  define FMT_USE_NONTYPE_TEMPLATE_ARGS 1
+#else
+#  define FMT_USE_NONTYPE_TEMPLATE_ARGS 0
+#endif
+
+#ifdef FMT_USE_CONCEPTS
+// Use the provided definition.
+#elif defined(__cpp_concepts)
+#  define FMT_USE_CONCEPTS 1
+#else
+#  define FMT_USE_CONCEPTS 0
+#endif
+
+// Check if exceptions are disabled.
+#ifdef FMT_EXCEPTIONS
+// Use the provided definition.
+#elif defined(__GNUC__) && !defined(__EXCEPTIONS)
+#  define FMT_EXCEPTIONS 0
+#elif FMT_MSC_VERSION && !_HAS_EXCEPTIONS
+#  define FMT_EXCEPTIONS 0
+#else
+#  define FMT_EXCEPTIONS 1
+#endif
+#if FMT_EXCEPTIONS
+#  define FMT_TRY try
+#  define FMT_CATCH(x) catch (x)
+#else
+#  define FMT_TRY if (true)
+#  define FMT_CATCH(x) if (false)
+#endif
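+// Illustrative note (not part of the upstream header): with exceptions
+// disabled, FMT_TRY/FMT_CATCH degrade to plain `if` statements, so code like
+//
+//   FMT_TRY { buf.push_back('x'); }
+//   FMT_CATCH(...) { /* unreachable when FMT_EXCEPTIONS == 0 */ }
+//
+// still compiles unchanged under -fno-exceptions.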
+
+#if FMT_HAS_CPP17_ATTRIBUTE(fallthrough)
+#  define FMT_FALLTHROUGH [[fallthrough]]
+#elif defined(__clang__)
+#  define FMT_FALLTHROUGH [[clang::fallthrough]]
+#elif FMT_GCC_VERSION >= 700 && \
+    (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= 520)
+#  define FMT_FALLTHROUGH [[gnu::fallthrough]]
+#else
+#  define FMT_FALLTHROUGH
+#endif
+
+// Disable [[noreturn]] on MSVC/NVCC because of bogus unreachable code warnings.
+#if FMT_HAS_CPP_ATTRIBUTE(noreturn) && !FMT_MSC_VERSION && !defined(__NVCC__)
+#  define FMT_NORETURN [[noreturn]]
+#else
+#  define FMT_NORETURN
+#endif
+
+#ifndef FMT_NODISCARD
+#  if FMT_HAS_CPP17_ATTRIBUTE(nodiscard)
+#    define FMT_NODISCARD [[nodiscard]]
+#  else
+#    define FMT_NODISCARD
+#  endif
+#endif
+
+#ifdef FMT_DEPRECATED
+// Use the provided definition.
+#elif FMT_HAS_CPP14_ATTRIBUTE(deprecated)
+#  define FMT_DEPRECATED [[deprecated]]
+#else
+#  define FMT_DEPRECATED /* deprecated */
+#endif
+
+#ifdef FMT_INLINE
+// Use the provided definition.
+#elif FMT_GCC_VERSION || FMT_CLANG_VERSION
+#  define FMT_ALWAYS_INLINE inline __attribute__((always_inline))
+#else
+#  define FMT_ALWAYS_INLINE inline
+#endif
+// A version of FMT_INLINE to prevent code bloat in debug mode.
+#ifdef NDEBUG
+#  define FMT_INLINE FMT_ALWAYS_INLINE
+#else
+#  define FMT_INLINE inline
+#endif
+
+#if FMT_GCC_VERSION || FMT_CLANG_VERSION
+#  define FMT_VISIBILITY(value) __attribute__((visibility(value)))
+#else
+#  define FMT_VISIBILITY(value)
+#endif
+
+#ifndef FMT_GCC_PRAGMA
+// Workaround a _Pragma bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59884
+// and an nvhpc warning: https://github.com/fmtlib/fmt/pull/2582.
+#  if FMT_GCC_VERSION >= 504 && !defined(__NVCOMPILER)
+#    define FMT_GCC_PRAGMA(arg) _Pragma(arg)
+#  else
+#    define FMT_GCC_PRAGMA(arg)
+#  endif
+#endif
+
+// GCC < 5 requires this-> in decltype.
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 500
+#  define FMT_DECLTYPE_THIS this->
+#else
+#  define FMT_DECLTYPE_THIS
+#endif
+
+#if FMT_MSC_VERSION
+#  define FMT_MSC_WARNING(...) __pragma(warning(__VA_ARGS__))
+#  define FMT_UNCHECKED_ITERATOR(It) \
+    using _Unchecked_type = It  // Mark iterator as checked.
+#else
+#  define FMT_MSC_WARNING(...)
+#  define FMT_UNCHECKED_ITERATOR(It) using unchecked_type = It
+#endif
+
+#ifndef FMT_BEGIN_NAMESPACE
+#  define FMT_BEGIN_NAMESPACE \
+    namespace fmt {           \
+    inline namespace v11 {
+#  define FMT_END_NAMESPACE \
+    }                       \
+    }
+#endif
+
+#ifndef FMT_EXPORT
+#  define FMT_EXPORT
+#  define FMT_BEGIN_EXPORT
+#  define FMT_END_EXPORT
+#endif
+
+#if !defined(FMT_HEADER_ONLY) && defined(_WIN32)
+#  if defined(FMT_LIB_EXPORT)
+#    define FMT_API __declspec(dllexport)
+#  elif defined(FMT_SHARED)
+#    define FMT_API __declspec(dllimport)
+#  endif
+#elif defined(FMT_LIB_EXPORT) || defined(FMT_SHARED)
+#  define FMT_API FMT_VISIBILITY("default")
+#endif
+#ifndef FMT_API
+#  define FMT_API
+#endif
+
+#ifndef FMT_UNICODE
+#  define FMT_UNICODE 1
+#endif
+
+// Check if rtti is available.
+#ifndef FMT_USE_RTTI
+// __RTTI is for EDG compilers. _CPPRTTI is for MSVC.
+#  if defined(__GXX_RTTI) || FMT_HAS_FEATURE(cxx_rtti) || defined(_CPPRTTI) || \
+      defined(__INTEL_RTTI__) || defined(__RTTI)
+#    define FMT_USE_RTTI 1
+#  else
+#    define FMT_USE_RTTI 0
+#  endif
+#endif
+
+#define FMT_FWD(...) static_cast<decltype(__VA_ARGS__)&&>(__VA_ARGS__)
+
+// Enable minimal optimizations for more compact code in debug mode.
+FMT_GCC_PRAGMA("GCC push_options")
+#if !defined(__OPTIMIZE__) && !defined(__CUDACC__)
+FMT_GCC_PRAGMA("GCC optimize(\"Og\")")
+#endif
+
+FMT_BEGIN_NAMESPACE
+
+// Implementations of enable_if_t and other metafunctions for older systems.
+template <bool B, typename T = void>
+using enable_if_t = typename std::enable_if<B, T>::type;
+template <bool B, typename T, typename F>
+using conditional_t = typename std::conditional<B, T, F>::type;
+template <bool B> using bool_constant = std::integral_constant<bool, B>;
+template <typename T>
+using remove_reference_t = typename std::remove_reference<T>::type;
+template <typename T>
+using remove_const_t = typename std::remove_const<T>::type;
+template <typename T>
+using remove_cvref_t = typename std::remove_cv<remove_reference_t<T>>::type;
+template <typename T> struct type_identity {
+  using type = T;
+};
+template <typename T> using type_identity_t = typename type_identity<T>::type;
+template <typename T>
+using make_unsigned_t = typename std::make_unsigned<T>::type;
+template <typename T>
+using underlying_t = typename std::underlying_type<T>::type;
+
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 500
+// A workaround for gcc 4.8 to make void_t work in a SFINAE context.
+template <typename...> struct void_t_impl {
+  using type = void;
+};
+template <typename... T> using void_t = typename void_t_impl<T...>::type;
+#else
+template <typename...> using void_t = void;
+#endif
+
+struct monostate {
+  constexpr monostate() {}
+};
+
+// An enable_if helper to be used in template parameters which results in much
+// shorter symbols: https://godbolt.org/z/sWw4vP. Extra parentheses are needed
+// to workaround a bug in MSVC 2019 (see #1140 and #1186).
+#ifdef FMT_DOC
+#  define FMT_ENABLE_IF(...)
+#else
+#  define FMT_ENABLE_IF(...) fmt::enable_if_t<(__VA_ARGS__), int> = 0
+#endif
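+// Usage sketch (illustration only): FMT_ENABLE_IF goes in a template
+// parameter list to SFINAE-constrain an overload while keeping mangled
+// symbols short, e.g.
+//
+//   template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
+//   auto twice(T value) -> T { return value + value; }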
+
+// This is defined in base.h instead of format.h to avoid injecting in std.
+// It is a template to avoid undesirable implicit conversions to std::byte.
+#ifdef __cpp_lib_byte
+template <typename T, FMT_ENABLE_IF(std::is_same<T, std::byte>::value)>
+inline auto format_as(T b) -> unsigned char {
+  return static_cast<unsigned char>(b);
+}
+#endif
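+// Usage sketch (illustration only): user code can opt a type into formatting
+// by providing a `format_as` overload found via ADL, e.g.
+//
+//   namespace kernel {
+//   enum class error { ok = 0, timeout = 1 };
+//   auto format_as(error e) -> int { return static_cast<int>(e); }
+//   }  // namespace kernel
+//
+// after which fmt::format("{}", kernel::error::timeout) yields "1".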
+
+namespace detail {
+// Suppresses "unused variable" warnings with the method described in
+// https://herbsutter.com/2009/10/18/mailbag-shutting-up-compiler-warnings/.
+// (void)var does not work on many Intel compilers.
+template <typename... T> FMT_CONSTEXPR void ignore_unused(const T&...) {}
+
+constexpr auto is_constant_evaluated(bool default_value = false) noexcept
+    -> bool {
+// Workaround for incompatibility between libstdc++ consteval-based
+// std::is_constant_evaluated() implementation and clang-14:
+// https://github.com/fmtlib/fmt/issues/3247.
+#if FMT_CPLUSPLUS >= 202002L && FMT_GLIBCXX_RELEASE >= 12 && \
+    (FMT_CLANG_VERSION >= 1400 && FMT_CLANG_VERSION < 1500)
+  ignore_unused(default_value);
+  return __builtin_is_constant_evaluated();
+#elif defined(__cpp_lib_is_constant_evaluated)
+  ignore_unused(default_value);
+  return std::is_constant_evaluated();
+#else
+  return default_value;
+#endif
+}
+
+// Suppresses "conditional expression is constant" warnings.
+template <typename T> constexpr auto const_check(T value) -> T { return value; }
+
+FMT_NORETURN FMT_API void assert_fail(const char* file, int line,
+                                      const char* message);
+
+#if defined(FMT_ASSERT)
+// Use the provided definition.
+#elif defined(NDEBUG)
+// FMT_ASSERT is not empty to avoid -Wempty-body.
+#  define FMT_ASSERT(condition, message) \
+    fmt::detail::ignore_unused((condition), (message))
+#else
+#  define FMT_ASSERT(condition, message)                                    \
+    ((condition) /* void() fails with -Winvalid-constexpr on clang 4.0.1 */ \
+         ? (void)0                                                          \
+         : fmt::detail::assert_fail(__FILE__, __LINE__, (message)))
+#endif
+
+#ifdef FMT_USE_INT128
+// Do nothing.
+#elif defined(__SIZEOF_INT128__) && !defined(__NVCC__) && \
+    !(FMT_CLANG_VERSION && FMT_MSC_VERSION)
+#  define FMT_USE_INT128 1
+using int128_opt = __int128_t;  // An optional native 128-bit integer.
+using uint128_opt = __uint128_t;
+template <typename T> inline auto convert_for_visit(T value) -> T {
+  return value;
+}
+#else
+#  define FMT_USE_INT128 0
+#endif
+#if !FMT_USE_INT128
+enum class int128_opt {};
+enum class uint128_opt {};
+// Reduce template instantiations.
+template <typename T> auto convert_for_visit(T) -> monostate { return {}; }
+#endif
+
+// Casts a nonnegative integer to unsigned.
+template <typename Int>
+FMT_CONSTEXPR auto to_unsigned(Int value) -> make_unsigned_t<Int> {
+  FMT_ASSERT(std::is_unsigned<Int>::value || value >= 0, "negative value");
+  return static_cast<make_unsigned_t<Int>>(value);
+}
+
+// A heuristic to detect std::string and std::[experimental::]string_view.
+// It is mainly used to avoid dependency on <[experimental/]string_view>.
+template <typename T, typename Enable = void>
+struct is_std_string_like : std::false_type {};
+template <typename T>
+struct is_std_string_like<T, void_t<decltype(std::declval<T>().find_first_of(
+                                 typename T::value_type(), 0))>>
+    : std::is_convertible<decltype(std::declval<T>().data()),
+                          const typename T::value_type*> {};
+
+// Returns true iff the literal encoding is UTF-8.
+constexpr auto is_utf8_enabled() -> bool {
+  // Avoid an MSVC sign extension bug: https://github.com/fmtlib/fmt/pull/2297.
+  using uchar = unsigned char;
+  return sizeof("\u00A7") == 3 && uchar("\u00A7"[0]) == 0xC2 &&
+         uchar("\u00A7"[1]) == 0xA7;
+}
+constexpr auto use_utf8() -> bool {
+  return !FMT_MSC_VERSION || is_utf8_enabled();
+}
+
+static_assert(!FMT_UNICODE || use_utf8(),
+              "Unicode support requires compiling with /utf-8");
+
+template <typename Char> FMT_CONSTEXPR auto length(const Char* s) -> size_t {
+  size_t len = 0;
+  while (*s++) ++len;
+  return len;
+}
+
+template <typename Char>
+FMT_CONSTEXPR auto compare(const Char* s1, const Char* s2, std::size_t n)
+    -> int {
+  if (!is_constant_evaluated() && sizeof(Char) == 1) return memcmp(s1, s2, n);
+  for (; n != 0; ++s1, ++s2, --n) {
+    if (*s1 < *s2) return -1;
+    if (*s1 > *s2) return 1;
+  }
+  return 0;
+}
+
+namespace adl {
+using namespace std;
+
+template <typename Container>
+auto invoke_back_inserter()
+    -> decltype(back_inserter(std::declval<Container&>()));
+}  // namespace adl
+
+template <typename It, typename Enable = std::true_type>
+struct is_back_insert_iterator : std::false_type {};
+
+template <typename It>
+struct is_back_insert_iterator<
+    It, bool_constant<std::is_same<
+            decltype(adl::invoke_back_inserter<typename It::container_type>()),
+            It>::value>> : std::true_type {};
+
+// Extracts a reference to the container from *insert_iterator.
+template <typename OutputIt>
+inline auto get_container(OutputIt it) -> typename OutputIt::container_type& {
+  struct accessor : OutputIt {
+    accessor(OutputIt base) : OutputIt(base) {}
+    using OutputIt::container;
+  };
+  return *accessor(it).container;
+}
+}  // namespace detail
+
+// Checks whether T is a container with contiguous storage.
+template <typename Container> struct is_contiguous : std::false_type {};
+
+/**
+ * An implementation of `std::basic_string_view` for pre-C++17. It provides a
+ * subset of the API. `fmt::basic_string_view` is used for format strings even
+ * if `std::basic_string_view` is available to prevent issues when a library is
+ * compiled with a different `-std` option than the client code (which is not
+ * recommended).
+ */
+FMT_EXPORT
+template <typename Char> class basic_string_view {
+ private:
+  const Char* data_;
+  size_t size_;
+
+ public:
+  using value_type = Char;
+  using iterator = const Char*;
+
+  constexpr basic_string_view() noexcept : data_(nullptr), size_(0) {}
+
+  /// Constructs a string reference object from a C string and a size.
+  constexpr basic_string_view(const Char* s, size_t count) noexcept
+      : data_(s), size_(count) {}
+
+  constexpr basic_string_view(std::nullptr_t) = delete;
+
+  /// Constructs a string reference object from a C string.
+  FMT_CONSTEXPR20
+  basic_string_view(const Char* s)
+      : data_(s),
+        size_(detail::const_check(std::is_same<Char, char>::value &&
+                                  !detail::is_constant_evaluated(false))
+                  ? strlen(reinterpret_cast<const char*>(s))
+                  : detail::length(s)) {}
+
+  /// Constructs a string reference from a `std::basic_string` or a
+  /// `std::basic_string_view` object.
+  template <typename S,
+            FMT_ENABLE_IF(detail::is_std_string_like<S>::value&& std::is_same<
+                          typename S::value_type, Char>::value)>
+  FMT_CONSTEXPR basic_string_view(const S& s) noexcept
+      : data_(s.data()), size_(s.size()) {}
+
+  /// Returns a pointer to the string data.
+  constexpr auto data() const noexcept -> const Char* { return data_; }
+
+  /// Returns the string size.
+  constexpr auto size() const noexcept -> size_t { return size_; }
+
+  constexpr auto begin() const noexcept -> iterator { return data_; }
+  constexpr auto end() const noexcept -> iterator { return data_ + size_; }
+
+  constexpr auto operator[](size_t pos) const noexcept -> const Char& {
+    return data_[pos];
+  }
+
+  FMT_CONSTEXPR void remove_prefix(size_t n) noexcept {
+    data_ += n;
+    size_ -= n;
+  }
+
+  FMT_CONSTEXPR auto starts_with(basic_string_view sv) const noexcept
+      -> bool {
+    return size_ >= sv.size_ && detail::compare(data_, sv.data_, sv.size_) == 0;
+  }
+  FMT_CONSTEXPR auto starts_with(Char c) const noexcept -> bool {
+    return size_ >= 1 && *data_ == c;
+  }
+  FMT_CONSTEXPR auto starts_with(const Char* s) const -> bool {
+    return starts_with(basic_string_view(s));
+  }
+
+  // Lexicographically compare this string reference to other.
+  FMT_CONSTEXPR auto compare(basic_string_view other) const -> int {
+    size_t str_size = size_ < other.size_ ? size_ : other.size_;
+    int result = detail::compare(data_, other.data_, str_size);
+    if (result == 0)
+      result = size_ == other.size_ ? 0 : (size_ < other.size_ ? -1 : 1);
+    return result;
+  }
+
+  FMT_CONSTEXPR friend auto operator==(basic_string_view lhs,
+                                       basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) == 0;
+  }
+  friend auto operator!=(basic_string_view lhs, basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) != 0;
+  }
+  friend auto operator<(basic_string_view lhs, basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) < 0;
+  }
+  friend auto operator<=(basic_string_view lhs, basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) <= 0;
+  }
+  friend auto operator>(basic_string_view lhs, basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) > 0;
+  }
+  friend auto operator>=(basic_string_view lhs, basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) >= 0;
+  }
+};
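+// Usage sketch (illustration only): fmt::basic_string_view is a lightweight
+// non-owning view over character data, e.g.
+//
+//   fmt::basic_string_view<char> sv("format");
+//   bool p = sv.starts_with("for");  // true; no allocation or copy involved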
+
+FMT_EXPORT
+using string_view = basic_string_view<char>;
+
+/// Specifies if `T` is a character type. Can be specialized by users.
+FMT_EXPORT
+template <typename T> struct is_char : std::false_type {};
+template <> struct is_char<char> : std::true_type {};
+
+namespace detail {
+
+// Constructs fmt::basic_string_view from types implicitly convertible
+// to it, deducing Char. Explicitly convertible types such as the ones returned
+// from FMT_STRING are intentionally excluded.
+template <typename Char, FMT_ENABLE_IF(is_char<Char>::value)>
+constexpr auto to_string_view(const Char* s) -> basic_string_view<Char> {
+  return s;
+}
+template <typename T, FMT_ENABLE_IF(is_std_string_like<T>::value)>
+constexpr auto to_string_view(const T& s)
+    -> basic_string_view<typename T::value_type> {
+  return s;
+}
+template <typename Char>
+constexpr auto to_string_view(basic_string_view<Char> s)
+    -> basic_string_view<Char> {
+  return s;
+}
+
+template <typename T, typename Enable = void>
+struct has_to_string_view : std::false_type {};
+// detail:: is intentional since to_string_view is not an extension point.
+template <typename T>
+struct has_to_string_view<
+    T, void_t<decltype(detail::to_string_view(std::declval<T>()))>>
+    : std::true_type {};
+
+template <typename Char, Char... C> struct string_literal {
+  static constexpr Char value[sizeof...(C)] = {C...};
+  constexpr operator basic_string_view<Char>() const {
+    return {value, sizeof...(C)};
+  }
+};
+#if FMT_CPLUSPLUS < 201703L
+template <typename Char, Char... C>
+constexpr Char string_literal<Char, C...>::value[sizeof...(C)];
+#endif
+
+enum class type {
+  none_type,
+  // Integer types should go first,
+  int_type,
+  uint_type,
+  long_long_type,
+  ulong_long_type,
+  int128_type,
+  uint128_type,
+  bool_type,
+  char_type,
+  last_integer_type = char_type,
+  // followed by floating-point types.
+  float_type,
+  double_type,
+  long_double_type,
+  last_numeric_type = long_double_type,
+  cstring_type,
+  string_type,
+  pointer_type,
+  custom_type
+};
+
+// Maps core type T to the corresponding type enum constant.
+template <typename T, typename Char>
+struct type_constant : std::integral_constant<type, type::custom_type> {};
+
+#define FMT_TYPE_CONSTANT(Type, constant) \
+  template <typename Char>                \
+  struct type_constant<Type, Char>        \
+      : std::integral_constant<type, type::constant> {}
+
+FMT_TYPE_CONSTANT(int, int_type);
+FMT_TYPE_CONSTANT(unsigned, uint_type);
+FMT_TYPE_CONSTANT(long long, long_long_type);
+FMT_TYPE_CONSTANT(unsigned long long, ulong_long_type);
+FMT_TYPE_CONSTANT(int128_opt, int128_type);
+FMT_TYPE_CONSTANT(uint128_opt, uint128_type);
+FMT_TYPE_CONSTANT(bool, bool_type);
+FMT_TYPE_CONSTANT(Char, char_type);
+FMT_TYPE_CONSTANT(float, float_type);
+FMT_TYPE_CONSTANT(double, double_type);
+FMT_TYPE_CONSTANT(long double, long_double_type);
+FMT_TYPE_CONSTANT(const Char*, cstring_type);
+FMT_TYPE_CONSTANT(basic_string_view<Char>, string_type);
+FMT_TYPE_CONSTANT(const void*, pointer_type);
+
+constexpr auto is_integral_type(type t) -> bool {
+  return t > type::none_type && t <= type::last_integer_type;
+}
+constexpr auto is_arithmetic_type(type t) -> bool {
+  return t > type::none_type && t <= type::last_numeric_type;
+}
+
+constexpr auto set(type rhs) -> int { return 1 << static_cast<int>(rhs); }
+constexpr auto in(type t, int set) -> bool {
+  return ((set >> static_cast<int>(t)) & 1) != 0;
+}
+
+// Bitsets of types.
+enum {
+  sint_set =
+      set(type::int_type) | set(type::long_long_type) | set(type::int128_type),
+  uint_set = set(type::uint_type) | set(type::ulong_long_type) |
+             set(type::uint128_type),
+  bool_set = set(type::bool_type),
+  char_set = set(type::char_type),
+  float_set = set(type::float_type) | set(type::double_type) |
+              set(type::long_double_type),
+  string_set = set(type::string_type),
+  cstring_set = set(type::cstring_type),
+  pointer_set = set(type::pointer_type)
+};
+}  // namespace detail
+
+/// Reports a format error at compile time or, via a `format_error` exception,
+/// at runtime.
+// This function is intentionally not constexpr to give a compile-time error.
+FMT_NORETURN FMT_API void report_error(const char* message);
+
+FMT_DEPRECATED FMT_NORETURN inline void throw_format_error(
+    const char* message) {
+  report_error(message);
+}
+
+/// String's character (code unit) type.
+template <typename S,
+          typename V = decltype(detail::to_string_view(std::declval<S>()))>
+using char_t = typename V::value_type;
+
+/**
+ * Parsing context consisting of a format string range being parsed and an
+ * argument counter for automatic indexing.
+ * You can use the `format_parse_context` type alias for `char` instead.
+ */
+FMT_EXPORT
+template <typename Char> class basic_format_parse_context {
+ private:
+  basic_string_view<Char> format_str_;
+  int next_arg_id_;
+
+  FMT_CONSTEXPR void do_check_arg_id(int id);
+
+ public:
+  using char_type = Char;
+  using iterator = const Char*;
+
+  explicit constexpr basic_format_parse_context(
+      basic_string_view<Char> format_str, int next_arg_id = 0)
+      : format_str_(format_str), next_arg_id_(next_arg_id) {}
+
+  /// Returns an iterator to the beginning of the format string range being
+  /// parsed.
+  constexpr auto begin() const noexcept -> iterator {
+    return format_str_.begin();
+  }
+
+  /// Returns an iterator past the end of the format string range being parsed.
+  constexpr auto end() const noexcept -> iterator { return format_str_.end(); }
+
+  /// Advances the begin iterator to `it`.
+  FMT_CONSTEXPR void advance_to(iterator it) {
+    format_str_.remove_prefix(detail::to_unsigned(it - begin()));
+  }
+
+  /// Reports an error if using the manual argument indexing; otherwise returns
+  /// the next argument index and switches to the automatic indexing.
+  FMT_CONSTEXPR auto next_arg_id() -> int {
+    if (next_arg_id_ < 0) {
+      report_error("cannot switch from manual to automatic argument indexing");
+      return 0;
+    }
+    int id = next_arg_id_++;
+    do_check_arg_id(id);
+    return id;
+  }
+
+  /// Reports an error if using the automatic argument indexing; otherwise
+  /// switches to the manual indexing.
+  FMT_CONSTEXPR void check_arg_id(int id) {
+    if (next_arg_id_ > 0) {
+      report_error("cannot switch from automatic to manual argument indexing");
+      return;
+    }
+    next_arg_id_ = -1;
+    do_check_arg_id(id);
+  }
+  FMT_CONSTEXPR void check_arg_id(basic_string_view<Char>) {
+    next_arg_id_ = -1;
+  }
+  FMT_CONSTEXPR void check_dynamic_spec(int arg_id);
+};
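+// Usage sketch (illustration only, simplified from the documented formatter
+// extension API): a custom formatter's parse() receives this context and
+// returns an iterator past the consumed format specs, e.g.
+//
+//   struct my_type { int value; };            // hypothetical user type
+//   template <> struct fmt::formatter<my_type> {
+//     constexpr auto parse(fmt::format_parse_context& ctx) {
+//       return ctx.begin();  // accept only "{}", no custom specs
+//     }
+//     auto format(const my_type& t, fmt::format_context& ctx) const {
+//       return fmt::format_to(ctx.out(), "{}", t.value);  // full library API
+//     }
+//   };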
+
+FMT_EXPORT
+using format_parse_context = basic_format_parse_context<char>;
+
+namespace detail {
+// A parse context with extra data used only in compile-time checks.
+template <typename Char>
+class compile_parse_context : public basic_format_parse_context<Char> {
+ private:
+  int num_args_;
+  const type* types_;
+  using base = basic_format_parse_context<Char>;
+
+ public:
+  explicit FMT_CONSTEXPR compile_parse_context(
+      basic_string_view<Char> format_str, int num_args, const type* types,
+      int next_arg_id = 0)
+      : base(format_str, next_arg_id), num_args_(num_args), types_(types) {}
+
+  constexpr auto num_args() const -> int { return num_args_; }
+  constexpr auto arg_type(int id) const -> type { return types_[id]; }
+
+  FMT_CONSTEXPR auto next_arg_id() -> int {
+    int id = base::next_arg_id();
+    if (id >= num_args_) report_error("argument not found");
+    return id;
+  }
+
+  FMT_CONSTEXPR void check_arg_id(int id) {
+    base::check_arg_id(id);
+    if (id >= num_args_) report_error("argument not found");
+  }
+  using base::check_arg_id;
+
+  FMT_CONSTEXPR void check_dynamic_spec(int arg_id) {
+    detail::ignore_unused(arg_id);
+    if (arg_id < num_args_ && types_ && !is_integral_type(types_[arg_id]))
+      report_error("width/precision is not integer");
+  }
+};
+
+/// A contiguous memory buffer with an optional growing ability. It is an
+/// internal class and shouldn't be used directly, only via `memory_buffer`.
+template <typename T> class buffer {
+ private:
+  T* ptr_;
+  size_t size_;
+  size_t capacity_;
+
+  using grow_fun = void (*)(buffer& buf, size_t capacity);
+  grow_fun grow_;
+
+ protected:
+  // Don't initialize ptr_ since it is not accessed to save a few cycles.
+  FMT_MSC_WARNING(suppress : 26495)
+  FMT_CONSTEXPR20 buffer(grow_fun grow, size_t sz) noexcept
+      : size_(sz), capacity_(sz), grow_(grow) {}
+
+  constexpr buffer(grow_fun grow, T* p = nullptr, size_t sz = 0,
+                   size_t cap = 0) noexcept
+      : ptr_(p), size_(sz), capacity_(cap), grow_(grow) {}
+
+  FMT_CONSTEXPR20 ~buffer() = default;
+  buffer(buffer&&) = default;
+
+  /// Sets the buffer data and capacity.
+  FMT_CONSTEXPR void set(T* buf_data, size_t buf_capacity) noexcept {
+    ptr_ = buf_data;
+    capacity_ = buf_capacity;
+  }
+
+ public:
+  using value_type = T;
+  using const_reference = const T&;
+
+  buffer(const buffer&) = delete;
+  void operator=(const buffer&) = delete;
+
+  auto begin() noexcept -> T* { return ptr_; }
+  auto end() noexcept -> T* { return ptr_ + size_; }
+
+  auto begin() const noexcept -> const T* { return ptr_; }
+  auto end() const noexcept -> const T* { return ptr_ + size_; }
+
+  /// Returns the size of this buffer.
+  constexpr auto size() const noexcept -> size_t { return size_; }
+
+  /// Returns the capacity of this buffer.
+  constexpr auto capacity() const noexcept -> size_t { return capacity_; }
+
+  /// Returns a pointer to the buffer data (not null-terminated).
+  FMT_CONSTEXPR auto data() noexcept -> T* { return ptr_; }
+  FMT_CONSTEXPR auto data() const noexcept -> const T* { return ptr_; }
+
+  /// Clears this buffer.
+  void clear() { size_ = 0; }
+
+  // Tries resizing the buffer to contain `count` elements. If T is a POD type
+  // the new elements may not be initialized.
+  FMT_CONSTEXPR void try_resize(size_t count) {
+    try_reserve(count);
+    size_ = count <= capacity_ ? count : capacity_;
+  }
+
+  // Tries increasing the buffer capacity to `new_capacity`. It can increase the
+  // capacity by a smaller amount than requested but guarantees there is space
+  // for at least one additional element either by increasing the capacity or by
+  // flushing the buffer if it is full.
+  FMT_CONSTEXPR void try_reserve(size_t new_capacity) {
+    if (new_capacity > capacity_) grow_(*this, new_capacity);
+  }
+
+  FMT_CONSTEXPR void push_back(const T& value) {
+    try_reserve(size_ + 1);
+    ptr_[size_++] = value;
+  }
+
+  /// Appends data to the end of the buffer.
+  template <typename U> void append(const U* begin, const U* end) {
+    while (begin != end) {
+      auto count = to_unsigned(end - begin);
+      try_reserve(size_ + count);
+      auto free_cap = capacity_ - size_;
+      if (free_cap < count) count = free_cap;
+      // A loop is faster than memcpy on small sizes.
+      T* out = ptr_ + size_;
+      for (size_t i = 0; i < count; ++i) out[i] = begin[i];
+      size_ += count;
+      begin += count;
+    }
+  }
+
+  template <typename Idx> FMT_CONSTEXPR auto operator[](Idx index) -> T& {
+    return ptr_[index];
+  }
+  template <typename Idx>
+  FMT_CONSTEXPR auto operator[](Idx index) const -> const T& {
+    return ptr_[index];
+  }
+};
+
+struct buffer_traits {
+  explicit buffer_traits(size_t) {}
+  auto count() const -> size_t { return 0; }
+  auto limit(size_t size) -> size_t { return size; }
+};
+
+class fixed_buffer_traits {
+ private:
+  size_t count_ = 0;
+  size_t limit_;
+
+ public:
+  explicit fixed_buffer_traits(size_t limit) : limit_(limit) {}
+  auto count() const -> size_t { return count_; }
+  auto limit(size_t size) -> size_t {
+    size_t n = limit_ > count_ ? limit_ - count_ : 0;
+    count_ += size;
+    return size < n ? size : n;
+  }
+};
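+// Worked example (illustration only): fixed_buffer_traits caps total output,
+// which is what the n-limited APIs (e.g. format_to_n) build on. With
+// limit_ == 5 and count_ == 3, a call to limit(4) computes n = 5 - 3 = 2,
+// advances count_ to 7 and returns 2, so only 2 more code units are written.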
+
+// A buffer that writes to an output iterator when flushed.
+template 
+class iterator_buffer : public Traits, public buffer {
+ private:
+  OutputIt out_;
+  enum { buffer_size = 256 };
+  T data_[buffer_size];
+
+  static FMT_CONSTEXPR void grow(buffer& buf, size_t) {
+    if (buf.size() == buffer_size) static_cast(buf).flush();
+  }
+
+  void flush() {
+    auto size = this->size();
+    this->clear();
+    const T* begin = data_;
+    const T* end = begin + this->limit(size);
+    while (begin != end) *out_++ = *begin++;
+  }
+
+ public:
+  explicit iterator_buffer(OutputIt out, size_t n = buffer_size)
+      : Traits(n), buffer(grow, data_, 0, buffer_size), out_(out) {}
+  iterator_buffer(iterator_buffer&& other) noexcept
+      : Traits(other),
+        buffer(grow, data_, 0, buffer_size),
+        out_(other.out_) {}
+  ~iterator_buffer() {
+    // Don't crash if flush fails during unwinding.
+    FMT_TRY { flush(); }
+    FMT_CATCH(...) {}
+  }
+
+  auto out() -> OutputIt {
+    flush();
+    return out_;
+  }
+  auto count() const -> size_t { return Traits::count() + this->size(); }
+};
+
+template 
+class iterator_buffer : public fixed_buffer_traits,
+                                                    public buffer {
+ private:
+  T* out_;
+  enum { buffer_size = 256 };
+  T data_[buffer_size];
+
+  static FMT_CONSTEXPR void grow(buffer& buf, size_t) {
+    if (buf.size() == buf.capacity())
+      static_cast(buf).flush();
+  }
+
+  void flush() {
+    size_t n = this->limit(this->size());
+    if (this->data() == out_) {
+      out_ += n;
+      this->set(data_, buffer_size);
+    }
+    this->clear();
+  }
+
+ public:
+  explicit iterator_buffer(T* out, size_t n = buffer_size)
+      : fixed_buffer_traits(n), buffer(grow, out, 0, n), out_(out) {}
+  iterator_buffer(iterator_buffer&& other) noexcept
+      : fixed_buffer_traits(other),
+        buffer(static_cast(other)),
+        out_(other.out_) {
+    if (this->data() != out_) {
+      this->set(data_, buffer_size);
+      this->clear();
+    }
+  }
+  ~iterator_buffer() { flush(); }
+
+  auto out() -> T* {
+    flush();
+    return out_;
+  }
+  auto count() const -> size_t {
+    return fixed_buffer_traits::count() + this->size();
+  }
+};
+
+template  class iterator_buffer : public buffer {
+ public:
+  explicit iterator_buffer(T* out, size_t = 0)
+      : buffer([](buffer&, size_t) {}, out, 0, ~size_t()) {}
+
+  auto out() -> T* { return &*this->end(); }
+};
+
+// A buffer that writes to a container with the contiguous storage.
+template 
+class iterator_buffer<
+    OutputIt,
+    enable_if_t::value &&
+                    is_contiguous::value,
+                typename OutputIt::container_type::value_type>>
+    : public buffer {
+ private:
+  using container_type = typename OutputIt::container_type;
+  using value_type = typename container_type::value_type;
+  container_type& container_;
+
+  static FMT_CONSTEXPR void grow(buffer& buf, size_t capacity) {
+    auto& self = static_cast(buf);
+    self.container_.resize(capacity);
+    self.set(&self.container_[0], capacity);
+  }
+
+ public:
+  explicit iterator_buffer(container_type& c)
+      : buffer(grow, c.size()), container_(c) {}
+  explicit iterator_buffer(OutputIt out, size_t = 0)
+      : iterator_buffer(get_container(out)) {}
+
+  auto out() -> OutputIt { return back_inserter(container_); }
+};
+
+// A buffer that counts the number of code units written discarding the output.
+template  class counting_buffer : public buffer {
+ private:
+  enum { buffer_size = 256 };
+  T data_[buffer_size];
+  size_t count_ = 0;
+
+  static FMT_CONSTEXPR void grow(buffer& buf, size_t) {
+    if (buf.size() != buffer_size) return;
+    static_cast(buf).count_ += buf.size();
+    buf.clear();
+  }
+
+ public:
+  counting_buffer() : buffer(grow, data_, 0, buffer_size) {}
+
+  auto count() -> size_t { return count_ + this->size(); }
+};
+}  // namespace detail
+
+template <typename Char>
+FMT_CONSTEXPR void basic_format_parse_context<Char>::do_check_arg_id(int id) {
+  // Argument id is only checked at compile-time during parsing because
+  // formatting has its own validation.
+  if (detail::is_constant_evaluated() &&
+      (!FMT_GCC_VERSION || FMT_GCC_VERSION >= 1200)) {
+    using context = detail::compile_parse_context<Char>;
+    if (id >= static_cast<context*>(this)->num_args())
+      report_error("argument not found");
+  }
+}
+
+template <typename Char>
+FMT_CONSTEXPR void basic_format_parse_context<Char>::check_dynamic_spec(
+    int arg_id) {
+  if (detail::is_constant_evaluated() &&
+      (!FMT_GCC_VERSION || FMT_GCC_VERSION >= 1200)) {
+    using context = detail::compile_parse_context<Char>;
+    static_cast<context*>(this)->check_dynamic_spec(arg_id);
+  }
+}
+
+FMT_EXPORT template <typename Context> class basic_format_arg;
+FMT_EXPORT template <typename Context> class basic_format_args;
+FMT_EXPORT template <typename Context> class dynamic_format_arg_store;
+
+// A formatter for objects of type T.
+FMT_EXPORT
+template <typename T, typename Char = char, typename Enable = void>
+struct formatter {
+  // A deleted default constructor indicates a disabled formatter.
+  formatter() = delete;
+};
+
+// Specifies if T has an enabled formatter specialization. A type can be
+// formattable even if it doesn't have a formatter e.g. via a conversion.
+template <typename T, typename Context>
+using has_formatter =
+    std::is_constructible<typename Context::template formatter_type<T>>;
+
+// An output iterator that appends to a buffer. It is used instead of
+// back_insert_iterator to reduce symbol sizes and avoid  dependency.
+template <typename T> class basic_appender {
+ private:
+  detail::buffer<T>* buffer_;
+
+  friend auto get_container(basic_appender app) -> detail::buffer<T>& {
+    return *app.buffer_;
+  }
+
+ public:
+  using iterator_category = int;
+  using value_type = T;
+  using difference_type = ptrdiff_t;
+  using pointer = T*;
+  using reference = T&;
+  using container_type = detail::buffer<T>;
+  FMT_UNCHECKED_ITERATOR(basic_appender);
+
+  FMT_CONSTEXPR basic_appender(detail::buffer& buf) : buffer_(&buf) {}
+
+  auto operator=(T c) -> basic_appender& {
+    buffer_->push_back(c);
+    return *this;
+  }
+  auto operator*() -> basic_appender& { return *this; }
+  auto operator++() -> basic_appender& { return *this; }
+  auto operator++(int) -> basic_appender { return *this; }
+};
+
+using appender = basic_appender<char>;
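+// Usage sketch (illustration only): an appender writes straight into a
+// detail::buffer, e.g. together with fmt::memory_buffer (from fmt/format.h):
+//
+//   auto buf = fmt::memory_buffer();
+//   fmt::format_to(fmt::appender(buf), "{}", 42);  // buf now holds "42"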
+
+namespace detail {
+template 
+struct is_back_insert_iterator> : std::true_type {};
+
+template 
+struct locking : std::true_type {};
+template 
+struct locking>::nonlocking>>
+    : std::false_type {};
+
+template  FMT_CONSTEXPR inline auto is_locking() -> bool {
+  return locking::value;
+}
+template 
+FMT_CONSTEXPR inline auto is_locking() -> bool {
+  return locking::value || is_locking();
+}
+
+// An optimized version of std::copy with the output value type (T).
+template ::value)>
+auto copy(InputIt begin, InputIt end, OutputIt out) -> OutputIt {
+  get_container(out).append(begin, end);
+  return out;
+}
+
+template ::value)>
+FMT_CONSTEXPR auto copy(InputIt begin, InputIt end, OutputIt out) -> OutputIt {
+  while (begin != end) *out++ = static_cast(*begin++);
+  return out;
+}
+
+template 
+FMT_CONSTEXPR auto copy(basic_string_view s, OutputIt out) -> OutputIt {
+  return copy(s.begin(), s.end(), out);
+}
+
+template 
+constexpr auto has_const_formatter_impl(T*)
+    -> decltype(typename Context::template formatter_type().format(
+                    std::declval(), std::declval()),
+                true) {
+  return true;
+}
+template 
+constexpr auto has_const_formatter_impl(...) -> bool {
+  return false;
+}
+template 
+constexpr auto has_const_formatter() -> bool {
+  return has_const_formatter_impl(static_cast(nullptr));
+}
+
+template 
+struct is_buffer_appender : std::false_type {};
+template 
+struct is_buffer_appender<
+    It, bool_constant<
+            is_back_insert_iterator::value &&
+            std::is_base_of,
+                            typename It::container_type>::value>>
+    : std::true_type {};
+
+// Maps an output iterator to a buffer.
+template ::value)>
+auto get_buffer(OutputIt out) -> iterator_buffer {
+  return iterator_buffer(out);
+}
+template ::value)>
+auto get_buffer(OutputIt out) -> buffer& {
+  return get_container(out);
+}
+
+template 
+auto get_iterator(Buf& buf, OutputIt) -> decltype(buf.out()) {
+  return buf.out();
+}
+template 
+auto get_iterator(buffer&, OutputIt out) -> OutputIt {
+  return out;
+}
+
+struct view {};
+
+template  struct named_arg : view {
+  const Char* name;
+  const T& value;
+  named_arg(const Char* n, const T& v) : name(n), value(v) {}
+};
+
+template  struct named_arg_info {
+  const Char* name;
+  int id;
+};
+
+template  struct is_named_arg : std::false_type {};
+template  struct is_statically_named_arg : std::false_type {};
+
+template 
+struct is_named_arg> : std::true_type {};
+
+template  constexpr auto count() -> size_t { return B ? 1 : 0; }
+template  constexpr auto count() -> size_t {
+  return (B1 ? 1 : 0) + count();
+}
+
+template  constexpr auto count_named_args() -> size_t {
+  return count::value...>();
+}
+
+template 
+constexpr auto count_statically_named_args() -> size_t {
+  return count::value...>();
+}
+
+struct unformattable {};
+struct unformattable_char : unformattable {};
+struct unformattable_pointer : unformattable {};
+
+template  struct string_value {
+  const Char* data;
+  size_t size;
+};
+
+template  struct named_arg_value {
+  const named_arg_info* data;
+  size_t size;
+};
+
+template  struct custom_value {
+  using parse_context = typename Context::parse_context_type;
+  void* value;
+  void (*format)(void* arg, parse_context& parse_ctx, Context& ctx);
+};
+
+// A formatting argument value.
+template  class value {
+ public:
+  using char_type = typename Context::char_type;
+
+  union {
+    monostate no_value;
+    int int_value;
+    unsigned uint_value;
+    long long long_long_value;
+    unsigned long long ulong_long_value;
+    int128_opt int128_value;
+    uint128_opt uint128_value;
+    bool bool_value;
+    char_type char_value;
+    float float_value;
+    double double_value;
+    long double long_double_value;
+    const void* pointer;
+    string_value string;
+    custom_value custom;
+    named_arg_value named_args;
+  };
+
+  constexpr FMT_ALWAYS_INLINE value() : no_value() {}
+  constexpr FMT_ALWAYS_INLINE value(int val) : int_value(val) {}
+  constexpr FMT_ALWAYS_INLINE value(unsigned val) : uint_value(val) {}
+  constexpr FMT_ALWAYS_INLINE value(long long val) : long_long_value(val) {}
+  constexpr FMT_ALWAYS_INLINE value(unsigned long long val)
+      : ulong_long_value(val) {}
+  FMT_ALWAYS_INLINE value(int128_opt val) : int128_value(val) {}
+  FMT_ALWAYS_INLINE value(uint128_opt val) : uint128_value(val) {}
+  constexpr FMT_ALWAYS_INLINE value(float val) : float_value(val) {}
+  constexpr FMT_ALWAYS_INLINE value(double val) : double_value(val) {}
+  FMT_ALWAYS_INLINE value(long double val) : long_double_value(val) {}
+  constexpr FMT_ALWAYS_INLINE value(bool val) : bool_value(val) {}
+  constexpr FMT_ALWAYS_INLINE value(char_type val) : char_value(val) {}
+  FMT_CONSTEXPR FMT_ALWAYS_INLINE value(const char_type* val) {
+    string.data = val;
+    if (is_constant_evaluated()) string.size = {};
+  }
+  FMT_CONSTEXPR FMT_ALWAYS_INLINE value(basic_string_view val) {
+    string.data = val.data();
+    string.size = val.size();
+  }
+  FMT_ALWAYS_INLINE value(const void* val) : pointer(val) {}
+  FMT_ALWAYS_INLINE value(const named_arg_info* args, size_t size)
+      : named_args{args, size} {}
+
+  template  FMT_CONSTEXPR20 FMT_ALWAYS_INLINE value(T& val) {
+    using value_type = remove_const_t;
+    // T may overload operator& e.g. std::vector::reference in libc++.
+#if defined(__cpp_if_constexpr)
+    if constexpr (std::is_same::value)
+      custom.value = const_cast(&val);
+#endif
+    if (!is_constant_evaluated())
+      custom.value = const_cast(&reinterpret_cast(val));
+    // Get the formatter type through the context to allow different contexts
+    // have different extension points, e.g. `formatter` for `format` and
+    // `printf_formatter` for `printf`.
+    custom.format = format_custom_arg<
+        value_type, typename Context::template formatter_type>;
+  }
+  value(unformattable);
+  value(unformattable_char);
+  value(unformattable_pointer);
+
+ private:
+  // Formats an argument of a custom type, such as a user-defined class.
+  template 
+  static void format_custom_arg(void* arg,
+                                typename Context::parse_context_type& parse_ctx,
+                                Context& ctx) {
+    auto f = Formatter();
+    parse_ctx.advance_to(f.parse(parse_ctx));
+    using qualified_type =
+        conditional_t(), const T, T>;
+    // format must be const for compatibility with std::format and compilation.
+    const auto& cf = f;
+    ctx.advance_to(cf.format(*static_cast(arg), ctx));
+  }
+};
+
+// To minimize the number of types we need to deal with, long is translated
+// either to int or to long long depending on its size.
+enum { long_short = sizeof(long) == sizeof(int) };
+using long_type = conditional_t;
+using ulong_type = conditional_t;
+
+template  struct format_as_result {
+  template ::value || std::is_class::value)>
+  static auto map(U*) -> remove_cvref_t()))>;
+  static auto map(...) -> void;
+
+  using type = decltype(map(static_cast(nullptr)));
+};
+template  using format_as_t = typename format_as_result::type;
+
+template 
+struct has_format_as
+    : bool_constant, void>::value> {};
+
+#define FMT_MAP_API FMT_CONSTEXPR FMT_ALWAYS_INLINE
+
+// Maps formatting arguments to core types.
+// arg_mapper reports errors by returning unformattable instead of using
+// static_assert because it's used in the is_formattable trait.
+template  struct arg_mapper {
+  using char_type = typename Context::char_type;
+
+  FMT_MAP_API auto map(signed char val) -> int { return val; }
+  FMT_MAP_API auto map(unsigned char val) -> unsigned { return val; }
+  FMT_MAP_API auto map(short val) -> int { return val; }
+  FMT_MAP_API auto map(unsigned short val) -> unsigned { return val; }
+  FMT_MAP_API auto map(int val) -> int { return val; }
+  FMT_MAP_API auto map(unsigned val) -> unsigned { return val; }
+  FMT_MAP_API auto map(long val) -> long_type { return val; }
+  FMT_MAP_API auto map(unsigned long val) -> ulong_type { return val; }
+  FMT_MAP_API auto map(long long val) -> long long { return val; }
+  FMT_MAP_API auto map(unsigned long long val) -> unsigned long long {
+    return val;
+  }
+  FMT_MAP_API auto map(int128_opt val) -> int128_opt { return val; }
+  FMT_MAP_API auto map(uint128_opt val) -> uint128_opt { return val; }
+  FMT_MAP_API auto map(bool val) -> bool { return val; }
+
+  template ::value ||
+                                      std::is_same::value)>
+  FMT_MAP_API auto map(T val) -> char_type {
+    return val;
+  }
+  template ::value ||
+#ifdef __cpp_char8_t
+                                     std::is_same::value ||
+#endif
+                                     std::is_same::value ||
+                                     std::is_same::value) &&
+                                        !std::is_same::value,
+                                    int> = 0>
+  FMT_MAP_API auto map(T) -> unformattable_char {
+    return {};
+  }
+
+  FMT_MAP_API auto map(float val) -> float { return val; }
+  FMT_MAP_API auto map(double val) -> double { return val; }
+  FMT_MAP_API auto map(long double val) -> long double { return val; }
+
+  FMT_MAP_API auto map(char_type* val) -> const char_type* { return val; }
+  FMT_MAP_API auto map(const char_type* val) -> const char_type* { return val; }
+  template ,
+            FMT_ENABLE_IF(std::is_same::value &&
+                          !std::is_pointer::value)>
+  FMT_MAP_API auto map(const T& val) -> basic_string_view {
+    return to_string_view(val);
+  }
+  template ,
+            FMT_ENABLE_IF(!std::is_same::value &&
+                          !std::is_pointer::value)>
+  FMT_MAP_API auto map(const T&) -> unformattable_char {
+    return {};
+  }
+
+  FMT_MAP_API auto map(void* val) -> const void* { return val; }
+  FMT_MAP_API auto map(const void* val) -> const void* { return val; }
+  FMT_MAP_API auto map(volatile void* val) -> const void* {
+    return const_cast(val);
+  }
+  FMT_MAP_API auto map(const volatile void* val) -> const void* {
+    return const_cast(val);
+  }
+  FMT_MAP_API auto map(std::nullptr_t val) -> const void* { return val; }
+
+  // Use SFINAE instead of a const T* parameter to avoid a conflict with the
+  // array overload.
+  template <
+      typename T,
+      FMT_ENABLE_IF(
+          std::is_pointer::value || std::is_member_pointer::value ||
+          std::is_function::type>::value ||
+          (std::is_array::value &&
+           !std::is_convertible::value))>
+  FMT_CONSTEXPR auto map(const T&) -> unformattable_pointer {
+    return {};
+  }
+
+  template ::value)>
+  FMT_MAP_API auto map(const T (&values)[N]) -> const T (&)[N] {
+    return values;
+  }
+
+  // Only map owning types because mapping views can be unsafe.
+  template ,
+            FMT_ENABLE_IF(std::is_arithmetic::value)>
+  FMT_MAP_API auto map(const T& val) -> decltype(FMT_DECLTYPE_THIS map(U())) {
+    return map(format_as(val));
+  }
+
+  template >
+  struct formattable : bool_constant() ||
+                                     (has_formatter::value &&
+                                      !std::is_const::value)> {};
+
+  template ::value)>
+  FMT_MAP_API auto do_map(T& val) -> T& {
+    return val;
+  }
+  template ::value)>
+  FMT_MAP_API auto do_map(T&) -> unformattable {
+    return {};
+  }
+
+  // is_fundamental is used to allow formatters for extended FP types.
+  template ,
+            FMT_ENABLE_IF(
+                (std::is_class::value || std::is_enum::value ||
+                 std::is_union::value || std::is_fundamental::value) &&
+                !has_to_string_view::value && !is_char::value &&
+                !is_named_arg::value && !std::is_integral::value &&
+                !std::is_arithmetic>::value)>
+  FMT_MAP_API auto map(T& val) -> decltype(FMT_DECLTYPE_THIS do_map(val)) {
+    return do_map(val);
+  }
+
+  template ::value)>
+  FMT_MAP_API auto map(const T& named_arg)
+      -> decltype(FMT_DECLTYPE_THIS map(named_arg.value)) {
+    return map(named_arg.value);
+  }
+
+  auto map(...) -> unformattable { return {}; }
+};
+
+// A type constant after applying arg_mapper.
+template 
+using mapped_type_constant =
+    type_constant().map(std::declval())),
+                  typename Context::char_type>;
+
+enum { packed_arg_bits = 4 };
+// Maximum number of arguments with packed types.
+enum { max_packed_args = 62 / packed_arg_bits };
+enum : unsigned long long { is_unpacked_bit = 1ULL << 63 };
+enum : unsigned long long { has_named_args_bit = 1ULL << 62 };
+
+template 
+struct is_output_iterator : std::false_type {};
+
+template <> struct is_output_iterator<appender, char> : std::true_type {};
+
+template 
+struct is_output_iterator<
+    It, T, void_t()++ = std::declval())>>
+    : std::true_type {};
+
+// A type-erased reference to an std::locale to avoid a heavy  include.
+class locale_ref {
+ private:
+  const void* locale_;  // A type-erased pointer to std::locale.
+
+ public:
+  constexpr locale_ref() : locale_(nullptr) {}
+  template  explicit locale_ref(const Locale& loc);
+
+  explicit operator bool() const noexcept { return locale_ != nullptr; }
+
+  template  auto get() const -> Locale;
+};
+
+template  constexpr auto encode_types() -> unsigned long long {
+  return 0;
+}
+
+template 
+constexpr auto encode_types() -> unsigned long long {
+  return static_cast(mapped_type_constant::value) |
+         (encode_types() << packed_arg_bits);
+}
+
+template 
+constexpr unsigned long long make_descriptor() {
+  return NUM_ARGS <= max_packed_args ? encode_types()
+                                     : is_unpacked_bit | NUM_ARGS;
+}
+
+// This type is intentionally undefined, only used for errors.
+template 
+#if FMT_CLANG_VERSION && FMT_CLANG_VERSION <= 1500
+// https://github.com/fmtlib/fmt/issues/3796
+struct type_is_unformattable_for {
+};
+#else
+struct type_is_unformattable_for;
+#endif
+
+template 
+FMT_CONSTEXPR auto make_arg(T& val) -> value {
+  using arg_type = remove_cvref_t().map(val))>;
+
+  // Use enum instead of constexpr because the latter may generate code.
+  enum {
+    formattable_char = !std::is_same::value
+  };
+  static_assert(formattable_char, "Mixing character types is disallowed.");
+
+  // Formatting of arbitrary pointers is disallowed. If you want to format a
+  // pointer cast it to `void*` or `const void*`. In particular, this forbids
+  // formatting of `[const] volatile char*` printed as bool by iostreams.
+  enum {
+    formattable_pointer = !std::is_same::value
+  };
+  static_assert(formattable_pointer,
+                "Formatting of non-void pointers is disallowed.");
+
+  enum { formattable = !std::is_same::value };
+#if defined(__cpp_if_constexpr)
+  if constexpr (!formattable)
+    type_is_unformattable_for _;
+#endif
+  static_assert(
+      formattable,
+      "Cannot format an argument. To make type T formattable provide a "
+      "formatter specialization: https://fmt.dev/latest/api.html#udt");
+  return {arg_mapper().map(val)};
+}
+
+template 
+FMT_CONSTEXPR auto make_arg(T& val) -> basic_format_arg {
+  auto arg = basic_format_arg();
+  arg.type_ = mapped_type_constant::value;
+  arg.value_ = make_arg(val);
+  return arg;
+}
+
+template 
+FMT_CONSTEXPR inline auto make_arg(T& val) -> basic_format_arg {
+  return make_arg(val);
+}
+
+template 
+using arg_t = conditional_t,
+                            basic_format_arg>;
+
+template ::value)>
+void init_named_arg(named_arg_info*, int& arg_index, int&, const T&) {
+  ++arg_index;
+}
+template ::value)>
+void init_named_arg(named_arg_info* named_args, int& arg_index,
+                    int& named_arg_index, const T& arg) {
+  named_args[named_arg_index++] = {arg.name, arg_index++};
+}
+
+// An array of references to arguments. It can be implicitly converted to
+// `fmt::basic_format_args` for passing into type-erased formatting functions
+// such as `fmt::vformat`.
+template 
+struct format_arg_store {
+  // args_[0].named_args points to named_args to avoid bloating format_args.
+  // +1 to workaround a bug in gcc 7.5 that causes duplicated-branches warning.
+  static constexpr size_t ARGS_ARR_SIZE = 1 + (NUM_ARGS != 0 ? NUM_ARGS : +1);
+
+  arg_t args[ARGS_ARR_SIZE];
+  named_arg_info named_args[NUM_NAMED_ARGS];
+
+  template 
+  FMT_MAP_API format_arg_store(T&... values)
+      : args{{named_args, NUM_NAMED_ARGS},
+             make_arg(values)...} {
+    using dummy = int[];
+    int arg_index = 0, named_arg_index = 0;
+    (void)dummy{
+        0,
+        (init_named_arg(named_args, arg_index, named_arg_index, values), 0)...};
+  }
+
+  format_arg_store(format_arg_store&& rhs) {
+    args[0] = {named_args, NUM_NAMED_ARGS};
+    for (size_t i = 1; i < ARGS_ARR_SIZE; ++i) args[i] = rhs.args[i];
+    for (size_t i = 0; i < NUM_NAMED_ARGS; ++i)
+      named_args[i] = rhs.named_args[i];
+  }
+
+  format_arg_store(const format_arg_store& rhs) = delete;
+  format_arg_store& operator=(const format_arg_store& rhs) = delete;
+  format_arg_store& operator=(format_arg_store&& rhs) = delete;
+};
+
+// A specialization of format_arg_store without named arguments.
+// It is a plain struct to reduce binary size in debug mode.
+template 
+struct format_arg_store {
+  // +1 to workaround a bug in gcc 7.5 that causes duplicated-branches warning.
+  arg_t args[NUM_ARGS != 0 ? NUM_ARGS : +1];
+};
+
+}  // namespace detail
+FMT_BEGIN_EXPORT
+
+// A formatting argument. Context is a template parameter for the compiled API
+// where output can be unbuffered.
+template <typename Context> class basic_format_arg {
+ private:
+  detail::value<Context> value_;
+  detail::type type_;
+
+  template 
+  friend FMT_CONSTEXPR auto detail::make_arg(T& value)
+      -> basic_format_arg;
+
+  friend class basic_format_args;
+  friend class dynamic_format_arg_store;
+
+  using char_type = typename Context::char_type;
+
+  template 
+  friend struct detail::format_arg_store;
+
+  basic_format_arg(const detail::named_arg_info* args, size_t size)
+      : value_(args, size) {}
+
+ public:
+  class handle {
+   public:
+    explicit handle(detail::custom_value custom) : custom_(custom) {}
+
+    void format(typename Context::parse_context_type& parse_ctx,
+                Context& ctx) const {
+      custom_.format(custom_.value, parse_ctx, ctx);
+    }
+
+   private:
+    detail::custom_value custom_;
+  };
+
+  constexpr basic_format_arg() : type_(detail::type::none_type) {}
+
+  constexpr explicit operator bool() const noexcept {
+    return type_ != detail::type::none_type;
+  }
+
+  auto type() const -> detail::type { return type_; }
+
+  auto is_integral() const -> bool { return detail::is_integral_type(type_); }
+  auto is_arithmetic() const -> bool {
+    return detail::is_arithmetic_type(type_);
+  }
+
+  /**
+   * Visits an argument dispatching to the appropriate visit method based on
+   * the argument type. For example, if the argument type is `double` then
+   * `vis(value)` will be called with the value of type `double`.
+   */
+  template 
+  FMT_CONSTEXPR FMT_INLINE auto visit(Visitor&& vis) const -> decltype(vis(0)) {
+    switch (type_) {
+    case detail::type::none_type:
+      break;
+    case detail::type::int_type:
+      return vis(value_.int_value);
+    case detail::type::uint_type:
+      return vis(value_.uint_value);
+    case detail::type::long_long_type:
+      return vis(value_.long_long_value);
+    case detail::type::ulong_long_type:
+      return vis(value_.ulong_long_value);
+    case detail::type::int128_type:
+      return vis(detail::convert_for_visit(value_.int128_value));
+    case detail::type::uint128_type:
+      return vis(detail::convert_for_visit(value_.uint128_value));
+    case detail::type::bool_type:
+      return vis(value_.bool_value);
+    case detail::type::char_type:
+      return vis(value_.char_value);
+    case detail::type::float_type:
+      return vis(value_.float_value);
+    case detail::type::double_type:
+      return vis(value_.double_value);
+    case detail::type::long_double_type:
+      return vis(value_.long_double_value);
+    case detail::type::cstring_type:
+      return vis(value_.string.data);
+    case detail::type::string_type:
+      using sv = basic_string_view<typename Context::char_type>;
+      return vis(sv(value_.string.data, value_.string.size));
+    case detail::type::pointer_type:
+      return vis(value_.pointer);
+    case detail::type::custom_type:
+      return vis(typename basic_format_arg::handle(value_.custom));
+    }
+    return vis(monostate());
+  }
+
+  auto format_custom(const char_type* parse_begin,
+                     typename Context::parse_context_type& parse_ctx,
+                     Context& ctx) -> bool {
+    if (type_ != detail::type::custom_type) return false;
+    parse_ctx.advance_to(parse_begin);
+    value_.custom.format(value_.custom.value, parse_ctx, ctx);
+    return true;
+  }
+};
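+// Usage sketch (illustration only): visit() dispatches on the erased type, so
+// a visitor must accept every possible argument type; a C++14 generic lambda
+// works well, e.g.
+//
+//   auto size_of = [](auto value) -> size_t { return sizeof(value); };
+//   size_t n = arg.visit(size_of);  // arg is a fmt::basic_format_arg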
+
+template 
+FMT_DEPRECATED FMT_CONSTEXPR auto visit_format_arg(
+    Visitor&& vis, const basic_format_arg& arg) -> decltype(vis(0)) {
+  return arg.visit(static_cast(vis));
+}
+
+/**
+ * A view of a collection of formatting arguments. To avoid lifetime issues it
+ * should only be used as a parameter type in type-erased functions such as
+ * `vformat`:
+ *
+ *     void vlog(fmt::string_view fmt, fmt::format_args args);  // OK
+ *     fmt::format_args args = fmt::make_format_args();  // Dangling reference
+ */
+template <typename Context> class basic_format_args {
+ public:
+  using size_type = int;
+  using format_arg = basic_format_arg;
+
+ private:
+  // A descriptor that contains information about formatting arguments.
+  // If the number of arguments is less or equal to max_packed_args then
+  // argument types are passed in the descriptor. This reduces binary code size
+  // per formatting function call.
+  unsigned long long desc_;
+  union {
+    // If is_packed() returns true then argument values are stored in values_;
+    // otherwise they are stored in args_. This is done to improve cache
+    // locality and reduce compiled code size since storing larger objects
+    // may require more code (at least on x86-64) even if the same amount of
+    // data is actually copied to stack. It saves ~10% on the bloat test.
+    const detail::value* values_;
+    const format_arg* args_;
+  };
+
+  constexpr auto is_packed() const -> bool {
+    return (desc_ & detail::is_unpacked_bit) == 0;
+  }
+  constexpr auto has_named_args() const -> bool {
+    return (desc_ & detail::has_named_args_bit) != 0;
+  }
+
+  FMT_CONSTEXPR auto type(int index) const -> detail::type {
+    int shift = index * detail::packed_arg_bits;
+    unsigned int mask = (1 << detail::packed_arg_bits) - 1;
+    return static_cast((desc_ >> shift) & mask);
+  }
+
+ public:
+  constexpr basic_format_args() : desc_(0), args_(nullptr) {}
+
+  /// Constructs a `basic_format_args` object from `format_arg_store`.
+  template 
+  constexpr FMT_ALWAYS_INLINE basic_format_args(
+      const detail::format_arg_store&
+          store)
+      : desc_(DESC), values_(store.args + (NUM_NAMED_ARGS != 0 ? 1 : 0)) {}
+
+  template  detail::max_packed_args)>
+  constexpr basic_format_args(
+      const detail::format_arg_store&
+          store)
+      : desc_(DESC), args_(store.args + (NUM_NAMED_ARGS != 0 ? 1 : 0)) {}
+
+  /// Constructs a `basic_format_args` object from `dynamic_format_arg_store`.
+  constexpr basic_format_args(const dynamic_format_arg_store& store)
+      : desc_(store.get_types()), args_(store.data()) {}
+
+  /// Constructs a `basic_format_args` object from a dynamic list of arguments.
+  constexpr basic_format_args(const format_arg* args, int count)
+      : desc_(detail::is_unpacked_bit | detail::to_unsigned(count)),
+        args_(args) {}
+
+  /// Returns the argument with the specified id.
+  FMT_CONSTEXPR auto get(int id) const -> format_arg {
+    format_arg arg;
+    if (!is_packed()) {
+      if (id < max_size()) arg = args_[id];
+      return arg;
+    }
+    if (static_cast(id) >= detail::max_packed_args) return arg;
+    arg.type_ = type(id);
+    if (arg.type_ == detail::type::none_type) return arg;
+    arg.value_ = values_[id];
+    return arg;
+  }
+
+  template <typename Char>
+  auto get(basic_string_view<Char> name) const -> format_arg {
+    int id = get_id(name);
+    return id >= 0 ? get(id) : format_arg();
+  }
+
+  template <typename Char>
+  FMT_CONSTEXPR auto get_id(basic_string_view<Char> name) const -> int {
+    if (!has_named_args()) return -1;
+    const auto& named_args =
+        (is_packed() ? values_[-1] : args_[-1].value_).named_args;
+    for (size_t i = 0; i < named_args.size; ++i) {
+      if (named_args.data[i].name == name) return named_args.data[i].id;
+    }
+    return -1;
+  }
+
+  auto max_size() const -> int {
+    unsigned long long max_packed = detail::max_packed_args;
+    return static_cast<int>(is_packed() ? max_packed
+                                        : desc_ & ~detail::is_unpacked_bit);
+  }
+};
+
+// A formatting context.
+class context {
+ private:
+  appender out_;
+  basic_format_args<context> args_;
+  detail::locale_ref loc_;
+
+ public:
+  /// The character type for the output.
+  using char_type = char;
+
+  using iterator = appender;
+  using format_arg = basic_format_arg<context>;
+  using parse_context_type = basic_format_parse_context<char>;
+  template <typename T> using formatter_type = formatter<T, char>;
+
+  /// Constructs a `basic_format_context` object. References to the arguments
+  /// are stored in the object so make sure they have appropriate lifetimes.
+  FMT_CONSTEXPR context(iterator out, basic_format_args<context> ctx_args,
+                        detail::locale_ref loc = {})
+      : out_(out), args_(ctx_args), loc_(loc) {}
+  context(context&&) = default;
+  context(const context&) = delete;
+  void operator=(const context&) = delete;
+
+  FMT_CONSTEXPR auto arg(int id) const -> format_arg { return args_.get(id); }
+  auto arg(string_view name) -> format_arg { return args_.get(name); }
+  FMT_CONSTEXPR auto arg_id(string_view name) -> int {
+    return args_.get_id(name);
+  }
+  auto args() const -> const basic_format_args<context>& { return args_; }
+
+  // Returns an iterator to the beginning of the output range.
+  FMT_CONSTEXPR auto out() -> iterator { return out_; }
+
+  // Advances the begin iterator to `it`.
+  void advance_to(iterator) {}
+
+  FMT_CONSTEXPR auto locale() -> detail::locale_ref { return loc_; }
+};
+
+template <typename OutputIt, typename Char> class generic_context;
+
+// Longer aliases for C++20 compatibility.
+template <typename OutputIt, typename Char>
+using basic_format_context =
+    conditional_t<std::is_same<Char, char>::value, context,
+                  generic_context<OutputIt, Char>>;
+using format_context = context;
+
+template <typename Char>
+using buffered_context = basic_format_context<basic_appender<Char>, Char>;
+
+template <typename T, typename Char = char>
+using is_formattable = bool_constant<!std::is_base_of<
+    detail::unformattable, decltype(detail::arg_mapper<buffered_context<Char>>()
+                                        .map(std::declval<T&>()))>::value>;
+
+#if FMT_USE_CONCEPTS
+template <typename T, typename Char = char>
+concept formattable = is_formattable<remove_cvref_t<T>, Char>::value;
+#endif
+
+/**
+ * Constructs an object that stores references to arguments and can be
+ * implicitly converted to `format_args`. `Context` can be omitted in which case
+ * it defaults to `format_context`. See `arg` for lifetime considerations.
+ */
+// Take arguments by lvalue references to avoid some lifetime issues, e.g.
+//   auto args = make_format_args(std::string());
+template <typename Context = format_context, typename... T,
+          size_t NUM_ARGS = sizeof...(T),
+          size_t NUM_NAMED_ARGS = detail::count_named_args<T...>(),
+          unsigned long long DESC = detail::make_descriptor<Context, T...>(),
+          FMT_ENABLE_IF(NUM_NAMED_ARGS == 0)>
+constexpr FMT_ALWAYS_INLINE auto make_format_args(T&... args)
+    -> detail::format_arg_store<Context, NUM_ARGS, 0, DESC> {
+  return {{detail::make_arg<NUM_ARGS <= detail::max_packed_args, Context>(
+      args)...}};
+}
+
+#ifndef FMT_DOC
+template <typename Context = format_context, typename... T,
+          size_t NUM_NAMED_ARGS = detail::count_named_args<T...>(),
+          unsigned long long DESC =
+              detail::make_descriptor<Context, T...>() |
+              static_cast<unsigned long long>(detail::has_named_args_bit),
+          FMT_ENABLE_IF(NUM_NAMED_ARGS != 0)>
+constexpr auto make_format_args(T&... args)
+    -> detail::format_arg_store<Context, sizeof...(T), NUM_NAMED_ARGS, DESC> {
+  return {args...};
+}
+#endif
+
+/**
+ * Returns a named argument to be used in a formatting function.
+ * It should only be used in a call to a formatting function or
+ * `dynamic_format_arg_store::push_back`.
+ *
+ * **Example**:
+ *
+ *     fmt::print("The answer is {answer}.", fmt::arg("answer", 42));
+ */
+template <typename Char, typename T>
+inline auto arg(const Char* name, const T& arg) -> detail::named_arg<Char, T> {
+  static_assert(!detail::is_named_arg<T>(), "nested named arguments");
+  return {name, arg};
+}
+FMT_END_EXPORT
+
+/// An alias for `basic_format_args`.
+// A separate type would result in shorter symbols but break ABI compatibility
+// between clang and gcc on ARM (#1919).
+FMT_EXPORT using format_args = basic_format_args<format_context>;
+
+// We cannot use enum classes as bit fields because of a gcc bug, so we put them
+// in namespaces instead (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414).
+// Additionally, if an underlying type is specified, older gcc incorrectly warns
+// that the type is too small. Both bugs are fixed in gcc 9.3.
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 903
+#  define FMT_ENUM_UNDERLYING_TYPE(type)
+#else
+#  define FMT_ENUM_UNDERLYING_TYPE(type) : type
+#endif
+namespace align {
+enum type FMT_ENUM_UNDERLYING_TYPE(unsigned char){none, left, right, center,
+                                                  numeric};
+}
+using align_t = align::type;
+namespace sign {
+enum type FMT_ENUM_UNDERLYING_TYPE(unsigned char){none, minus, plus, space};
+}
+using sign_t = sign::type;
+
+namespace detail {
+
+template <typename Char>
+using unsigned_char = typename conditional_t<std::is_integral<Char>::value,
+                                             std::make_unsigned<Char>,
+                                             type_identity<unsigned>>::type;
+
+// Character (code unit) type is erased to prevent template bloat.
+struct fill_t {
+ private:
+  enum { max_size = 4 };
+  char data_[max_size] = {' '};
+  unsigned char size_ = 1;
+
+ public:
+  template <typename Char>
+  FMT_CONSTEXPR void operator=(basic_string_view<Char> s) {
+    auto size = s.size();
+    size_ = static_cast<unsigned char>(size);
+    if (size == 1) {
+      unsigned uchar = static_cast<detail::unsigned_char<Char>>(s[0]);
+      data_[0] = static_cast<char>(uchar);
+      data_[1] = static_cast<char>(uchar >> 8);
+      return;
+    }
+    FMT_ASSERT(size <= max_size, "invalid fill");
+    for (size_t i = 0; i < size; ++i) data_[i] = static_cast<char>(s[i]);
+  }
+
+  FMT_CONSTEXPR void operator=(char c) {
+    data_[0] = c;
+    size_ = 1;
+  }
+
+  constexpr auto size() const -> size_t { return size_; }
+
+  template <typename Char> constexpr auto get() const -> Char {
+    using uchar = unsigned char;
+    return static_cast<Char>(static_cast<uchar>(data_[0]) |
+                             (static_cast<uchar>(data_[1]) << 8));
+  }
+
+  template <typename Char, FMT_ENABLE_IF(std::is_same<Char, char>::value)>
+  constexpr auto data() const -> const Char* {
+    return data_;
+  }
+  template <typename Char, FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
+  constexpr auto data() const -> const Char* {
+    return nullptr;
+  }
+};
+}  // namespace detail
+
+enum class presentation_type : unsigned char {
+  // Common specifiers:
+  none = 0,
+  debug = 1,   // '?'
+  string = 2,  // 's' (string, bool)
+
+  // Integral, bool and character specifiers:
+  dec = 3,  // 'd'
+  hex,      // 'x' or 'X'
+  oct,      // 'o'
+  bin,      // 'b' or 'B'
+  chr,      // 'c'
+
+  // String and pointer specifiers:
+  pointer = 3,  // 'p'
+
+  // Floating-point specifiers:
+  exp = 1,  // 'e' or 'E' (1 since there is no FP debug presentation)
+  fixed,    // 'f' or 'F'
+  general,  // 'g' or 'G'
+  hexfloat  // 'a' or 'A'
+};
+
+// Format specifiers for built-in and string types.
+struct format_specs {
+  int width;
+  int precision;
+  presentation_type type;
+  align_t align : 4;
+  sign_t sign : 3;
+  bool upper : 1;  // An uppercase version e.g. 'X' for 'x'.
+  bool alt : 1;    // Alternate form ('#').
+  bool localized : 1;
+  detail::fill_t fill;
+
+  constexpr format_specs()
+      : width(0),
+        precision(-1),
+        type(presentation_type::none),
+        align(align::none),
+        sign(sign::none),
+        upper(false),
+        alt(false),
+        localized(false) {}
+};
+
+namespace detail {
+
+enum class arg_id_kind { none, index, name };
+
+// An argument reference.
+template <typename Char> struct arg_ref {
+  FMT_CONSTEXPR arg_ref() : kind(arg_id_kind::none), val() {}
+
+  FMT_CONSTEXPR explicit arg_ref(int index)
+      : kind(arg_id_kind::index), val(index) {}
+  FMT_CONSTEXPR explicit arg_ref(basic_string_view<Char> name)
+      : kind(arg_id_kind::name), val(name) {}
+
+  FMT_CONSTEXPR auto operator=(int idx) -> arg_ref& {
+    kind = arg_id_kind::index;
+    val.index = idx;
+    return *this;
+  }
+
+  arg_id_kind kind;
+  union value {
+    FMT_CONSTEXPR value(int idx = 0) : index(idx) {}
+    FMT_CONSTEXPR value(basic_string_view<Char> n) : name(n) {}
+
+    int index;
+    basic_string_view<Char> name;
+  } val;
+};
+
+// Format specifiers with width and precision resolved at formatting rather
+// than parsing time to allow reusing the same parsed specifiers with
+// different sets of arguments (precompilation of format strings).
+template <typename Char = char> struct dynamic_format_specs : format_specs {
+  arg_ref<Char> width_ref;
+  arg_ref<Char> precision_ref;
+};
+
+// Converts a character to ASCII. Returns '\0' on conversion failure.
+template <typename Char, FMT_ENABLE_IF(std::is_integral<Char>::value)>
+constexpr auto to_ascii(Char c) -> char {
+  return c <= 0xff ? static_cast<char>(c) : '\0';
+}
+
+// Returns the number of code units in a code point or 1 on error.
+template <typename Char>
+FMT_CONSTEXPR auto code_point_length(const Char* begin) -> int {
+  if (const_check(sizeof(Char) != 1)) return 1;
+  auto c = static_cast<unsigned char>(*begin);
+  return static_cast<int>((0x3a55000000000000ull >> (2 * (c >> 3))) & 0x3) + 1;
+}
+
+// Return the result via the out param to workaround gcc bug 77539.
+template <bool IS_CONSTEXPR, typename T, typename Ptr = const T*>
+FMT_CONSTEXPR auto find(Ptr first, Ptr last, T value, Ptr& out) -> bool {
+  for (out = first; out != last; ++out) {
+    if (*out == value) return true;
+  }
+  return false;
+}
+
+template <>
+inline auto find<false, char>(const char* first, const char* last, char value,
+                              const char*& out) -> bool {
+  out =
+      static_cast<const char*>(memchr(first, value, to_unsigned(last - first)));
+  return out != nullptr;
+}
+
+// Parses the range [begin, end) as an unsigned integer. This function assumes
+// that the range is non-empty and the first character is a digit.
+template <typename Char>
+FMT_CONSTEXPR auto parse_nonnegative_int(const Char*& begin, const Char* end,
+                                         int error_value) noexcept -> int {
+  FMT_ASSERT(begin != end && '0' <= *begin && *begin <= '9', "");
+  unsigned value = 0, prev = 0;
+  auto p = begin;
+  do {
+    prev = value;
+    value = value * 10 + unsigned(*p - '0');
+    ++p;
+  } while (p != end && '0' <= *p && *p <= '9');
+  auto num_digits = p - begin;
+  begin = p;
+  int digits10 = static_cast<int>(sizeof(int) * CHAR_BIT * 3 / 10);
+  if (num_digits <= digits10) return static_cast<int>(value);
+  // Check for overflow.
+  unsigned max = INT_MAX;
+  return num_digits == digits10 + 1 &&
+                 prev * 10ull + unsigned(p[-1] - '0') <= max
+             ? static_cast<int>(value)
+             : error_value;
+}
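+
+// Note on the overflow check in parse_nonnegative_int above: with a 32-bit
+// int, digits10 is 32 * 3 / 10 = 9, so any 9-digit value is returned directly,
+// a 10-digit value such as "2147483647" is accepted only if
+// prev * 10 + last digit still fits in INT_MAX, and longer input yields
+// error_value.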
+
+FMT_CONSTEXPR inline auto parse_align(char c) -> align_t {
+  switch (c) {
+  case '<':
+    return align::left;
+  case '>':
+    return align::right;
+  case '^':
+    return align::center;
+  }
+  return align::none;
+}
+
+template <typename Char> constexpr auto is_name_start(Char c) -> bool {
+  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_';
+}
+
+template <typename Char, typename Handler>
+FMT_CONSTEXPR auto do_parse_arg_id(const Char* begin, const Char* end,
+                                   Handler&& handler) -> const Char* {
+  Char c = *begin;
+  if (c >= '0' && c <= '9') {
+    int index = 0;
+    if (c != '0')
+      index = parse_nonnegative_int(begin, end, INT_MAX);
+    else
+      ++begin;
+    if (begin == end || (*begin != '}' && *begin != ':'))
+      report_error("invalid format string");
+    else
+      handler.on_index(index);
+    return begin;
+  }
+  if (!is_name_start(c)) {
+    report_error("invalid format string");
+    return begin;
+  }
+  auto it = begin;
+  do {
+    ++it;
+  } while (it != end && (is_name_start(*it) || ('0' <= *it && *it <= '9')));
+  handler.on_name({begin, to_unsigned(it - begin)});
+  return it;
+}
+
+template <typename Char, typename Handler>
+FMT_CONSTEXPR auto parse_arg_id(const Char* begin, const Char* end,
+                                Handler&& handler) -> const Char* {
+  FMT_ASSERT(begin != end, "");
+  Char c = *begin;
+  if (c != '}' && c != ':') return do_parse_arg_id(begin, end, handler);
+  handler.on_auto();
+  return begin;
+}
+
+template <typename Char> struct dynamic_spec_id_handler {
+  basic_format_parse_context<Char>& ctx;
+  arg_ref<Char>& ref;
+
+  FMT_CONSTEXPR void on_auto() {
+    int id = ctx.next_arg_id();
+    ref = arg_ref<Char>(id);
+    ctx.check_dynamic_spec(id);
+  }
+  FMT_CONSTEXPR void on_index(int id) {
+    ref = arg_ref<Char>(id);
+    ctx.check_arg_id(id);
+    ctx.check_dynamic_spec(id);
+  }
+  FMT_CONSTEXPR void on_name(basic_string_view<Char> id) {
+    ref = arg_ref<Char>(id);
+    ctx.check_arg_id(id);
+  }
+};
+
+// Parses [integer | "{" [arg_id] "}"].
+template <typename Char>
+FMT_CONSTEXPR auto parse_dynamic_spec(const Char* begin, const Char* end,
+                                      int& value, arg_ref<Char>& ref,
+                                      basic_format_parse_context<Char>& ctx)
+    -> const Char* {
+  FMT_ASSERT(begin != end, "");
+  if ('0' <= *begin && *begin <= '9') {
+    int val = parse_nonnegative_int(begin, end, -1);
+    if (val != -1)
+      value = val;
+    else
+      report_error("number is too big");
+  } else if (*begin == '{') {
+    ++begin;
+    auto handler = dynamic_spec_id_handler<Char>{ctx, ref};
+    if (begin != end) begin = parse_arg_id(begin, end, handler);
+    if (begin != end && *begin == '}') return ++begin;
+    report_error("invalid format string");
+  }
+  return begin;
+}
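+
+// For example, in "{:{}.{}f}" both the width and the precision are parsed by
+// parse_dynamic_spec above as automatically numbered argument references.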
+
+template <typename Char>
+FMT_CONSTEXPR auto parse_precision(const Char* begin, const Char* end,
+                                   int& value, arg_ref<Char>& ref,
+                                   basic_format_parse_context<Char>& ctx)
+    -> const Char* {
+  ++begin;
+  if (begin == end || *begin == '}') {
+    report_error("invalid precision");
+    return begin;
+  }
+  return parse_dynamic_spec(begin, end, value, ref, ctx);
+}
+
+enum class state { start, align, sign, hash, zero, width, precision, locale };
+
+// Parses standard format specifiers.
+template <typename Char>
+FMT_CONSTEXPR auto parse_format_specs(const Char* begin, const Char* end,
+                                      dynamic_format_specs<Char>& specs,
+                                      basic_format_parse_context<Char>& ctx,
+                                      type arg_type) -> const Char* {
+  auto c = '\0';
+  if (end - begin > 1) {
+    auto next = to_ascii(begin[1]);
+    c = parse_align(next) == align::none ? to_ascii(*begin) : '\0';
+  } else {
+    if (begin == end) return begin;
+    c = to_ascii(*begin);
+  }
+
+  struct {
+    state current_state = state::start;
+    FMT_CONSTEXPR void operator()(state s, bool valid = true) {
+      if (current_state >= s || !valid)
+        report_error("invalid format specifier");
+      current_state = s;
+    }
+  } enter_state;
+
+  using pres = presentation_type;
+  constexpr auto integral_set = sint_set | uint_set | bool_set | char_set;
+  struct {
+    const Char*& begin;
+    dynamic_format_specs<Char>& specs;
+    type arg_type;
+
+    FMT_CONSTEXPR auto operator()(pres pres_type, int set) -> const Char* {
+      if (!in(arg_type, set)) {
+        if (arg_type == type::none_type) return begin;
+        report_error("invalid format specifier");
+      }
+      specs.type = pres_type;
+      return begin + 1;
+    }
+  } parse_presentation_type{begin, specs, arg_type};
+
+  for (;;) {
+    switch (c) {
+    case '<':
+    case '>':
+    case '^':
+      enter_state(state::align);
+      specs.align = parse_align(c);
+      ++begin;
+      break;
+    case '+':
+    case '-':
+    case ' ':
+      if (arg_type == type::none_type) return begin;
+      enter_state(state::sign, in(arg_type, sint_set | float_set));
+      switch (c) {
+      case '+':
+        specs.sign = sign::plus;
+        break;
+      case '-':
+        specs.sign = sign::minus;
+        break;
+      case ' ':
+        specs.sign = sign::space;
+        break;
+      }
+      ++begin;
+      break;
+    case '#':
+      if (arg_type == type::none_type) return begin;
+      enter_state(state::hash, is_arithmetic_type(arg_type));
+      specs.alt = true;
+      ++begin;
+      break;
+    case '0':
+      enter_state(state::zero);
+      if (!is_arithmetic_type(arg_type)) {
+        if (arg_type == type::none_type) return begin;
+        report_error("format specifier requires numeric argument");
+      }
+      if (specs.align == align::none) {
+        // Ignore 0 if align is specified for compatibility with std::format.
+        specs.align = align::numeric;
+        specs.fill = '0';
+      }
+      ++begin;
+      break;
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+    case '9':
+    case '{':
+      enter_state(state::width);
+      begin = parse_dynamic_spec(begin, end, specs.width, specs.width_ref, ctx);
+      break;
+    case '.':
+      if (arg_type == type::none_type) return begin;
+      enter_state(state::precision,
+                  in(arg_type, float_set | string_set | cstring_set));
+      begin = parse_precision(begin, end, specs.precision, specs.precision_ref,
+                              ctx);
+      break;
+    case 'L':
+      if (arg_type == type::none_type) return begin;
+      enter_state(state::locale, is_arithmetic_type(arg_type));
+      specs.localized = true;
+      ++begin;
+      break;
+    case 'd':
+      return parse_presentation_type(pres::dec, integral_set);
+    case 'X':
+      specs.upper = true;
+      FMT_FALLTHROUGH;
+    case 'x':
+      return parse_presentation_type(pres::hex, integral_set);
+    case 'o':
+      return parse_presentation_type(pres::oct, integral_set);
+    case 'B':
+      specs.upper = true;
+      FMT_FALLTHROUGH;
+    case 'b':
+      return parse_presentation_type(pres::bin, integral_set);
+    case 'E':
+      specs.upper = true;
+      FMT_FALLTHROUGH;
+    case 'e':
+      return parse_presentation_type(pres::exp, float_set);
+    case 'F':
+      specs.upper = true;
+      FMT_FALLTHROUGH;
+    case 'f':
+      return parse_presentation_type(pres::fixed, float_set);
+    case 'G':
+      specs.upper = true;
+      FMT_FALLTHROUGH;
+    case 'g':
+      return parse_presentation_type(pres::general, float_set);
+    case 'A':
+      specs.upper = true;
+      FMT_FALLTHROUGH;
+    case 'a':
+      return parse_presentation_type(pres::hexfloat, float_set);
+    case 'c':
+      if (arg_type == type::bool_type) report_error("invalid format specifier");
+      return parse_presentation_type(pres::chr, integral_set);
+    case 's':
+      return parse_presentation_type(pres::string,
+                                     bool_set | string_set | cstring_set);
+    case 'p':
+      return parse_presentation_type(pres::pointer, pointer_set | cstring_set);
+    case '?':
+      return parse_presentation_type(pres::debug,
+                                     char_set | string_set | cstring_set);
+    case '}':
+      return begin;
+    default: {
+      if (*begin == '}') return begin;
+      // Parse fill and alignment.
+      auto fill_end = begin + code_point_length(begin);
+      if (end - fill_end <= 0) {
+        report_error("invalid format specifier");
+        return begin;
+      }
+      if (*begin == '{') {
+        report_error("invalid fill character '{'");
+        return begin;
+      }
+      auto align = parse_align(to_ascii(*fill_end));
+      enter_state(state::align, align != align::none);
+      specs.fill =
+          basic_string_view<Char>(begin, to_unsigned(fill_end - begin));
+      specs.align = align;
+      begin = fill_end + 1;
+    }
+    }
+    if (begin == end) return begin;
+    c = to_ascii(*begin);
+  }
+}
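+
+// For example, parse_format_specs above maps "{:>+10.3f}" for a double
+// argument to align::right, sign::plus, width = 10, precision = 3 and
+// presentation_type::fixed.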
+
+template <typename Char, typename Handler>
+FMT_CONSTEXPR auto parse_replacement_field(const Char* begin, const Char* end,
+                                           Handler&& handler) -> const Char* {
+  struct id_adapter {
+    Handler& handler;
+    int arg_id;
+
+    FMT_CONSTEXPR void on_auto() { arg_id = handler.on_arg_id(); }
+    FMT_CONSTEXPR void on_index(int id) { arg_id = handler.on_arg_id(id); }
+    FMT_CONSTEXPR void on_name(basic_string_view<Char> id) {
+      arg_id = handler.on_arg_id(id);
+    }
+  };
+
+  ++begin;
+  if (begin == end) return handler.on_error("invalid format string"), end;
+  if (*begin == '}') {
+    handler.on_replacement_field(handler.on_arg_id(), begin);
+  } else if (*begin == '{') {
+    handler.on_text(begin, begin + 1);
+  } else {
+    auto adapter = id_adapter{handler, 0};
+    begin = parse_arg_id(begin, end, adapter);
+    Char c = begin != end ? *begin : Char();
+    if (c == '}') {
+      handler.on_replacement_field(adapter.arg_id, begin);
+    } else if (c == ':') {
+      begin = handler.on_format_specs(adapter.arg_id, begin + 1, end);
+      if (begin == end || *begin != '}')
+        return handler.on_error("unknown format specifier"), end;
+    } else {
+      return handler.on_error("missing '}' in format string"), end;
+    }
+  }
+  return begin + 1;
+}
+
+template <bool IS_CONSTEXPR, typename Char, typename Handler>
+FMT_CONSTEXPR void parse_format_string(basic_string_view<Char> format_str,
+                                       Handler&& handler) {
+  auto begin = format_str.data();
+  auto end = begin + format_str.size();
+  if (end - begin < 32) {
+    // Use a simple loop instead of memchr for small strings.
+    const Char* p = begin;
+    while (p != end) {
+      auto c = *p++;
+      if (c == '{') {
+        handler.on_text(begin, p - 1);
+        begin = p = parse_replacement_field(p - 1, end, handler);
+      } else if (c == '}') {
+        if (p == end || *p != '}')
+          return handler.on_error("unmatched '}' in format string");
+        handler.on_text(begin, p);
+        begin = ++p;
+      }
+    }
+    handler.on_text(begin, end);
+    return;
+  }
+  struct writer {
+    FMT_CONSTEXPR void operator()(const Char* from, const Char* to) {
+      if (from == to) return;
+      for (;;) {
+        const Char* p = nullptr;
+        if (!find(from, to, Char('}'), p))
+          return handler_.on_text(from, to);
+        ++p;
+        if (p == to || *p != '}')
+          return handler_.on_error("unmatched '}' in format string");
+        handler_.on_text(from, p);
+        from = p + 1;
+      }
+    }
+    Handler& handler_;
+  } write = {handler};
+  while (begin != end) {
+    // Doing two passes with memchr (one for '{' and another for '}') is up to
+    // 2.5x faster than the naive one-pass implementation on big format strings.
+    const Char* p = begin;
+    if (*begin != '{' && !find(begin + 1, end, Char('{'), p))
+      return write(begin, end);
+    write(begin, p);
+    begin = parse_replacement_field(p, end, handler);
+  }
+}
+
+template <typename T, bool = is_named_arg<T>::value> struct strip_named_arg {
+  using type = T;
+};
+template <typename T> struct strip_named_arg<T, true> {
+  using type = remove_cvref_t<decltype(T::value)>;
+};
+
+template <typename T, typename ParseContext>
+FMT_VISIBILITY("hidden")  // Suppress an ld warning on macOS (#3769).
+FMT_CONSTEXPR auto parse_format_specs(ParseContext& ctx)
+    -> decltype(ctx.begin()) {
+  using char_type = typename ParseContext::char_type;
+  using context = buffered_context<char_type>;
+  using mapped_type = conditional_t<
+      mapped_type_constant<T, context>::value != type::custom_type,
+      decltype(arg_mapper<context>().map(std::declval<const T&>())),
+      typename strip_named_arg<T>::type>;
+#if defined(__cpp_if_constexpr)
+  if constexpr (std::is_default_constructible<
+                    formatter<mapped_type, char_type>>::value) {
+    return formatter<mapped_type, char_type>().parse(ctx);
+  } else {
+    type_is_unformattable_for<T, char_type> _;
+    return ctx.begin();
+  }
+#else
+  return formatter<mapped_type, char_type>().parse(ctx);
+#endif
+}
+
+// Checks char specs and returns true iff the presentation type is char-like.
+FMT_CONSTEXPR inline auto check_char_specs(const format_specs& specs) -> bool {
+  if (specs.type != presentation_type::none &&
+      specs.type != presentation_type::chr &&
+      specs.type != presentation_type::debug) {
+    return false;
+  }
+  if (specs.align == align::numeric || specs.sign != sign::none || specs.alt)
+    report_error("invalid format specifier for char");
+  return true;
+}
+
+#if FMT_USE_NONTYPE_TEMPLATE_ARGS
+template <int N, typename T, typename... Args, typename Char>
+constexpr auto get_arg_index_by_name(basic_string_view<Char> name) -> int {
+  if constexpr (is_statically_named_arg<T>()) {
+    if (name == T::name) return N;
+  }
+  if constexpr (sizeof...(Args) > 0)
+    return get_arg_index_by_name<N + 1, Args...>(name);
+  (void)name;  // Workaround an MSVC bug about "unused" parameter.
+  return -1;
+}
+#endif
+
+template <typename... Args, typename Char>
+FMT_CONSTEXPR auto get_arg_index_by_name(basic_string_view<Char> name) -> int {
+#if FMT_USE_NONTYPE_TEMPLATE_ARGS
+  if constexpr (sizeof...(Args) > 0)
+    return get_arg_index_by_name<0, Args...>(name);
+#endif
+  (void)name;
+  return -1;
+}
+
+template <typename Char, typename... Args> class format_string_checker {
+ private:
+  using parse_context_type = compile_parse_context<Char>;
+  static constexpr int num_args = sizeof...(Args);
+
+  // Format specifier parsing function.
+  // In the future basic_format_parse_context will replace compile_parse_context
+  // here and will use is_constant_evaluated and downcasting to access the data
+  // needed for compile-time checks: https://godbolt.org/z/GvWzcTjh1.
+  using parse_func = const Char* (*)(parse_context_type&);
+
+  type types_[num_args > 0 ? static_cast<size_t>(num_args) : 1];
+  parse_context_type context_;
+  parse_func parse_funcs_[num_args > 0 ? static_cast<size_t>(num_args) : 1];
+
+ public:
+  explicit FMT_CONSTEXPR format_string_checker(basic_string_view<Char> fmt)
+      : types_{mapped_type_constant<Args, buffered_context<Char>>::value...},
+        context_(fmt, num_args, types_),
+        parse_funcs_{&parse_format_specs<Args, parse_context_type>...} {}
+
+  FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
+
+  FMT_CONSTEXPR auto on_arg_id() -> int { return context_.next_arg_id(); }
+  FMT_CONSTEXPR auto on_arg_id(int id) -> int {
+    return context_.check_arg_id(id), id;
+  }
+  FMT_CONSTEXPR auto on_arg_id(basic_string_view<Char> id) -> int {
+#if FMT_USE_NONTYPE_TEMPLATE_ARGS
+    auto index = get_arg_index_by_name<Args...>(id);
+    if (index < 0) on_error("named argument is not found");
+    return index;
+#else
+    (void)id;
+    on_error("compile-time checks for named arguments require C++20 support");
+    return 0;
+#endif
+  }
+
+  FMT_CONSTEXPR void on_replacement_field(int id, const Char* begin) {
+    on_format_specs(id, begin, begin);  // Call parse() on empty specs.
+  }
+
+  FMT_CONSTEXPR auto on_format_specs(int id, const Char* begin, const Char*)
+      -> const Char* {
+    context_.advance_to(begin);
+    // id >= 0 check is a workaround for gcc 10 bug (#2065).
+    return id >= 0 && id < num_args ? parse_funcs_[id](context_) : begin;
+  }
+
+  FMT_NORETURN FMT_CONSTEXPR void on_error(const char* message) {
+    report_error(message);
+  }
+};
+
+// A base class for compile-time strings.
+struct compile_string {};
+
+template <typename S>
+using is_compile_string = std::is_base_of<compile_string, S>;
+
+// Reports a compile-time error if S is not a valid format string.
+template <typename..., typename S, FMT_ENABLE_IF(!is_compile_string<S>::value)>
+FMT_ALWAYS_INLINE void check_format_string(const S&) {
+#ifdef FMT_ENFORCE_COMPILE_STRING
+  static_assert(is_compile_string<S>::value,
+                "FMT_ENFORCE_COMPILE_STRING requires all format strings to use "
+                "FMT_STRING.");
+#endif
+}
+template <typename... Args, typename S,
+          FMT_ENABLE_IF(is_compile_string<S>::value)>
+void check_format_string(S format_str) {
+  using char_t = typename S::char_type;
+  FMT_CONSTEXPR auto s = basic_string_view<char_t>(format_str);
+  using checker = format_string_checker<char_t, remove_cvref_t<Args>...>;
+  FMT_CONSTEXPR bool error = (parse_format_string(s, checker(s)), true);
+  ignore_unused(error);
+}
+
+// Report truncation to prevent silent data loss.
+inline void report_truncation(bool truncated) {
+  if (truncated) report_error("output is truncated");
+}
+
+// Use vformat_args and avoid type_identity to keep symbols short and workaround
+// a GCC <= 4.8 bug.
+template <typename Char = char> struct vformat_args {
+  using type = basic_format_args<buffered_context<Char>>;
+};
+template <> struct vformat_args<char> {
+  using type = format_args;
+};
+
+template <typename Char>
+void vformat_to(buffer<Char>& buf, basic_string_view<Char> fmt,
+                typename vformat_args<Char>::type args, locale_ref loc = {});
+
+FMT_API void vprint_mojibake(FILE*, string_view, format_args, bool = false);
+#ifndef _WIN32
+inline void vprint_mojibake(FILE*, string_view, format_args, bool) {}
+#endif
+
+template <typename T, typename Char, type TYPE> struct native_formatter {
+ private:
+  dynamic_format_specs<Char> specs_;
+
+ public:
+  using nonlocking = void;
+
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> const Char* {
+    if (ctx.begin() == ctx.end() || *ctx.begin() == '}') return ctx.begin();
+    auto end = parse_format_specs(ctx.begin(), ctx.end(), specs_, ctx, TYPE);
+    if (const_check(TYPE == type::char_type)) check_char_specs(specs_);
+    return end;
+  }
+
+  template 
+  FMT_CONSTEXPR void set_debug_format(bool set = true) {
+    specs_.type = set ? presentation_type::debug : presentation_type::none;
+  }
+
+  template <typename FormatContext>
+  FMT_CONSTEXPR auto format(const T& val, FormatContext& ctx) const
+      -> decltype(ctx.out());
+};
+}  // namespace detail
+
+FMT_BEGIN_EXPORT
+
+// A formatter specialization for natively supported types.
+template <typename T, typename Char>
+struct formatter<T, Char,
+                 enable_if_t<detail::type_constant<T, Char>::value !=
+                             detail::type::custom_type>>
+    : detail::native_formatter<T, Char, detail::type_constant<T, Char>::value> {
+};
+
+template <typename Char = char> struct runtime_format_string {
+  basic_string_view<Char> str;
+};
+
+/// A compile-time format string.
+template <typename Char, typename... Args> class basic_format_string {
+ private:
+  basic_string_view<Char> str_;
+
+ public:
+  template <
+      typename S,
+      FMT_ENABLE_IF(
+          std::is_convertible<const S&, basic_string_view<Char>>::value ||
+          (detail::is_compile_string<S>::value &&
+           std::is_constructible<basic_string_view<Char>, const S&>::value))>
+  FMT_CONSTEVAL FMT_ALWAYS_INLINE basic_format_string(const S& s) : str_(s) {
+    static_assert(
+        detail::count<
+            (std::is_base_of<detail::view, remove_reference_t<Args>>::value &&
+             std::is_reference<Args>::value)...>() == 0,
+        "passing views as lvalues is disallowed");
+#if FMT_USE_CONSTEVAL
+    if constexpr (detail::count_named_args<Args...>() ==
+                  detail::count_statically_named_args<Args...>()) {
+      using checker =
+          detail::format_string_checker<Char, remove_cvref_t<Args>...>;
+      detail::parse_format_string(str_, checker(s));
+    }
+#else
+    detail::check_format_string<Args...>(s);
+#endif
+  }
+  basic_format_string(runtime_format_string<Char> fmt) : str_(fmt.str) {}
+
+  FMT_ALWAYS_INLINE operator basic_string_view<Char>() const { return str_; }
+  auto get() const -> basic_string_view<Char> { return str_; }
+};
+
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+// Workaround broken conversion on older gcc.
+template <typename... T> using format_string = string_view;
+inline auto runtime(string_view s) -> string_view { return s; }
+#else
+template <typename... T>
+using format_string = basic_format_string<char, type_identity_t<T>...>;
+/**
+ * Creates a runtime format string.
+ *
+ * **Example**:
+ *
+ *     // Check format string at runtime instead of compile-time.
+ *     fmt::print(fmt::runtime("{:d}"), "I am not a number");
+ */
+inline auto runtime(string_view s) -> runtime_format_string<> { return {{s}}; }
+#endif
+
+/// Formats a string and writes the output to `out`.
+template <typename OutputIt,
+          FMT_ENABLE_IF(detail::is_output_iterator<remove_cvref_t<OutputIt>,
+                                                   char>::value)>
+auto vformat_to(OutputIt&& out, string_view fmt, format_args args)
+    -> remove_cvref_t<OutputIt> {
+  auto&& buf = detail::get_buffer(out);
+  detail::vformat_to(buf, fmt, args, {});
+  return detail::get_iterator(buf, out);
+}
+
+/**
+ * Formats `args` according to specifications in `fmt`, writes the result to
+ * the output iterator `out` and returns the iterator past the end of the output
+ * range. `format_to` does not append a terminating null character.
+ *
+ * **Example**:
+ *
+ *     auto out = std::vector();
+ *     fmt::format_to(std::back_inserter(out), "{}", 42);
+ */
+template <typename OutputIt, typename... T,
+          FMT_ENABLE_IF(detail::is_output_iterator<remove_cvref_t<OutputIt>,
+                                                   char>::value)>
+FMT_INLINE auto format_to(OutputIt&& out, format_string<T...> fmt, T&&... args)
+    -> remove_cvref_t<OutputIt> {
+  return vformat_to(FMT_FWD(out), fmt, fmt::make_format_args(args...));
+}
+
+template <typename OutputIt> struct format_to_n_result {
+  /// Iterator past the end of the output range.
+  OutputIt out;
+  /// Total (not truncated) output size.
+  size_t size;
+};
+
+template <typename OutputIt,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+auto vformat_to_n(OutputIt out, size_t n, string_view fmt, format_args args)
+    -> format_to_n_result<OutputIt> {
+  using traits = detail::fixed_buffer_traits;
+  auto buf = detail::iterator_buffer<OutputIt, char, traits>(out, n);
+  detail::vformat_to(buf, fmt, args, {});
+  return {buf.out(), buf.count()};
+}
+
+/**
+ * Formats `args` according to specifications in `fmt`, writes up to `n`
+ * characters of the result to the output iterator `out` and returns the total
+ * (not truncated) output size and the iterator past the end of the output
+ * range. `format_to_n` does not append a terminating null character.
+ */
+template <typename OutputIt, typename... T,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+FMT_INLINE auto format_to_n(OutputIt out, size_t n, format_string<T...> fmt,
+                            T&&... args) -> format_to_n_result<OutputIt> {
+  return vformat_to_n(out, n, fmt, fmt::make_format_args(args...));
+}
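+
+// Usage sketch for format_to_n (illustrative):
+//
+//     char buf[8];
+//     auto result = fmt::format_to_n(buf, sizeof(buf), "{}", 12345);
+//     // result.size == 5 and [buf, result.out) contains "12345".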
+
+template <typename OutputIt>
+struct format_to_result {
+  /// Iterator pointing to just after the last successful write in the range.
+  OutputIt out;
+  /// Specifies if the output was truncated.
+  bool truncated;
+
+  FMT_CONSTEXPR operator OutputIt&() & {
+    detail::report_truncation(truncated);
+    return out;
+  }
+  FMT_CONSTEXPR operator const OutputIt&() const& {
+    detail::report_truncation(truncated);
+    return out;
+  }
+  FMT_CONSTEXPR operator OutputIt&&() && {
+    detail::report_truncation(truncated);
+    return static_cast<OutputIt&&>(out);
+  }
+};
+
+template <size_t N>
+auto vformat_to(char (&out)[N], string_view fmt, format_args args)
+    -> format_to_result<char*> {
+  auto result = vformat_to_n(out, N, fmt, args);
+  return {result.out, result.size > N};
+}
+
+template <size_t N, typename... T>
+FMT_INLINE auto format_to(char (&out)[N], format_string<T...> fmt, T&&... args)
+    -> format_to_result<char*> {
+  auto result = fmt::format_to_n(out, N, fmt, static_cast<T&&>(args)...);
+  return {result.out, result.size > N};
+}
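+
+// Usage sketch (illustrative): formatting into a fixed-size char array reports
+// truncation through format_to_result.
+//
+//     char buf[4];
+//     auto result = fmt::format_to(buf, "{}", 123456);
+//     // result.truncated == true and result.out == buf + 4; converting the
+//     // result to an iterator reports the truncation.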
+
+/// Returns the number of chars in the output of `format(fmt, args...)`.
+template <typename... T>
+FMT_NODISCARD FMT_INLINE auto formatted_size(format_string<T...> fmt,
+                                             T&&... args) -> size_t {
+  auto buf = detail::counting_buffer<>();
+  detail::vformat_to(buf, fmt, fmt::make_format_args(args...), {});
+  return buf.count();
+}
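+
+// Usage sketch for formatted_size (illustrative):
+//
+//     auto size = fmt::formatted_size("{}", 42);  // size == 2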
+
+FMT_API void vprint(string_view fmt, format_args args);
+FMT_API void vprint(FILE* f, string_view fmt, format_args args);
+FMT_API void vprint_buffered(FILE* f, string_view fmt, format_args args);
+FMT_API void vprintln(FILE* f, string_view fmt, format_args args);
+
+/**
+ * Formats `args` according to specifications in `fmt` and writes the output
+ * to `stdout`.
+ *
+ * **Example**:
+ *
+ *     fmt::print("The answer is {}.", 42);
+ */
+template <typename... T>
+FMT_INLINE void print(format_string<T...> fmt, T&&... args) {
+  const auto& vargs = fmt::make_format_args(args...);
+  if (!detail::use_utf8()) return detail::vprint_mojibake(stdout, fmt, vargs);
+  return detail::is_locking<T...>() ? vprint_buffered(stdout, fmt, vargs)
+                                    : vprint(fmt, vargs);
+}
+
+/**
+ * Formats `args` according to specifications in `fmt` and writes the
+ * output to the file `f`.
+ *
+ * **Example**:
+ *
+ *     fmt::print(stderr, "Don't {}!", "panic");
+ */
+template <typename... T>
+FMT_INLINE void print(FILE* f, format_string<T...> fmt, T&&... args) {
+  const auto& vargs = fmt::make_format_args(args...);
+  if (!detail::use_utf8()) return detail::vprint_mojibake(f, fmt, vargs);
+  return detail::is_locking<T...>() ? vprint_buffered(f, fmt, vargs)
+                                    : vprint(f, fmt, vargs);
+}
+
+/// Formats `args` according to specifications in `fmt` and writes the output
+/// to the file `f` followed by a newline.
+template <typename... T>
+FMT_INLINE void println(FILE* f, format_string<T...> fmt, T&&... args) {
+  const auto& vargs = fmt::make_format_args(args...);
+  return detail::use_utf8() ? vprintln(f, fmt, vargs)
+                            : detail::vprint_mojibake(f, fmt, vargs, true);
+}
+
+/// Formats `args` according to specifications in `fmt` and writes the output
+/// to `stdout` followed by a newline.
+template <typename... T>
+FMT_INLINE void println(format_string<T...> fmt, T&&... args) {
+  return fmt::println(stdout, fmt, static_cast<T&&>(args)...);
+}
+
+FMT_END_EXPORT
+FMT_GCC_PRAGMA("GCC pop_options")
+FMT_END_NAMESPACE
+
+#ifdef FMT_HEADER_ONLY
+#  include "format.h"
+#endif
+#endif  // FMT_BASE_H_
diff --git a/lib/fmt/fmt/chrono.h b/lib/fmt/fmt/chrono.h
new file mode 100644
index 000000000..c93123fd3
--- /dev/null
+++ b/lib/fmt/fmt/chrono.h
@@ -0,0 +1,2432 @@
+// Formatting library for C++ - chrono support
+//
+// Copyright (c) 2012 - present, Victor Zverovich
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_CHRONO_H_
+#define FMT_CHRONO_H_
+
+#ifndef FMT_MODULE
+#  include <algorithm>
+#  include <chrono>
+#  include <cmath>    // std::isfinite
+#  include <cstring>  // std::memcpy
+#  include <ctime>
+#  include <iterator>
+#  include <locale>
+#  include <ostream>
+#  include <type_traits>
+#endif
+
+#include "format.h"
+
+FMT_BEGIN_NAMESPACE
+
+// Check if std::chrono::local_t is available.
+#ifndef FMT_USE_LOCAL_TIME
+#  ifdef __cpp_lib_chrono
+#    define FMT_USE_LOCAL_TIME (__cpp_lib_chrono >= 201907L)
+#  else
+#    define FMT_USE_LOCAL_TIME 0
+#  endif
+#endif
+
+// Check if std::chrono::utc_timestamp is available.
+#ifndef FMT_USE_UTC_TIME
+#  ifdef __cpp_lib_chrono
+#    define FMT_USE_UTC_TIME (__cpp_lib_chrono >= 201907L)
+#  else
+#    define FMT_USE_UTC_TIME 0
+#  endif
+#endif
+
+// Enable tzset.
+#ifndef FMT_USE_TZSET
+// UWP doesn't provide _tzset.
+#  if FMT_HAS_INCLUDE("winapifamily.h")
+#    include <winapifamily.h>
+#  endif
+#  if defined(_WIN32) && (!defined(WINAPI_FAMILY) || \
+                          (WINAPI_FAMILY == WINAPI_FAMILY_DESKTOP_APP))
+#    define FMT_USE_TZSET 1
+#  else
+#    define FMT_USE_TZSET 0
+#  endif
+#endif
+
+// Enable safe chrono durations, unless explicitly disabled.
+#ifndef FMT_SAFE_DURATION_CAST
+#  define FMT_SAFE_DURATION_CAST 1
+#endif
+#if FMT_SAFE_DURATION_CAST
+
+// For conversion between std::chrono::durations without undefined
+// behaviour or erroneous results.
+// This is a stripped down version of duration_cast, for inclusion in fmt.
+// See https://github.com/pauldreik/safe_duration_cast
+//
+// Copyright Paul Dreik 2019
+namespace safe_duration_cast {
+
+template <typename To, typename From,
+          FMT_ENABLE_IF(!std::is_same<From, To>::value &&
+                        std::numeric_limits<From>::is_signed ==
+                            std::numeric_limits<To>::is_signed)>
+FMT_CONSTEXPR auto lossless_integral_conversion(const From from, int& ec)
+    -> To {
+  ec = 0;
+  using F = std::numeric_limits<From>;
+  using T = std::numeric_limits<To>;
+  static_assert(F::is_integer, "From must be integral");
+  static_assert(T::is_integer, "To must be integral");
+
+  // A and B are both signed, or both unsigned.
+  if (detail::const_check(F::digits <= T::digits)) {
+    // From fits in To without any problem.
+  } else {
+    // From does not always fit in To, resort to a dynamic check.
+    if (from < (T::min)() || from > (T::max)()) {
+      // outside range.
+      ec = 1;
+      return {};
+    }
+  }
+  return static_cast<To>(from);
+}
+
+/// Converts From to To, without loss. If the dynamic value of from
+/// can't be converted to To without loss, ec is set.
+template <typename To, typename From,
+          FMT_ENABLE_IF(!std::is_same<From, To>::value &&
+                        std::numeric_limits<From>::is_signed !=
+                            std::numeric_limits<To>::is_signed)>
+FMT_CONSTEXPR auto lossless_integral_conversion(const From from, int& ec)
+    -> To {
+  ec = 0;
+  using F = std::numeric_limits<From>;
+  using T = std::numeric_limits<To>;
+  static_assert(F::is_integer, "From must be integral");
+  static_assert(T::is_integer, "To must be integral");
+
+  if (detail::const_check(F::is_signed && !T::is_signed)) {
+    // From may be negative, not allowed!
+    if (fmt::detail::is_negative(from)) {
+      ec = 1;
+      return {};
+    }
+    // From is positive. Can it always fit in To?
+    if (detail::const_check(F::digits > T::digits) &&
+        from > static_cast<From>(detail::max_value<To>())) {
+      ec = 1;
+      return {};
+    }
+  }
+
+  if (detail::const_check(!F::is_signed && T::is_signed &&
+                          F::digits >= T::digits) &&
+      from > static_cast<From>(detail::max_value<To>())) {
+    ec = 1;
+    return {};
+  }
+  return static_cast<To>(from);  // Lossless conversion.
+}
+
+template <typename To, typename From,
+          FMT_ENABLE_IF(std::is_same<From, To>::value)>
+FMT_CONSTEXPR auto lossless_integral_conversion(const From from, int& ec)
+    -> To {
+  ec = 0;
+  return from;
+}  // function
+
+// clang-format off
+/**
+ * converts From to To if possible, otherwise ec is set.
+ *
+ * input                            |    output
+ * ---------------------------------|---------------
+ * NaN                              | NaN
+ * Inf                              | Inf
+ * normal, fits in output           | converted (possibly lossy)
+ * normal, does not fit in output   | ec is set
+ * subnormal                        | best effort
+ * -Inf                             | -Inf
+ */
+// clang-format on
+template <typename To, typename From,
+          FMT_ENABLE_IF(!std::is_same<From, To>::value)>
+FMT_CONSTEXPR auto safe_float_conversion(const From from, int& ec) -> To {
+  ec = 0;
+  using T = std::numeric_limits<To>;
+  static_assert(std::is_floating_point<From>::value, "From must be floating");
+  static_assert(std::is_floating_point<To>::value, "To must be floating");
+
+  // catch the only happy case
+  if (std::isfinite(from)) {
+    if (from >= T::lowest() && from <= (T::max)()) {
+      return static_cast<To>(from);
+    }
+    // not within range.
+    ec = 1;
+    return {};
+  }
+
+  // nan and inf will be preserved
+  return static_cast<To>(from);
+}  // function
+
+template <typename To, typename From,
+          FMT_ENABLE_IF(std::is_same<From, To>::value)>
+FMT_CONSTEXPR auto safe_float_conversion(const From from, int& ec) -> To {
+  ec = 0;
+  static_assert(std::is_floating_point<From>::value, "From must be floating");
+  return from;
+}
+
+/// Safe duration cast between integral durations
+template <typename To, typename FromRep, typename FromPeriod,
+          FMT_ENABLE_IF(std::is_integral<FromRep>::value),
+          FMT_ENABLE_IF(std::is_integral<typename To::rep>::value)>
+auto safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
+                        int& ec) -> To {
+  using From = std::chrono::duration<FromRep, FromPeriod>;
+  ec = 0;
+  // the basic idea is that we need to convert from count() in the from type
+  // to count() in the To type, by multiplying it with this:
+  struct Factor
+      : std::ratio_divide<typename From::period, typename To::period> {};
+
+  static_assert(Factor::num > 0, "num must be positive");
+  static_assert(Factor::den > 0, "den must be positive");
+
+  // the conversion is like this: multiply from.count() with Factor::num
+  // /Factor::den and convert it to To::rep, all this without
+  // overflow/underflow. let's start by finding a suitable type that can hold
+  // both To, From and Factor::num
+  using IntermediateRep =
+      typename std::common_type<typename From::rep, typename To::rep,
+                                decltype(Factor::num)>::type;
+
+  // safe conversion to IntermediateRep
+  IntermediateRep count =
+      lossless_integral_conversion<IntermediateRep>(from.count(), ec);
+  if (ec) return {};
+  // multiply with Factor::num without overflow or underflow
+  if (detail::const_check(Factor::num != 1)) {
+    const auto max1 = detail::max_value<IntermediateRep>() / Factor::num;
+    if (count > max1) {
+      ec = 1;
+      return {};
+    }
+    const auto min1 =
+        (std::numeric_limits<IntermediateRep>::min)() / Factor::num;
+    if (detail::const_check(!std::is_unsigned<IntermediateRep>::value) &&
+        count < min1) {
+      ec = 1;
+      return {};
+    }
+    count *= Factor::num;
+  }
+
+  if (detail::const_check(Factor::den != 1)) count /= Factor::den;
+  auto tocount = lossless_integral_conversion<typename To::rep>(count, ec);
+  return ec ? To() : To(tocount);
+}
+
+/// Safe duration_cast between floating point durations
+template <typename To, typename FromRep, typename FromPeriod,
+          FMT_ENABLE_IF(std::is_floating_point<FromRep>::value),
+          FMT_ENABLE_IF(std::is_floating_point<typename To::rep>::value)>
+auto safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
+                        int& ec) -> To {
+  using From = std::chrono::duration<FromRep, FromPeriod>;
+  ec = 0;
+  if (std::isnan(from.count())) {
+    // nan in, gives nan out. easy.
+    return To{std::numeric_limits<typename To::rep>::quiet_NaN()};
+  }
+  // maybe we should also check if from is denormal, and decide what to do about
+  // it.
+
+  // +-inf should be preserved.
+  if (std::isinf(from.count())) {
+    return To{from.count()};
+  }
+
+  // the basic idea is that we need to convert from count() in the from type
+  // to count() in the To type, by multiplying it with this:
+  struct Factor
+      : std::ratio_divide<typename From::period, typename To::period> {};
+
+  static_assert(Factor::num > 0, "num must be positive");
+  static_assert(Factor::den > 0, "den must be positive");
+
+  // the conversion is like this: multiply from.count() with Factor::num
+  // /Factor::den and convert it to To::rep, all this without
+  // overflow/underflow. let's start by finding a suitable type that can hold
+  // both To, From and Factor::num
+  using IntermediateRep =
+      typename std::common_type<typename From::rep, typename To::rep,
+                                decltype(Factor::num)>::type;
+
+  // force conversion of From::rep -> IntermediateRep to be safe,
+  // even if it will never be narrowing in this context.
+  IntermediateRep count =
+      safe_float_conversion<IntermediateRep>(from.count(), ec);
+  if (ec) {
+    return {};
+  }
+
+  // multiply with Factor::num without overflow or underflow
+  if (detail::const_check(Factor::num != 1)) {
+    constexpr auto max1 = detail::max_value<IntermediateRep>() /
+                          static_cast<IntermediateRep>(Factor::num);
+    if (count > max1) {
+      ec = 1;
+      return {};
+    }
+    constexpr auto min1 = std::numeric_limits<IntermediateRep>::lowest() /
+                          static_cast<IntermediateRep>(Factor::num);
+    if (count < min1) {
+      ec = 1;
+      return {};
+    }
+    count *= static_cast<IntermediateRep>(Factor::num);
+  }
+
+  // this can't go wrong, right? den>0 is checked earlier.
+  if (detail::const_check(Factor::den != 1)) {
+    using common_t = typename std::common_type<IntermediateRep, intmax_t>::type;
+    count /= static_cast<common_t>(Factor::den);
+  }
+
+  // convert to the to type, safely
+  using ToRep = typename To::rep;
+
+  const ToRep tocount = safe_float_conversion<ToRep>(count, ec);
+  if (ec) {
+    return {};
+  }
+  return To{tocount};
+}
+}  // namespace safe_duration_cast
+#endif
+
+// Prevents expansion of a preceding token as a function-style macro.
+// Usage: f FMT_NOMACRO()
+#define FMT_NOMACRO
+
+namespace detail {
+template <typename T = void> struct null {};
+inline auto localtime_r FMT_NOMACRO(...) -> null<> { return null<>(); }
+inline auto localtime_s(...) -> null<> { return null<>(); }
+inline auto gmtime_r(...) -> null<> { return null<>(); }
+inline auto gmtime_s(...) -> null<> { return null<>(); }
+
+// It is defined here and not in ostream.h because the latter has expensive
+// includes.
+template <typename Streambuf> class formatbuf : public Streambuf {
+ private:
+  using char_type = typename Streambuf::char_type;
+  using streamsize = decltype(std::declval<Streambuf&>().sputn(nullptr, 0));
+  using int_type = typename Streambuf::int_type;
+  using traits_type = typename Streambuf::traits_type;
+
+  buffer<char_type>& buffer_;
+
+ public:
+  explicit formatbuf(buffer<char_type>& buf) : buffer_(buf) {}
+
+ protected:
+  // The put area is always empty. This makes the implementation simpler and has
+  // the advantage that the streambuf and the buffer are always in sync and
+  // sputc never writes into uninitialized memory. A disadvantage is that each
+  // call to sputc always results in a (virtual) call to overflow. There is no
+  // disadvantage here for sputn since this always results in a call to xsputn.
+
+  auto overflow(int_type ch) -> int_type override {
+    if (!traits_type::eq_int_type(ch, traits_type::eof()))
+      buffer_.push_back(static_cast<char_type>(ch));
+    return ch;
+  }
+
+  auto xsputn(const char_type* s, streamsize count) -> streamsize override {
+    buffer_.append(s, s + count);
+    return count;
+  }
+};
+
+inline auto get_classic_locale() -> const std::locale& {
+  static const auto& locale = std::locale::classic();
+  return locale;
+}
+
+template <typename CodeUnit> struct codecvt_result {
+  static constexpr const size_t max_size = 32;
+  CodeUnit buf[max_size];
+  CodeUnit* end;
+};
+
+template <typename CodeUnit>
+void write_codecvt(codecvt_result<CodeUnit>& out, string_view in_buf,
+                   const std::locale& loc) {
+#if FMT_CLANG_VERSION
+#  pragma clang diagnostic push
+#  pragma clang diagnostic ignored "-Wdeprecated"
+  auto& f = std::use_facet<std::codecvt<CodeUnit, char, std::mbstate_t>>(loc);
+#  pragma clang diagnostic pop
+#else
+  auto& f = std::use_facet<std::codecvt<CodeUnit, char, std::mbstate_t>>(loc);
+#endif
+  auto mb = std::mbstate_t();
+  const char* from_next = nullptr;
+  auto result = f.in(mb, in_buf.begin(), in_buf.end(), from_next,
+                     std::begin(out.buf), std::end(out.buf), out.end);
+  if (result != std::codecvt_base::ok)
+    FMT_THROW(format_error("failed to format time"));
+}
+
+template <typename OutputIt>
+auto write_encoded_tm_str(OutputIt out, string_view in, const std::locale& loc)
+    -> OutputIt {
+  if (detail::use_utf8() && loc != get_classic_locale()) {
+    // char16_t and char32_t codecvts are broken in MSVC (linkage errors) and
+    // gcc-4.
+#if FMT_MSC_VERSION != 0 ||  \
+    (defined(__GLIBCXX__) && \
+     (!defined(_GLIBCXX_USE_DUAL_ABI) || _GLIBCXX_USE_DUAL_ABI == 0))
+    // The _GLIBCXX_USE_DUAL_ABI macro is always defined in libstdc++ from gcc-5
+    // and newer.
+    using code_unit = wchar_t;
+#else
+    using code_unit = char32_t;
+#endif
+
+    using unit_t = codecvt_result<code_unit>;
+    unit_t unit;
+    write_codecvt(unit, in, loc);
+    // UTF-8 uses one to four one-byte code units per code point.
+    auto u =
+        to_utf8<code_unit, basic_memory_buffer<char, unit_t::max_size * 4>>();
+    if (!u.convert({unit.buf, to_unsigned(unit.end - unit.buf)}))
+      FMT_THROW(format_error("failed to format time"));
+    return copy(u.c_str(), u.c_str() + u.size(), out);
+  }
+  return copy(in.data(), in.data() + in.size(), out);
+}
+
+template <typename Char, typename OutputIt,
+          FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
+auto write_tm_str(OutputIt out, string_view sv, const std::locale& loc)
+    -> OutputIt {
+  codecvt_result<Char> unit;
+  write_codecvt(unit, sv, loc);
+  return copy(unit.buf, unit.end, out);
+}
+
+template <typename Char, typename OutputIt,
+          FMT_ENABLE_IF(std::is_same<Char, char>::value)>
+auto write_tm_str(OutputIt out, string_view sv, const std::locale& loc)
+    -> OutputIt {
+  return write_encoded_tm_str(out, sv, loc);
+}
+
+template <typename Char>
+inline void do_write(buffer<Char>& buf, const std::tm& time,
+                     const std::locale& loc, char format, char modifier) {
+  auto&& format_buf = formatbuf<std::basic_streambuf<Char>>(buf);
+  auto&& os = std::basic_ostream<Char>(&format_buf);
+  os.imbue(loc);
+  const auto& facet = std::use_facet<std::time_put<Char>>(loc);
+  auto end = facet.put(os, os, Char(' '), &time, format, modifier);
+  if (end.failed()) FMT_THROW(format_error("failed to format time"));
+}
+
+template <typename Char, typename OutputIt,
+          FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
+auto write(OutputIt out, const std::tm& time, const std::locale& loc,
+           char format, char modifier = 0) -> OutputIt {
+  auto&& buf = get_buffer<Char>(out);
+  do_write<Char>(buf, time, loc, format, modifier);
+  return get_iterator(buf, out);
+}
+
+template <typename Char, typename OutputIt,
+          FMT_ENABLE_IF(std::is_same<Char, char>::value)>
+auto write(OutputIt out, const std::tm& time, const std::locale& loc,
+           char format, char modifier = 0) -> OutputIt {
+  auto&& buf = basic_memory_buffer<Char>();
+  do_write(buf, time, loc, format, modifier);
+  return write_encoded_tm_str(out, string_view(buf.data(), buf.size()), loc);
+}
+
+template <typename T1, typename T2>
+struct is_same_arithmetic_type
+    : public std::integral_constant<bool,
+                                    (std::is_integral<T1>::value &&
+                                     std::is_integral<T2>::value) ||
+                                        (std::is_floating_point<T1>::value &&
+                                         std::is_floating_point<T2>::value)> {
+};
+
+template <
+    typename To, typename FromRep, typename FromPeriod,
+    FMT_ENABLE_IF(is_same_arithmetic_type<FromRep, typename To::rep>::value)>
+auto fmt_duration_cast(std::chrono::duration<FromRep, FromPeriod> from) -> To {
+#if FMT_SAFE_DURATION_CAST
+  // Throwing version of safe_duration_cast is only available for
+  // integer to integer or float to float casts.
+  int ec;
+  To to = safe_duration_cast::safe_duration_cast<To>(from, ec);
+  if (ec) FMT_THROW(format_error("cannot format duration"));
+  return to;
+#else
+  // Standard duration cast, may overflow.
+  return std::chrono::duration_cast<To>(from);
+#endif
+}
+
+template <
+    typename To, typename FromRep, typename FromPeriod,
+    FMT_ENABLE_IF(!is_same_arithmetic_type<FromRep, typename To::rep>::value)>
+auto fmt_duration_cast(std::chrono::duration<FromRep, FromPeriod> from) -> To {
+  // Mixed integer <-> float cast is not supported by safe_duration_cast.
+  return std::chrono::duration_cast<To>(from);
+}
+
+template <typename Duration>
+auto to_time_t(
+    std::chrono::time_point<std::chrono::system_clock, Duration> time_point)
+    -> std::time_t {
+  // Cannot use std::chrono::system_clock::to_time_t since this would first
+  // require a cast to std::chrono::system_clock::time_point, which could
+  // overflow.
+  return fmt_duration_cast<std::chrono::duration<std::time_t>>(
+             time_point.time_since_epoch())
+      .count();
+}
+}  // namespace detail
+
+FMT_BEGIN_EXPORT
+
+/**
+ * Converts given time since epoch as `std::time_t` value into calendar time,
+ * expressed in local time. Unlike `std::localtime`, this function is
+ * thread-safe on most platforms.
+ */
+inline auto localtime(std::time_t time) -> std::tm {
+  struct dispatcher {
+    std::time_t time_;
+    std::tm tm_;
+
+    dispatcher(std::time_t t) : time_(t) {}
+
+    auto run() -> bool {
+      using namespace fmt::detail;
+      return handle(localtime_r(&time_, &tm_));
+    }
+
+    auto handle(std::tm* tm) -> bool { return tm != nullptr; }
+
+    auto handle(detail::null<>) -> bool {
+      using namespace fmt::detail;
+      return fallback(localtime_s(&tm_, &time_));
+    }
+
+    auto fallback(int res) -> bool { return res == 0; }
+
+#if !FMT_MSC_VERSION
+    auto fallback(detail::null<>) -> bool {
+      using namespace fmt::detail;
+      std::tm* tm = std::localtime(&time_);
+      if (tm) tm_ = *tm;
+      return tm != nullptr;
+    }
+#endif
+  };
+  dispatcher lt(time);
+  // Too big time values may be unsupported.
+  if (!lt.run()) FMT_THROW(format_error("time_t value out of range"));
+  return lt.tm_;
+}
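+
+// Usage sketch (illustrative; relies on the std::tm formatter defined later in
+// this header):
+//
+//     std::tm tm = fmt::localtime(std::time(nullptr));
+//     fmt::print("{:%Y-%m-%d %H:%M:%S}\n", tm);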
+
+#if FMT_USE_LOCAL_TIME
+template <typename Duration>
+inline auto localtime(std::chrono::local_time<Duration> time) -> std::tm {
+  return localtime(
+      detail::to_time_t(std::chrono::current_zone()->to_sys(time)));
+}
+#endif
+
+/**
+ * Converts given time since epoch as `std::time_t` value into calendar time,
+ * expressed in Coordinated Universal Time (UTC). Unlike `std::gmtime`, this
+ * function is thread-safe on most platforms.
+ */
+inline auto gmtime(std::time_t time) -> std::tm {
+  struct dispatcher {
+    std::time_t time_;
+    std::tm tm_;
+
+    dispatcher(std::time_t t) : time_(t) {}
+
+    auto run() -> bool {
+      using namespace fmt::detail;
+      return handle(gmtime_r(&time_, &tm_));
+    }
+
+    auto handle(std::tm* tm) -> bool { return tm != nullptr; }
+
+    auto handle(detail::null<>) -> bool {
+      using namespace fmt::detail;
+      return fallback(gmtime_s(&tm_, &time_));
+    }
+
+    auto fallback(int res) -> bool { return res == 0; }
+
+#if !FMT_MSC_VERSION
+    auto fallback(detail::null<>) -> bool {
+      std::tm* tm = std::gmtime(&time_);
+      if (tm) tm_ = *tm;
+      return tm != nullptr;
+    }
+#endif
+  };
+  auto gt = dispatcher(time);
+  // Too big time values may be unsupported.
+  if (!gt.run()) FMT_THROW(format_error("time_t value out of range"));
+  return gt.tm_;
+}
+
+template <typename Duration>
+inline auto gmtime(
+    std::chrono::time_point<std::chrono::system_clock, Duration> time_point)
+    -> std::tm {
+  return gmtime(detail::to_time_t(time_point));
+}
+
+namespace detail {
+
+// Writes two-digit numbers a, b and c separated by sep to buf.
+// The method by Pavel Novikov based on
+// https://johnnylee-sde.github.io/Fast-unsigned-integer-to-time-string/.
+inline void write_digit2_separated(char* buf, unsigned a, unsigned b,
+                                   unsigned c, char sep) {
+  unsigned long long digits =
+      a | (b << 24) | (static_cast<unsigned long long>(c) << 48);
+  // Convert each value to BCD.
+  // We have x = a * 10 + b and we want to convert it to BCD y = a * 16 + b.
+  // The difference is
+  //   y - x = a * 6
+  // a can be found from x:
+  //   a = floor(x / 10)
+  // then
+  //   y = x + a * 6 = x + floor(x / 10) * 6
+  // floor(x / 10) is (x * 205) >> 11 (needs 16 bits).
+  digits += (((digits * 205) >> 11) & 0x000f00000f00000f) * 6;
+  // Put low nibbles to high bytes and high nibbles to low bytes.
+  digits = ((digits & 0x00f00000f00000f0) >> 4) |
+           ((digits & 0x000f00000f00000f) << 8);
+  auto usep = static_cast(sep);
+  // Add ASCII '0' to each digit byte and insert separators.
+  digits |= 0x3030003030003030 | (usep << 16) | (usep << 40);
+
+  constexpr const size_t len = 8;
+  if (const_check(is_big_endian())) {
+    char tmp[len];
+    std::memcpy(tmp, &digits, len);
+    std::reverse_copy(tmp, tmp + len, buf);
+  } else {
+    std::memcpy(buf, &digits, len);
+  }
+}
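+
+// For example, write_digit2_separated(buf, 23, 59, 7, ':') writes the eight
+// characters "23:59:07" to buf; no terminating null is appended.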
+
+template <typename Period>
+FMT_CONSTEXPR inline auto get_units() -> const char* {
+  if (std::is_same<Period, std::atto>::value) return "as";
+  if (std::is_same<Period, std::femto>::value) return "fs";
+  if (std::is_same<Period, std::pico>::value) return "ps";
+  if (std::is_same<Period, std::nano>::value) return "ns";
+  if (std::is_same<Period, std::micro>::value) return "µs";
+  if (std::is_same<Period, std::milli>::value) return "ms";
+  if (std::is_same<Period, std::centi>::value) return "cs";
+  if (std::is_same<Period, std::deci>::value) return "ds";
+  if (std::is_same<Period, std::ratio<1>>::value) return "s";
+  if (std::is_same<Period, std::deca>::value) return "das";
+  if (std::is_same<Period, std::hecto>::value) return "hs";
+  if (std::is_same<Period, std::kilo>::value) return "ks";
+  if (std::is_same<Period, std::mega>::value) return "Ms";
+  if (std::is_same<Period, std::giga>::value) return "Gs";
+  if (std::is_same<Period, std::tera>::value) return "Ts";
+  if (std::is_same<Period, std::peta>::value) return "Ps";
+  if (std::is_same<Period, std::exa>::value) return "Es";
+  if (std::is_same<Period, std::ratio<60>>::value) return "min";
+  if (std::is_same<Period, std::ratio<3600>>::value) return "h";
+  if (std::is_same<Period, std::ratio<86400>>::value) return "d";
+  return nullptr;
+}
+
+enum class numeric_system {
+  standard,
+  // Alternative numeric system, e.g. 十二 instead of 12 in ja_JP locale.
+  alternative
+};
+
+// Glibc extensions for formatting numeric values.
+enum class pad_type {
+  // Pad a numeric result string with zeros (the default).
+  zero,
+  // Do not pad a numeric result string.
+  none,
+  // Pad a numeric result string with spaces.
+  space,
+};
+
+template <typename OutputIt>
+auto write_padding(OutputIt out, pad_type pad, int width) -> OutputIt {
+  if (pad == pad_type::none) return out;
+  return detail::fill_n(out, width, pad == pad_type::space ? ' ' : '0');
+}
+
+template <typename OutputIt>
+auto write_padding(OutputIt out, pad_type pad) -> OutputIt {
+  if (pad != pad_type::none) *out++ = pad == pad_type::space ? ' ' : '0';
+  return out;
+}
+
+// Parses a put_time-like format string and invokes handler actions.
+template <typename Char, typename Handler>
+FMT_CONSTEXPR auto parse_chrono_format(const Char* begin, const Char* end,
+                                       Handler&& handler) -> const Char* {
+  if (begin == end || *begin == '}') return begin;
+  if (*begin != '%') FMT_THROW(format_error("invalid format"));
+  auto ptr = begin;
+  while (ptr != end) {
+    pad_type pad = pad_type::zero;
+    auto c = *ptr;
+    if (c == '}') break;
+    if (c != '%') {
+      ++ptr;
+      continue;
+    }
+    if (begin != ptr) handler.on_text(begin, ptr);
+    ++ptr;  // consume '%'
+    if (ptr == end) FMT_THROW(format_error("invalid format"));
+    c = *ptr;
+    switch (c) {
+    case '_':
+      pad = pad_type::space;
+      ++ptr;
+      break;
+    case '-':
+      pad = pad_type::none;
+      ++ptr;
+      break;
+    }
+    if (ptr == end) FMT_THROW(format_error("invalid format"));
+    c = *ptr++;
+    switch (c) {
+    case '%':
+      handler.on_text(ptr - 1, ptr);
+      break;
+    case 'n': {
+      const Char newline[] = {'\n'};
+      handler.on_text(newline, newline + 1);
+      break;
+    }
+    case 't': {
+      const Char tab[] = {'\t'};
+      handler.on_text(tab, tab + 1);
+      break;
+    }
+    // Year:
+    case 'Y':
+      handler.on_year(numeric_system::standard);
+      break;
+    case 'y':
+      handler.on_short_year(numeric_system::standard);
+      break;
+    case 'C':
+      handler.on_century(numeric_system::standard);
+      break;
+    case 'G':
+      handler.on_iso_week_based_year();
+      break;
+    case 'g':
+      handler.on_iso_week_based_short_year();
+      break;
+    // Day of the week:
+    case 'a':
+      handler.on_abbr_weekday();
+      break;
+    case 'A':
+      handler.on_full_weekday();
+      break;
+    case 'w':
+      handler.on_dec0_weekday(numeric_system::standard);
+      break;
+    case 'u':
+      handler.on_dec1_weekday(numeric_system::standard);
+      break;
+    // Month:
+    case 'b':
+    case 'h':
+      handler.on_abbr_month();
+      break;
+    case 'B':
+      handler.on_full_month();
+      break;
+    case 'm':
+      handler.on_dec_month(numeric_system::standard);
+      break;
+    // Day of the year/month:
+    case 'U':
+      handler.on_dec0_week_of_year(numeric_system::standard, pad);
+      break;
+    case 'W':
+      handler.on_dec1_week_of_year(numeric_system::standard, pad);
+      break;
+    case 'V':
+      handler.on_iso_week_of_year(numeric_system::standard, pad);
+      break;
+    case 'j':
+      handler.on_day_of_year();
+      break;
+    case 'd':
+      handler.on_day_of_month(numeric_system::standard, pad);
+      break;
+    case 'e':
+      handler.on_day_of_month(numeric_system::standard, pad_type::space);
+      break;
+    // Hour, minute, second:
+    case 'H':
+      handler.on_24_hour(numeric_system::standard, pad);
+      break;
+    case 'I':
+      handler.on_12_hour(numeric_system::standard, pad);
+      break;
+    case 'M':
+      handler.on_minute(numeric_system::standard, pad);
+      break;
+    case 'S':
+      handler.on_second(numeric_system::standard, pad);
+      break;
+    // Other:
+    case 'c':
+      handler.on_datetime(numeric_system::standard);
+      break;
+    case 'x':
+      handler.on_loc_date(numeric_system::standard);
+      break;
+    case 'X':
+      handler.on_loc_time(numeric_system::standard);
+      break;
+    case 'D':
+      handler.on_us_date();
+      break;
+    case 'F':
+      handler.on_iso_date();
+      break;
+    case 'r':
+      handler.on_12_hour_time();
+      break;
+    case 'R':
+      handler.on_24_hour_time();
+      break;
+    case 'T':
+      handler.on_iso_time();
+      break;
+    case 'p':
+      handler.on_am_pm();
+      break;
+    case 'Q':
+      handler.on_duration_value();
+      break;
+    case 'q':
+      handler.on_duration_unit();
+      break;
+    case 'z':
+      handler.on_utc_offset(numeric_system::standard);
+      break;
+    case 'Z':
+      handler.on_tz_name();
+      break;
+    // Alternative representation:
+    case 'E': {
+      if (ptr == end) FMT_THROW(format_error("invalid format"));
+      c = *ptr++;
+      switch (c) {
+      case 'Y':
+        handler.on_year(numeric_system::alternative);
+        break;
+      case 'y':
+        handler.on_offset_year();
+        break;
+      case 'C':
+        handler.on_century(numeric_system::alternative);
+        break;
+      case 'c':
+        handler.on_datetime(numeric_system::alternative);
+        break;
+      case 'x':
+        handler.on_loc_date(numeric_system::alternative);
+        break;
+      case 'X':
+        handler.on_loc_time(numeric_system::alternative);
+        break;
+      case 'z':
+        handler.on_utc_offset(numeric_system::alternative);
+        break;
+      default:
+        FMT_THROW(format_error("invalid format"));
+      }
+      break;
+    }
+    case 'O':
+      if (ptr == end) FMT_THROW(format_error("invalid format"));
+      c = *ptr++;
+      switch (c) {
+      case 'y':
+        handler.on_short_year(numeric_system::alternative);
+        break;
+      case 'm':
+        handler.on_dec_month(numeric_system::alternative);
+        break;
+      case 'U':
+        handler.on_dec0_week_of_year(numeric_system::alternative, pad);
+        break;
+      case 'W':
+        handler.on_dec1_week_of_year(numeric_system::alternative, pad);
+        break;
+      case 'V':
+        handler.on_iso_week_of_year(numeric_system::alternative, pad);
+        break;
+      case 'd':
+        handler.on_day_of_month(numeric_system::alternative, pad);
+        break;
+      case 'e':
+        handler.on_day_of_month(numeric_system::alternative, pad_type::space);
+        break;
+      case 'w':
+        handler.on_dec0_weekday(numeric_system::alternative);
+        break;
+      case 'u':
+        handler.on_dec1_weekday(numeric_system::alternative);
+        break;
+      case 'H':
+        handler.on_24_hour(numeric_system::alternative, pad);
+        break;
+      case 'I':
+        handler.on_12_hour(numeric_system::alternative, pad);
+        break;
+      case 'M':
+        handler.on_minute(numeric_system::alternative, pad);
+        break;
+      case 'S':
+        handler.on_second(numeric_system::alternative, pad);
+        break;
+      case 'z':
+        handler.on_utc_offset(numeric_system::alternative);
+        break;
+      default:
+        FMT_THROW(format_error("invalid format"));
+      }
+      break;
+    default:
+      FMT_THROW(format_error("invalid format"));
+    }
+    begin = ptr;
+  }
+  if (begin != ptr) handler.on_text(begin, ptr);
+  return ptr;
+}
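+
+// The '-' and '_' flags parsed above mirror the glibc strftime extensions,
+// e.g. (illustrative, classic locale):
+//
+//   fmt::format("{:%H:%M:%S}", std::chrono::seconds(65));    // "00:01:05"
+//   fmt::format("{:%-H:%-M:%-S}", std::chrono::seconds(65)); // "0:1:5"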
+
+template <typename Derived> struct null_chrono_spec_handler {
+  FMT_CONSTEXPR void unsupported() {
+    static_cast<Derived*>(this)->unsupported();
+  }
+  FMT_CONSTEXPR void on_year(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_short_year(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_offset_year() { unsupported(); }
+  FMT_CONSTEXPR void on_century(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_iso_week_based_year() { unsupported(); }
+  FMT_CONSTEXPR void on_iso_week_based_short_year() { unsupported(); }
+  FMT_CONSTEXPR void on_abbr_weekday() { unsupported(); }
+  FMT_CONSTEXPR void on_full_weekday() { unsupported(); }
+  FMT_CONSTEXPR void on_dec0_weekday(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_dec1_weekday(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_abbr_month() { unsupported(); }
+  FMT_CONSTEXPR void on_full_month() { unsupported(); }
+  FMT_CONSTEXPR void on_dec_month(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_dec0_week_of_year(numeric_system, pad_type) {
+    unsupported();
+  }
+  FMT_CONSTEXPR void on_dec1_week_of_year(numeric_system, pad_type) {
+    unsupported();
+  }
+  FMT_CONSTEXPR void on_iso_week_of_year(numeric_system, pad_type) {
+    unsupported();
+  }
+  FMT_CONSTEXPR void on_day_of_year() { unsupported(); }
+  FMT_CONSTEXPR void on_day_of_month(numeric_system, pad_type) {
+    unsupported();
+  }
+  FMT_CONSTEXPR void on_24_hour(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_12_hour(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_minute(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_second(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_datetime(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_loc_date(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_loc_time(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_us_date() { unsupported(); }
+  FMT_CONSTEXPR void on_iso_date() { unsupported(); }
+  FMT_CONSTEXPR void on_12_hour_time() { unsupported(); }
+  FMT_CONSTEXPR void on_24_hour_time() { unsupported(); }
+  FMT_CONSTEXPR void on_iso_time() { unsupported(); }
+  FMT_CONSTEXPR void on_am_pm() { unsupported(); }
+  FMT_CONSTEXPR void on_duration_value() { unsupported(); }
+  FMT_CONSTEXPR void on_duration_unit() { unsupported(); }
+  FMT_CONSTEXPR void on_utc_offset(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_tz_name() { unsupported(); }
+};
+
+struct tm_format_checker : null_chrono_spec_handler<tm_format_checker> {
+  FMT_NORETURN void unsupported() { FMT_THROW(format_error("no format")); }
+
+  template <typename Char>
+  FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
+  FMT_CONSTEXPR void on_year(numeric_system) {}
+  FMT_CONSTEXPR void on_short_year(numeric_system) {}
+  FMT_CONSTEXPR void on_offset_year() {}
+  FMT_CONSTEXPR void on_century(numeric_system) {}
+  FMT_CONSTEXPR void on_iso_week_based_year() {}
+  FMT_CONSTEXPR void on_iso_week_based_short_year() {}
+  FMT_CONSTEXPR void on_abbr_weekday() {}
+  FMT_CONSTEXPR void on_full_weekday() {}
+  FMT_CONSTEXPR void on_dec0_weekday(numeric_system) {}
+  FMT_CONSTEXPR void on_dec1_weekday(numeric_system) {}
+  FMT_CONSTEXPR void on_abbr_month() {}
+  FMT_CONSTEXPR void on_full_month() {}
+  FMT_CONSTEXPR void on_dec_month(numeric_system) {}
+  FMT_CONSTEXPR void on_dec0_week_of_year(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_dec1_week_of_year(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_iso_week_of_year(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_day_of_year() {}
+  FMT_CONSTEXPR void on_day_of_month(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_24_hour(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_12_hour(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_minute(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_second(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_datetime(numeric_system) {}
+  FMT_CONSTEXPR void on_loc_date(numeric_system) {}
+  FMT_CONSTEXPR void on_loc_time(numeric_system) {}
+  FMT_CONSTEXPR void on_us_date() {}
+  FMT_CONSTEXPR void on_iso_date() {}
+  FMT_CONSTEXPR void on_12_hour_time() {}
+  FMT_CONSTEXPR void on_24_hour_time() {}
+  FMT_CONSTEXPR void on_iso_time() {}
+  FMT_CONSTEXPR void on_am_pm() {}
+  FMT_CONSTEXPR void on_utc_offset(numeric_system) {}
+  FMT_CONSTEXPR void on_tz_name() {}
+};
+
+inline auto tm_wday_full_name(int wday) -> const char* {
+  static constexpr const char* full_name_list[] = {
+      "Sunday",   "Monday", "Tuesday", "Wednesday",
+      "Thursday", "Friday", "Saturday"};
+  return wday >= 0 && wday <= 6 ? full_name_list[wday] : "?";
+}
+inline auto tm_wday_short_name(int wday) -> const char* {
+  static constexpr const char* short_name_list[] = {"Sun", "Mon", "Tue", "Wed",
+                                                    "Thu", "Fri", "Sat"};
+  return wday >= 0 && wday <= 6 ? short_name_list[wday] : "???";
+}
+
+inline auto tm_mon_full_name(int mon) -> const char* {
+  static constexpr const char* full_name_list[] = {
+      "January", "February", "March",     "April",   "May",      "June",
+      "July",    "August",   "September", "October", "November", "December"};
+  return mon >= 0 && mon <= 11 ? full_name_list[mon] : "?";
+}
+inline auto tm_mon_short_name(int mon) -> const char* {
+  static constexpr const char* short_name_list[] = {
+      "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+      "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+  };
+  return mon >= 0 && mon <= 11 ? short_name_list[mon] : "???";
+}
+
+template <typename T, typename = void>
+struct has_member_data_tm_gmtoff : std::false_type {};
+template <typename T>
+struct has_member_data_tm_gmtoff<T, void_t<decltype(T::tm_gmtoff)>>
+    : std::true_type {};
+
+template <typename T, typename = void>
+struct has_member_data_tm_zone : std::false_type {};
+template <typename T>
+struct has_member_data_tm_zone<T, void_t<decltype(T::tm_zone)>>
+    : std::true_type {};
+
+#if FMT_USE_TZSET
+inline void tzset_once() {
+  static bool init = []() -> bool {
+    _tzset();
+    return true;
+  }();
+  ignore_unused(init);
+}
+#endif
+
+// Converts value to Int and checks that it's in the range [0, upper).
+template <typename T, typename Int, FMT_ENABLE_IF(std::is_integral<T>::value)>
+inline auto to_nonnegative_int(T value, Int upper) -> Int {
+  if (!std::is_unsigned<Int>::value &&
+      (value < 0 || to_unsigned(value) > to_unsigned(upper))) {
+    FMT_THROW(fmt::format_error("chrono value is out of range"));
+  }
+  return static_cast<Int>(value);
+}
+template <typename T, typename Int, FMT_ENABLE_IF(!std::is_integral<T>::value)>
+inline auto to_nonnegative_int(T value, Int upper) -> Int {
+  auto int_value = static_cast<Int>(value);
+  if (int_value < 0 || value > static_cast<T>(upper))
+    FMT_THROW(format_error("invalid value"));
+  return int_value;
+}
+
+constexpr auto pow10(std::uint32_t n) -> long long {
+  return n == 0 ? 1 : 10 * pow10(n - 1);
+}
+
+// Counts the number of fractional digits in the range [0, 18] according to the
+// C++20 spec. If more than 18 fractional digits are required then returns 6 for
+// microseconds precision.
+template <long long Num, long long Den, int N = 0,
+          bool Enabled = (N < 19) && (Num <= max_value<long long>() / 10)>
+struct count_fractional_digits {
+  static constexpr int value =
+      Num % Den == 0 ? N : count_fractional_digits<Num * 10, Den, N + 1>::value;
+};
+
+// Base case that doesn't instantiate any more templates
+// in order to avoid overflow.
+template <long long Num, long long Den, int N>
+struct count_fractional_digits<Num, Den, N, false> {
+  static constexpr int value = (Num % Den == 0) ? N : 6;
+};
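+
+// For example, count_fractional_digits<1, 1000>::value is 3 (milliseconds
+// terminate after three digits) while count_fractional_digits<1, 3>::value
+// never terminates and falls back to 6, i.e. microsecond precision.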
+
+// Format subseconds which are given as an integer type with an appropriate
+// number of digits.
+template <typename Char, typename OutputIt, typename Duration>
+void write_fractional_seconds(OutputIt& out, Duration d, int precision = -1) {
+  constexpr auto num_fractional_digits =
+      count_fractional_digits<Duration::period::num,
+                              Duration::period::den>::value;
+
+  using subsecond_precision = std::chrono::duration<
+      typename std::common_type<typename Duration::rep,
+                                std::chrono::seconds::rep>::type,
+      std::ratio<1, detail::pow10(num_fractional_digits)>>;
+
+  const auto fractional = d - fmt_duration_cast<std::chrono::seconds>(d);
+  const auto subseconds =
+      std::chrono::treat_as_floating_point<
+          typename subsecond_precision::rep>::value
+          ? fractional.count()
+          : fmt_duration_cast<subsecond_precision>(fractional).count();
+  auto n = static_cast<uint32_or_64_or_128_t<long long>>(subseconds);
+  const int num_digits = detail::count_digits(n);
+
+  int leading_zeroes = (std::max)(0, num_fractional_digits - num_digits);
+  if (precision < 0) {
+    FMT_ASSERT(!std::is_floating_point<typename Duration::rep>::value, "");
+    if (std::ratio_less<typename subsecond_precision::period,
+                        std::chrono::seconds::period>::value) {
+      *out++ = '.';
+      out = detail::fill_n(out, leading_zeroes, '0');
+      out = format_decimal<Char>(out, n, num_digits).end;
+    }
+    }
+  } else if (precision > 0) {
+    *out++ = '.';
+    leading_zeroes = (std::min)(leading_zeroes, precision);
+    int remaining = precision - leading_zeroes;
+    out = detail::fill_n(out, leading_zeroes, '0');
+    if (remaining < num_digits) {
+      int num_truncated_digits = num_digits - remaining;
+      n /= to_unsigned(detail::pow10(to_unsigned(num_truncated_digits)));
+      if (n) {
+        out = format_decimal<Char>(out, n, remaining).end;
+      }
+      return;
+    }
+    if (n) {
+      out = format_decimal<Char>(out, n, num_digits).end;
+      remaining -= num_digits;
+    }
+    out = detail::fill_n(out, remaining, '0');
+  }
+}
+
+// Format subseconds which are given as a floating point type with an
+// appropriate number of digits. We cannot pass the Duration here, as we
+// explicitly need to pass the Rep value in the chrono_formatter.
+template <typename Duration>
+void write_floating_seconds(memory_buffer& buf, Duration duration,
+                            int num_fractional_digits = -1) {
+  using rep = typename Duration::rep;
+  FMT_ASSERT(std::is_floating_point<rep>::value, "");
+
+  auto val = duration.count();
+
+  if (num_fractional_digits < 0) {
+    // For `std::round` with fallback to `round`:
+    // On some toolchains `std::round` is not available (e.g. GCC 6).
+    using namespace std;
+    num_fractional_digits =
+        count_fractional_digits<Duration::period::num,
+                                Duration::period::den>::value;
+    if (num_fractional_digits < 6 && static_cast<rep>(round(val)) != val)
+      num_fractional_digits = 6;
+  }
+
+  fmt::format_to(std::back_inserter(buf), FMT_STRING("{:.{}f}"),
+                 std::fmod(val * static_cast<rep>(Duration::period::num) /
+                               static_cast<rep>(Duration::period::den),
+                           static_cast<rep>(60)),
+                 num_fractional_digits);
+}
+
+template <typename OutputIt, typename Char,
+          typename Duration = std::chrono::seconds>
+class tm_writer {
+ private:
+  static constexpr int days_per_week = 7;
+
+  const std::locale& loc_;
+  const bool is_classic_;
+  OutputIt out_;
+  const Duration* subsecs_;
+  const std::tm& tm_;
+
+  auto tm_sec() const noexcept -> int {
+    FMT_ASSERT(tm_.tm_sec >= 0 && tm_.tm_sec <= 61, "");
+    return tm_.tm_sec;
+  }
+  auto tm_min() const noexcept -> int {
+    FMT_ASSERT(tm_.tm_min >= 0 && tm_.tm_min <= 59, "");
+    return tm_.tm_min;
+  }
+  auto tm_hour() const noexcept -> int {
+    FMT_ASSERT(tm_.tm_hour >= 0 && tm_.tm_hour <= 23, "");
+    return tm_.tm_hour;
+  }
+  auto tm_mday() const noexcept -> int {
+    FMT_ASSERT(tm_.tm_mday >= 1 && tm_.tm_mday <= 31, "");
+    return tm_.tm_mday;
+  }
+  auto tm_mon() const noexcept -> int {
+    FMT_ASSERT(tm_.tm_mon >= 0 && tm_.tm_mon <= 11, "");
+    return tm_.tm_mon;
+  }
+  auto tm_year() const noexcept -> long long { return 1900ll + tm_.tm_year; }
+  auto tm_wday() const noexcept -> int {
+    FMT_ASSERT(tm_.tm_wday >= 0 && tm_.tm_wday <= 6, "");
+    return tm_.tm_wday;
+  }
+  auto tm_yday() const noexcept -> int {
+    FMT_ASSERT(tm_.tm_yday >= 0 && tm_.tm_yday <= 365, "");
+    return tm_.tm_yday;
+  }
+
+  auto tm_hour12() const noexcept -> int {
+    const auto h = tm_hour();
+    const auto z = h < 12 ? h : h - 12;
+    return z == 0 ? 12 : z;
+  }
+
+  // POSIX and the C Standard are unclear or inconsistent about what %C and %y
+  // do if the year is negative or exceeds 9999. Use the convention that %C
+  // concatenated with %y yields the same output as %Y, and that %Y contains at
+  // least 4 characters, with more only if necessary.
+  auto split_year_lower(long long year) const noexcept -> int {
+    auto l = year % 100;
+    if (l < 0) l = -l;  // l in [0, 99]
+    return static_cast(l);
+  }
+
+  // Algorithm: https://en.wikipedia.org/wiki/ISO_week_date.
+  auto iso_year_weeks(long long curr_year) const noexcept -> int {
+    const auto prev_year = curr_year - 1;
+    const auto curr_p =
+        (curr_year + curr_year / 4 - curr_year / 100 + curr_year / 400) %
+        days_per_week;
+    const auto prev_p =
+        (prev_year + prev_year / 4 - prev_year / 100 + prev_year / 400) %
+        days_per_week;
+    return 52 + ((curr_p == 4 || prev_p == 3) ? 1 : 0);
+  }
+  auto iso_week_num(int tm_yday, int tm_wday) const noexcept -> int {
+    return (tm_yday + 11 - (tm_wday == 0 ? days_per_week : tm_wday)) /
+           days_per_week;
+  }
+  auto tm_iso_week_year() const noexcept -> long long {
+    const auto year = tm_year();
+    const auto w = iso_week_num(tm_yday(), tm_wday());
+    if (w < 1) return year - 1;
+    if (w > iso_year_weeks(year)) return year + 1;
+    return year;
+  }
+  auto tm_iso_week_of_year() const noexcept -> int {
+    const auto year = tm_year();
+    const auto w = iso_week_num(tm_yday(), tm_wday());
+    if (w < 1) return iso_year_weeks(year - 1);
+    if (w > iso_year_weeks(year)) return 1;
+    return w;
+  }
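+
+  // Worked example for the ISO week helpers above: 2021-01-01 is a Friday
+  // (tm_yday == 0, tm_wday == 5), so iso_week_num(0, 5) == 0 < 1 and the date
+  // belongs to ISO year 2020, which iso_year_weeks(2020) reports as having 53
+  // weeks; it therefore formats as week 2020-W53.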
+
+  void write1(int value) {
+    *out_++ = static_cast<char>('0' + to_unsigned(value) % 10);
+  }
+  void write2(int value) {
+    const char* d = digits2(to_unsigned(value) % 100);
+    *out_++ = *d++;
+    *out_++ = *d;
+  }
+  void write2(int value, pad_type pad) {
+    unsigned int v = to_unsigned(value) % 100;
+    if (v >= 10) {
+      const char* d = digits2(v);
+      *out_++ = *d++;
+      *out_++ = *d;
+    } else {
+      out_ = detail::write_padding(out_, pad);
+      *out_++ = static_cast<char>('0' + v);
+    }
+  }
+
+  void write_year_extended(long long year) {
+    // At least 4 characters.
+    int width = 4;
+    if (year < 0) {
+      *out_++ = '-';
+      year = 0 - year;
+      --width;
+    }
+    uint32_or_64_or_128_t<long long> n = to_unsigned(year);
+    const int num_digits = count_digits(n);
+    if (width > num_digits)
+      out_ = detail::fill_n(out_, width - num_digits, '0');
+    out_ = format_decimal(out_, n, num_digits).end;
+  }
+  void write_year(long long year) {
+    if (year >= 0 && year < 10000) {
+      write2(static_cast<int>(year / 100));
+      write2(static_cast<int>(year % 100));
+    } else {
+      write_year_extended(year);
+    }
+  }
+
+  void write_utc_offset(long offset, numeric_system ns) {
+    if (offset < 0) {
+      *out_++ = '-';
+      offset = -offset;
+    } else {
+      *out_++ = '+';
+    }
+    offset /= 60;
+    write2(static_cast<int>(offset / 60));
+    if (ns != numeric_system::standard) *out_++ = ':';
+    write2(static_cast<int>(offset % 60));
+  }
+  template <typename T, FMT_ENABLE_IF(has_member_data_tm_gmtoff<T>::value)>
+  void format_utc_offset_impl(const T& tm, numeric_system ns) {
+    write_utc_offset(tm.tm_gmtoff, ns);
+  }
+  template <typename T, FMT_ENABLE_IF(!has_member_data_tm_gmtoff<T>::value)>
+  void format_utc_offset_impl(const T& tm, numeric_system ns) {
+#if defined(_WIN32) && defined(_UCRT)
+#  if FMT_USE_TZSET
+    tzset_once();
+#  endif
+    long offset = 0;
+    _get_timezone(&offset);
+    if (tm.tm_isdst) {
+      long dstbias = 0;
+      _get_dstbias(&dstbias);
+      offset += dstbias;
+    }
+    write_utc_offset(-offset, ns);
+#else
+    if (ns == numeric_system::standard) return format_localized('z');
+
+    // Extract timezone offset from timezone conversion functions.
+    std::tm gtm = tm;
+    std::time_t gt = std::mktime(&gtm);
+    std::tm ltm = gmtime(gt);
+    std::time_t lt = std::mktime(&ltm);
+    long offset = gt - lt;
+    write_utc_offset(offset, ns);
+#endif
+  }
+
+  template <typename T, FMT_ENABLE_IF(has_member_data_tm_zone<T>::value)>
+  void format_tz_name_impl(const T& tm) {
+    if (is_classic_)
+      out_ = write_tm_str<Char>(out_, tm.tm_zone, loc_);
+    else
+      format_localized('Z');
+  }
+  template <typename T, FMT_ENABLE_IF(!has_member_data_tm_zone<T>::value)>
+  void format_tz_name_impl(const T&) {
+    format_localized('Z');
+  }
+
+  void format_localized(char format, char modifier = 0) {
+    out_ = write<Char>(out_, tm_, loc_, format, modifier);
+  }
+
+ public:
+  tm_writer(const std::locale& loc, OutputIt out, const std::tm& tm,
+            const Duration* subsecs = nullptr)
+      : loc_(loc),
+        is_classic_(loc_ == get_classic_locale()),
+        out_(out),
+        subsecs_(subsecs),
+        tm_(tm) {}
+
+  auto out() const -> OutputIt { return out_; }
+
+  FMT_CONSTEXPR void on_text(const Char* begin, const Char* end) {
+    out_ = copy<Char>(begin, end, out_);
+  }
+
+  void on_abbr_weekday() {
+    if (is_classic_)
+      out_ = write(out_, tm_wday_short_name(tm_wday()));
+    else
+      format_localized('a');
+  }
+  void on_full_weekday() {
+    if (is_classic_)
+      out_ = write(out_, tm_wday_full_name(tm_wday()));
+    else
+      format_localized('A');
+  }
+  void on_dec0_weekday(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard) return write1(tm_wday());
+    format_localized('w', 'O');
+  }
+  void on_dec1_weekday(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard) {
+      auto wday = tm_wday();
+      write1(wday == 0 ? days_per_week : wday);
+    } else {
+      format_localized('u', 'O');
+    }
+  }
+
+  void on_abbr_month() {
+    if (is_classic_)
+      out_ = write(out_, tm_mon_short_name(tm_mon()));
+    else
+      format_localized('b');
+  }
+  void on_full_month() {
+    if (is_classic_)
+      out_ = write(out_, tm_mon_full_name(tm_mon()));
+    else
+      format_localized('B');
+  }
+
+  void on_datetime(numeric_system ns) {
+    if (is_classic_) {
+      on_abbr_weekday();
+      *out_++ = ' ';
+      on_abbr_month();
+      *out_++ = ' ';
+      on_day_of_month(numeric_system::standard, pad_type::space);
+      *out_++ = ' ';
+      on_iso_time();
+      *out_++ = ' ';
+      on_year(numeric_system::standard);
+    } else {
+      format_localized('c', ns == numeric_system::standard ? '\0' : 'E');
+    }
+  }
+  void on_loc_date(numeric_system ns) {
+    if (is_classic_)
+      on_us_date();
+    else
+      format_localized('x', ns == numeric_system::standard ? '\0' : 'E');
+  }
+  void on_loc_time(numeric_system ns) {
+    if (is_classic_)
+      on_iso_time();
+    else
+      format_localized('X', ns == numeric_system::standard ? '\0' : 'E');
+  }
+  void on_us_date() {
+    char buf[8];
+    write_digit2_separated(buf, to_unsigned(tm_mon() + 1),
+                           to_unsigned(tm_mday()),
+                           to_unsigned(split_year_lower(tm_year())), '/');
+    out_ = copy<Char>(std::begin(buf), std::end(buf), out_);
+  }
+  void on_iso_date() {
+    auto year = tm_year();
+    char buf[10];
+    size_t offset = 0;
+    if (year >= 0 && year < 10000) {
+      copy2(buf, digits2(static_cast<size_t>(year / 100)));
+    } else {
+      offset = 4;
+      write_year_extended(year);
+      year = 0;
+    }
+    write_digit2_separated(buf + 2, static_cast<unsigned>(year % 100),
+                           to_unsigned(tm_mon() + 1), to_unsigned(tm_mday()),
+                           '-');
+    out_ = copy<Char>(std::begin(buf) + offset, std::end(buf), out_);
+  }
+
+  void on_utc_offset(numeric_system ns) { format_utc_offset_impl(tm_, ns); }
+  void on_tz_name() { format_tz_name_impl(tm_); }
+
+  void on_year(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write_year(tm_year());
+    format_localized('Y', 'E');
+  }
+  void on_short_year(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2(split_year_lower(tm_year()));
+    format_localized('y', 'O');
+  }
+  void on_offset_year() {
+    if (is_classic_) return write2(split_year_lower(tm_year()));
+    format_localized('y', 'E');
+  }
+
+  void on_century(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard) {
+      auto year = tm_year();
+      auto upper = year / 100;
+      if (year >= -99 && year < 0) {
+        // Zero upper on negative year.
+        *out_++ = '-';
+        *out_++ = '0';
+      } else if (upper >= 0 && upper < 100) {
+        write2(static_cast<int>(upper));
+      } else {
+        out_ = write(out_, upper);
+      }
+    } else {
+      format_localized('C', 'E');
+    }
+  }
+
+  void on_dec_month(numeric_system ns) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2(tm_mon() + 1);
+    format_localized('m', 'O');
+  }
+
+  void on_dec0_week_of_year(numeric_system ns, pad_type pad) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2((tm_yday() + days_per_week - tm_wday()) / days_per_week,
+                    pad);
+    format_localized('U', 'O');
+  }
+  void on_dec1_week_of_year(numeric_system ns, pad_type pad) {
+    if (is_classic_ || ns == numeric_system::standard) {
+      auto wday = tm_wday();
+      write2((tm_yday() + days_per_week -
+              (wday == 0 ? (days_per_week - 1) : (wday - 1))) /
+                 days_per_week,
+             pad);
+    } else {
+      format_localized('W', 'O');
+    }
+  }
+  void on_iso_week_of_year(numeric_system ns, pad_type pad) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2(tm_iso_week_of_year(), pad);
+    format_localized('V', 'O');
+  }
+
+  void on_iso_week_based_year() { write_year(tm_iso_week_year()); }
+  void on_iso_week_based_short_year() {
+    write2(split_year_lower(tm_iso_week_year()));
+  }
+
+  void on_day_of_year() {
+    auto yday = tm_yday() + 1;
+    write1(yday / 100);
+    write2(yday % 100);
+  }
+  void on_day_of_month(numeric_system ns, pad_type pad) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2(tm_mday(), pad);
+    format_localized('d', 'O');
+  }
+
+  void on_24_hour(numeric_system ns, pad_type pad) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2(tm_hour(), pad);
+    format_localized('H', 'O');
+  }
+  void on_12_hour(numeric_system ns, pad_type pad) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2(tm_hour12(), pad);
+    format_localized('I', 'O');
+  }
+  void on_minute(numeric_system ns, pad_type pad) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2(tm_min(), pad);
+    format_localized('M', 'O');
+  }
+
+  void on_second(numeric_system ns, pad_type pad) {
+    if (is_classic_ || ns == numeric_system::standard) {
+      write2(tm_sec(), pad);
+      if (subsecs_) {
+        if (std::is_floating_point<typename Duration::rep>::value) {
+          auto buf = memory_buffer();
+          write_floating_seconds(buf, *subsecs_);
+          if (buf.size() > 1) {
+            // Remove the leading "0", write something like ".123".
+            out_ = std::copy(buf.begin() + 1, buf.end(), out_);
+          }
+        } else {
+          write_fractional_seconds<Char>(out_, *subsecs_);
+        }
+      }
+    } else {
+      // Currently no formatting of subseconds when a locale is set.
+      format_localized('S', 'O');
+    }
+  }
+
+  void on_12_hour_time() {
+    if (is_classic_) {
+      char buf[8];
+      write_digit2_separated(buf, to_unsigned(tm_hour12()),
+                             to_unsigned(tm_min()), to_unsigned(tm_sec()), ':');
+      out_ = copy<Char>(std::begin(buf), std::end(buf), out_);
+      *out_++ = ' ';
+      on_am_pm();
+    } else {
+      format_localized('r');
+    }
+  }
+  void on_24_hour_time() {
+    write2(tm_hour());
+    *out_++ = ':';
+    write2(tm_min());
+  }
+  void on_iso_time() {
+    on_24_hour_time();
+    *out_++ = ':';
+    on_second(numeric_system::standard, pad_type::zero);
+  }
+
+  void on_am_pm() {
+    if (is_classic_) {
+      *out_++ = tm_hour() < 12 ? 'A' : 'P';
+      *out_++ = 'M';
+    } else {
+      format_localized('p');
+    }
+  }
+
+  // These apply to chrono durations but not tm.
+  void on_duration_value() {}
+  void on_duration_unit() {}
+};
+
+struct chrono_format_checker : null_chrono_spec_handler<chrono_format_checker> {
+  bool has_precision_integral = false;
+
+  FMT_NORETURN void unsupported() { FMT_THROW(format_error("no date")); }
+
+  template <typename Char>
+  FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
+  FMT_CONSTEXPR void on_day_of_year() {}
+  FMT_CONSTEXPR void on_24_hour(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_12_hour(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_minute(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_second(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_12_hour_time() {}
+  FMT_CONSTEXPR void on_24_hour_time() {}
+  FMT_CONSTEXPR void on_iso_time() {}
+  FMT_CONSTEXPR void on_am_pm() {}
+  FMT_CONSTEXPR void on_duration_value() const {
+    if (has_precision_integral) {
+      FMT_THROW(format_error("precision not allowed for this argument type"));
+    }
+  }
+  FMT_CONSTEXPR void on_duration_unit() {}
+};
+
+template <typename T,
+          FMT_ENABLE_IF(std::is_integral<T>::value&& has_isfinite<T>::value)>
+inline auto isfinite(T) -> bool {
+  return true;
+}
+
+template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
+inline auto mod(T x, int y) -> T {
+  return x % static_cast<T>(y);
+}
+template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
+inline auto mod(T x, int y) -> T {
+  return std::fmod(x, static_cast<T>(y));
+}
+
+// If T is an integral type, maps T to its unsigned counterpart, otherwise
+// leaves it unchanged (unlike std::make_unsigned).
+template <typename T, bool INTEGRAL = std::is_integral<T>::value>
+struct make_unsigned_or_unchanged {
+  using type = T;
+};
+
+template <typename T> struct make_unsigned_or_unchanged<T, true> {
+  using type = typename std::make_unsigned<T>::type;
+};
+
+template <typename Rep, typename Period,
+          FMT_ENABLE_IF(std::is_integral<Rep>::value)>
+inline auto get_milliseconds(std::chrono::duration<Rep, Period> d)
+    -> std::chrono::duration<Rep, std::milli> {
+  // this may overflow and/or the result may not fit in the
+  // target type.
+#if FMT_SAFE_DURATION_CAST
+  using CommonSecondsType =
+      typename std::common_type<decltype(d), std::chrono::seconds>::type;
+  const auto d_as_common = fmt_duration_cast<CommonSecondsType>(d);
+  const auto d_as_whole_seconds =
+      fmt_duration_cast<std::chrono::seconds>(d_as_common);
+  // this conversion should be nonproblematic
+  const auto diff = d_as_common - d_as_whole_seconds;
+  const auto ms =
+      fmt_duration_cast<std::chrono::duration<Rep, std::milli>>(diff);
+  return ms;
+#else
+  auto s = fmt_duration_cast<std::chrono::seconds>(d);
+  return fmt_duration_cast<std::chrono::milliseconds>(d - s);
+#endif
+}
+
+template <typename Char, typename Rep, typename OutputIt,
+          FMT_ENABLE_IF(std::is_integral<Rep>::value)>
+auto format_duration_value(OutputIt out, Rep val, int) -> OutputIt {
+  return write<Char>(out, val);
+}
+
+template <typename Char, typename Rep, typename OutputIt,
+          FMT_ENABLE_IF(std::is_floating_point<Rep>::value)>
+auto format_duration_value(OutputIt out, Rep val, int precision) -> OutputIt {
+  auto specs = format_specs();
+  specs.precision = precision;
+  specs.type =
+      precision >= 0 ? presentation_type::fixed : presentation_type::general;
+  return write<Char>(out, val, specs);
+}
+
+template <typename Char, typename OutputIt>
+auto copy_unit(string_view unit, OutputIt out, Char) -> OutputIt {
+  return std::copy(unit.begin(), unit.end(), out);
+}
+
+template <typename OutputIt>
+auto copy_unit(string_view unit, OutputIt out, wchar_t) -> OutputIt {
+  // This works when wchar_t is UTF-32 because units only contain characters
+  // that have the same representation in UTF-16 and UTF-32.
+  utf8_to_utf16 u(unit);
+  return std::copy(u.c_str(), u.c_str() + u.size(), out);
+}
+
+template <typename Char, typename Period, typename OutputIt>
+auto format_duration_unit(OutputIt out) -> OutputIt {
+  if (const char* unit = get_units<Period>())
+    return copy_unit(string_view(unit), out, Char());
+  *out++ = '[';
+  out = write<Char>(out, Period::num);
+  if (const_check(Period::den != 1)) {
+    *out++ = '/';
+    out = write<Char>(out, Period::den);
+  }
+  *out++ = ']';
+  *out++ = 's';
+  return out;
+}
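+
+// Units without a known suffix from get_units() fall back to the "[num/den]s"
+// form above, e.g. (illustrative):
+//
+//   fmt::format("{}", std::chrono::duration<int, std::ratio<3, 2>>(1));
+//   // "1[3/2]s"
+//   fmt::format("{}", std::chrono::minutes(7));  // "7min"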
+
+class get_locale {
+ private:
+  union {
+    std::locale locale_;
+  };
+  bool has_locale_ = false;
+
+ public:
+  get_locale(bool localized, locale_ref loc) : has_locale_(localized) {
+#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
+    if (localized)
+      ::new (&locale_) std::locale(loc.template get<std::locale>());
+#endif
+  }
+  ~get_locale() {
+    if (has_locale_) locale_.~locale();
+  }
+  operator const std::locale&() const {
+    return has_locale_ ? locale_ : get_classic_locale();
+  }
+};
+
+template <typename FormatContext, typename OutputIt, typename Rep,
+          typename Period>
+struct chrono_formatter {
+  FormatContext& context;
+  OutputIt out;
+  int precision;
+  bool localized = false;
+  // rep is unsigned to avoid overflow.
+  using rep =
+      conditional_t<std::is_integral<Rep>::value && sizeof(Rep) < sizeof(int),
+                    unsigned, typename make_unsigned_or_unchanged<Rep>::type>;
+  rep val;
+  using seconds = std::chrono::duration<rep>;
+  seconds s;
+  using milliseconds = std::chrono::duration<rep, std::milli>;
+  bool negative;
+
+  using char_type = typename FormatContext::char_type;
+  using tm_writer_type = tm_writer<OutputIt, char_type>;
+
+  chrono_formatter(FormatContext& ctx, OutputIt o,
+                   std::chrono::duration<Rep, Period> d)
+      : context(ctx),
+        out(o),
+        val(static_cast<rep>(d.count())),
+        negative(false) {
+    if (d.count() < 0) {
+      val = 0 - val;
+      negative = true;
+    }
+
+    // this may overflow and/or the result may not fit in the
+    // target type.
+    // might need checked conversion (rep!=Rep)
+    s = fmt_duration_cast<seconds>(std::chrono::duration<rep, Period>(val));
+  }
+
+  // returns true if nan or inf, writes to out.
+  auto handle_nan_inf() -> bool {
+    if (isfinite(val)) {
+      return false;
+    }
+    if (isnan(val)) {
+      write_nan();
+      return true;
+    }
+    // must be +-inf
+    if (val > 0) {
+      write_pinf();
+    } else {
+      write_ninf();
+    }
+    return true;
+  }
+
+  auto days() const -> Rep { return static_cast<Rep>(s.count() / 86400); }
+  auto hour() const -> Rep {
+    return static_cast<Rep>(mod((s.count() / 3600), 24));
+  }
+
+  auto hour12() const -> Rep {
+    Rep hour = static_cast<Rep>(mod((s.count() / 3600), 12));
+    return hour <= 0 ? 12 : hour;
+  }
+
+  auto minute() const -> Rep {
+    return static_cast<Rep>(mod((s.count() / 60), 60));
+  }
+  auto second() const -> Rep { return static_cast<Rep>(mod(s.count(), 60)); }
+
+  auto time() const -> std::tm {
+    auto time = std::tm();
+    time.tm_hour = to_nonnegative_int(hour(), 24);
+    time.tm_min = to_nonnegative_int(minute(), 60);
+    time.tm_sec = to_nonnegative_int(second(), 60);
+    return time;
+  }
+
+  void write_sign() {
+    if (negative) {
+      *out++ = '-';
+      negative = false;
+    }
+  }
+
+  void write(Rep value, int width, pad_type pad = pad_type::zero) {
+    write_sign();
+    if (isnan(value)) return write_nan();
+    uint32_or_64_or_128_t<int> n =
+        to_unsigned(to_nonnegative_int(value, max_value<int>()));
+    int num_digits = detail::count_digits(n);
+    if (width > num_digits) {
+      out = detail::write_padding(out, pad, width - num_digits);
+    }
+    out = format_decimal(out, n, num_digits).end;
+  }
+
+  void write_nan() { std::copy_n("nan", 3, out); }
+  void write_pinf() { std::copy_n("inf", 3, out); }
+  void write_ninf() { std::copy_n("-inf", 4, out); }
+
+  template <typename Callback, typename... Args>
+  void format_tm(const tm& time, Callback cb, Args... args) {
+    if (isnan(val)) return write_nan();
+    get_locale loc(localized, context.locale());
+    auto w = tm_writer_type(loc, out, time);
+    (w.*cb)(args...);
+    out = w.out();
+  }
+
+  void on_text(const char_type* begin, const char_type* end) {
+    std::copy(begin, end, out);
+  }
+
+  // These are not implemented because durations don't have date information.
+  void on_abbr_weekday() {}
+  void on_full_weekday() {}
+  void on_dec0_weekday(numeric_system) {}
+  void on_dec1_weekday(numeric_system) {}
+  void on_abbr_month() {}
+  void on_full_month() {}
+  void on_datetime(numeric_system) {}
+  void on_loc_date(numeric_system) {}
+  void on_loc_time(numeric_system) {}
+  void on_us_date() {}
+  void on_iso_date() {}
+  void on_utc_offset(numeric_system) {}
+  void on_tz_name() {}
+  void on_year(numeric_system) {}
+  void on_short_year(numeric_system) {}
+  void on_offset_year() {}
+  void on_century(numeric_system) {}
+  void on_iso_week_based_year() {}
+  void on_iso_week_based_short_year() {}
+  void on_dec_month(numeric_system) {}
+  void on_dec0_week_of_year(numeric_system, pad_type) {}
+  void on_dec1_week_of_year(numeric_system, pad_type) {}
+  void on_iso_week_of_year(numeric_system, pad_type) {}
+  void on_day_of_month(numeric_system, pad_type) {}
+
+  void on_day_of_year() {
+    if (handle_nan_inf()) return;
+    write(days(), 0);
+  }
+
+  void on_24_hour(numeric_system ns, pad_type pad) {
+    if (handle_nan_inf()) return;
+
+    if (ns == numeric_system::standard) return write(hour(), 2, pad);
+    auto time = tm();
+    time.tm_hour = to_nonnegative_int(hour(), 24);
+    format_tm(time, &tm_writer_type::on_24_hour, ns, pad);
+  }
+
+  void on_12_hour(numeric_system ns, pad_type pad) {
+    if (handle_nan_inf()) return;
+
+    if (ns == numeric_system::standard) return write(hour12(), 2, pad);
+    auto time = tm();
+    time.tm_hour = to_nonnegative_int(hour12(), 12);
+    format_tm(time, &tm_writer_type::on_12_hour, ns, pad);
+  }
+
+  void on_minute(numeric_system ns, pad_type pad) {
+    if (handle_nan_inf()) return;
+
+    if (ns == numeric_system::standard) return write(minute(), 2, pad);
+    auto time = tm();
+    time.tm_min = to_nonnegative_int(minute(), 60);
+    format_tm(time, &tm_writer_type::on_minute, ns, pad);
+  }
+
+  void on_second(numeric_system ns, pad_type pad) {
+    if (handle_nan_inf()) return;
+
+    if (ns == numeric_system::standard) {
+      if (std::is_floating_point::value) {
+        auto buf = memory_buffer();
+        write_floating_seconds(buf, std::chrono::duration<rep, Period>(val),
+                               precision);
+        if (negative) *out++ = '-';
+        if (buf.size() < 2 || buf[1] == '.') {
+          out = detail::write_padding(out, pad);
+        }
+        out = std::copy(buf.begin(), buf.end(), out);
+      } else {
+        write(second(), 2, pad);
+        write_fractional_seconds<char_type>(
+            out, std::chrono::duration<rep, Period>(val), precision);
+      }
+      return;
+    }
+    auto time = tm();
+    time.tm_sec = to_nonnegative_int(second(), 60);
+    format_tm(time, &tm_writer_type::on_second, ns, pad);
+  }
+
+  void on_12_hour_time() {
+    if (handle_nan_inf()) return;
+    format_tm(time(), &tm_writer_type::on_12_hour_time);
+  }
+
+  void on_24_hour_time() {
+    if (handle_nan_inf()) {
+      *out++ = ':';
+      handle_nan_inf();
+      return;
+    }
+
+    write(hour(), 2);
+    *out++ = ':';
+    write(minute(), 2);
+  }
+
+  void on_iso_time() {
+    on_24_hour_time();
+    *out++ = ':';
+    if (handle_nan_inf()) return;
+    on_second(numeric_system::standard, pad_type::zero);
+  }
+
+  void on_am_pm() {
+    if (handle_nan_inf()) return;
+    format_tm(time(), &tm_writer_type::on_am_pm);
+  }
+
+  void on_duration_value() {
+    if (handle_nan_inf()) return;
+    write_sign();
+    out = format_duration_value<char_type>(out, val, precision);
+  }
+
+  void on_duration_unit() {
+    out = format_duration_unit<char_type, Period>(out);
+  }
+};
+
+}  // namespace detail
+
+#if defined(__cpp_lib_chrono) && __cpp_lib_chrono >= 201907
+using weekday = std::chrono::weekday;
+using day = std::chrono::day;
+using month = std::chrono::month;
+using year = std::chrono::year;
+using year_month_day = std::chrono::year_month_day;
+#else
+// A fallback version of weekday.
+class weekday {
+ private:
+  unsigned char value_;
+
+ public:
+  weekday() = default;
+  constexpr explicit weekday(unsigned wd) noexcept
+      : value_(static_cast<unsigned char>(wd != 7 ? wd : 0)) {}
+  constexpr auto c_encoding() const noexcept -> unsigned { return value_; }
+};
+
+class day {
+ private:
+  unsigned char value_;
+
+ public:
+  day() = default;
+  constexpr explicit day(unsigned d) noexcept
+      : value_(static_cast<unsigned char>(d)) {}
+  constexpr explicit operator unsigned() const noexcept { return value_; }
+};
+
+class month {
+ private:
+  unsigned char value_;
+
+ public:
+  month() = default;
+  constexpr explicit month(unsigned m) noexcept
+      : value_(static_cast<unsigned char>(m)) {}
+  constexpr explicit operator unsigned() const noexcept { return value_; }
+};
+
+class year {
+ private:
+  int value_;
+
+ public:
+  year() = default;
+  constexpr explicit year(int y) noexcept : value_(y) {}
+  constexpr explicit operator int() const noexcept { return value_; }
+};
+
+class year_month_day {
+ private:
+  fmt::year year_;
+  fmt::month month_;
+  fmt::day day_;
+
+ public:
+  year_month_day() = default;
+  constexpr year_month_day(const year& y, const month& m, const day& d) noexcept
+      : year_(y), month_(m), day_(d) {}
+  constexpr auto year() const noexcept -> fmt::year { return year_; }
+  constexpr auto month() const noexcept -> fmt::month { return month_; }
+  constexpr auto day() const noexcept -> fmt::day { return day_; }
+};
+#endif
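+
+// Both the std::chrono calendar types and the fallbacks above format the same
+// way through the tm_writer machinery, e.g. (illustrative, classic locale):
+//
+//   fmt::format("{}", fmt::weekday(1));  // "Mon"
+//   fmt::format("{}", fmt::year_month_day(fmt::year(2024), fmt::month(6),
+//                                         fmt::day(1)));  // "2024-06-01"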
+
+template <typename Char>
+struct formatter<weekday, Char> : private formatter<std::tm, Char> {
+ private:
+  bool localized_ = false;
+  bool use_tm_formatter_ = false;
+
+ public:
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    auto it = ctx.begin(), end = ctx.end();
+    if (it != end && *it == 'L') {
+      ++it;
+      localized_ = true;
+      return it;
+    }
+    use_tm_formatter_ = it != end && *it != '}';
+    return use_tm_formatter_ ? formatter<std::tm, Char>::parse(ctx) : it;
+  }
+
+  template <typename FormatContext>
+  auto format(weekday wd, FormatContext& ctx) const -> decltype(ctx.out()) {
+    auto time = std::tm();
+    time.tm_wday = static_cast<int>(wd.c_encoding());
+    if (use_tm_formatter_) return formatter<std::tm, Char>::format(time, ctx);
+    detail::get_locale loc(localized_, ctx.locale());
+    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
+    w.on_abbr_weekday();
+    return w.out();
+  }
+};
+
+template <typename Char>
+struct formatter<day, Char> : private formatter<std::tm, Char> {
+ private:
+  bool use_tm_formatter_ = false;
+
+ public:
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    auto it = ctx.begin(), end = ctx.end();
+    use_tm_formatter_ = it != end && *it != '}';
+    return use_tm_formatter_ ? formatter<std::tm, Char>::parse(ctx) : it;
+  }
+
+  template <typename FormatContext>
+  auto format(day d, FormatContext& ctx) const -> decltype(ctx.out()) {
+    auto time = std::tm();
+    time.tm_mday = static_cast<int>(static_cast<unsigned>(d));
+    if (use_tm_formatter_) return formatter<std::tm, Char>::format(time, ctx);
+    detail::get_locale loc(false, ctx.locale());
+    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
+    w.on_day_of_month(detail::numeric_system::standard, detail::pad_type::zero);
+    return w.out();
+  }
+};
+
+template <typename Char>
+struct formatter<month, Char> : private formatter<std::tm, Char> {
+ private:
+  bool localized_ = false;
+  bool use_tm_formatter_ = false;
+
+ public:
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    auto it = ctx.begin(), end = ctx.end();
+    if (it != end && *it == 'L') {
+      ++it;
+      localized_ = true;
+      return it;
+    }
+    use_tm_formatter_ = it != end && *it != '}';
+    return use_tm_formatter_ ? formatter<std::tm, Char>::parse(ctx) : it;
+  }
+
+  template <typename FormatContext>
+  auto format(month m, FormatContext& ctx) const -> decltype(ctx.out()) {
+    auto time = std::tm();
+    time.tm_mon = static_cast<int>(static_cast<unsigned>(m)) - 1;
+    if (use_tm_formatter_) return formatter<std::tm, Char>::format(time, ctx);
+    detail::get_locale loc(localized_, ctx.locale());
+    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
+    w.on_abbr_month();
+    return w.out();
+  }
+};
+
+template <typename Char>
+struct formatter<year, Char> : private formatter<std::tm, Char> {
+ private:
+  bool use_tm_formatter_ = false;
+
+ public:
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    auto it = ctx.begin(), end = ctx.end();
+    use_tm_formatter_ = it != end && *it != '}';
+    return use_tm_formatter_ ? formatter<std::tm, Char>::parse(ctx) : it;
+  }
+
+  template <typename FormatContext>
+  auto format(year y, FormatContext& ctx) const -> decltype(ctx.out()) {
+    auto time = std::tm();
+    time.tm_year = static_cast<int>(y) - 1900;
+    if (use_tm_formatter_) return formatter<std::tm, Char>::format(time, ctx);
+    detail::get_locale loc(false, ctx.locale());
+    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
+    w.on_year(detail::numeric_system::standard);
+    return w.out();
+  }
+};
+
+template <typename Char>
+struct formatter<year_month_day, Char> : private formatter<std::tm, Char> {
+ private:
+  bool use_tm_formatter_ = false;
+
+ public:
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    auto it = ctx.begin(), end = ctx.end();
+    use_tm_formatter_ = it != end && *it != '}';
+    return use_tm_formatter_ ? formatter<std::tm, Char>::parse(ctx) : it;
+  }
+
+  template <typename FormatContext>
+  auto format(year_month_day val, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    auto time = std::tm();
+    time.tm_year = static_cast<int>(val.year()) - 1900;
+    time.tm_mon = static_cast<int>(static_cast<unsigned>(val.month())) - 1;
+    time.tm_mday = static_cast<int>(static_cast<unsigned>(val.day()));
+    if (use_tm_formatter_) return formatter<std::tm, Char>::format(time, ctx);
+    detail::get_locale loc(true, ctx.locale());
+    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
+    w.on_iso_date();
+    return w.out();
+  }
+};
+
+template <typename Rep, typename Period, typename Char>
+struct formatter<std::chrono::duration<Rep, Period>, Char> {
+ private:
+  format_specs specs_;
+  detail::arg_ref<Char> width_ref_;
+  detail::arg_ref<Char> precision_ref_;
+  bool localized_ = false;
+  basic_string_view<Char> format_str_;
+
+ public:
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    auto it = ctx.begin(), end = ctx.end();
+    if (it == end || *it == '}') return it;
+
+    it = detail::parse_align(it, end, specs_);
+    if (it == end) return it;
+
+    it = detail::parse_dynamic_spec(it, end, specs_.width, width_ref_, ctx);
+    if (it == end) return it;
+
+    auto checker = detail::chrono_format_checker();
+    if (*it == '.') {
+      checker.has_precision_integral = !std::is_floating_point<Rep>::value;
+      it = detail::parse_precision(it, end, specs_.precision, precision_ref_,
+                                   ctx);
+    }
+    if (it != end && *it == 'L') {
+      localized_ = true;
+      ++it;
+    }
+    end = detail::parse_chrono_format(it, end, checker);
+    format_str_ = {it, detail::to_unsigned(end - it)};
+    return end;
+  }
+
+  template <typename FormatContext>
+  auto format(std::chrono::duration<Rep, Period> d, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    auto specs = specs_;
+    auto precision = specs.precision;
+    specs.precision = -1;
+    auto begin = format_str_.begin(), end = format_str_.end();
+    // As a possible future optimization, we could avoid extra copying if width
+    // is not specified.
+    auto buf = basic_memory_buffer<Char>();
+    auto out = std::back_inserter(buf);
+    detail::handle_dynamic_spec<detail::width_checker>(specs.width, width_ref_,
+                                                       ctx);
+    detail::handle_dynamic_spec<detail::precision_checker>(precision,
+                                                           precision_ref_, ctx);
+    if (begin == end || *begin == '}') {
+      out = detail::format_duration_value<Char>(out, d.count(), precision);
+      detail::format_duration_unit<Char, Period>(out);
+    } else {
+      using chrono_formatter =
+          detail::chrono_formatter<FormatContext, decltype(out), Rep, Period>;
+      auto f = chrono_formatter(ctx, out, d);
+      f.precision = precision;
+      f.localized = localized_;
+      detail::parse_chrono_format(begin, end, f);
+    }
+    return detail::write(
+        ctx.out(), basic_string_view<Char>(buf.data(), buf.size()), specs);
+  }
+};
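+
+// Illustrative uses of the duration formatter above (outputs are a guide):
+//
+//   fmt::format("{:%H:%M:%S}", std::chrono::seconds(4230));     // "01:10:30"
+//   fmt::format("{:%Q %q}", std::chrono::milliseconds(42));     // "42 ms"
+//   fmt::format("{:.3%S}", std::chrono::duration<double>(1.5)); // "01.500"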
+
+template <typename Duration, typename Char>
+struct formatter<std::chrono::time_point<std::chrono::system_clock, Duration>,
+                 Char> : formatter<std::tm, Char> {
+  FMT_CONSTEXPR formatter() {
+    this->format_str_ = detail::string_literal<Char, '%', 'F', ' ', '%', 'T'>{};
+  }
+
+  template <typename FormatContext>
+  auto format(std::chrono::time_point<std::chrono::system_clock, Duration> val,
+              FormatContext& ctx) const -> decltype(ctx.out()) {
+    std::tm tm = gmtime(val);
+    using period = typename Duration::period;
+    if (detail::const_check(
+            period::num == 1 && period::den == 1 &&
+            !std::is_floating_point<typename Duration::rep>::value)) {
+      return formatter<std::tm, Char>::format(tm, ctx);
+    }
+    Duration epoch = val.time_since_epoch();
+    Duration subsecs = detail::fmt_duration_cast<Duration>(
+        epoch - detail::fmt_duration_cast<std::chrono::seconds>(epoch));
+    if (subsecs.count() < 0) {
+      auto second =
+          detail::fmt_duration_cast<Duration>(std::chrono::seconds(1));
+      if (tm.tm_sec != 0)
+        --tm.tm_sec;
+      else
+        tm = gmtime(val - second);
+      subsecs += detail::fmt_duration_cast<Duration>(std::chrono::seconds(1));
+    }
+    return formatter<std::tm, Char>::do_format(tm, ctx, &subsecs);
+  }
+};
+
+#if FMT_USE_LOCAL_TIME
+template <typename Duration, typename Char>
+struct formatter<std::chrono::local_time<Duration>, Char>
+    : formatter<std::tm, Char> {
+  FMT_CONSTEXPR formatter() {
+    this->format_str_ = detail::string_literal<Char, '%', 'F', ' ', '%', 'T'>{};
+  }
+
+  template <typename FormatContext>
+  auto format(std::chrono::local_time<Duration> val, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    using period = typename Duration::period;
+    if (period::num != 1 || period::den != 1 ||
+        std::is_floating_point<typename Duration::rep>::value) {
+      const auto epoch = val.time_since_epoch();
+      const auto subsecs = detail::fmt_duration_cast<Duration>(
+          epoch - detail::fmt_duration_cast<std::chrono::seconds>(epoch));
+
+      return formatter<std::tm, Char>::do_format(localtime(val), ctx, &subsecs);
+    }
+
+    return formatter<std::tm, Char>::format(localtime(val), ctx);
+  }
+};
+#endif
+
+#if FMT_USE_UTC_TIME
+template <typename Char, typename Duration>
+struct formatter<std::chrono::time_point<std::chrono::utc_clock, Duration>,
+                 Char>
+    : formatter<std::chrono::time_point<std::chrono::system_clock, Duration>,
+                Char> {
+  template <typename FormatContext>
+  auto format(std::chrono::time_point<std::chrono::utc_clock, Duration> val,
+              FormatContext& ctx) const -> decltype(ctx.out()) {
+    return formatter<
+        std::chrono::time_point<std::chrono::system_clock, Duration>,
+        Char>::format(std::chrono::utc_clock::to_sys(val), ctx);
+  }
+};
+#endif
+
+template <typename Char> struct formatter<std::tm, Char> {
+ private:
+  format_specs specs_;
+  detail::arg_ref<Char> width_ref_;
+
+ protected:
+  basic_string_view<Char> format_str_;
+
+  template <typename Duration, typename FormatContext>
+  auto do_format(const std::tm& tm, FormatContext& ctx,
+                 const Duration* subsecs) const -> decltype(ctx.out()) {
+    auto specs = specs_;
+    auto buf = basic_memory_buffer<Char>();
+    auto out = std::back_inserter(buf);
+    detail::handle_dynamic_spec<detail::width_checker>(specs.width, width_ref_,
+                                                       ctx);
+
+    auto loc_ref = ctx.locale();
+    detail::get_locale loc(static_cast<bool>(loc_ref), loc_ref);
+    auto w =
+        detail::tm_writer<decltype(out), Char, Duration>(loc, out, tm, subsecs);
+    detail::parse_chrono_format(format_str_.begin(), format_str_.end(), w);
+    return detail::write(
+        ctx.out(), basic_string_view<Char>(buf.data(), buf.size()), specs);
+  }
+
+ public:
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    auto it = ctx.begin(), end = ctx.end();
+    if (it == end || *it == '}') return it;
+
+    it = detail::parse_align(it, end, specs_);
+    if (it == end) return it;
+
+    it = detail::parse_dynamic_spec(it, end, specs_.width, width_ref_, ctx);
+    if (it == end) return it;
+
+    end = detail::parse_chrono_format(it, end, detail::tm_format_checker());
+    // Replace the default format_str only if the new spec is not empty.
+    if (end != it) format_str_ = {it, detail::to_unsigned(end - it)};
+    return end;
+  }
+
+  template <typename FormatContext>
+  auto format(const std::tm& tm, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    return do_format(tm, ctx, nullptr);
+  }
+};
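+
+// Illustrative use of the std::tm formatter above (classic locale assumed):
+//
+//   std::tm tm = fmt::gmtime(std::time_t(0));
+//   fmt::format("{:%Y-%m-%d %H:%M:%S}", tm);  // "1970-01-01 00:00:00"
+//   fmt::format("{:%A, %d %B %Y}", tm);       // "Thursday, 01 January 1970"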
+
+FMT_END_EXPORT
+FMT_END_NAMESPACE
+
+#endif  // FMT_CHRONO_H_
diff --git a/lib/fmt/fmt/color.h b/lib/fmt/fmt/color.h
new file mode 100644
index 000000000..f0e9dd94e
--- /dev/null
+++ b/lib/fmt/fmt/color.h
@@ -0,0 +1,612 @@
+// Formatting library for C++ - color support
+//
+// Copyright (c) 2018 - present, Victor Zverovich and fmt contributors
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_COLOR_H_
+#define FMT_COLOR_H_
+
+#include "format.h"
+
+FMT_BEGIN_NAMESPACE
+FMT_BEGIN_EXPORT
+
+enum class color : uint32_t {
+  alice_blue = 0xF0F8FF,               // rgb(240,248,255)
+  antique_white = 0xFAEBD7,            // rgb(250,235,215)
+  aqua = 0x00FFFF,                     // rgb(0,255,255)
+  aquamarine = 0x7FFFD4,               // rgb(127,255,212)
+  azure = 0xF0FFFF,                    // rgb(240,255,255)
+  beige = 0xF5F5DC,                    // rgb(245,245,220)
+  bisque = 0xFFE4C4,                   // rgb(255,228,196)
+  black = 0x000000,                    // rgb(0,0,0)
+  blanched_almond = 0xFFEBCD,          // rgb(255,235,205)
+  blue = 0x0000FF,                     // rgb(0,0,255)
+  blue_violet = 0x8A2BE2,              // rgb(138,43,226)
+  brown = 0xA52A2A,                    // rgb(165,42,42)
+  burly_wood = 0xDEB887,               // rgb(222,184,135)
+  cadet_blue = 0x5F9EA0,               // rgb(95,158,160)
+  chartreuse = 0x7FFF00,               // rgb(127,255,0)
+  chocolate = 0xD2691E,                // rgb(210,105,30)
+  coral = 0xFF7F50,                    // rgb(255,127,80)
+  cornflower_blue = 0x6495ED,          // rgb(100,149,237)
+  cornsilk = 0xFFF8DC,                 // rgb(255,248,220)
+  crimson = 0xDC143C,                  // rgb(220,20,60)
+  cyan = 0x00FFFF,                     // rgb(0,255,255)
+  dark_blue = 0x00008B,                // rgb(0,0,139)
+  dark_cyan = 0x008B8B,                // rgb(0,139,139)
+  dark_golden_rod = 0xB8860B,          // rgb(184,134,11)
+  dark_gray = 0xA9A9A9,                // rgb(169,169,169)
+  dark_green = 0x006400,               // rgb(0,100,0)
+  dark_khaki = 0xBDB76B,               // rgb(189,183,107)
+  dark_magenta = 0x8B008B,             // rgb(139,0,139)
+  dark_olive_green = 0x556B2F,         // rgb(85,107,47)
+  dark_orange = 0xFF8C00,              // rgb(255,140,0)
+  dark_orchid = 0x9932CC,              // rgb(153,50,204)
+  dark_red = 0x8B0000,                 // rgb(139,0,0)
+  dark_salmon = 0xE9967A,              // rgb(233,150,122)
+  dark_sea_green = 0x8FBC8F,           // rgb(143,188,143)
+  dark_slate_blue = 0x483D8B,          // rgb(72,61,139)
+  dark_slate_gray = 0x2F4F4F,          // rgb(47,79,79)
+  dark_turquoise = 0x00CED1,           // rgb(0,206,209)
+  dark_violet = 0x9400D3,              // rgb(148,0,211)
+  deep_pink = 0xFF1493,                // rgb(255,20,147)
+  deep_sky_blue = 0x00BFFF,            // rgb(0,191,255)
+  dim_gray = 0x696969,                 // rgb(105,105,105)
+  dodger_blue = 0x1E90FF,              // rgb(30,144,255)
+  fire_brick = 0xB22222,               // rgb(178,34,34)
+  floral_white = 0xFFFAF0,             // rgb(255,250,240)
+  forest_green = 0x228B22,             // rgb(34,139,34)
+  fuchsia = 0xFF00FF,                  // rgb(255,0,255)
+  gainsboro = 0xDCDCDC,                // rgb(220,220,220)
+  ghost_white = 0xF8F8FF,              // rgb(248,248,255)
+  gold = 0xFFD700,                     // rgb(255,215,0)
+  golden_rod = 0xDAA520,               // rgb(218,165,32)
+  gray = 0x808080,                     // rgb(128,128,128)
+  green = 0x008000,                    // rgb(0,128,0)
+  green_yellow = 0xADFF2F,             // rgb(173,255,47)
+  honey_dew = 0xF0FFF0,                // rgb(240,255,240)
+  hot_pink = 0xFF69B4,                 // rgb(255,105,180)
+  indian_red = 0xCD5C5C,               // rgb(205,92,92)
+  indigo = 0x4B0082,                   // rgb(75,0,130)
+  ivory = 0xFFFFF0,                    // rgb(255,255,240)
+  khaki = 0xF0E68C,                    // rgb(240,230,140)
+  lavender = 0xE6E6FA,                 // rgb(230,230,250)
+  lavender_blush = 0xFFF0F5,           // rgb(255,240,245)
+  lawn_green = 0x7CFC00,               // rgb(124,252,0)
+  lemon_chiffon = 0xFFFACD,            // rgb(255,250,205)
+  light_blue = 0xADD8E6,               // rgb(173,216,230)
+  light_coral = 0xF08080,              // rgb(240,128,128)
+  light_cyan = 0xE0FFFF,               // rgb(224,255,255)
+  light_golden_rod_yellow = 0xFAFAD2,  // rgb(250,250,210)
+  light_gray = 0xD3D3D3,               // rgb(211,211,211)
+  light_green = 0x90EE90,              // rgb(144,238,144)
+  light_pink = 0xFFB6C1,               // rgb(255,182,193)
+  light_salmon = 0xFFA07A,             // rgb(255,160,122)
+  light_sea_green = 0x20B2AA,          // rgb(32,178,170)
+  light_sky_blue = 0x87CEFA,           // rgb(135,206,250)
+  light_slate_gray = 0x778899,         // rgb(119,136,153)
+  light_steel_blue = 0xB0C4DE,         // rgb(176,196,222)
+  light_yellow = 0xFFFFE0,             // rgb(255,255,224)
+  lime = 0x00FF00,                     // rgb(0,255,0)
+  lime_green = 0x32CD32,               // rgb(50,205,50)
+  linen = 0xFAF0E6,                    // rgb(250,240,230)
+  magenta = 0xFF00FF,                  // rgb(255,0,255)
+  maroon = 0x800000,                   // rgb(128,0,0)
+  medium_aquamarine = 0x66CDAA,        // rgb(102,205,170)
+  medium_blue = 0x0000CD,              // rgb(0,0,205)
+  medium_orchid = 0xBA55D3,            // rgb(186,85,211)
+  medium_purple = 0x9370DB,            // rgb(147,112,219)
+  medium_sea_green = 0x3CB371,         // rgb(60,179,113)
+  medium_slate_blue = 0x7B68EE,        // rgb(123,104,238)
+  medium_spring_green = 0x00FA9A,      // rgb(0,250,154)
+  medium_turquoise = 0x48D1CC,         // rgb(72,209,204)
+  medium_violet_red = 0xC71585,        // rgb(199,21,133)
+  midnight_blue = 0x191970,            // rgb(25,25,112)
+  mint_cream = 0xF5FFFA,               // rgb(245,255,250)
+  misty_rose = 0xFFE4E1,               // rgb(255,228,225)
+  moccasin = 0xFFE4B5,                 // rgb(255,228,181)
+  navajo_white = 0xFFDEAD,             // rgb(255,222,173)
+  navy = 0x000080,                     // rgb(0,0,128)
+  old_lace = 0xFDF5E6,                 // rgb(253,245,230)
+  olive = 0x808000,                    // rgb(128,128,0)
+  olive_drab = 0x6B8E23,               // rgb(107,142,35)
+  orange = 0xFFA500,                   // rgb(255,165,0)
+  orange_red = 0xFF4500,               // rgb(255,69,0)
+  orchid = 0xDA70D6,                   // rgb(218,112,214)
+  pale_golden_rod = 0xEEE8AA,          // rgb(238,232,170)
+  pale_green = 0x98FB98,               // rgb(152,251,152)
+  pale_turquoise = 0xAFEEEE,           // rgb(175,238,238)
+  pale_violet_red = 0xDB7093,          // rgb(219,112,147)
+  papaya_whip = 0xFFEFD5,              // rgb(255,239,213)
+  peach_puff = 0xFFDAB9,               // rgb(255,218,185)
+  peru = 0xCD853F,                     // rgb(205,133,63)
+  pink = 0xFFC0CB,                     // rgb(255,192,203)
+  plum = 0xDDA0DD,                     // rgb(221,160,221)
+  powder_blue = 0xB0E0E6,              // rgb(176,224,230)
+  purple = 0x800080,                   // rgb(128,0,128)
+  rebecca_purple = 0x663399,           // rgb(102,51,153)
+  red = 0xFF0000,                      // rgb(255,0,0)
+  rosy_brown = 0xBC8F8F,               // rgb(188,143,143)
+  royal_blue = 0x4169E1,               // rgb(65,105,225)
+  saddle_brown = 0x8B4513,             // rgb(139,69,19)
+  salmon = 0xFA8072,                   // rgb(250,128,114)
+  sandy_brown = 0xF4A460,              // rgb(244,164,96)
+  sea_green = 0x2E8B57,                // rgb(46,139,87)
+  sea_shell = 0xFFF5EE,                // rgb(255,245,238)
+  sienna = 0xA0522D,                   // rgb(160,82,45)
+  silver = 0xC0C0C0,                   // rgb(192,192,192)
+  sky_blue = 0x87CEEB,                 // rgb(135,206,235)
+  slate_blue = 0x6A5ACD,               // rgb(106,90,205)
+  slate_gray = 0x708090,               // rgb(112,128,144)
+  snow = 0xFFFAFA,                     // rgb(255,250,250)
+  spring_green = 0x00FF7F,             // rgb(0,255,127)
+  steel_blue = 0x4682B4,               // rgb(70,130,180)
+  tan = 0xD2B48C,                      // rgb(210,180,140)
+  teal = 0x008080,                     // rgb(0,128,128)
+  thistle = 0xD8BFD8,                  // rgb(216,191,216)
+  tomato = 0xFF6347,                   // rgb(255,99,71)
+  turquoise = 0x40E0D0,                // rgb(64,224,208)
+  violet = 0xEE82EE,                   // rgb(238,130,238)
+  wheat = 0xF5DEB3,                    // rgb(245,222,179)
+  white = 0xFFFFFF,                    // rgb(255,255,255)
+  white_smoke = 0xF5F5F5,              // rgb(245,245,245)
+  yellow = 0xFFFF00,                   // rgb(255,255,0)
+  yellow_green = 0x9ACD32              // rgb(154,205,50)
+};                                     // enum class color
+
+enum class terminal_color : uint8_t {
+  black = 30,
+  red,
+  green,
+  yellow,
+  blue,
+  magenta,
+  cyan,
+  white,
+  bright_black = 90,
+  bright_red,
+  bright_green,
+  bright_yellow,
+  bright_blue,
+  bright_magenta,
+  bright_cyan,
+  bright_white
+};
+
+enum class emphasis : uint8_t {
+  bold = 1,
+  faint = 1 << 1,
+  italic = 1 << 2,
+  underline = 1 << 3,
+  blink = 1 << 4,
+  reverse = 1 << 5,
+  conceal = 1 << 6,
+  strikethrough = 1 << 7,
+};
+
+// rgb is a struct for red, green and blue colors.
+// Using the name "rgb" makes some editors show the color in a tooltip.
+struct rgb {
+  FMT_CONSTEXPR rgb() : r(0), g(0), b(0) {}
+  FMT_CONSTEXPR rgb(uint8_t r_, uint8_t g_, uint8_t b_) : r(r_), g(g_), b(b_) {}
+  FMT_CONSTEXPR rgb(uint32_t hex)
+      : r((hex >> 16) & 0xFF), g((hex >> 8) & 0xFF), b(hex & 0xFF) {}
+  FMT_CONSTEXPR rgb(color hex)
+      : r((uint32_t(hex) >> 16) & 0xFF),
+        g((uint32_t(hex) >> 8) & 0xFF),
+        b(uint32_t(hex) & 0xFF) {}
+  uint8_t r;
+  uint8_t g;
+  uint8_t b;
+};
+
+namespace detail {
+
+// color is a struct of either a rgb color or a terminal color.
+struct color_type {
+  FMT_CONSTEXPR color_type() noexcept : is_rgb(), value{} {}
+  FMT_CONSTEXPR color_type(color rgb_color) noexcept : is_rgb(true), value{} {
+    value.rgb_color = static_cast<uint32_t>(rgb_color);
+  }
+  FMT_CONSTEXPR color_type(rgb rgb_color) noexcept : is_rgb(true), value{} {
+    value.rgb_color = (static_cast<uint32_t>(rgb_color.r) << 16) |
+                      (static_cast<uint32_t>(rgb_color.g) << 8) | rgb_color.b;
+  }
+  FMT_CONSTEXPR color_type(terminal_color term_color) noexcept
+      : is_rgb(), value{} {
+    value.term_color = static_cast<uint8_t>(term_color);
+  }
+  bool is_rgb;
+  union color_union {
+    uint8_t term_color;
+    uint32_t rgb_color;
+  } value;
+};
+}  // namespace detail
+
+/// A text style consisting of foreground and background colors and emphasis.
+class text_style {
+ public:
+  FMT_CONSTEXPR text_style(emphasis em = emphasis()) noexcept
+      : set_foreground_color(), set_background_color(), ems(em) {}
+
+  FMT_CONSTEXPR auto operator|=(const text_style& rhs) -> text_style& {
+    if (!set_foreground_color) {
+      set_foreground_color = rhs.set_foreground_color;
+      foreground_color = rhs.foreground_color;
+    } else if (rhs.set_foreground_color) {
+      if (!foreground_color.is_rgb || !rhs.foreground_color.is_rgb)
+        report_error("can't OR a terminal color");
+      foreground_color.value.rgb_color |= rhs.foreground_color.value.rgb_color;
+    }
+
+    if (!set_background_color) {
+      set_background_color = rhs.set_background_color;
+      background_color = rhs.background_color;
+    } else if (rhs.set_background_color) {
+      if (!background_color.is_rgb || !rhs.background_color.is_rgb)
+        report_error("can't OR a terminal color");
+      background_color.value.rgb_color |= rhs.background_color.value.rgb_color;
+    }
+
+    ems = static_cast<emphasis>(static_cast<uint8_t>(ems) |
+                                static_cast<uint8_t>(rhs.ems));
+    return *this;
+  }
+
+  friend FMT_CONSTEXPR auto operator|(text_style lhs, const text_style& rhs)
+      -> text_style {
+    return lhs |= rhs;
+  }
+
+  FMT_CONSTEXPR auto has_foreground() const noexcept -> bool {
+    return set_foreground_color;
+  }
+  FMT_CONSTEXPR auto has_background() const noexcept -> bool {
+    return set_background_color;
+  }
+  FMT_CONSTEXPR auto has_emphasis() const noexcept -> bool {
+    return static_cast<uint8_t>(ems) != 0;
+  }
+  FMT_CONSTEXPR auto get_foreground() const noexcept -> detail::color_type {
+    FMT_ASSERT(has_foreground(), "no foreground specified for this style");
+    return foreground_color;
+  }
+  FMT_CONSTEXPR auto get_background() const noexcept -> detail::color_type {
+    FMT_ASSERT(has_background(), "no background specified for this style");
+    return background_color;
+  }
+  FMT_CONSTEXPR auto get_emphasis() const noexcept -> emphasis {
+    FMT_ASSERT(has_emphasis(), "no emphasis specified for this style");
+    return ems;
+  }
+
+ private:
+  FMT_CONSTEXPR text_style(bool is_foreground,
+                           detail::color_type text_color) noexcept
+      : set_foreground_color(), set_background_color(), ems() {
+    if (is_foreground) {
+      foreground_color = text_color;
+      set_foreground_color = true;
+    } else {
+      background_color = text_color;
+      set_background_color = true;
+    }
+  }
+
+  friend FMT_CONSTEXPR auto fg(detail::color_type foreground) noexcept
+      -> text_style;
+
+  friend FMT_CONSTEXPR auto bg(detail::color_type background) noexcept
+      -> text_style;
+
+  detail::color_type foreground_color;
+  detail::color_type background_color;
+  bool set_foreground_color;
+  bool set_background_color;
+  emphasis ems;
+};
+
+/// Creates a text style from the foreground (text) color.
+FMT_CONSTEXPR inline auto fg(detail::color_type foreground) noexcept
+    -> text_style {
+  return text_style(true, foreground);
+}
+
+/// Creates a text style from the background color.
+FMT_CONSTEXPR inline auto bg(detail::color_type background) noexcept
+    -> text_style {
+  return text_style(false, background);
+}
+
+FMT_CONSTEXPR inline auto operator|(emphasis lhs, emphasis rhs) noexcept
+    -> text_style {
+  return text_style(lhs) | rhs;
+}
+
+namespace detail {
+
+template <typename Char> struct ansi_color_escape {
+  FMT_CONSTEXPR ansi_color_escape(detail::color_type text_color,
+                                  const char* esc) noexcept {
+    // If we have a terminal color, we need to output another escape code
+    // sequence.
+    if (!text_color.is_rgb) {
+      bool is_background = esc == string_view("\x1b[48;2;");
+      uint32_t value = text_color.value.term_color;
+      // Background ASCII codes are the same as the foreground ones but with
+      // 10 more.
+      if (is_background) value += 10u;
+
+      size_t index = 0;
+      buffer[index++] = static_cast<Char>('\x1b');
+      buffer[index++] = static_cast<Char>('[');
+
+      if (value >= 100u) {
+        buffer[index++] = static_cast<Char>('1');
+        value %= 100u;
+      }
+      buffer[index++] = static_cast<Char>('0' + value / 10u);
+      buffer[index++] = static_cast<Char>('0' + value % 10u);
+
+      buffer[index++] = static_cast<Char>('m');
+      buffer[index++] = static_cast<Char>('\0');
+      return;
+    }
+
+    for (int i = 0; i < 7; i++) {
+      buffer[i] = static_cast<Char>(esc[i]);
+    }
+    rgb color(text_color.value.rgb_color);
+    to_esc(color.r, buffer + 7, ';');
+    to_esc(color.g, buffer + 11, ';');
+    to_esc(color.b, buffer + 15, 'm');
+    buffer[19] = static_cast<Char>(0);
+  }
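+  // Editorial illustration (not part of the upstream fmt sources): the two
+  // branches above emit standard ANSI SGR sequences. terminal_color::red (31)
+  // becomes "\x1b[31m" as a foreground and "\x1b[41m" as a background
+  // (value + 10); bright_red (91) used as a background becomes "\x1b[101m"
+  // via the >= 100 branch. An rgb color such as color::red (0xFF0000) takes
+  // the 24-bit path below and yields "\x1b[38;2;255;000;000m", each channel
+  // zero-padded to three digits by to_esc().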
+  FMT_CONSTEXPR ansi_color_escape(emphasis em) noexcept {
+    uint8_t em_codes[num_emphases] = {};
+    if (has_emphasis(em, emphasis::bold)) em_codes[0] = 1;
+    if (has_emphasis(em, emphasis::faint)) em_codes[1] = 2;
+    if (has_emphasis(em, emphasis::italic)) em_codes[2] = 3;
+    if (has_emphasis(em, emphasis::underline)) em_codes[3] = 4;
+    if (has_emphasis(em, emphasis::blink)) em_codes[4] = 5;
+    if (has_emphasis(em, emphasis::reverse)) em_codes[5] = 7;
+    if (has_emphasis(em, emphasis::conceal)) em_codes[6] = 8;
+    if (has_emphasis(em, emphasis::strikethrough)) em_codes[7] = 9;
+
+    size_t index = 0;
+    for (size_t i = 0; i < num_emphases; ++i) {
+      if (!em_codes[i]) continue;
+      buffer[index++] = static_cast<Char>('\x1b');
+      buffer[index++] = static_cast<Char>('[');
+      buffer[index++] = static_cast<Char>('0' + em_codes[i]);
+      buffer[index++] = static_cast<Char>('m');
+    }
+    buffer[index++] = static_cast<Char>(0);
+  }
+  FMT_CONSTEXPR operator const Char*() const noexcept { return buffer; }
+
+  FMT_CONSTEXPR auto begin() const noexcept -> const Char* { return buffer; }
+  FMT_CONSTEXPR20 auto end() const noexcept -> const Char* {
+    return buffer + basic_string_view<Char>(buffer).size();
+  }
+
+ private:
+  static constexpr size_t num_emphases = 8;
+  Char buffer[7u + 3u * num_emphases + 1u];
+
+  static FMT_CONSTEXPR void to_esc(uint8_t c, Char* out,
+                                   char delimiter) noexcept {
+    out[0] = static_cast<Char>('0' + c / 100);
+    out[1] = static_cast<Char>('0' + c / 10 % 10);
+    out[2] = static_cast<Char>('0' + c % 10);
+    out[3] = static_cast<Char>(delimiter);
+  }
+  static FMT_CONSTEXPR auto has_emphasis(emphasis em, emphasis mask) noexcept
+      -> bool {
+    return static_cast<uint8_t>(em) & static_cast<uint8_t>(mask);
+  }
+};
+
+template <typename Char>
+FMT_CONSTEXPR auto make_foreground_color(detail::color_type foreground) noexcept
+    -> ansi_color_escape<Char> {
+  return ansi_color_escape<Char>(foreground, "\x1b[38;2;");
+}
+
+template <typename Char>
+FMT_CONSTEXPR auto make_background_color(detail::color_type background) noexcept
+    -> ansi_color_escape<Char> {
+  return ansi_color_escape<Char>(background, "\x1b[48;2;");
+}
+
+template <typename Char>
+FMT_CONSTEXPR auto make_emphasis(emphasis em) noexcept
+    -> ansi_color_escape<Char> {
+  return ansi_color_escape<Char>(em);
+}
+
+template <typename Char> inline void reset_color(buffer<Char>& buffer) {
+  auto reset_color = string_view("\x1b[0m");
+  buffer.append(reset_color.begin(), reset_color.end());
+}
+
+template <typename T> struct styled_arg : detail::view {
+  const T& value;
+  text_style style;
+  styled_arg(const T& v, text_style s) : value(v), style(s) {}
+};
+
+template <typename Char>
+void vformat_to(
+    buffer<Char>& buf, const text_style& ts, basic_string_view<Char> format_str,
+    basic_format_args<buffered_context<type_identity_t<Char>>> args) {
+  bool has_style = false;
+  if (ts.has_emphasis()) {
+    has_style = true;
+    auto emphasis = detail::make_emphasis<Char>(ts.get_emphasis());
+    buf.append(emphasis.begin(), emphasis.end());
+  }
+  if (ts.has_foreground()) {
+    has_style = true;
+    auto foreground = detail::make_foreground_color<Char>(ts.get_foreground());
+    buf.append(foreground.begin(), foreground.end());
+  }
+  if (ts.has_background()) {
+    has_style = true;
+    auto background = detail::make_background_color<Char>(ts.get_background());
+    buf.append(background.begin(), background.end());
+  }
+  detail::vformat_to(buf, format_str, args, {});
+  if (has_style) detail::reset_color(buf);
+}
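+// Editorial illustration (not part of the upstream fmt sources): for a call
+// like fmt::print(fg(fmt::color::red), "x"), the buffer assembled above holds
+// "\x1b[38;2;255;000;000m" + "x" + "\x1b[0m", i.e. the style prefix, the
+// formatted output and the reset sequence appended by reset_color().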
+
+}  // namespace detail
+
+inline void vprint(FILE* f, const text_style& ts, string_view fmt,
+                   format_args args) {
+  auto buf = memory_buffer();
+  detail::vformat_to(buf, ts, fmt, args);
+  print(f, FMT_STRING("{}"), string_view(buf.begin(), buf.size()));
+}
+
+/**
+ * Formats a string and prints it to the specified file stream using ANSI
+ * escape sequences to specify text formatting.
+ *
+ * **Example**:
+ *
+ *     fmt::print(fmt::emphasis::bold | fg(fmt::color::red),
+ *                "Elapsed time: {0:.2f} seconds", 1.23);
+ */
+template <typename... T>
+void print(FILE* f, const text_style& ts, format_string<T...> fmt,
+           T&&... args) {
+  vprint(f, ts, fmt, fmt::make_format_args(args...));
+}
+
+/**
+ * Formats a string and prints it to stdout using ANSI escape sequences to
+ * specify text formatting.
+ *
+ * **Example**:
+ *
+ *     fmt::print(fmt::emphasis::bold | fg(fmt::color::red),
+ *                "Elapsed time: {0:.2f} seconds", 1.23);
+ */
+template <typename... T>
+void print(const text_style& ts, format_string<T...> fmt, T&&... args) {
+  return print(stdout, ts, fmt, std::forward<T>(args)...);
+}
+
+inline auto vformat(const text_style& ts, string_view fmt, format_args args)
+    -> std::string {
+  auto buf = memory_buffer();
+  detail::vformat_to(buf, ts, fmt, args);
+  return fmt::to_string(buf);
+}
+
+/**
+ * Formats arguments and returns the result as a string using ANSI escape
+ * sequences to specify text formatting.
+ *
+ * **Example**:
+ *
+ * ```
+ * #include <fmt/color.h>
+ * std::string message = fmt::format(fmt::emphasis::bold | fg(fmt::color::red),
+ *                                   "The answer is {}", 42);
+ * ```
+ */
+template <typename... T>
+inline auto format(const text_style& ts, format_string<T...> fmt, T&&... args)
+    -> std::string {
+  return fmt::vformat(ts, fmt, fmt::make_format_args(args...));
+}
+
+/// Formats a string with the given text_style and writes the output to `out`.
+template <typename OutputIt,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+auto vformat_to(OutputIt out, const text_style& ts, string_view fmt,
+                format_args args) -> OutputIt {
+  auto&& buf = detail::get_buffer<char>(out);
+  detail::vformat_to(buf, ts, fmt, args);
+  return detail::get_iterator(buf, out);
+}
+
+/**
+ * Formats arguments with the given text style, writes the result to the output
+ * iterator `out` and returns the iterator past the end of the output range.
+ *
+ * **Example**:
+ *
+ *     std::vector<char> out;
+ *     fmt::format_to(std::back_inserter(out),
+ *                    fmt::emphasis::bold | fg(fmt::color::red), "{}", 42);
+ */
+template <typename OutputIt, typename... T,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+inline auto format_to(OutputIt out, const text_style& ts,
+                      format_string<T...> fmt, T&&... args) -> OutputIt {
+  return vformat_to(out, ts, fmt, fmt::make_format_args(args...));
+}
+
+template <typename T, typename Char>
+struct formatter<detail::styled_arg<T>, Char> : formatter<T, Char> {
+  template <typename FormatContext>
+  auto format(const detail::styled_arg<T>& arg, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    const auto& ts = arg.style;
+    const auto& value = arg.value;
+    auto out = ctx.out();
+
+    bool has_style = false;
+    if (ts.has_emphasis()) {
+      has_style = true;
+      auto emphasis = detail::make_emphasis<Char>(ts.get_emphasis());
+      out = std::copy(emphasis.begin(), emphasis.end(), out);
+    }
+    if (ts.has_foreground()) {
+      has_style = true;
+      auto foreground =
+          detail::make_foreground_color<Char>(ts.get_foreground());
+      out = std::copy(foreground.begin(), foreground.end(), out);
+    }
+    if (ts.has_background()) {
+      has_style = true;
+      auto background =
+          detail::make_background_color<Char>(ts.get_background());
+      out = std::copy(background.begin(), background.end(), out);
+    }
+    out = formatter<T, Char>::format(value, ctx);
+    if (has_style) {
+      auto reset_color = string_view("\x1b[0m");
+      out = std::copy(reset_color.begin(), reset_color.end(), out);
+    }
+    return out;
+  }
+};
+
+/**
+ * Returns an argument that will be formatted using ANSI escape sequences,
+ * to be used in a formatting function.
+ *
+ * **Example**:
+ *
+ *     fmt::print("Elapsed time: {0:.2f} seconds",
+ *                fmt::styled(1.23, fmt::fg(fmt::color::green) |
+ *                                  fmt::bg(fmt::color::blue)));
+ */
+template <typename T>
+FMT_CONSTEXPR auto styled(const T& value, text_style ts)
+    -> detail::styled_arg<remove_cvref_t<T>> {
+  return detail::styled_arg<remove_cvref_t<T>>{value, ts};
+}
+
+FMT_END_EXPORT
+FMT_END_NAMESPACE
+
+#endif  // FMT_COLOR_H_
diff --git a/lib/fmt/fmt/compile.h b/lib/fmt/fmt/compile.h
new file mode 100644
index 000000000..b2afc2c30
--- /dev/null
+++ b/lib/fmt/fmt/compile.h
@@ -0,0 +1,529 @@
+// Formatting library for C++ - experimental format string compilation
+//
+// Copyright (c) 2012 - present, Victor Zverovich and fmt contributors
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_COMPILE_H_
+#define FMT_COMPILE_H_
+
+#ifndef FMT_MODULE
+#  include <iterator>  // std::back_inserter
+#endif
+
+#include "format.h"
+
+FMT_BEGIN_NAMESPACE
+
+// A compile-time string which is compiled into fast formatting code.
+FMT_EXPORT class compiled_string {};
+
+namespace detail {
+
+template <typename InputIt>
+FMT_CONSTEXPR inline auto copy(InputIt begin, InputIt end, counting_iterator it)
+    -> counting_iterator {
+  return it + (end - begin);
+}
+
+template <typename S>
+struct is_compiled_string : std::is_base_of<compiled_string, S> {};
+
+/**
+ * Converts a string literal `s` into a format string that will be parsed at
+ * compile time and converted into efficient formatting code. Requires C++17
+ * `constexpr if` compiler support.
+ *
+ * **Example**:
+ *
+ *     // Converts 42 into std::string using the most efficient method and no
+ *     // runtime format string processing.
+ *     std::string s = fmt::format(FMT_COMPILE("{}"), 42);
+ */
+#if defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction)
+#  define FMT_COMPILE(s) FMT_STRING_IMPL(s, fmt::compiled_string, explicit)
+#else
+#  define FMT_COMPILE(s) FMT_STRING(s)
+#endif
+
+#if FMT_USE_NONTYPE_TEMPLATE_ARGS
+template <typename Char, size_t N,
+          fmt::detail_exported::fixed_string<Char, N> Str>
+struct udl_compiled_string : compiled_string {
+  using char_type = Char;
+  explicit constexpr operator basic_string_view<char_type>() const {
+    return {Str.data, N - 1};
+  }
+};
+#endif
+
+template <typename T, typename... Tail>
+auto first(const T& value, const Tail&...) -> const T& {
+  return value;
+}
+
+#if defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction)
+template <typename... Args> struct type_list {};
+
+// Returns a reference to the argument at index N from [first, rest...].
+template <int N, typename T, typename... Args>
+constexpr const auto& get([[maybe_unused]] const T& first,
+                          [[maybe_unused]] const Args&... rest) {
+  static_assert(N < 1 + sizeof...(Args), "index is out of bounds");
+  if constexpr (N == 0)
+    return first;
+  else
+    return detail::get<N - 1>(rest...);
+}
+
+template <typename Char, typename... Args>
+constexpr int get_arg_index_by_name(basic_string_view<Char> name,
+                                    type_list<Args...>) {
+  return get_arg_index_by_name<Args...>(name);
+}
+
+template <int N, typename> struct get_type_impl;
+
+template <int N, typename... Args> struct get_type_impl<N, type_list<Args...>> {
+  using type =
+      remove_cvref_t<decltype(detail::get<N>(std::declval<Args>()...))>;
+};
+
+template <int N, typename T>
+using get_type = typename get_type_impl<N, T>::type;
+
+template <typename T> struct is_compiled_format : std::false_type {};
+
+template <typename Char> struct text {
+  basic_string_view<Char> data;
+  using char_type = Char;
+
+  template <typename OutputIt, typename... Args>
+  constexpr OutputIt format(OutputIt out, const Args&...) const {
+    return write<Char>(out, data);
+  }
+};
+
+template <typename Char>
+struct is_compiled_format<text<Char>> : std::true_type {};
+
+template <typename Char>
+constexpr text<Char> make_text(basic_string_view<Char> s, size_t pos,
+                               size_t size) {
+  return {{&s[pos], size}};
+}
+
+template <typename Char> struct code_unit {
+  Char value;
+  using char_type = Char;
+
+  template <typename OutputIt, typename... Args>
+  constexpr OutputIt format(OutputIt out, const Args&...) const {
+    *out++ = value;
+    return out;
+  }
+};
+
+// This ensures that the argument type is convertible to `const T&`.
+template <typename T, int N, typename... Args>
+constexpr const T& get_arg_checked(const Args&... args) {
+  const auto& arg = detail::get<N>(args...);
+  if constexpr (detail::is_named_arg<remove_cvref_t<decltype(arg)>>()) {
+    return arg.value;
+  } else {
+    return arg;
+  }
+}
+
+template <typename Char>
+struct is_compiled_format<code_unit<Char>> : std::true_type {};
+
+// A replacement field that refers to argument N.
+template <typename Char, typename T, int N> struct field {
+  using char_type = Char;
+
+  template <typename OutputIt, typename... Args>
+  constexpr OutputIt format(OutputIt out, const Args&... args) const {
+    const T& arg = get_arg_checked<T, N>(args...);
+    if constexpr (std::is_convertible<T, basic_string_view<Char>>::value) {
+      auto s = basic_string_view<Char>(arg);
+      return copy<Char>(s.begin(), s.end(), out);
+    }
+    return write<Char>(out, arg);
+  }
+};
+
+template <typename Char, typename T, int N>
+struct is_compiled_format<field<Char, T, N>> : std::true_type {};
+
+// A replacement field that refers to argument with name.
+template <typename Char> struct runtime_named_field {
+  using char_type = Char;
+  basic_string_view<Char> name;
+
+  template <typename OutputIt, typename T>
+  constexpr static bool try_format_argument(
+      OutputIt& out,
+      // [[maybe_unused]] due to unused-but-set-parameter warning in GCC 7,8,9
+      [[maybe_unused]] basic_string_view<Char> arg_name, const T& arg) {
+    if constexpr (is_named_arg<typename std::remove_cv<T>::type>::value) {
+      if (arg_name == arg.name) {
+        out = write<Char>(out, arg.value);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  template <typename OutputIt, typename... Args>
+  constexpr OutputIt format(OutputIt out, const Args&... args) const {
+    bool found = (try_format_argument(out, name, args) || ...);
+    if (!found) {
+      FMT_THROW(format_error("argument with specified name is not found"));
+    }
+    return out;
+  }
+};
+
+template <typename Char>
+struct is_compiled_format<runtime_named_field<Char>> : std::true_type {};
+
+// A replacement field that refers to argument N and has format specifiers.
+template <typename Char, typename T, int N> struct spec_field {
+  using char_type = Char;
+  formatter<T, Char> fmt;
+
+  template <typename OutputIt, typename... Args>
+  constexpr FMT_INLINE OutputIt format(OutputIt out,
+                                       const Args&... args) const {
+    const auto& vargs =
+        fmt::make_format_args<basic_format_context<OutputIt, Char>>(args...);
+    basic_format_context<OutputIt, Char> ctx(out, vargs);
+    return fmt.format(get_arg_checked<T, N>(args...), ctx);
+  }
+};
+
+template <typename Char, typename T, int N>
+struct is_compiled_format<spec_field<Char, T, N>> : std::true_type {};
+
+template <typename L, typename R> struct concat {
+  L lhs;
+  R rhs;
+  using char_type = typename L::char_type;
+
+  template <typename OutputIt, typename... Args>
+  constexpr OutputIt format(OutputIt out, const Args&... args) const {
+    out = lhs.format(out, args...);
+    return rhs.format(out, args...);
+  }
+};
+
+template <typename L, typename R>
+struct is_compiled_format<concat<L, R>> : std::true_type {};
+
+template <typename L, typename R>
+constexpr concat<L, R> make_concat(L lhs, R rhs) {
+  return {lhs, rhs};
+}
+
+struct unknown_format {};
+
+template <typename Char>
+constexpr size_t parse_text(basic_string_view<Char> str, size_t pos) {
+  for (size_t size = str.size(); pos != size; ++pos) {
+    if (str[pos] == '{' || str[pos] == '}') break;
+  }
+  return pos;
+}
+
+template <typename Args, size_t POS, int ID, typename S>
+constexpr auto compile_format_string(S fmt);
+
+template <typename Args, size_t POS, int ID, typename T, typename S>
+constexpr auto parse_tail(T head, S fmt) {
+  if constexpr (POS != basic_string_view<typename S::char_type>(fmt).size()) {
+    constexpr auto tail = compile_format_string<Args, POS, ID>(fmt);
+    if constexpr (std::is_same<remove_cvref_t<decltype(tail)>,
+                               unknown_format>())
+      return tail;
+    else
+      return make_concat(head, tail);
+  } else {
+    return head;
+  }
+}
+
+template <typename T, typename Char> struct parse_specs_result {
+  formatter<T, Char> fmt;
+  size_t end;
+  int next_arg_id;
+};
+
+enum { manual_indexing_id = -1 };
+
+template <typename T, typename Char>
+constexpr parse_specs_result<T, Char> parse_specs(basic_string_view<Char> str,
+                                                  size_t pos, int next_arg_id) {
+  str.remove_prefix(pos);
+  auto ctx =
+      compile_parse_context<Char>(str, max_value<int>(), nullptr, next_arg_id);
+  auto f = formatter<T, Char>();
+  auto end = f.parse(ctx);
+  return {f, pos + fmt::detail::to_unsigned(end - str.data()),
+          next_arg_id == 0 ? manual_indexing_id : ctx.next_arg_id()};
+}
+
+template <typename Char> struct arg_id_handler {
+  arg_ref<Char> arg_id;
+
+  constexpr int on_auto() {
+    FMT_ASSERT(false, "handler cannot be used with automatic indexing");
+    return 0;
+  }
+  constexpr int on_index(int id) {
+    arg_id = arg_ref<Char>(id);
+    return 0;
+  }
+  constexpr int on_name(basic_string_view<Char> id) {
+    arg_id = arg_ref<Char>(id);
+    return 0;
+  }
+};
+
+template <typename Char> struct parse_arg_id_result {
+  arg_ref<Char> arg_id;
+  const Char* arg_id_end;
+};
+
+template <typename Char>
+constexpr auto parse_arg_id(const Char* begin, const Char* end) {
+  auto handler = arg_id_handler<Char>{arg_ref<Char>{}};
+  auto arg_id_end = parse_arg_id(begin, end, handler);
+  return parse_arg_id_result<Char>{handler.arg_id, arg_id_end};
+}
+
+template <typename T, typename Enable = void> struct field_type {
+  using type = remove_cvref_t<T>;
+};
+
+template <typename T>
+struct field_type<T, enable_if_t<detail::is_named_arg<T>::value>> {
+  using type = remove_cvref_t<decltype(T::value)>;
+};
+
+template <typename T, typename Args, size_t END_POS, int ARG_INDEX, int NEXT_ID,
+          typename S>
+constexpr auto parse_replacement_field_then_tail(S fmt) {
+  using char_type = typename S::char_type;
+  constexpr auto str = basic_string_view<char_type>(fmt);
+  constexpr char_type c = END_POS != str.size() ? str[END_POS] : char_type();
+  if constexpr (c == '}') {
+    return parse_tail<Args, END_POS + 1, NEXT_ID>(
+        field<char_type, typename field_type<T>::type, ARG_INDEX>(), fmt);
+  } else if constexpr (c != ':') {
+    FMT_THROW(format_error("expected ':'"));
+  } else {
+    constexpr auto result = parse_specs<typename field_type<T>::type>(
+        str, END_POS + 1, NEXT_ID == manual_indexing_id ? 0 : NEXT_ID);
+    if constexpr (result.end >= str.size() || str[result.end] != '}') {
+      FMT_THROW(format_error("expected '}'"));
+      return 0;
+    } else {
+      return parse_tail<Args, result.end + 1, result.next_arg_id>(
+          spec_field<char_type, typename field_type<T>::type, ARG_INDEX>{
+              result.fmt},
+          fmt);
+    }
+  }
+}
+
+// Compiles a non-empty format string and returns the compiled representation
+// or unknown_format() on unrecognized input.
+template <typename Args, size_t POS, int ID, typename S>
+constexpr auto compile_format_string(S fmt) {
+  using char_type = typename S::char_type;
+  constexpr auto str = basic_string_view<char_type>(fmt);
+  if constexpr (str[POS] == '{') {
+    if constexpr (POS + 1 == str.size())
+      FMT_THROW(format_error("unmatched '{' in format string"));
+    if constexpr (str[POS + 1] == '{') {
+      return parse_tail<Args, POS + 2, ID>(make_text(str, POS, 1), fmt);
+    } else if constexpr (str[POS + 1] == '}' || str[POS + 1] == ':') {
+      static_assert(ID != manual_indexing_id,
+                    "cannot switch from manual to automatic argument indexing");
+      constexpr auto next_id =
+          ID != manual_indexing_id ? ID + 1 : manual_indexing_id;
+      return parse_replacement_field_then_tail<get_type<ID, Args>, Args,
+                                               POS + 1, ID, next_id>(fmt);
+    } else {
+      constexpr auto arg_id_result =
+          parse_arg_id(str.data() + POS + 1, str.data() + str.size());
+      constexpr auto arg_id_end_pos = arg_id_result.arg_id_end - str.data();
+      constexpr char_type c =
+          arg_id_end_pos != str.size() ? str[arg_id_end_pos] : char_type();
+      static_assert(c == '}' || c == ':', "missing '}' in format string");
+      if constexpr (arg_id_result.arg_id.kind == arg_id_kind::index) {
+        static_assert(
+            ID == manual_indexing_id || ID == 0,
+            "cannot switch from automatic to manual argument indexing");
+        constexpr auto arg_index = arg_id_result.arg_id.val.index;
+        return parse_replacement_field_then_tail<get_type<arg_index, Args>,
+                                                 Args, arg_id_end_pos,
+                                                 arg_index, manual_indexing_id>(
+            fmt);
+      } else if constexpr (arg_id_result.arg_id.kind == arg_id_kind::name) {
+        constexpr auto arg_index =
+            get_arg_index_by_name(arg_id_result.arg_id.val.name, Args{});
+        if constexpr (arg_index >= 0) {
+          constexpr auto next_id =
+              ID != manual_indexing_id ? ID + 1 : manual_indexing_id;
+          return parse_replacement_field_then_tail<
+              decltype(get_type<arg_index, Args>::value), Args, arg_id_end_pos,
+              arg_index, next_id>(fmt);
+        } else if constexpr (c == '}') {
+          return parse_tail<Args, arg_id_end_pos + 1, ID>(
+              runtime_named_field<char_type>{arg_id_result.arg_id.val.name},
+              fmt);
+        } else if constexpr (c == ':') {
+          return unknown_format();  // no type info for specs parsing
+        }
+      }
+    }
+  } else if constexpr (str[POS] == '}') {
+    if constexpr (POS + 1 == str.size())
+      FMT_THROW(format_error("unmatched '}' in format string"));
+    return parse_tail<Args, POS + 2, ID>(make_text(str, POS, 1), fmt);
+  } else {
+    constexpr auto end = parse_text(str, POS + 1);
+    if constexpr (end - POS > 1) {
+      return parse_tail<Args, end, ID>(make_text(str, POS, end - POS), fmt);
+    } else {
+      return parse_tail<Args, end, ID>(code_unit<char_type>{str[POS]}, fmt);
+    }
+  }
+}
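+// Editorial illustration (not part of the upstream fmt sources): compiling a
+// format string builds a tree of the small structs defined above. For example,
+// FMT_COMPILE("x{}") used with an int argument compiles to
+// concat<code_unit<char>, field<char, int, 0>>, so formatting reduces to
+// writing the literal 'x' followed by write<char>(out, arg), with no runtime
+// format-string parsing.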
+
+template <typename... Args, typename S,
+          FMT_ENABLE_IF(is_compiled_string<S>::value)>
+constexpr auto compile(S fmt) {
+  constexpr auto str = basic_string_view<typename S::char_type>(fmt);
+  if constexpr (str.size() == 0) {
+    return detail::make_text(str, 0, 0);
+  } else {
+    constexpr auto result =
+        detail::compile_format_string<detail::type_list<Args...>, 0, 0>(fmt);
+    return result;
+  }
+}
+#endif  // defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction)
+}  // namespace detail
+
+FMT_BEGIN_EXPORT
+
+#if defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction)
+
+template <typename CompiledFormat, typename... Args,
+          typename Char = typename CompiledFormat::char_type,
+          FMT_ENABLE_IF(detail::is_compiled_format<CompiledFormat>::value)>
+FMT_INLINE std::basic_string<Char> format(const CompiledFormat& cf,
+                                          const Args&... args) {
+  auto s = std::basic_string<Char>();
+  cf.format(std::back_inserter(s), args...);
+  return s;
+}
+
+template <typename OutputIt, typename CompiledFormat, typename... Args,
+          FMT_ENABLE_IF(detail::is_compiled_format<CompiledFormat>::value)>
+constexpr FMT_INLINE OutputIt format_to(OutputIt out, const CompiledFormat& cf,
+                                        const Args&... args) {
+  return cf.format(out, args...);
+}
+
+template <typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
+FMT_INLINE std::basic_string<typename S::char_type> format(const S&,
+                                                           Args&&... args) {
+  if constexpr (std::is_same<typename S::char_type, char>::value) {
+    constexpr auto str = basic_string_view<char>(S());
+    if constexpr (str.size() == 2 && str[0] == '{' && str[1] == '}') {
+      const auto& first = detail::first(args...);
+      if constexpr (detail::is_named_arg<
+                        remove_cvref_t<decltype(first)>>::value) {
+        return fmt::to_string(first.value);
+      } else {
+        return fmt::to_string(first);
+      }
+    }
+  }
+  constexpr auto compiled = detail::compile<Args...>(S());
+  if constexpr (std::is_same<remove_cvref_t<decltype(compiled)>,
+                             detail::unknown_format>()) {
+    return fmt::format(
+        static_cast<basic_string_view<typename S::char_type>>(S()),
+        std::forward<Args>(args)...);
+  } else {
+    return fmt::format(compiled, std::forward<Args>(args)...);
+  }
+}
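+// Editorial illustration (not part of the upstream fmt sources): the "{}"
+// special case above means that fmt::format(FMT_COMPILE("{}"), 42) collapses
+// to fmt::to_string(42); any string the compiler cannot analyse
+// (detail::unknown_format) falls back to the regular runtime fmt::format.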
+
+template <typename OutputIt, typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
+FMT_CONSTEXPR OutputIt format_to(OutputIt out, const S&, Args&&... args) {
+  constexpr auto compiled = detail::compile<Args...>(S());
+  if constexpr (std::is_same<remove_cvref_t<decltype(compiled)>,
+                             detail::unknown_format>()) {
+    return fmt::format_to(
+        out, static_cast<basic_string_view<typename S::char_type>>(S()),
+        std::forward<Args>(args)...);
+  } else {
+    return fmt::format_to(out, compiled, std::forward<Args>(args)...);
+  }
+}
+#endif
+
+template <typename OutputIt, typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
+auto format_to_n(OutputIt out, size_t n, const S& fmt, Args&&... args)
+    -> format_to_n_result<OutputIt> {
+  using traits = detail::fixed_buffer_traits;
+  auto buf = detail::iterator_buffer<OutputIt, char, traits>(out, n);
+  fmt::format_to(std::back_inserter(buf), fmt, std::forward<Args>(args)...);
+  return {buf.out(), buf.count()};
+}
+
+template <typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
+FMT_CONSTEXPR20 auto formatted_size(const S& fmt, const Args&... args)
+    -> size_t {
+  return fmt::format_to(detail::counting_iterator(), fmt, args...).count();
+}
+
+template <typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
+void print(std::FILE* f, const S& fmt, const Args&... args) {
+  memory_buffer buffer;
+  fmt::format_to(std::back_inserter(buffer), fmt, args...);
+  detail::print(f, {buffer.data(), buffer.size()});
+}
+
+template <typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
+void print(const S& fmt, const Args&... args) {
+  print(stdout, fmt, args...);
+}
+
+#if FMT_USE_NONTYPE_TEMPLATE_ARGS
+inline namespace literals {
+template <detail_exported::fixed_string Str> constexpr auto operator""_cf() {
+  using char_t = remove_cvref_t<decltype(Str.data[0])>;
+  return detail::udl_compiled_string<char_t, sizeof(Str.data) / sizeof(char_t),
+                                     Str>();
+}
+}  // namespace literals
+#endif
+
+FMT_END_EXPORT
+FMT_END_NAMESPACE
+
+#endif  // FMT_COMPILE_H_
diff --git a/lib/fmt/fmt/core.h b/lib/fmt/fmt/core.h
new file mode 100644
index 000000000..8ca735f0c
--- /dev/null
+++ b/lib/fmt/fmt/core.h
@@ -0,0 +1,5 @@
+// This file is only provided for compatibility and may be removed in future
+// versions. Use fmt/base.h if you don't need fmt::format and fmt/format.h
+// otherwise.
+
+#include "format.h"
diff --git a/lib/fmt/fmt/format-inl.h b/lib/fmt/fmt/format-inl.h
new file mode 100644
index 000000000..a887483b6
--- /dev/null
+++ b/lib/fmt/fmt/format-inl.h
@@ -0,0 +1,1928 @@
+// Formatting library for C++ - implementation
+//
+// Copyright (c) 2012 - 2016, Victor Zverovich
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_FORMAT_INL_H_
+#define FMT_FORMAT_INL_H_
+
+#ifndef FMT_MODULE
+#  include <algorithm>
+#  include <cerrno>  // errno
+#  include <climits>
+#  include <cmath>
+#  include <exception>
+
+#  if !defined(FMT_STATIC_THOUSANDS_SEPARATOR)
+#    include <locale>
+#  endif
+#endif
+
+#if defined(_WIN32) && !defined(FMT_USE_WRITE_CONSOLE)
+#  include <io.h>  // _isatty
+#endif
+
+#include "format.h"
+
+FMT_BEGIN_NAMESPACE
+namespace detail {
+
+FMT_FUNC void assert_fail(const char* file, int line, const char* message) {
+  // Use unchecked std::fprintf to avoid triggering another assertion when
+  // writing to stderr fails
+  std::fprintf(stderr, "%s:%d: assertion failed: %s", file, line, message);
+  // Chosen instead of std::abort to satisfy Clang in CUDA mode during device
+  // code pass.
+  std::terminate();
+}
+
+FMT_FUNC void format_error_code(detail::buffer<char>& out, int error_code,
+                                string_view message) noexcept {
+  // Report error code making sure that the output fits into
+  // inline_buffer_size to avoid dynamic memory allocation and potential
+  // bad_alloc.
+  out.try_resize(0);
+  static const char SEP[] = ": ";
+  static const char ERROR_STR[] = "error ";
+  // Subtract 2 to account for terminating null characters in SEP and ERROR_STR.
+  size_t error_code_size = sizeof(SEP) + sizeof(ERROR_STR) - 2;
+  auto abs_value = static_cast<uint32_or_64_or_128_t<int>>(error_code);
+  if (detail::is_negative(error_code)) {
+    abs_value = 0 - abs_value;
+    ++error_code_size;
+  }
+  error_code_size += detail::to_unsigned(detail::count_digits(abs_value));
+  auto it = appender(out);
+  if (message.size() <= inline_buffer_size - error_code_size)
+    fmt::format_to(it, FMT_STRING("{}{}"), message, SEP);
+  fmt::format_to(it, FMT_STRING("{}{}"), ERROR_STR, error_code);
+  FMT_ASSERT(out.size() <= inline_buffer_size, "");
+}
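+// Editorial illustration (not part of the upstream fmt sources): the function
+// above produces output of the form "<message>: error <code>", e.g.
+// "cannot open file: error 2", and drops the message prefix entirely when it
+// would push the result past inline_buffer_size.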
+
+FMT_FUNC void report_error(format_func func, int error_code,
+                           const char* message) noexcept {
+  memory_buffer full_message;
+  func(full_message, error_code, message);
+  // Don't use fwrite_fully because the latter may throw.
+  if (std::fwrite(full_message.data(), full_message.size(), 1, stderr) > 0)
+    std::fputc('\n', stderr);
+}
+
+// A wrapper around fwrite that throws on error.
+inline void fwrite_fully(const void* ptr, size_t count, FILE* stream) {
+  size_t written = std::fwrite(ptr, 1, count, stream);
+  if (written < count)
+    FMT_THROW(system_error(errno, FMT_STRING("cannot write to file")));
+}
+
+#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
+template <typename Locale>
+locale_ref::locale_ref(const Locale& loc) : locale_(&loc) {
+  static_assert(std::is_same<Locale, std::locale>::value, "");
+}
+
+template <typename Locale> auto locale_ref::get() const -> Locale {
+  static_assert(std::is_same<Locale, std::locale>::value, "");
+  return locale_ ? *static_cast<const std::locale*>(locale_) : std::locale();
+}
+
+template <typename Char>
+FMT_FUNC auto thousands_sep_impl(locale_ref loc) -> thousands_sep_result<Char> {
+  auto& facet = std::use_facet<std::numpunct<Char>>(loc.get<std::locale>());
+  auto grouping = facet.grouping();
+  auto thousands_sep = grouping.empty() ? Char() : facet.thousands_sep();
+  return {std::move(grouping), thousands_sep};
+}
+template <typename Char>
+FMT_FUNC auto decimal_point_impl(locale_ref loc) -> Char {
+  return std::use_facet<std::numpunct<Char>>(loc.get<std::locale>())
+      .decimal_point();
+}
+#else
+template <typename Char>
+FMT_FUNC auto thousands_sep_impl(locale_ref) -> thousands_sep_result<Char> {
+  return {"\03", FMT_STATIC_THOUSANDS_SEPARATOR};
+}
+template <typename Char> FMT_FUNC Char decimal_point_impl(locale_ref) {
+  return '.';
+}
+#endif
+
+FMT_FUNC auto write_loc(appender out, loc_value value,
+                        const format_specs& specs, locale_ref loc) -> bool {
+#ifdef FMT_STATIC_THOUSANDS_SEPARATOR
+  value.visit(loc_writer<>{
+      out, specs, std::string(1, FMT_STATIC_THOUSANDS_SEPARATOR), "\3", "."});
+  return true;
+#else
+  auto locale = loc.get<std::locale>();
+  // We cannot use the num_put facet because it may produce output in
+  // a wrong encoding.
+  using facet = format_facet;
+  if (std::has_facet<facet>(locale))
+    return std::use_facet<facet>(locale).put(out, value, specs);
+  return facet(locale).put(out, value, specs);
+#endif
+}
+}  // namespace detail
+
+FMT_FUNC void report_error(const char* message) {
+  FMT_THROW(format_error(message));
+}
+
+template <typename Locale> typename Locale::id format_facet<Locale>::id;
+
+#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
+template <typename Locale> format_facet<Locale>::format_facet(Locale& loc) {
+  auto& numpunct = std::use_facet<std::numpunct<char>>(loc);
+  grouping_ = numpunct.grouping();
+  if (!grouping_.empty()) separator_ = std::string(1, numpunct.thousands_sep());
+}
+
+template <>
+FMT_API FMT_FUNC auto format_facet<std::locale>::do_put(
+    appender out, loc_value val, const format_specs& specs) const -> bool {
+  return val.visit(
+      detail::loc_writer<>{out, specs, separator_, grouping_, decimal_point_});
+}
+#endif
+
+FMT_FUNC auto vsystem_error(int error_code, string_view fmt, format_args args)
+    -> std::system_error {
+  auto ec = std::error_code(error_code, std::generic_category());
+  return std::system_error(ec, vformat(fmt, args));
+}
+
+namespace detail {
+
+template <typename F>
+inline auto operator==(basic_fp<F> x, basic_fp<F> y) -> bool {
+  return x.f == y.f && x.e == y.e;
+}
+
+// Compilers should be able to optimize this into the ror instruction.
+FMT_CONSTEXPR inline auto rotr(uint32_t n, uint32_t r) noexcept -> uint32_t {
+  r &= 31;
+  return (n >> r) | (n << (32 - r));
+}
+FMT_CONSTEXPR inline auto rotr(uint64_t n, uint32_t r) noexcept -> uint64_t {
+  r &= 63;
+  return (n >> r) | (n << (64 - r));
+}
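+// Editorial illustration (not part of the upstream fmt sources):
+// rotr(0x12345678u, 8) == 0x78123456 -- the low byte wraps around to the top.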
+
+// Implementation of Dragonbox algorithm: https://github.com/jk-jeon/dragonbox.
+namespace dragonbox {
+// Computes upper 64 bits of multiplication of a 32-bit unsigned integer and a
+// 64-bit unsigned integer.
+inline auto umul96_upper64(uint32_t x, uint64_t y) noexcept -> uint64_t {
+  return umul128_upper64(static_cast<uint64_t>(x) << 32, y);
+}
+
+// Computes lower 128 bits of multiplication of a 64-bit unsigned integer and a
+// 128-bit unsigned integer.
+inline auto umul192_lower128(uint64_t x, uint128_fallback y) noexcept
+    -> uint128_fallback {
+  uint64_t high = x * y.high();
+  uint128_fallback high_low = umul128(x, y.low());
+  return {high + high_low.high(), high_low.low()};
+}
+
+// Computes lower 64 bits of multiplication of a 32-bit unsigned integer and a
+// 64-bit unsigned integer.
+inline auto umul96_lower64(uint32_t x, uint64_t y) noexcept -> uint64_t {
+  return x * y;
+}
+
+// Various fast log computations.
+inline auto floor_log10_pow2_minus_log10_4_over_3(int e) noexcept -> int {
+  FMT_ASSERT(e <= 2936 && e >= -2985, "too large exponent");
+  return (e * 631305 - 261663) >> 21;
+}
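+// Editorial illustration (not part of the upstream fmt sources): 631305 / 2^21
+// approximates log10(2) ~= 0.30103 and 261663 / 2^21 ~= 0.1248 plays the role
+// of log10(4/3) ~= 0.1249. For e = 10: (10 * 631305 - 261663) >> 21 = 2,
+// matching floor(10 * log10(2) - log10(4/3)) = floor(2.885) = 2.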
+
+FMT_INLINE_VARIABLE constexpr struct {
+  uint32_t divisor;
+  int shift_amount;
+} div_small_pow10_infos[] = {{10, 16}, {100, 16}};
+
+// Replaces n by floor(n / pow(10, N)) returning true if and only if n is
+// divisible by pow(10, N).
+// Precondition: n <= pow(10, N + 1).
+template <int N>
+auto check_divisibility_and_divide_by_pow10(uint32_t& n) noexcept -> bool {
+  // The numbers below are chosen such that:
+  //   1. floor(n/d) = floor(nm / 2^k) where d=10 or d=100,
+  //   2. nm mod 2^k < m if and only if n is divisible by d,
+  // where m is magic_number, k is shift_amount
+  // and d is divisor.
+  //
+  // Item 1 is a common technique of replacing division by a constant with
+  // multiplication, see e.g. "Division by Invariant Integers Using
+  // Multiplication" by Granlund and Montgomery (1994). magic_number (m) is set
+  // to ceil(2^k/d) for large enough k.
+  // The idea for item 2 originates from Schubfach.
+  constexpr auto info = div_small_pow10_infos[N - 1];
+  FMT_ASSERT(n <= info.divisor * 10, "n is too large");
+  constexpr uint32_t magic_number =
+      (1u << info.shift_amount) / info.divisor + 1;
+  n *= magic_number;
+  const uint32_t comparison_mask = (1u << info.shift_amount) - 1;
+  bool result = (n & comparison_mask) < magic_number;
+  n >>= info.shift_amount;
+  return result;
+}
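+// Editorial worked example (not part of the upstream fmt sources): for N = 1
+// the constants are divisor = 10, shift_amount = 16, magic_number = 6554.
+// n = 40: 40 * 6554 = 262160, 262160 >> 16 = 4 and 262160 & 0xffff = 16 <
+// 6554, so n becomes 4 and the function returns true (40 is divisible by 10).
+// n = 41: 41 * 6554 = 268714, low bits 6570 >= 6554, so it returns false.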
+
+// Computes floor(n / pow(10, N)) for small n and N.
+// Precondition: n <= pow(10, N + 1).
+template <int N> auto small_division_by_pow10(uint32_t n) noexcept -> uint32_t {
+  constexpr auto info = div_small_pow10_infos[N - 1];
+  FMT_ASSERT(n <= info.divisor * 10, "n is too large");
+  constexpr uint32_t magic_number =
+      (1u << info.shift_amount) / info.divisor + 1;
+  return (n * magic_number) >> info.shift_amount;
+}
+
+// Computes floor(n / 10^(kappa + 1)) (float)
+inline auto divide_by_10_to_kappa_plus_1(uint32_t n) noexcept -> uint32_t {
+  // 1374389535 = ceil(2^37/100)
+  return static_cast<uint32_t>((static_cast<uint64_t>(n) * 1374389535) >> 37);
+}
+// Computes floor(n / 10^(kappa + 1)) (double)
+inline auto divide_by_10_to_kappa_plus_1(uint64_t n) noexcept -> uint64_t {
+  // 2361183241434822607 = ceil(2^(64+7)/1000)
+  return umul128_upper64(n, 2361183241434822607ull) >> 7;
+}
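+// Editorial worked example (not part of the upstream fmt sources): in the
+// 32-bit overload, 1374389535 = ceil(2^37 / 100), so for n = 12345 the product
+// is 12345 * 1374389535 = 16966838809575 and shifting right by 37 gives
+// floor(12345 / 100) = 123.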
+
+// Various subroutines using pow10 cache
+template <typename T> struct cache_accessor;
+
+template <> struct cache_accessor<float> {
+  using carrier_uint = float_info<float>::carrier_uint;
+  using cache_entry_type = uint64_t;
+
+  static auto get_cached_power(int k) noexcept -> uint64_t {
+    FMT_ASSERT(k >= float_info<float>::min_k && k <= float_info<float>::max_k,
+               "k is out of range");
+    static constexpr const uint64_t pow10_significands[] = {
+        0x81ceb32c4b43fcf5, 0xa2425ff75e14fc32, 0xcad2f7f5359a3b3f,
+        0xfd87b5f28300ca0e, 0x9e74d1b791e07e49, 0xc612062576589ddb,
+        0xf79687aed3eec552, 0x9abe14cd44753b53, 0xc16d9a0095928a28,
+        0xf1c90080baf72cb2, 0x971da05074da7bef, 0xbce5086492111aeb,
+        0xec1e4a7db69561a6, 0x9392ee8e921d5d08, 0xb877aa3236a4b44a,
+        0xe69594bec44de15c, 0x901d7cf73ab0acda, 0xb424dc35095cd810,
+        0xe12e13424bb40e14, 0x8cbccc096f5088cc, 0xafebff0bcb24aaff,
+        0xdbe6fecebdedd5bf, 0x89705f4136b4a598, 0xabcc77118461cefd,
+        0xd6bf94d5e57a42bd, 0x8637bd05af6c69b6, 0xa7c5ac471b478424,
+        0xd1b71758e219652c, 0x83126e978d4fdf3c, 0xa3d70a3d70a3d70b,
+        0xcccccccccccccccd, 0x8000000000000000, 0xa000000000000000,
+        0xc800000000000000, 0xfa00000000000000, 0x9c40000000000000,
+        0xc350000000000000, 0xf424000000000000, 0x9896800000000000,
+        0xbebc200000000000, 0xee6b280000000000, 0x9502f90000000000,
+        0xba43b74000000000, 0xe8d4a51000000000, 0x9184e72a00000000,
+        0xb5e620f480000000, 0xe35fa931a0000000, 0x8e1bc9bf04000000,
+        0xb1a2bc2ec5000000, 0xde0b6b3a76400000, 0x8ac7230489e80000,
+        0xad78ebc5ac620000, 0xd8d726b7177a8000, 0x878678326eac9000,
+        0xa968163f0a57b400, 0xd3c21bcecceda100, 0x84595161401484a0,
+        0xa56fa5b99019a5c8, 0xcecb8f27f4200f3a, 0x813f3978f8940985,
+        0xa18f07d736b90be6, 0xc9f2c9cd04674edf, 0xfc6f7c4045812297,
+        0x9dc5ada82b70b59e, 0xc5371912364ce306, 0xf684df56c3e01bc7,
+        0x9a130b963a6c115d, 0xc097ce7bc90715b4, 0xf0bdc21abb48db21,
+        0x96769950b50d88f5, 0xbc143fa4e250eb32, 0xeb194f8e1ae525fe,
+        0x92efd1b8d0cf37bf, 0xb7abc627050305ae, 0xe596b7b0c643c71a,
+        0x8f7e32ce7bea5c70, 0xb35dbf821ae4f38c, 0xe0352f62a19e306f};
+    return pow10_significands[k - float_info<float>::min_k];
+  }
+
+  struct compute_mul_result {
+    carrier_uint result;
+    bool is_integer;
+  };
+  struct compute_mul_parity_result {
+    bool parity;
+    bool is_integer;
+  };
+
+  static auto compute_mul(carrier_uint u,
+                          const cache_entry_type& cache) noexcept
+      -> compute_mul_result {
+    auto r = umul96_upper64(u, cache);
+    return {static_cast<carrier_uint>(r >> 32),
+            static_cast<uint32_t>(r) == 0};
+  }
+
+  static auto compute_delta(const cache_entry_type& cache, int beta) noexcept
+      -> uint32_t {
+    return static_cast<uint32_t>(cache >> (64 - 1 - beta));
+  }
+
+  static auto compute_mul_parity(carrier_uint two_f,
+                                 const cache_entry_type& cache,
+                                 int beta) noexcept
+      -> compute_mul_parity_result {
+    FMT_ASSERT(beta >= 1, "");
+    FMT_ASSERT(beta < 64, "");
+
+    auto r = umul96_lower64(two_f, cache);
+    return {((r >> (64 - beta)) & 1) != 0,
+            static_cast<uint32_t>(r >> (32 - beta)) == 0};
+  }
+
+  static auto compute_left_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
+    return static_cast<carrier_uint>(
+        (cache - (cache >> (num_significand_bits<float>() + 2))) >>
+        (64 - num_significand_bits<float>() - 1 - beta));
+  }
+
+  static auto compute_right_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
+    return static_cast<carrier_uint>(
+        (cache + (cache >> (num_significand_bits<float>() + 1))) >>
+        (64 - num_significand_bits<float>() - 1 - beta));
+  }
+
+  static auto compute_round_up_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
+    return (static_cast<carrier_uint>(
+                cache >> (64 - num_significand_bits<float>() - 2 - beta)) +
+            1) /
+           2;
+  }
+};
+
+template <> struct cache_accessor<double> {
+  using carrier_uint = float_info<double>::carrier_uint;
+  using cache_entry_type = uint128_fallback;
+
+  static auto get_cached_power(int k) noexcept -> uint128_fallback {
+    FMT_ASSERT(k >= float_info<double>::min_k && k <= float_info<double>::max_k,
+               "k is out of range");
+
+    static constexpr const uint128_fallback pow10_significands[] = {
+#if FMT_USE_FULL_CACHE_DRAGONBOX
+      {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b},
+      {0x9faacf3df73609b1, 0x77b191618c54e9ad},
+      {0xc795830d75038c1d, 0xd59df5b9ef6a2418},
+      {0xf97ae3d0d2446f25, 0x4b0573286b44ad1e},
+      {0x9becce62836ac577, 0x4ee367f9430aec33},
+      {0xc2e801fb244576d5, 0x229c41f793cda740},
+      {0xf3a20279ed56d48a, 0x6b43527578c11110},
+      {0x9845418c345644d6, 0x830a13896b78aaaa},
+      {0xbe5691ef416bd60c, 0x23cc986bc656d554},
+      {0xedec366b11c6cb8f, 0x2cbfbe86b7ec8aa9},
+      {0x94b3a202eb1c3f39, 0x7bf7d71432f3d6aa},
+      {0xb9e08a83a5e34f07, 0xdaf5ccd93fb0cc54},
+      {0xe858ad248f5c22c9, 0xd1b3400f8f9cff69},
+      {0x91376c36d99995be, 0x23100809b9c21fa2},
+      {0xb58547448ffffb2d, 0xabd40a0c2832a78b},
+      {0xe2e69915b3fff9f9, 0x16c90c8f323f516d},
+      {0x8dd01fad907ffc3b, 0xae3da7d97f6792e4},
+      {0xb1442798f49ffb4a, 0x99cd11cfdf41779d},
+      {0xdd95317f31c7fa1d, 0x40405643d711d584},
+      {0x8a7d3eef7f1cfc52, 0x482835ea666b2573},
+      {0xad1c8eab5ee43b66, 0xda3243650005eed0},
+      {0xd863b256369d4a40, 0x90bed43e40076a83},
+      {0x873e4f75e2224e68, 0x5a7744a6e804a292},
+      {0xa90de3535aaae202, 0x711515d0a205cb37},
+      {0xd3515c2831559a83, 0x0d5a5b44ca873e04},
+      {0x8412d9991ed58091, 0xe858790afe9486c3},
+      {0xa5178fff668ae0b6, 0x626e974dbe39a873},
+      {0xce5d73ff402d98e3, 0xfb0a3d212dc81290},
+      {0x80fa687f881c7f8e, 0x7ce66634bc9d0b9a},
+      {0xa139029f6a239f72, 0x1c1fffc1ebc44e81},
+      {0xc987434744ac874e, 0xa327ffb266b56221},
+      {0xfbe9141915d7a922, 0x4bf1ff9f0062baa9},
+      {0x9d71ac8fada6c9b5, 0x6f773fc3603db4aa},
+      {0xc4ce17b399107c22, 0xcb550fb4384d21d4},
+      {0xf6019da07f549b2b, 0x7e2a53a146606a49},
+      {0x99c102844f94e0fb, 0x2eda7444cbfc426e},
+      {0xc0314325637a1939, 0xfa911155fefb5309},
+      {0xf03d93eebc589f88, 0x793555ab7eba27cb},
+      {0x96267c7535b763b5, 0x4bc1558b2f3458df},
+      {0xbbb01b9283253ca2, 0x9eb1aaedfb016f17},
+      {0xea9c227723ee8bcb, 0x465e15a979c1cadd},
+      {0x92a1958a7675175f, 0x0bfacd89ec191eca},
+      {0xb749faed14125d36, 0xcef980ec671f667c},
+      {0xe51c79a85916f484, 0x82b7e12780e7401b},
+      {0x8f31cc0937ae58d2, 0xd1b2ecb8b0908811},
+      {0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa16},
+      {0xdfbdcece67006ac9, 0x67a791e093e1d49b},
+      {0x8bd6a141006042bd, 0xe0c8bb2c5c6d24e1},
+      {0xaecc49914078536d, 0x58fae9f773886e19},
+      {0xda7f5bf590966848, 0xaf39a475506a899f},
+      {0x888f99797a5e012d, 0x6d8406c952429604},
+      {0xaab37fd7d8f58178, 0xc8e5087ba6d33b84},
+      {0xd5605fcdcf32e1d6, 0xfb1e4a9a90880a65},
+      {0x855c3be0a17fcd26, 0x5cf2eea09a550680},
+      {0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481f},
+      {0xd0601d8efc57b08b, 0xf13b94daf124da27},
+      {0x823c12795db6ce57, 0x76c53d08d6b70859},
+      {0xa2cb1717b52481ed, 0x54768c4b0c64ca6f},
+      {0xcb7ddcdda26da268, 0xa9942f5dcf7dfd0a},
+      {0xfe5d54150b090b02, 0xd3f93b35435d7c4d},
+      {0x9efa548d26e5a6e1, 0xc47bc5014a1a6db0},
+      {0xc6b8e9b0709f109a, 0x359ab6419ca1091c},
+      {0xf867241c8cc6d4c0, 0xc30163d203c94b63},
+      {0x9b407691d7fc44f8, 0x79e0de63425dcf1e},
+      {0xc21094364dfb5636, 0x985915fc12f542e5},
+      {0xf294b943e17a2bc4, 0x3e6f5b7b17b2939e},
+      {0x979cf3ca6cec5b5a, 0xa705992ceecf9c43},
+      {0xbd8430bd08277231, 0x50c6ff782a838354},
+      {0xece53cec4a314ebd, 0xa4f8bf5635246429},
+      {0x940f4613ae5ed136, 0x871b7795e136be9a},
+      {0xb913179899f68584, 0x28e2557b59846e40},
+      {0xe757dd7ec07426e5, 0x331aeada2fe589d0},
+      {0x9096ea6f3848984f, 0x3ff0d2c85def7622},
+      {0xb4bca50b065abe63, 0x0fed077a756b53aa},
+      {0xe1ebce4dc7f16dfb, 0xd3e8495912c62895},
+      {0x8d3360f09cf6e4bd, 0x64712dd7abbbd95d},
+      {0xb080392cc4349dec, 0xbd8d794d96aacfb4},
+      {0xdca04777f541c567, 0xecf0d7a0fc5583a1},
+      {0x89e42caaf9491b60, 0xf41686c49db57245},
+      {0xac5d37d5b79b6239, 0x311c2875c522ced6},
+      {0xd77485cb25823ac7, 0x7d633293366b828c},
+      {0x86a8d39ef77164bc, 0xae5dff9c02033198},
+      {0xa8530886b54dbdeb, 0xd9f57f830283fdfd},
+      {0xd267caa862a12d66, 0xd072df63c324fd7c},
+      {0x8380dea93da4bc60, 0x4247cb9e59f71e6e},
+      {0xa46116538d0deb78, 0x52d9be85f074e609},
+      {0xcd795be870516656, 0x67902e276c921f8c},
+      {0x806bd9714632dff6, 0x00ba1cd8a3db53b7},
+      {0xa086cfcd97bf97f3, 0x80e8a40eccd228a5},
+      {0xc8a883c0fdaf7df0, 0x6122cd128006b2ce},
+      {0xfad2a4b13d1b5d6c, 0x796b805720085f82},
+      {0x9cc3a6eec6311a63, 0xcbe3303674053bb1},
+      {0xc3f490aa77bd60fc, 0xbedbfc4411068a9d},
+      {0xf4f1b4d515acb93b, 0xee92fb5515482d45},
+      {0x991711052d8bf3c5, 0x751bdd152d4d1c4b},
+      {0xbf5cd54678eef0b6, 0xd262d45a78a0635e},
+      {0xef340a98172aace4, 0x86fb897116c87c35},
+      {0x9580869f0e7aac0e, 0xd45d35e6ae3d4da1},
+      {0xbae0a846d2195712, 0x8974836059cca10a},
+      {0xe998d258869facd7, 0x2bd1a438703fc94c},
+      {0x91ff83775423cc06, 0x7b6306a34627ddd0},
+      {0xb67f6455292cbf08, 0x1a3bc84c17b1d543},
+      {0xe41f3d6a7377eeca, 0x20caba5f1d9e4a94},
+      {0x8e938662882af53e, 0x547eb47b7282ee9d},
+      {0xb23867fb2a35b28d, 0xe99e619a4f23aa44},
+      {0xdec681f9f4c31f31, 0x6405fa00e2ec94d5},
+      {0x8b3c113c38f9f37e, 0xde83bc408dd3dd05},
+      {0xae0b158b4738705e, 0x9624ab50b148d446},
+      {0xd98ddaee19068c76, 0x3badd624dd9b0958},
+      {0x87f8a8d4cfa417c9, 0xe54ca5d70a80e5d7},
+      {0xa9f6d30a038d1dbc, 0x5e9fcf4ccd211f4d},
+      {0xd47487cc8470652b, 0x7647c32000696720},
+      {0x84c8d4dfd2c63f3b, 0x29ecd9f40041e074},
+      {0xa5fb0a17c777cf09, 0xf468107100525891},
+      {0xcf79cc9db955c2cc, 0x7182148d4066eeb5},
+      {0x81ac1fe293d599bf, 0xc6f14cd848405531},
+      {0xa21727db38cb002f, 0xb8ada00e5a506a7d},
+      {0xca9cf1d206fdc03b, 0xa6d90811f0e4851d},
+      {0xfd442e4688bd304a, 0x908f4a166d1da664},
+      {0x9e4a9cec15763e2e, 0x9a598e4e043287ff},
+      {0xc5dd44271ad3cdba, 0x40eff1e1853f29fe},
+      {0xf7549530e188c128, 0xd12bee59e68ef47d},
+      {0x9a94dd3e8cf578b9, 0x82bb74f8301958cf},
+      {0xc13a148e3032d6e7, 0xe36a52363c1faf02},
+      {0xf18899b1bc3f8ca1, 0xdc44e6c3cb279ac2},
+      {0x96f5600f15a7b7e5, 0x29ab103a5ef8c0ba},
+      {0xbcb2b812db11a5de, 0x7415d448f6b6f0e8},
+      {0xebdf661791d60f56, 0x111b495b3464ad22},
+      {0x936b9fcebb25c995, 0xcab10dd900beec35},
+      {0xb84687c269ef3bfb, 0x3d5d514f40eea743},
+      {0xe65829b3046b0afa, 0x0cb4a5a3112a5113},
+      {0x8ff71a0fe2c2e6dc, 0x47f0e785eaba72ac},
+      {0xb3f4e093db73a093, 0x59ed216765690f57},
+      {0xe0f218b8d25088b8, 0x306869c13ec3532d},
+      {0x8c974f7383725573, 0x1e414218c73a13fc},
+      {0xafbd2350644eeacf, 0xe5d1929ef90898fb},
+      {0xdbac6c247d62a583, 0xdf45f746b74abf3a},
+      {0x894bc396ce5da772, 0x6b8bba8c328eb784},
+      {0xab9eb47c81f5114f, 0x066ea92f3f326565},
+      {0xd686619ba27255a2, 0xc80a537b0efefebe},
+      {0x8613fd0145877585, 0xbd06742ce95f5f37},
+      {0xa798fc4196e952e7, 0x2c48113823b73705},
+      {0xd17f3b51fca3a7a0, 0xf75a15862ca504c6},
+      {0x82ef85133de648c4, 0x9a984d73dbe722fc},
+      {0xa3ab66580d5fdaf5, 0xc13e60d0d2e0ebbb},
+      {0xcc963fee10b7d1b3, 0x318df905079926a9},
+      {0xffbbcfe994e5c61f, 0xfdf17746497f7053},
+      {0x9fd561f1fd0f9bd3, 0xfeb6ea8bedefa634},
+      {0xc7caba6e7c5382c8, 0xfe64a52ee96b8fc1},
+      {0xf9bd690a1b68637b, 0x3dfdce7aa3c673b1},
+      {0x9c1661a651213e2d, 0x06bea10ca65c084f},
+      {0xc31bfa0fe5698db8, 0x486e494fcff30a63},
+      {0xf3e2f893dec3f126, 0x5a89dba3c3efccfb},
+      {0x986ddb5c6b3a76b7, 0xf89629465a75e01d},
+      {0xbe89523386091465, 0xf6bbb397f1135824},
+      {0xee2ba6c0678b597f, 0x746aa07ded582e2d},
+      {0x94db483840b717ef, 0xa8c2a44eb4571cdd},
+      {0xba121a4650e4ddeb, 0x92f34d62616ce414},
+      {0xe896a0d7e51e1566, 0x77b020baf9c81d18},
+      {0x915e2486ef32cd60, 0x0ace1474dc1d122f},
+      {0xb5b5ada8aaff80b8, 0x0d819992132456bb},
+      {0xe3231912d5bf60e6, 0x10e1fff697ed6c6a},
+      {0x8df5efabc5979c8f, 0xca8d3ffa1ef463c2},
+      {0xb1736b96b6fd83b3, 0xbd308ff8a6b17cb3},
+      {0xddd0467c64bce4a0, 0xac7cb3f6d05ddbdf},
+      {0x8aa22c0dbef60ee4, 0x6bcdf07a423aa96c},
+      {0xad4ab7112eb3929d, 0x86c16c98d2c953c7},
+      {0xd89d64d57a607744, 0xe871c7bf077ba8b8},
+      {0x87625f056c7c4a8b, 0x11471cd764ad4973},
+      {0xa93af6c6c79b5d2d, 0xd598e40d3dd89bd0},
+      {0xd389b47879823479, 0x4aff1d108d4ec2c4},
+      {0x843610cb4bf160cb, 0xcedf722a585139bb},
+      {0xa54394fe1eedb8fe, 0xc2974eb4ee658829},
+      {0xce947a3da6a9273e, 0x733d226229feea33},
+      {0x811ccc668829b887, 0x0806357d5a3f5260},
+      {0xa163ff802a3426a8, 0xca07c2dcb0cf26f8},
+      {0xc9bcff6034c13052, 0xfc89b393dd02f0b6},
+      {0xfc2c3f3841f17c67, 0xbbac2078d443ace3},
+      {0x9d9ba7832936edc0, 0xd54b944b84aa4c0e},
+      {0xc5029163f384a931, 0x0a9e795e65d4df12},
+      {0xf64335bcf065d37d, 0x4d4617b5ff4a16d6},
+      {0x99ea0196163fa42e, 0x504bced1bf8e4e46},
+      {0xc06481fb9bcf8d39, 0xe45ec2862f71e1d7},
+      {0xf07da27a82c37088, 0x5d767327bb4e5a4d},
+      {0x964e858c91ba2655, 0x3a6a07f8d510f870},
+      {0xbbe226efb628afea, 0x890489f70a55368c},
+      {0xeadab0aba3b2dbe5, 0x2b45ac74ccea842f},
+      {0x92c8ae6b464fc96f, 0x3b0b8bc90012929e},
+      {0xb77ada0617e3bbcb, 0x09ce6ebb40173745},
+      {0xe55990879ddcaabd, 0xcc420a6a101d0516},
+      {0x8f57fa54c2a9eab6, 0x9fa946824a12232e},
+      {0xb32df8e9f3546564, 0x47939822dc96abfa},
+      {0xdff9772470297ebd, 0x59787e2b93bc56f8},
+      {0x8bfbea76c619ef36, 0x57eb4edb3c55b65b},
+      {0xaefae51477a06b03, 0xede622920b6b23f2},
+      {0xdab99e59958885c4, 0xe95fab368e45ecee},
+      {0x88b402f7fd75539b, 0x11dbcb0218ebb415},
+      {0xaae103b5fcd2a881, 0xd652bdc29f26a11a},
+      {0xd59944a37c0752a2, 0x4be76d3346f04960},
+      {0x857fcae62d8493a5, 0x6f70a4400c562ddc},
+      {0xa6dfbd9fb8e5b88e, 0xcb4ccd500f6bb953},
+      {0xd097ad07a71f26b2, 0x7e2000a41346a7a8},
+      {0x825ecc24c873782f, 0x8ed400668c0c28c9},
+      {0xa2f67f2dfa90563b, 0x728900802f0f32fb},
+      {0xcbb41ef979346bca, 0x4f2b40a03ad2ffba},
+      {0xfea126b7d78186bc, 0xe2f610c84987bfa9},
+      {0x9f24b832e6b0f436, 0x0dd9ca7d2df4d7ca},
+      {0xc6ede63fa05d3143, 0x91503d1c79720dbc},
+      {0xf8a95fcf88747d94, 0x75a44c6397ce912b},
+      {0x9b69dbe1b548ce7c, 0xc986afbe3ee11abb},
+      {0xc24452da229b021b, 0xfbe85badce996169},
+      {0xf2d56790ab41c2a2, 0xfae27299423fb9c4},
+      {0x97c560ba6b0919a5, 0xdccd879fc967d41b},
+      {0xbdb6b8e905cb600f, 0x5400e987bbc1c921},
+      {0xed246723473e3813, 0x290123e9aab23b69},
+      {0x9436c0760c86e30b, 0xf9a0b6720aaf6522},
+      {0xb94470938fa89bce, 0xf808e40e8d5b3e6a},
+      {0xe7958cb87392c2c2, 0xb60b1d1230b20e05},
+      {0x90bd77f3483bb9b9, 0xb1c6f22b5e6f48c3},
+      {0xb4ecd5f01a4aa828, 0x1e38aeb6360b1af4},
+      {0xe2280b6c20dd5232, 0x25c6da63c38de1b1},
+      {0x8d590723948a535f, 0x579c487e5a38ad0f},
+      {0xb0af48ec79ace837, 0x2d835a9df0c6d852},
+      {0xdcdb1b2798182244, 0xf8e431456cf88e66},
+      {0x8a08f0f8bf0f156b, 0x1b8e9ecb641b5900},
+      {0xac8b2d36eed2dac5, 0xe272467e3d222f40},
+      {0xd7adf884aa879177, 0x5b0ed81dcc6abb10},
+      {0x86ccbb52ea94baea, 0x98e947129fc2b4ea},
+      {0xa87fea27a539e9a5, 0x3f2398d747b36225},
+      {0xd29fe4b18e88640e, 0x8eec7f0d19a03aae},
+      {0x83a3eeeef9153e89, 0x1953cf68300424ad},
+      {0xa48ceaaab75a8e2b, 0x5fa8c3423c052dd8},
+      {0xcdb02555653131b6, 0x3792f412cb06794e},
+      {0x808e17555f3ebf11, 0xe2bbd88bbee40bd1},
+      {0xa0b19d2ab70e6ed6, 0x5b6aceaeae9d0ec5},
+      {0xc8de047564d20a8b, 0xf245825a5a445276},
+      {0xfb158592be068d2e, 0xeed6e2f0f0d56713},
+      {0x9ced737bb6c4183d, 0x55464dd69685606c},
+      {0xc428d05aa4751e4c, 0xaa97e14c3c26b887},
+      {0xf53304714d9265df, 0xd53dd99f4b3066a9},
+      {0x993fe2c6d07b7fab, 0xe546a8038efe402a},
+      {0xbf8fdb78849a5f96, 0xde98520472bdd034},
+      {0xef73d256a5c0f77c, 0x963e66858f6d4441},
+      {0x95a8637627989aad, 0xdde7001379a44aa9},
+      {0xbb127c53b17ec159, 0x5560c018580d5d53},
+      {0xe9d71b689dde71af, 0xaab8f01e6e10b4a7},
+      {0x9226712162ab070d, 0xcab3961304ca70e9},
+      {0xb6b00d69bb55c8d1, 0x3d607b97c5fd0d23},
+      {0xe45c10c42a2b3b05, 0x8cb89a7db77c506b},
+      {0x8eb98a7a9a5b04e3, 0x77f3608e92adb243},
+      {0xb267ed1940f1c61c, 0x55f038b237591ed4},
+      {0xdf01e85f912e37a3, 0x6b6c46dec52f6689},
+      {0x8b61313bbabce2c6, 0x2323ac4b3b3da016},
+      {0xae397d8aa96c1b77, 0xabec975e0a0d081b},
+      {0xd9c7dced53c72255, 0x96e7bd358c904a22},
+      {0x881cea14545c7575, 0x7e50d64177da2e55},
+      {0xaa242499697392d2, 0xdde50bd1d5d0b9ea},
+      {0xd4ad2dbfc3d07787, 0x955e4ec64b44e865},
+      {0x84ec3c97da624ab4, 0xbd5af13bef0b113f},
+      {0xa6274bbdd0fadd61, 0xecb1ad8aeacdd58f},
+      {0xcfb11ead453994ba, 0x67de18eda5814af3},
+      {0x81ceb32c4b43fcf4, 0x80eacf948770ced8},
+      {0xa2425ff75e14fc31, 0xa1258379a94d028e},
+      {0xcad2f7f5359a3b3e, 0x096ee45813a04331},
+      {0xfd87b5f28300ca0d, 0x8bca9d6e188853fd},
+      {0x9e74d1b791e07e48, 0x775ea264cf55347e},
+      {0xc612062576589dda, 0x95364afe032a819e},
+      {0xf79687aed3eec551, 0x3a83ddbd83f52205},
+      {0x9abe14cd44753b52, 0xc4926a9672793543},
+      {0xc16d9a0095928a27, 0x75b7053c0f178294},
+      {0xf1c90080baf72cb1, 0x5324c68b12dd6339},
+      {0x971da05074da7bee, 0xd3f6fc16ebca5e04},
+      {0xbce5086492111aea, 0x88f4bb1ca6bcf585},
+      {0xec1e4a7db69561a5, 0x2b31e9e3d06c32e6},
+      {0x9392ee8e921d5d07, 0x3aff322e62439fd0},
+      {0xb877aa3236a4b449, 0x09befeb9fad487c3},
+      {0xe69594bec44de15b, 0x4c2ebe687989a9b4},
+      {0x901d7cf73ab0acd9, 0x0f9d37014bf60a11},
+      {0xb424dc35095cd80f, 0x538484c19ef38c95},
+      {0xe12e13424bb40e13, 0x2865a5f206b06fba},
+      {0x8cbccc096f5088cb, 0xf93f87b7442e45d4},
+      {0xafebff0bcb24aafe, 0xf78f69a51539d749},
+      {0xdbe6fecebdedd5be, 0xb573440e5a884d1c},
+      {0x89705f4136b4a597, 0x31680a88f8953031},
+      {0xabcc77118461cefc, 0xfdc20d2b36ba7c3e},
+      {0xd6bf94d5e57a42bc, 0x3d32907604691b4d},
+      {0x8637bd05af6c69b5, 0xa63f9a49c2c1b110},
+      {0xa7c5ac471b478423, 0x0fcf80dc33721d54},
+      {0xd1b71758e219652b, 0xd3c36113404ea4a9},
+      {0x83126e978d4fdf3b, 0x645a1cac083126ea},
+      {0xa3d70a3d70a3d70a, 0x3d70a3d70a3d70a4},
+      {0xcccccccccccccccc, 0xcccccccccccccccd},
+      {0x8000000000000000, 0x0000000000000000},
+      {0xa000000000000000, 0x0000000000000000},
+      {0xc800000000000000, 0x0000000000000000},
+      {0xfa00000000000000, 0x0000000000000000},
+      {0x9c40000000000000, 0x0000000000000000},
+      {0xc350000000000000, 0x0000000000000000},
+      {0xf424000000000000, 0x0000000000000000},
+      {0x9896800000000000, 0x0000000000000000},
+      {0xbebc200000000000, 0x0000000000000000},
+      {0xee6b280000000000, 0x0000000000000000},
+      {0x9502f90000000000, 0x0000000000000000},
+      {0xba43b74000000000, 0x0000000000000000},
+      {0xe8d4a51000000000, 0x0000000000000000},
+      {0x9184e72a00000000, 0x0000000000000000},
+      {0xb5e620f480000000, 0x0000000000000000},
+      {0xe35fa931a0000000, 0x0000000000000000},
+      {0x8e1bc9bf04000000, 0x0000000000000000},
+      {0xb1a2bc2ec5000000, 0x0000000000000000},
+      {0xde0b6b3a76400000, 0x0000000000000000},
+      {0x8ac7230489e80000, 0x0000000000000000},
+      {0xad78ebc5ac620000, 0x0000000000000000},
+      {0xd8d726b7177a8000, 0x0000000000000000},
+      {0x878678326eac9000, 0x0000000000000000},
+      {0xa968163f0a57b400, 0x0000000000000000},
+      {0xd3c21bcecceda100, 0x0000000000000000},
+      {0x84595161401484a0, 0x0000000000000000},
+      {0xa56fa5b99019a5c8, 0x0000000000000000},
+      {0xcecb8f27f4200f3a, 0x0000000000000000},
+      {0x813f3978f8940984, 0x4000000000000000},
+      {0xa18f07d736b90be5, 0x5000000000000000},
+      {0xc9f2c9cd04674ede, 0xa400000000000000},
+      {0xfc6f7c4045812296, 0x4d00000000000000},
+      {0x9dc5ada82b70b59d, 0xf020000000000000},
+      {0xc5371912364ce305, 0x6c28000000000000},
+      {0xf684df56c3e01bc6, 0xc732000000000000},
+      {0x9a130b963a6c115c, 0x3c7f400000000000},
+      {0xc097ce7bc90715b3, 0x4b9f100000000000},
+      {0xf0bdc21abb48db20, 0x1e86d40000000000},
+      {0x96769950b50d88f4, 0x1314448000000000},
+      {0xbc143fa4e250eb31, 0x17d955a000000000},
+      {0xeb194f8e1ae525fd, 0x5dcfab0800000000},
+      {0x92efd1b8d0cf37be, 0x5aa1cae500000000},
+      {0xb7abc627050305ad, 0xf14a3d9e40000000},
+      {0xe596b7b0c643c719, 0x6d9ccd05d0000000},
+      {0x8f7e32ce7bea5c6f, 0xe4820023a2000000},
+      {0xb35dbf821ae4f38b, 0xdda2802c8a800000},
+      {0xe0352f62a19e306e, 0xd50b2037ad200000},
+      {0x8c213d9da502de45, 0x4526f422cc340000},
+      {0xaf298d050e4395d6, 0x9670b12b7f410000},
+      {0xdaf3f04651d47b4c, 0x3c0cdd765f114000},
+      {0x88d8762bf324cd0f, 0xa5880a69fb6ac800},
+      {0xab0e93b6efee0053, 0x8eea0d047a457a00},
+      {0xd5d238a4abe98068, 0x72a4904598d6d880},
+      {0x85a36366eb71f041, 0x47a6da2b7f864750},
+      {0xa70c3c40a64e6c51, 0x999090b65f67d924},
+      {0xd0cf4b50cfe20765, 0xfff4b4e3f741cf6d},
+      {0x82818f1281ed449f, 0xbff8f10e7a8921a5},
+      {0xa321f2d7226895c7, 0xaff72d52192b6a0e},
+      {0xcbea6f8ceb02bb39, 0x9bf4f8a69f764491},
+      {0xfee50b7025c36a08, 0x02f236d04753d5b5},
+      {0x9f4f2726179a2245, 0x01d762422c946591},
+      {0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef6},
+      {0xf8ebad2b84e0d58b, 0xd2e0898765a7deb3},
+      {0x9b934c3b330c8577, 0x63cc55f49f88eb30},
+      {0xc2781f49ffcfa6d5, 0x3cbf6b71c76b25fc},
+      {0xf316271c7fc3908a, 0x8bef464e3945ef7b},
+      {0x97edd871cfda3a56, 0x97758bf0e3cbb5ad},
+      {0xbde94e8e43d0c8ec, 0x3d52eeed1cbea318},
+      {0xed63a231d4c4fb27, 0x4ca7aaa863ee4bde},
+      {0x945e455f24fb1cf8, 0x8fe8caa93e74ef6b},
+      {0xb975d6b6ee39e436, 0xb3e2fd538e122b45},
+      {0xe7d34c64a9c85d44, 0x60dbbca87196b617},
+      {0x90e40fbeea1d3a4a, 0xbc8955e946fe31ce},
+      {0xb51d13aea4a488dd, 0x6babab6398bdbe42},
+      {0xe264589a4dcdab14, 0xc696963c7eed2dd2},
+      {0x8d7eb76070a08aec, 0xfc1e1de5cf543ca3},
+      {0xb0de65388cc8ada8, 0x3b25a55f43294bcc},
+      {0xdd15fe86affad912, 0x49ef0eb713f39ebf},
+      {0x8a2dbf142dfcc7ab, 0x6e3569326c784338},
+      {0xacb92ed9397bf996, 0x49c2c37f07965405},
+      {0xd7e77a8f87daf7fb, 0xdc33745ec97be907},
+      {0x86f0ac99b4e8dafd, 0x69a028bb3ded71a4},
+      {0xa8acd7c0222311bc, 0xc40832ea0d68ce0d},
+      {0xd2d80db02aabd62b, 0xf50a3fa490c30191},
+      {0x83c7088e1aab65db, 0x792667c6da79e0fb},
+      {0xa4b8cab1a1563f52, 0x577001b891185939},
+      {0xcde6fd5e09abcf26, 0xed4c0226b55e6f87},
+      {0x80b05e5ac60b6178, 0x544f8158315b05b5},
+      {0xa0dc75f1778e39d6, 0x696361ae3db1c722},
+      {0xc913936dd571c84c, 0x03bc3a19cd1e38ea},
+      {0xfb5878494ace3a5f, 0x04ab48a04065c724},
+      {0x9d174b2dcec0e47b, 0x62eb0d64283f9c77},
+      {0xc45d1df942711d9a, 0x3ba5d0bd324f8395},
+      {0xf5746577930d6500, 0xca8f44ec7ee3647a},
+      {0x9968bf6abbe85f20, 0x7e998b13cf4e1ecc},
+      {0xbfc2ef456ae276e8, 0x9e3fedd8c321a67f},
+      {0xefb3ab16c59b14a2, 0xc5cfe94ef3ea101f},
+      {0x95d04aee3b80ece5, 0xbba1f1d158724a13},
+      {0xbb445da9ca61281f, 0x2a8a6e45ae8edc98},
+      {0xea1575143cf97226, 0xf52d09d71a3293be},
+      {0x924d692ca61be758, 0x593c2626705f9c57},
+      {0xb6e0c377cfa2e12e, 0x6f8b2fb00c77836d},
+      {0xe498f455c38b997a, 0x0b6dfb9c0f956448},
+      {0x8edf98b59a373fec, 0x4724bd4189bd5ead},
+      {0xb2977ee300c50fe7, 0x58edec91ec2cb658},
+      {0xdf3d5e9bc0f653e1, 0x2f2967b66737e3ee},
+      {0x8b865b215899f46c, 0xbd79e0d20082ee75},
+      {0xae67f1e9aec07187, 0xecd8590680a3aa12},
+      {0xda01ee641a708de9, 0xe80e6f4820cc9496},
+      {0x884134fe908658b2, 0x3109058d147fdcde},
+      {0xaa51823e34a7eede, 0xbd4b46f0599fd416},
+      {0xd4e5e2cdc1d1ea96, 0x6c9e18ac7007c91b},
+      {0x850fadc09923329e, 0x03e2cf6bc604ddb1},
+      {0xa6539930bf6bff45, 0x84db8346b786151d},
+      {0xcfe87f7cef46ff16, 0xe612641865679a64},
+      {0x81f14fae158c5f6e, 0x4fcb7e8f3f60c07f},
+      {0xa26da3999aef7749, 0xe3be5e330f38f09e},
+      {0xcb090c8001ab551c, 0x5cadf5bfd3072cc6},
+      {0xfdcb4fa002162a63, 0x73d9732fc7c8f7f7},
+      {0x9e9f11c4014dda7e, 0x2867e7fddcdd9afb},
+      {0xc646d63501a1511d, 0xb281e1fd541501b9},
+      {0xf7d88bc24209a565, 0x1f225a7ca91a4227},
+      {0x9ae757596946075f, 0x3375788de9b06959},
+      {0xc1a12d2fc3978937, 0x0052d6b1641c83af},
+      {0xf209787bb47d6b84, 0xc0678c5dbd23a49b},
+      {0x9745eb4d50ce6332, 0xf840b7ba963646e1},
+      {0xbd176620a501fbff, 0xb650e5a93bc3d899},
+      {0xec5d3fa8ce427aff, 0xa3e51f138ab4cebf},
+      {0x93ba47c980e98cdf, 0xc66f336c36b10138},
+      {0xb8a8d9bbe123f017, 0xb80b0047445d4185},
+      {0xe6d3102ad96cec1d, 0xa60dc059157491e6},
+      {0x9043ea1ac7e41392, 0x87c89837ad68db30},
+      {0xb454e4a179dd1877, 0x29babe4598c311fc},
+      {0xe16a1dc9d8545e94, 0xf4296dd6fef3d67b},
+      {0x8ce2529e2734bb1d, 0x1899e4a65f58660d},
+      {0xb01ae745b101e9e4, 0x5ec05dcff72e7f90},
+      {0xdc21a1171d42645d, 0x76707543f4fa1f74},
+      {0x899504ae72497eba, 0x6a06494a791c53a9},
+      {0xabfa45da0edbde69, 0x0487db9d17636893},
+      {0xd6f8d7509292d603, 0x45a9d2845d3c42b7},
+      {0x865b86925b9bc5c2, 0x0b8a2392ba45a9b3},
+      {0xa7f26836f282b732, 0x8e6cac7768d7141f},
+      {0xd1ef0244af2364ff, 0x3207d795430cd927},
+      {0x8335616aed761f1f, 0x7f44e6bd49e807b9},
+      {0xa402b9c5a8d3a6e7, 0x5f16206c9c6209a7},
+      {0xcd036837130890a1, 0x36dba887c37a8c10},
+      {0x802221226be55a64, 0xc2494954da2c978a},
+      {0xa02aa96b06deb0fd, 0xf2db9baa10b7bd6d},
+      {0xc83553c5c8965d3d, 0x6f92829494e5acc8},
+      {0xfa42a8b73abbf48c, 0xcb772339ba1f17fa},
+      {0x9c69a97284b578d7, 0xff2a760414536efc},
+      {0xc38413cf25e2d70d, 0xfef5138519684abb},
+      {0xf46518c2ef5b8cd1, 0x7eb258665fc25d6a},
+      {0x98bf2f79d5993802, 0xef2f773ffbd97a62},
+      {0xbeeefb584aff8603, 0xaafb550ffacfd8fb},
+      {0xeeaaba2e5dbf6784, 0x95ba2a53f983cf39},
+      {0x952ab45cfa97a0b2, 0xdd945a747bf26184},
+      {0xba756174393d88df, 0x94f971119aeef9e5},
+      {0xe912b9d1478ceb17, 0x7a37cd5601aab85e},
+      {0x91abb422ccb812ee, 0xac62e055c10ab33b},
+      {0xb616a12b7fe617aa, 0x577b986b314d600a},
+      {0xe39c49765fdf9d94, 0xed5a7e85fda0b80c},
+      {0x8e41ade9fbebc27d, 0x14588f13be847308},
+      {0xb1d219647ae6b31c, 0x596eb2d8ae258fc9},
+      {0xde469fbd99a05fe3, 0x6fca5f8ed9aef3bc},
+      {0x8aec23d680043bee, 0x25de7bb9480d5855},
+      {0xada72ccc20054ae9, 0xaf561aa79a10ae6b},
+      {0xd910f7ff28069da4, 0x1b2ba1518094da05},
+      {0x87aa9aff79042286, 0x90fb44d2f05d0843},
+      {0xa99541bf57452b28, 0x353a1607ac744a54},
+      {0xd3fa922f2d1675f2, 0x42889b8997915ce9},
+      {0x847c9b5d7c2e09b7, 0x69956135febada12},
+      {0xa59bc234db398c25, 0x43fab9837e699096},
+      {0xcf02b2c21207ef2e, 0x94f967e45e03f4bc},
+      {0x8161afb94b44f57d, 0x1d1be0eebac278f6},
+      {0xa1ba1ba79e1632dc, 0x6462d92a69731733},
+      {0xca28a291859bbf93, 0x7d7b8f7503cfdcff},
+      {0xfcb2cb35e702af78, 0x5cda735244c3d43f},
+      {0x9defbf01b061adab, 0x3a0888136afa64a8},
+      {0xc56baec21c7a1916, 0x088aaa1845b8fdd1},
+      {0xf6c69a72a3989f5b, 0x8aad549e57273d46},
+      {0x9a3c2087a63f6399, 0x36ac54e2f678864c},
+      {0xc0cb28a98fcf3c7f, 0x84576a1bb416a7de},
+      {0xf0fdf2d3f3c30b9f, 0x656d44a2a11c51d6},
+      {0x969eb7c47859e743, 0x9f644ae5a4b1b326},
+      {0xbc4665b596706114, 0x873d5d9f0dde1fef},
+      {0xeb57ff22fc0c7959, 0xa90cb506d155a7eb},
+      {0x9316ff75dd87cbd8, 0x09a7f12442d588f3},
+      {0xb7dcbf5354e9bece, 0x0c11ed6d538aeb30},
+      {0xe5d3ef282a242e81, 0x8f1668c8a86da5fb},
+      {0x8fa475791a569d10, 0xf96e017d694487bd},
+      {0xb38d92d760ec4455, 0x37c981dcc395a9ad},
+      {0xe070f78d3927556a, 0x85bbe253f47b1418},
+      {0x8c469ab843b89562, 0x93956d7478ccec8f},
+      {0xaf58416654a6babb, 0x387ac8d1970027b3},
+      {0xdb2e51bfe9d0696a, 0x06997b05fcc0319f},
+      {0x88fcf317f22241e2, 0x441fece3bdf81f04},
+      {0xab3c2fddeeaad25a, 0xd527e81cad7626c4},
+      {0xd60b3bd56a5586f1, 0x8a71e223d8d3b075},
+      {0x85c7056562757456, 0xf6872d5667844e4a},
+      {0xa738c6bebb12d16c, 0xb428f8ac016561dc},
+      {0xd106f86e69d785c7, 0xe13336d701beba53},
+      {0x82a45b450226b39c, 0xecc0024661173474},
+      {0xa34d721642b06084, 0x27f002d7f95d0191},
+      {0xcc20ce9bd35c78a5, 0x31ec038df7b441f5},
+      {0xff290242c83396ce, 0x7e67047175a15272},
+      {0x9f79a169bd203e41, 0x0f0062c6e984d387},
+      {0xc75809c42c684dd1, 0x52c07b78a3e60869},
+      {0xf92e0c3537826145, 0xa7709a56ccdf8a83},
+      {0x9bbcc7a142b17ccb, 0x88a66076400bb692},
+      {0xc2abf989935ddbfe, 0x6acff893d00ea436},
+      {0xf356f7ebf83552fe, 0x0583f6b8c4124d44},
+      {0x98165af37b2153de, 0xc3727a337a8b704b},
+      {0xbe1bf1b059e9a8d6, 0x744f18c0592e4c5d},
+      {0xeda2ee1c7064130c, 0x1162def06f79df74},
+      {0x9485d4d1c63e8be7, 0x8addcb5645ac2ba9},
+      {0xb9a74a0637ce2ee1, 0x6d953e2bd7173693},
+      {0xe8111c87c5c1ba99, 0xc8fa8db6ccdd0438},
+      {0x910ab1d4db9914a0, 0x1d9c9892400a22a3},
+      {0xb54d5e4a127f59c8, 0x2503beb6d00cab4c},
+      {0xe2a0b5dc971f303a, 0x2e44ae64840fd61e},
+      {0x8da471a9de737e24, 0x5ceaecfed289e5d3},
+      {0xb10d8e1456105dad, 0x7425a83e872c5f48},
+      {0xdd50f1996b947518, 0xd12f124e28f7771a},
+      {0x8a5296ffe33cc92f, 0x82bd6b70d99aaa70},
+      {0xace73cbfdc0bfb7b, 0x636cc64d1001550c},
+      {0xd8210befd30efa5a, 0x3c47f7e05401aa4f},
+      {0x8714a775e3e95c78, 0x65acfaec34810a72},
+      {0xa8d9d1535ce3b396, 0x7f1839a741a14d0e},
+      {0xd31045a8341ca07c, 0x1ede48111209a051},
+      {0x83ea2b892091e44d, 0x934aed0aab460433},
+      {0xa4e4b66b68b65d60, 0xf81da84d56178540},
+      {0xce1de40642e3f4b9, 0x36251260ab9d668f},
+      {0x80d2ae83e9ce78f3, 0xc1d72b7c6b42601a},
+      {0xa1075a24e4421730, 0xb24cf65b8612f820},
+      {0xc94930ae1d529cfc, 0xdee033f26797b628},
+      {0xfb9b7cd9a4a7443c, 0x169840ef017da3b2},
+      {0x9d412e0806e88aa5, 0x8e1f289560ee864f},
+      {0xc491798a08a2ad4e, 0xf1a6f2bab92a27e3},
+      {0xf5b5d7ec8acb58a2, 0xae10af696774b1dc},
+      {0x9991a6f3d6bf1765, 0xacca6da1e0a8ef2a},
+      {0xbff610b0cc6edd3f, 0x17fd090a58d32af4},
+      {0xeff394dcff8a948e, 0xddfc4b4cef07f5b1},
+      {0x95f83d0a1fb69cd9, 0x4abdaf101564f98f},
+      {0xbb764c4ca7a4440f, 0x9d6d1ad41abe37f2},
+      {0xea53df5fd18d5513, 0x84c86189216dc5ee},
+      {0x92746b9be2f8552c, 0x32fd3cf5b4e49bb5},
+      {0xb7118682dbb66a77, 0x3fbc8c33221dc2a2},
+      {0xe4d5e82392a40515, 0x0fabaf3feaa5334b},
+      {0x8f05b1163ba6832d, 0x29cb4d87f2a7400f},
+      {0xb2c71d5bca9023f8, 0x743e20e9ef511013},
+      {0xdf78e4b2bd342cf6, 0x914da9246b255417},
+      {0x8bab8eefb6409c1a, 0x1ad089b6c2f7548f},
+      {0xae9672aba3d0c320, 0xa184ac2473b529b2},
+      {0xda3c0f568cc4f3e8, 0xc9e5d72d90a2741f},
+      {0x8865899617fb1871, 0x7e2fa67c7a658893},
+      {0xaa7eebfb9df9de8d, 0xddbb901b98feeab8},
+      {0xd51ea6fa85785631, 0x552a74227f3ea566},
+      {0x8533285c936b35de, 0xd53a88958f872760},
+      {0xa67ff273b8460356, 0x8a892abaf368f138},
+      {0xd01fef10a657842c, 0x2d2b7569b0432d86},
+      {0x8213f56a67f6b29b, 0x9c3b29620e29fc74},
+      {0xa298f2c501f45f42, 0x8349f3ba91b47b90},
+      {0xcb3f2f7642717713, 0x241c70a936219a74},
+      {0xfe0efb53d30dd4d7, 0xed238cd383aa0111},
+      {0x9ec95d1463e8a506, 0xf4363804324a40ab},
+      {0xc67bb4597ce2ce48, 0xb143c6053edcd0d6},
+      {0xf81aa16fdc1b81da, 0xdd94b7868e94050b},
+      {0x9b10a4e5e9913128, 0xca7cf2b4191c8327},
+      {0xc1d4ce1f63f57d72, 0xfd1c2f611f63a3f1},
+      {0xf24a01a73cf2dccf, 0xbc633b39673c8ced},
+      {0x976e41088617ca01, 0xd5be0503e085d814},
+      {0xbd49d14aa79dbc82, 0x4b2d8644d8a74e19},
+      {0xec9c459d51852ba2, 0xddf8e7d60ed1219f},
+      {0x93e1ab8252f33b45, 0xcabb90e5c942b504},
+      {0xb8da1662e7b00a17, 0x3d6a751f3b936244},
+      {0xe7109bfba19c0c9d, 0x0cc512670a783ad5},
+      {0x906a617d450187e2, 0x27fb2b80668b24c6},
+      {0xb484f9dc9641e9da, 0xb1f9f660802dedf7},
+      {0xe1a63853bbd26451, 0x5e7873f8a0396974},
+      {0x8d07e33455637eb2, 0xdb0b487b6423e1e9},
+      {0xb049dc016abc5e5f, 0x91ce1a9a3d2cda63},
+      {0xdc5c5301c56b75f7, 0x7641a140cc7810fc},
+      {0x89b9b3e11b6329ba, 0xa9e904c87fcb0a9e},
+      {0xac2820d9623bf429, 0x546345fa9fbdcd45},
+      {0xd732290fbacaf133, 0xa97c177947ad4096},
+      {0x867f59a9d4bed6c0, 0x49ed8eabcccc485e},
+      {0xa81f301449ee8c70, 0x5c68f256bfff5a75},
+      {0xd226fc195c6a2f8c, 0x73832eec6fff3112},
+      {0x83585d8fd9c25db7, 0xc831fd53c5ff7eac},
+      {0xa42e74f3d032f525, 0xba3e7ca8b77f5e56},
+      {0xcd3a1230c43fb26f, 0x28ce1bd2e55f35ec},
+      {0x80444b5e7aa7cf85, 0x7980d163cf5b81b4},
+      {0xa0555e361951c366, 0xd7e105bcc3326220},
+      {0xc86ab5c39fa63440, 0x8dd9472bf3fefaa8},
+      {0xfa856334878fc150, 0xb14f98f6f0feb952},
+      {0x9c935e00d4b9d8d2, 0x6ed1bf9a569f33d4},
+      {0xc3b8358109e84f07, 0x0a862f80ec4700c9},
+      {0xf4a642e14c6262c8, 0xcd27bb612758c0fb},
+      {0x98e7e9cccfbd7dbd, 0x8038d51cb897789d},
+      {0xbf21e44003acdd2c, 0xe0470a63e6bd56c4},
+      {0xeeea5d5004981478, 0x1858ccfce06cac75},
+      {0x95527a5202df0ccb, 0x0f37801e0c43ebc9},
+      {0xbaa718e68396cffd, 0xd30560258f54e6bb},
+      {0xe950df20247c83fd, 0x47c6b82ef32a206a},
+      {0x91d28b7416cdd27e, 0x4cdc331d57fa5442},
+      {0xb6472e511c81471d, 0xe0133fe4adf8e953},
+      {0xe3d8f9e563a198e5, 0x58180fddd97723a7},
+      {0x8e679c2f5e44ff8f, 0x570f09eaa7ea7649},
+      {0xb201833b35d63f73, 0x2cd2cc6551e513db},
+      {0xde81e40a034bcf4f, 0xf8077f7ea65e58d2},
+      {0x8b112e86420f6191, 0xfb04afaf27faf783},
+      {0xadd57a27d29339f6, 0x79c5db9af1f9b564},
+      {0xd94ad8b1c7380874, 0x18375281ae7822bd},
+      {0x87cec76f1c830548, 0x8f2293910d0b15b6},
+      {0xa9c2794ae3a3c69a, 0xb2eb3875504ddb23},
+      {0xd433179d9c8cb841, 0x5fa60692a46151ec},
+      {0x849feec281d7f328, 0xdbc7c41ba6bcd334},
+      {0xa5c7ea73224deff3, 0x12b9b522906c0801},
+      {0xcf39e50feae16bef, 0xd768226b34870a01},
+      {0x81842f29f2cce375, 0xe6a1158300d46641},
+      {0xa1e53af46f801c53, 0x60495ae3c1097fd1},
+      {0xca5e89b18b602368, 0x385bb19cb14bdfc5},
+      {0xfcf62c1dee382c42, 0x46729e03dd9ed7b6},
+      {0x9e19db92b4e31ba9, 0x6c07a2c26a8346d2},
+      {0xc5a05277621be293, 0xc7098b7305241886},
+      {0xf70867153aa2db38, 0xb8cbee4fc66d1ea8},
+      {0x9a65406d44a5c903, 0x737f74f1dc043329},
+      {0xc0fe908895cf3b44, 0x505f522e53053ff3},
+      {0xf13e34aabb430a15, 0x647726b9e7c68ff0},
+      {0x96c6e0eab509e64d, 0x5eca783430dc19f6},
+      {0xbc789925624c5fe0, 0xb67d16413d132073},
+      {0xeb96bf6ebadf77d8, 0xe41c5bd18c57e890},
+      {0x933e37a534cbaae7, 0x8e91b962f7b6f15a},
+      {0xb80dc58e81fe95a1, 0x723627bbb5a4adb1},
+      {0xe61136f2227e3b09, 0xcec3b1aaa30dd91d},
+      {0x8fcac257558ee4e6, 0x213a4f0aa5e8a7b2},
+      {0xb3bd72ed2af29e1f, 0xa988e2cd4f62d19e},
+      {0xe0accfa875af45a7, 0x93eb1b80a33b8606},
+      {0x8c6c01c9498d8b88, 0xbc72f130660533c4},
+      {0xaf87023b9bf0ee6a, 0xeb8fad7c7f8680b5},
+      {0xdb68c2ca82ed2a05, 0xa67398db9f6820e2},
+#else
+      {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b},
+      {0xce5d73ff402d98e3, 0xfb0a3d212dc81290},
+      {0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481f},
+      {0x86a8d39ef77164bc, 0xae5dff9c02033198},
+      {0xd98ddaee19068c76, 0x3badd624dd9b0958},
+      {0xafbd2350644eeacf, 0xe5d1929ef90898fb},
+      {0x8df5efabc5979c8f, 0xca8d3ffa1ef463c2},
+      {0xe55990879ddcaabd, 0xcc420a6a101d0516},
+      {0xb94470938fa89bce, 0xf808e40e8d5b3e6a},
+      {0x95a8637627989aad, 0xdde7001379a44aa9},
+      {0xf1c90080baf72cb1, 0x5324c68b12dd6339},
+      {0xc350000000000000, 0x0000000000000000},
+      {0x9dc5ada82b70b59d, 0xf020000000000000},
+      {0xfee50b7025c36a08, 0x02f236d04753d5b5},
+      {0xcde6fd5e09abcf26, 0xed4c0226b55e6f87},
+      {0xa6539930bf6bff45, 0x84db8346b786151d},
+      {0x865b86925b9bc5c2, 0x0b8a2392ba45a9b3},
+      {0xd910f7ff28069da4, 0x1b2ba1518094da05},
+      {0xaf58416654a6babb, 0x387ac8d1970027b3},
+      {0x8da471a9de737e24, 0x5ceaecfed289e5d3},
+      {0xe4d5e82392a40515, 0x0fabaf3feaa5334b},
+      {0xb8da1662e7b00a17, 0x3d6a751f3b936244},
+      {0x95527a5202df0ccb, 0x0f37801e0c43ebc9},
+      {0xf13e34aabb430a15, 0x647726b9e7c68ff0}
+#endif
+    };
+
+#if FMT_USE_FULL_CACHE_DRAGONBOX
+    return pow10_significands[k - float_info<double>::min_k];
+#else
+    static constexpr const uint64_t powers_of_5_64[] = {
+        0x0000000000000001, 0x0000000000000005, 0x0000000000000019,
+        0x000000000000007d, 0x0000000000000271, 0x0000000000000c35,
+        0x0000000000003d09, 0x000000000001312d, 0x000000000005f5e1,
+        0x00000000001dcd65, 0x00000000009502f9, 0x0000000002e90edd,
+        0x000000000e8d4a51, 0x0000000048c27395, 0x000000016bcc41e9,
+        0x000000071afd498d, 0x0000002386f26fc1, 0x000000b1a2bc2ec5,
+        0x000003782dace9d9, 0x00001158e460913d, 0x000056bc75e2d631,
+        0x0001b1ae4d6e2ef5, 0x000878678326eac9, 0x002a5a058fc295ed,
+        0x00d3c21bcecceda1, 0x0422ca8b0a00a425, 0x14adf4b7320334b9};
+
+    static const int compression_ratio = 27;
+
+    // Compute base index.
+    int cache_index = (k - float_info<double>::min_k) / compression_ratio;
+    int kb = cache_index * compression_ratio + float_info<double>::min_k;
+    int offset = k - kb;
+
+    // Get base cache.
+    uint128_fallback base_cache = pow10_significands[cache_index];
+    if (offset == 0) return base_cache;
+
+    // Compute the required amount of bit-shift.
+    int alpha = floor_log2_pow10(kb + offset) - floor_log2_pow10(kb) - offset;
+    FMT_ASSERT(alpha > 0 && alpha < 64, "shifting error detected");
+
+    // Try to recover the real cache.
+    uint64_t pow5 = powers_of_5_64[offset];
+    uint128_fallback recovered_cache = umul128(base_cache.high(), pow5);
+    uint128_fallback middle_low = umul128(base_cache.low(), pow5);
+
+    recovered_cache += middle_low.high();
+
+    uint64_t high_to_middle = recovered_cache.high() << (64 - alpha);
+    uint64_t middle_to_low = recovered_cache.low() << (64 - alpha);
+
+    recovered_cache =
+        uint128_fallback{(recovered_cache.low() >> alpha) | high_to_middle,
+                         ((middle_low.low() >> alpha) | middle_to_low)};
+    FMT_ASSERT(recovered_cache.low() + 1 != 0, "");
+    return {recovered_cache.high(), recovered_cache.low() + 1};
+#endif
+  }
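+  // Worked example of the compressed-cache branch above (illustration only):
+  // with compression_ratio = 27, only every 27th power of ten is stored, so
+  // k = float_info<double>::min_k + 30 maps to cache_index = 1, kb = min_k + 27
+  // and offset = 3; the entry is then rebuilt from pow10_significands[1]
+  // multiplied by 5^3 (powers_of_5_64[3] == 0x7d), renormalized by the
+  // alpha-bit shift and rounded up in the low 64 bits.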
+
+  struct compute_mul_result {
+    carrier_uint result;
+    bool is_integer;
+  };
+  struct compute_mul_parity_result {
+    bool parity;
+    bool is_integer;
+  };
+
+  static auto compute_mul(carrier_uint u,
+                          const cache_entry_type& cache) noexcept
+      -> compute_mul_result {
+    auto r = umul192_upper128(u, cache);
+    return {r.high(), r.low() == 0};
+  }
+
+  static auto compute_delta(cache_entry_type const& cache, int beta) noexcept
+      -> uint32_t {
+    return static_cast<uint32_t>(cache.high() >> (64 - 1 - beta));
+  }
+
+  static auto compute_mul_parity(carrier_uint two_f,
+                                 const cache_entry_type& cache,
+                                 int beta) noexcept
+      -> compute_mul_parity_result {
+    FMT_ASSERT(beta >= 1, "");
+    FMT_ASSERT(beta < 64, "");
+
+    auto r = umul192_lower128(two_f, cache);
+    return {((r.high() >> (64 - beta)) & 1) != 0,
+            ((r.high() << beta) | (r.low() >> (64 - beta))) == 0};
+  }
+
+  static auto compute_left_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
+    return (cache.high() -
+            (cache.high() >> (num_significand_bits<double>() + 2))) >>
+           (64 - num_significand_bits<double>() - 1 - beta);
+  }
+
+  static auto compute_right_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
+    return (cache.high() +
+            (cache.high() >> (num_significand_bits<double>() + 1))) >>
+           (64 - num_significand_bits<double>() - 1 - beta);
+  }
+
+  static auto compute_round_up_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
+    return ((cache.high() >> (64 - num_significand_bits<double>() - 2 - beta)) +
+            1) /
+           2;
+  }
+};
+
+FMT_FUNC auto get_cached_power(int k) noexcept -> uint128_fallback {
+  return cache_accessor<double>::get_cached_power(k);
+}
+
+// Various integer checks
+template <typename T>
+auto is_left_endpoint_integer_shorter_interval(int exponent) noexcept -> bool {
+  const int case_shorter_interval_left_endpoint_lower_threshold = 2;
+  const int case_shorter_interval_left_endpoint_upper_threshold = 3;
+  return exponent >= case_shorter_interval_left_endpoint_lower_threshold &&
+         exponent <= case_shorter_interval_left_endpoint_upper_threshold;
+}
+
+// Remove trailing zeros from n and return the number of zeros removed (float)
+FMT_INLINE int remove_trailing_zeros(uint32_t& n, int s = 0) noexcept {
+  FMT_ASSERT(n != 0, "");
+  // Modular inverse of 5 (mod 2^32): (mod_inv_5 * 5) mod 2^32 = 1.
+  constexpr uint32_t mod_inv_5 = 0xcccccccd;
+  constexpr uint32_t mod_inv_25 = 0xc28f5c29;  // = mod_inv_5 * mod_inv_5
+
+  while (true) {
+    auto q = rotr(n * mod_inv_25, 2);
+    if (q > max_value<uint32_t>() / 100) break;
+    n = q;
+    s += 2;
+  }
+  auto q = rotr(n * mod_inv_5, 1);
+  if (q <= max_value<uint32_t>() / 10) {
+    n = q;
+    s |= 1;
+  }
+  return s;
+}
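+// For illustration: with n = 1200, n * mod_inv_25 (mod 2^32) equals 48 and
+// rotr(48, 2) == 12 == 1200 / 100, so one pass of the loop strips two zeros;
+// when n is not a multiple of 100, the rotated product is guaranteed to exceed
+// max_value<uint32_t>() / 100 and the loop stops.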
+
+// Removes trailing zeros and returns the number of zeros removed (double)
+FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept {
+  FMT_ASSERT(n != 0, "");
+
+  // This magic number is ceil(2^90 / 10^8).
+  constexpr uint64_t magic_number = 12379400392853802749ull;
+  auto nm = umul128(n, magic_number);
+
+  // Is n divisible by 10^8?
+  if ((nm.high() & ((1ull << (90 - 64)) - 1)) == 0 && nm.low() < magic_number) {
+    // If yes, work with the quotient...
+    auto n32 = static_cast<uint32_t>(nm.high() >> (90 - 64));
+    // ... and use the 32 bit variant of the function
+    int s = remove_trailing_zeros(n32, 8);
+    n = n32;
+    return s;
+  }
+
+  // If n is not divisible by 10^8, work with n itself.
+  constexpr uint64_t mod_inv_5 = 0xcccccccccccccccd;
+  constexpr uint64_t mod_inv_25 = 0x8f5c28f5c28f5c29;  // mod_inv_5 * mod_inv_5
+
+  int s = 0;
+  while (true) {
+    auto q = rotr(n * mod_inv_25, 2);
+    if (q > max_value<uint64_t>() / 100) break;
+    n = q;
+    s += 2;
+  }
+  auto q = rotr(n * mod_inv_5, 1);
+  if (q <= max_value<uint64_t>() / 10) {
+    n = q;
+    s |= 1;
+  }
+
+  return s;
+}
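+// For illustration: n = 1230000000000 is a multiple of 10^8, so the test above
+// passes, the quotient 12300 is handed to the 32-bit variant with s = 8, which
+// strips two more zeros; the function returns 10 with n set to 123
+// (1.23e12 == 123 * 10^10).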
+
+// The main algorithm for shorter interval case
+template <typename T>
+FMT_INLINE decimal_fp<T> shorter_interval_case(int exponent) noexcept {
+  decimal_fp<T> ret_value;
+  // Compute k and beta
+  const int minus_k = floor_log10_pow2_minus_log10_4_over_3(exponent);
+  const int beta = exponent + floor_log2_pow10(-minus_k);
+
+  // Compute xi and zi
+  using cache_entry_type = typename cache_accessor<T>::cache_entry_type;
+  const cache_entry_type cache = cache_accessor<T>::get_cached_power(-minus_k);
+
+  auto xi = cache_accessor<T>::compute_left_endpoint_for_shorter_interval_case(
+      cache, beta);
+  auto zi = cache_accessor<T>::compute_right_endpoint_for_shorter_interval_case(
+      cache, beta);
+
+  // If the left endpoint is not an integer, increase it
+  if (!is_left_endpoint_integer_shorter_interval<T>(exponent)) ++xi;
+
+  // Try bigger divisor
+  ret_value.significand = zi / 10;
+
+  // If it succeeds, remove trailing zeros if necessary and return
+  if (ret_value.significand * 10 >= xi) {
+    ret_value.exponent = minus_k + 1;
+    ret_value.exponent += remove_trailing_zeros(ret_value.significand);
+    return ret_value;
+  }
+
+  // Otherwise, compute the round-up of y
+  ret_value.significand =
+      cache_accessor<T>::compute_round_up_for_shorter_interval_case(cache,
+                                                                    beta);
+  ret_value.exponent = minus_k;
+
+  // When tie occurs, choose one of them according to the rule
+  if (exponent >= float_info<T>::shorter_interval_tie_lower_threshold &&
+      exponent <= float_info<T>::shorter_interval_tie_upper_threshold) {
+    ret_value.significand = ret_value.significand % 2 == 0
+                                ? ret_value.significand
+                                : ret_value.significand - 1;
+  } else if (ret_value.significand < xi) {
+    ++ret_value.significand;
+  }
+  return ret_value;
+}
+
+template <typename T> auto to_decimal(T x) noexcept -> decimal_fp<T> {
+  // Step 1: integer promotion & Schubfach multiplier calculation.
+
+  using carrier_uint = typename float_info<T>::carrier_uint;
+  using cache_entry_type = typename cache_accessor<T>::cache_entry_type;
+  auto br = bit_cast<carrier_uint>(x);
+
+  // Extract significand bits and exponent bits.
+  const carrier_uint significand_mask =
+      (static_cast<carrier_uint>(1) << num_significand_bits<T>()) - 1;
+  carrier_uint significand = (br & significand_mask);
+  int exponent =
+      static_cast<int>((br & exponent_mask<T>()) >> num_significand_bits<T>());
+
+  if (exponent != 0) {  // Check if normal.
+    exponent -= exponent_bias<T>() + num_significand_bits<T>();
+
+    // Shorter interval case; proceed like Schubfach.
+    // In fact, when exponent == 1 and significand == 0, the interval is
+    // regular. However, it can be shown that the end-results are anyway same.
+    if (significand == 0) return shorter_interval_case<T>(exponent);
+
+    significand |= (static_cast<carrier_uint>(1) << num_significand_bits<T>());
+  } else {
+    // Subnormal case; the interval is always regular.
+    if (significand == 0) return {0, 0};
+    exponent =
+        std::numeric_limits<T>::min_exponent - num_significand_bits<T>() - 1;
+  }
+
+  const bool include_left_endpoint = (significand % 2 == 0);
+  const bool include_right_endpoint = include_left_endpoint;
+
+  // Compute k and beta.
+  const int minus_k = floor_log10_pow2(exponent) - float_info<T>::kappa;
+  const cache_entry_type cache = cache_accessor<T>::get_cached_power(-minus_k);
+  const int beta = exponent + floor_log2_pow10(-minus_k);
+
+  // Compute zi and deltai.
+  // 10^kappa <= deltai < 10^(kappa + 1)
+  const uint32_t deltai = cache_accessor<T>::compute_delta(cache, beta);
+  const carrier_uint two_fc = significand << 1;
+
+  // For the case of binary32, the result of integer check is not correct for
+  // 29711844 * 2^-82
+  // = 6.1442653300000000008655037797566933477355632930994033813476... * 10^-18
+  // and 29711844 * 2^-81
+  // = 1.2288530660000000001731007559513386695471126586198806762695... * 10^-17,
+  // and they are the unique counterexamples. However, since 29711844 is even,
+  // this does not cause any problem for the endpoints calculations; it can only
+  // cause a problem when we need to perform integer check for the center.
+  // Fortunately, with these inputs, that branch is never executed, so we are
+  // fine.
+  const typename cache_accessor<T>::compute_mul_result z_mul =
+      cache_accessor<T>::compute_mul((two_fc | 1) << beta, cache);
+
+  // Step 2: Try larger divisor; remove trailing zeros if necessary.
+
+  // Using an upper bound on zi, we might be able to optimize the division
+  // better than the compiler; we are computing zi / big_divisor here.
+  decimal_fp ret_value;
+  ret_value.significand = divide_by_10_to_kappa_plus_1<T>(z_mul.result);
+  uint32_t r = static_cast<uint32_t>(z_mul.result - float_info<T>::big_divisor *
+                                                        ret_value.significand);
+
+  if (r < deltai) {
+    // Exclude the right endpoint if necessary.
+    if (r == 0 && (z_mul.is_integer & !include_right_endpoint)) {
+      --ret_value.significand;
+      r = float_info<T>::big_divisor;
+      goto small_divisor_case_label;
+    }
+  } else if (r > deltai) {
+    goto small_divisor_case_label;
+  } else {
+    // r == deltai; compare fractional parts.
+    const typename cache_accessor<T>::compute_mul_parity_result x_mul =
+        cache_accessor<T>::compute_mul_parity(two_fc - 1, cache, beta);
+
+    if (!(x_mul.parity | (x_mul.is_integer & include_left_endpoint)))
+      goto small_divisor_case_label;
+  }
+  ret_value.exponent = minus_k + float_info<T>::kappa + 1;
+
+  // We may need to remove trailing zeros.
+  ret_value.exponent += remove_trailing_zeros(ret_value.significand);
+  return ret_value;
+
+  // Step 3: Find the significand with the smaller divisor.
+
+small_divisor_case_label:
+  ret_value.significand *= 10;
+  ret_value.exponent = minus_k + float_info<T>::kappa;
+
+  uint32_t dist = r - (deltai / 2) + (float_info<T>::small_divisor / 2);
+  const bool approx_y_parity =
+      ((dist ^ (float_info<T>::small_divisor / 2)) & 1) != 0;
+
+  // Is dist divisible by 10^kappa?
+  const bool divisible_by_small_divisor =
+      check_divisibility_and_divide_by_pow10<float_info<T>::kappa>(dist);
+
+  // Add dist / 10^kappa to the significand.
+  ret_value.significand += dist;
+
+  if (!divisible_by_small_divisor) return ret_value;
+
+  // Check z^(f) >= epsilon^(f).
+  // We have either yi == zi - epsiloni or yi == (zi - epsiloni) - 1,
+  // where yi == zi - epsiloni if and only if z^(f) >= epsilon^(f).
+  // Since there are only 2 possibilities, we only need to care about the
+  // parity. Also, zi and r should have the same parity since the divisor
+  // is an even number.
+  const auto y_mul = cache_accessor<T>::compute_mul_parity(two_fc, cache, beta);
+
+  // If z^(f) >= epsilon^(f), we might have a tie when z^(f) == epsilon^(f),
+  // or equivalently, when y is an integer.
+  if (y_mul.parity != approx_y_parity)
+    --ret_value.significand;
+  else if (y_mul.is_integer & (ret_value.significand % 2 != 0))
+    --ret_value.significand;
+  return ret_value;
+}
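+// Usage sketch (illustration): for a finite non-zero x, to_decimal(x) returns
+// the shortest round-trip decimal representation as significand * 10^exponent
+// with trailing zeros removed; e.g. to_decimal(1.25) should yield
+// {significand = 125, exponent = -2}.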
+}  // namespace dragonbox
+}  // namespace detail
+
+template <> struct formatter<detail::bigint> {
+  FMT_CONSTEXPR auto parse(format_parse_context& ctx)
+      -> format_parse_context::iterator {
+    return ctx.begin();
+  }
+
+  auto format(const detail::bigint& n, format_context& ctx) const
+      -> format_context::iterator {
+    auto out = ctx.out();
+    bool first = true;
+    for (auto i = n.bigits_.size(); i > 0; --i) {
+      auto value = n.bigits_[i - 1u];
+      if (first) {
+        out = fmt::format_to(out, FMT_STRING("{:x}"), value);
+        first = false;
+        continue;
+      }
+      out = fmt::format_to(out, FMT_STRING("{:08x}"), value);
+    }
+    if (n.exp_ > 0)
+      out = fmt::format_to(out, FMT_STRING("p{}"),
+                           n.exp_ * detail::bigint::bigit_bits);
+    return out;
+  }
+};
+
+FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
+  for_each_codepoint(s, [this](uint32_t cp, string_view) {
+    if (cp == invalid_code_point) FMT_THROW(std::runtime_error("invalid utf8"));
+    if (cp <= 0xFFFF) {
+      buffer_.push_back(static_cast<wchar_t>(cp));
+    } else {
+      cp -= 0x10000;
+      buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
+      buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
+    }
+    return true;
+  });
+  buffer_.push_back(0);
+}
+
+FMT_FUNC void format_system_error(detail::buffer<char>& out, int error_code,
+                                  const char* message) noexcept {
+  FMT_TRY {
+    auto ec = std::error_code(error_code, std::generic_category());
+    detail::write(appender(out), std::system_error(ec, message).what());
+    return;
+  }
+  FMT_CATCH(...) {}
+  format_error_code(out, error_code, message);
+}
+
+FMT_FUNC void report_system_error(int error_code,
+                                  const char* message) noexcept {
+  report_error(format_system_error, error_code, message);
+}
+
+FMT_FUNC auto vformat(string_view fmt, format_args args) -> std::string {
+  // Don't optimize the "{}" case to keep the binary size small and because it
+  // can be better optimized in fmt::format anyway.
+  auto buffer = memory_buffer();
+  detail::vformat_to(buffer, fmt, args);
+  return to_string(buffer);
+}
+
+namespace detail {
+
+template <typename T> struct span {
+  T* data;
+  size_t size;
+};
+
+template <typename F> auto flockfile(F* f) -> decltype(_lock_file(f)) {
+  _lock_file(f);
+}
+template <typename F> auto funlockfile(F* f) -> decltype(_unlock_file(f)) {
+  _unlock_file(f);
+}
+
+#ifndef getc_unlocked
+template <typename F> auto getc_unlocked(F* f) -> decltype(_fgetc_nolock(f)) {
+  return _fgetc_nolock(f);
+}
+#endif
+
+template <typename F = FILE, typename Enable = void>
+struct has_flockfile : std::false_type {};
+
+template <typename F>
+struct has_flockfile<F, void_t<decltype(flockfile(&std::declval<F&>()))>>
+    : std::true_type {};
+
+// A FILE wrapper. F is FILE defined as a template parameter to make system API
+// detection work.
+template <typename F> class file_base {
+ public:
+  F* file_;
+
+ public:
+  file_base(F* file) : file_(file) {}
+  operator F*() const { return file_; }
+
+  // Reads a code unit from the stream.
+  auto get() -> int {
+    int result = getc_unlocked(file_);
+    if (result == EOF && ferror(file_) != 0)
+      FMT_THROW(system_error(errno, FMT_STRING("getc failed")));
+    return result;
+  }
+
+  // Puts the code unit back into the stream buffer.
+  void unget(char c) {
+    if (ungetc(c, file_) == EOF)
+      FMT_THROW(system_error(errno, FMT_STRING("ungetc failed")));
+  }
+
+  void flush() { fflush(this->file_); }
+};
+
+// A FILE wrapper for glibc.
+template <typename F> class glibc_file : public file_base<F> {
+ private:
+  enum {
+    line_buffered = 0x200,  // _IO_LINE_BUF
+    unbuffered = 2          // _IO_UNBUFFERED
+  };
+
+ public:
+  using file_base<F>::file_base;
+
+  auto is_buffered() const -> bool {
+    return (this->file_->_flags & unbuffered) == 0;
+  }
+
+  void init_buffer() {
+    if (this->file_->_IO_write_ptr) return;
+    // Force buffer initialization by placing and removing a char in a buffer.
+    putc_unlocked(0, this->file_);
+    --this->file_->_IO_write_ptr;
+  }
+
+  // Returns the file's read buffer.
+  auto get_read_buffer() const -> span<const char> {
+    auto ptr = this->file_->_IO_read_ptr;
+    return {ptr, to_unsigned(this->file_->_IO_read_end - ptr)};
+  }
+
+  // Returns the file's write buffer.
+  auto get_write_buffer() const -> span<char> {
+    auto ptr = this->file_->_IO_write_ptr;
+    return {ptr, to_unsigned(this->file_->_IO_buf_end - ptr)};
+  }
+
+  void advance_write_buffer(size_t size) { this->file_->_IO_write_ptr += size; }
+
+  bool needs_flush() const {
+    if ((this->file_->_flags & line_buffered) == 0) return false;
+    char* end = this->file_->_IO_write_end;
+    return memchr(end, '\n', to_unsigned(this->file_->_IO_write_ptr - end));
+  }
+
+  void flush() { fflush_unlocked(this->file_); }
+};
+
+// A FILE wrapper for Apple's libc.
+template <typename F> class apple_file : public file_base<F> {
+ private:
+  enum {
+    line_buffered = 1,  // __SNBF
+    unbuffered = 2      // __SLBF
+  };
+
+ public:
+  using file_base<F>::file_base;
+
+  auto is_buffered() const -> bool {
+    return (this->file_->_flags & unbuffered) == 0;
+  }
+
+  void init_buffer() {
+    if (this->file_->_p) return;
+    // Force buffer initialization by placing and removing a char in a buffer.
+    putc_unlocked(0, this->file_);
+    --this->file_->_p;
+    ++this->file_->_w;
+  }
+
+  auto get_read_buffer() const -> span<const char> {
+    return {reinterpret_cast<char*>(this->file_->_p),
+            to_unsigned(this->file_->_r)};
+  }
+
+  auto get_write_buffer() const -> span<char> {
+    return {reinterpret_cast<char*>(this->file_->_p),
+            to_unsigned(this->file_->_bf._base + this->file_->_bf._size -
+                        this->file_->_p)};
+  }
+
+  void advance_write_buffer(size_t size) {
+    this->file_->_p += size;
+    this->file_->_w -= size;
+  }
+
+  bool needs_flush() const {
+    if ((this->file_->_flags & line_buffered) == 0) return false;
+    return memchr(this->file_->_p + this->file_->_w, '\n',
+                  to_unsigned(-this->file_->_w));
+  }
+};
+
+// A fallback FILE wrapper.
+template <typename F> class fallback_file : public file_base<F> {
+ private:
+  char next_;  // The next unconsumed character in the buffer.
+  bool has_next_ = false;
+
+ public:
+  using file_base<F>::file_base;
+
+  auto is_buffered() const -> bool { return false; }
+  auto needs_flush() const -> bool { return false; }
+  void init_buffer() {}
+
+  auto get_read_buffer() const -> span<const char> {
+    return {&next_, has_next_ ? 1u : 0u};
+  }
+
+  auto get_write_buffer() const -> span<char> { return {nullptr, 0}; }
+
+  void advance_write_buffer(size_t) {}
+
+  auto get() -> int {
+    has_next_ = false;
+    return file_base<F>::get();
+  }
+
+  void unget(char c) {
+    file_base<F>::unget(c);
+    next_ = c;
+    has_next_ = true;
+  }
+};
+
+#ifndef FMT_USE_FALLBACK_FILE
+#  define FMT_USE_FALLBACK_FILE 1
+#endif
+
+template <typename F,
+          FMT_ENABLE_IF(sizeof(F::_p) == sizeof(char*) &&
+                        !FMT_USE_FALLBACK_FILE)>
+auto get_file(F* f, int) -> apple_file<F> {
+  return f;
+}
+template <typename F,
+          FMT_ENABLE_IF(sizeof(F::_IO_read_ptr) == sizeof(char*) &&
+                        !FMT_USE_FALLBACK_FILE)>
+inline auto get_file(F* f, int) -> glibc_file<F> {
+  return f;
+}
+
+inline auto get_file(FILE* f, ...) -> fallback_file<FILE> { return f; }
+
+using file_ref = decltype(get_file(static_cast<FILE*>(nullptr), 0));
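+// Note (illustration): get_file(f, 0) prefers the non-variadic overloads when
+// their constraints are satisfied and otherwise falls back to the variadic
+// overload, so file_ref resolves to the most specific FILE wrapper the C
+// library (and FMT_USE_FALLBACK_FILE) allows.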
+
+template <typename F = FILE, typename Enable = void>
+class file_print_buffer : public buffer<char> {
+ public:
+  explicit file_print_buffer(F*) : buffer(nullptr, size_t()) {}
+};
+
+template <typename F>
+class file_print_buffer<F, enable_if_t<has_flockfile<F>::value>>
+    : public buffer<char> {
+ private:
+  file_ref file_;
+
+  static void grow(buffer<char>& base, size_t) {
+    auto& self = static_cast<file_print_buffer&>(base);
+    self.file_.advance_write_buffer(self.size());
+    if (self.file_.get_write_buffer().size == 0) self.file_.flush();
+    auto buf = self.file_.get_write_buffer();
+    FMT_ASSERT(buf.size > 0, "");
+    self.set(buf.data, buf.size);
+    self.clear();
+  }
+
+ public:
+  explicit file_print_buffer(F* f) : buffer(grow, size_t()), file_(f) {
+    flockfile(f);
+    file_.init_buffer();
+    auto buf = file_.get_write_buffer();
+    set(buf.data, buf.size);
+  }
+  ~file_print_buffer() {
+    file_.advance_write_buffer(size());
+    bool flush = file_.needs_flush();
+    F* f = file_;    // Make funlockfile depend on the template parameter F
+    funlockfile(f);  // for the system API detection to work.
+    if (flush) fflush(file_);
+  }
+};
+
+#if !defined(_WIN32) || defined(FMT_USE_WRITE_CONSOLE)
+FMT_FUNC auto write_console(int, string_view) -> bool { return false; }
+#else
+using dword = conditional_t<sizeof(long) == 4, unsigned long, unsigned>;
+extern "C" __declspec(dllimport) int __stdcall WriteConsoleW(  //
+    void*, const void*, dword, dword*, void*);
+
+FMT_FUNC bool write_console(int fd, string_view text) {
+  auto u16 = utf8_to_utf16(text);
+  return WriteConsoleW(reinterpret_cast<void*>(_get_osfhandle(fd)), u16.c_str(),
+                       static_cast<dword>(u16.size()), nullptr, nullptr) != 0;
+}
+#endif
+
+#ifdef _WIN32
+// Print assuming legacy (non-Unicode) encoding.
+FMT_FUNC void vprint_mojibake(std::FILE* f, string_view fmt, format_args args,
+                              bool newline) {
+  auto buffer = memory_buffer();
+  detail::vformat_to(buffer, fmt, args);
+  if (newline) buffer.push_back('\n');
+  fwrite_fully(buffer.data(), buffer.size(), f);
+}
+#endif
+
+FMT_FUNC void print(std::FILE* f, string_view text) {
+#if defined(_WIN32) && !defined(FMT_USE_WRITE_CONSOLE)
+  int fd = _fileno(f);
+  if (_isatty(fd)) {
+    std::fflush(f);
+    if (write_console(fd, text)) return;
+  }
+#endif
+  fwrite_fully(text.data(), text.size(), f);
+}
+}  // namespace detail
+
+FMT_FUNC void vprint_buffered(std::FILE* f, string_view fmt, format_args args) {
+  auto buffer = memory_buffer();
+  detail::vformat_to(buffer, fmt, args);
+  detail::print(f, {buffer.data(), buffer.size()});
+}
+
+FMT_FUNC void vprint(std::FILE* f, string_view fmt, format_args args) {
+  if (!detail::file_ref(f).is_buffered() || !detail::has_flockfile<>())
+    return vprint_buffered(f, fmt, args);
+  auto&& buffer = detail::file_print_buffer<>(f);
+  return detail::vformat_to(buffer, fmt, args);
+}
+
+FMT_FUNC void vprintln(std::FILE* f, string_view fmt, format_args args) {
+  auto buffer = memory_buffer();
+  detail::vformat_to(buffer, fmt, args);
+  buffer.push_back('\n');
+  detail::print(f, {buffer.data(), buffer.size()});
+}
+
+FMT_FUNC void vprint(string_view fmt, format_args args) {
+  vprint(stdout, fmt, args);
+}
+
+namespace detail {
+
+struct singleton {
+  unsigned char upper;
+  unsigned char lower_count;
+};
+
+inline auto is_printable(uint16_t x, const singleton* singletons,
+                         size_t singletons_size,
+                         const unsigned char* singleton_lowers,
+                         const unsigned char* normal, size_t normal_size)
+    -> bool {
+  auto upper = x >> 8;
+  auto lower_start = 0;
+  for (size_t i = 0; i < singletons_size; ++i) {
+    auto s = singletons[i];
+    auto lower_end = lower_start + s.lower_count;
+    if (upper < s.upper) break;
+    if (upper == s.upper) {
+      for (auto j = lower_start; j < lower_end; ++j) {
+        if (singleton_lowers[j] == (x & 0xff)) return false;
+      }
+    }
+    lower_start = lower_end;
+  }
+
+  auto xsigned = static_cast<int>(x);
+  auto current = true;
+  for (size_t i = 0; i < normal_size; ++i) {
+    auto v = static_cast<int>(normal[i]);
+    auto len = (v & 0x80) != 0 ? (v & 0x7f) << 8 | normal[++i] : v;
+    xsigned -= len;
+    if (xsigned < 0) break;
+    current = !current;
+  }
+  return current;
+}
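+// How the tables below are read (illustration): `normal0`/`normal1` are
+// run-length encodings of alternating non-printable/printable ranges over a
+// 16-bit block of code points; normal0 begins 0x00, 0x20, 0x5f, i.e. an empty
+// run, then 32 non-printable code points (U+0000..U+001F), then 95 printable
+// ones (U+0020..U+007E). The `singletons` tables list isolated non-printable
+// code points as (upper byte, count) pairs whose low bytes are stored in the
+// corresponding *_lower arrays.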
+
+// This code is generated by support/printable.py.
+FMT_FUNC auto is_printable(uint32_t cp) -> bool {
+  static constexpr singleton singletons0[] = {
+      {0x00, 1},  {0x03, 5},  {0x05, 6},  {0x06, 3},  {0x07, 6},  {0x08, 8},
+      {0x09, 17}, {0x0a, 28}, {0x0b, 25}, {0x0c, 20}, {0x0d, 16}, {0x0e, 13},
+      {0x0f, 4},  {0x10, 3},  {0x12, 18}, {0x13, 9},  {0x16, 1},  {0x17, 5},
+      {0x18, 2},  {0x19, 3},  {0x1a, 7},  {0x1c, 2},  {0x1d, 1},  {0x1f, 22},
+      {0x20, 3},  {0x2b, 3},  {0x2c, 2},  {0x2d, 11}, {0x2e, 1},  {0x30, 3},
+      {0x31, 2},  {0x32, 1},  {0xa7, 2},  {0xa9, 2},  {0xaa, 4},  {0xab, 8},
+      {0xfa, 2},  {0xfb, 5},  {0xfd, 4},  {0xfe, 3},  {0xff, 9},
+  };
+  static constexpr unsigned char singletons0_lower[] = {
+      0xad, 0x78, 0x79, 0x8b, 0x8d, 0xa2, 0x30, 0x57, 0x58, 0x8b, 0x8c, 0x90,
+      0x1c, 0x1d, 0xdd, 0x0e, 0x0f, 0x4b, 0x4c, 0xfb, 0xfc, 0x2e, 0x2f, 0x3f,
+      0x5c, 0x5d, 0x5f, 0xb5, 0xe2, 0x84, 0x8d, 0x8e, 0x91, 0x92, 0xa9, 0xb1,
+      0xba, 0xbb, 0xc5, 0xc6, 0xc9, 0xca, 0xde, 0xe4, 0xe5, 0xff, 0x00, 0x04,
+      0x11, 0x12, 0x29, 0x31, 0x34, 0x37, 0x3a, 0x3b, 0x3d, 0x49, 0x4a, 0x5d,
+      0x84, 0x8e, 0x92, 0xa9, 0xb1, 0xb4, 0xba, 0xbb, 0xc6, 0xca, 0xce, 0xcf,
+      0xe4, 0xe5, 0x00, 0x04, 0x0d, 0x0e, 0x11, 0x12, 0x29, 0x31, 0x34, 0x3a,
+      0x3b, 0x45, 0x46, 0x49, 0x4a, 0x5e, 0x64, 0x65, 0x84, 0x91, 0x9b, 0x9d,
+      0xc9, 0xce, 0xcf, 0x0d, 0x11, 0x29, 0x45, 0x49, 0x57, 0x64, 0x65, 0x8d,
+      0x91, 0xa9, 0xb4, 0xba, 0xbb, 0xc5, 0xc9, 0xdf, 0xe4, 0xe5, 0xf0, 0x0d,
+      0x11, 0x45, 0x49, 0x64, 0x65, 0x80, 0x84, 0xb2, 0xbc, 0xbe, 0xbf, 0xd5,
+      0xd7, 0xf0, 0xf1, 0x83, 0x85, 0x8b, 0xa4, 0xa6, 0xbe, 0xbf, 0xc5, 0xc7,
+      0xce, 0xcf, 0xda, 0xdb, 0x48, 0x98, 0xbd, 0xcd, 0xc6, 0xce, 0xcf, 0x49,
+      0x4e, 0x4f, 0x57, 0x59, 0x5e, 0x5f, 0x89, 0x8e, 0x8f, 0xb1, 0xb6, 0xb7,
+      0xbf, 0xc1, 0xc6, 0xc7, 0xd7, 0x11, 0x16, 0x17, 0x5b, 0x5c, 0xf6, 0xf7,
+      0xfe, 0xff, 0x80, 0x0d, 0x6d, 0x71, 0xde, 0xdf, 0x0e, 0x0f, 0x1f, 0x6e,
+      0x6f, 0x1c, 0x1d, 0x5f, 0x7d, 0x7e, 0xae, 0xaf, 0xbb, 0xbc, 0xfa, 0x16,
+      0x17, 0x1e, 0x1f, 0x46, 0x47, 0x4e, 0x4f, 0x58, 0x5a, 0x5c, 0x5e, 0x7e,
+      0x7f, 0xb5, 0xc5, 0xd4, 0xd5, 0xdc, 0xf0, 0xf1, 0xf5, 0x72, 0x73, 0x8f,
+      0x74, 0x75, 0x96, 0x2f, 0x5f, 0x26, 0x2e, 0x2f, 0xa7, 0xaf, 0xb7, 0xbf,
+      0xc7, 0xcf, 0xd7, 0xdf, 0x9a, 0x40, 0x97, 0x98, 0x30, 0x8f, 0x1f, 0xc0,
+      0xc1, 0xce, 0xff, 0x4e, 0x4f, 0x5a, 0x5b, 0x07, 0x08, 0x0f, 0x10, 0x27,
+      0x2f, 0xee, 0xef, 0x6e, 0x6f, 0x37, 0x3d, 0x3f, 0x42, 0x45, 0x90, 0x91,
+      0xfe, 0xff, 0x53, 0x67, 0x75, 0xc8, 0xc9, 0xd0, 0xd1, 0xd8, 0xd9, 0xe7,
+      0xfe, 0xff,
+  };
+  static constexpr singleton singletons1[] = {
+      {0x00, 6},  {0x01, 1}, {0x03, 1},  {0x04, 2}, {0x08, 8},  {0x09, 2},
+      {0x0a, 5},  {0x0b, 2}, {0x0e, 4},  {0x10, 1}, {0x11, 2},  {0x12, 5},
+      {0x13, 17}, {0x14, 1}, {0x15, 2},  {0x17, 2}, {0x19, 13}, {0x1c, 5},
+      {0x1d, 8},  {0x24, 1}, {0x6a, 3},  {0x6b, 2}, {0xbc, 2},  {0xd1, 2},
+      {0xd4, 12}, {0xd5, 9}, {0xd6, 2},  {0xd7, 2}, {0xda, 1},  {0xe0, 5},
+      {0xe1, 2},  {0xe8, 2}, {0xee, 32}, {0xf0, 4}, {0xf8, 2},  {0xf9, 2},
+      {0xfa, 2},  {0xfb, 1},
+  };
+  static constexpr unsigned char singletons1_lower[] = {
+      0x0c, 0x27, 0x3b, 0x3e, 0x4e, 0x4f, 0x8f, 0x9e, 0x9e, 0x9f, 0x06, 0x07,
+      0x09, 0x36, 0x3d, 0x3e, 0x56, 0xf3, 0xd0, 0xd1, 0x04, 0x14, 0x18, 0x36,
+      0x37, 0x56, 0x57, 0x7f, 0xaa, 0xae, 0xaf, 0xbd, 0x35, 0xe0, 0x12, 0x87,
+      0x89, 0x8e, 0x9e, 0x04, 0x0d, 0x0e, 0x11, 0x12, 0x29, 0x31, 0x34, 0x3a,
+      0x45, 0x46, 0x49, 0x4a, 0x4e, 0x4f, 0x64, 0x65, 0x5c, 0xb6, 0xb7, 0x1b,
+      0x1c, 0x07, 0x08, 0x0a, 0x0b, 0x14, 0x17, 0x36, 0x39, 0x3a, 0xa8, 0xa9,
+      0xd8, 0xd9, 0x09, 0x37, 0x90, 0x91, 0xa8, 0x07, 0x0a, 0x3b, 0x3e, 0x66,
+      0x69, 0x8f, 0x92, 0x6f, 0x5f, 0xee, 0xef, 0x5a, 0x62, 0x9a, 0x9b, 0x27,
+      0x28, 0x55, 0x9d, 0xa0, 0xa1, 0xa3, 0xa4, 0xa7, 0xa8, 0xad, 0xba, 0xbc,
+      0xc4, 0x06, 0x0b, 0x0c, 0x15, 0x1d, 0x3a, 0x3f, 0x45, 0x51, 0xa6, 0xa7,
+      0xcc, 0xcd, 0xa0, 0x07, 0x19, 0x1a, 0x22, 0x25, 0x3e, 0x3f, 0xc5, 0xc6,
+      0x04, 0x20, 0x23, 0x25, 0x26, 0x28, 0x33, 0x38, 0x3a, 0x48, 0x4a, 0x4c,
+      0x50, 0x53, 0x55, 0x56, 0x58, 0x5a, 0x5c, 0x5e, 0x60, 0x63, 0x65, 0x66,
+      0x6b, 0x73, 0x78, 0x7d, 0x7f, 0x8a, 0xa4, 0xaa, 0xaf, 0xb0, 0xc0, 0xd0,
+      0xae, 0xaf, 0x79, 0xcc, 0x6e, 0x6f, 0x93,
+  };
+  static constexpr unsigned char normal0[] = {
+      0x00, 0x20, 0x5f, 0x22, 0x82, 0xdf, 0x04, 0x82, 0x44, 0x08, 0x1b, 0x04,
+      0x06, 0x11, 0x81, 0xac, 0x0e, 0x80, 0xab, 0x35, 0x28, 0x0b, 0x80, 0xe0,
+      0x03, 0x19, 0x08, 0x01, 0x04, 0x2f, 0x04, 0x34, 0x04, 0x07, 0x03, 0x01,
+      0x07, 0x06, 0x07, 0x11, 0x0a, 0x50, 0x0f, 0x12, 0x07, 0x55, 0x07, 0x03,
+      0x04, 0x1c, 0x0a, 0x09, 0x03, 0x08, 0x03, 0x07, 0x03, 0x02, 0x03, 0x03,
+      0x03, 0x0c, 0x04, 0x05, 0x03, 0x0b, 0x06, 0x01, 0x0e, 0x15, 0x05, 0x3a,
+      0x03, 0x11, 0x07, 0x06, 0x05, 0x10, 0x07, 0x57, 0x07, 0x02, 0x07, 0x15,
+      0x0d, 0x50, 0x04, 0x43, 0x03, 0x2d, 0x03, 0x01, 0x04, 0x11, 0x06, 0x0f,
+      0x0c, 0x3a, 0x04, 0x1d, 0x25, 0x5f, 0x20, 0x6d, 0x04, 0x6a, 0x25, 0x80,
+      0xc8, 0x05, 0x82, 0xb0, 0x03, 0x1a, 0x06, 0x82, 0xfd, 0x03, 0x59, 0x07,
+      0x15, 0x0b, 0x17, 0x09, 0x14, 0x0c, 0x14, 0x0c, 0x6a, 0x06, 0x0a, 0x06,
+      0x1a, 0x06, 0x59, 0x07, 0x2b, 0x05, 0x46, 0x0a, 0x2c, 0x04, 0x0c, 0x04,
+      0x01, 0x03, 0x31, 0x0b, 0x2c, 0x04, 0x1a, 0x06, 0x0b, 0x03, 0x80, 0xac,
+      0x06, 0x0a, 0x06, 0x21, 0x3f, 0x4c, 0x04, 0x2d, 0x03, 0x74, 0x08, 0x3c,
+      0x03, 0x0f, 0x03, 0x3c, 0x07, 0x38, 0x08, 0x2b, 0x05, 0x82, 0xff, 0x11,
+      0x18, 0x08, 0x2f, 0x11, 0x2d, 0x03, 0x20, 0x10, 0x21, 0x0f, 0x80, 0x8c,
+      0x04, 0x82, 0x97, 0x19, 0x0b, 0x15, 0x88, 0x94, 0x05, 0x2f, 0x05, 0x3b,
+      0x07, 0x02, 0x0e, 0x18, 0x09, 0x80, 0xb3, 0x2d, 0x74, 0x0c, 0x80, 0xd6,
+      0x1a, 0x0c, 0x05, 0x80, 0xff, 0x05, 0x80, 0xdf, 0x0c, 0xee, 0x0d, 0x03,
+      0x84, 0x8d, 0x03, 0x37, 0x09, 0x81, 0x5c, 0x14, 0x80, 0xb8, 0x08, 0x80,
+      0xcb, 0x2a, 0x38, 0x03, 0x0a, 0x06, 0x38, 0x08, 0x46, 0x08, 0x0c, 0x06,
+      0x74, 0x0b, 0x1e, 0x03, 0x5a, 0x04, 0x59, 0x09, 0x80, 0x83, 0x18, 0x1c,
+      0x0a, 0x16, 0x09, 0x4c, 0x04, 0x80, 0x8a, 0x06, 0xab, 0xa4, 0x0c, 0x17,
+      0x04, 0x31, 0xa1, 0x04, 0x81, 0xda, 0x26, 0x07, 0x0c, 0x05, 0x05, 0x80,
+      0xa5, 0x11, 0x81, 0x6d, 0x10, 0x78, 0x28, 0x2a, 0x06, 0x4c, 0x04, 0x80,
+      0x8d, 0x04, 0x80, 0xbe, 0x03, 0x1b, 0x03, 0x0f, 0x0d,
+  };
+  static constexpr unsigned char normal1[] = {
+      0x5e, 0x22, 0x7b, 0x05, 0x03, 0x04, 0x2d, 0x03, 0x66, 0x03, 0x01, 0x2f,
+      0x2e, 0x80, 0x82, 0x1d, 0x03, 0x31, 0x0f, 0x1c, 0x04, 0x24, 0x09, 0x1e,
+      0x05, 0x2b, 0x05, 0x44, 0x04, 0x0e, 0x2a, 0x80, 0xaa, 0x06, 0x24, 0x04,
+      0x24, 0x04, 0x28, 0x08, 0x34, 0x0b, 0x01, 0x80, 0x90, 0x81, 0x37, 0x09,
+      0x16, 0x0a, 0x08, 0x80, 0x98, 0x39, 0x03, 0x63, 0x08, 0x09, 0x30, 0x16,
+      0x05, 0x21, 0x03, 0x1b, 0x05, 0x01, 0x40, 0x38, 0x04, 0x4b, 0x05, 0x2f,
+      0x04, 0x0a, 0x07, 0x09, 0x07, 0x40, 0x20, 0x27, 0x04, 0x0c, 0x09, 0x36,
+      0x03, 0x3a, 0x05, 0x1a, 0x07, 0x04, 0x0c, 0x07, 0x50, 0x49, 0x37, 0x33,
+      0x0d, 0x33, 0x07, 0x2e, 0x08, 0x0a, 0x81, 0x26, 0x52, 0x4e, 0x28, 0x08,
+      0x2a, 0x56, 0x1c, 0x14, 0x17, 0x09, 0x4e, 0x04, 0x1e, 0x0f, 0x43, 0x0e,
+      0x19, 0x07, 0x0a, 0x06, 0x48, 0x08, 0x27, 0x09, 0x75, 0x0b, 0x3f, 0x41,
+      0x2a, 0x06, 0x3b, 0x05, 0x0a, 0x06, 0x51, 0x06, 0x01, 0x05, 0x10, 0x03,
+      0x05, 0x80, 0x8b, 0x62, 0x1e, 0x48, 0x08, 0x0a, 0x80, 0xa6, 0x5e, 0x22,
+      0x45, 0x0b, 0x0a, 0x06, 0x0d, 0x13, 0x39, 0x07, 0x0a, 0x36, 0x2c, 0x04,
+      0x10, 0x80, 0xc0, 0x3c, 0x64, 0x53, 0x0c, 0x48, 0x09, 0x0a, 0x46, 0x45,
+      0x1b, 0x48, 0x08, 0x53, 0x1d, 0x39, 0x81, 0x07, 0x46, 0x0a, 0x1d, 0x03,
+      0x47, 0x49, 0x37, 0x03, 0x0e, 0x08, 0x0a, 0x06, 0x39, 0x07, 0x0a, 0x81,
+      0x36, 0x19, 0x80, 0xb7, 0x01, 0x0f, 0x32, 0x0d, 0x83, 0x9b, 0x66, 0x75,
+      0x0b, 0x80, 0xc4, 0x8a, 0xbc, 0x84, 0x2f, 0x8f, 0xd1, 0x82, 0x47, 0xa1,
+      0xb9, 0x82, 0x39, 0x07, 0x2a, 0x04, 0x02, 0x60, 0x26, 0x0a, 0x46, 0x0a,
+      0x28, 0x05, 0x13, 0x82, 0xb0, 0x5b, 0x65, 0x4b, 0x04, 0x39, 0x07, 0x11,
+      0x40, 0x05, 0x0b, 0x02, 0x0e, 0x97, 0xf8, 0x08, 0x84, 0xd6, 0x2a, 0x09,
+      0xa2, 0xf7, 0x81, 0x1f, 0x31, 0x03, 0x11, 0x04, 0x08, 0x81, 0x8c, 0x89,
+      0x04, 0x6b, 0x05, 0x0d, 0x03, 0x09, 0x07, 0x10, 0x93, 0x60, 0x80, 0xf6,
+      0x0a, 0x73, 0x08, 0x6e, 0x17, 0x46, 0x80, 0x9a, 0x14, 0x0c, 0x57, 0x09,
+      0x19, 0x80, 0x87, 0x81, 0x47, 0x03, 0x85, 0x42, 0x0f, 0x15, 0x85, 0x50,
+      0x2b, 0x80, 0xd5, 0x2d, 0x03, 0x1a, 0x04, 0x02, 0x81, 0x70, 0x3a, 0x05,
+      0x01, 0x85, 0x00, 0x80, 0xd7, 0x29, 0x4c, 0x04, 0x0a, 0x04, 0x02, 0x83,
+      0x11, 0x44, 0x4c, 0x3d, 0x80, 0xc2, 0x3c, 0x06, 0x01, 0x04, 0x55, 0x05,
+      0x1b, 0x34, 0x02, 0x81, 0x0e, 0x2c, 0x04, 0x64, 0x0c, 0x56, 0x0a, 0x80,
+      0xae, 0x38, 0x1d, 0x0d, 0x2c, 0x04, 0x09, 0x07, 0x02, 0x0e, 0x06, 0x80,
+      0x9a, 0x83, 0xd8, 0x08, 0x0d, 0x03, 0x0d, 0x03, 0x74, 0x0c, 0x59, 0x07,
+      0x0c, 0x14, 0x0c, 0x04, 0x38, 0x08, 0x0a, 0x06, 0x28, 0x08, 0x22, 0x4e,
+      0x81, 0x54, 0x0c, 0x15, 0x03, 0x03, 0x05, 0x07, 0x09, 0x19, 0x07, 0x07,
+      0x09, 0x03, 0x0d, 0x07, 0x29, 0x80, 0xcb, 0x25, 0x0a, 0x84, 0x06,
+  };
+  auto lower = static_cast<uint16_t>(cp);
+  if (cp < 0x10000) {
+    return is_printable(lower, singletons0,
+                        sizeof(singletons0) / sizeof(*singletons0),
+                        singletons0_lower, normal0, sizeof(normal0));
+  }
+  if (cp < 0x20000) {
+    return is_printable(lower, singletons1,
+                        sizeof(singletons1) / sizeof(*singletons1),
+                        singletons1_lower, normal1, sizeof(normal1));
+  }
+  if (0x2a6de <= cp && cp < 0x2a700) return false;
+  if (0x2b735 <= cp && cp < 0x2b740) return false;
+  if (0x2b81e <= cp && cp < 0x2b820) return false;
+  if (0x2cea2 <= cp && cp < 0x2ceb0) return false;
+  if (0x2ebe1 <= cp && cp < 0x2f800) return false;
+  if (0x2fa1e <= cp && cp < 0x30000) return false;
+  if (0x3134b <= cp && cp < 0xe0100) return false;
+  if (0xe01f0 <= cp && cp < 0x110000) return false;
+  return cp < 0x110000;
+}
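+
+// Illustrative note (not from the upstream fmt sources): with these tables,
+// is_printable(0x61) ('a') yields true, is_printable(0x07) (BEL, a C0 control)
+// yields false, and any cp >= 0x110000 falls through to the final
+// "return cp < 0x110000" check and yields false.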
+
+}  // namespace detail
+
+FMT_END_NAMESPACE
+
+#endif  // FMT_FORMAT_INL_H_
diff --git a/lib/fmt/fmt/format.h b/lib/fmt/fmt/format.h
new file mode 100644
index 000000000..67f0ab739
--- /dev/null
+++ b/lib/fmt/fmt/format.h
@@ -0,0 +1,4427 @@
+/*
+  Formatting library for C++
+
+  Copyright (c) 2012 - present, Victor Zverovich
+
+  Permission is hereby granted, free of charge, to any person obtaining
+  a copy of this software and associated documentation files (the
+  "Software"), to deal in the Software without restriction, including
+  without limitation the rights to use, copy, modify, merge, publish,
+  distribute, sublicense, and/or sell copies of the Software, and to
+  permit persons to whom the Software is furnished to do so, subject to
+  the following conditions:
+
+  The above copyright notice and this permission notice shall be
+  included in all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+  --- Optional exception to the license ---
+
+  As an exception, if, as a result of your compiling your source code, portions
+  of this Software are embedded into a machine-executable object form of such
+  source code, you may redistribute such embedded portions in such object form
+  without including the above copyright and permission notices.
+ */
+
+#ifndef FMT_FORMAT_H_
+#define FMT_FORMAT_H_
+
+#ifndef _LIBCPP_REMOVE_TRANSITIVE_INCLUDES
+#  define _LIBCPP_REMOVE_TRANSITIVE_INCLUDES
+#  define FMT_REMOVE_TRANSITIVE_INCLUDES
+#endif
+
+#include "base.h"
+
+#ifndef FMT_MODULE
+#  include <cmath>              // std::signbit
+#  include <cstdint>            // uint32_t
+#  include <cstring>            // std::memcpy
+#  include <initializer_list>   // std::initializer_list
+#  include <limits>             // std::numeric_limits
+#  if defined(__GLIBCXX__) && !defined(_GLIBCXX_USE_DUAL_ABI)
+// Workaround for pre gcc 5 libstdc++.
+#    include <memory>           // std::allocator_traits
+#  endif
+#  include <stdexcept>          // std::runtime_error
+#  include <string>             // std::string
+#  include <system_error>       // std::system_error
+
+// Checking FMT_CPLUSPLUS for warning suppression in MSVC.
+#  if FMT_HAS_INCLUDE(<bit>) && FMT_CPLUSPLUS > 201703L
+#    include <bit>              // std::bit_cast
+#  endif
+
+// libc++ supports string_view in pre-c++17.
+#  if FMT_HAS_INCLUDE(<string_view>) && \
+      (FMT_CPLUSPLUS >= 201703L || defined(_LIBCPP_VERSION))
+#    include <string_view>
+#    define FMT_USE_STRING_VIEW
+#  endif
+#endif  // FMT_MODULE
+
+#if defined __cpp_inline_variables && __cpp_inline_variables >= 201606L
+#  define FMT_INLINE_VARIABLE inline
+#else
+#  define FMT_INLINE_VARIABLE
+#endif
+
+#ifndef FMT_NO_UNIQUE_ADDRESS
+#  if FMT_CPLUSPLUS >= 202002L
+#    if FMT_HAS_CPP_ATTRIBUTE(no_unique_address)
+#      define FMT_NO_UNIQUE_ADDRESS [[no_unique_address]]
+// VS2019 v16.10 and later except clang-cl (https://reviews.llvm.org/D110485).
+#    elif (FMT_MSC_VERSION >= 1929) && !FMT_CLANG_VERSION
+#      define FMT_NO_UNIQUE_ADDRESS [[msvc::no_unique_address]]
+#    endif
+#  endif
+#endif
+#ifndef FMT_NO_UNIQUE_ADDRESS
+#  define FMT_NO_UNIQUE_ADDRESS
+#endif
+
+// Visibility when compiled as a shared library/object.
+#if defined(FMT_LIB_EXPORT) || defined(FMT_SHARED)
+#  define FMT_SO_VISIBILITY(value) FMT_VISIBILITY(value)
+#else
+#  define FMT_SO_VISIBILITY(value)
+#endif
+
+#ifdef __has_builtin
+#  define FMT_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#  define FMT_HAS_BUILTIN(x) 0
+#endif
+
+#if FMT_GCC_VERSION || FMT_CLANG_VERSION
+#  define FMT_NOINLINE __attribute__((noinline))
+#else
+#  define FMT_NOINLINE
+#endif
+
+namespace std {
+template <> struct iterator_traits<fmt::appender> {
+  using iterator_category = output_iterator_tag;
+  using value_type = char;
+};
+}  // namespace std
+
+#ifndef FMT_THROW
+#  if FMT_EXCEPTIONS
+#    if FMT_MSC_VERSION || defined(__NVCC__)
+FMT_BEGIN_NAMESPACE
+namespace detail {
+template <typename Exception> inline void do_throw(const Exception& x) {
+  // Silence unreachable code warnings in MSVC and NVCC because these
+  // are nearly impossible to fix in a generic code.
+  volatile bool b = true;
+  if (b) throw x;
+}
+}  // namespace detail
+FMT_END_NAMESPACE
+#      define FMT_THROW(x) detail::do_throw(x)
+#    else
+#      define FMT_THROW(x) throw x
+#    endif
+#  else
+#    define FMT_THROW(x) \
+      ::fmt::detail::assert_fail(__FILE__, __LINE__, (x).what())
+#  endif
+#endif
+
+#ifndef FMT_MAYBE_UNUSED
+#  if FMT_HAS_CPP17_ATTRIBUTE(maybe_unused)
+#    define FMT_MAYBE_UNUSED [[maybe_unused]]
+#  else
+#    define FMT_MAYBE_UNUSED
+#  endif
+#endif
+
+#ifndef FMT_USE_USER_DEFINED_LITERALS
+// EDG based compilers (Intel, NVIDIA, Elbrus, etc), GCC and MSVC support UDLs.
+//
+// GCC before 4.9 requires a space in `operator"" _a` which is invalid in later
+// compiler versions.
+#  if (FMT_HAS_FEATURE(cxx_user_literals) || FMT_GCC_VERSION >= 409 || \
+       FMT_MSC_VERSION >= 1900) &&                                     \
+      (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= /* UDL feature */ 480)
+#    define FMT_USE_USER_DEFINED_LITERALS 1
+#  else
+#    define FMT_USE_USER_DEFINED_LITERALS 0
+#  endif
+#endif
+
+// Defining FMT_REDUCE_INT_INSTANTIATIONS to 1, will reduce the number of
+// integer formatter template instantiations to just one by only using the
+// largest integer type. This results in a reduction in binary size but will
+// cause a decrease in integer formatting performance.
+#if !defined(FMT_REDUCE_INT_INSTANTIATIONS)
+#  define FMT_REDUCE_INT_INSTANTIATIONS 0
+#endif
+
+// __builtin_clz is broken in clang with Microsoft CodeGen:
+// https://github.com/fmtlib/fmt/issues/519.
+#if !FMT_MSC_VERSION
+#  if FMT_HAS_BUILTIN(__builtin_clz) || FMT_GCC_VERSION || FMT_ICC_VERSION
+#    define FMT_BUILTIN_CLZ(n) __builtin_clz(n)
+#  endif
+#  if FMT_HAS_BUILTIN(__builtin_clzll) || FMT_GCC_VERSION || FMT_ICC_VERSION
+#    define FMT_BUILTIN_CLZLL(n) __builtin_clzll(n)
+#  endif
+#endif
+
+// __builtin_ctz is broken in Intel Compiler Classic on Windows:
+// https://github.com/fmtlib/fmt/issues/2510.
+#ifndef __ICL
+#  if FMT_HAS_BUILTIN(__builtin_ctz) || FMT_GCC_VERSION || FMT_ICC_VERSION || \
+      defined(__NVCOMPILER)
+#    define FMT_BUILTIN_CTZ(n) __builtin_ctz(n)
+#  endif
+#  if FMT_HAS_BUILTIN(__builtin_ctzll) || FMT_GCC_VERSION || \
+      FMT_ICC_VERSION || defined(__NVCOMPILER)
+#    define FMT_BUILTIN_CTZLL(n) __builtin_ctzll(n)
+#  endif
+#endif
+
+#if FMT_MSC_VERSION
+#  include <intrin.h>  // _BitScanReverse[64], _BitScanForward[64], _umul128
+#endif
+
+// Some compilers masquerade as both MSVC and GCC-likes or otherwise support
+// __builtin_clz and __builtin_clzll, so only define FMT_BUILTIN_CLZ using the
+// MSVC intrinsics if the clz and clzll builtins are not available.
+#if FMT_MSC_VERSION && !defined(FMT_BUILTIN_CLZLL) && \
+    !defined(FMT_BUILTIN_CTZLL)
+FMT_BEGIN_NAMESPACE
+namespace detail {
+// Avoid Clang with Microsoft CodeGen's -Wunknown-pragmas warning.
+#  if !defined(__clang__)
+#    pragma intrinsic(_BitScanForward)
+#    pragma intrinsic(_BitScanReverse)
+#    if defined(_WIN64)
+#      pragma intrinsic(_BitScanForward64)
+#      pragma intrinsic(_BitScanReverse64)
+#    endif
+#  endif
+
+inline auto clz(uint32_t x) -> int {
+  unsigned long r = 0;
+  _BitScanReverse(&r, x);
+  FMT_ASSERT(x != 0, "");
+  // Static analysis complains about using uninitialized data
+  // "r", but the only way that can happen is if "x" is 0,
+  // which the callers guarantee to not happen.
+  FMT_MSC_WARNING(suppress : 6102)
+  return 31 ^ static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CLZ(n) detail::clz(n)
+
+inline auto clzll(uint64_t x) -> int {
+  unsigned long r = 0;
+#  ifdef _WIN64
+  _BitScanReverse64(&r, x);
+#  else
+  // Scan the high 32 bits.
+  if (_BitScanReverse(&r, static_cast<uint32_t>(x >> 32)))
+    return 63 ^ static_cast<int>(r + 32);
+  // Scan the low 32 bits.
+  _BitScanReverse(&r, static_cast<uint32_t>(x));
+#  endif
+  FMT_ASSERT(x != 0, "");
+  FMT_MSC_WARNING(suppress : 6102)  // Suppress a bogus static analysis warning.
+  return 63 ^ static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CLZLL(n) detail::clzll(n)
+
+inline auto ctz(uint32_t x) -> int {
+  unsigned long r = 0;
+  _BitScanForward(&r, x);
+  FMT_ASSERT(x != 0, "");
+  FMT_MSC_WARNING(suppress : 6102)  // Suppress a bogus static analysis warning.
+  return static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CTZ(n) detail::ctz(n)
+
+inline auto ctzll(uint64_t x) -> int {
+  unsigned long r = 0;
+  FMT_ASSERT(x != 0, "");
+  FMT_MSC_WARNING(suppress : 6102)  // Suppress a bogus static analysis warning.
+#  ifdef _WIN64
+  _BitScanForward64(&r, x);
+#  else
+  // Scan the low 32 bits.
+  if (_BitScanForward(&r, static_cast<uint32_t>(x))) return static_cast<int>(r);
+  // Scan the high 32 bits.
+  _BitScanForward(&r, static_cast<uint32_t>(x >> 32));
+  r += 32;
+#  endif
+  return static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CTZLL(n) detail::ctzll(n)
+}  // namespace detail
+FMT_END_NAMESPACE
+#endif
+
+FMT_BEGIN_NAMESPACE
+
+template <typename Char, typename Traits, typename Allocator>
+struct is_contiguous<std::basic_string<Char, Traits, Allocator>>
+    : std::true_type {};
+
+namespace detail {
+
+FMT_CONSTEXPR inline void abort_fuzzing_if(bool condition) {
+  ignore_unused(condition);
+#ifdef FMT_FUZZ
+  if (condition) throw std::runtime_error("fuzzing limit reached");
+#endif
+}
+
+#if defined(FMT_USE_STRING_VIEW)
+template <typename Char> using std_string_view = std::basic_string_view<Char>;
+#else
+template <typename T> struct std_string_view {};
+#endif
+
+// Implementation of std::bit_cast for pre-C++20.
+template <typename To, typename From,
+          FMT_ENABLE_IF(sizeof(To) == sizeof(From))>
+FMT_CONSTEXPR20 auto bit_cast(const From& from) -> To {
+#ifdef __cpp_lib_bit_cast
+  if (is_constant_evaluated()) return std::bit_cast<To>(from);
+#endif
+  auto to = To();
+  // The cast suppresses a bogus -Wclass-memaccess on GCC.
+  std::memcpy(static_cast<void*>(&to), &from, sizeof(to));
+  return to;
+}
+
+inline auto is_big_endian() -> bool {
+#ifdef _WIN32
+  return false;
+#elif defined(__BIG_ENDIAN__)
+  return true;
+#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
+  return __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__;
+#else
+  struct bytes {
+    char data[sizeof(int)];
+  };
+  return bit_cast<bytes>(1).data[0] == 0;
+#endif
+}
+
+class uint128_fallback {
+ private:
+  uint64_t lo_, hi_;
+
+ public:
+  constexpr uint128_fallback(uint64_t hi, uint64_t lo) : lo_(lo), hi_(hi) {}
+  constexpr uint128_fallback(uint64_t value = 0) : lo_(value), hi_(0) {}
+
+  constexpr auto high() const noexcept -> uint64_t { return hi_; }
+  constexpr auto low() const noexcept -> uint64_t { return lo_; }
+
+  template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
+  constexpr explicit operator T() const {
+    return static_cast<T>(lo_);
+  }
+
+  friend constexpr auto operator==(const uint128_fallback& lhs,
+                                   const uint128_fallback& rhs) -> bool {
+    return lhs.hi_ == rhs.hi_ && lhs.lo_ == rhs.lo_;
+  }
+  friend constexpr auto operator!=(const uint128_fallback& lhs,
+                                   const uint128_fallback& rhs) -> bool {
+    return !(lhs == rhs);
+  }
+  friend constexpr auto operator>(const uint128_fallback& lhs,
+                                  const uint128_fallback& rhs) -> bool {
+    return lhs.hi_ != rhs.hi_ ? lhs.hi_ > rhs.hi_ : lhs.lo_ > rhs.lo_;
+  }
+  friend constexpr auto operator|(const uint128_fallback& lhs,
+                                  const uint128_fallback& rhs)
+      -> uint128_fallback {
+    return {lhs.hi_ | rhs.hi_, lhs.lo_ | rhs.lo_};
+  }
+  friend constexpr auto operator&(const uint128_fallback& lhs,
+                                  const uint128_fallback& rhs)
+      -> uint128_fallback {
+    return {lhs.hi_ & rhs.hi_, lhs.lo_ & rhs.lo_};
+  }
+  friend constexpr auto operator~(const uint128_fallback& n)
+      -> uint128_fallback {
+    return {~n.hi_, ~n.lo_};
+  }
+  friend auto operator+(const uint128_fallback& lhs,
+                        const uint128_fallback& rhs) -> uint128_fallback {
+    auto result = uint128_fallback(lhs);
+    result += rhs;
+    return result;
+  }
+  friend auto operator*(const uint128_fallback& lhs, uint32_t rhs)
+      -> uint128_fallback {
+    FMT_ASSERT(lhs.hi_ == 0, "");
+    uint64_t hi = (lhs.lo_ >> 32) * rhs;
+    uint64_t lo = (lhs.lo_ & ~uint32_t()) * rhs;
+    uint64_t new_lo = (hi << 32) + lo;
+    return {(hi >> 32) + (new_lo < lo ? 1 : 0), new_lo};
+  }
+  friend auto operator-(const uint128_fallback& lhs, uint64_t rhs)
+      -> uint128_fallback {
+    return {lhs.hi_ - (lhs.lo_ < rhs ? 1 : 0), lhs.lo_ - rhs};
+  }
+  FMT_CONSTEXPR auto operator>>(int shift) const -> uint128_fallback {
+    if (shift == 64) return {0, hi_};
+    if (shift > 64) return uint128_fallback(0, hi_) >> (shift - 64);
+    return {hi_ >> shift, (hi_ << (64 - shift)) | (lo_ >> shift)};
+  }
+  FMT_CONSTEXPR auto operator<<(int shift) const -> uint128_fallback {
+    if (shift == 64) return {lo_, 0};
+    if (shift > 64) return uint128_fallback(lo_, 0) << (shift - 64);
+    return {hi_ << shift | (lo_ >> (64 - shift)), (lo_ << shift)};
+  }
+  FMT_CONSTEXPR auto operator>>=(int shift) -> uint128_fallback& {
+    return *this = *this >> shift;
+  }
+  FMT_CONSTEXPR void operator+=(uint128_fallback n) {
+    uint64_t new_lo = lo_ + n.lo_;
+    uint64_t new_hi = hi_ + n.hi_ + (new_lo < lo_ ? 1 : 0);
+    FMT_ASSERT(new_hi >= hi_, "");
+    lo_ = new_lo;
+    hi_ = new_hi;
+  }
+  FMT_CONSTEXPR void operator&=(uint128_fallback n) {
+    lo_ &= n.lo_;
+    hi_ &= n.hi_;
+  }
+
+  FMT_CONSTEXPR20 auto operator+=(uint64_t n) noexcept -> uint128_fallback& {
+    if (is_constant_evaluated()) {
+      lo_ += n;
+      hi_ += (lo_ < n ? 1 : 0);
+      return *this;
+    }
+#if FMT_HAS_BUILTIN(__builtin_addcll) && !defined(__ibmxl__)
+    unsigned long long carry;
+    lo_ = __builtin_addcll(lo_, n, 0, &carry);
+    hi_ += carry;
+#elif FMT_HAS_BUILTIN(__builtin_ia32_addcarryx_u64) && !defined(__ibmxl__)
+    unsigned long long result;
+    auto carry = __builtin_ia32_addcarryx_u64(0, lo_, n, &result);
+    lo_ = result;
+    hi_ += carry;
+#elif defined(_MSC_VER) && defined(_M_X64)
+    auto carry = _addcarry_u64(0, lo_, n, &lo_);
+    _addcarry_u64(carry, hi_, 0, &hi_);
+#else
+    lo_ += n;
+    hi_ += (lo_ < n ? 1 : 0);
+#endif
+    return *this;
+  }
+};
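+
+// Usage sketch (illustrative, not part of upstream fmt): uint128_fallback
+// emulates unsigned 128-bit arithmetic where __int128 is unavailable, e.g.
+//
+//   auto v = uint128_fallback(~uint64_t());  // 2^64 - 1
+//   v += 1;       // carry into the high word: v.high() == 1, v.low() == 0
+//   v = v >> 64;  // back down: v.high() == 0, v.low() == 1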
+
+using uint128_t = conditional_t<FMT_USE_INT128, uint128_opt, uint128_fallback>;
+
+#ifdef UINTPTR_MAX
+using uintptr_t = ::uintptr_t;
+#else
+using uintptr_t = uint128_t;
+#endif
+
+// Returns the largest possible value for type T. Same as
+// std::numeric_limits<T>::max() but shorter and not affected by the max macro.
+template <typename T> constexpr auto max_value() -> T {
+  return (std::numeric_limits<T>::max)();
+}
+template <typename T> constexpr auto num_bits() -> int {
+  return std::numeric_limits<T>::digits;
+}
+// std::numeric_limits<T>::digits may return 0 for 128-bit ints.
+template <> constexpr auto num_bits<int128_opt>() -> int { return 128; }
+template <> constexpr auto num_bits<uint128_opt>() -> int { return 128; }
+template <> constexpr auto num_bits<uint128_fallback>() -> int { return 128; }
+
+// A heterogeneous bit_cast used for converting 96-bit long double to uint128_t
+// and 128-bit pointers to uint128_fallback.
+template <typename To, typename From,
+          FMT_ENABLE_IF(sizeof(To) > sizeof(From))>
+inline auto bit_cast(const From& from) -> To {
+  constexpr auto size = static_cast<int>(sizeof(From) / sizeof(unsigned));
+  struct data_t {
+    unsigned value[static_cast<unsigned>(size)];
+  } data = bit_cast<data_t>(from);
+  auto result = To();
+  if (const_check(is_big_endian())) {
+    for (int i = 0; i < size; ++i)
+      result = (result << num_bits<unsigned>()) | data.value[i];
+  } else {
+    for (int i = size - 1; i >= 0; --i)
+      result = (result << num_bits<unsigned>()) | data.value[i];
+  }
+  return result;
+}
+
+template <typename UInt>
+FMT_CONSTEXPR20 inline auto countl_zero_fallback(UInt n) -> int {
+  int lz = 0;
+  constexpr UInt msb_mask = static_cast<UInt>(1) << (num_bits<UInt>() - 1);
+  for (; (n & msb_mask) == 0; n <<= 1) lz++;
+  return lz;
+}
+
+FMT_CONSTEXPR20 inline auto countl_zero(uint32_t n) -> int {
+#ifdef FMT_BUILTIN_CLZ
+  if (!is_constant_evaluated()) return FMT_BUILTIN_CLZ(n);
+#endif
+  return countl_zero_fallback(n);
+}
+
+FMT_CONSTEXPR20 inline auto countl_zero(uint64_t n) -> int {
+#ifdef FMT_BUILTIN_CLZLL
+  if (!is_constant_evaluated()) return FMT_BUILTIN_CLZLL(n);
+#endif
+  return countl_zero_fallback(n);
+}
+
+FMT_INLINE void assume(bool condition) {
+  (void)condition;
+#if FMT_HAS_BUILTIN(__builtin_assume) && !FMT_ICC_VERSION
+  __builtin_assume(condition);
+#elif FMT_GCC_VERSION
+  if (!condition) __builtin_unreachable();
+#endif
+}
+
+// An approximation of iterator_t for pre-C++20 systems.
+template <typename T>
+using iterator_t = decltype(std::begin(std::declval<T&>()));
+template <typename T> using sentinel_t = decltype(std::end(std::declval<T&>()));
+
+// A workaround for std::string not having mutable data() until C++17.
+template <typename Char>
+inline auto get_data(std::basic_string<Char>& s) -> Char* {
+  return &s[0];
+}
+template <typename Container>
+inline auto get_data(Container& c) -> typename Container::value_type* {
+  return c.data();
+}
+
+// Attempts to reserve space for n extra characters in the output range.
+// Returns a pointer to the reserved range or a reference to it.
+template <typename OutputIt,
+          FMT_ENABLE_IF(is_back_insert_iterator<OutputIt>::value&&
+                            is_contiguous<typename OutputIt::container_type>::value)>
+#if FMT_CLANG_VERSION >= 307 && !FMT_ICC_VERSION
+__attribute__((no_sanitize("undefined")))
+#endif
+inline auto
+reserve(OutputIt it, size_t n) -> typename OutputIt::value_type* {
+  auto& c = get_container(it);
+  size_t size = c.size();
+  c.resize(size + n);
+  return get_data(c) + size;
+}
+
+template <typename T>
+inline auto reserve(basic_appender<T> it, size_t n) -> basic_appender<T> {
+  buffer<T>& buf = get_container(it);
+  buf.try_reserve(buf.size() + n);
+  return it;
+}
+
+template <typename Iterator>
+constexpr auto reserve(Iterator& it, size_t) -> Iterator& {
+  return it;
+}
+
+template <typename OutputIt>
+using reserve_iterator =
+    remove_reference_t<decltype(reserve(std::declval<OutputIt&>(), 0))>;
+
+template <typename T, typename OutputIt>
+constexpr auto to_pointer(OutputIt, size_t) -> T* {
+  return nullptr;
+}
+template <typename T> auto to_pointer(basic_appender<T> it, size_t n) -> T* {
+  buffer<T>& buf = get_container(it);
+  auto size = buf.size();
+  buf.try_reserve(size + n);
+  if (buf.capacity() < size + n) return nullptr;
+  buf.try_resize(size + n);
+  return buf.data() + size;
+}
+
+template <typename OutputIt,
+          FMT_ENABLE_IF(is_back_insert_iterator<OutputIt>::value&&
+                            is_contiguous<typename OutputIt::container_type>::value)>
+inline auto base_iterator(OutputIt it,
+                          typename OutputIt::container_type::value_type*)
+    -> OutputIt {
+  return it;
+}
+
+template <typename Iterator>
+constexpr auto base_iterator(Iterator, Iterator it) -> Iterator {
+  return it;
+}
+
+// <algorithm> is spectacularly slow to compile in C++20 so use a simple fill_n
+// instead (#1998).
+template <typename OutputIt, typename Size, typename T>
+FMT_CONSTEXPR auto fill_n(OutputIt out, Size count, const T& value)
+    -> OutputIt {
+  for (Size i = 0; i < count; ++i) *out++ = value;
+  return out;
+}
+template <typename T, typename Size>
+FMT_CONSTEXPR20 auto fill_n(T* out, Size count, char value) -> T* {
+  if (is_constant_evaluated()) {
+    return fill_n<T*, Size, T>(out, count, value);
+  }
+  std::memset(out, value, to_unsigned(count));
+  return out + count;
+}
+
+template <typename Char, typename InputIt, typename OutputIt>
+FMT_CONSTEXPR FMT_NOINLINE auto copy_noinline(InputIt begin, InputIt end,
+                                              OutputIt out) -> OutputIt {
+  return copy<Char>(begin, end, out);
+}
+
+// A public domain branchless UTF-8 decoder by Christopher Wellons:
+// https://github.com/skeeto/branchless-utf8
+/* Decode the next character, c, from s, reporting errors in e.
+ *
+ * Since this is a branchless decoder, four bytes will be read from the
+ * buffer regardless of the actual length of the next character. This
+ * means the buffer _must_ have at least three bytes of zero padding
+ * following the end of the data stream.
+ *
+ * Errors are reported in e, which will be non-zero if the parsed
+ * character was somehow invalid: invalid byte sequence, non-canonical
+ * encoding, or a surrogate half.
+ *
+ * The function returns a pointer to the next character. When an error
+ * occurs, this pointer will be a guess that depends on the particular
+ * error, but it will always advance at least one byte.
+ */
+FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
+    -> const char* {
+  constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
+  constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
+  constexpr const int shiftc[] = {0, 18, 12, 6, 0};
+  constexpr const int shifte[] = {0, 6, 4, 2, 0};
+
+  int len = "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4"
+      [static_cast<unsigned char>(*s) >> 3];
+  // Compute the pointer to the next character early so that the next
+  // iteration can start working on the next character. Neither Clang
+  // nor GCC figure out this reordering on their own.
+  const char* next = s + len + !len;
+
+  using uchar = unsigned char;
+
+  // Assume a four-byte character and load four bytes. Unused bits are
+  // shifted out.
+  *c = uint32_t(uchar(s[0]) & masks[len]) << 18;
+  *c |= uint32_t(uchar(s[1]) & 0x3f) << 12;
+  *c |= uint32_t(uchar(s[2]) & 0x3f) << 6;
+  *c |= uint32_t(uchar(s[3]) & 0x3f) << 0;
+  *c >>= shiftc[len];
+
+  // Accumulate the various error conditions.
+  *e = (*c < mins[len]) << 6;       // non-canonical encoding
+  *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
+  *e |= (*c > 0x10FFFF) << 8;       // out of range?
+  *e |= (uchar(s[1]) & 0xc0) >> 2;
+  *e |= (uchar(s[2]) & 0xc0) >> 4;
+  *e |= uchar(s[3]) >> 6;
+  *e ^= 0x2a;  // top two bits of each tail byte correct?
+  *e >>= shifte[len];
+
+  return next;
+}
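+
+// Illustrative example (not from upstream fmt): decoding the three-byte
+// sequence for U+20AC (EURO SIGN); note the mandatory zero padding.
+//
+//   const char s[] = "\xE2\x82\xAC\0\0";  // 3 data bytes + >= 3 zero bytes
+//   uint32_t cp = 0; int err = 0;
+//   const char* next = utf8_decode(s, &cp, &err);
+//   // cp == 0x20AC, err == 0, next == s + 3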
+
+constexpr FMT_INLINE_VARIABLE uint32_t invalid_code_point = ~uint32_t();
+
+// Invokes f(cp, sv) for every code point cp in s with sv being the string view
+// corresponding to the code point. cp is invalid_code_point on error.
+template <typename F>
+FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) {
+  auto decode = [f](const char* buf_ptr, const char* ptr) {
+    auto cp = uint32_t();
+    auto error = 0;
+    auto end = utf8_decode(buf_ptr, &cp, &error);
+    bool result = f(error ? invalid_code_point : cp,
+                    string_view(ptr, error ? 1 : to_unsigned(end - buf_ptr)));
+    return result ? (error ? buf_ptr + 1 : end) : nullptr;
+  };
+  auto p = s.data();
+  const size_t block_size = 4;  // utf8_decode always reads blocks of 4 chars.
+  if (s.size() >= block_size) {
+    for (auto end = p + s.size() - block_size + 1; p < end;) {
+      p = decode(p, p);
+      if (!p) return;
+    }
+  }
+  if (auto num_chars_left = s.data() + s.size() - p) {
+    char buf[2 * block_size - 1] = {};
+    copy<char>(p, p + num_chars_left, buf);
+    const char* buf_ptr = buf;
+    do {
+      auto end = decode(buf_ptr, p);
+      if (!end) return;
+      p += end - buf_ptr;
+      buf_ptr = end;
+    } while (buf_ptr - buf < num_chars_left);
+  }
+}
+
+template <typename Char>
+inline auto compute_width(basic_string_view<Char> s) -> size_t {
+  return s.size();
+}
+
+// Computes approximate display width of a UTF-8 string.
+FMT_CONSTEXPR inline auto compute_width(string_view s) -> size_t {
+  size_t num_code_points = 0;
+  // It is not a lambda for compatibility with C++14.
+  struct count_code_points {
+    size_t* count;
+    FMT_CONSTEXPR auto operator()(uint32_t cp, string_view) const -> bool {
+      *count += detail::to_unsigned(
+          1 +
+          (cp >= 0x1100 &&
+           (cp <= 0x115f ||  // Hangul Jamo init. consonants
+            cp == 0x2329 ||  // LEFT-POINTING ANGLE BRACKET
+            cp == 0x232a ||  // RIGHT-POINTING ANGLE BRACKET
+            // CJK ... Yi except IDEOGRAPHIC HALF FILL SPACE:
+            (cp >= 0x2e80 && cp <= 0xa4cf && cp != 0x303f) ||
+            (cp >= 0xac00 && cp <= 0xd7a3) ||    // Hangul Syllables
+            (cp >= 0xf900 && cp <= 0xfaff) ||    // CJK Compatibility Ideographs
+            (cp >= 0xfe10 && cp <= 0xfe19) ||    // Vertical Forms
+            (cp >= 0xfe30 && cp <= 0xfe6f) ||    // CJK Compatibility Forms
+            (cp >= 0xff00 && cp <= 0xff60) ||    // Fullwidth Forms
+            (cp >= 0xffe0 && cp <= 0xffe6) ||    // Fullwidth Forms
+            (cp >= 0x20000 && cp <= 0x2fffd) ||  // CJK
+            (cp >= 0x30000 && cp <= 0x3fffd) ||
+            // Miscellaneous Symbols and Pictographs + Emoticons:
+            (cp >= 0x1f300 && cp <= 0x1f64f) ||
+            // Supplemental Symbols and Pictographs:
+            (cp >= 0x1f900 && cp <= 0x1f9ff))));
+      return true;
+    }
+  };
+  // We could avoid branches by using utf8_decode directly.
+  for_each_codepoint(s, count_code_points{&num_code_points});
+  return num_code_points;
+}
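+
+// Worked example (illustrative, not from upstream fmt): "abc" has width 3,
+// while U+3053 (HIRAGANA LETTER KO, inside the 0x2e80..0xa4cf block) counts
+// as 2, so compute_width("abc\xE3\x81\x93") == 5.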
+
+template <typename Char>
+inline auto code_point_index(basic_string_view<Char> s, size_t n) -> size_t {
+  size_t size = s.size();
+  return n < size ? n : size;
+}
+
+// Calculates the index of the nth code point in a UTF-8 string.
+inline auto code_point_index(string_view s, size_t n) -> size_t {
+  size_t result = s.size();
+  const char* begin = s.begin();
+  for_each_codepoint(s, [begin, &n, &result](uint32_t, string_view sv) {
+    if (n != 0) {
+      --n;
+      return true;
+    }
+    result = to_unsigned(sv.begin() - begin);
+    return false;
+  });
+  return result;
+}
+
+template <typename T> struct is_integral : std::is_integral<T> {};
+template <> struct is_integral<int128_opt> : std::true_type {};
+template <> struct is_integral<uint128_opt> : std::true_type {};
+
+template <typename T>
+using is_signed =
+    std::integral_constant<bool, std::numeric_limits<T>::is_signed ||
+                                     std::is_same<T, int128_opt>::value>;
+
+template <typename T>
+using is_integer =
+    bool_constant<is_integral<T>::value && !std::is_same<T, bool>::value &&
+                  !std::is_same<T, char>::value &&
+                  !std::is_same<T, wchar_t>::value>;
+
+#ifndef FMT_USE_FLOAT
+#  define FMT_USE_FLOAT 1
+#endif
+#ifndef FMT_USE_DOUBLE
+#  define FMT_USE_DOUBLE 1
+#endif
+#ifndef FMT_USE_LONG_DOUBLE
+#  define FMT_USE_LONG_DOUBLE 1
+#endif
+
+#if defined(FMT_USE_FLOAT128)
+// Use the provided definition.
+#elif FMT_CLANG_VERSION && FMT_HAS_INCLUDE(<quadmath.h>)
+#  define FMT_USE_FLOAT128 1
+#elif FMT_GCC_VERSION && defined(_GLIBCXX_USE_FLOAT128) && \
+    !defined(__STRICT_ANSI__)
+#  define FMT_USE_FLOAT128 1
+#else
+#  define FMT_USE_FLOAT128 0
+#endif
+#if FMT_USE_FLOAT128
+using float128 = __float128;
+#else
+using float128 = void;
+#endif
+
+template <typename T> using is_float128 = std::is_same<T, float128>;
+
+template <typename T>
+using is_floating_point =
+    bool_constant<std::is_floating_point<T>::value || is_float128<T>::value>;
+
+template <typename T, bool = std::is_floating_point<T>::value>
+struct is_fast_float : bool_constant<std::numeric_limits<T>::is_iec559 &&
+                                     sizeof(T) <= sizeof(double)> {};
+template <typename T> struct is_fast_float<T, false> : std::false_type {};
+
+template <typename T>
+using is_double_double = bool_constant<std::numeric_limits<T>::digits == 106>;
+
+#ifndef FMT_USE_FULL_CACHE_DRAGONBOX
+#  define FMT_USE_FULL_CACHE_DRAGONBOX 0
+#endif
+
+template <typename T, typename Enable = void>
+struct is_locale : std::false_type {};
+template <typename T>
+struct is_locale<T, void_t<decltype(T::classic())>> : std::true_type {};
+}  // namespace detail
+
+FMT_BEGIN_EXPORT
+
+// The number of characters to store in the basic_memory_buffer object itself
+// to avoid dynamic memory allocation.
+enum { inline_buffer_size = 500 };
+
+/**
+ * A dynamically growing memory buffer for trivially copyable/constructible
+ * types with the first `SIZE` elements stored in the object itself. Most
+ * commonly used via the `memory_buffer` alias for `char`.
+ *
+ * **Example**:
+ *
+ *     auto out = fmt::memory_buffer();
+ *     fmt::format_to(std::back_inserter(out), "The answer is {}.", 42);
+ *
+ * This will append "The answer is 42." to `out`. The buffer content can be
+ * converted to `std::string` with `to_string(out)`.
+ */
+template <typename T, size_t SIZE = inline_buffer_size,
+          typename Allocator = std::allocator<T>>
+class basic_memory_buffer : public detail::buffer<T> {
+ private:
+  T store_[SIZE];
+
+  // Don't inherit from Allocator to avoid generating type_info for it.
+  FMT_NO_UNIQUE_ADDRESS Allocator alloc_;
+
+  // Deallocate memory allocated by the buffer.
+  FMT_CONSTEXPR20 void deallocate() {
+    T* data = this->data();
+    if (data != store_) alloc_.deallocate(data, this->capacity());
+  }
+
+  static FMT_CONSTEXPR20 void grow(detail::buffer<T>& buf, size_t size) {
+    detail::abort_fuzzing_if(size > 5000);
+    auto& self = static_cast<basic_memory_buffer&>(buf);
+    const size_t max_size =
+        std::allocator_traits<Allocator>::max_size(self.alloc_);
+    size_t old_capacity = buf.capacity();
+    size_t new_capacity = old_capacity + old_capacity / 2;
+    if (size > new_capacity)
+      new_capacity = size;
+    else if (new_capacity > max_size)
+      new_capacity = size > max_size ? size : max_size;
+    T* old_data = buf.data();
+    T* new_data = self.alloc_.allocate(new_capacity);
+    // Suppress a bogus -Wstringop-overflow in gcc 13.1 (#3481).
+    detail::assume(buf.size() <= new_capacity);
+    // The following code doesn't throw, so the raw pointer above doesn't leak.
+    memcpy(new_data, old_data, buf.size() * sizeof(T));
+    self.set(new_data, new_capacity);
+    // deallocate must not throw according to the standard, but even if it does,
+    // the buffer already uses the new storage and will deallocate it in
+    // destructor.
+    if (old_data != self.store_) self.alloc_.deallocate(old_data, old_capacity);
+  }
+
+ public:
+  using value_type = T;
+  using const_reference = const T&;
+
+  FMT_CONSTEXPR20 explicit basic_memory_buffer(
+      const Allocator& alloc = Allocator())
+      : detail::buffer<T>(grow), alloc_(alloc) {
+    this->set(store_, SIZE);
+    if (detail::is_constant_evaluated()) detail::fill_n(store_, SIZE, T());
+  }
+  FMT_CONSTEXPR20 ~basic_memory_buffer() { deallocate(); }
+
+ private:
+  // Move data from other to this buffer.
+  FMT_CONSTEXPR20 void move(basic_memory_buffer& other) {
+    alloc_ = std::move(other.alloc_);
+    T* data = other.data();
+    size_t size = other.size(), capacity = other.capacity();
+    if (data == other.store_) {
+      this->set(store_, capacity);
+      detail::copy<T>(other.store_, other.store_ + size, store_);
+    } else {
+      this->set(data, capacity);
+      // Set pointer to the inline array so that delete is not called
+      // when deallocating.
+      other.set(other.store_, 0);
+      other.clear();
+    }
+    this->resize(size);
+  }
+
+ public:
+  /// Constructs a `basic_memory_buffer` object moving the content of the other
+  /// object to it.
+  FMT_CONSTEXPR20 basic_memory_buffer(basic_memory_buffer&& other) noexcept
+      : detail::buffer<T>(grow) {
+    move(other);
+  }
+
+  /// Moves the content of the other `basic_memory_buffer` object to this one.
+  auto operator=(basic_memory_buffer&& other) noexcept -> basic_memory_buffer& {
+    FMT_ASSERT(this != &other, "");
+    deallocate();
+    move(other);
+    return *this;
+  }
+
+  // Returns a copy of the allocator associated with this buffer.
+  auto get_allocator() const -> Allocator { return alloc_; }
+
+  /// Resizes the buffer to contain `count` elements. If T is a POD type new
+  /// elements may not be initialized.
+  FMT_CONSTEXPR20 void resize(size_t count) { this->try_resize(count); }
+
+  /// Increases the buffer capacity to `new_capacity`.
+  void reserve(size_t new_capacity) { this->try_reserve(new_capacity); }
+
+  using detail::buffer::append;
+  template <typename ContiguousRange>
+  void append(const ContiguousRange& range) {
+    append(range.data(), range.data() + range.size());
+  }
+};
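+
+// Usage sketch (illustrative, not part of upstream fmt): the first SIZE
+// elements live inside the object; growing past them moves the data to the
+// heap with a 1.5x capacity growth policy (see grow() above).
+//
+//   fmt::basic_memory_buffer<char, 4> buf;        // 4 chars stored inline
+//   buf.append(fmt::string_view("hello world"));  // spills to the heap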
+
+using memory_buffer = basic_memory_buffer<char>;
+
+template <typename T, size_t SIZE, typename Allocator>
+struct is_contiguous<basic_memory_buffer<T, SIZE, Allocator>> : std::true_type {
+};
+
+FMT_END_EXPORT
+namespace detail {
+FMT_API auto write_console(int fd, string_view text) -> bool;
+FMT_API void print(std::FILE*, string_view);
+}  // namespace detail
+
+FMT_BEGIN_EXPORT
+
+// Suppress a misleading warning in older versions of clang.
+#if FMT_CLANG_VERSION
+#  pragma clang diagnostic ignored "-Wweak-vtables"
+#endif
+
+/// An error reported from a formatting function.
+class FMT_SO_VISIBILITY("default") format_error : public std::runtime_error {
+ public:
+  using std::runtime_error::runtime_error;
+};
+
+namespace detail_exported {
+#if FMT_USE_NONTYPE_TEMPLATE_ARGS
+template <typename Char, size_t N> struct fixed_string {
+  constexpr fixed_string(const Char (&str)[N]) {
+    detail::copy<Char, const Char*, Char*>(static_cast<const Char*>(str),
+                                           str + N, data);
+  }
+  Char data[N] = {};
+};
+#endif
+
+// Converts a compile-time string to basic_string_view.
+template <typename Char, size_t N>
+constexpr auto compile_string_to_view(const Char (&s)[N])
+    -> basic_string_view<Char> {
+  // Remove trailing NUL character if needed. Won't be present if this is used
+  // with a raw character array (i.e. not defined as a string).
+  return {s, N - (std::char_traits<Char>::to_int_type(s[N - 1]) == 0 ? 1 : 0)};
+}
+template <typename Char>
+constexpr auto compile_string_to_view(basic_string_view<Char> s)
+    -> basic_string_view<Char> {
+  return s;
+}
+}  // namespace detail_exported
+
+// A generic formatting context with custom output iterator and character
+// (code unit) support. Char is the format string code unit type which can be
+// different from OutputIt::value_type.
+template <typename OutputIt, typename Char> class generic_context {
+ private:
+  OutputIt out_;
+  basic_format_args<generic_context> args_;
+  detail::locale_ref loc_;
+
+ public:
+  using char_type = Char;
+  using iterator = OutputIt;
+  using parse_context_type = basic_format_parse_context<Char>;
+  template <typename T> using formatter_type = formatter<T, Char>;
+
+  constexpr generic_context(OutputIt out,
+                            basic_format_args<generic_context> ctx_args,
+                            detail::locale_ref loc = {})
+      : out_(out), args_(ctx_args), loc_(loc) {}
+  generic_context(generic_context&&) = default;
+  generic_context(const generic_context&) = delete;
+  void operator=(const generic_context&) = delete;
+
+  constexpr auto arg(int id) const -> basic_format_arg<generic_context> {
+    return args_.get(id);
+  }
+  auto arg(basic_string_view<Char> name) -> basic_format_arg<generic_context> {
+    return args_.get(name);
+  }
+  FMT_CONSTEXPR auto arg_id(basic_string_view<Char> name) -> int {
+    return args_.get_id(name);
+  }
+  auto args() const -> const basic_format_args<generic_context>& {
+    return args_;
+  }
+
+  FMT_CONSTEXPR auto out() -> iterator { return out_; }
+
+  void advance_to(iterator it) {
+    if (!detail::is_back_insert_iterator<iterator>()) out_ = it;
+  }
+
+  FMT_CONSTEXPR auto locale() -> detail::locale_ref { return loc_; }
+};
+
+class loc_value {
+ private:
+  basic_format_arg<format_context> value_;
+
+ public:
+  template <typename T, FMT_ENABLE_IF(!detail::is_float128<T>::value)>
+  loc_value(T value) : value_(detail::make_arg<format_context>(value)) {}
+
+  template <typename T, FMT_ENABLE_IF(detail::is_float128<T>::value)>
+  loc_value(T) {}
+
+  template <typename Visitor> auto visit(Visitor&& vis) -> decltype(vis(0)) {
+    return value_.visit(vis);
+  }
+};
+
+// A locale facet that formats values in UTF-8.
+// It is parameterized on the locale to avoid the heavy  include.
+template <typename Locale> class format_facet : public Locale::facet {
+ private:
+  std::string separator_;
+  std::string grouping_;
+  std::string decimal_point_;
+
+ protected:
+  virtual auto do_put(appender out, loc_value val,
+                      const format_specs& specs) const -> bool;
+
+ public:
+  static FMT_API typename Locale::id id;
+
+  explicit format_facet(Locale& loc);
+  explicit format_facet(string_view sep = "",
+                        std::initializer_list<unsigned char> g = {3},
+                        std::string decimal_point = ".")
+      : separator_(sep.data(), sep.size()),
+        grouping_(g.begin(), g.end()),
+        decimal_point_(decimal_point) {}
+
+  auto put(appender out, loc_value val, const format_specs& specs) const
+      -> bool {
+    return do_put(out, val, specs);
+  }
+};
+
+FMT_END_EXPORT
+
+namespace detail {
+
+// Returns true if value is negative, false otherwise.
+// Same as `value < 0` but doesn't produce warnings if T is an unsigned type.
+template <typename T, FMT_ENABLE_IF(is_signed<T>::value)>
+constexpr auto is_negative(T value) -> bool {
+  return value < 0;
+}
+template <typename T, FMT_ENABLE_IF(!is_signed<T>::value)>
+constexpr auto is_negative(T) -> bool {
+  return false;
+}
+
+template <typename T>
+FMT_CONSTEXPR auto is_supported_floating_point(T) -> bool {
+  if (std::is_same<T, float>()) return FMT_USE_FLOAT;
+  if (std::is_same<T, double>()) return FMT_USE_DOUBLE;
+  if (std::is_same<T, long double>()) return FMT_USE_LONG_DOUBLE;
+  return true;
+}
+
+// Smallest of uint32_t, uint64_t, uint128_t that is large enough to
+// represent all values of an integral type T.
+template <typename T>
+using uint32_or_64_or_128_t =
+    conditional_t<num_bits<T>() <= 32 && !FMT_REDUCE_INT_INSTANTIATIONS,
+                  uint32_t,
+                  conditional_t<num_bits<T>() <= 64, uint64_t, uint128_t>>;
+template <typename T>
+using uint64_or_128_t = conditional_t<num_bits<T>() <= 64, uint64_t, uint128_t>;
+
+#define FMT_POWERS_OF_10(factor)                                  \
+  factor * 10, (factor) * 100, (factor) * 1000, (factor) * 10000, \
+      (factor) * 100000, (factor) * 1000000, (factor) * 10000000, \
+      (factor) * 100000000, (factor) * 1000000000
+
+// Converts value in the range [0, 100) to a string.
+constexpr auto digits2(size_t value) -> const char* {
+  // GCC generates slightly better code when value is pointer-size.
+  return &"0001020304050607080910111213141516171819"
+         "2021222324252627282930313233343536373839"
+         "4041424344454647484950515253545556575859"
+         "6061626364656667686970717273747576777879"
+         "8081828384858687888990919293949596979899"[value * 2];
+}
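+
+// Worked example (illustrative, not from upstream fmt): digits2(47) points at
+// "47" inside the packed "000102...9899" string, i.e. at offset 47 * 2 == 94.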
+
+// Sign is a template parameter to workaround a bug in gcc 4.8.
+template <typename Char, typename Sign> constexpr auto sign(Sign s) -> Char {
+#if !FMT_GCC_VERSION || FMT_GCC_VERSION >= 604
+  static_assert(std::is_same<Sign, sign_t>::value, "");
+#endif
+  return static_cast<Char>(((' ' << 24) | ('+' << 16) | ('-' << 8)) >> (s * 8));
+}
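+
+// Illustrative note (not from upstream fmt): the packed constant above keeps
+// '-', '+' and ' ' in bytes 1, 2 and 3, so shifting right by s * 8 selects
+// '\0', '-', '+' or ' ' for s == 0, 1, 2, 3 (the enumerator order of the sign
+// type from base.h is assumed here).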
+
+template <typename T> FMT_CONSTEXPR auto count_digits_fallback(T n) -> int {
+  int count = 1;
+  for (;;) {
+    // Integer division is slow so do it for a group of four digits instead
+    // of for every digit. The idea comes from the talk by Alexandrescu
+    // "Three Optimization Tips for C++". See speed-test for a comparison.
+    if (n < 10) return count;
+    if (n < 100) return count + 1;
+    if (n < 1000) return count + 2;
+    if (n < 10000) return count + 3;
+    n /= 10000u;
+    count += 4;
+  }
+}
+#if FMT_USE_INT128
+FMT_CONSTEXPR inline auto count_digits(uint128_opt n) -> int {
+  return count_digits_fallback(n);
+}
+#endif
+
+#ifdef FMT_BUILTIN_CLZLL
+// It is a separate function rather than a part of count_digits to workaround
+// the lack of static constexpr in constexpr functions.
+inline auto do_count_digits(uint64_t n) -> int {
+  // This has comparable performance to the version by Kendall Willets
+  // (https://github.com/fmtlib/format-benchmark/blob/master/digits10)
+  // but uses smaller tables.
+  // Maps bsr(n) to ceil(log10(pow(2, bsr(n) + 1) - 1)).
+  static constexpr uint8_t bsr2log10[] = {
+      1,  1,  1,  2,  2,  2,  3,  3,  3,  4,  4,  4,  4,  5,  5,  5,
+      6,  6,  6,  7,  7,  7,  7,  8,  8,  8,  9,  9,  9,  10, 10, 10,
+      10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 15, 15,
+      15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20};
+  auto t = bsr2log10[FMT_BUILTIN_CLZLL(n | 1) ^ 63];
+  static constexpr const uint64_t zero_or_powers_of_10[] = {
+      0, 0, FMT_POWERS_OF_10(1U), FMT_POWERS_OF_10(1000000000ULL),
+      10000000000000000000ULL};
+  return t - (n < zero_or_powers_of_10[t]);
+}
+#endif
+
+// Returns the number of decimal digits in n. Leading zeros are not counted
+// except for n == 0 in which case count_digits returns 1.
+FMT_CONSTEXPR20 inline auto count_digits(uint64_t n) -> int {
+#ifdef FMT_BUILTIN_CLZLL
+  if (!is_constant_evaluated()) return do_count_digits(n);
+#endif
+  return count_digits_fallback(n);
+}
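+
+// Illustrative checks (not from upstream fmt): count_digits(0) == 1,
+// count_digits(99) == 2, count_digits(100) == 3 and
+// count_digits(10000000000000000000ULL) == 20.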
+
+// Counts the number of digits in n. BITS = log2(radix).
+template <int BITS, typename UInt>
+FMT_CONSTEXPR auto count_digits(UInt n) -> int {
+#ifdef FMT_BUILTIN_CLZ
+  if (!is_constant_evaluated() && num_bits<UInt>() == 32)
+    return (FMT_BUILTIN_CLZ(static_cast<uint32_t>(n) | 1) ^ 31) / BITS + 1;
+#endif
+  // Lambda avoids unreachable code warnings from NVHPC.
+  return [](UInt m) {
+    int num_digits = 0;
+    do {
+      ++num_digits;
+    } while ((m >>= BITS) != 0);
+    return num_digits;
+  }(n);
+}
+
+#ifdef FMT_BUILTIN_CLZ
+// It is a separate function rather than a part of count_digits to workaround
+// the lack of static constexpr in constexpr functions.
+FMT_INLINE auto do_count_digits(uint32_t n) -> int {
+// An optimization by Kendall Willets from https://bit.ly/3uOIQrB.
+// This increments the upper 32 bits (log10(T) - 1) when >= T is added.
+#  define FMT_INC(T) (((sizeof(#T) - 1ull) << 32) - T)
+  static constexpr uint64_t table[] = {
+      FMT_INC(0),          FMT_INC(0),          FMT_INC(0),           // 8
+      FMT_INC(10),         FMT_INC(10),         FMT_INC(10),          // 64
+      FMT_INC(100),        FMT_INC(100),        FMT_INC(100),         // 512
+      FMT_INC(1000),       FMT_INC(1000),       FMT_INC(1000),        // 4096
+      FMT_INC(10000),      FMT_INC(10000),      FMT_INC(10000),       // 32k
+      FMT_INC(100000),     FMT_INC(100000),     FMT_INC(100000),      // 256k
+      FMT_INC(1000000),    FMT_INC(1000000),    FMT_INC(1000000),     // 2048k
+      FMT_INC(10000000),   FMT_INC(10000000),   FMT_INC(10000000),    // 16M
+      FMT_INC(100000000),  FMT_INC(100000000),  FMT_INC(100000000),   // 128M
+      FMT_INC(1000000000), FMT_INC(1000000000), FMT_INC(1000000000),  // 1024M
+      FMT_INC(1000000000), FMT_INC(1000000000)                        // 4B
+  };
+  auto inc = table[FMT_BUILTIN_CLZ(n | 1) ^ 31];
+  return static_cast<int>((n + inc) >> 32);
+}
+#endif
+
+// Optional version of count_digits for better performance on 32-bit platforms.
+FMT_CONSTEXPR20 inline auto count_digits(uint32_t n) -> int {
+#ifdef FMT_BUILTIN_CLZ
+  if (!is_constant_evaluated()) {
+    return do_count_digits(n);
+  }
+#endif
+  return count_digits_fallback(n);
+}
+
+template <typename T> constexpr auto digits10() noexcept -> int {
+  return std::numeric_limits<T>::digits10;
+}
+template <> constexpr auto digits10<int128_opt>() noexcept -> int { return 38; }
+template <> constexpr auto digits10<uint128_opt>() noexcept -> int { return 38; }
+
+template <typename Char> struct thousands_sep_result {
+  std::string grouping;
+  Char thousands_sep;
+};
+
+template <typename Char>
+FMT_API auto thousands_sep_impl(locale_ref loc) -> thousands_sep_result<Char>;
+template <typename Char>
+inline auto thousands_sep(locale_ref loc) -> thousands_sep_result<Char> {
+  auto result = thousands_sep_impl<char>(loc);
+  return {result.grouping, Char(result.thousands_sep)};
+}
+template <>
+inline auto thousands_sep(locale_ref loc) -> thousands_sep_result<wchar_t> {
+  return thousands_sep_impl<wchar_t>(loc);
+}
+
+template <typename Char>
+FMT_API auto decimal_point_impl(locale_ref loc) -> Char;
+template <typename Char> inline auto decimal_point(locale_ref loc) -> Char {
+  return Char(decimal_point_impl<char>(loc));
+}
+template <> inline auto decimal_point(locale_ref loc) -> wchar_t {
+  return decimal_point_impl<wchar_t>(loc);
+}
+
+// Compares two characters for equality.
+template <typename Char> auto equal2(const Char* lhs, const char* rhs) -> bool {
+  return lhs[0] == Char(rhs[0]) && lhs[1] == Char(rhs[1]);
+}
+inline auto equal2(const char* lhs, const char* rhs) -> bool {
+  return memcmp(lhs, rhs, 2) == 0;
+}
+
+// Copies two characters from src to dst.
+template <typename Char>
+FMT_CONSTEXPR20 FMT_INLINE void copy2(Char* dst, const char* src) {
+  if (!is_constant_evaluated() && sizeof(Char) == sizeof(char)) {
+    memcpy(dst, src, 2);
+    return;
+  }
+  *dst++ = static_cast<Char>(*src++);
+  *dst = static_cast<Char>(*src);
+}
+
+template <typename Iterator> struct format_decimal_result {
+  Iterator begin;
+  Iterator end;
+};
+
+// Formats a decimal unsigned integer value writing into out pointing to a
+// buffer of specified size. The caller must ensure that the buffer is large
+// enough.
+template <typename Char, typename UInt>
+FMT_CONSTEXPR20 auto format_decimal(Char* out, UInt value, int size)
+    -> format_decimal_result<Char*> {
+  FMT_ASSERT(size >= count_digits(value), "invalid digit count");
+  out += size;
+  Char* end = out;
+  while (value >= 100) {
+    // Integer division is slow so do it for a group of two digits instead
+    // of for every digit. The idea comes from the talk by Alexandrescu
+    // "Three Optimization Tips for C++". See speed-test for a comparison.
+    out -= 2;
+    copy2(out, digits2(static_cast<size_t>(value % 100)));
+    value /= 100;
+  }
+  if (value < 10) {
+    *--out = static_cast<Char>('0' + value);
+    return {out, end};
+  }
+  out -= 2;
+  copy2(out, digits2(static_cast<size_t>(value)));
+  return {out, end};
+}
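+
+// Usage sketch (illustrative, not part of upstream fmt):
+//
+//   char buf[4];
+//   auto r = format_decimal(buf, 4321u, 4);  // buf now holds "4321"
+//   // r.begin == buf, r.end == buf + 4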
+
+template <typename Char, typename UInt, typename Iterator,
+          FMT_ENABLE_IF(!std::is_pointer<remove_reference_t<Iterator>>::value)>
+FMT_CONSTEXPR inline auto format_decimal(Iterator out, UInt value, int size)
+    -> format_decimal_result<Iterator> {
+  // Buffer is large enough to hold all digits (digits10 + 1).
+  Char buffer[digits10<UInt>() + 1] = {};
+  auto end = format_decimal(buffer, value, size).end;
+  return {out, detail::copy_noinline<Char>(buffer, end, out)};
+}
+
+template <unsigned BASE_BITS, typename Char, typename UInt>
+FMT_CONSTEXPR auto format_uint(Char* buffer, UInt value, int num_digits,
+                               bool upper = false) -> Char* {
+  buffer += num_digits;
+  Char* end = buffer;
+  do {
+    const char* digits = upper ? "0123456789ABCDEF" : "0123456789abcdef";
+    unsigned digit = static_cast<unsigned>(value & ((1 << BASE_BITS) - 1));
+    *--buffer = static_cast<Char>(BASE_BITS < 4 ? static_cast<char>('0' + digit)
+                                                : digits[digit]);
+  } while ((value >>= BASE_BITS) != 0);
+  return end;
+}
+
+template <unsigned BASE_BITS, typename Char, typename It, typename UInt>
+FMT_CONSTEXPR inline auto format_uint(It out, UInt value, int num_digits,
+                                      bool upper = false) -> It {
+  if (auto ptr = to_pointer<Char>(out, to_unsigned(num_digits))) {
+    format_uint<BASE_BITS>(ptr, value, num_digits, upper);
+    return out;
+  }
+  // Buffer should be large enough to hold all digits (digits / BASE_BITS + 1).
+  char buffer[num_bits<UInt>() / BASE_BITS + 1] = {};
+  format_uint<BASE_BITS>(buffer, value, num_digits, upper);
+  return detail::copy_noinline<Char>(buffer, buffer + num_digits, out);
+}
+
+// A converter from UTF-8 to UTF-16.
+class utf8_to_utf16 {
+ private:
+  basic_memory_buffer<wchar_t> buffer_;
+
+ public:
+  FMT_API explicit utf8_to_utf16(string_view s);
+  operator basic_string_view<wchar_t>() const { return {&buffer_[0], size()}; }
+  auto size() const -> size_t { return buffer_.size() - 1; }
+  auto c_str() const -> const wchar_t* { return &buffer_[0]; }
+  auto str() const -> std::wstring { return {&buffer_[0], size()}; }
+};
+
+enum class to_utf8_error_policy { abort, replace };
+
+// A converter from UTF-16/UTF-32 (host endian) to UTF-8.
+template <typename WChar, typename Buffer = memory_buffer> class to_utf8 {
+ private:
+  Buffer buffer_;
+
+ public:
+  to_utf8() {}
+  explicit to_utf8(basic_string_view<WChar> s,
+                   to_utf8_error_policy policy = to_utf8_error_policy::abort) {
+    static_assert(sizeof(WChar) == 2 || sizeof(WChar) == 4,
+                  "Expect utf16 or utf32");
+    if (!convert(s, policy))
+      FMT_THROW(std::runtime_error(sizeof(WChar) == 2 ? "invalid utf16"
+                                                      : "invalid utf32"));
+  }
+  operator string_view() const { return string_view(&buffer_[0], size()); }
+  auto size() const -> size_t { return buffer_.size() - 1; }
+  auto c_str() const -> const char* { return &buffer_[0]; }
+  auto str() const -> std::string { return std::string(&buffer_[0], size()); }
+
+  // Performs conversion returning a bool instead of throwing exception on
+  // conversion error. This method may still throw in case of memory allocation
+  // error.
+  auto convert(basic_string_view<WChar> s,
+               to_utf8_error_policy policy = to_utf8_error_policy::abort)
+      -> bool {
+    if (!convert(buffer_, s, policy)) return false;
+    buffer_.push_back(0);
+    return true;
+  }
+  static auto convert(Buffer& buf, basic_string_view<WChar> s,
+                      to_utf8_error_policy policy = to_utf8_error_policy::abort)
+      -> bool {
+    for (auto p = s.begin(); p != s.end(); ++p) {
+      uint32_t c = static_cast<uint32_t>(*p);
+      if (sizeof(WChar) == 2 && c >= 0xd800 && c <= 0xdfff) {
+        // Handle a surrogate pair.
+        ++p;
+        if (p == s.end() || (c & 0xfc00) != 0xd800 || (*p & 0xfc00) != 0xdc00) {
+          if (policy == to_utf8_error_policy::abort) return false;
+          buf.append(string_view("\xEF\xBF\xBD"));
+          --p;
+        } else {
+          c = (c << 10) + static_cast<uint32_t>(*p) - 0x35fdc00;
+        }
+      } else if (c < 0x80) {
+        buf.push_back(static_cast<char>(c));
+      } else if (c < 0x800) {
+        buf.push_back(static_cast<char>(0xc0 | (c >> 6)));
+        buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
+      } else if ((c >= 0x800 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xffff)) {
+        buf.push_back(static_cast<char>(0xe0 | (c >> 12)));
+        buf.push_back(static_cast<char>(0x80 | ((c & 0xfff) >> 6)));
+        buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
+      } else if (c >= 0x10000 && c <= 0x10ffff) {
+        buf.push_back(static_cast<char>(0xf0 | (c >> 18)));
+        buf.push_back(static_cast<char>(0x80 | ((c & 0x3ffff) >> 12)));
+        buf.push_back(static_cast<char>(0x80 | ((c & 0xfff) >> 6)));
+        buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
+      } else {
+        return false;
+      }
+    }
+    return true;
+  }
+};
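+
+// Illustrative example (not from upstream fmt): converting the UTF-16
+// surrogate pair 0xD83D 0xDE00 (U+1F600) yields the UTF-8 bytes F0 9F 98 80;
+// a lone surrogate either aborts the conversion or is replaced by U+FFFD,
+// depending on the chosen to_utf8_error_policy.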
+
+// Computes 128-bit result of multiplication of two 64-bit unsigned integers.
+inline auto umul128(uint64_t x, uint64_t y) noexcept -> uint128_fallback {
+#if FMT_USE_INT128
+  auto p = static_cast<uint128_opt>(x) * static_cast<uint128_opt>(y);
+  return {static_cast<uint64_t>(p >> 64), static_cast<uint64_t>(p)};
+#elif defined(_MSC_VER) && defined(_M_X64)
+  auto hi = uint64_t();
+  auto lo = _umul128(x, y, &hi);
+  return {hi, lo};
+#else
+  const uint64_t mask = static_cast<uint64_t>(max_value<uint32_t>());
+
+  uint64_t a = x >> 32;
+  uint64_t b = x & mask;
+  uint64_t c = y >> 32;
+  uint64_t d = y & mask;
+
+  uint64_t ac = a * c;
+  uint64_t bc = b * c;
+  uint64_t ad = a * d;
+  uint64_t bd = b * d;
+
+  uint64_t intermediate = (bd >> 32) + (ad & mask) + (bc & mask);
+
+  return {ac + (intermediate >> 32) + (ad >> 32) + (bc >> 32),
+          (intermediate << 32) + (bd & mask)};
+#endif
+}
+
+namespace dragonbox {
+// Computes floor(log10(pow(2, e))) for e in [-2620, 2620] using the method from
+// https://fmt.dev/papers/Dragonbox.pdf#page=28, section 6.1.
+inline auto floor_log10_pow2(int e) noexcept -> int {
+  FMT_ASSERT(e <= 2620 && e >= -2620, "too large exponent");
+  static_assert((-1 >> 1) == -1, "right shift is not arithmetic");
+  return (e * 315653) >> 20;
+}
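+
+// Worked example (illustrative, not from upstream fmt): for e == 10,
+// (10 * 315653) >> 20 == 3, matching floor(log10(2^10)) == floor(log10(1024))
+// == 3; the arithmetic shift keeps negative inputs correct too, e.g. e == -10
+// gives -4.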
+
+inline auto floor_log2_pow10(int e) noexcept -> int {
+  FMT_ASSERT(e <= 1233 && e >= -1233, "too large exponent");
+  return (e * 1741647) >> 19;
+}
+
+// Computes upper 64 bits of multiplication of two 64-bit unsigned integers.
+inline auto umul128_upper64(uint64_t x, uint64_t y) noexcept -> uint64_t {
+#if FMT_USE_INT128
+  auto p = static_cast<uint128_opt>(x) * static_cast<uint128_opt>(y);
+  return static_cast<uint64_t>(p >> 64);
+#elif defined(_MSC_VER) && defined(_M_X64)
+  return __umulh(x, y);
+#else
+  return umul128(x, y).high();
+#endif
+}
+
+// Computes upper 128 bits of multiplication of a 64-bit unsigned integer and a
+// 128-bit unsigned integer.
+inline auto umul192_upper128(uint64_t x, uint128_fallback y) noexcept
+    -> uint128_fallback {
+  uint128_fallback r = umul128(x, y.high());
+  r += umul128_upper64(x, y.low());
+  return r;
+}
+
+FMT_API auto get_cached_power(int k) noexcept -> uint128_fallback;
+
+// Type-specific information that Dragonbox uses.
+template <typename T, typename Enable = void> struct float_info;
+
+template <> struct float_info<float> {
+  using carrier_uint = uint32_t;
+  static const int exponent_bits = 8;
+  static const int kappa = 1;
+  static const int big_divisor = 100;
+  static const int small_divisor = 10;
+  static const int min_k = -31;
+  static const int max_k = 46;
+  static const int shorter_interval_tie_lower_threshold = -35;
+  static const int shorter_interval_tie_upper_threshold = -35;
+};
+
+template <> struct float_info<double> {
+  using carrier_uint = uint64_t;
+  static const int exponent_bits = 11;
+  static const int kappa = 2;
+  static const int big_divisor = 1000;
+  static const int small_divisor = 100;
+  static const int min_k = -292;
+  static const int max_k = 341;
+  static const int shorter_interval_tie_lower_threshold = -77;
+  static const int shorter_interval_tie_upper_threshold = -77;
+};
+
+// An 80- or 128-bit floating point number.
+template <typename T>
+struct float_info<T, enable_if_t<std::numeric_limits<T>::digits == 64 ||
+                                 std::numeric_limits<T>::digits == 113 ||
+                                 is_float128<T>::value>> {
+  using carrier_uint = detail::uint128_t;
+  static const int exponent_bits = 15;
+};
+
+// A double-double floating point number.
+template <typename T>
+struct float_info<T, enable_if_t<is_double_double<T>::value>> {
+  using carrier_uint = detail::uint128_t;
+};
+
+template <typename T> struct decimal_fp {
+  using significand_type = typename float_info<T>::carrier_uint;
+  significand_type significand;
+  int exponent;
+};
+
+template <typename T> FMT_API auto to_decimal(T x) noexcept -> decimal_fp<T>;
+}  // namespace dragonbox
+
+// Returns true iff Float has the implicit bit which is not stored.
+template <typename Float> constexpr auto has_implicit_bit() -> bool {
+  // An 80-bit FP number has a 64-bit significand and no implicit bit.
+  return std::numeric_limits<Float>::digits != 64;
+}
+
+// Returns the number of significand bits stored in Float. The implicit bit is
+// not counted since it is not stored.
+template <typename Float> constexpr auto num_significand_bits() -> int {
+  // std::numeric_limits may not support __float128.
+  return is_float128<Float>() ? 112
+                              : (std::numeric_limits<Float>::digits -
+                                 (has_implicit_bit<Float>() ? 1 : 0));
+}
+
+template <typename Float>
+constexpr auto exponent_mask() ->
+    typename dragonbox::float_info<Float>::carrier_uint {
+  using float_uint = typename dragonbox::float_info<Float>::carrier_uint;
+  return ((float_uint(1) << dragonbox::float_info<Float>::exponent_bits) - 1)
+         << num_significand_bits<Float>();
+}
+template <typename Float> constexpr auto exponent_bias() -> int {
+  // std::numeric_limits may not support __float128.
+  return is_float128<Float>() ? 16383
+                              : std::numeric_limits<Float>::max_exponent - 1;
+}
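+
+// Example: for double, exponent_mask<double>() == 0x7ff0000000000000 (11
+// exponent bits shifted past the 52 stored significand bits) and
+// exponent_bias<double>() == 1023.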
+
+// Writes the exponent exp in the form "[+-]d{2,3}" to buffer.
+template <typename Char, typename It>
+FMT_CONSTEXPR auto write_exponent(int exp, It it) -> It {
+  FMT_ASSERT(-10000 < exp && exp < 10000, "exponent out of range");
+  if (exp < 0) {
+    *it++ = static_cast<Char>('-');
+    exp = -exp;
+  } else {
+    *it++ = static_cast<Char>('+');
+  }
+  if (exp >= 100) {
+    const char* top = digits2(to_unsigned(exp / 100));
+    if (exp >= 1000) *it++ = static_cast<Char>(top[0]);
+    *it++ = static_cast<Char>(top[1]);
+    exp %= 100;
+  }
+  const char* d = digits2(to_unsigned(exp));
+  *it++ = static_cast<Char>(d[0]);
+  *it++ = static_cast<Char>(d[1]);
+  return it;
+}
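+
+// Example: write_exponent always emits a sign and at least two digits, so
+// -5 becomes "-05", 308 becomes "+308" and 4932 becomes "+4932".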
+
+// A floating-point number f * pow(2, e) where F is an unsigned type.
+template <typename F> struct basic_fp {
+  F f;
+  int e;
+
+  static constexpr const int num_significand_bits =
+      static_cast<int>(sizeof(F) * num_bits<unsigned char>());
+
+  constexpr basic_fp() : f(0), e(0) {}
+  constexpr basic_fp(uint64_t f_val, int e_val) : f(f_val), e(e_val) {}
+
+  // Constructs fp from an IEEE754 floating-point number.
+  template <typename Float> FMT_CONSTEXPR basic_fp(Float n) { assign(n); }
+
+  // Assigns n to this and returns true iff predecessor is closer than
+  // successor.
+  template <typename Float, FMT_ENABLE_IF(!is_double_double<Float>::value)>
+  FMT_CONSTEXPR auto assign(Float n) -> bool {
+    static_assert(std::numeric_limits<Float>::digits <= 113, "unsupported FP");
+    // Assume Float is in the format [sign][exponent][significand].
+    using carrier_uint = typename dragonbox::float_info<Float>::carrier_uint;
+    const auto num_float_significand_bits =
+        detail::num_significand_bits<Float>();
+    const auto implicit_bit = carrier_uint(1) << num_float_significand_bits;
+    const auto significand_mask = implicit_bit - 1;
+    auto u = bit_cast<carrier_uint>(n);
+    f = static_cast<F>(u & significand_mask);
+    auto biased_e = static_cast<int>((u & exponent_mask<Float>()) >>
+                                     num_float_significand_bits);
+    // The predecessor is closer if n is a normalized power of 2 (f == 0)
+    // other than the smallest normalized number (biased_e > 1).
+    auto is_predecessor_closer = f == 0 && biased_e > 1;
+    if (biased_e == 0)
+      biased_e = 1;  // Subnormals use biased exponent 1 (min exponent).
+    else if (has_implicit_bit<Float>())
+      f += static_cast<F>(implicit_bit);
+    e = biased_e - exponent_bias<Float>() - num_float_significand_bits;
+    if (!has_implicit_bit<Float>()) ++e;
+    return is_predecessor_closer;
+  }
+
+  template <typename Float, FMT_ENABLE_IF(is_double_double<Float>::value)>
+  FMT_CONSTEXPR auto assign(Float n) -> bool {
+    static_assert(std::numeric_limits<double>::is_iec559, "unsupported FP");
+    return assign(static_cast<double>(n));
+  }
+};
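+
+// Example: assigning 1.0 (double) gives f == 0 and biased_e == 1023, so the
+// implicit bit makes f == 2^52 and e == 1023 - 1023 - 52 == -52, i.e. the
+// value is stored as 2^52 * 2^-52; is_predecessor_closer is true because 1.0
+// is a power of two above the smallest normal number.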
+
+using fp = basic_fp<unsigned long long>;
+
+// Normalizes the value converted from double and multiplied by (1 << SHIFT).
+template <int SHIFT = 0, typename F>
+FMT_CONSTEXPR auto normalize(basic_fp<F> value) -> basic_fp<F> {
+  // Handle subnormals.
+  const auto implicit_bit = F(1) << num_significand_bits<double>();
+  const auto shifted_implicit_bit = implicit_bit << SHIFT;
+  while ((value.f & shifted_implicit_bit) == 0) {
+    value.f <<= 1;
+    --value.e;
+  }
+  // Subtract 1 to account for hidden bit.
+  const auto offset = basic_fp<F>::num_significand_bits -
+                      num_significand_bits<double>() - SHIFT - 1;
+  value.f <<= offset;
+  value.e -= offset;
+  return value;
+}
+
+// Computes lhs * rhs / pow(2, 64) rounded to nearest with half-up tie breaking.
+FMT_CONSTEXPR inline auto multiply(uint64_t lhs, uint64_t rhs) -> uint64_t {
+#if FMT_USE_INT128
+  auto product = static_cast<__uint128_t>(lhs) * rhs;
+  auto f = static_cast<uint64_t>(product >> 64);
+  return (static_cast<uint64_t>(product) & (1ULL << 63)) != 0 ? f + 1 : f;
+#else
+  // Multiply 32-bit parts of significands.
+  uint64_t mask = (1ULL << 32) - 1;
+  uint64_t a = lhs >> 32, b = lhs & mask;
+  uint64_t c = rhs >> 32, d = rhs & mask;
+  uint64_t ac = a * c, bc = b * c, ad = a * d, bd = b * d;
+  // Compute mid 64-bit of result and round.
+  uint64_t mid = (bd >> 32) + (ad & mask) + (bc & mask) + (1U << 31);
+  return ac + (ad >> 32) + (bc >> 32) + (mid >> 32);
+#endif
+}
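+
+// Note: in the fallback branch the operands are split into 32-bit halves and
+// (1U << 31) is added to the middle partial sum before its low 32 bits are
+// discarded, which implements the round-to-nearest (half-up) behavior.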
+
+FMT_CONSTEXPR inline auto operator*(fp x, fp y) -> fp {
+  return {multiply(x.f, y.f), x.e + y.e + 64};
+}
+
+template <typename T, bool doublish = num_bits<T>() == num_bits<double>()>
+using convert_float_result =
+    conditional_t<std::is_same<T, float>::value || doublish, double, T>;
+
+template <typename T>
+constexpr auto convert_float(T value) -> convert_float_result<T> {
+  return static_cast<convert_float_result<T>>(value);
+}
+
+template <typename OutputIt, typename Char>
+FMT_NOINLINE FMT_CONSTEXPR auto fill(OutputIt it, size_t n, const fill_t& fill)
+    -> OutputIt {
+  auto fill_size = fill.size();
+  if (fill_size == 1) return detail::fill_n(it, n, fill.template get<Char>());
+  if (const Char* data = fill.template data<Char>()) {
+    for (size_t i = 0; i < n; ++i) it = copy<Char>(data, data + fill_size, it);
+  }
+  return it;
+}
+
+// Writes the output of f, padded according to format specifications in specs.
+// size: output size in code units.
+// width: output display width in (terminal) column positions.
+template <typename Char, align::type align = align::left, typename OutputIt,
+          typename F>
+FMT_CONSTEXPR auto write_padded(OutputIt out, const format_specs& specs,
+                                size_t size, size_t width, F&& f) -> OutputIt {
+  static_assert(align == align::left || align == align::right, "");
+  unsigned spec_width = to_unsigned(specs.width);
+  size_t padding = spec_width > width ? spec_width - width : 0;
+  // Shifts are encoded as string literals because static constexpr is not
+  // supported in constexpr functions.
+  auto* shifts = align == align::left ? "\x1f\x1f\x00\x01" : "\x00\x1f\x00\x01";
+  size_t left_padding = padding >> shifts[specs.align];
+  size_t right_padding = padding - left_padding;
+  auto it = reserve(out, size + padding * specs.fill.size());
+  if (left_padding != 0) it = fill(it, left_padding, specs.fill);
+  it = f(it);
+  if (right_padding != 0) it = fill(it, right_padding, specs.fill);
+  return base_iterator(out, it);
+}
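+
+// Note: the shift table maps specs.align to the amount of padding placed on
+// the left: a shift of 0x1f yields 0 (all padding on the right, i.e.
+// left-aligned output), a shift of 0 puts all padding on the left
+// (right-aligned) and a shift of 1 splits it in half for centering.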
+
+template <typename Char, typename OutputIt, typename F>
+constexpr auto write_padded(OutputIt out, const format_specs& specs,
+                            size_t size, F&& f) -> OutputIt {
+  return write_padded<Char>(out, specs, size, size, f);
+}
+
+template <typename Char, typename OutputIt>
+FMT_CONSTEXPR auto write_bytes(OutputIt out, string_view bytes,
+                               const format_specs& specs = {}) -> OutputIt {
+  return write_padded<Char>(
+      out, specs, bytes.size(), [bytes](reserve_iterator<OutputIt> it) {
+        const char* data = bytes.data();
+        return copy<Char>(data, data + bytes.size(), it);
+      });
+}
+
+template <typename Char, typename OutputIt, typename UIntPtr>
+auto write_ptr(OutputIt out, UIntPtr value, const format_specs* specs)
+    -> OutputIt {
+  int num_digits = count_digits<4>(value);
+  auto size = to_unsigned(num_digits) + size_t(2);
+  auto write = [=](reserve_iterator<OutputIt> it) {
+    *it++ = static_cast<Char>('0');
+    *it++ = static_cast<Char>('x');
+    return format_uint<4, Char>(it, value, num_digits);
+  };
+  return specs ? write_padded<Char, align::right>(out, *specs, size, write)
+               : base_iterator(out, write(reserve(out, size)));
+}
+
+// Returns true iff the code point cp is printable.
+FMT_API auto is_printable(uint32_t cp) -> bool;
+
+inline auto needs_escape(uint32_t cp) -> bool {
+  return cp < 0x20 || cp == 0x7f || cp == '"' || cp == '\\' ||
+         !is_printable(cp);
+}
+
+template <typename Char> struct find_escape_result {
+  const Char* begin;
+  const Char* end;
+  uint32_t cp;
+};
+
+template <typename Char>
+auto find_escape(const Char* begin, const Char* end)
+    -> find_escape_result<Char> {
+  for (; begin != end; ++begin) {
+    uint32_t cp = static_cast>(*begin);
+    if (const_check(sizeof(Char) == 1) && cp >= 0x80) continue;
+    if (needs_escape(cp)) return {begin, begin + 1, cp};
+  }
+  return {begin, nullptr, 0};
+}
+
+inline auto find_escape(const char* begin, const char* end)
+    -> find_escape_result<char> {
+  if (!use_utf8()) return find_escape<char>(begin, end);
+  auto result = find_escape_result<char>{end, nullptr, 0};
+  for_each_codepoint(string_view(begin, to_unsigned(end - begin)),
+                     [&](uint32_t cp, string_view sv) {
+                       if (needs_escape(cp)) {
+                         result = {sv.begin(), sv.end(), cp};
+                         return false;
+                       }
+                       return true;
+                     });
+  return result;
+}
+
+#define FMT_STRING_IMPL(s, base, explicit)                                    \
+  [] {                                                                        \
+    /* Use the hidden visibility as a workaround for a GCC bug (#1973). */    \
+    /* Use a macro-like name to avoid shadowing warnings. */                  \
+    struct FMT_VISIBILITY("hidden") FMT_COMPILE_STRING : base {               \
+      using char_type FMT_MAYBE_UNUSED = fmt::remove_cvref_t; \
+      FMT_MAYBE_UNUSED FMT_CONSTEXPR explicit                                 \
+      operator fmt::basic_string_view() const {                    \
+        return fmt::detail_exported::compile_string_to_view(s);    \
+      }                                                                       \
+    };                                                                        \
+    return FMT_COMPILE_STRING();                                              \
+  }()
+
+/**
+ * Constructs a compile-time format string from a string literal `s`.
+ *
+ * **Example**:
+ *
+ *     // A compile-time error because 'd' is an invalid specifier for strings.
+ *     std::string s = fmt::format(FMT_STRING("{:d}"), "foo");
+ */
+#define FMT_STRING(s) FMT_STRING_IMPL(s, fmt::detail::compile_string, )
+
+template <size_t width, typename Char, typename OutputIt>
+auto write_codepoint(OutputIt out, char prefix, uint32_t cp) -> OutputIt {
+  *out++ = static_cast<Char>('\\');
+  *out++ = static_cast<Char>(prefix);
+  Char buf[width];
+  fill_n(buf, width, static_cast<Char>('0'));
+  format_uint<4>(buf, cp, width);
+  return copy<Char>(buf, buf + width, out);
+}
+
+template <typename OutputIt, typename Char>
+auto write_escaped_cp(OutputIt out, const find_escape_result<Char>& escape)
+    -> OutputIt {
+  auto c = static_cast<Char>(escape.cp);
+  switch (escape.cp) {
+  case '\n':
+    *out++ = static_cast('\\');
+    c = static_cast('n');
+    break;
+  case '\r':
+    *out++ = static_cast('\\');
+    c = static_cast('r');
+    break;
+  case '\t':
+    *out++ = static_cast('\\');
+    c = static_cast('t');
+    break;
+  case '"':
+    FMT_FALLTHROUGH;
+  case '\'':
+    FMT_FALLTHROUGH;
+  case '\\':
+    *out++ = static_cast('\\');
+    break;
+  default:
+    if (escape.cp < 0x100) return write_codepoint<2, Char>(out, 'x', escape.cp);
+    if (escape.cp < 0x10000)
+      return write_codepoint<4, Char>(out, 'u', escape.cp);
+    if (escape.cp < 0x110000)
+      return write_codepoint<8, Char>(out, 'U', escape.cp);
+    for (Char escape_char : basic_string_view(
+             escape.begin, to_unsigned(escape.end - escape.begin))) {
+      out = write_codepoint<2, Char>(out, 'x',
+                                     static_cast(escape_char) & 0xFF);
+    }
+    return out;
+  }
+  *out++ = c;
+  return out;
+}
+
+template <typename Char, typename OutputIt>
+auto write_escaped_string(OutputIt out, basic_string_view<Char> str)
+    -> OutputIt {
+  *out++ = static_cast<Char>('"');
+  auto begin = str.begin(), end = str.end();
+  do {
+    auto escape = find_escape(begin, end);
+    out = copy(begin, escape.begin, out);
+    begin = escape.end;
+    if (!begin) break;
+    out = write_escaped_cp(out, escape);
+  } while (begin != end);
+  *out++ = static_cast('"');
+  return out;
+}
+
+template 
+auto write_escaped_char(OutputIt out, Char v) -> OutputIt {
+  Char v_array[1] = {v};
+  *out++ = static_cast('\'');
+  if ((needs_escape(static_cast(v)) && v != static_cast('"')) ||
+      v == static_cast('\'')) {
+    out = write_escaped_cp(out,
+                           find_escape_result{v_array, v_array + 1,
+                                                    static_cast(v)});
+  } else {
+    *out++ = v;
+  }
+  *out++ = static_cast('\'');
+  return out;
+}
+
+template 
+FMT_CONSTEXPR auto write_char(OutputIt out, Char value,
+                              const format_specs& specs) -> OutputIt {
+  bool is_debug = specs.type == presentation_type::debug;
+  return write_padded(out, specs, 1, [=](reserve_iterator it) {
+    if (is_debug) return write_escaped_char(it, value);
+    *it++ = value;
+    return it;
+  });
+}
+template 
+FMT_CONSTEXPR auto write(OutputIt out, Char value, const format_specs& specs,
+                         locale_ref loc = {}) -> OutputIt {
+  // char is formatted as unsigned char for consistency across platforms.
+  using unsigned_type =
+      conditional_t::value, unsigned char, unsigned>;
+  return check_char_specs(specs)
+             ? write_char(out, value, specs)
+             : write(out, static_cast(value), specs, loc);
+}
+
+// Data for write_int that doesn't depend on output iterator type. It is used to
+// avoid template code bloat.
+template <typename Char> struct write_int_data {
+  size_t size;
+  size_t padding;
+
+  FMT_CONSTEXPR write_int_data(int num_digits, unsigned prefix,
+                               const format_specs& specs)
+      : size((prefix >> 24) + to_unsigned(num_digits)), padding(0) {
+    if (specs.align == align::numeric) {
+      auto width = to_unsigned(specs.width);
+      if (width > size) {
+        padding = width - size;
+        size = width;
+      }
+    } else if (specs.precision > num_digits) {
+      size = (prefix >> 24) + to_unsigned(specs.precision);
+      padding = to_unsigned(specs.precision - num_digits);
+    }
+  }
+};
+
+// Writes an integer in the format
+//   <left-padding><prefix><numeric-padding><digits><right-padding>
+// where <digits> are written by write_digits(it).
+// prefix contains chars in three lower bytes and the size in the fourth byte.
+template <typename Char, typename OutputIt, typename W>
+FMT_CONSTEXPR FMT_INLINE auto write_int(OutputIt out, int num_digits,
+                                        unsigned prefix,
+                                        const format_specs& specs,
+                                        W write_digits) -> OutputIt {
+  // Slightly faster check for specs.width == 0 && specs.precision == -1.
+  if ((specs.width | (specs.precision + 1)) == 0) {
+    auto it = reserve(out, to_unsigned(num_digits) + (prefix >> 24));
+    if (prefix != 0) {
+      for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8)
+        *it++ = static_cast(p & 0xff);
+    }
+    return base_iterator(out, write_digits(it));
+  }
+  auto data = write_int_data(num_digits, prefix, specs);
+  return write_padded(
+      out, specs, data.size, [=](reserve_iterator it) {
+        for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8)
+          *it++ = static_cast(p & 0xff);
+        it = detail::fill_n(it, data.padding, static_cast('0'));
+        return write_digits(it);
+      });
+}
+
+template <typename Char> class digit_grouping {
+ private:
+  std::string grouping_;
+  std::basic_string<Char> thousands_sep_;
+
+  struct next_state {
+    std::string::const_iterator group;
+    int pos;
+  };
+  auto initial_state() const -> next_state { return {grouping_.begin(), 0}; }
+
+  // Returns the next digit group separator position.
+  auto next(next_state& state) const -> int {
+    if (thousands_sep_.empty()) return max_value<int>();
+    if (state.group == grouping_.end()) return state.pos += grouping_.back();
+    if (*state.group <= 0 || *state.group == max_value<char>())
+      return max_value<int>();
+    state.pos += *state.group++;
+    return state.pos;
+  }
+
+ public:
+  explicit digit_grouping(locale_ref loc, bool localized = true) {
+    if (!localized) return;
+    auto sep = thousands_sep(loc);
+    grouping_ = sep.grouping;
+    if (sep.thousands_sep) thousands_sep_.assign(1, sep.thousands_sep);
+  }
+  digit_grouping(std::string grouping, std::basic_string<Char> sep)
+      : grouping_(std::move(grouping)), thousands_sep_(std::move(sep)) {}
+
+  auto has_separator() const -> bool { return !thousands_sep_.empty(); }
+
+  auto count_separators(int num_digits) const -> int {
+    int count = 0;
+    auto state = initial_state();
+    while (num_digits > next(state)) ++count;
+    return count;
+  }
+
+  // Applies grouping to digits and writes the output to out.
+  template <typename Out, typename C>
+  auto apply(Out out, basic_string_view<C> digits) const -> Out {
+    auto num_digits = static_cast<int>(digits.size());
+    auto separators = basic_memory_buffer<int>();
+    separators.push_back(0);
+    auto state = initial_state();
+    while (int i = next(state)) {
+      if (i >= num_digits) break;
+      separators.push_back(i);
+    }
+    for (int i = 0, sep_index = static_cast(separators.size() - 1);
+         i < num_digits; ++i) {
+      if (num_digits - i == separators[sep_index]) {
+        out = copy(thousands_sep_.data(),
+                         thousands_sep_.data() + thousands_sep_.size(), out);
+        --sep_index;
+      }
+      *out++ = static_cast(digits[to_unsigned(i)]);
+    }
+    return out;
+  }
+};
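+
+// Example: with grouping "\3" and separator "," apply() turns "1234567" into
+// "1,234,567"; an empty separator disables grouping since next() then always
+// returns max_value<int>().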
+
+FMT_CONSTEXPR inline void prefix_append(unsigned& prefix, unsigned value) {
+  prefix |= prefix != 0 ? value << 8 : value;
+  prefix += (1u + (value > 0xff ? 1 : 0)) << 24;
+}
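+
+// Example: the prefix word holds up to three prefix characters in its low
+// bytes and their count in the top byte. Starting from 0x01000000 | '-',
+// appending ('x' << 8) | '0' for the alternate hex form yields 0x0378302d:
+// count 3 with '-', '0', 'x' written from the least significant byte up.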
+
+// Writes a decimal integer with digit grouping.
+template <typename OutputIt, typename UInt, typename Char>
+auto write_int(OutputIt out, UInt value, unsigned prefix,
+               const format_specs& specs, const digit_grouping<Char>& grouping)
+    -> OutputIt {
+  static_assert(std::is_same<uint64_or_128_t<UInt>, UInt>::value, "");
+  int num_digits = 0;
+  auto buffer = memory_buffer();
+  switch (specs.type) {
+  default:
+    FMT_ASSERT(false, "");
+    FMT_FALLTHROUGH;
+  case presentation_type::none:
+  case presentation_type::dec:
+    num_digits = count_digits(value);
+    format_decimal(appender(buffer), value, num_digits);
+    break;
+  case presentation_type::hex:
+    if (specs.alt)
+      prefix_append(prefix, unsigned(specs.upper ? 'X' : 'x') << 8 | '0');
+    num_digits = count_digits<4>(value);
+    format_uint<4, char>(appender(buffer), value, num_digits, specs.upper);
+    break;
+  case presentation_type::oct:
+    num_digits = count_digits<3>(value);
+    // Octal prefix '0' is counted as a digit, so only add it if precision
+    // is not greater than the number of digits.
+    if (specs.alt && specs.precision <= num_digits && value != 0)
+      prefix_append(prefix, '0');
+    format_uint<3, char>(appender(buffer), value, num_digits);
+    break;
+  case presentation_type::bin:
+    if (specs.alt)
+      prefix_append(prefix, unsigned(specs.upper ? 'B' : 'b') << 8 | '0');
+    num_digits = count_digits<1>(value);
+    format_uint<1, char>(appender(buffer), value, num_digits);
+    break;
+  case presentation_type::chr:
+    return write_char(out, static_cast(value), specs);
+  }
+
+  unsigned size = (prefix != 0 ? prefix >> 24 : 0) + to_unsigned(num_digits) +
+                  to_unsigned(grouping.count_separators(num_digits));
+  return write_padded(
+      out, specs, size, size, [&](reserve_iterator it) {
+        for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8)
+          *it++ = static_cast(p & 0xff);
+        return grouping.apply(it, string_view(buffer.data(), buffer.size()));
+      });
+}
+
+// Writes a localized value.
+FMT_API auto write_loc(appender out, loc_value value, const format_specs& specs,
+                       locale_ref loc) -> bool;
+template 
+inline auto write_loc(OutputIt, loc_value, const format_specs&, locale_ref)
+    -> bool {
+  return false;
+}
+
+template <typename UInt> struct write_int_arg {
+  UInt abs_value;
+  unsigned prefix;
+};
+
+template <typename T>
+FMT_CONSTEXPR auto make_write_int_arg(T value, sign_t sign)
+    -> write_int_arg<uint32_or_64_or_128_t<T>> {
+  auto prefix = 0u;
+  auto abs_value = static_cast<uint32_or_64_or_128_t<T>>(value);
+  if (is_negative(value)) {
+    prefix = 0x01000000 | '-';
+    abs_value = 0 - abs_value;
+  } else {
+    constexpr const unsigned prefixes[4] = {0, 0, 0x1000000u | '+',
+                                            0x1000000u | ' '};
+    prefix = prefixes[sign];
+  }
+  return {abs_value, prefix};
+}
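+
+// Example: value -42 yields {42u, 0x0100002d} (a one-character '-' prefix);
+// value 42 with sign::plus yields {42u, 0x0100002b} and with sign::none an
+// empty prefix.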
+
+template <typename Char = char> struct loc_writer {
+  basic_appender<Char> out;
+  const format_specs& specs;
+  std::basic_string<Char> sep;
+  std::string grouping;
+  std::basic_string<Char> decimal_point;
+
+  template ::value)>
+  auto operator()(T value) -> bool {
+    auto arg = make_write_int_arg(value, specs.sign);
+    write_int(out, static_cast>(arg.abs_value), arg.prefix,
+              specs, digit_grouping(grouping, sep));
+    return true;
+  }
+
+  template ::value)>
+  auto operator()(T) -> bool {
+    return false;
+  }
+};
+
+template 
+FMT_CONSTEXPR FMT_INLINE auto write_int(OutputIt out, write_int_arg arg,
+                                        const format_specs& specs, locale_ref)
+    -> OutputIt {
+  static_assert(std::is_same>::value, "");
+  auto abs_value = arg.abs_value;
+  auto prefix = arg.prefix;
+  switch (specs.type) {
+  default:
+    FMT_ASSERT(false, "");
+    FMT_FALLTHROUGH;
+  case presentation_type::none:
+  case presentation_type::dec: {
+    int num_digits = count_digits(abs_value);
+    return write_int(
+        out, num_digits, prefix, specs, [=](reserve_iterator it) {
+          return format_decimal(it, abs_value, num_digits).end;
+        });
+  }
+  case presentation_type::hex: {
+    if (specs.alt)
+      prefix_append(prefix, unsigned(specs.upper ? 'X' : 'x') << 8 | '0');
+    int num_digits = count_digits<4>(abs_value);
+    return write_int(
+        out, num_digits, prefix, specs, [=](reserve_iterator it) {
+          return format_uint<4, Char>(it, abs_value, num_digits, specs.upper);
+        });
+  }
+  case presentation_type::oct: {
+    int num_digits = count_digits<3>(abs_value);
+    // Octal prefix '0' is counted as a digit, so only add it if precision
+    // is not greater than the number of digits.
+    if (specs.alt && specs.precision <= num_digits && abs_value != 0)
+      prefix_append(prefix, '0');
+    return write_int(
+        out, num_digits, prefix, specs, [=](reserve_iterator it) {
+          return format_uint<3, Char>(it, abs_value, num_digits);
+        });
+  }
+  case presentation_type::bin: {
+    if (specs.alt)
+      prefix_append(prefix, unsigned(specs.upper ? 'B' : 'b') << 8 | '0');
+    int num_digits = count_digits<1>(abs_value);
+    return write_int(
+        out, num_digits, prefix, specs, [=](reserve_iterator it) {
+          return format_uint<1, Char>(it, abs_value, num_digits);
+        });
+  }
+  case presentation_type::chr:
+    return write_char(out, static_cast(abs_value), specs);
+  }
+}
+template 
+FMT_CONSTEXPR FMT_NOINLINE auto write_int_noinline(OutputIt out,
+                                                   write_int_arg arg,
+                                                   const format_specs& specs,
+                                                   locale_ref loc) -> OutputIt {
+  return write_int(out, arg, specs, loc);
+}
+template ::value &&
+                        !std::is_same::value &&
+                        !std::is_same::value)>
+FMT_CONSTEXPR FMT_INLINE auto write(basic_appender out, T value,
+                                    const format_specs& specs, locale_ref loc)
+    -> basic_appender {
+  if (specs.localized && write_loc(out, value, specs, loc)) return out;
+  return write_int_noinline(out, make_write_int_arg(value, specs.sign),
+                                  specs, loc);
+}
+// An inlined version of write used in format string compilation.
+template ::value &&
+                        !std::is_same::value &&
+                        !std::is_same::value &&
+                        !std::is_same>::value)>
+FMT_CONSTEXPR FMT_INLINE auto write(OutputIt out, T value,
+                                    const format_specs& specs, locale_ref loc)
+    -> OutputIt {
+  if (specs.localized && write_loc(out, value, specs, loc)) return out;
+  return write_int(out, make_write_int_arg(value, specs.sign), specs,
+                         loc);
+}
+
+// An output iterator that counts the number of objects written to it and
+// discards them.
+class counting_iterator {
+ private:
+  size_t count_;
+
+ public:
+  using iterator_category = std::output_iterator_tag;
+  using difference_type = std::ptrdiff_t;
+  using pointer = void;
+  using reference = void;
+  FMT_UNCHECKED_ITERATOR(counting_iterator);
+
+  struct value_type {
+    template  FMT_CONSTEXPR void operator=(const T&) {}
+  };
+
+  FMT_CONSTEXPR counting_iterator() : count_(0) {}
+
+  FMT_CONSTEXPR auto count() const -> size_t { return count_; }
+
+  FMT_CONSTEXPR auto operator++() -> counting_iterator& {
+    ++count_;
+    return *this;
+  }
+  FMT_CONSTEXPR auto operator++(int) -> counting_iterator {
+    auto it = *this;
+    ++*this;
+    return it;
+  }
+
+  FMT_CONSTEXPR friend auto operator+(counting_iterator it, difference_type n)
+      -> counting_iterator {
+    it.count_ += static_cast(n);
+    return it;
+  }
+
+  FMT_CONSTEXPR auto operator*() const -> value_type { return {}; }
+};
+
+template 
+FMT_CONSTEXPR auto write(OutputIt out, basic_string_view s,
+                         const format_specs& specs) -> OutputIt {
+  auto data = s.data();
+  auto size = s.size();
+  if (specs.precision >= 0 && to_unsigned(specs.precision) < size)
+    size = code_point_index(s, to_unsigned(specs.precision));
+  bool is_debug = specs.type == presentation_type::debug;
+  size_t width = 0;
+
+  if (is_debug) size = write_escaped_string(counting_iterator{}, s).count();
+
+  if (specs.width != 0) {
+    if (is_debug)
+      width = size;
+    else
+      width = compute_width(basic_string_view(data, size));
+  }
+  return write_padded(out, specs, size, width,
+                            [=](reserve_iterator it) {
+                              if (is_debug) return write_escaped_string(it, s);
+                              return copy(data, data + size, it);
+                            });
+}
+template 
+FMT_CONSTEXPR auto write(OutputIt out,
+                         basic_string_view> s,
+                         const format_specs& specs, locale_ref) -> OutputIt {
+  return write(out, s, specs);
+}
+template 
+FMT_CONSTEXPR auto write(OutputIt out, const Char* s, const format_specs& specs,
+                         locale_ref) -> OutputIt {
+  if (specs.type == presentation_type::pointer)
+    return write_ptr(out, bit_cast(s), &specs);
+  if (!s) report_error("string pointer is null");
+  return write(out, basic_string_view(s), specs, {});
+}
+
+template ::value &&
+                        !std::is_same::value &&
+                        !std::is_same::value)>
+FMT_CONSTEXPR auto write(OutputIt out, T value) -> OutputIt {
+  auto abs_value = static_cast>(value);
+  bool negative = is_negative(value);
+  // Don't do -abs_value since it trips unsigned-integer-overflow sanitizer.
+  if (negative) abs_value = ~abs_value + 1;
+  int num_digits = count_digits(abs_value);
+  auto size = (negative ? 1 : 0) + static_cast(num_digits);
+  if (auto ptr = to_pointer(out, size)) {
+    if (negative) *ptr++ = static_cast('-');
+    format_decimal(ptr, abs_value, num_digits);
+    return out;
+  }
+  if (negative) *out++ = static_cast('-');
+  return format_decimal(out, abs_value, num_digits).end;
+}
+
+// DEPRECATED!
+template 
+FMT_CONSTEXPR auto parse_align(const Char* begin, const Char* end,
+                               format_specs& specs) -> const Char* {
+  FMT_ASSERT(begin != end, "");
+  auto align = align::none;
+  auto p = begin + code_point_length(begin);
+  if (end - p <= 0) p = begin;
+  for (;;) {
+    switch (to_ascii(*p)) {
+    case '<':
+      align = align::left;
+      break;
+    case '>':
+      align = align::right;
+      break;
+    case '^':
+      align = align::center;
+      break;
+    }
+    if (align != align::none) {
+      if (p != begin) {
+        auto c = *begin;
+        if (c == '}') return begin;
+        if (c == '{') {
+          report_error("invalid fill character '{'");
+          return begin;
+        }
+        specs.fill = basic_string_view(begin, to_unsigned(p - begin));
+        begin = p + 1;
+      } else {
+        ++begin;
+      }
+      break;
+    } else if (p == begin) {
+      break;
+    }
+    p = begin;
+  }
+  specs.align = align;
+  return begin;
+}
+
+// A floating-point presentation format.
+enum class float_format : unsigned char {
+  general,  // General: exponent notation or fixed point based on magnitude.
+  exp,      // Exponent notation with the default precision of 6, e.g. 1.2e-3.
+  fixed     // Fixed point with the default precision of 6, e.g. 0.0012.
+};
+
+struct float_specs {
+  int precision;
+  float_format format : 8;
+  sign_t sign : 8;
+  bool locale : 1;
+  bool binary32 : 1;
+  bool showpoint : 1;
+};
+
+// DEPRECATED!
+FMT_CONSTEXPR inline auto parse_float_type_spec(const format_specs& specs)
+    -> float_specs {
+  auto result = float_specs();
+  result.showpoint = specs.alt;
+  result.locale = specs.localized;
+  switch (specs.type) {
+  default:
+    FMT_FALLTHROUGH;
+  case presentation_type::none:
+    result.format = float_format::general;
+    break;
+  case presentation_type::exp:
+    result.format = float_format::exp;
+    result.showpoint |= specs.precision != 0;
+    break;
+  case presentation_type::fixed:
+    result.format = float_format::fixed;
+    result.showpoint |= specs.precision != 0;
+    break;
+  case presentation_type::general:
+    result.format = float_format::general;
+    break;
+  }
+  return result;
+}
+
+template 
+FMT_CONSTEXPR20 auto write_nonfinite(OutputIt out, bool isnan,
+                                     format_specs specs, sign_t sign)
+    -> OutputIt {
+  auto str =
+      isnan ? (specs.upper ? "NAN" : "nan") : (specs.upper ? "INF" : "inf");
+  constexpr size_t str_size = 3;
+  auto size = str_size + (sign ? 1 : 0);
+  // Replace '0'-padding with space for non-finite values.
+  const bool is_zero_fill =
+      specs.fill.size() == 1 && specs.fill.template get() == '0';
+  if (is_zero_fill) specs.fill = ' ';
+  return write_padded(out, specs, size,
+                            [=](reserve_iterator it) {
+                              if (sign) *it++ = detail::sign(sign);
+                              return copy(str, str + str_size, it);
+                            });
+}
+
+// A decimal floating-point number significand * pow(10, exp).
+struct big_decimal_fp {
+  const char* significand;
+  int significand_size;
+  int exponent;
+};
+
+constexpr auto get_significand_size(const big_decimal_fp& f) -> int {
+  return f.significand_size;
+}
+template 
+inline auto get_significand_size(const dragonbox::decimal_fp& f) -> int {
+  return count_digits(f.significand);
+}
+
+template 
+constexpr auto write_significand(OutputIt out, const char* significand,
+                                 int significand_size) -> OutputIt {
+  return copy(significand, significand + significand_size, out);
+}
+template 
+inline auto write_significand(OutputIt out, UInt significand,
+                              int significand_size) -> OutputIt {
+  return format_decimal(out, significand, significand_size).end;
+}
+template 
+FMT_CONSTEXPR20 auto write_significand(OutputIt out, T significand,
+                                       int significand_size, int exponent,
+                                       const Grouping& grouping) -> OutputIt {
+  if (!grouping.has_separator()) {
+    out = write_significand(out, significand, significand_size);
+    return detail::fill_n(out, exponent, static_cast('0'));
+  }
+  auto buffer = memory_buffer();
+  write_significand(appender(buffer), significand, significand_size);
+  detail::fill_n(appender(buffer), exponent, '0');
+  return grouping.apply(out, string_view(buffer.data(), buffer.size()));
+}
+
+template ::value)>
+inline auto write_significand(Char* out, UInt significand, int significand_size,
+                              int integral_size, Char decimal_point) -> Char* {
+  if (!decimal_point)
+    return format_decimal(out, significand, significand_size).end;
+  out += significand_size + 1;
+  Char* end = out;
+  int floating_size = significand_size - integral_size;
+  for (int i = floating_size / 2; i > 0; --i) {
+    out -= 2;
+    copy2(out, digits2(static_cast(significand % 100)));
+    significand /= 100;
+  }
+  if (floating_size % 2 != 0) {
+    *--out = static_cast('0' + significand % 10);
+    significand /= 10;
+  }
+  *--out = decimal_point;
+  format_decimal(out - integral_size, significand, integral_size);
+  return end;
+}
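+
+// Example: the overload above writes the fractional digits backwards in pairs
+// via digits2(), so significand 12345 with integral_size 2 and decimal point
+// '.' produces "12.345".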
+
+template >::value)>
+inline auto write_significand(OutputIt out, UInt significand,
+                              int significand_size, int integral_size,
+                              Char decimal_point) -> OutputIt {
+  // Buffer is large enough to hold digits (digits10 + 1) and a decimal point.
+  Char buffer[digits10() + 2];
+  auto end = write_significand(buffer, significand, significand_size,
+                               integral_size, decimal_point);
+  return detail::copy_noinline(buffer, end, out);
+}
+
+template 
+FMT_CONSTEXPR auto write_significand(OutputIt out, const char* significand,
+                                     int significand_size, int integral_size,
+                                     Char decimal_point) -> OutputIt {
+  out = detail::copy_noinline(significand, significand + integral_size,
+                                    out);
+  if (!decimal_point) return out;
+  *out++ = decimal_point;
+  return detail::copy_noinline(significand + integral_size,
+                                     significand + significand_size, out);
+}
+
+template 
+FMT_CONSTEXPR20 auto write_significand(OutputIt out, T significand,
+                                       int significand_size, int integral_size,
+                                       Char decimal_point,
+                                       const Grouping& grouping) -> OutputIt {
+  if (!grouping.has_separator()) {
+    return write_significand(out, significand, significand_size, integral_size,
+                             decimal_point);
+  }
+  auto buffer = basic_memory_buffer();
+  write_significand(basic_appender(buffer), significand, significand_size,
+                    integral_size, decimal_point);
+  grouping.apply(
+      out, basic_string_view(buffer.data(), to_unsigned(integral_size)));
+  return detail::copy_noinline(buffer.data() + integral_size,
+                                     buffer.end(), out);
+}
+
+template >
+FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& f,
+                                    const format_specs& specs,
+                                    float_specs fspecs, locale_ref loc)
+    -> OutputIt {
+  auto significand = f.significand;
+  int significand_size = get_significand_size(f);
+  const Char zero = static_cast('0');
+  auto sign = fspecs.sign;
+  size_t size = to_unsigned(significand_size) + (sign ? 1 : 0);
+  using iterator = reserve_iterator;
+
+  Char decimal_point =
+      fspecs.locale ? detail::decimal_point(loc) : static_cast('.');
+
+  int output_exp = f.exponent + significand_size - 1;
+  auto use_exp_format = [=]() {
+    if (fspecs.format == float_format::exp) return true;
+    if (fspecs.format != float_format::general) return false;
+    // Use the fixed notation if the exponent is in [exp_lower, exp_upper),
+    // e.g. 0.0001 instead of 1e-04. Otherwise use the exponent notation.
+    const int exp_lower = -4, exp_upper = 16;
+    return output_exp < exp_lower ||
+           output_exp >= (fspecs.precision > 0 ? fspecs.precision : exp_upper);
+  };
+  if (use_exp_format()) {
+    int num_zeros = 0;
+    if (fspecs.showpoint) {
+      num_zeros = fspecs.precision - significand_size;
+      if (num_zeros < 0) num_zeros = 0;
+      size += to_unsigned(num_zeros);
+    } else if (significand_size == 1) {
+      decimal_point = Char();
+    }
+    auto abs_output_exp = output_exp >= 0 ? output_exp : -output_exp;
+    int exp_digits = 2;
+    if (abs_output_exp >= 100) exp_digits = abs_output_exp >= 1000 ? 4 : 3;
+
+    size += to_unsigned((decimal_point ? 1 : 0) + 2 + exp_digits);
+    char exp_char = specs.upper ? 'E' : 'e';
+    auto write = [=](iterator it) {
+      if (sign) *it++ = detail::sign(sign);
+      // Insert a decimal point after the first digit and add an exponent.
+      it = write_significand(it, significand, significand_size, 1,
+                             decimal_point);
+      if (num_zeros > 0) it = detail::fill_n(it, num_zeros, zero);
+      *it++ = static_cast(exp_char);
+      return write_exponent(output_exp, it);
+    };
+    return specs.width > 0
+               ? write_padded(out, specs, size, write)
+               : base_iterator(out, write(reserve(out, size)));
+  }
+
+  int exp = f.exponent + significand_size;
+  if (f.exponent >= 0) {
+    // 1234e5 -> 123400000[.0+]
+    size += to_unsigned(f.exponent);
+    int num_zeros = fspecs.precision - exp;
+    abort_fuzzing_if(num_zeros > 5000);
+    if (fspecs.showpoint) {
+      ++size;
+      if (num_zeros <= 0 && fspecs.format != float_format::fixed) num_zeros = 0;
+      if (num_zeros > 0) size += to_unsigned(num_zeros);
+    }
+    auto grouping = Grouping(loc, fspecs.locale);
+    size += to_unsigned(grouping.count_separators(exp));
+    return write_padded(out, specs, size, [&](iterator it) {
+      if (sign) *it++ = detail::sign(sign);
+      it = write_significand(it, significand, significand_size,
+                                   f.exponent, grouping);
+      if (!fspecs.showpoint) return it;
+      *it++ = decimal_point;
+      return num_zeros > 0 ? detail::fill_n(it, num_zeros, zero) : it;
+    });
+  } else if (exp > 0) {
+    // 1234e-2 -> 12.34[0+]
+    int num_zeros = fspecs.showpoint ? fspecs.precision - significand_size : 0;
+    size += 1 + to_unsigned(num_zeros > 0 ? num_zeros : 0);
+    auto grouping = Grouping(loc, fspecs.locale);
+    size += to_unsigned(grouping.count_separators(exp));
+    return write_padded(out, specs, size, [&](iterator it) {
+      if (sign) *it++ = detail::sign(sign);
+      it = write_significand(it, significand, significand_size, exp,
+                             decimal_point, grouping);
+      return num_zeros > 0 ? detail::fill_n(it, num_zeros, zero) : it;
+    });
+  }
+  // 1234e-6 -> 0.001234
+  int num_zeros = -exp;
+  if (significand_size == 0 && fspecs.precision >= 0 &&
+      fspecs.precision < num_zeros) {
+    num_zeros = fspecs.precision;
+  }
+  bool pointy = num_zeros != 0 || significand_size != 0 || fspecs.showpoint;
+  size += 1 + (pointy ? 1 : 0) + to_unsigned(num_zeros);
+  return write_padded(out, specs, size, [&](iterator it) {
+    if (sign) *it++ = detail::sign(sign);
+    *it++ = zero;
+    if (!pointy) return it;
+    *it++ = decimal_point;
+    it = detail::fill_n(it, num_zeros, zero);
+    return write_significand(it, significand, significand_size);
+  });
+}
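+
+// Note: with the default general format the code above switches to exponent
+// notation when the decimal exponent is below -4 or at least 16 (or the
+// requested precision), so 0.0001 prints as "0.0001" while 0.00001 prints as
+// "1e-05".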
+
+template  class fallback_digit_grouping {
+ public:
+  constexpr fallback_digit_grouping(locale_ref, bool) {}
+
+  constexpr auto has_separator() const -> bool { return false; }
+
+  constexpr auto count_separators(int) const -> int { return 0; }
+
+  template 
+  constexpr auto apply(Out out, basic_string_view) const -> Out {
+    return out;
+  }
+};
+
+template 
+FMT_CONSTEXPR20 auto write_float(OutputIt out, const DecimalFP& f,
+                                 const format_specs& specs, float_specs fspecs,
+                                 locale_ref loc) -> OutputIt {
+  if (is_constant_evaluated()) {
+    return do_write_float>(out, f, specs, fspecs,
+                                                         loc);
+  } else {
+    return do_write_float(out, f, specs, fspecs, loc);
+  }
+}
+
+template <typename T> constexpr auto isnan(T value) -> bool {
+  return value != value;  // std::isnan doesn't support __float128.
+}
+
+template 
+struct has_isfinite : std::false_type {};
+
+template 
+struct has_isfinite>
+    : std::true_type {};
+
+template ::value&&
+                                        has_isfinite::value)>
+FMT_CONSTEXPR20 auto isfinite(T value) -> bool {
+  constexpr T inf = T(std::numeric_limits::infinity());
+  if (is_constant_evaluated())
+    return !detail::isnan(value) && value < inf && value > -inf;
+  return std::isfinite(value);
+}
+template ::value)>
+FMT_CONSTEXPR auto isfinite(T value) -> bool {
+  T inf = T(std::numeric_limits::infinity());
+  // std::isfinite doesn't support __float128.
+  return !detail::isnan(value) && value < inf && value > -inf;
+}
+
+template ::value)>
+FMT_INLINE FMT_CONSTEXPR bool signbit(T value) {
+  if (is_constant_evaluated()) {
+#ifdef __cpp_if_constexpr
+    if constexpr (std::numeric_limits::is_iec559) {
+      auto bits = detail::bit_cast(static_cast(value));
+      return (bits >> (num_bits() - 1)) != 0;
+    }
+#endif
+  }
+  return std::signbit(static_cast(value));
+}
+
+inline FMT_CONSTEXPR20 void adjust_precision(int& precision, int exp10) {
+  // Adjust fixed precision by exponent because it is relative to decimal
+  // point.
+  if (exp10 > 0 && precision > max_value() - exp10)
+    FMT_THROW(format_error("number is too big"));
+  precision += exp10;
+}
+
+class bigint {
+ private:
+  // A bigint is stored as an array of bigits (big digits), with bigit at index
+  // 0 being the least significant one.
+  using bigit = uint32_t;
+  using double_bigit = uint64_t;
+  enum { bigits_capacity = 32 };
+  basic_memory_buffer<bigit, bigits_capacity> bigits_;
+  int exp_;
+
+  FMT_CONSTEXPR20 auto operator[](int index) const -> bigit {
+    return bigits_[to_unsigned(index)];
+  }
+  FMT_CONSTEXPR20 auto operator[](int index) -> bigit& {
+    return bigits_[to_unsigned(index)];
+  }
+
+  static constexpr const int bigit_bits = num_bits<bigit>();
+
+  friend struct formatter;
+
+  FMT_CONSTEXPR20 void subtract_bigits(int index, bigit other, bigit& borrow) {
+    auto result = static_cast((*this)[index]) - other - borrow;
+    (*this)[index] = static_cast(result);
+    borrow = static_cast(result >> (bigit_bits * 2 - 1));
+  }
+
+  FMT_CONSTEXPR20 void remove_leading_zeros() {
+    int num_bigits = static_cast(bigits_.size()) - 1;
+    while (num_bigits > 0 && (*this)[num_bigits] == 0) --num_bigits;
+    bigits_.resize(to_unsigned(num_bigits + 1));
+  }
+
+  // Computes *this -= other assuming aligned bigints and *this >= other.
+  FMT_CONSTEXPR20 void subtract_aligned(const bigint& other) {
+    FMT_ASSERT(other.exp_ >= exp_, "unaligned bigints");
+    FMT_ASSERT(compare(*this, other) >= 0, "");
+    bigit borrow = 0;
+    int i = other.exp_ - exp_;
+    for (size_t j = 0, n = other.bigits_.size(); j != n; ++i, ++j)
+      subtract_bigits(i, other.bigits_[j], borrow);
+    while (borrow > 0) subtract_bigits(i, 0, borrow);
+    remove_leading_zeros();
+  }
+
+  FMT_CONSTEXPR20 void multiply(uint32_t value) {
+    const double_bigit wide_value = value;
+    bigit carry = 0;
+    for (size_t i = 0, n = bigits_.size(); i < n; ++i) {
+      double_bigit result = bigits_[i] * wide_value + carry;
+      bigits_[i] = static_cast(result);
+      carry = static_cast(result >> bigit_bits);
+    }
+    if (carry != 0) bigits_.push_back(carry);
+  }
+
+  template ::value ||
+                                         std::is_same::value)>
+  FMT_CONSTEXPR20 void multiply(UInt value) {
+    using half_uint =
+        conditional_t::value, uint64_t, uint32_t>;
+    const int shift = num_bits() - bigit_bits;
+    const UInt lower = static_cast(value);
+    const UInt upper = value >> num_bits();
+    UInt carry = 0;
+    for (size_t i = 0, n = bigits_.size(); i < n; ++i) {
+      UInt result = lower * bigits_[i] + static_cast(carry);
+      carry = (upper * bigits_[i] << shift) + (result >> bigit_bits) +
+              (carry >> bigit_bits);
+      bigits_[i] = static_cast(result);
+    }
+    while (carry != 0) {
+      bigits_.push_back(static_cast(carry));
+      carry >>= bigit_bits;
+    }
+  }
+
+  template ::value ||
+                                         std::is_same::value)>
+  FMT_CONSTEXPR20 void assign(UInt n) {
+    size_t num_bigits = 0;
+    do {
+      bigits_[num_bigits++] = static_cast(n);
+      n >>= bigit_bits;
+    } while (n != 0);
+    bigits_.resize(num_bigits);
+    exp_ = 0;
+  }
+
+ public:
+  FMT_CONSTEXPR20 bigint() : exp_(0) {}
+  explicit bigint(uint64_t n) { assign(n); }
+
+  bigint(const bigint&) = delete;
+  void operator=(const bigint&) = delete;
+
+  FMT_CONSTEXPR20 void assign(const bigint& other) {
+    auto size = other.bigits_.size();
+    bigits_.resize(size);
+    auto data = other.bigits_.data();
+    copy(data, data + size, bigits_.data());
+    exp_ = other.exp_;
+  }
+
+  template  FMT_CONSTEXPR20 void operator=(Int n) {
+    FMT_ASSERT(n > 0, "");
+    assign(uint64_or_128_t(n));
+  }
+
+  FMT_CONSTEXPR20 auto num_bigits() const -> int {
+    return static_cast(bigits_.size()) + exp_;
+  }
+
+  FMT_NOINLINE FMT_CONSTEXPR20 auto operator<<=(int shift) -> bigint& {
+    FMT_ASSERT(shift >= 0, "");
+    exp_ += shift / bigit_bits;
+    shift %= bigit_bits;
+    if (shift == 0) return *this;
+    bigit carry = 0;
+    for (size_t i = 0, n = bigits_.size(); i < n; ++i) {
+      bigit c = bigits_[i] >> (bigit_bits - shift);
+      bigits_[i] = (bigits_[i] << shift) + carry;
+      carry = c;
+    }
+    if (carry != 0) bigits_.push_back(carry);
+    return *this;
+  }
+
+  template 
+  FMT_CONSTEXPR20 auto operator*=(Int value) -> bigint& {
+    FMT_ASSERT(value > 0, "");
+    multiply(uint32_or_64_or_128_t(value));
+    return *this;
+  }
+
+  friend FMT_CONSTEXPR20 auto compare(const bigint& lhs, const bigint& rhs)
+      -> int {
+    int num_lhs_bigits = lhs.num_bigits(), num_rhs_bigits = rhs.num_bigits();
+    if (num_lhs_bigits != num_rhs_bigits)
+      return num_lhs_bigits > num_rhs_bigits ? 1 : -1;
+    int i = static_cast(lhs.bigits_.size()) - 1;
+    int j = static_cast(rhs.bigits_.size()) - 1;
+    int end = i - j;
+    if (end < 0) end = 0;
+    for (; i >= end; --i, --j) {
+      bigit lhs_bigit = lhs[i], rhs_bigit = rhs[j];
+      if (lhs_bigit != rhs_bigit) return lhs_bigit > rhs_bigit ? 1 : -1;
+    }
+    if (i != j) return i > j ? 1 : -1;
+    return 0;
+  }
+
+  // Returns compare(lhs1 + lhs2, rhs).
+  friend FMT_CONSTEXPR20 auto add_compare(const bigint& lhs1,
+                                          const bigint& lhs2, const bigint& rhs)
+      -> int {
+    auto minimum = [](int a, int b) { return a < b ? a : b; };
+    auto maximum = [](int a, int b) { return a > b ? a : b; };
+    int max_lhs_bigits = maximum(lhs1.num_bigits(), lhs2.num_bigits());
+    int num_rhs_bigits = rhs.num_bigits();
+    if (max_lhs_bigits + 1 < num_rhs_bigits) return -1;
+    if (max_lhs_bigits > num_rhs_bigits) return 1;
+    auto get_bigit = [](const bigint& n, int i) -> bigit {
+      return i >= n.exp_ && i < n.num_bigits() ? n[i - n.exp_] : 0;
+    };
+    double_bigit borrow = 0;
+    int min_exp = minimum(minimum(lhs1.exp_, lhs2.exp_), rhs.exp_);
+    for (int i = num_rhs_bigits - 1; i >= min_exp; --i) {
+      double_bigit sum =
+          static_cast(get_bigit(lhs1, i)) + get_bigit(lhs2, i);
+      bigit rhs_bigit = get_bigit(rhs, i);
+      if (sum > rhs_bigit + borrow) return 1;
+      borrow = rhs_bigit + borrow - sum;
+      if (borrow > 1) return -1;
+      borrow <<= bigit_bits;
+    }
+    return borrow != 0 ? -1 : 0;
+  }
+
+  // Assigns pow(10, exp) to this bigint.
+  FMT_CONSTEXPR20 void assign_pow10(int exp) {
+    FMT_ASSERT(exp >= 0, "");
+    if (exp == 0) return *this = 1;
+    // Find the top bit.
+    int bitmask = 1;
+    while (exp >= bitmask) bitmask <<= 1;
+    bitmask >>= 1;
+    // pow(10, exp) = pow(5, exp) * pow(2, exp). First compute pow(5, exp) by
+    // repeated squaring and multiplication.
+    *this = 5;
+    bitmask >>= 1;
+    while (bitmask != 0) {
+      square();
+      if ((exp & bitmask) != 0) *this *= 5;
+      bitmask >>= 1;
+    }
+    *this <<= exp;  // Multiply by pow(2, exp) by shifting.
+  }
+
+  FMT_CONSTEXPR20 void square() {
+    int num_bigits = static_cast(bigits_.size());
+    int num_result_bigits = 2 * num_bigits;
+    basic_memory_buffer n(std::move(bigits_));
+    bigits_.resize(to_unsigned(num_result_bigits));
+    auto sum = uint128_t();
+    for (int bigit_index = 0; bigit_index < num_bigits; ++bigit_index) {
+      // Compute bigit at position bigit_index of the result by adding
+      // cross-product terms n[i] * n[j] such that i + j == bigit_index.
+      for (int i = 0, j = bigit_index; j >= 0; ++i, --j) {
+        // Most terms are multiplied twice which can be optimized in the future.
+        sum += static_cast(n[i]) * n[j];
+      }
+      (*this)[bigit_index] = static_cast(sum);
+      sum >>= num_bits();  // Compute the carry.
+    }
+    // Do the same for the top half.
+    for (int bigit_index = num_bigits; bigit_index < num_result_bigits;
+         ++bigit_index) {
+      for (int j = num_bigits - 1, i = bigit_index - j; i < num_bigits;)
+        sum += static_cast(n[i++]) * n[j--];
+      (*this)[bigit_index] = static_cast(sum);
+      sum >>= num_bits();
+    }
+    remove_leading_zeros();
+    exp_ *= 2;
+  }
+
+  // If this bigint has a bigger exponent than other, adds trailing zeros to
+  // make exponents equal. This simplifies some operations such as subtraction.
+  FMT_CONSTEXPR20 void align(const bigint& other) {
+    int exp_difference = exp_ - other.exp_;
+    if (exp_difference <= 0) return;
+    int num_bigits = static_cast(bigits_.size());
+    bigits_.resize(to_unsigned(num_bigits + exp_difference));
+    for (int i = num_bigits - 1, j = i + exp_difference; i >= 0; --i, --j)
+      bigits_[j] = bigits_[i];
+    memset(bigits_.data(), 0, to_unsigned(exp_difference) * sizeof(bigit));
+    exp_ -= exp_difference;
+  }
+
+  // Divides this bignum by divisor, assigning the remainder to this and
+  // returning the quotient.
+  FMT_CONSTEXPR20 auto divmod_assign(const bigint& divisor) -> int {
+    FMT_ASSERT(this != &divisor, "");
+    if (compare(*this, divisor) < 0) return 0;
+    FMT_ASSERT(divisor.bigits_[divisor.bigits_.size() - 1u] != 0, "");
+    align(divisor);
+    int quotient = 0;
+    do {
+      subtract_aligned(divisor);
+      ++quotient;
+    } while (compare(*this, divisor) >= 0);
+    return quotient;
+  }
+};
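+
+// Note: a bigint represents sum(bigits_[i] * 2^(32 * (i + exp_))), so
+// operator<<= only grows exp_ for whole-bigit shifts, and assign_pow10()
+// builds 10^e as 5^e (by repeated squaring) followed by a left shift of e
+// bits.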
+
+// format_dragon flags.
+enum dragon {
+  predecessor_closer = 1,
+  fixup = 2,  // Run fixup to correct exp10 which can be off by one.
+  fixed = 4,
+};
+
+// Formats a floating-point number using a variation of the Fixed-Precision
+// Positive Floating-Point Printout ((FPP)^2) algorithm by Steele & White:
+// https://fmt.dev/papers/p372-steele.pdf.
+FMT_CONSTEXPR20 inline void format_dragon(basic_fp<uint128_t> value,
+                                          unsigned flags, int num_digits,
+                                          buffer<char>& buf, int& exp10) {
+  bigint numerator;    // 2 * R in (FPP)^2.
+  bigint denominator;  // 2 * S in (FPP)^2.
+  // lower and upper are differences between value and corresponding boundaries.
+  bigint lower;             // (M^- in (FPP)^2).
+  bigint upper_store;       // upper's value if different from lower.
+  bigint* upper = nullptr;  // (M^+ in (FPP)^2).
+  // Shift numerator and denominator by an extra bit or two (if lower boundary
+  // is closer) to make lower and upper integers. This eliminates multiplication
+  // by 2 during later computations.
+  bool is_predecessor_closer = (flags & dragon::predecessor_closer) != 0;
+  int shift = is_predecessor_closer ? 2 : 1;
+  if (value.e >= 0) {
+    numerator = value.f;
+    numerator <<= value.e + shift;
+    lower = 1;
+    lower <<= value.e;
+    if (is_predecessor_closer) {
+      upper_store = 1;
+      upper_store <<= value.e + 1;
+      upper = &upper_store;
+    }
+    denominator.assign_pow10(exp10);
+    denominator <<= shift;
+  } else if (exp10 < 0) {
+    numerator.assign_pow10(-exp10);
+    lower.assign(numerator);
+    if (is_predecessor_closer) {
+      upper_store.assign(numerator);
+      upper_store <<= 1;
+      upper = &upper_store;
+    }
+    numerator *= value.f;
+    numerator <<= shift;
+    denominator = 1;
+    denominator <<= shift - value.e;
+  } else {
+    numerator = value.f;
+    numerator <<= shift;
+    denominator.assign_pow10(exp10);
+    denominator <<= shift - value.e;
+    lower = 1;
+    if (is_predecessor_closer) {
+      upper_store = 1ULL << 1;
+      upper = &upper_store;
+    }
+  }
+  int even = static_cast<int>((value.f & 1) == 0);
+  if (!upper) upper = &lower;
+  bool shortest = num_digits < 0;
+  if ((flags & dragon::fixup) != 0) {
+    if (add_compare(numerator, *upper, denominator) + even <= 0) {
+      --exp10;
+      numerator *= 10;
+      if (num_digits < 0) {
+        lower *= 10;
+        if (upper != &lower) *upper *= 10;
+      }
+    }
+    if ((flags & dragon::fixed) != 0) adjust_precision(num_digits, exp10 + 1);
+  }
+  // Invariant: value == (numerator / denominator) * pow(10, exp10).
+  if (shortest) {
+    // Generate the shortest representation.
+    num_digits = 0;
+    char* data = buf.data();
+    for (;;) {
+      int digit = numerator.divmod_assign(denominator);
+      bool low = compare(numerator, lower) - even < 0;  // numerator <[=] lower.
+      // numerator + upper >[=] pow10:
+      bool high = add_compare(numerator, *upper, denominator) + even > 0;
+      data[num_digits++] = static_cast('0' + digit);
+      if (low || high) {
+        if (!low) {
+          ++data[num_digits - 1];
+        } else if (high) {
+          int result = add_compare(numerator, numerator, denominator);
+          // Round half to even.
+          if (result > 0 || (result == 0 && (digit % 2) != 0))
+            ++data[num_digits - 1];
+        }
+        buf.try_resize(to_unsigned(num_digits));
+        exp10 -= num_digits - 1;
+        return;
+      }
+      numerator *= 10;
+      lower *= 10;
+      if (upper != &lower) *upper *= 10;
+    }
+  }
+  // Generate the given number of digits.
+  exp10 -= num_digits - 1;
+  if (num_digits <= 0) {
+    auto digit = '0';
+    if (num_digits == 0) {
+      denominator *= 10;
+      digit = add_compare(numerator, numerator, denominator) > 0 ? '1' : '0';
+    }
+    buf.push_back(digit);
+    return;
+  }
+  buf.try_resize(to_unsigned(num_digits));
+  for (int i = 0; i < num_digits - 1; ++i) {
+    int digit = numerator.divmod_assign(denominator);
+    buf[i] = static_cast('0' + digit);
+    numerator *= 10;
+  }
+  int digit = numerator.divmod_assign(denominator);
+  auto result = add_compare(numerator, numerator, denominator);
+  if (result > 0 || (result == 0 && (digit % 2) != 0)) {
+    if (digit == 9) {
+      const auto overflow = '0' + 10;
+      buf[num_digits - 1] = overflow;
+      // Propagate the carry.
+      for (int i = num_digits - 1; i > 0 && buf[i] == overflow; --i) {
+        buf[i] = '0';
+        ++buf[i - 1];
+      }
+      if (buf[0] == overflow) {
+        buf[0] = '1';
+        if ((flags & dragon::fixed) != 0)
+          buf.push_back('0');
+        else
+          ++exp10;
+      }
+      return;
+    }
+    ++digit;
+  }
+  buf[num_digits - 1] = static_cast('0' + digit);
+}
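+
+// Note: digit generation in format_dragon maintains the invariant
+// value == (numerator / denominator) * pow(10, exp10): each iteration
+// extracts one decimal digit with divmod_assign() and multiplies the
+// remainder (and, in shortest mode, the boundaries) by 10 until the remainder
+// falls within the rounding boundaries.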
+
+// Formats a floating-point number using the hexfloat format.
+template ::value)>
+FMT_CONSTEXPR20 void format_hexfloat(Float value, format_specs specs,
+                                     buffer& buf) {
+  // float is passed as double to reduce the number of instantiations and to
+  // simplify implementation.
+  static_assert(!std::is_same::value, "");
+
+  using info = dragonbox::float_info;
+
+  // Assume Float is in the format [sign][exponent][significand].
+  using carrier_uint = typename info::carrier_uint;
+
+  constexpr auto num_float_significand_bits =
+      detail::num_significand_bits();
+
+  basic_fp f(value);
+  f.e += num_float_significand_bits;
+  if (!has_implicit_bit()) --f.e;
+
+  constexpr auto num_fraction_bits =
+      num_float_significand_bits + (has_implicit_bit() ? 1 : 0);
+  constexpr auto num_xdigits = (num_fraction_bits + 3) / 4;
+
+  constexpr auto leading_shift = ((num_xdigits - 1) * 4);
+  const auto leading_mask = carrier_uint(0xF) << leading_shift;
+  const auto leading_xdigit =
+      static_cast((f.f & leading_mask) >> leading_shift);
+  if (leading_xdigit > 1) f.e -= (32 - countl_zero(leading_xdigit) - 1);
+
+  int print_xdigits = num_xdigits - 1;
+  if (specs.precision >= 0 && print_xdigits > specs.precision) {
+    const int shift = ((print_xdigits - specs.precision - 1) * 4);
+    const auto mask = carrier_uint(0xF) << shift;
+    const auto v = static_cast((f.f & mask) >> shift);
+
+    if (v >= 8) {
+      const auto inc = carrier_uint(1) << (shift + 4);
+      f.f += inc;
+      f.f &= ~(inc - 1);
+    }
+
+    // Check long double overflow
+    if (!has_implicit_bit()) {
+      const auto implicit_bit = carrier_uint(1) << num_float_significand_bits;
+      if ((f.f & implicit_bit) == implicit_bit) {
+        f.f >>= 4;
+        f.e += 4;
+      }
+    }
+
+    print_xdigits = specs.precision;
+  }
+
+  char xdigits[num_bits<carrier_uint>() / 4];
+  detail::fill_n(xdigits, sizeof(xdigits), '0');
+  format_uint<4>(xdigits, f.f, num_xdigits, specs.upper);
+
+  // Remove zero tail
+  while (print_xdigits > 0 && xdigits[print_xdigits] == '0') --print_xdigits;
+
+  buf.push_back('0');
+  buf.push_back(specs.upper ? 'X' : 'x');
+  buf.push_back(xdigits[0]);
+  if (specs.alt || print_xdigits > 0 || print_xdigits < specs.precision)
+    buf.push_back('.');
+  buf.append(xdigits + 1, xdigits + 1 + print_xdigits);
+  for (; print_xdigits < specs.precision; ++print_xdigits) buf.push_back('0');
+
+  buf.push_back(specs.upper ? 'P' : 'p');
+
+  uint32_t abs_e;
+  if (f.e < 0) {
+    buf.push_back('-');
+    abs_e = static_cast<uint32_t>(-f.e);
+  } else {
+    buf.push_back('+');
+    abs_e = static_cast<uint32_t>(f.e);
+  }
+  format_decimal<char>(appender(buf), abs_e, detail::count_digits(abs_e));
+}
+
+template <typename Float, FMT_ENABLE_IF(is_double_double<Float>::value)>
+FMT_CONSTEXPR20 void format_hexfloat(Float value, format_specs specs,
+                                     buffer<char>& buf) {
+  format_hexfloat(static_cast<double>(value), specs, buf);
+}
+
+constexpr auto fractional_part_rounding_thresholds(int index) -> uint32_t {
+  // For checking rounding thresholds.
+  // The kth entry is chosen to be the smallest integer such that the
+  // upper 32-bits of 10^(k+1) times it is strictly bigger than 5 * 10^k.
+  // It is equal to ceil(2^31 + 2^32/10^(k + 1)).
+  // These are stored in a string literal because we cannot have static arrays
+  // in constexpr functions and non-static ones are poorly optimized.
+  return U"\x9999999a\x828f5c29\x80418938\x80068db9\x8000a7c6\x800010c7"
+         U"\x800001ae\x8000002b"[index];
+}
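+// Worked example of the formula above: for k = 0 it gives
+// ceil(2^31 + 2^32/10) = 0x80000000 + 0x1999999a = 0x9999999a, which is the
+// first entry of the string literal.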
+
+template <typename Float>
+FMT_CONSTEXPR20 auto format_float(Float value, int precision, float_specs specs,
+                                  buffer<char>& buf) -> int {
+  // float is passed as double to reduce the number of instantiations.
+  static_assert(!std::is_same<Float, float>::value, "");
+  FMT_ASSERT(value >= 0, "value is negative");
+  auto converted_value = convert_float(value);
+
+  const bool fixed = specs.format == float_format::fixed;
+  if (value <= 0) {  // <= instead of == to silence a warning.
+    if (precision <= 0 || !fixed) {
+      buf.push_back('0');
+      return 0;
+    }
+    buf.try_resize(to_unsigned(precision));
+    fill_n(buf.data(), precision, '0');
+    return -precision;
+  }
+
+  int exp = 0;
+  bool use_dragon = true;
+  unsigned dragon_flags = 0;
+  if (!is_fast_float<Float>() || is_constant_evaluated()) {
+    const auto inv_log2_10 = 0.3010299956639812;  // 1 / log2(10)
+    using info = dragonbox::float_info<decltype(converted_value)>;
+    const auto f = basic_fp<typename info::carrier_uint>(converted_value);
+    // Compute exp, an approximate power of 10, such that
+    //   10^(exp - 1) <= value < 10^exp or 10^exp <= value < 10^(exp + 1).
+    // This is based on log10(value) == log2(value) / log2(10) and approximation
+    // of log2(value) by e + num_fraction_bits idea from double-conversion.
+    auto e = (f.e + count_digits<1>(f.f) - 1) * inv_log2_10 - 1e-10;
+    exp = static_cast<int>(e);
+    if (e > exp) ++exp;  // Compute ceil.
+    dragon_flags = dragon::fixup;
+  } else if (precision < 0) {
+    // Use Dragonbox for the shortest format.
+    if (specs.binary32) {
+      auto dec = dragonbox::to_decimal(static_cast<float>(value));
+      write(appender(buf), dec.significand);
+      return dec.exponent;
+    }
+    auto dec = dragonbox::to_decimal(static_cast<double>(value));
+    write(appender(buf), dec.significand);
+    return dec.exponent;
+  } else {
+    // Extract significand bits and exponent bits.
+    using info = dragonbox::float_info<double>;
+    auto br = bit_cast<uint64_t>(static_cast<double>(value));
+
+    const uint64_t significand_mask =
+        (static_cast<uint64_t>(1) << num_significand_bits<double>()) - 1;
+    uint64_t significand = (br & significand_mask);
+    int exponent = static_cast<int>((br & exponent_mask<double>()) >>
+                                    num_significand_bits<double>());
+
+    if (exponent != 0) {  // Check if normal.
+      exponent -= exponent_bias<double>() + num_significand_bits<double>();
+      significand |=
+          (static_cast<uint64_t>(1) << num_significand_bits<double>());
+      significand <<= 1;
+    } else {
+      // Normalize subnormal inputs.
+      FMT_ASSERT(significand != 0, "zeros should not appear here");
+      int shift = countl_zero(significand);
+      FMT_ASSERT(shift >= num_bits<uint64_t>() - num_significand_bits<double>(),
+                 "");
+      shift -= (num_bits<uint64_t>() - num_significand_bits<double>() - 2);
+      exponent = (std::numeric_limits<double>::min_exponent -
+                  num_significand_bits<double>()) -
+                 shift;
+      significand <<= shift;
+    }
+
+    // Compute the first several nonzero decimal significand digits.
+    // We call the number we get the first segment.
+    const int k = info::kappa - dragonbox::floor_log10_pow2(exponent);
+    exp = -k;
+    const int beta = exponent + dragonbox::floor_log2_pow10(k);
+    uint64_t first_segment;
+    bool has_more_segments;
+    int digits_in_the_first_segment;
+    {
+      const auto r = dragonbox::umul192_upper128(
+          significand << beta, dragonbox::get_cached_power(k));
+      first_segment = r.high();
+      has_more_segments = r.low() != 0;
+
+      // The first segment can have 18 ~ 19 digits.
+      if (first_segment >= 1000000000000000000ULL) {
+        digits_in_the_first_segment = 19;
+      } else {
+        // When it is of 18-digits, we align it to 19-digits by adding a bogus
+        // zero at the end.
+        digits_in_the_first_segment = 18;
+        first_segment *= 10;
+      }
+    }
+
+    // Compute the actual number of decimal digits to print.
+    if (fixed) adjust_precision(precision, exp + digits_in_the_first_segment);
+
+    // Use Dragon4 only when there might be not enough digits in the first
+    // segment.
+    if (digits_in_the_first_segment > precision) {
+      use_dragon = false;
+
+      if (precision <= 0) {
+        exp += digits_in_the_first_segment;
+
+        if (precision < 0) {
+          // Nothing to do, since all we have are just leading zeros.
+          buf.try_resize(0);
+        } else {
+          // We may need to round-up.
+          buf.try_resize(1);
+          if ((first_segment | static_cast<uint64_t>(has_more_segments)) >
+              5000000000000000000ULL) {
+            buf[0] = '1';
+          } else {
+            buf[0] = '0';
+          }
+        }
+      }  // precision <= 0
+      else {
+        exp += digits_in_the_first_segment - precision;
+
+        // When precision > 0, we divide the first segment into three
+        // subsegments, each with 9, 9, and 0 ~ 1 digits so that each fits
+        // in 32-bits which usually allows faster calculation than in
+        // 64-bits. Since some compiler (e.g. MSVC) doesn't know how to optimize
+        // division-by-constant for large 64-bit divisors, we do it here
+        // manually. The magic number 7922816251426433760 below is equal to
+        // ceil(2^(64+32) / 10^10).
+        const uint32_t first_subsegment = static_cast<uint32_t>(
+            dragonbox::umul128_upper64(first_segment, 7922816251426433760ULL) >>
+            32);
+        const uint64_t second_third_subsegments =
+            first_segment - first_subsegment * 10000000000ULL;
+
+        uint64_t prod;
+        uint32_t digits;
+        bool should_round_up;
+        int number_of_digits_to_print = precision > 9 ? 9 : precision;
+
+        // Print a 9-digits subsegment, either the first or the second.
+        auto print_subsegment = [&](uint32_t subsegment, char* buffer) {
+          int number_of_digits_printed = 0;
+
+          // If we want to print an odd number of digits from the subsegment,
+          if ((number_of_digits_to_print & 1) != 0) {
+            // Convert to 64-bit fixed-point fractional form with 1-digit
+            // integer part. The magic number 720575941 is a good enough
+            // approximation of 2^(32 + 24) / 10^8; see
+            // https://jk-jeon.github.io/posts/2022/12/fixed-precision-formatting/#fixed-length-case
+            // for details.
+            prod = ((subsegment * static_cast<uint64_t>(720575941)) >> 24) + 1;
+            digits = static_cast<uint32_t>(prod >> 32);
+            *buffer = static_cast<char>('0' + digits);
+            number_of_digits_printed++;
+          }
+          // If we want to print an even number of digits from the
+          // first_subsegment,
+          else {
+            // Convert to 64-bit fixed-point fractional form with 2-digits
+            // integer part. The magic number 450359963 is a good enough
+            // approximation of 2^(32 + 20) / 10^7; see
+            // https://jk-jeon.github.io/posts/2022/12/fixed-precision-formatting/#fixed-length-case
+            // for details.
+            prod = ((subsegment * static_cast<uint64_t>(450359963)) >> 20) + 1;
+            digits = static_cast<uint32_t>(prod >> 32);
+            copy2(buffer, digits2(digits));
+            number_of_digits_printed += 2;
+          }
+
+          // Print all digit pairs.
+          while (number_of_digits_printed < number_of_digits_to_print) {
+            prod = static_cast<uint32_t>(prod) * static_cast<uint64_t>(100);
+            digits = static_cast<uint32_t>(prod >> 32);
+            copy2(buffer + number_of_digits_printed, digits2(digits));
+            number_of_digits_printed += 2;
+          }
+        };
+
+        // Print first subsegment.
+        print_subsegment(first_subsegment, buf.data());
+
+        // Perform rounding if the first subsegment is the last subsegment to
+        // print.
+        if (precision <= 9) {
+          // Rounding inside the subsegment.
+          // We round-up if:
+          //  - either the fractional part is strictly larger than 1/2, or
+          //  - the fractional part is exactly 1/2 and the last digit is odd.
+          // We rely on the following observations:
+          //  - If fractional_part >= threshold, then the fractional part is
+          //    strictly larger than 1/2.
+          //  - If the MSB of fractional_part is set, then the fractional part
+          //    must be at least 1/2.
+          //  - When the MSB of fractional_part is set, either
+          //    second_third_subsegments being nonzero or has_more_segments
+          //    being true means there are further digits not printed, so the
+          //    fractional part is strictly larger than 1/2.
+          if (precision < 9) {
+            uint32_t fractional_part = static_cast<uint32_t>(prod);
+            should_round_up =
+                fractional_part >= fractional_part_rounding_thresholds(
+                                       8 - number_of_digits_to_print) ||
+                ((fractional_part >> 31) &
+                 ((digits & 1) | (second_third_subsegments != 0) |
+                  has_more_segments)) != 0;
+          }
+          // Rounding at the subsegment boundary.
+          // In this case, the fractional part is at least 1/2 if and only if
+          // second_third_subsegments >= 5000000000ULL, and is strictly larger
+          // than 1/2 if we further have either second_third_subsegments >
+          // 5000000000ULL or has_more_segments == true.
+          else {
+            should_round_up = second_third_subsegments > 5000000000ULL ||
+                              (second_third_subsegments == 5000000000ULL &&
+                               ((digits & 1) != 0 || has_more_segments));
+          }
+        }
+        // Otherwise, print the second subsegment.
+        else {
+          // Compilers are not aware of how to leverage the maximum value of
+          // second_third_subsegments to find out a better magic number which
+          // allows us to eliminate an additional shift. 1844674407370955162 =
+          // ceil(2^64/10) < ceil(2^64*(10^9/(10^10 - 1))).
+          const uint32_t second_subsegment =
+              static_cast<uint32_t>(dragonbox::umul128_upper64(
+                  second_third_subsegments, 1844674407370955162ULL));
+          const uint32_t third_subsegment =
+              static_cast<uint32_t>(second_third_subsegments) -
+              second_subsegment * 10;
+
+          number_of_digits_to_print = precision - 9;
+          print_subsegment(second_subsegment, buf.data() + 9);
+
+          // Rounding inside the subsegment.
+          if (precision < 18) {
+            // The condition third_subsegment != 0 implies that the segment was
+            // of 19 digits, so in this case the third segment should be
+            // consisting of a genuine digit from the input.
+            uint32_t fractional_part = static_cast<uint32_t>(prod);
+            should_round_up =
+                fractional_part >= fractional_part_rounding_thresholds(
+                                       8 - number_of_digits_to_print) ||
+                ((fractional_part >> 31) &
+                 ((digits & 1) | (third_subsegment != 0) |
+                  has_more_segments)) != 0;
+          }
+          // Rounding at the subsegment boundary.
+          else {
+            // In this case, the segment must be of 19 digits, thus
+            // the third subsegment should be consisting of a genuine digit from
+            // the input.
+            should_round_up = third_subsegment > 5 ||
+                              (third_subsegment == 5 &&
+                               ((digits & 1) != 0 || has_more_segments));
+          }
+        }
+
+        // Round-up if necessary.
+        if (should_round_up) {
+          ++buf[precision - 1];
+          for (int i = precision - 1; i > 0 && buf[i] > '9'; --i) {
+            buf[i] = '0';
+            ++buf[i - 1];
+          }
+          if (buf[0] > '9') {
+            buf[0] = '1';
+            if (fixed)
+              buf[precision++] = '0';
+            else
+              ++exp;
+          }
+        }
+        buf.try_resize(to_unsigned(precision));
+      }
+    }  // if (digits_in_the_first_segment > precision)
+    else {
+      // Adjust the exponent for its use in Dragon4.
+      exp += digits_in_the_first_segment - 1;
+    }
+  }
+  if (use_dragon) {
+    auto f = basic_fp<uint128_t>();
+    bool is_predecessor_closer = specs.binary32
+                                     ? f.assign(static_cast<float>(value))
+                                     : f.assign(converted_value);
+    if (is_predecessor_closer) dragon_flags |= dragon::predecessor_closer;
+    if (fixed) dragon_flags |= dragon::fixed;
+    // Limit precision to the maximum possible number of significant digits in
+    // an IEEE754 double because we don't need to generate zeros.
+    const int max_double_digits = 767;
+    if (precision > max_double_digits) precision = max_double_digits;
+    format_dragon(f, dragon_flags, precision, buf, exp);
+  }
+  if (!fixed && !specs.showpoint) {
+    // Remove trailing zeros.
+    auto num_digits = buf.size();
+    while (num_digits > 0 && buf[num_digits - 1] == '0') {
+      --num_digits;
+      ++exp;
+    }
+    buf.try_resize(num_digits);
+  }
+  return exp;
+}
+
+template <typename Char, typename OutputIt, typename T>
+FMT_CONSTEXPR20 auto write_float(OutputIt out, T value, format_specs specs,
+                                 locale_ref loc) -> OutputIt {
+  sign_t sign = specs.sign;
+  if (detail::signbit(value)) {  // value < 0 is false for NaN so use signbit.
+    sign = sign::minus;
+    value = -value;
+  } else if (sign == sign::minus) {
+    sign = sign::none;
+  }
+
+  if (!detail::isfinite(value))
+    return write_nonfinite<Char>(out, detail::isnan(value), specs, sign);
+
+  if (specs.align == align::numeric && sign) {
+    auto it = reserve(out, 1);
+    *it++ = detail::sign<Char>(sign);
+    out = base_iterator(out, it);
+    sign = sign::none;
+    if (specs.width != 0) --specs.width;
+  }
+
+  memory_buffer buffer;
+  if (specs.type == presentation_type::hexfloat) {
+    if (sign) buffer.push_back(detail::sign<char>(sign));
+    format_hexfloat(convert_float(value), specs, buffer);
+    return write_bytes<Char, align::right>(out, {buffer.data(), buffer.size()},
+                                           specs);
+  }
+
+  int precision = specs.precision >= 0 || specs.type == presentation_type::none
+                      ? specs.precision
+                      : 6;
+  if (specs.type == presentation_type::exp) {
+    if (precision == max_value<int>())
+      report_error("number is too big");
+    else
+      ++precision;
+  } else if (specs.type != presentation_type::fixed && precision == 0) {
+    precision = 1;
+  }
+  float_specs fspecs = parse_float_type_spec(specs);
+  fspecs.sign = sign;
+  if (const_check(std::is_same<T, float>())) fspecs.binary32 = true;
+  int exp = format_float(convert_float(value), precision, fspecs, buffer);
+  fspecs.precision = precision;
+  auto f = big_decimal_fp{buffer.data(), static_cast<int>(buffer.size()), exp};
+  return write_float(out, f, specs, fspecs, loc);
+}
+
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(is_floating_point<T>::value)>
+FMT_CONSTEXPR20 auto write(OutputIt out, T value, format_specs specs,
+                           locale_ref loc = {}) -> OutputIt {
+  if (const_check(!is_supported_floating_point(value))) return out;
+  return specs.localized && write_loc(out, value, specs, loc)
+             ? out
+             : write_float(out, value, specs, loc);
+}
+
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(is_fast_float<T>::value)>
+FMT_CONSTEXPR20 auto write(OutputIt out, T value) -> OutputIt {
+  if (is_constant_evaluated()) return write(out, value, format_specs());
+  if (const_check(!is_supported_floating_point(value))) return out;
+
+  auto sign = sign_t::none;
+  if (detail::signbit(value)) {
+    sign = sign::minus;
+    value = -value;
+  }
+
+  constexpr auto specs = format_specs();
+  using floaty = conditional_t<std::is_same<T, long double>::value, double, T>;
+  using floaty_uint = typename dragonbox::float_info<floaty>::carrier_uint;
+  floaty_uint mask = exponent_mask<floaty>();
+  if ((bit_cast<floaty_uint>(value) & mask) == mask)
+    return write_nonfinite<Char>(out, std::isnan(value), specs, sign);
+
+  auto fspecs = float_specs();
+  fspecs.sign = sign;
+  auto dec = dragonbox::to_decimal(static_cast<floaty>(value));
+  return write_float(out, dec, specs, fspecs, {});
+}
+
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(is_floating_point<T>::value &&
+                        !is_fast_float<T>::value)>
+inline auto write(OutputIt out, T value) -> OutputIt {
+  return write(out, value, format_specs());
+}
+
+template <typename Char, typename OutputIt>
+auto write(OutputIt out, monostate, format_specs = {}, locale_ref = {})
+    -> OutputIt {
+  FMT_ASSERT(false, "");
+  return out;
+}
+
+template <typename Char, typename OutputIt>
+FMT_CONSTEXPR auto write(OutputIt out, basic_string_view<Char> value)
+    -> OutputIt {
+  return copy_noinline<Char>(value.begin(), value.end(), out);
+}
+
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(has_to_string_view<T>::value)>
+constexpr auto write(OutputIt out, const T& value) -> OutputIt {
+  return write<Char>(out, to_string_view(value));
+}
+
+// FMT_ENABLE_IF() condition separated to workaround an MSVC bug.
+template <
+    typename Char, typename OutputIt, typename T,
+    bool check =
+        std::is_enum<T>::value && !std::is_same<T, Char>::value &&
+        mapped_type_constant<T, basic_format_context<OutputIt, Char>>::value !=
+            type::custom_type,
+    FMT_ENABLE_IF(check)>
+FMT_CONSTEXPR auto write(OutputIt out, T value) -> OutputIt {
+  return write<Char>(out, static_cast<underlying_t<T>>(value));
+}
+
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(std::is_same<T, bool>::value)>
+FMT_CONSTEXPR auto write(OutputIt out, T value, const format_specs& specs = {},
+                         locale_ref = {}) -> OutputIt {
+  return specs.type != presentation_type::none &&
+                 specs.type != presentation_type::string
+             ? write(out, value ? 1 : 0, specs, {})
+             : write_bytes<Char>(out, value ? "true" : "false", specs);
+}
+
+template <typename Char, typename OutputIt>
+FMT_CONSTEXPR auto write(OutputIt out, Char value) -> OutputIt {
+  auto it = reserve(out, 1);
+  *it++ = value;
+  return base_iterator(out, it);
+}
+
+template <typename Char, typename OutputIt>
+FMT_CONSTEXPR20 auto write(OutputIt out, const Char* value) -> OutputIt {
+  if (value) return write(out, basic_string_view<Char>(value));
+  report_error("string pointer is null");
+  return out;
+}
+
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(std::is_same<T, void>::value)>
+auto write(OutputIt out, const T* value, const format_specs& specs = {},
+           locale_ref = {}) -> OutputIt {
+  return write_ptr<Char>(out, bit_cast<uintptr_t>(value), &specs);
+}
+
+// A write overload that handles implicit conversions.
+template <typename Char, typename OutputIt, typename T,
+          typename Context = basic_format_context<OutputIt, Char>>
+FMT_CONSTEXPR auto write(OutputIt out, const T& value) -> enable_if_t<
+    std::is_class<T>::value && !has_to_string_view<T>::value &&
+        !is_floating_point<T>::value && !std::is_same<T, Char>::value &&
+        !std::is_same<T, remove_cvref_t<decltype(arg_mapper<Context>().map(
+                             value))>>::value,
+    OutputIt> {
+  return write<Char>(out, arg_mapper<Context>().map(value));
+}
+
+template <typename Char, typename OutputIt, typename T,
+          typename Context = basic_format_context<OutputIt, Char>>
+FMT_CONSTEXPR auto write(OutputIt out, const T& value)
+    -> enable_if_t<mapped_type_constant<T, Context>::value ==
+                           type::custom_type &&
+                       !std::is_fundamental<T>::value,
+                   OutputIt> {
+  auto formatter = typename Context::template formatter_type<T>();
+  auto parse_ctx = typename Context::parse_context_type({});
+  formatter.parse(parse_ctx);
+  auto ctx = Context(out, {}, {});
+  return formatter.format(value, ctx);
+}
+
+// An argument visitor that formats the argument and writes it via the output
+// iterator. It's a class and not a generic lambda for compatibility with C++11.
+template <typename Char> struct default_arg_formatter {
+  using iterator = basic_appender<Char>;
+  using context = buffered_context<Char>;
+
+  iterator out;
+  basic_format_args<context> args;
+  locale_ref loc;
+
+  template <typename T> auto operator()(T value) -> iterator {
+    return write<Char>(out, value);
+  }
+  auto operator()(typename basic_format_arg<context>::handle h) -> iterator {
+    basic_format_parse_context<Char> parse_ctx({});
+    context format_ctx(out, args, loc);
+    h.format(parse_ctx, format_ctx);
+    return format_ctx.out();
+  }
+};
+
+template <typename Char> struct arg_formatter {
+  using iterator = basic_appender<Char>;
+  using context = buffered_context<Char>;
+
+  iterator out;
+  const format_specs& specs;
+  locale_ref locale;
+
+  template <typename T>
+  FMT_CONSTEXPR FMT_INLINE auto operator()(T value) -> iterator {
+    return detail::write<Char>(out, value, specs, locale);
+  }
+  auto operator()(typename basic_format_arg<context>::handle) -> iterator {
+    // User-defined types are handled separately because they require access
+    // to the parse context.
+    return out;
+  }
+};
+
+struct width_checker {
+  template <typename T, FMT_ENABLE_IF(is_integer<T>::value)>
+  FMT_CONSTEXPR auto operator()(T value) -> unsigned long long {
+    if (is_negative(value)) report_error("negative width");
+    return static_cast<unsigned long long>(value);
+  }
+
+  template <typename T, FMT_ENABLE_IF(!is_integer<T>::value)>
+  FMT_CONSTEXPR auto operator()(T) -> unsigned long long {
+    report_error("width is not integer");
+    return 0;
+  }
+};
+
+struct precision_checker {
+  template <typename T, FMT_ENABLE_IF(is_integer<T>::value)>
+  FMT_CONSTEXPR auto operator()(T value) -> unsigned long long {
+    if (is_negative(value)) report_error("negative precision");
+    return static_cast<unsigned long long>(value);
+  }
+
+  template <typename T, FMT_ENABLE_IF(!is_integer<T>::value)>
+  FMT_CONSTEXPR auto operator()(T) -> unsigned long long {
+    report_error("precision is not integer");
+    return 0;
+  }
+};
+
+template <typename Handler, typename FormatArg>
+FMT_CONSTEXPR auto get_dynamic_spec(FormatArg arg) -> int {
+  unsigned long long value = arg.visit(Handler());
+  if (value > to_unsigned(max_value<int>())) report_error("number is too big");
+  return static_cast<int>(value);
+}
+
+template <typename Context, typename ID>
+FMT_CONSTEXPR auto get_arg(Context& ctx, ID id) -> decltype(ctx.arg(id)) {
+  auto arg = ctx.arg(id);
+  if (!arg) report_error("argument not found");
+  return arg;
+}
+
+template <typename Handler, typename Context>
+FMT_CONSTEXPR void handle_dynamic_spec(int& value,
+                                       arg_ref<typename Context::char_type> ref,
+                                       Context& ctx) {
+  switch (ref.kind) {
+  case arg_id_kind::none:
+    break;
+  case arg_id_kind::index:
+    value = detail::get_dynamic_spec<Handler>(get_arg(ctx, ref.val.index));
+    break;
+  case arg_id_kind::name:
+    value = detail::get_dynamic_spec<Handler>(get_arg(ctx, ref.val.name));
+    break;
+  }
+}
+
+#if FMT_USE_USER_DEFINED_LITERALS
+#  if FMT_USE_NONTYPE_TEMPLATE_ARGS
+template <typename T, typename Char, size_t N,
+          fmt::detail_exported::fixed_string<Char, N> Str>
+struct statically_named_arg : view {
+  static constexpr auto name = Str.data;
+
+  const T& value;
+  statically_named_arg(const T& v) : value(v) {}
+};
+
+template <typename T, typename Char, size_t N,
+          fmt::detail_exported::fixed_string<Char, N> Str>
+struct is_named_arg<statically_named_arg<T, Char, N, Str>> : std::true_type {};
+
+template <typename T, typename Char, size_t N,
+          fmt::detail_exported::fixed_string<Char, N> Str>
+struct is_statically_named_arg<statically_named_arg<T, Char, N, Str>>
+    : std::true_type {};
+
+template <typename Char, size_t N,
+          fmt::detail_exported::fixed_string<Char, N> Str>
+struct udl_arg {
+  template <typename T> auto operator=(T&& value) const {
+    return statically_named_arg<T, Char, N, Str>(std::forward<T>(value));
+  }
+};
+#  else
+template <typename Char> struct udl_arg {
+  const Char* str;
+
+  template <typename T> auto operator=(T&& value) const -> named_arg<Char, T> {
+    return {str, std::forward<T>(value)};
+  }
+};
+#  endif
+#endif  // FMT_USE_USER_DEFINED_LITERALS
+
+template <typename Locale, typename Char>
+auto vformat(const Locale& loc, basic_string_view<Char> fmt,
+             typename detail::vformat_args<Char>::type args)
+    -> std::basic_string<Char> {
+  auto buf = basic_memory_buffer<Char>();
+  detail::vformat_to(buf, fmt, args, detail::locale_ref(loc));
+  return {buf.data(), buf.size()};
+}
+
+using format_func = void (*)(detail::buffer<char>&, int, const char*);
+
+FMT_API void format_error_code(buffer<char>& out, int error_code,
+                               string_view message) noexcept;
+
+using fmt::report_error;
+FMT_API void report_error(format_func func, int error_code,
+                          const char* message) noexcept;
+}  // namespace detail
+
+FMT_BEGIN_EXPORT
+FMT_API auto vsystem_error(int error_code, string_view format_str,
+                           format_args args) -> std::system_error;
+
+/**
+ * Constructs `std::system_error` with a message formatted with
+ * `fmt::format(fmt, args...)`.
+ * `error_code` is a system error code as given by `errno`.
+ *
+ * **Example**:
+ *
+ *     // This throws std::system_error with the description
+ *     //   cannot open file 'madeup': No such file or directory
+ *     // or similar (system message may vary).
+ *     const char* filename = "madeup";
+ *     std::FILE* file = std::fopen(filename, "r");
+ *     if (!file)
+ *       throw fmt::system_error(errno, "cannot open file '{}'", filename);
+ */
+template <typename... T>
+auto system_error(int error_code, format_string<T...> fmt, T&&... args)
+    -> std::system_error {
+  return vsystem_error(error_code, fmt, fmt::make_format_args(args...));
+}
+
+/**
+ * Formats an error message for an error returned by an operating system or a
+ * language runtime, for example a file opening error, and writes it to `out`.
+ * The format is the same as the one used by `std::system_error(ec, message)`
+ * where `ec` is `std::error_code(error_code, std::generic_category())`.
+ * It is implementation-defined but normally looks like:
+ *
+ *     <message>: <system-message>
+ *
+ * where `<message>` is the passed message and `<system-message>` is the system
+ * message corresponding to the error code.
+ * `error_code` is a system error code as given by `errno`.
+ */
+FMT_API void format_system_error(detail::buffer<char>& out, int error_code,
+                                 const char* message) noexcept;
+
+// Reports a system error without throwing an exception.
+// Can be used to report errors from destructors.
+FMT_API void report_system_error(int error_code, const char* message) noexcept;
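+// Usage sketch (illustrative; `fd_` is a hypothetical member of a wrapper
+// class): reporting a close() failure from a destructor, where throwing is
+// not an option.
+//
+//   ~fd_guard() {
+//     if (close(fd_) != 0)
+//       fmt::report_system_error(errno, "cannot close file");
+//   }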
+
+/// A fast integer formatter.
+class format_int {
+ private:
+  // Buffer should be large enough to hold all digits (digits10 + 1),
+  // a sign and a null character.
+  enum { buffer_size = std::numeric_limits<unsigned long long>::digits10 + 3 };
+  mutable char buffer_[buffer_size];
+  char* str_;
+
+  template <typename UInt>
+  FMT_CONSTEXPR20 auto format_unsigned(UInt value) -> char* {
+    auto n = static_cast<detail::uint32_or_64_or_128_t<UInt>>(value);
+    return detail::format_decimal(buffer_, n, buffer_size - 1).begin;
+  }
+
+  template <typename Int>
+  FMT_CONSTEXPR20 auto format_signed(Int value) -> char* {
+    auto abs_value = static_cast<detail::uint32_or_64_or_128_t<Int>>(value);
+    bool negative = value < 0;
+    if (negative) abs_value = 0 - abs_value;
+    auto begin = format_unsigned(abs_value);
+    if (negative) *--begin = '-';
+    return begin;
+  }
+
+ public:
+  explicit FMT_CONSTEXPR20 format_int(int value) : str_(format_signed(value)) {}
+  explicit FMT_CONSTEXPR20 format_int(long value)
+      : str_(format_signed(value)) {}
+  explicit FMT_CONSTEXPR20 format_int(long long value)
+      : str_(format_signed(value)) {}
+  explicit FMT_CONSTEXPR20 format_int(unsigned value)
+      : str_(format_unsigned(value)) {}
+  explicit FMT_CONSTEXPR20 format_int(unsigned long value)
+      : str_(format_unsigned(value)) {}
+  explicit FMT_CONSTEXPR20 format_int(unsigned long long value)
+      : str_(format_unsigned(value)) {}
+
+  /// Returns the number of characters written to the output buffer.
+  FMT_CONSTEXPR20 auto size() const -> size_t {
+    return detail::to_unsigned(buffer_ - str_ + buffer_size - 1);
+  }
+
+  /// Returns a pointer to the output buffer content. No terminating null
+  /// character is appended.
+  FMT_CONSTEXPR20 auto data() const -> const char* { return str_; }
+
+  /// Returns a pointer to the output buffer content with terminating null
+  /// character appended.
+  FMT_CONSTEXPR20 auto c_str() const -> const char* {
+    buffer_[buffer_size - 1] = '\0';
+    return str_;
+  }
+
+  /// Returns the content of the output buffer as an `std::string`.
+  auto str() const -> std::string { return std::string(str_, size()); }
+};
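+// Usage sketch (illustrative): formatting an integer without allocating until
+// the final std::string is built.
+//
+//   fmt::format_int n(42);
+//   // n.data() points at "42" (not null-terminated), n.size() == 2,
+//   // n.c_str() returns a null-terminated "42", n.str() copies it out.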
+
+template <typename T, typename Char>
+struct formatter<T, Char, enable_if_t<detail::has_format_as<T>::value>>
+    : formatter<detail::format_as_t<T>, Char> {
+  template <typename FormatContext>
+  auto format(const T& value, FormatContext& ctx) const -> decltype(ctx.out()) {
+    auto&& val = format_as(value);  // Make an lvalue reference for format.
+    return formatter<detail::format_as_t<T>, Char>::format(val, ctx);
+  }
+};
+
+#define FMT_FORMAT_AS(Type, Base)                                              \
+  template <typename Char>                                                     \
+  struct formatter<Type, Char> : formatter<Base, Char> {                       \
+    template <typename FormatContext>                                          \
+    auto format(Type value, FormatContext& ctx) const -> decltype(ctx.out()) { \
+      return formatter<Base, Char>::format(value, ctx);                        \
+    }                                                                          \
+  }
+
+FMT_FORMAT_AS(signed char, int);
+FMT_FORMAT_AS(unsigned char, unsigned);
+FMT_FORMAT_AS(short, int);
+FMT_FORMAT_AS(unsigned short, unsigned);
+FMT_FORMAT_AS(long, detail::long_type);
+FMT_FORMAT_AS(unsigned long, detail::ulong_type);
+FMT_FORMAT_AS(Char*, const Char*);
+FMT_FORMAT_AS(std::nullptr_t, const void*);
+FMT_FORMAT_AS(detail::std_string_view, basic_string_view<Char>);
+FMT_FORMAT_AS(void*, const void*);
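+// For reference, FMT_FORMAT_AS(signed char, int) above expands roughly to
+//
+//   template <typename Char>
+//   struct formatter<signed char, Char> : formatter<int, Char> {
+//     template <typename FormatContext>
+//     auto format(signed char value, FormatContext& ctx) const
+//         -> decltype(ctx.out()) {
+//       return formatter<int, Char>::format(value, ctx);
+//     }
+//   };
+//
+// so the listed types reuse the formatters of their wider base types.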
+
+template <typename Char, typename Traits, typename Allocator>
+class formatter<std::basic_string<Char, Traits, Allocator>, Char>
+    : public formatter<basic_string_view<Char>, Char> {};
+
+template <typename Char, size_t N>
+struct formatter<Char[N], Char> : formatter<basic_string_view<Char>, Char> {};
+
+/**
+ * Converts `p` to `const void*` for pointer formatting.
+ *
+ * **Example**:
+ *
+ *     auto s = fmt::format("{}", fmt::ptr(p));
+ */
+template <typename T> auto ptr(T p) -> const void* {
+  static_assert(std::is_pointer<T>::value, "");
+  return detail::bit_cast<const void*>(p);
+}
+
+/**
+ * Converts `e` to the underlying type.
+ *
+ * **Example**:
+ *
+ *     enum class color { red, green, blue };
+ *     auto s = fmt::format("{}", fmt::underlying(color::red));
+ */
+template <typename Enum>
+constexpr auto underlying(Enum e) noexcept -> underlying_t<Enum> {
+  return static_cast<underlying_t<Enum>>(e);
+}
+
+namespace enums {
+template <typename Enum, FMT_ENABLE_IF(std::is_enum<Enum>::value)>
+constexpr auto format_as(Enum e) noexcept -> underlying_t<Enum> {
+  return static_cast<underlying_t<Enum>>(e);
+}
+}  // namespace enums
+
+class bytes {
+ private:
+  string_view data_;
+  friend struct formatter<bytes>;
+
+ public:
+  explicit bytes(string_view data) : data_(data) {}
+};
+
+template <> struct formatter<bytes> {
+ private:
+  detail::dynamic_format_specs<> specs_;
+
+ public:
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> const char* {
+    return parse_format_specs(ctx.begin(), ctx.end(), specs_, ctx,
+                              detail::type::string_type);
+  }
+
+  template <typename FormatContext>
+  auto format(bytes b, FormatContext& ctx) const -> decltype(ctx.out()) {
+    auto specs = specs_;
+    detail::handle_dynamic_spec<detail::width_checker>(specs.width,
+                                                       specs.width_ref, ctx);
+    detail::handle_dynamic_spec<detail::precision_checker>(
+        specs.precision, specs.precision_ref, ctx);
+    return detail::write_bytes(ctx.out(), b.data_, specs);
+  }
+};
+
+// group_digits_view is not derived from view because it copies the argument.
+template <typename T> struct group_digits_view {
+  T value;
+};
+
+/**
+ * Returns a view that formats an integer value using ',' as a
+ * locale-independent thousands separator.
+ *
+ * **Example**:
+ *
+ *     fmt::print("{}", fmt::group_digits(12345));
+ *     // Output: "12,345"
+ */
+template <typename T> auto group_digits(T value) -> group_digits_view<T> {
+  return {value};
+}
+
+template <typename T> struct formatter<group_digits_view<T>> : formatter<T> {
+ private:
+  detail::dynamic_format_specs<> specs_;
+
+ public:
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> const char* {
+    return parse_format_specs(ctx.begin(), ctx.end(), specs_, ctx,
+                              detail::type::int_type);
+  }
+
+  template <typename FormatContext>
+  auto format(group_digits_view<T> t, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    auto specs = specs_;
+    detail::handle_dynamic_spec<detail::width_checker>(specs.width,
+                                                       specs.width_ref, ctx);
+    detail::handle_dynamic_spec<detail::precision_checker>(
+        specs.precision, specs.precision_ref, ctx);
+    auto arg = detail::make_write_int_arg(t.value, specs.sign);
+    return detail::write_int(
+        ctx.out(), static_cast<detail::uint64_or_128_t<T>>(arg.abs_value),
+        arg.prefix, specs, detail::digit_grouping<char>("\3", ","));
+  }
+};
+
+template <typename T, typename Char = char> struct nested_view {
+  const formatter<T, Char>* fmt;
+  const T* value;
+};
+
+template <typename T, typename Char>
+struct formatter<nested_view<T, Char>, Char> {
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    return ctx.begin();
+  }
+  template <typename FormatContext>
+  auto format(nested_view<T, Char> view, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    return view.fmt->format(*view.value, ctx);
+  }
+};
+
+template <typename T, typename Char = char> struct nested_formatter {
+ private:
+  int width_;
+  detail::fill_t fill_;
+  align_t align_ : 4;
+  formatter<T, Char> formatter_;
+
+ public:
+  constexpr nested_formatter() : width_(0), align_(align_t::none) {}
+
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    auto specs = detail::dynamic_format_specs<Char>();
+    auto it = parse_format_specs(ctx.begin(), ctx.end(), specs, ctx,
+                                 detail::type::none_type);
+    width_ = specs.width;
+    fill_ = specs.fill;
+    align_ = specs.align;
+    ctx.advance_to(it);
+    return formatter_.parse(ctx);
+  }
+
+  template <typename FormatContext, typename F>
+  auto write_padded(FormatContext& ctx, F write) const -> decltype(ctx.out()) {
+    if (width_ == 0) return write(ctx.out());
+    auto buf = basic_memory_buffer<Char>();
+    write(basic_appender<Char>(buf));
+    auto specs = format_specs();
+    specs.width = width_;
+    specs.fill = fill_;
+    specs.align = align_;
+    return detail::write<Char>(
+        ctx.out(), basic_string_view<Char>(buf.data(), buf.size()), specs);
+  }
+
+  auto nested(const T& value) const -> nested_view<T, Char> {
+    return nested_view<T, Char>{&formatter_, &value};
+  }
+};
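+// Usage sketch (illustrative, for a hypothetical `point` type): deriving from
+// nested_formatter<double> lets the outer formatter reuse width, fill and
+// alignment parsed from the format spec via write_padded().
+//
+//   struct point { double x, y; };
+//
+//   template <>
+//   struct fmt::formatter<point> : fmt::nested_formatter<double> {
+//     auto format(point p, fmt::format_context& ctx) const {
+//       return write_padded(ctx, [this, p](auto out) {
+//         return fmt::format_to(out, "({}, {})", nested(p.x), nested(p.y));
+//       });
+//     }
+//   };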
+
+/**
+ * Converts `value` to `std::string` using the default format for type `T`.
+ *
+ * **Example**:
+ *
+ *     std::string answer = fmt::to_string(42);
+ */
+template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value &&
+                                    !detail::has_format_as<T>::value)>
+inline auto to_string(const T& value) -> std::string {
+  auto buffer = memory_buffer();
+  detail::write<char>(appender(buffer), value);
+  return {buffer.data(), buffer.size()};
+}
+
+template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
+FMT_NODISCARD inline auto to_string(T value) -> std::string {
+  // The buffer should be large enough to store the number including the sign
+  // or "false" for bool.
+  constexpr int max_size = detail::digits10<T>() + 2;
+  char buffer[max_size > 5 ? static_cast<unsigned>(max_size) : 5];
+  char* begin = buffer;
+  return std::string(begin, detail::write(begin, value));
+}
+
+template <typename Char, size_t SIZE>
+FMT_NODISCARD auto to_string(const basic_memory_buffer<Char, SIZE>& buf)
+    -> std::basic_string<Char> {
+  auto size = buf.size();
+  detail::assume(size < std::basic_string<Char>().max_size());
+  return std::basic_string<Char>(buf.data(), size);
+}
+
+template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value &&
+                                    detail::has_format_as<T>::value)>
+inline auto to_string(const T& value) -> std::string {
+  return to_string(format_as(value));
+}
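+// Usage sketch (illustrative):
+//
+//   auto a = fmt::to_string(42);    // "42"
+//   auto b = fmt::to_string(1.5);   // "1.5"
+//   auto c = fmt::to_string(true);  // "true"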
+
+FMT_END_EXPORT
+
+namespace detail {
+
+template <typename Char>
+void vformat_to(buffer<Char>& buf, basic_string_view<Char> fmt,
+                typename vformat_args<Char>::type args, locale_ref loc) {
+  auto out = basic_appender<Char>(buf);
+  if (fmt.size() == 2 && equal2(fmt.data(), "{}")) {
+    auto arg = args.get(0);
+    if (!arg) report_error("argument not found");
+    arg.visit(default_arg_formatter<Char>{out, args, loc});
+    return;
+  }
+
+  struct format_handler {
+    basic_format_parse_context<Char> parse_context;
+    buffered_context<Char> context;
+
+    format_handler(basic_appender<Char> p_out, basic_string_view<Char> str,
+                   basic_format_args<buffered_context<Char>> p_args,
+                   locale_ref p_loc)
+        : parse_context(str), context(p_out, p_args, p_loc) {}
+
+    void on_text(const Char* begin, const Char* end) {
+      auto text = basic_string_view<Char>(begin, to_unsigned(end - begin));
+      context.advance_to(write(context.out(), text));
+    }
+
+    FMT_CONSTEXPR auto on_arg_id() -> int {
+      return parse_context.next_arg_id();
+    }
+    FMT_CONSTEXPR auto on_arg_id(int id) -> int {
+      parse_context.check_arg_id(id);
+      return id;
+    }
+    FMT_CONSTEXPR auto on_arg_id(basic_string_view<Char> id) -> int {
+      parse_context.check_arg_id(id);
+      int arg_id = context.arg_id(id);
+      if (arg_id < 0) report_error("argument not found");
+      return arg_id;
+    }
+
+    FMT_INLINE void on_replacement_field(int id, const Char*) {
+      auto arg = get_arg(context, id);
+      context.advance_to(arg.visit(default_arg_formatter<Char>{
+          context.out(), context.args(), context.locale()}));
+    }
+
+    auto on_format_specs(int id, const Char* begin, const Char* end)
+        -> const Char* {
+      auto arg = get_arg(context, id);
+      // Not using a visitor for custom types gives better codegen.
+      if (arg.format_custom(begin, parse_context, context))
+        return parse_context.begin();
+      auto specs = detail::dynamic_format_specs<Char>();
+      begin = parse_format_specs(begin, end, specs, parse_context, arg.type());
+      detail::handle_dynamic_spec<detail::width_checker>(
+          specs.width, specs.width_ref, context);
+      detail::handle_dynamic_spec<detail::precision_checker>(
+          specs.precision, specs.precision_ref, context);
+      if (begin == end || *begin != '}')
+        report_error("missing '}' in format string");
+      context.advance_to(arg.visit(
+          arg_formatter<Char>{context.out(), specs, context.locale()}));
+      return begin;
+    }
+
+    FMT_NORETURN void on_error(const char* message) { report_error(message); }
+  };
+  detail::parse_format_string(fmt, format_handler(out, fmt, args, loc));
+}
+
+FMT_BEGIN_EXPORT
+
+#ifndef FMT_HEADER_ONLY
+extern template FMT_API void vformat_to(buffer<char>&, string_view,
+                                        typename vformat_args<>::type,
+                                        locale_ref);
+extern template FMT_API auto thousands_sep_impl<char>(locale_ref)
+    -> thousands_sep_result<char>;
+extern template FMT_API auto thousands_sep_impl<wchar_t>(locale_ref)
+    -> thousands_sep_result<wchar_t>;
+extern template FMT_API auto decimal_point_impl(locale_ref) -> char;
+extern template FMT_API auto decimal_point_impl(locale_ref) -> wchar_t;
+#endif  // FMT_HEADER_ONLY
+
+FMT_END_EXPORT
+
+template <typename T, typename Char, type TYPE>
+template <typename FormatContext>
+FMT_CONSTEXPR FMT_INLINE auto native_formatter<T, Char, TYPE>::format(
+    const T& val, FormatContext& ctx) const -> decltype(ctx.out()) {
+  if (specs_.width_ref.kind == arg_id_kind::none &&
+      specs_.precision_ref.kind == arg_id_kind::none) {
+    return write(ctx.out(), val, specs_, ctx.locale());
+  }
+  auto specs = specs_;
+  handle_dynamic_spec<width_checker>(specs.width, specs.width_ref, ctx);
+  handle_dynamic_spec<precision_checker>(specs.precision, specs.precision_ref,
+                                         ctx);
+  return write(ctx.out(), val, specs, ctx.locale());
+}
+
+}  // namespace detail
+
+FMT_BEGIN_EXPORT
+
+template <typename T, typename Char>
+struct formatter<T, Char, enable_if_t<detail::type_constant<T, Char>::value !=
+                                      detail::type::custom_type>>
+    : detail::native_formatter<T, Char, detail::type_constant<T, Char>::value> {
+};
+
+#if FMT_USE_USER_DEFINED_LITERALS
+inline namespace literals {
+/**
+ * User-defined literal equivalent of `fmt::arg`.
+ *
+ * **Example**:
+ *
+ *     using namespace fmt::literals;
+ *     fmt::print("The answer is {answer}.", "answer"_a=42);
+ */
+#  if FMT_USE_NONTYPE_TEMPLATE_ARGS
+template <detail_exported::fixed_string Str> constexpr auto operator""_a() {
+  using char_t = remove_cvref_t<decltype(Str.data[0])>;
+  return detail::udl_arg<char_t, sizeof(Str.data) / sizeof(char_t), Str>();
+}
+#  else
+constexpr auto operator""_a(const char* s, size_t) -> detail::udl_arg<char> {
+  return {s};
+}
+#  endif
+}  // namespace literals
+#endif  // FMT_USE_USER_DEFINED_LITERALS
+
+FMT_API auto vformat(string_view fmt, format_args args) -> std::string;
+
+/**
+ * Formats `args` according to specifications in `fmt` and returns the result
+ * as a string.
+ *
+ * **Example**:
+ *
+ *     #include 
+ *     std::string message = fmt::format("The answer is {}.", 42);
+ */
+template <typename... T>
+FMT_NODISCARD FMT_INLINE auto format(format_string<T...> fmt, T&&... args)
+    -> std::string {
+  return vformat(fmt, fmt::make_format_args(args...));
+}
+
+template <typename Locale, FMT_ENABLE_IF(detail::is_locale<Locale>::value)>
+inline auto vformat(const Locale& loc, string_view fmt, format_args args)
+    -> std::string {
+  return detail::vformat(loc, fmt, args);
+}
+
+template <typename Locale, typename... T,
+          FMT_ENABLE_IF(detail::is_locale<Locale>::value)>
+inline auto format(const Locale& loc, format_string<T...> fmt, T&&... args)
+    -> std::string {
+  return fmt::vformat(loc, string_view(fmt), fmt::make_format_args(args...));
+}
+
+template <typename OutputIt, typename Locale,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value&&
+                            detail::is_locale<Locale>::value)>
+auto vformat_to(OutputIt out, const Locale& loc, string_view fmt,
+                format_args args) -> OutputIt {
+  using detail::get_buffer;
+  auto&& buf = get_buffer(out);
+  detail::vformat_to(buf, fmt, args, detail::locale_ref(loc));
+  return detail::get_iterator(buf, out);
+}
+
+template <typename OutputIt, typename Locale, typename... T,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value&&
+                            detail::is_locale<Locale>::value)>
+FMT_INLINE auto format_to(OutputIt out, const Locale& loc,
+                          format_string<T...> fmt, T&&... args) -> OutputIt {
+  return vformat_to(out, loc, fmt, fmt::make_format_args(args...));
+}
+
+template <typename Locale, typename... T,
+          FMT_ENABLE_IF(detail::is_locale<Locale>::value)>
+FMT_NODISCARD FMT_INLINE auto formatted_size(const Locale& loc,
+                                             format_string<T...> fmt,
+                                             T&&... args) -> size_t {
+  auto buf = detail::counting_buffer<>();
+  detail::vformat_to<char>(buf, fmt, fmt::make_format_args(args...),
+                           detail::locale_ref(loc));
+  return buf.count();
+}
+
+FMT_END_EXPORT
+
+FMT_END_NAMESPACE
+
+#ifdef FMT_HEADER_ONLY
+#  define FMT_FUNC inline
+#  include "format-inl.h"
+#else
+#  define FMT_FUNC
+#endif
+
+// Restore _LIBCPP_REMOVE_TRANSITIVE_INCLUDES.
+#ifdef FMT_REMOVE_TRANSITIVE_INCLUDES
+#  undef _LIBCPP_REMOVE_TRANSITIVE_INCLUDES
+#endif
+
+#endif  // FMT_FORMAT_H_
diff --git a/lib/fmt/fmt/os.h b/lib/fmt/fmt/os.h
new file mode 100644
index 000000000..5c85ea08f
--- /dev/null
+++ b/lib/fmt/fmt/os.h
@@ -0,0 +1,439 @@
+// Formatting library for C++ - optional OS-specific functionality
+//
+// Copyright (c) 2012 - present, Victor Zverovich
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_OS_H_
+#define FMT_OS_H_
+
+#include "format.h"
+
+#ifndef FMT_MODULE
+#  include <cerrno>
+#  include <cstddef>
+#  include <cstdio>
+#  include <system_error>  // std::system_error
+
+#  if FMT_HAS_INCLUDE(<xlocale.h>)
+#    include <xlocale.h>  // LC_NUMERIC_MASK on macOS
+#  endif
+#endif  // FMT_MODULE
+
+#ifndef FMT_USE_FCNTL
+// UWP doesn't provide _pipe.
+#  if FMT_HAS_INCLUDE("winapifamily.h")
+#    include <winapifamily.h>
+#  endif
+#  if (FMT_HAS_INCLUDE(<fcntl.h>) || defined(__APPLE__) || \
+       defined(__linux__)) &&                              \
+      (!defined(WINAPI_FAMILY) ||                          \
+       (WINAPI_FAMILY == WINAPI_FAMILY_DESKTOP_APP))
+#    include <fcntl.h>  // for O_RDONLY
+#    define FMT_USE_FCNTL 1
+#  else
+#    define FMT_USE_FCNTL 0
+#  endif
+#endif
+
+#ifndef FMT_POSIX
+#  if defined(_WIN32) && !defined(__MINGW32__)
+// Fix warnings about deprecated symbols.
+#    define FMT_POSIX(call) _##call
+#  else
+#    define FMT_POSIX(call) call
+#  endif
+#endif
+
+// Calls to system functions are wrapped in FMT_SYSTEM for testability.
+#ifdef FMT_SYSTEM
+#  define FMT_HAS_SYSTEM
+#  define FMT_POSIX_CALL(call) FMT_SYSTEM(call)
+#else
+#  define FMT_SYSTEM(call) ::call
+#  ifdef _WIN32
+// Fix warnings about deprecated symbols.
+#    define FMT_POSIX_CALL(call) ::_##call
+#  else
+#    define FMT_POSIX_CALL(call) ::call
+#  endif
+#endif
+
+// Retries the expression while it evaluates to error_result and errno
+// equals to EINTR.
+#ifndef _WIN32
+#  define FMT_RETRY_VAL(result, expression, error_result) \
+    do {                                                  \
+      (result) = (expression);                            \
+    } while ((result) == (error_result) && errno == EINTR)
+#else
+#  define FMT_RETRY_VAL(result, expression, error_result) result = (expression)
+#endif
+
+#define FMT_RETRY(result, expression) FMT_RETRY_VAL(result, expression, -1)
+
+FMT_BEGIN_NAMESPACE
+FMT_BEGIN_EXPORT
+
+/**
+ * A reference to a null-terminated string. It can be constructed from a C
+ * string or `std::string`.
+ *
+ * You can use one of the following type aliases for common character types:
+ *
+ * +---------------+-----------------------------+
+ * | Type          | Definition                  |
+ * +===============+=============================+
+ * | cstring_view  | basic_cstring_view<char>    |
+ * +---------------+-----------------------------+
+ * | wcstring_view | basic_cstring_view<wchar_t> |
+ * +---------------+-----------------------------+
+ *
+ * This class is most useful as a parameter type for functions that wrap C APIs.
+ */
+template <typename Char> class basic_cstring_view {
+ private:
+  const Char* data_;
+
+ public:
+  /// Constructs a string reference object from a C string.
+  basic_cstring_view(const Char* s) : data_(s) {}
+
+  /// Constructs a string reference from an `std::string` object.
+  basic_cstring_view(const std::basic_string<Char>& s) : data_(s.c_str()) {}
+
+  /// Returns the pointer to a C string.
+  auto c_str() const -> const Char* { return data_; }
+};
+
+using cstring_view = basic_cstring_view<char>;
+using wcstring_view = basic_cstring_view<wchar_t>;
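+// Usage sketch (illustrative): a wrapper around a C API that requires a
+// null-terminated string can accept both C strings and std::string without
+// copying.
+//
+//   void remove_file(fmt::cstring_view path) { std::remove(path.c_str()); }
+//   // remove_file("old.log");
+//   // remove_file(std::string("old.log"));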
+
+#ifdef _WIN32
+FMT_API const std::error_category& system_category() noexcept;
+
+namespace detail {
+FMT_API void format_windows_error(buffer<char>& out, int error_code,
+                                  const char* message) noexcept;
+}
+
+FMT_API std::system_error vwindows_error(int error_code, string_view format_str,
+                                         format_args args);
+
+/**
+ * Constructs a `std::system_error` object with the description of the form
+ *
+ *     <message>: <system-message>
+ *
+ * where `<message>` is the formatted message and `<system-message>` is the
+ * system message corresponding to the error code.
+ * `error_code` is a Windows error code as given by `GetLastError`.
+ * If `error_code` is not a valid error code such as -1, the system message
+ * will look like "error -1".
+ *
+ * **Example**:
+ *
+ *     // This throws a system_error with the description
+ *     //   cannot open file 'madeup': The system cannot find the file
+ *     //   specified.
+ *     // or similar (system message may vary).
+ *     const char *filename = "madeup";
+ *     LPOFSTRUCT of = LPOFSTRUCT();
+ *     HFILE file = OpenFile(filename, &of, OF_READ);
+ *     if (file == HFILE_ERROR) {
+ *       throw fmt::windows_error(GetLastError(),
+ *                                "cannot open file '{}'", filename);
+ *     }
+ */
+template <typename... Args>
+std::system_error windows_error(int error_code, string_view message,
+                                const Args&... args) {
+  return vwindows_error(error_code, message, fmt::make_format_args(args...));
+}
+
+// Reports a Windows error without throwing an exception.
+// Can be used to report errors from destructors.
+FMT_API void report_windows_error(int error_code, const char* message) noexcept;
+#else
+inline auto system_category() noexcept -> const std::error_category& {
+  return std::system_category();
+}
+#endif  // _WIN32
+
+// std::system is not available on some platforms such as iOS (#2248).
+#ifdef __OSX__
+template <typename S, typename... Args, typename Char = char_t<S>>
+void say(const S& format_str, Args&&... args) {
+  std::system(format("say \"{}\"", format(format_str, args...)).c_str());
+}
+#endif
+
+// A buffered file.
+class buffered_file {
+ private:
+  FILE* file_;
+
+  friend class file;
+
+  explicit buffered_file(FILE* f) : file_(f) {}
+
+ public:
+  buffered_file(const buffered_file&) = delete;
+  void operator=(const buffered_file&) = delete;
+
+  // Constructs a buffered_file object which doesn't represent any file.
+  buffered_file() noexcept : file_(nullptr) {}
+
+  // Destroys the object closing the file it represents if any.
+  FMT_API ~buffered_file() noexcept;
+
+ public:
+  buffered_file(buffered_file&& other) noexcept : file_(other.file_) {
+    other.file_ = nullptr;
+  }
+
+  auto operator=(buffered_file&& other) -> buffered_file& {
+    close();
+    file_ = other.file_;
+    other.file_ = nullptr;
+    return *this;
+  }
+
+  // Opens a file.
+  FMT_API buffered_file(cstring_view filename, cstring_view mode);
+
+  // Closes the file.
+  FMT_API void close();
+
+  // Returns the pointer to a FILE object representing this file.
+  auto get() const noexcept -> FILE* { return file_; }
+
+  FMT_API auto descriptor() const -> int;
+
+  template <typename... T>
+  inline void print(string_view fmt, const T&... args) {
+    const auto& vargs = fmt::make_format_args(args...);
+    detail::is_locking<T...>() ? fmt::vprint_buffered(file_, fmt, vargs)
+                               : fmt::vprint(file_, fmt, vargs);
+  }
+};
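+// Usage sketch (illustrative):
+//
+//   auto f = fmt::buffered_file("log.txt", "a");
+//   f.print("pid: {}\n", 42);
+//   f.close();  // also done by the destructor if omitted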
+
+#if FMT_USE_FCNTL
+
+// A file. Closed file is represented by a file object with descriptor -1.
+// Methods that are not declared with noexcept may throw
+// fmt::system_error in case of failure. Note that some errors such as
+// closing the file multiple times will cause a crash on Windows rather
+// than an exception. You can get standard behavior by overriding the
+// invalid parameter handler with _set_invalid_parameter_handler.
+class FMT_API file {
+ private:
+  int fd_;  // File descriptor.
+
+  // Constructs a file object with a given descriptor.
+  explicit file(int fd) : fd_(fd) {}
+
+  friend struct pipe;
+
+ public:
+  // Possible values for the oflag argument to the constructor.
+  enum {
+    RDONLY = FMT_POSIX(O_RDONLY),  // Open for reading only.
+    WRONLY = FMT_POSIX(O_WRONLY),  // Open for writing only.
+    RDWR = FMT_POSIX(O_RDWR),      // Open for reading and writing.
+    CREATE = FMT_POSIX(O_CREAT),   // Create if the file doesn't exist.
+    APPEND = FMT_POSIX(O_APPEND),  // Open in append mode.
+    TRUNC = FMT_POSIX(O_TRUNC)     // Truncate the content of the file.
+  };
+
+  // Constructs a file object which doesn't represent any file.
+  file() noexcept : fd_(-1) {}
+
+  // Opens a file and constructs a file object representing this file.
+  file(cstring_view path, int oflag);
+
+ public:
+  file(const file&) = delete;
+  void operator=(const file&) = delete;
+
+  file(file&& other) noexcept : fd_(other.fd_) { other.fd_ = -1; }
+
+  // Move assignment is not noexcept because close may throw.
+  auto operator=(file&& other) -> file& {
+    close();
+    fd_ = other.fd_;
+    other.fd_ = -1;
+    return *this;
+  }
+
+  // Destroys the object closing the file it represents if any.
+  ~file() noexcept;
+
+  // Returns the file descriptor.
+  auto descriptor() const noexcept -> int { return fd_; }
+
+  // Closes the file.
+  void close();
+
+  // Returns the file size. The size has signed type for consistency with
+  // stat::st_size.
+  auto size() const -> long long;
+
+  // Attempts to read count bytes from the file into the specified buffer.
+  auto read(void* buffer, size_t count) -> size_t;
+
+  // Attempts to write count bytes from the specified buffer to the file.
+  auto write(const void* buffer, size_t count) -> size_t;
+
+  // Duplicates a file descriptor with the dup function and returns
+  // the duplicate as a file object.
+  static auto dup(int fd) -> file;
+
+  // Makes fd be the copy of this file descriptor, closing fd first if
+  // necessary.
+  void dup2(int fd);
+
+  // Makes fd be the copy of this file descriptor, closing fd first if
+  // necessary.
+  void dup2(int fd, std::error_code& ec) noexcept;
+
+  // Creates a buffered_file object associated with this file and detaches
+  // this file object from the file.
+  auto fdopen(const char* mode) -> buffered_file;
+
+#  if defined(_WIN32) && !defined(__MINGW32__)
+  // Opens a file and constructs a file object representing this file by
+  // wcstring_view filename. Windows only.
+  static file open_windows_file(wcstring_view path, int oflag);
+#  endif
+};
+
+struct FMT_API pipe {
+  file read_end;
+  file write_end;
+
+  // Creates a pipe setting up read_end and write_end file objects for reading
+  // and writing respectively.
+  pipe();
+};
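+// Usage sketch (illustrative): an in-process pipe; small writes fit in the
+// pipe buffer, so this does not block.
+//
+//   fmt::pipe p;
+//   p.write_end.write("ping", 4);
+//   char buf[4];
+//   p.read_end.read(buf, sizeof(buf));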
+
+// Returns the memory page size.
+auto getpagesize() -> long;
+
+namespace detail {
+
+struct buffer_size {
+  buffer_size() = default;
+  size_t value = 0;
+  auto operator=(size_t val) const -> buffer_size {
+    auto bs = buffer_size();
+    bs.value = val;
+    return bs;
+  }
+};
+
+struct ostream_params {
+  int oflag = file::WRONLY | file::CREATE | file::TRUNC;
+  size_t buffer_size = BUFSIZ > 32768 ? BUFSIZ : 32768;
+
+  ostream_params() {}
+
+  template <typename... T>
+  ostream_params(T... params, int new_oflag) : ostream_params(params...) {
+    oflag = new_oflag;
+  }
+
+  template <typename... T>
+  ostream_params(T... params, detail::buffer_size bs)
+      : ostream_params(params...) {
+    this->buffer_size = bs.value;
+  }
+
+// Intel has a bug that results in failure to deduce a constructor
+// for empty parameter packs.
+#  if defined(__INTEL_COMPILER) && __INTEL_COMPILER < 2000
+  ostream_params(int new_oflag) : oflag(new_oflag) {}
+  ostream_params(detail::buffer_size bs) : buffer_size(bs.value) {}
+#  endif
+};
+
+class file_buffer final : public buffer<char> {
+ private:
+  file file_;
+
+  FMT_API static void grow(buffer<char>& buf, size_t);
+
+ public:
+  FMT_API file_buffer(cstring_view path, const ostream_params& params);
+  FMT_API file_buffer(file_buffer&& other) noexcept;
+  FMT_API ~file_buffer();
+
+  void flush() {
+    if (size() == 0) return;
+    file_.write(data(), size() * sizeof(data()[0]));
+    clear();
+  }
+
+  void close() {
+    flush();
+    file_.close();
+  }
+};
+
+}  // namespace detail
+
+constexpr auto buffer_size = detail::buffer_size();
+
+/// A fast output stream for writing from a single thread. Writing from
+/// multiple threads without external synchronization may result in a data race.
+class FMT_API ostream {
+ private:
+  FMT_MSC_WARNING(suppress : 4251)
+  detail::file_buffer buffer_;
+
+  ostream(cstring_view path, const detail::ostream_params& params)
+      : buffer_(path, params) {}
+
+ public:
+  ostream(ostream&& other) : buffer_(std::move(other.buffer_)) {}
+
+  ~ostream();
+
+  void flush() { buffer_.flush(); }
+
+  template <typename... T>
+  friend auto output_file(cstring_view path, T... params) -> ostream;
+
+  void close() { buffer_.close(); }
+
+  /// Formats `args` according to specifications in `fmt` and writes the
+  /// output to the file.
+  template <typename... T> void print(format_string<T...> fmt, T&&... args) {
+    vformat_to(appender(buffer_), fmt, fmt::make_format_args(args...));
+  }
+};
+
+/**
+ * Opens a file for writing. Supported parameters passed in `params`:
+ *
+ * - `<integer>`: Flags passed to [open](
+ *   https://pubs.opengroup.org/onlinepubs/007904875/functions/open.html)
+ *   (`file::WRONLY | file::CREATE | file::TRUNC` by default)
+ * - `buffer_size=<integer>`: Output buffer size
+ *
+ * **Example**:
+ *
+ *     auto out = fmt::output_file("guide.txt");
+ *     out.print("Don't {}", "Panic");
+ */
+template <typename... T>
+inline auto output_file(cstring_view path, T... params) -> ostream {
+  return {path, detail::ostream_params(params...)};
+}
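+// Usage sketch (illustrative): appending with an explicit flag set and a
+// larger output buffer (the values shown are arbitrary).
+//
+//   auto out = fmt::output_file(
+//       "guide.txt", fmt::file::WRONLY | fmt::file::CREATE | fmt::file::APPEND,
+//       fmt::buffer_size = 65536);
+//   out.print("Don't {}\n", "Panic");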
+#endif  // FMT_USE_FCNTL
+
+FMT_END_EXPORT
+FMT_END_NAMESPACE
+
+#endif  // FMT_OS_H_
diff --git a/lib/fmt/fmt/ostream.h b/lib/fmt/fmt/ostream.h
new file mode 100644
index 000000000..98faef659
--- /dev/null
+++ b/lib/fmt/fmt/ostream.h
@@ -0,0 +1,211 @@
+// Formatting library for C++ - std::ostream support
+//
+// Copyright (c) 2012 - present, Victor Zverovich
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_OSTREAM_H_
+#define FMT_OSTREAM_H_
+
+#ifndef FMT_MODULE
+#  include <fstream>  // std::filebuf
+#endif
+
+#ifdef _WIN32
+#  ifdef __GLIBCXX__
+#    include <ext/stdio_filebuf.h>
+#    include <ext/stdio_sync_filebuf.h>
+#  endif
+#  include <io.h>
+#endif
+
+#include "chrono.h"  // formatbuf
+
+FMT_BEGIN_NAMESPACE
+namespace detail {
+
+// Generate a unique explicit instantion in every translation unit using a tag
+// type in an anonymous namespace.
+namespace {
+struct file_access_tag {};
+}  // namespace
+template <typename Tag, typename BufType, FILE* BufType::*FileMemberPtr>
+class file_access {
+  friend auto get_file(BufType& obj) -> FILE* { return obj.*FileMemberPtr; }
+};
+
+#if FMT_MSC_VERSION
+template class file_access<file_access_tag, std::filebuf, &std::filebuf::_Myfile>;
+auto get_file(std::filebuf&) -> FILE*;
+#endif
+
+inline auto write_ostream_unicode(std::ostream& os, fmt::string_view data)
+    -> bool {
+  FILE* f = nullptr;
+#if FMT_MSC_VERSION && FMT_USE_RTTI
+  if (auto* buf = dynamic_cast<std::filebuf*>(os.rdbuf()))
+    f = get_file(*buf);
+  else
+    return false;
+#elif defined(_WIN32) && defined(__GLIBCXX__) && FMT_USE_RTTI
+  auto* rdbuf = os.rdbuf();
+  if (auto* sfbuf = dynamic_cast<__gnu_cxx::stdio_sync_filebuf<char>*>(rdbuf))
+    f = sfbuf->file();
+  else if (auto* fbuf = dynamic_cast<__gnu_cxx::stdio_filebuf<char>*>(rdbuf))
+    f = fbuf->file();
+  else
+    return false;
+#else
+  ignore_unused(os, data, f);
+#endif
+#ifdef _WIN32
+  if (f) {
+    int fd = _fileno(f);
+    if (_isatty(fd)) {
+      os.flush();
+      return write_console(fd, data);
+    }
+  }
+#endif
+  return false;
+}
+inline auto write_ostream_unicode(std::wostream&,
+                                  fmt::basic_string_view<wchar_t>) -> bool {
+  return false;
+}
+
+// Write the content of buf to os.
+// It is a separate function rather than a part of vprint to simplify testing.
+template <typename Char>
+void write_buffer(std::basic_ostream<Char>& os, buffer<Char>& buf) {
+  const Char* buf_data = buf.data();
+  using unsigned_streamsize = std::make_unsigned<std::streamsize>::type;
+  unsigned_streamsize size = buf.size();
+  unsigned_streamsize max_size = to_unsigned(max_value<std::streamsize>());
+  do {
+    unsigned_streamsize n = size <= max_size ? size : max_size;
+    os.write(buf_data, static_cast<std::streamsize>(n));
+    buf_data += n;
+    size -= n;
+  } while (size != 0);
+}
+
+template <typename Char, typename T>
+void format_value(buffer<Char>& buf, const T& value) {
+  auto&& format_buf = formatbuf<std::basic_streambuf<Char>>(buf);
+  auto&& output = std::basic_ostream<Char>(&format_buf);
+#if !defined(FMT_STATIC_THOUSANDS_SEPARATOR)
+  output.imbue(std::locale::classic());  // The default is always unlocalized.
+#endif
+  output << value;
+  output.exceptions(std::ios_base::failbit | std::ios_base::badbit);
+}
+
+template <typename T> struct streamed_view {
+  const T& value;
+};
+
+}  // namespace detail
+
+// Formats an object of type T that has an overloaded ostream operator<<.
+template <typename Char>
+struct basic_ostream_formatter : formatter<basic_string_view<Char>, Char> {
+  void set_debug_format() = delete;
+
+  template <typename T, typename Context>
+  auto format(const T& value, Context& ctx) const -> decltype(ctx.out()) {
+    auto buffer = basic_memory_buffer<Char>();
+    detail::format_value(buffer, value);
+    return formatter<basic_string_view<Char>, Char>::format(
+        {buffer.data(), buffer.size()}, ctx);
+  }
+};
+
+using ostream_formatter = basic_ostream_formatter<char>;
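A minimal sketch of the intended opt-in pattern for ostream_formatter (the `date` type below is hypothetical; any type with a suitable operator<< works the same way):

    struct date { int y, m, d; };
    std::ostream& operator<<(std::ostream& os, const date& v) {
      return os << v.y << '-' << v.m << '-' << v.d;
    }
    template <> struct fmt::formatter<date> : fmt::ostream_formatter {};
    // fmt::format("{:>12}", date{2024, 5, 1}) pads the streamed text like a string.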
+
+template <typename T, typename Char>
+struct formatter<detail::streamed_view<T>, Char>
+    : basic_ostream_formatter<Char> {
+  template <typename Context>
+  auto format(detail::streamed_view<T> view, Context& ctx) const
+      -> decltype(ctx.out()) {
+    return basic_ostream_formatter<Char>::format(view.value, ctx);
+  }
+};
+
+/**
+ * Returns a view that formats `value` via an ostream `operator<<`.
+ *
+ * **Example**:
+ *
+ *     fmt::print("Current thread id: {}\n",
+ *                fmt::streamed(std::this_thread::get_id()));
+ */
+template <typename T>
+constexpr auto streamed(const T& value) -> detail::streamed_view<T> {
+  return {value};
+}
+
+namespace detail {
+
+inline void vprint_directly(std::ostream& os, string_view format_str,
+                            format_args args) {
+  auto buffer = memory_buffer();
+  detail::vformat_to(buffer, format_str, args);
+  detail::write_buffer(os, buffer);
+}
+
+}  // namespace detail
+
+FMT_EXPORT template <typename Char>
+void vprint(std::basic_ostream<Char>& os,
+            basic_string_view<type_identity_t<Char>> format_str,
+            typename detail::vformat_args<Char>::type args) {
+  auto buffer = basic_memory_buffer<Char>();
+  detail::vformat_to(buffer, format_str, args);
+  if (detail::write_ostream_unicode(os, {buffer.data(), buffer.size()})) return;
+  detail::write_buffer(os, buffer);
+}
+
+/**
+ * Prints formatted data to the stream `os`.
+ *
+ * **Example**:
+ *
+ *     fmt::print(cerr, "Don't {}!", "panic");
+ */
+FMT_EXPORT template <typename... T>
+void print(std::ostream& os, format_string<T...> fmt, T&&... args) {
+  const auto& vargs = fmt::make_format_args(args...);
+  if (detail::use_utf8())
+    vprint(os, fmt, vargs);
+  else
+    detail::vprint_directly(os, fmt, vargs);
+}
+
+FMT_EXPORT
+template <typename... Args>
+void print(std::wostream& os,
+           basic_format_string<wchar_t, type_identity_t<Args>...> fmt,
+           Args&&... args) {
+  vprint(os, fmt, fmt::make_format_args<buffered_context<wchar_t>>(args...));
+}
+
+FMT_EXPORT template <typename... T>
+void println(std::ostream& os, format_string<T...> fmt, T&&... args) {
+  fmt::print(os, "{}\n", fmt::format(fmt, std::forward<T>(args)...));
+}
+
+FMT_EXPORT
+template <typename... Args>
+void println(std::wostream& os,
+             basic_format_string<wchar_t, type_identity_t<Args>...> fmt,
+             Args&&... args) {
+  print(os, L"{}\n", fmt::format(fmt, std::forward<Args>(args)...));
+}
+
+FMT_END_NAMESPACE
+
+#endif  // FMT_OSTREAM_H_
diff --git a/lib/fmt/fmt/printf.h b/lib/fmt/fmt/printf.h
new file mode 100644
index 000000000..072cc6b30
--- /dev/null
+++ b/lib/fmt/fmt/printf.h
@@ -0,0 +1,656 @@
+// Formatting library for C++ - legacy printf implementation
+//
+// Copyright (c) 2012 - 2016, Victor Zverovich
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_PRINTF_H_
+#define FMT_PRINTF_H_
+
+#ifndef FMT_MODULE
+#  include <algorithm>  // std::max
+#  include <limits>     // std::numeric_limits
+#endif
+
+#include "format.h"
+
+FMT_BEGIN_NAMESPACE
+FMT_BEGIN_EXPORT
+
+template <typename T> struct printf_formatter {
+  printf_formatter() = delete;
+};
+
+template <typename Char> class basic_printf_context {
+ private:
+  basic_appender<Char> out_;
+  basic_format_args<basic_printf_context> args_;
+
+  static_assert(std::is_same<Char, char>::value ||
+                    std::is_same<Char, wchar_t>::value,
+                "Unsupported code unit type.");
+
+ public:
+  using char_type = Char;
+  using parse_context_type = basic_format_parse_context<Char>;
+  template <typename T> using formatter_type = printf_formatter<T>;
+
+  /// Constructs a `printf_context` object. References to the arguments are
+  /// stored in the context object so make sure they have appropriate lifetimes.
+  basic_printf_context(basic_appender<Char> out,
+                       basic_format_args<basic_printf_context> args)
+      : out_(out), args_(args) {}
+
+  auto out() -> basic_appender<Char> { return out_; }
+  void advance_to(basic_appender<Char>) {}
+
+  auto locale() -> detail::locale_ref { return {}; }
+
+  auto arg(int id) const -> basic_format_arg<basic_printf_context> {
+    return args_.get(id);
+  }
+};
+
+namespace detail {
+
+// Checks if a value fits in int - used to avoid warnings about comparing
+// signed and unsigned integers.
+template <bool IsSigned> struct int_checker {
+  template <typename T> static auto fits_in_int(T value) -> bool {
+    unsigned max = to_unsigned(max_value<int>());
+    return value <= max;
+  }
+  static auto fits_in_int(bool) -> bool { return true; }
+};
+
+template <> struct int_checker<true> {
+  template <typename T> static auto fits_in_int(T value) -> bool {
+    return value >= (std::numeric_limits<int>::min)() &&
+           value <= max_value<int>();
+  }
+  static auto fits_in_int(int) -> bool { return true; }
+};
+
+struct printf_precision_handler {
+  template ::value)>
+  auto operator()(T value) -> int {
+    if (!int_checker::is_signed>::fits_in_int(value))
+      report_error("number is too big");
+    return (std::max)(static_cast(value), 0);
+  }
+
+  template ::value)>
+  auto operator()(T) -> int {
+    report_error("precision is not integer");
+    return 0;
+  }
+};
+
+// An argument visitor that returns true iff arg is a zero integer.
+struct is_zero_int {
+  template ::value)>
+  auto operator()(T value) -> bool {
+    return value == 0;
+  }
+
+  template ::value)>
+  auto operator()(T) -> bool {
+    return false;
+  }
+};
+
+template  struct make_unsigned_or_bool : std::make_unsigned {};
+
+template <> struct make_unsigned_or_bool {
+  using type = bool;
+};
+
+template  class arg_converter {
+ private:
+  using char_type = typename Context::char_type;
+
+  basic_format_arg& arg_;
+  char_type type_;
+
+ public:
+  arg_converter(basic_format_arg& arg, char_type type)
+      : arg_(arg), type_(type) {}
+
+  void operator()(bool value) {
+    if (type_ != 's') operator()(value);
+  }
+
+  template ::value)>
+  void operator()(U value) {
+    bool is_signed = type_ == 'd' || type_ == 'i';
+    using target_type = conditional_t::value, U, T>;
+    if (const_check(sizeof(target_type) <= sizeof(int))) {
+      // Extra casts are used to silence warnings.
+      if (is_signed) {
+        auto n = static_cast(static_cast(value));
+        arg_ = detail::make_arg(n);
+      } else {
+        using unsigned_type = typename make_unsigned_or_bool::type;
+        auto n = static_cast(static_cast(value));
+        arg_ = detail::make_arg(n);
+      }
+    } else {
+      if (is_signed) {
+        // glibc's printf doesn't sign extend arguments of smaller types:
+        //   std::printf("%lld", -42);  // prints "4294967254"
+        // but we don't have to do the same because it's UB.
+        auto n = static_cast(value);
+        arg_ = detail::make_arg(n);
+      } else {
+        auto n = static_cast::type>(value);
+        arg_ = detail::make_arg(n);
+      }
+    }
+  }
+
+  template ::value)>
+  void operator()(U) {}  // No conversion needed for non-integral types.
+};
+
+// Converts an integer argument to T for printf, if T is an integral type.
+// If T is void, the argument is converted to corresponding signed or unsigned
+// type depending on the type specifier: 'd' and 'i' - signed, other -
+// unsigned).
+template 
+void convert_arg(basic_format_arg& arg, Char type) {
+  arg.visit(arg_converter(arg, type));
+}
+
+// Converts an integer argument to char for printf.
+template  class char_converter {
+ private:
+  basic_format_arg& arg_;
+
+ public:
+  explicit char_converter(basic_format_arg& arg) : arg_(arg) {}
+
+  template ::value)>
+  void operator()(T value) {
+    auto c = static_cast(value);
+    arg_ = detail::make_arg(c);
+  }
+
+  template ::value)>
+  void operator()(T) {}  // No conversion needed for non-integral types.
+};
+
+// An argument visitor that returns a pointer to a C string if the argument is
+// a string, or null otherwise.
+template  struct get_cstring {
+  template  auto operator()(T) -> const Char* { return nullptr; }
+  auto operator()(const Char* s) -> const Char* { return s; }
+};
+
+// Checks if an argument is a valid printf width specifier and sets
+// left alignment if it is negative.
+class printf_width_handler {
+ private:
+  format_specs& specs_;
+
+ public:
+  explicit printf_width_handler(format_specs& specs) : specs_(specs) {}
+
+  template ::value)>
+  auto operator()(T value) -> unsigned {
+    auto width = static_cast>(value);
+    if (detail::is_negative(value)) {
+      specs_.align = align::left;
+      width = 0 - width;
+    }
+    unsigned int_max = to_unsigned(max_value());
+    if (width > int_max) report_error("number is too big");
+    return static_cast(width);
+  }
+
+  template ::value)>
+  auto operator()(T) -> unsigned {
+    report_error("width is not integer");
+    return 0;
+  }
+};
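Illustration of the behavior implemented above (example values; uses `fmt::sprintf`, defined later in this header): a '*' width consumes an int argument, and a negative value selects left alignment.

    fmt::sprintf("[%*d]",  6, 42);  // "[    42]"
    fmt::sprintf("[%*d]", -6, 42);  // "[42    ]"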
+
+// Workaround for a bug with the XL compiler when initializing
+// printf_arg_formatter's base class.
+template 
+auto make_arg_formatter(basic_appender iter, format_specs& s)
+    -> arg_formatter {
+  return {iter, s, locale_ref()};
+}
+
+// The `printf` argument formatter.
+template 
+class printf_arg_formatter : public arg_formatter {
+ private:
+  using base = arg_formatter;
+  using context_type = basic_printf_context;
+
+  context_type& context_;
+
+  void write_null_pointer(bool is_string = false) {
+    auto s = this->specs;
+    s.type = presentation_type::none;
+    write_bytes(this->out, is_string ? "(null)" : "(nil)", s);
+  }
+
+ public:
+  printf_arg_formatter(basic_appender iter, format_specs& s,
+                       context_type& ctx)
+      : base(make_arg_formatter(iter, s)), context_(ctx) {}
+
+  void operator()(monostate value) { base::operator()(value); }
+
+  template ::value)>
+  void operator()(T value) {
+    // MSVC2013 fails to compile separate overloads for bool and Char so use
+    // std::is_same instead.
+    if (!std::is_same::value) {
+      base::operator()(value);
+      return;
+    }
+    format_specs s = this->specs;
+    if (s.type != presentation_type::none && s.type != presentation_type::chr) {
+      return (*this)(static_cast(value));
+    }
+    s.sign = sign::none;
+    s.alt = false;
+    s.fill = ' ';  // Ignore '0' flag for char types.
+    // align::numeric needs to be overwritten here since the '0' flag is
+    // ignored for non-numeric types
+    if (s.align == align::none || s.align == align::numeric)
+      s.align = align::right;
+    write(this->out, static_cast(value), s);
+  }
+
+  template ::value)>
+  void operator()(T value) {
+    base::operator()(value);
+  }
+
+  void operator()(const char* value) {
+    if (value)
+      base::operator()(value);
+    else
+      write_null_pointer(this->specs.type != presentation_type::pointer);
+  }
+
+  void operator()(const wchar_t* value) {
+    if (value)
+      base::operator()(value);
+    else
+      write_null_pointer(this->specs.type != presentation_type::pointer);
+  }
+
+  void operator()(basic_string_view value) { base::operator()(value); }
+
+  void operator()(const void* value) {
+    if (value)
+      base::operator()(value);
+    else
+      write_null_pointer();
+  }
+
+  void operator()(typename basic_format_arg::handle handle) {
+    auto parse_ctx = basic_format_parse_context({});
+    handle.format(parse_ctx, context_);
+  }
+};
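The write_null_pointer() path above yields glibc-style placeholders; an illustrative sketch:

    fmt::sprintf("%s", static_cast<const char*>(nullptr));  // "(null)"
    fmt::sprintf("%p", static_cast<const void*>(nullptr));  // "(nil)"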
+
+template 
+void parse_flags(format_specs& specs, const Char*& it, const Char* end) {
+  for (; it != end; ++it) {
+    switch (*it) {
+    case '-':
+      specs.align = align::left;
+      break;
+    case '+':
+      specs.sign = sign::plus;
+      break;
+    case '0':
+      specs.fill = '0';
+      break;
+    case ' ':
+      if (specs.sign != sign::plus) specs.sign = sign::space;
+      break;
+    case '#':
+      specs.alt = true;
+      break;
+    default:
+      return;
+    }
+  }
+}
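For reference, a sketch of how the flags parsed above combine (values are illustrative):

    fmt::sprintf("%+08.2f", 3.14159);  // "+0003.14"  '+' sign, '0' pad, width 8, precision 2
    fmt::sprintf("%-6d|", 42);         // "42    |"   '-' selects left alignment
    fmt::sprintf("%#x", 255);          // "0xff"      '#' alternate form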
+
+template 
+auto parse_header(const Char*& it, const Char* end, format_specs& specs,
+                  GetArg get_arg) -> int {
+  int arg_index = -1;
+  Char c = *it;
+  if (c >= '0' && c <= '9') {
+    // Parse an argument index (if followed by '$') or a width possibly
+    // preceded with '0' flag(s).
+    int value = parse_nonnegative_int(it, end, -1);
+    if (it != end && *it == '$') {  // value is an argument index
+      ++it;
+      arg_index = value != -1 ? value : max_value();
+    } else {
+      if (c == '0') specs.fill = '0';
+      if (value != 0) {
+        // Nonzero value means that we parsed width and don't need to
+        // parse it or flags again, so return now.
+        if (value == -1) report_error("number is too big");
+        specs.width = value;
+        return arg_index;
+      }
+    }
+  }
+  parse_flags(specs, it, end);
+  // Parse width.
+  if (it != end) {
+    if (*it >= '0' && *it <= '9') {
+      specs.width = parse_nonnegative_int(it, end, -1);
+      if (specs.width == -1) report_error("number is too big");
+    } else if (*it == '*') {
+      ++it;
+      specs.width = static_cast(
+          get_arg(-1).visit(detail::printf_width_handler(specs)));
+    }
+  }
+  return arg_index;
+}
+
+inline auto parse_printf_presentation_type(char c, type t, bool& upper)
+    -> presentation_type {
+  using pt = presentation_type;
+  constexpr auto integral_set = sint_set | uint_set | bool_set | char_set;
+  switch (c) {
+  case 'd':
+    return in(t, integral_set) ? pt::dec : pt::none;
+  case 'o':
+    return in(t, integral_set) ? pt::oct : pt::none;
+  case 'X':
+    upper = true;
+    FMT_FALLTHROUGH;
+  case 'x':
+    return in(t, integral_set) ? pt::hex : pt::none;
+  case 'E':
+    upper = true;
+    FMT_FALLTHROUGH;
+  case 'e':
+    return in(t, float_set) ? pt::exp : pt::none;
+  case 'F':
+    upper = true;
+    FMT_FALLTHROUGH;
+  case 'f':
+    return in(t, float_set) ? pt::fixed : pt::none;
+  case 'G':
+    upper = true;
+    FMT_FALLTHROUGH;
+  case 'g':
+    return in(t, float_set) ? pt::general : pt::none;
+  case 'A':
+    upper = true;
+    FMT_FALLTHROUGH;
+  case 'a':
+    return in(t, float_set) ? pt::hexfloat : pt::none;
+  case 'c':
+    return in(t, integral_set) ? pt::chr : pt::none;
+  case 's':
+    return in(t, string_set | cstring_set) ? pt::string : pt::none;
+  case 'p':
+    return in(t, pointer_set | cstring_set) ? pt::pointer : pt::none;
+  default:
+    return pt::none;
+  }
+}
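The uppercase specifiers handled above map to the same presentation type with the `upper` flag set; for example (illustrative values):

    fmt::sprintf("%X", 255);  // "FF"
    fmt::sprintf("%E", 0.5);  // "5.000000E-01"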
+
+template <typename Char>
+void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
+             basic_format_args<basic_printf_context<Char>> args) {
+  using iterator = basic_appender<Char>;
+  auto out = iterator(buf);
+  auto context = basic_printf_context<Char>(out, args);
+  auto parse_ctx = basic_format_parse_context<Char>(format);
+
+  // Returns the argument with specified index or, if arg_index is -1, the next
+  // argument.
+  auto get_arg = [&](int arg_index) {
+    if (arg_index < 0)
+      arg_index = parse_ctx.next_arg_id();
+    else
+      parse_ctx.check_arg_id(--arg_index);
+    return detail::get_arg(context, arg_index);
+  };
+
+  const Char* start = parse_ctx.begin();
+  const Char* end = parse_ctx.end();
+  auto it = start;
+  while (it != end) {
+    if (!find(it, end, '%', it)) {
+      it = end;  // find leaves it == nullptr if it doesn't find '%'.
+      break;
+    }
+    Char c = *it++;
+    if (it != end && *it == c) {
+      write(out, basic_string_view(start, to_unsigned(it - start)));
+      start = ++it;
+      continue;
+    }
+    write(out, basic_string_view(start, to_unsigned(it - 1 - start)));
+
+    auto specs = format_specs();
+    specs.align = align::right;
+
+    // Parse argument index, flags and width.
+    int arg_index = parse_header(it, end, specs, get_arg);
+    if (arg_index == 0) report_error("argument not found");
+
+    // Parse precision.
+    if (it != end && *it == '.') {
+      ++it;
+      c = it != end ? *it : 0;
+      if ('0' <= c && c <= '9') {
+        specs.precision = parse_nonnegative_int(it, end, 0);
+      } else if (c == '*') {
+        ++it;
+        specs.precision =
+            static_cast(get_arg(-1).visit(printf_precision_handler()));
+      } else {
+        specs.precision = 0;
+      }
+    }
+
+    auto arg = get_arg(arg_index);
+    // For d, i, o, u, x, and X conversion specifiers, if a precision is
+    // specified, the '0' flag is ignored
+    if (specs.precision >= 0 && arg.is_integral()) {
+      // Ignore '0' for non-numeric types or if '-' present.
+      specs.fill = ' ';
+    }
+    if (specs.precision >= 0 && arg.type() == type::cstring_type) {
+      auto str = arg.visit(get_cstring());
+      auto str_end = str + specs.precision;
+      auto nul = std::find(str, str_end, Char());
+      auto sv = basic_string_view(
+          str, to_unsigned(nul != str_end ? nul - str : specs.precision));
+      arg = make_arg>(sv);
+    }
+    if (specs.alt && arg.visit(is_zero_int())) specs.alt = false;
+    if (specs.fill.template get() == '0') {
+      if (arg.is_arithmetic() && specs.align != align::left)
+        specs.align = align::numeric;
+      else
+        specs.fill = ' ';  // Ignore '0' flag for non-numeric types or if '-'
+                           // flag is also present.
+    }
+
+    // Parse length and convert the argument to the required type.
+    c = it != end ? *it++ : 0;
+    Char t = it != end ? *it : 0;
+    switch (c) {
+    case 'h':
+      if (t == 'h') {
+        ++it;
+        t = it != end ? *it : 0;
+        convert_arg(arg, t);
+      } else {
+        convert_arg(arg, t);
+      }
+      break;
+    case 'l':
+      if (t == 'l') {
+        ++it;
+        t = it != end ? *it : 0;
+        convert_arg(arg, t);
+      } else {
+        convert_arg(arg, t);
+      }
+      break;
+    case 'j':
+      convert_arg(arg, t);
+      break;
+    case 'z':
+      convert_arg(arg, t);
+      break;
+    case 't':
+      convert_arg(arg, t);
+      break;
+    case 'L':
+      // printf produces garbage when 'L' is omitted for long double, no
+      // need to do the same.
+      break;
+    default:
+      --it;
+      convert_arg(arg, c);
+    }
+
+    // Parse type.
+    if (it == end) report_error("invalid format string");
+    char type = static_cast(*it++);
+    if (arg.is_integral()) {
+      // Normalize type.
+      switch (type) {
+      case 'i':
+      case 'u':
+        type = 'd';
+        break;
+      case 'c':
+        arg.visit(char_converter>(arg));
+        break;
+      }
+    }
+    bool upper = false;
+    specs.type = parse_printf_presentation_type(type, arg.type(), upper);
+    if (specs.type == presentation_type::none)
+      report_error("invalid format specifier");
+    specs.upper = upper;
+
+    start = it;
+
+    // Format argument.
+    arg.visit(printf_arg_formatter(out, specs, context));
+  }
+  write(out, basic_string_view(start, to_unsigned(it - start)));
+}
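The length-modifier switch above routes each modifier to a convert_arg target type; a sketch of the observable effect (illustrative values):

    fmt::sprintf("%lld", 1LL << 40);              // "1099511627776"
    fmt::sprintf("%hd", static_cast<short>(-7));  // "-7"
    // 'L' is accepted for long double and the value is passed through unchanged.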
+}  // namespace detail
+
+using printf_context = basic_printf_context<char>;
+using wprintf_context = basic_printf_context<wchar_t>;
+
+using printf_args = basic_format_args<printf_context>;
+using wprintf_args = basic_format_args<wprintf_context>;
+
+/// Constructs a `format_arg_store` object that contains references to
+/// arguments and can be implicitly converted to `printf_args`.
+template <typename Char = char, typename... T>
+inline auto make_printf_args(T&... args)
+    -> decltype(fmt::make_format_args<basic_printf_context<Char>>(args...)) {
+  return fmt::make_format_args<basic_printf_context<Char>>(args...);
+}
+
+template <typename Char> struct vprintf_args {
+  using type = basic_format_args<basic_printf_context<Char>>;
+};
+
+template <typename Char>
+inline auto vsprintf(basic_string_view<Char> fmt,
+                     typename vprintf_args<Char>::type args)
+    -> std::basic_string<Char> {
+  auto buf = basic_memory_buffer<Char>();
+  detail::vprintf(buf, fmt, args);
+  return to_string(buf);
+}
+
+/**
+ * Formats `args` according to specifications in `fmt` and returns the result
+ * as a string.
+ *
+ * **Example**:
+ *
+ *     std::string message = fmt::sprintf("The answer is %d", 42);
+ */
+template <typename S, typename... T, typename Char = char_t<S>>
+inline auto sprintf(const S& fmt, const T&... args) -> std::basic_string<Char> {
+  return vsprintf(detail::to_string_view(fmt),
+                  fmt::make_format_args<basic_printf_context<Char>>(args...));
+}
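One more hedged example beyond the doc comment (values are illustrative):

    std::string line = fmt::sprintf("%s: %.2f%%", "coverage", 200.0 / 3.0);
    // line == "coverage: 66.67%"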
+
+template <typename Char>
+inline auto vfprintf(std::FILE* f, basic_string_view<Char> fmt,
+                     typename vprintf_args<Char>::type args) -> int {
+  auto buf = basic_memory_buffer<Char>();
+  detail::vprintf(buf, fmt, args);
+  size_t size = buf.size();
+  return std::fwrite(buf.data(), sizeof(Char), size, f) < size
+             ? -1
+             : static_cast<int>(size);
+}
+
+/**
+ * Formats `args` according to specifications in `fmt` and writes the output
+ * to `f`.
+ *
+ * **Example**:
+ *
+ *     fmt::fprintf(stderr, "Don't %s!", "panic");
+ */
+template <typename S, typename... T, typename Char = char_t<S>>
+inline auto fprintf(std::FILE* f, const S& fmt, const T&... args) -> int {
+  return vfprintf(f, detail::to_string_view(fmt),
+                  make_printf_args<Char>(args...));
+}
+
+template <typename Char>
+FMT_DEPRECATED inline auto vprintf(basic_string_view<Char> fmt,
+                                   typename vprintf_args<Char>::type args)
+    -> int {
+  return vfprintf(stdout, fmt, args);
+}
+
+/**
+ * Formats `args` according to specifications in `fmt` and writes the output
+ * to `stdout`.
+ *
+ * **Example**:
+ *
+ *     fmt::printf("Elapsed time: %.2f seconds", 1.23);
+ */
+template <typename... T>
+inline auto printf(string_view fmt, const T&... args) -> int {
+  return vfprintf(stdout, fmt, make_printf_args(args...));
+}
+template <typename... T>
+FMT_DEPRECATED inline auto printf(basic_string_view<wchar_t> fmt,
+                                  const T&... args) -> int {
+  return vfprintf(stdout, fmt, make_printf_args<wchar_t>(args...));
+}
+
+FMT_END_EXPORT
+FMT_END_NAMESPACE
+
+#endif  // FMT_PRINTF_H_
diff --git a/lib/fmt/fmt/ranges.h b/lib/fmt/fmt/ranges.h
new file mode 100644
index 000000000..0d3dfbd8d
--- /dev/null
+++ b/lib/fmt/fmt/ranges.h
@@ -0,0 +1,882 @@
+// Formatting library for C++ - range and tuple support
+//
+// Copyright (c) 2012 - present, Victor Zverovich and {fmt} contributors
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_RANGES_H_
+#define FMT_RANGES_H_
+
+#ifndef FMT_MODULE
+#  include <initializer_list>
+#  include <iterator>
+#  include <string>
+#  include <tuple>
+#  include <type_traits>
+#  include <utility>
+#endif
+
+#include "format.h"
+
+FMT_BEGIN_NAMESPACE
+
+FMT_EXPORT
+enum class range_format { disabled, map, set, sequence, string, debug_string };
+
+namespace detail {
+
+template <typename T> class is_map {
+  template <typename U> static auto check(U*) -> typename U::mapped_type;
+  template <typename> static void check(...);
+
+ public:
+  static constexpr const bool value =
+      !std::is_void<decltype(check<T>(nullptr))>::value;
+};
+
+template <typename T> class is_set {
+  template <typename U> static auto check(U*) -> typename U::key_type;
+  template <typename> static void check(...);
+
+ public:
+  static constexpr const bool value =
+      !std::is_void<decltype(check<T>(nullptr))>::value && !is_map<T>::value;
+};
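The two probes above feed range_format_kind_ further down; an illustrative pair of checks (hypothetical asserts, assuming <map> and <set> are included):

    static_assert(fmt::detail::is_map<std::map<int, int>>::value, "detected via mapped_type");
    static_assert(fmt::detail::is_set<std::set<int>>::value &&
                      !fmt::detail::is_map<std::set<int>>::value, "key_type but no mapped_type");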
+
+template <typename... Ts> struct conditional_helper {};
+
+template <typename T, typename _ = void> struct is_range_ : std::false_type {};
+
+#if !FMT_MSC_VERSION || FMT_MSC_VERSION > 1800
+
+#  define FMT_DECLTYPE_RETURN(val)  \
+    ->decltype(val) { return val; } \
+    static_assert(                  \
+        true, "")  // This makes it so that a semicolon is required after the
+                   // macro, which helps clang-format handle the formatting.
+
+// C array overload
+template <typename T, std::size_t N>
+auto range_begin(const T (&arr)[N]) -> const T* {
+  return arr;
+}
+template <typename T, std::size_t N>
+auto range_end(const T (&arr)[N]) -> const T* {
+  return arr + N;
+}
+
+template 
+struct has_member_fn_begin_end_t : std::false_type {};
+
+template 
+struct has_member_fn_begin_end_t().begin()),
+                                           decltype(std::declval().end())>>
+    : std::true_type {};
+
+// Member function overloads.
+template 
+auto range_begin(T&& rng) FMT_DECLTYPE_RETURN(static_cast(rng).begin());
+template 
+auto range_end(T&& rng) FMT_DECLTYPE_RETURN(static_cast(rng).end());
+
+// ADL overloads. Only participate in overload resolution if member functions
+// are not found.
+template 
+auto range_begin(T&& rng)
+    -> enable_if_t::value,
+                   decltype(begin(static_cast(rng)))> {
+  return begin(static_cast(rng));
+}
+template 
+auto range_end(T&& rng) -> enable_if_t::value,
+                                       decltype(end(static_cast(rng)))> {
+  return end(static_cast(rng));
+}
+
+template 
+struct has_const_begin_end : std::false_type {};
+template 
+struct has_mutable_begin_end : std::false_type {};
+
+template 
+struct has_const_begin_end<
+    T, void_t&>())),
+              decltype(detail::range_end(
+                  std::declval&>()))>>
+    : std::true_type {};
+
+template 
+struct has_mutable_begin_end<
+    T, void_t())),
+              decltype(detail::range_end(std::declval())),
+              // the extra int here is because older versions of MSVC don't
+              // SFINAE properly unless there are distinct types
+              int>> : std::true_type {};
+
+template 
+struct is_range_
+    : std::integral_constant::value ||
+                                    has_mutable_begin_end::value)> {};
+#  undef FMT_DECLTYPE_RETURN
+#endif
+
+// tuple_size and tuple_element check.
+template  class is_tuple_like_ {
+  template 
+  static auto check(U* p) -> decltype(std::tuple_size::value, int());
+  template  static void check(...);
+
+ public:
+  static constexpr const bool value =
+      !std::is_void(nullptr))>::value;
+};
+
+// Check for integer_sequence
+#if defined(__cpp_lib_integer_sequence) || FMT_MSC_VERSION >= 1900
+template 
+using integer_sequence = std::integer_sequence;
+template  using index_sequence = std::index_sequence;
+template  using make_index_sequence = std::make_index_sequence;
+#else
+template  struct integer_sequence {
+  using value_type = T;
+
+  static FMT_CONSTEXPR auto size() -> size_t { return sizeof...(N); }
+};
+
+template  using index_sequence = integer_sequence;
+
+template 
+struct make_integer_sequence : make_integer_sequence {};
+template 
+struct make_integer_sequence : integer_sequence {};
+
+template 
+using make_index_sequence = make_integer_sequence;
+#endif
+
+template 
+using tuple_index_sequence = make_index_sequence::value>;
+
+template ::value>
+class is_tuple_formattable_ {
+ public:
+  static constexpr const bool value = false;
+};
+template  class is_tuple_formattable_ {
+  template 
+  static auto all_true(index_sequence,
+                       integer_sequence= 0)...>) -> std::true_type;
+  static auto all_true(...) -> std::false_type;
+
+  template 
+  static auto check(index_sequence) -> decltype(all_true(
+      index_sequence{},
+      integer_sequence::type,
+                                       C>::value)...>{}));
+
+ public:
+  static constexpr const bool value =
+      decltype(check(tuple_index_sequence{}))::value;
+};
+
+template 
+FMT_CONSTEXPR void for_each(index_sequence, Tuple&& t, F&& f) {
+  using std::get;
+  // Using a free function get(Tuple) now.
+  const int unused[] = {0, ((void)f(get(t)), 0)...};
+  ignore_unused(unused);
+}
+
+template 
+FMT_CONSTEXPR void for_each(Tuple&& t, F&& f) {
+  for_each(tuple_index_sequence>(),
+           std::forward(t), std::forward(f));
+}
+
+template 
+void for_each2(index_sequence, Tuple1&& t1, Tuple2&& t2, F&& f) {
+  using std::get;
+  const int unused[] = {0, ((void)f(get(t1), get(t2)), 0)...};
+  ignore_unused(unused);
+}
+
+template 
+void for_each2(Tuple1&& t1, Tuple2&& t2, F&& f) {
+  for_each2(tuple_index_sequence>(),
+            std::forward(t1), std::forward(t2),
+            std::forward(f));
+}
+
+namespace tuple {
+// Workaround a bug in MSVC 2019 (v140).
+template 
+using result_t = std::tuple, Char>...>;
+
+using std::get;
+template 
+auto get_formatters(index_sequence)
+    -> result_t(std::declval()))...>;
+}  // namespace tuple
+
+#if FMT_MSC_VERSION && FMT_MSC_VERSION < 1920
+// Older MSVC doesn't get the reference type correctly for arrays.
+template  struct range_reference_type_impl {
+  using type = decltype(*detail::range_begin(std::declval()));
+};
+
+template  struct range_reference_type_impl {
+  using type = T&;
+};
+
+template 
+using range_reference_type = typename range_reference_type_impl::type;
+#else
+template 
+using range_reference_type =
+    decltype(*detail::range_begin(std::declval()));
+#endif
+
+// We don't use the Range's value_type for anything, but we do need the Range's
+// reference type, with cv-ref stripped.
+template 
+using uncvref_type = remove_cvref_t>;
+
+template 
+FMT_CONSTEXPR auto maybe_set_debug_format(Formatter& f, bool set)
+    -> decltype(f.set_debug_format(set)) {
+  f.set_debug_format(set);
+}
+template 
+FMT_CONSTEXPR void maybe_set_debug_format(Formatter&, ...) {}
+
+template 
+struct range_format_kind_
+    : std::integral_constant, T>::value
+                                 ? range_format::disabled
+                             : is_map::value ? range_format::map
+                             : is_set::value ? range_format::set
+                                                : range_format::sequence> {};
+
+template 
+using range_format_constant = std::integral_constant;
+
+// These are not generic lambdas for compatibility with C++11.
+template  struct parse_empty_specs {
+  template  FMT_CONSTEXPR void operator()(Formatter& f) {
+    f.parse(ctx);
+    detail::maybe_set_debug_format(f, true);
+  }
+  ParseContext& ctx;
+};
+template  struct format_tuple_element {
+  using char_type = typename FormatContext::char_type;
+
+  template 
+  void operator()(const formatter& f, const T& v) {
+    if (i > 0) ctx.advance_to(detail::copy(separator, ctx.out()));
+    ctx.advance_to(f.format(v, ctx));
+    ++i;
+  }
+
+  int i;
+  FormatContext& ctx;
+  basic_string_view separator;
+};
+
+}  // namespace detail
+
+template  struct is_tuple_like {
+  static constexpr const bool value =
+      detail::is_tuple_like_::value && !detail::is_range_::value;
+};
+
+template  struct is_tuple_formattable {
+  static constexpr const bool value =
+      detail::is_tuple_formattable_::value;
+};
+
+template 
+struct formatter::value &&
+                             fmt::is_tuple_formattable::value>> {
+ private:
+  decltype(detail::tuple::get_formatters(
+      detail::tuple_index_sequence())) formatters_;
+
+  basic_string_view separator_ = detail::string_literal{};
+  basic_string_view opening_bracket_ =
+      detail::string_literal{};
+  basic_string_view closing_bracket_ =
+      detail::string_literal{};
+
+ public:
+  FMT_CONSTEXPR formatter() {}
+
+  FMT_CONSTEXPR void set_separator(basic_string_view sep) {
+    separator_ = sep;
+  }
+
+  FMT_CONSTEXPR void set_brackets(basic_string_view open,
+                                  basic_string_view close) {
+    opening_bracket_ = open;
+    closing_bracket_ = close;
+  }
+
+  template 
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    auto it = ctx.begin();
+    if (it != ctx.end() && *it != '}') report_error("invalid format specifier");
+    detail::for_each(formatters_, detail::parse_empty_specs{ctx});
+    return it;
+  }
+
+  template 
+  auto format(const Tuple& value, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    ctx.advance_to(detail::copy(opening_bracket_, ctx.out()));
+    detail::for_each2(
+        formatters_, value,
+        detail::format_tuple_element{0, ctx, separator_});
+    return detail::copy(closing_bracket_, ctx.out());
+  }
+};
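A hedged example of the output this tuple formatter produces (values are illustrative):

    auto s = fmt::format("{}", std::tuple<int, char>{1, 'a'});  // "(1, 'a')"
    // parse() above accepts only an empty spec; brackets and separator are
    // customized programmatically via set_brackets() and set_separator().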
+
+template  struct is_range {
+  static constexpr const bool value =
+      detail::is_range_::value && !detail::has_to_string_view::value;
+};
+
+namespace detail {
+template  struct range_mapper {
+  using mapper = arg_mapper;
+
+  template , Context>::value)>
+  static auto map(T&& value) -> T&& {
+    return static_cast(value);
+  }
+  template , Context>::value)>
+  static auto map(T&& value)
+      -> decltype(mapper().map(static_cast(value))) {
+    return mapper().map(static_cast(value));
+  }
+};
+
+template 
+using range_formatter_type =
+    formatter>{}
+                                          .map(std::declval()))>,
+              Char>;
+
+template 
+using maybe_const_range =
+    conditional_t::value, const R, R>;
+
+// Workaround a bug in MSVC 2015 and earlier.
+#if !FMT_MSC_VERSION || FMT_MSC_VERSION >= 1910
+template 
+struct is_formattable_delayed
+    : is_formattable>, Char> {};
+#endif
+}  // namespace detail
+
+template  struct conjunction : std::true_type {};
+template  struct conjunction

: P {}; +template +struct conjunction + : conditional_t, P1> {}; + +template +struct range_formatter; + +template +struct range_formatter< + T, Char, + enable_if_t>, + is_formattable>::value>> { + private: + detail::range_formatter_type underlying_; + basic_string_view separator_ = detail::string_literal{}; + basic_string_view opening_bracket_ = + detail::string_literal{}; + basic_string_view closing_bracket_ = + detail::string_literal{}; + bool is_debug = false; + + template ::value)> + auto write_debug_string(Output& out, It it, Sentinel end) const -> Output { + auto buf = basic_memory_buffer(); + for (; it != end; ++it) buf.push_back(*it); + auto specs = format_specs(); + specs.type = presentation_type::debug; + return detail::write( + out, basic_string_view(buf.data(), buf.size()), specs); + } + + template ::value)> + auto write_debug_string(Output& out, It, Sentinel) const -> Output { + return out; + } + + public: + FMT_CONSTEXPR range_formatter() {} + + FMT_CONSTEXPR auto underlying() -> detail::range_formatter_type& { + return underlying_; + } + + FMT_CONSTEXPR void set_separator(basic_string_view sep) { + separator_ = sep; + } + + FMT_CONSTEXPR void set_brackets(basic_string_view open, + basic_string_view close) { + opening_bracket_ = open; + closing_bracket_ = close; + } + + template + FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) { + auto it = ctx.begin(); + auto end = ctx.end(); + detail::maybe_set_debug_format(underlying_, true); + if (it == end) return underlying_.parse(ctx); + + switch (detail::to_ascii(*it)) { + case 'n': + set_brackets({}, {}); + ++it; + break; + case '?': + is_debug = true; + set_brackets({}, {}); + ++it; + if (it == end || *it != 's') report_error("invalid format specifier"); + FMT_FALLTHROUGH; + case 's': + if (!std::is_same::value) + report_error("invalid format specifier"); + if (!is_debug) { + set_brackets(detail::string_literal{}, + detail::string_literal{}); + set_separator({}); + detail::maybe_set_debug_format(underlying_, false); + } + ++it; + return it; + } + + if (it != end && *it != '}') { + if (*it != ':') report_error("invalid format specifier"); + detail::maybe_set_debug_format(underlying_, false); + ++it; + } + + ctx.advance_to(it); + return underlying_.parse(ctx); + } + + template + auto format(R&& range, FormatContext& ctx) const -> decltype(ctx.out()) { + auto mapper = detail::range_mapper>(); + auto out = ctx.out(); + auto it = detail::range_begin(range); + auto end = detail::range_end(range); + if (is_debug) return write_debug_string(out, std::move(it), end); + + out = detail::copy(opening_bracket_, out); + int i = 0; + for (; it != end; ++it) { + if (i > 0) out = detail::copy(separator_, out); + ctx.advance_to(out); + auto&& item = *it; // Need an lvalue + out = underlying_.format(mapper.map(item), ctx); + ++i; + } + out = detail::copy(closing_bracket_, out); + return out; + } +}; + +FMT_EXPORT +template +struct range_format_kind + : conditional_t< + is_range::value, detail::range_format_kind_, + std::integral_constant> {}; + +template +struct formatter< + R, Char, + enable_if_t::value != range_format::disabled && + range_format_kind::value != range_format::map && + range_format_kind::value != range_format::string && + range_format_kind::value != range_format::debug_string> +// Workaround a bug in MSVC 2015 and earlier. 
+#if !FMT_MSC_VERSION || FMT_MSC_VERSION >= 1910 + , + detail::is_formattable_delayed +#endif + >::value>> { + private: + using range_type = detail::maybe_const_range; + range_formatter, Char> range_formatter_; + + public: + using nonlocking = void; + + FMT_CONSTEXPR formatter() { + if (detail::const_check(range_format_kind::value != + range_format::set)) + return; + range_formatter_.set_brackets(detail::string_literal{}, + detail::string_literal{}); + } + + template + FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) { + return range_formatter_.parse(ctx); + } + + template + auto format(range_type& range, FormatContext& ctx) const + -> decltype(ctx.out()) { + return range_formatter_.format(range, ctx); + } +}; + +// A map formatter. +template +struct formatter< + R, Char, + enable_if_t::value == range_format::map>> { + private: + using map_type = detail::maybe_const_range; + using element_type = detail::uncvref_type; + + decltype(detail::tuple::get_formatters( + detail::tuple_index_sequence())) formatters_; + bool no_delimiters_ = false; + + public: + FMT_CONSTEXPR formatter() {} + + template + FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) { + auto it = ctx.begin(); + auto end = ctx.end(); + if (it != end) { + if (detail::to_ascii(*it) == 'n') { + no_delimiters_ = true; + ++it; + } + if (it != end && *it != '}') { + if (*it != ':') report_error("invalid format specifier"); + ++it; + } + ctx.advance_to(it); + } + detail::for_each(formatters_, detail::parse_empty_specs{ctx}); + return it; + } + + template + auto format(map_type& map, FormatContext& ctx) const -> decltype(ctx.out()) { + auto out = ctx.out(); + basic_string_view open = detail::string_literal{}; + if (!no_delimiters_) out = detail::copy(open, out); + int i = 0; + auto mapper = detail::range_mapper>(); + basic_string_view sep = detail::string_literal{}; + for (auto&& value : map) { + if (i > 0) out = detail::copy(sep, out); + ctx.advance_to(out); + detail::for_each2(formatters_, mapper.map(value), + detail::format_tuple_element{ + 0, ctx, detail::string_literal{}}); + ++i; + } + basic_string_view close = detail::string_literal{}; + if (!no_delimiters_) out = detail::copy(close, out); + return out; + } +}; + +// A (debug_)string formatter. 
+template +struct formatter< + R, Char, + enable_if_t::value == range_format::string || + range_format_kind::value == + range_format::debug_string>> { + private: + using range_type = detail::maybe_const_range; + using string_type = + conditional_t, + decltype(detail::range_begin(std::declval())), + decltype(detail::range_end(std::declval()))>::value, + detail::std_string_view, std::basic_string>; + + formatter underlying_; + + public: + template + FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) { + return underlying_.parse(ctx); + } + + template + auto format(range_type& range, FormatContext& ctx) const + -> decltype(ctx.out()) { + auto out = ctx.out(); + if (detail::const_check(range_format_kind::value == + range_format::debug_string)) + *out++ = '"'; + out = underlying_.format( + string_type{detail::range_begin(range), detail::range_end(range)}, ctx); + if (detail::const_check(range_format_kind::value == + range_format::debug_string)) + *out++ = '"'; + return out; + } +}; + +template +struct join_view : detail::view { + It begin; + Sentinel end; + basic_string_view sep; + + join_view(It b, Sentinel e, basic_string_view s) + : begin(std::move(b)), end(e), sep(s) {} +}; + +template +struct formatter, Char> { + private: + using value_type = +#ifdef __cpp_lib_ranges + std::iter_value_t; +#else + typename std::iterator_traits::value_type; +#endif + formatter, Char> value_formatter_; + + using view_ref = conditional_t::value, + const join_view&, + join_view&&>; + + public: + using nonlocking = void; + + template + FMT_CONSTEXPR auto parse(ParseContext& ctx) -> const Char* { + return value_formatter_.parse(ctx); + } + + template + auto format(view_ref& value, FormatContext& ctx) const + -> decltype(ctx.out()) { + auto it = std::forward(value).begin; + auto out = ctx.out(); + if (it == value.end) return out; + out = value_formatter_.format(*it, ctx); + ++it; + while (it != value.end) { + out = detail::copy(value.sep.begin(), value.sep.end(), out); + ctx.advance_to(out); + out = value_formatter_.format(*it, ctx); + ++it; + } + return out; + } +}; + +/// Returns a view that formats the iterator range `[begin, end)` with elements +/// separated by `sep`. +template +auto join(It begin, Sentinel end, string_view sep) -> join_view { + return {std::move(begin), end, sep}; +} + +/** + * Returns a view that formats `range` with elements separated by `sep`. + * + * **Example**: + * + * auto v = std::vector{1, 2, 3}; + * fmt::print("{}", fmt::join(v, ", ")); + * // Output: 1, 2, 3 + * + * `fmt::join` applies passed format specifiers to the range elements: + * + * fmt::print("{:02}", fmt::join(v, ", ")); + * // Output: 01, 02, 03 + */ +template +auto join(Range&& r, string_view sep) + -> join_view { + return {detail::range_begin(r), detail::range_end(r), sep}; +} + +template struct tuple_join_view : detail::view { + const std::tuple& tuple; + basic_string_view sep; + + tuple_join_view(const std::tuple& t, basic_string_view s) + : tuple(t), sep{s} {} +}; + +// Define FMT_TUPLE_JOIN_SPECIFIERS to enable experimental format specifiers +// support in tuple_join. It is disabled by default because of issues with +// the dynamic width and precision. 
+#ifndef FMT_TUPLE_JOIN_SPECIFIERS +# define FMT_TUPLE_JOIN_SPECIFIERS 0 +#endif + +template +struct formatter, Char> { + template + FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) { + return do_parse(ctx, std::integral_constant()); + } + + template + auto format(const tuple_join_view& value, + FormatContext& ctx) const -> typename FormatContext::iterator { + return do_format(value, ctx, + std::integral_constant()); + } + + private: + std::tuple::type, Char>...> formatters_; + + template + FMT_CONSTEXPR auto do_parse(ParseContext& ctx, + std::integral_constant) + -> decltype(ctx.begin()) { + return ctx.begin(); + } + + template + FMT_CONSTEXPR auto do_parse(ParseContext& ctx, + std::integral_constant) + -> decltype(ctx.begin()) { + auto end = ctx.begin(); +#if FMT_TUPLE_JOIN_SPECIFIERS + end = std::get(formatters_).parse(ctx); + if (N > 1) { + auto end1 = do_parse(ctx, std::integral_constant()); + if (end != end1) + report_error("incompatible format specs for tuple elements"); + } +#endif + return end; + } + + template + auto do_format(const tuple_join_view&, FormatContext& ctx, + std::integral_constant) const -> + typename FormatContext::iterator { + return ctx.out(); + } + + template + auto do_format(const tuple_join_view& value, FormatContext& ctx, + std::integral_constant) const -> + typename FormatContext::iterator { + auto out = std::get(formatters_) + .format(std::get(value.tuple), ctx); + if (N <= 1) return out; + out = detail::copy(value.sep, out); + ctx.advance_to(out); + return do_format(value, ctx, std::integral_constant()); + } +}; + +namespace detail { +// Check if T has an interface like a container adaptor (e.g. std::stack, +// std::queue, std::priority_queue). +template class is_container_adaptor_like { + template static auto check(U* p) -> typename U::container_type; + template static void check(...); + + public: + static constexpr const bool value = + !std::is_void(nullptr))>::value; +}; + +template struct all { + const Container& c; + auto begin() const -> typename Container::const_iterator { return c.begin(); } + auto end() const -> typename Container::const_iterator { return c.end(); } +}; +} // namespace detail + +template +struct formatter< + T, Char, + enable_if_t, + bool_constant::value == + range_format::disabled>>::value>> + : formatter, Char> { + using all = detail::all; + template + auto format(const T& t, FormatContext& ctx) const -> decltype(ctx.out()) { + struct getter : T { + static auto get(const T& t) -> all { + return {t.*(&getter::c)}; // Access c through the derived class. + } + }; + return formatter::format(getter::get(t), ctx); + } +}; + +FMT_BEGIN_EXPORT + +/** + * Returns an object that formats `std::tuple` with elements separated by `sep`. + * + * **Example**: + * + * auto t = std::tuple{1, 'a'}; + * fmt::print("{}", fmt::join(t, ", ")); + * // Output: 1, a + */ +template +FMT_CONSTEXPR auto join(const std::tuple& tuple, string_view sep) + -> tuple_join_view { + return {tuple, sep}; +} + +/** + * Returns an object that formats `std::initializer_list` with elements + * separated by `sep`. 
+ * + * **Example**: + * + * fmt::print("{}", fmt::join({1, 2, 3}, ", ")); + * // Output: "1, 2, 3" + */ +template +auto join(std::initializer_list list, string_view sep) + -> join_view { + return join(std::begin(list), std::end(list), sep); +} + +FMT_END_EXPORT +FMT_END_NAMESPACE + +#endif // FMT_RANGES_H_ diff --git a/lib/fmt/fmt/std.h b/lib/fmt/fmt/std.h new file mode 100644 index 000000000..fb43940bc --- /dev/null +++ b/lib/fmt/fmt/std.h @@ -0,0 +1,699 @@ +// Formatting library for C++ - formatters for standard library types +// +// Copyright (c) 2012 - present, Victor Zverovich +// All rights reserved. +// +// For the license information refer to format.h. + +#ifndef FMT_STD_H_ +#define FMT_STD_H_ + +#include "format.h" +#include "ostream.h" + +#ifndef FMT_MODULE +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +// Check FMT_CPLUSPLUS to suppress a bogus warning in MSVC. +# if FMT_CPLUSPLUS >= 201703L +# if FMT_HAS_INCLUDE() +# include +# endif +# if FMT_HAS_INCLUDE() +# include +# endif +# if FMT_HAS_INCLUDE() +# include +# endif +# endif +// Use > instead of >= in the version check because may be +// available after C++17 but before C++20 is marked as implemented. +# if FMT_CPLUSPLUS > 201703L && FMT_HAS_INCLUDE() +# include +# endif +# if FMT_CPLUSPLUS > 202002L && FMT_HAS_INCLUDE() +# include +# endif +#endif // FMT_MODULE + +#if FMT_HAS_INCLUDE() +# include +#endif + +// GCC 4 does not support FMT_HAS_INCLUDE. +#if FMT_HAS_INCLUDE() || defined(__GLIBCXX__) +# include +// Android NDK with gabi++ library on some architectures does not implement +// abi::__cxa_demangle(). +# ifndef __GABIXX_CXXABI_H__ +# define FMT_HAS_ABI_CXA_DEMANGLE +# endif +#endif + +// For older Xcode versions, __cpp_lib_xxx flags are inaccurately defined. 
+#ifndef FMT_CPP_LIB_FILESYSTEM +# ifdef __cpp_lib_filesystem +# define FMT_CPP_LIB_FILESYSTEM __cpp_lib_filesystem +# else +# define FMT_CPP_LIB_FILESYSTEM 0 +# endif +#endif + +#ifndef FMT_CPP_LIB_VARIANT +# ifdef __cpp_lib_variant +# define FMT_CPP_LIB_VARIANT __cpp_lib_variant +# else +# define FMT_CPP_LIB_VARIANT 0 +# endif +#endif + +#if FMT_CPP_LIB_FILESYSTEM +FMT_BEGIN_NAMESPACE + +namespace detail { + +template +auto get_path_string(const std::filesystem::path& p, + const std::basic_string& native) { + if constexpr (std::is_same_v && std::is_same_v) + return to_utf8(native, to_utf8_error_policy::replace); + else + return p.string(); +} + +template +void write_escaped_path(basic_memory_buffer& quoted, + const std::filesystem::path& p, + const std::basic_string& native) { + if constexpr (std::is_same_v && + std::is_same_v) { + auto buf = basic_memory_buffer(); + write_escaped_string(std::back_inserter(buf), native); + bool valid = to_utf8::convert(quoted, {buf.data(), buf.size()}); + FMT_ASSERT(valid, "invalid utf16"); + } else if constexpr (std::is_same_v) { + write_escaped_string( + std::back_inserter(quoted), native); + } else { + write_escaped_string(std::back_inserter(quoted), p.string()); + } +} + +} // namespace detail + +FMT_EXPORT +template struct formatter { + private: + format_specs specs_; + detail::arg_ref width_ref_; + bool debug_ = false; + char path_type_ = 0; + + public: + FMT_CONSTEXPR void set_debug_format(bool set = true) { debug_ = set; } + + template FMT_CONSTEXPR auto parse(ParseContext& ctx) { + auto it = ctx.begin(), end = ctx.end(); + if (it == end) return it; + + it = detail::parse_align(it, end, specs_); + if (it == end) return it; + + it = detail::parse_dynamic_spec(it, end, specs_.width, width_ref_, ctx); + if (it != end && *it == '?') { + debug_ = true; + ++it; + } + if (it != end && (*it == 'g')) path_type_ = detail::to_ascii(*it++); + return it; + } + + template + auto format(const std::filesystem::path& p, FormatContext& ctx) const { + auto specs = specs_; + auto path_string = + !path_type_ ? p.native() + : p.generic_string(); + + detail::handle_dynamic_spec(specs.width, width_ref_, + ctx); + if (!debug_) { + auto s = detail::get_path_string(p, path_string); + return detail::write(ctx.out(), basic_string_view(s), specs); + } + auto quoted = basic_memory_buffer(); + detail::write_escaped_path(quoted, p, path_string); + return detail::write(ctx.out(), + basic_string_view(quoted.data(), quoted.size()), + specs); + } +}; + +class path : public std::filesystem::path { + public: + auto display_string() const -> std::string { + const std::filesystem::path& base = *this; + return fmt::format(FMT_STRING("{}"), base); + } + auto system_string() const -> std::string { return string(); } + + auto generic_display_string() const -> std::string { + const std::filesystem::path& base = *this; + return fmt::format(FMT_STRING("{:g}"), base); + } + auto generic_system_string() const -> std::string { return generic_string(); } +}; + +FMT_END_NAMESPACE +#endif // FMT_CPP_LIB_FILESYSTEM + +FMT_BEGIN_NAMESPACE +FMT_EXPORT +template +struct formatter, Char> : nested_formatter { + private: + // Functor because C++11 doesn't support generic lambdas. + struct writer { + const std::bitset& bs; + + template + FMT_CONSTEXPR auto operator()(OutputIt out) -> OutputIt { + for (auto pos = N; pos > 0; --pos) { + out = detail::write(out, bs[pos - 1] ? 
Char('1') : Char('0')); + } + + return out; + } + }; + + public: + template + auto format(const std::bitset& bs, FormatContext& ctx) const + -> decltype(ctx.out()) { + return write_padded(ctx, writer{bs}); + } +}; + +FMT_EXPORT +template +struct formatter : basic_ostream_formatter {}; +FMT_END_NAMESPACE + +#ifdef __cpp_lib_optional +FMT_BEGIN_NAMESPACE +FMT_EXPORT +template +struct formatter, Char, + std::enable_if_t::value>> { + private: + formatter underlying_; + static constexpr basic_string_view optional = + detail::string_literal{}; + static constexpr basic_string_view none = + detail::string_literal{}; + + template + FMT_CONSTEXPR static auto maybe_set_debug_format(U& u, bool set) + -> decltype(u.set_debug_format(set)) { + u.set_debug_format(set); + } + + template + FMT_CONSTEXPR static void maybe_set_debug_format(U&, ...) {} + + public: + template FMT_CONSTEXPR auto parse(ParseContext& ctx) { + maybe_set_debug_format(underlying_, true); + return underlying_.parse(ctx); + } + + template + auto format(const std::optional& opt, FormatContext& ctx) const + -> decltype(ctx.out()) { + if (!opt) return detail::write(ctx.out(), none); + + auto out = ctx.out(); + out = detail::write(out, optional); + ctx.advance_to(out); + out = underlying_.format(*opt, ctx); + return detail::write(out, ')'); + } +}; +FMT_END_NAMESPACE +#endif // __cpp_lib_optional + +#if defined(__cpp_lib_expected) || FMT_CPP_LIB_VARIANT + +FMT_BEGIN_NAMESPACE +namespace detail { + +template +auto write_escaped_alternative(OutputIt out, const T& v) -> OutputIt { + if constexpr (has_to_string_view::value) + return write_escaped_string(out, detail::to_string_view(v)); + if constexpr (std::is_same_v) return write_escaped_char(out, v); + return write(out, v); +} + +} // namespace detail + +FMT_END_NAMESPACE +#endif + +#ifdef __cpp_lib_expected +FMT_BEGIN_NAMESPACE + +FMT_EXPORT +template +struct formatter, Char, + std::enable_if_t::value && + is_formattable::value>> { + template + FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) { + return ctx.begin(); + } + + template + auto format(const std::expected& value, FormatContext& ctx) const + -> decltype(ctx.out()) { + auto out = ctx.out(); + + if (value.has_value()) { + out = detail::write(out, "expected("); + out = detail::write_escaped_alternative(out, *value); + } else { + out = detail::write(out, "unexpected("); + out = detail::write_escaped_alternative(out, value.error()); + } + *out++ = ')'; + return out; + } +}; +FMT_END_NAMESPACE +#endif // __cpp_lib_expected + +#ifdef __cpp_lib_source_location +FMT_BEGIN_NAMESPACE +FMT_EXPORT +template <> struct formatter { + template FMT_CONSTEXPR auto parse(ParseContext& ctx) { + return ctx.begin(); + } + + template + auto format(const std::source_location& loc, FormatContext& ctx) const + -> decltype(ctx.out()) { + auto out = ctx.out(); + out = detail::write(out, loc.file_name()); + out = detail::write(out, ':'); + out = detail::write(out, loc.line()); + out = detail::write(out, ':'); + out = detail::write(out, loc.column()); + out = detail::write(out, ": "); + out = detail::write(out, loc.function_name()); + return out; + } +}; +FMT_END_NAMESPACE +#endif + +#if FMT_CPP_LIB_VARIANT +FMT_BEGIN_NAMESPACE +namespace detail { + +template +using variant_index_sequence = + std::make_index_sequence::value>; + +template struct is_variant_like_ : std::false_type {}; +template +struct is_variant_like_> : std::true_type {}; + +// formattable element check. 
+template class is_variant_formattable_ { + template + static std::conjunction< + is_formattable, C>...> + check(std::index_sequence); + + public: + static constexpr const bool value = + decltype(check(variant_index_sequence{}))::value; +}; + +} // namespace detail + +template struct is_variant_like { + static constexpr const bool value = detail::is_variant_like_::value; +}; + +template struct is_variant_formattable { + static constexpr const bool value = + detail::is_variant_formattable_::value; +}; + +FMT_EXPORT +template struct formatter { + template + FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) { + return ctx.begin(); + } + + template + auto format(const std::monostate&, FormatContext& ctx) const + -> decltype(ctx.out()) { + return detail::write(ctx.out(), "monostate"); + } +}; + +FMT_EXPORT +template +struct formatter< + Variant, Char, + std::enable_if_t, is_variant_formattable>>> { + template + FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) { + return ctx.begin(); + } + + template + auto format(const Variant& value, FormatContext& ctx) const + -> decltype(ctx.out()) { + auto out = ctx.out(); + + out = detail::write(out, "variant("); + FMT_TRY { + std::visit( + [&](const auto& v) { + out = detail::write_escaped_alternative(out, v); + }, + value); + } + FMT_CATCH(const std::bad_variant_access&) { + detail::write(out, "valueless by exception"); + } + *out++ = ')'; + return out; + } +}; +FMT_END_NAMESPACE +#endif // FMT_CPP_LIB_VARIANT + +FMT_BEGIN_NAMESPACE +FMT_EXPORT +template struct formatter { + template + FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) { + return ctx.begin(); + } + + template + FMT_CONSTEXPR auto format(const std::error_code& ec, FormatContext& ctx) const + -> decltype(ctx.out()) { + auto out = ctx.out(); + out = detail::write_bytes(out, ec.category().name(), format_specs()); + out = detail::write(out, Char(':')); + out = detail::write(out, ec.value()); + return out; + } +}; + +#if FMT_USE_RTTI +namespace detail { + +template +auto write_demangled_name(OutputIt out, const std::type_info& ti) -> OutputIt { +# ifdef FMT_HAS_ABI_CXA_DEMANGLE + int status = 0; + std::size_t size = 0; + std::unique_ptr demangled_name_ptr( + abi::__cxa_demangle(ti.name(), nullptr, &size, &status), &std::free); + + string_view demangled_name_view; + if (demangled_name_ptr) { + demangled_name_view = demangled_name_ptr.get(); + + // Normalization of stdlib inline namespace names. + // libc++ inline namespaces. + // std::__1::* -> std::* + // std::__1::__fs::* -> std::* + // libstdc++ inline namespaces. + // std::__cxx11::* -> std::* + // std::filesystem::__cxx11::* -> std::filesystem::* + if (demangled_name_view.starts_with("std::")) { + char* begin = demangled_name_ptr.get(); + char* to = begin + 5; // std:: + for (char *from = to, *end = begin + demangled_name_view.size(); + from < end;) { + // This is safe, because demangled_name is NUL-terminated. 
+ if (from[0] == '_' && from[1] == '_') { + char* next = from + 1; + while (next < end && *next != ':') next++; + if (next[0] == ':' && next[1] == ':') { + from = next + 2; + continue; + } + } + *to++ = *from++; + } + demangled_name_view = {begin, detail::to_unsigned(to - begin)}; + } + } else { + demangled_name_view = string_view(ti.name()); + } + return detail::write_bytes(out, demangled_name_view); +# elif FMT_MSC_VERSION + const string_view demangled_name(ti.name()); + for (std::size_t i = 0; i < demangled_name.size(); ++i) { + auto sub = demangled_name; + sub.remove_prefix(i); + if (sub.starts_with("enum ")) { + i += 4; + continue; + } + if (sub.starts_with("class ") || sub.starts_with("union ")) { + i += 5; + continue; + } + if (sub.starts_with("struct ")) { + i += 6; + continue; + } + if (*sub.begin() != ' ') *out++ = *sub.begin(); + } + return out; +# else + return detail::write_bytes(out, string_view(ti.name())); +# endif +} + +} // namespace detail + +FMT_EXPORT +template +struct formatter { + public: + FMT_CONSTEXPR auto parse(basic_format_parse_context& ctx) + -> decltype(ctx.begin()) { + return ctx.begin(); + } + + template + auto format(const std::type_info& ti, Context& ctx) const + -> decltype(ctx.out()) { + return detail::write_demangled_name(ctx.out(), ti); + } +}; +#endif + +FMT_EXPORT +template +struct formatter< + T, Char, // DEPRECATED! Mixing code unit types. + typename std::enable_if::value>::type> { + private: + bool with_typename_ = false; + + public: + FMT_CONSTEXPR auto parse(basic_format_parse_context& ctx) + -> decltype(ctx.begin()) { + auto it = ctx.begin(); + auto end = ctx.end(); + if (it == end || *it == '}') return it; + if (*it == 't') { + ++it; + with_typename_ = FMT_USE_RTTI != 0; + } + return it; + } + + template + auto format(const std::exception& ex, Context& ctx) const + -> decltype(ctx.out()) { + auto out = ctx.out(); +#if FMT_USE_RTTI + if (with_typename_) { + out = detail::write_demangled_name(out, typeid(ex)); + *out++ = ':'; + *out++ = ' '; + } +#endif + return detail::write_bytes(out, string_view(ex.what())); + } +}; + +namespace detail { + +template +struct has_flip : std::false_type {}; + +template +struct has_flip().flip())>> + : std::true_type {}; + +template struct is_bit_reference_like { + static constexpr const bool value = + std::is_convertible::value && + std::is_nothrow_assignable::value && has_flip::value; +}; + +#ifdef _LIBCPP_VERSION + +// Workaround for libc++ incompatibility with C++ standard. +// According to the Standard, `bitset::operator[] const` returns bool. +template +struct is_bit_reference_like> { + static constexpr const bool value = true; +}; + +#endif + +} // namespace detail + +// We can't use std::vector::reference and +// std::bitset::reference because the compiler can't deduce Allocator and N +// in partial specialization. 
+FMT_EXPORT +template +struct formatter::value>> + : formatter { + template + FMT_CONSTEXPR auto format(const BitRef& v, FormatContext& ctx) const + -> decltype(ctx.out()) { + return formatter::format(v, ctx); + } +}; + +template +auto ptr(const std::unique_ptr& p) -> const void* { + return p.get(); +} +template auto ptr(const std::shared_ptr& p) -> const void* { + return p.get(); +} + +FMT_EXPORT +template +struct formatter, Char, + enable_if_t::value>> + : formatter { + template + auto format(const std::atomic& v, FormatContext& ctx) const + -> decltype(ctx.out()) { + return formatter::format(v.load(), ctx); + } +}; + +#ifdef __cpp_lib_atomic_flag_test +FMT_EXPORT +template +struct formatter : formatter { + template + auto format(const std::atomic_flag& v, FormatContext& ctx) const + -> decltype(ctx.out()) { + return formatter::format(v.test(), ctx); + } +}; +#endif // __cpp_lib_atomic_flag_test + +FMT_EXPORT +template struct formatter, Char> { + private: + detail::dynamic_format_specs specs_; + + template + FMT_CONSTEXPR auto do_format(const std::complex& c, + detail::dynamic_format_specs& specs, + FormatContext& ctx, OutputIt out) const + -> OutputIt { + if (c.real() != 0) { + *out++ = Char('('); + out = detail::write(out, c.real(), specs, ctx.locale()); + specs.sign = sign::plus; + out = detail::write(out, c.imag(), specs, ctx.locale()); + if (!detail::isfinite(c.imag())) *out++ = Char(' '); + *out++ = Char('i'); + *out++ = Char(')'); + return out; + } + out = detail::write(out, c.imag(), specs, ctx.locale()); + if (!detail::isfinite(c.imag())) *out++ = Char(' '); + *out++ = Char('i'); + return out; + } + + public: + FMT_CONSTEXPR auto parse(basic_format_parse_context& ctx) + -> decltype(ctx.begin()) { + if (ctx.begin() == ctx.end() || *ctx.begin() == '}') return ctx.begin(); + return parse_format_specs(ctx.begin(), ctx.end(), specs_, ctx, + detail::type_constant::value); + } + + template + auto format(const std::complex& c, FormatContext& ctx) const + -> decltype(ctx.out()) { + auto specs = specs_; + if (specs.width_ref.kind != detail::arg_id_kind::none || + specs.precision_ref.kind != detail::arg_id_kind::none) { + detail::handle_dynamic_spec(specs.width, + specs.width_ref, ctx); + detail::handle_dynamic_spec( + specs.precision, specs.precision_ref, ctx); + } + + if (specs.width == 0) return do_format(c, specs, ctx, ctx.out()); + auto buf = basic_memory_buffer(); + + auto outer_specs = format_specs(); + outer_specs.width = specs.width; + outer_specs.fill = specs.fill; + outer_specs.align = specs.align; + + specs.width = 0; + specs.fill = {}; + specs.align = align::none; + + do_format(c, specs, ctx, basic_appender(buf)); + return detail::write(ctx.out(), + basic_string_view(buf.data(), buf.size()), + outer_specs); + } +}; + +FMT_END_NAMESPACE +#endif // FMT_STD_H_ diff --git a/lib/fmt/fmt/xchar.h b/lib/fmt/fmt/xchar.h new file mode 100644 index 000000000..b1f39ed22 --- /dev/null +++ b/lib/fmt/fmt/xchar.h @@ -0,0 +1,322 @@ +// Formatting library for C++ - optional wchar_t and exotic character support +// +// Copyright (c) 2012 - present, Victor Zverovich +// All rights reserved. +// +// For the license information refer to format.h. 
+ +#ifndef FMT_XCHAR_H_ +#define FMT_XCHAR_H_ + +#include "color.h" +#include "format.h" +#include "ranges.h" + +#ifndef FMT_MODULE +# include +# if !defined(FMT_STATIC_THOUSANDS_SEPARATOR) +# include +# endif +#endif + +FMT_BEGIN_NAMESPACE +namespace detail { + +template +using is_exotic_char = bool_constant::value>; + +template struct format_string_char {}; + +template +struct format_string_char< + S, void_t())))>> { + using type = char_t; +}; + +template +struct format_string_char::value>> { + using type = typename S::char_type; +}; + +template +using format_string_char_t = typename format_string_char::type; + +inline auto write_loc(basic_appender out, loc_value value, + const format_specs& specs, locale_ref loc) -> bool { +#ifndef FMT_STATIC_THOUSANDS_SEPARATOR + auto& numpunct = + std::use_facet>(loc.get()); + auto separator = std::wstring(); + auto grouping = numpunct.grouping(); + if (!grouping.empty()) separator = std::wstring(1, numpunct.thousands_sep()); + return value.visit(loc_writer{out, specs, separator, grouping, {}}); +#endif + return false; +} +} // namespace detail + +FMT_BEGIN_EXPORT + +using wstring_view = basic_string_view; +using wformat_parse_context = basic_format_parse_context; +using wformat_context = buffered_context; +using wformat_args = basic_format_args; +using wmemory_buffer = basic_memory_buffer; + +#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409 +// Workaround broken conversion on older gcc. +template using wformat_string = wstring_view; +inline auto runtime(wstring_view s) -> wstring_view { return s; } +#else +template +using wformat_string = basic_format_string...>; +inline auto runtime(wstring_view s) -> runtime_format_string { + return {{s}}; +} +#endif + +template <> struct is_char : std::true_type {}; +template <> struct is_char : std::true_type {}; +template <> struct is_char : std::true_type {}; + +#ifdef __cpp_char8_t +template <> +struct is_char : bool_constant {}; +#endif + +template +constexpr auto make_wformat_args(T&... args) + -> decltype(fmt::make_format_args(args...)) { + return fmt::make_format_args(args...); +} + +inline namespace literals { +#if FMT_USE_USER_DEFINED_LITERALS && !FMT_USE_NONTYPE_TEMPLATE_ARGS +constexpr auto operator""_a(const wchar_t* s, size_t) + -> detail::udl_arg { + return {s}; +} +#endif +} // namespace literals + +template +auto join(It begin, Sentinel end, wstring_view sep) + -> join_view { + return {begin, end, sep}; +} + +template +auto join(Range&& range, wstring_view sep) + -> join_view, detail::sentinel_t, + wchar_t> { + return join(std::begin(range), std::end(range), sep); +} + +template +auto join(std::initializer_list list, wstring_view sep) + -> join_view { + return join(std::begin(list), std::end(list), sep); +} + +template +auto join(const std::tuple& tuple, basic_string_view sep) + -> tuple_join_view { + return {tuple, sep}; +} + +template ::value)> +auto vformat(basic_string_view format_str, + typename detail::vformat_args::type args) + -> std::basic_string { + auto buf = basic_memory_buffer(); + detail::vformat_to(buf, format_str, args); + return to_string(buf); +} + +template +auto format(wformat_string fmt, T&&... args) -> std::wstring { + return vformat(fmt::wstring_view(fmt), fmt::make_wformat_args(args...)); +} + +template +auto format_to(OutputIt out, wformat_string fmt, T&&... 
args) + -> OutputIt { + return vformat_to(out, fmt::wstring_view(fmt), + fmt::make_wformat_args(args...)); +} + +// Pass char_t as a default template parameter instead of using +// std::basic_string> to reduce the symbol size. +template , + FMT_ENABLE_IF(!std::is_same::value && + !std::is_same::value)> +auto format(const S& format_str, T&&... args) -> std::basic_string { + return vformat(detail::to_string_view(format_str), + fmt::make_format_args>(args...)); +} + +template , + FMT_ENABLE_IF(detail::is_locale::value&& + detail::is_exotic_char::value)> +inline auto vformat(const Locale& loc, const S& format_str, + typename detail::vformat_args::type args) + -> std::basic_string { + return detail::vformat(loc, detail::to_string_view(format_str), args); +} + +template , + FMT_ENABLE_IF(detail::is_locale::value&& + detail::is_exotic_char::value)> +inline auto format(const Locale& loc, const S& format_str, T&&... args) + -> std::basic_string { + return detail::vformat( + loc, detail::to_string_view(format_str), + fmt::make_format_args>(args...)); +} + +template , + FMT_ENABLE_IF(detail::is_output_iterator::value&& + detail::is_exotic_char::value)> +auto vformat_to(OutputIt out, const S& format_str, + typename detail::vformat_args::type args) -> OutputIt { + auto&& buf = detail::get_buffer(out); + detail::vformat_to(buf, detail::to_string_view(format_str), args); + return detail::get_iterator(buf, out); +} + +template , + FMT_ENABLE_IF(detail::is_output_iterator::value && + !std::is_same::value && + !std::is_same::value)> +inline auto format_to(OutputIt out, const S& fmt, T&&... args) -> OutputIt { + return vformat_to(out, detail::to_string_view(fmt), + fmt::make_format_args>(args...)); +} + +template , + FMT_ENABLE_IF(detail::is_output_iterator::value&& + detail::is_locale::value&& + detail::is_exotic_char::value)> +inline auto vformat_to(OutputIt out, const Locale& loc, const S& format_str, + typename detail::vformat_args::type args) + -> OutputIt { + auto&& buf = detail::get_buffer(out); + vformat_to(buf, detail::to_string_view(format_str), args, + detail::locale_ref(loc)); + return detail::get_iterator(buf, out); +} + +template , + bool enable = detail::is_output_iterator::value && + detail::is_locale::value && + detail::is_exotic_char::value> +inline auto format_to(OutputIt out, const Locale& loc, const S& format_str, + T&&... args) -> + typename std::enable_if::type { + return vformat_to(out, loc, detail::to_string_view(format_str), + fmt::make_format_args>(args...)); +} + +template ::value&& + detail::is_exotic_char::value)> +inline auto vformat_to_n(OutputIt out, size_t n, + basic_string_view format_str, + typename detail::vformat_args::type args) + -> format_to_n_result { + using traits = detail::fixed_buffer_traits; + auto buf = detail::iterator_buffer(out, n); + detail::vformat_to(buf, format_str, args); + return {buf.out(), buf.count()}; +} + +template , + FMT_ENABLE_IF(detail::is_output_iterator::value&& + detail::is_exotic_char::value)> +inline auto format_to_n(OutputIt out, size_t n, const S& fmt, T&&... args) + -> format_to_n_result { + return vformat_to_n(out, n, fmt::basic_string_view(fmt), + fmt::make_format_args>(args...)); +} + +template , + FMT_ENABLE_IF(detail::is_exotic_char::value)> +inline auto formatted_size(const S& fmt, T&&... 
args) -> size_t { + auto buf = detail::counting_buffer(); + detail::vformat_to(buf, detail::to_string_view(fmt), + fmt::make_format_args>(args...)); + return buf.count(); +} + +inline void vprint(std::FILE* f, wstring_view fmt, wformat_args args) { + auto buf = wmemory_buffer(); + detail::vformat_to(buf, fmt, args); + buf.push_back(L'\0'); + if (std::fputws(buf.data(), f) == -1) + FMT_THROW(system_error(errno, FMT_STRING("cannot write to file"))); +} + +inline void vprint(wstring_view fmt, wformat_args args) { + vprint(stdout, fmt, args); +} + +template +void print(std::FILE* f, wformat_string fmt, T&&... args) { + return vprint(f, wstring_view(fmt), fmt::make_wformat_args(args...)); +} + +template void print(wformat_string fmt, T&&... args) { + return vprint(wstring_view(fmt), fmt::make_wformat_args(args...)); +} + +template +void println(std::FILE* f, wformat_string fmt, T&&... args) { + return print(f, L"{}\n", fmt::format(fmt, std::forward(args)...)); +} + +template void println(wformat_string fmt, T&&... args) { + return print(L"{}\n", fmt::format(fmt, std::forward(args)...)); +} + +inline auto vformat(const text_style& ts, wstring_view fmt, wformat_args args) + -> std::wstring { + auto buf = wmemory_buffer(); + detail::vformat_to(buf, ts, fmt, args); + return fmt::to_string(buf); +} + +template +inline auto format(const text_style& ts, wformat_string fmt, T&&... args) + -> std::wstring { + return fmt::vformat(ts, fmt, fmt::make_wformat_args(args...)); +} + +template +FMT_DEPRECATED void print(std::FILE* f, const text_style& ts, + wformat_string fmt, const T&... args) { + vprint(f, ts, fmt, fmt::make_wformat_args(args...)); +} + +template +FMT_DEPRECATED void print(const text_style& ts, wformat_string fmt, + const T&... args) { + return print(stdout, ts, fmt, args...); +} + +/// Converts `value` to `std::wstring` using the default format for type `T`. +template inline auto to_wstring(const T& value) -> std::wstring { + return format(FMT_STRING(L"{}"), value); +} +FMT_END_EXPORT +FMT_END_NAMESPACE + +#endif // FMT_XCHAR_H_ diff --git a/lib/gzstream/LICENSE b/lib/gzstream/LICENSE deleted file mode 100644 index b1e3f5a26..000000000 --- a/lib/gzstream/LICENSE +++ /dev/null @@ -1,504 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. - - When we speak of free software, we are referring to freedom of use, -not price. 
Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. - - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. - - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. - - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. - - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. 
To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. 
- - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. 
You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. - - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. 
Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. (It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. 
Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. 
In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. 
- - - Copyright (C) - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -Also add information on how to contact you by electronic and paper mail. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James Random Hacker. - - , 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! - - diff --git a/lib/gzstream/README b/lib/gzstream/README deleted file mode 100644 index 61d806044..000000000 --- a/lib/gzstream/README +++ /dev/null @@ -1,7 +0,0 @@ - - gzstream - C++ iostream classes wrapping the zlib compression library. -=========================================================================== - - Header Only version of this library from: - https://gist.github.com/piti118/1508048 diff --git a/lib/gzstream/gzstream.h b/lib/gzstream/gzstream.h deleted file mode 100644 index a3809a781..000000000 --- a/lib/gzstream/gzstream.h +++ /dev/null @@ -1,209 +0,0 @@ -// gzstream, C++ iostream classes wrapping the zlib compression library. -// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// File : gzstream.h -// Revision : $Revision: 1.5 $ -// Revision_date : $Date: 2002/04/26 23:30:15 $ -// Author(s) : Deepak Bandyopadhyay, Lutz Kettner -// -// Standard streambuf implementation following Nicolai Josuttis, "The -// Standard C++ Library". - -#ifndef GZSTREAM_H -#define GZSTREAM_H - -#include -#include -#include -#include // for memcpy - -// Internal classes to implement gzstream. See below for user classes. -class gzstreambuf : public std::streambuf { -private: - static const int bufferSize = 47 + 256; // size of data buff - // totals 512 bytes under g++ for igzstream at the end. 
- - gzFile file; // file handle for compressed file - char buffer[bufferSize]; // data buffer - char opened; // open/close state of stream - int mode; // I/O mode - - int flush_buffer() { - // Separate the writing of the buffer from overflow() and - // sync() operation. - int w = pptr() - pbase(); - if (gzwrite(file, pbase(), w) != w) - return EOF; - pbump(-w); - return w; - } - -public: - gzstreambuf() : opened(0) { - setp(buffer, buffer + (bufferSize - 1)); - setg(buffer + 4, // beginning of putback area - buffer + 4, // read position - buffer + 4); // end position - // ASSERT: both input & output capabilities will not be used together - } - - int is_open() { return opened; } - - gzstreambuf *open(const char *name, int open_mode) { - if (is_open()) - return (gzstreambuf *) 0; - mode = open_mode; - // no append nor read/write mode - if ((mode & std::ios::ate) || (mode & std::ios::app) - || ((mode & std::ios::in) && (mode & std::ios::out))) - return (gzstreambuf *) 0; - char fmode[10]; - char *fmodeptr = fmode; - if (mode & std::ios::in) - *fmodeptr++ = 'r'; - else if (mode & std::ios::out) - *fmodeptr++ = 'w'; - *fmodeptr++ = 'b'; - *fmodeptr = '\0'; - file = gzopen(name, fmode); - if (file == 0) - return (gzstreambuf *) 0; - opened = 1; - return this; - } - - gzstreambuf *close() { - if (is_open()) { - sync(); - opened = 0; - if (gzclose(file) == Z_OK) - return this; - } - return (gzstreambuf *) 0; - } - - ~gzstreambuf() { close(); } - - virtual int overflow(int c = EOF) { // used for output buffer only - if (!(mode & std::ios::out) || !opened) - return EOF; - if (c != EOF) { - *pptr() = c; - pbump(1); - } - if (flush_buffer() == EOF) - return EOF; - return c; - } - - virtual int underflow() { // used for input buffer only - if (gptr() && (gptr() < egptr())) - return *reinterpret_cast( gptr()); - - if (!(mode & std::ios::in) || !opened) - return EOF; - // Josuttis' implementation of inbuf - int n_putback = gptr() - eback(); - if (n_putback > 4) - n_putback = 4; - memcpy(buffer + (4 - n_putback), gptr() - n_putback, n_putback); - - int num = gzread(file, buffer + 4, bufferSize - 4); - if (num <= 0) // ERROR or EOF - return EOF; - - // reset buffer pointers - setg(buffer + (4 - n_putback), // beginning of putback area - buffer + 4, // read position - buffer + 4 + num); // end of buffer - - // return next character - return *reinterpret_cast( gptr()); - } - - virtual int sync() { - // Changed to use flush_buffer() instead of overflow( EOF) - // which caused improper behavior with std::endl and flush(), - // bug reported by Vincent Ricard. - if (pptr() && pptr() > pbase()) { - if (flush_buffer() == EOF) - return -1; - } - return 0; - } -}; - -class gzstreambase : virtual public std::ios { -protected: - gzstreambuf buf; -public: - gzstreambase() { init(&buf); } - - gzstreambase(const char *name, int open_mode) { - init(&buf); - open(name, open_mode); - } - - ~gzstreambase() { - buf.close(); - } - - void open(const char *name, int open_mode) { - if (!buf.open(name, open_mode)) - clear(rdstate() | std::ios::badbit); - } - - void close() { - if (buf.is_open()) if (!buf.close()) - clear(rdstate() | std::ios::badbit); - } - - gzstreambuf *rdbuf() { return &buf; } -}; - -// User classes. Use igzstream and ogzstream analogously to ifstream and -// ofstream respectively. They read and write files based on the gz* -// function interface of the zlib. Files are compatible with gzip compression. 
-class igzstream : public gzstreambase, public std::istream { -public: - igzstream() : std::istream(&buf) { } - - igzstream(const char *name, int open_mode = std::ios::in) - : gzstreambase(name, open_mode), std::istream(&buf) { } - - gzstreambuf *rdbuf() { return gzstreambase::rdbuf(); } - - void open(const char *name, int open_mode = std::ios::in) { - gzstreambase::open(name, open_mode); - } -}; - -class ogzstream : public gzstreambase, public std::ostream { -public: - ogzstream() : std::ostream(&buf) { } - - ogzstream(const char *name, int mode = std::ios::out) - : gzstreambase(name, mode), std::ostream(&buf) { } - - gzstreambuf *rdbuf() { return gzstreambase::rdbuf(); } - - void open(const char *name, int open_mode = std::ios::out) { - gzstreambase::open(name, open_mode); - } -}; - -#endif diff --git a/lib/libmarv/.gitignore b/lib/libmarv/.gitignore new file mode 100644 index 000000000..b673acb1b --- /dev/null +++ b/lib/libmarv/.gitignore @@ -0,0 +1,5 @@ +align +makedb +modifydb +tileconfigsearch +build/ \ No newline at end of file diff --git a/lib/libmarv/LICENSE b/lib/libmarv/LICENSE new file mode 100644 index 000000000..11ccc3f6b --- /dev/null +++ b/lib/libmarv/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2023 asbschmidt + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/lib/libmarv/Makefile b/lib/libmarv/Makefile new file mode 100755 index 000000000..0cea4c785 --- /dev/null +++ b/lib/libmarv/Makefile @@ -0,0 +1,92 @@ +# settings +DIALECT = -std=c++17 +OPTIMIZATION = -O3 -g +WARNINGS = -Xcompiler="-Wall -Wextra" + +# NVCC_FLAGS = -DCUDASW_DEBUG_CHECK_CORRECTNESS -arch=native -lineinfo --expt-relaxed-constexpr -rdc=true --extended-lambda -lnvToolsExt -Xcompiler="-fopenmp" #-res-usage #-Xptxas "-v" +NVCC_FLAGS = -arch=native -lineinfo --expt-relaxed-constexpr -rdc=true --extended-lambda -lnvToolsExt -Xcompiler="-fopenmp" #-res-usage #-Xptxas "-v" + + +LDFLAGS = -Xcompiler="-pthread" $(NVCC_FLAGS) -lz +COMPILER = nvcc +ARTIFACT = align + +BUILDDIR = build + +MAKEDB = makedb +MODIFYDB = modifydb +GRIDSEARCH = gridsearch +TILECONFIGSEARCH = tileconfigsearch + +$(shell mkdir -p $(BUILDDIR)) + +# make targets +.PHONY: clean + +release: $(ARTIFACT) $(MAKEDB) $(TILECONFIGSEARCH) + + +clean : + rm -f $(BUILDDIR)/* + rm -f $(ARTIFACT) + rm -f $(MAKEDB) + rm -f $(TILECONFIGSEARCH) + +# compiler call +COMPILE = $(COMPILER) $(NVCC_FLAGS) $(DIALECT) $(OPTIMIZATION) $(WARNINGS) -c $< -o $@ + +CUDASW_OBJS = $(BUILDDIR)/sequence_io.o $(BUILDDIR)/dbdata.o $(BUILDDIR)/options.o $(BUILDDIR)/blosum.o $(BUILDDIR)/pssmkernels_smithwaterman_instantiation_float.o $(BUILDDIR)/pssmkernels_smithwaterman_instantiation_dpx.o $(BUILDDIR)/pssmkernels_gapless_instantiation_half2.o $(BUILDDIR)/pssmkernels_gapless_instantiation_dpx.o $(BUILDDIR)/pssmkernels_gapless_instantiation_half2_kernelparamzero.o $(BUILDDIR)/pssmkernels_gapless_instantiation_dpx_kernelparamzero.o + +# link object files into executable +$(ARTIFACT): $(BUILDDIR)/main.o $(CUDASW_OBJS) + $(COMPILER) $^ -o $(ARTIFACT) $(LDFLAGS) + +$(TILECONFIGSEARCH): $(BUILDDIR)/tileconfigsearch.o $(CUDASW_OBJS) + $(COMPILER) $^ -o $(TILECONFIGSEARCH) $(LDFLAGS) + +$(MAKEDB): $(BUILDDIR)/makedb.o $(BUILDDIR)/sequence_io.o $(BUILDDIR)/dbdata.o + $(COMPILER) $^ -o $(MAKEDB) $(LDFLAGS) + + + +$(BUILDDIR)/tileconfigsearch.o : src/tileconfigsearch.cu src/sequence_io.h src/length_partitions.hpp src/dbdata.hpp src/cudasw4.cuh src/kernels.cuh src/convert.cuh src/blosum.hpp src/types.hpp src/pssm.cuh src/pssmkernels_gapless.cuh src/pssmkernels_smithwaterman.cuh src/gapless_kernel_config.cuh + $(COMPILE) + +$(BUILDDIR)/main.o : src/main.cu src/sequence_io.h src/length_partitions.hpp src/dbdata.hpp src/cudasw4.cuh src/kernels.cuh src/convert.cuh src/blosum.hpp src/types.hpp src/pssm.cuh src/pssmkernels_gapless.cuh src/pssmkernels_smithwaterman.cuh src/gapless_kernel_config.cuh + $(COMPILE) + +$(BUILDDIR)/sequence_io.o : src/sequence_io.cpp src/sequence_io.h + $(COMPILE) + +$(BUILDDIR)/dbdata.o : src/dbdata.cpp src/dbdata.hpp src/mapped_file.hpp src/sequence_io.h src/length_partitions.hpp + $(COMPILE) + +$(BUILDDIR)/options.o : src/options.cpp src/options.hpp src/types.hpp + $(COMPILE) + +$(BUILDDIR)/blosum.o : src/blosum.cu src/blosum.hpp + $(COMPILE) + +$(BUILDDIR)/pssmkernels_gapless_instantiation_half2.o : src/pssmkernels_gapless_instantiation_half2.cu src/pssmkernels_gapless.cuh src/convert.cuh src/util.cuh + $(COMPILE) + +$(BUILDDIR)/pssmkernels_gapless_instantiation_dpx.o : src/pssmkernels_gapless_instantiation_dpx.cu src/pssmkernels_gapless.cuh src/convert.cuh src/util.cuh + $(COMPILE) + +$(BUILDDIR)/pssmkernels_gapless_instantiation_half2_kernelparamzero.o : src/pssmkernels_gapless_instantiation_half2_kernelparamzero.cu src/pssmkernels_gapless.cuh src/convert.cuh src/util.cuh + $(COMPILE) + 
+$(BUILDDIR)/pssmkernels_gapless_instantiation_dpx_kernelparamzero.o : src/pssmkernels_gapless_instantiation_dpx_kernelparamzero.cu src/pssmkernels_gapless.cuh src/convert.cuh src/util.cuh
+	$(COMPILE)
+
+$(BUILDDIR)/pssmkernels_smithwaterman_instantiation_float.o : src/pssmkernels_smithwaterman_instantiation_float.cu src/pssmkernels_smithwaterman.cuh src/convert.cuh src/util.cuh
+	$(COMPILE)
+
+$(BUILDDIR)/pssmkernels_smithwaterman_instantiation_dpx.o : src/pssmkernels_smithwaterman_instantiation_dpx.cu src/pssmkernels_smithwaterman.cuh src/convert.cuh src/util.cuh
+	$(COMPILE)
+
+$(BUILDDIR)/makedb.o : src/makedb.cpp src/dbdata.hpp src/sequence_io.h
+	$(COMPILE)
+
+
+
diff --git a/lib/libmarv/Readme.md b/lib/libmarv/Readme.md
new file mode 100644
index 000000000..83bfa09e5
--- /dev/null
+++ b/lib/libmarv/Readme.md
@@ -0,0 +1,118 @@
+
+
+
+
+
+# CUDASW++4.0-GapLessFilter
+
+
+## Software requirements
+* Linux operating system with compatible CUDA Toolkit 12 or newer
+* C++17 compiler
+* zlib
+* make
+
+## Hardware requirements
+* A modern CUDA-capable GPU of generation Ampère or newer. We have tested CUDASW4 on Ampère (sm_80), Ada Lovelace (sm_89), and Hopper (sm_90). Older generations lack hardware support for specific instructions and may run at reduced speeds or may not run at all.
+
+
+
+
+## Build
+Our software has two components, **makedb** and **align**. **makedb** is used to construct a database which can be queried by **align**.
+
+The build step compiles the GPU code for all GPU architectures of GPUs detected in the system. The CUDA environment variable `CUDA_VISIBLE_DEVICES` can be used to control the detected GPUs. If `CUDA_VISIBLE_DEVICES` is not set, it will default to all GPUs in the system.
+
+* Build makedb: `make makedb`
+
+* Build align: `make align`
+
+* Build align for the GPU architecture of GPUs 0 and 1: `CUDA_VISIBLE_DEVICES=0,1 make align`
+
+## Database construction
+Use **makedb** to create a database from a fasta file. The file can be gzip'ed.
+We support fasta files with up to 2 billion sequences.
+
+```
+mkdir -p dbfolder
+./makedb input.fa(.gz) dbfolder/dbname [options]
+```
+
+Options:
+* --mem val : Memory limit. Can use suffix K,M,G. If makedb requires more memory, temp files in temp directory will be used. Default all available memory.
+* --tempdir val : Temp directory for temporary files. Must exist. Default is db output directory.
+
+
+
+## Querying the database
+Use **align** to query the database. **align** has two mandatory arguments.
+1. `--query` The query file which contains all queries
+2. `--db` The path to the reference database constructed with makedb.
+
+Run `./align --help` to get a complete list of options.
+
+By default, the results will be output to stdout in plain text. Results can be output to file instead (`--of filename`), and can be output as tab-separated values (`--tsv`). Example tsv output is given below.
+ +| Query number | Query length | Query header | Result number | Result score | Reference length | Reference header | Reference ID in DB | Alignment_end_query | Alignment_end_ref | +|------------|------------|------------|------------|------------|------------| ------------|------------|------------|------------| +| 0 | 144 | gi\|122087146 | 0 | 541 | 148 | UniRef50_P02233 | 23128215 | 100 | 100 | +| 0 | 144 | gi\|122087146 | 1 | 444 | 144 | UniRef50_P02238 | 22381647 | 100 | 100 | + + +## Selecting GPUs +Similar to the build process, **align** will use all GPUs that are set with `CUDA_VISIBLE_DEVICES`, or all GPUs if `CUDA_VISIBLE_DEVICES` is not set. + +``` +# use the gpus that are currently set in CUDA_VISIBLE_DEVICES +./align --query queries.fa(.gz) --db dbfolder/dbname + +# use gpus 0 and 1 for only this command +CUDA_VISIBLE_DEVICES=0,1 ./align --query queries.fa(.gz) --db dbfolder/dbname +``` + +## Scoring options + +``` + --top val : Output the val best scores. Default val = 10. + --gop val : Gap open score. Overwrites blosum-dependent default score. + --gex val : Gap extend score. Overwrites blosum-dependent default score. + --mat val : Set substitution matrix. Supported values: blosum45, blosum50, blosum62, blosum80. Default val = blosum62. + --scanType val : Set scan type. Supported values = {Gapless, SW_Endpos, Gapless+SW_Endpos}. + Gapless: Scan whole DB with gapless alignment. + SW_Endpos: Scan whole DB with Smith Waterman Alignment, output score and end position. + Gapless+SW_Endpos: Scan whole DB with gapless alignment, then re-scan top results with Smith Waterman. Default val = Gapless + --subjectIdsFile val : Only consider database sequences with index specified in file. Must be a text file, one index per line. + Do not use together with scanType Gapless+SW_Endpos. When --subjectIdsFile is set, option --top is ignored. +``` + + +## Memory options + +``` + --maxGpuMem val : Try not to use more than val bytes of gpu memory per gpu. This is not a hard limit. Can use suffix K,M,G. All available gpu memory by default. + --maxTempBytes val : Size of temp storage in GPU memory. Can use suffix K,M,G. Default val = 4G + --maxBatchBytes val : Process DB in batches of at most val bytes. Can use suffix K,M,G. Default val = 128M + --maxBatchSequences val : Process DB in batches of at most val sequences. Default val = 10000000 +``` + +Depending on the database size and available total GPU memory, the database is transferred to the GPU once for all queries, or it is processed in batches which requires a transfer for each query. Above options give some control over memory usage. For best performance, the complete database must fit into `maxGpuMem` times the number of used GPUs. + +## Other options +``` + --verbose : More console output. Shows timings. + --printLengthPartitions : Print number of sequences per length partition in db. + --interactive : Loads DB, then waits for sequence input by user + --help : Print all options +``` + +## Peak performance test: + +Align all queries to a simulated database of 2000000 sequences of length 512. All sequences are identical. +``` +./align --query allqueries.fasta --top 0 --verbose --uploadFull --pseudodb 2000000 512 1 +``` + +Align all queries to a simulated database of 2000000 sequences of length 512. All sequences are random. 
+``` +./align --query allqueries.fasta --top 0 --verbose --uploadFull --pseudodb 2000000 512 0 +``` \ No newline at end of file diff --git a/lib/libmarv/allqueries.fasta b/lib/libmarv/allqueries.fasta new file mode 100644 index 000000000..d40ed98fb --- /dev/null +++ b/lib/libmarv/allqueries.fasta @@ -0,0 +1,629 @@ +>gi|122087146|sp|P02232.2|LGB1_VICFA RecName: Full=Leghemoglobin-1; AltName: Full=Leghemoglobin I +MGFTEKQEALVNSSSQLFKQNPSNYSVLFYTIILQKAPTAKAMFSFLKDSAGVVDSPKLGAHAEKVFGMV +RDSAVQLRATGEVVLDGKDGSIHIQKGVLDPHFVVVKEALLKTIKEASGDKWSEELSAAWEVAYDGLATA +IKAA +>gi|124460|sp|P05013.1|IFNA6_HUMAN RecName: Full=Interferon alpha-6; AltName: Full=Interferon alpha-K; Short=LeIF K; AltName: Full=Interferon alpha-54; Flags: Precursor +MALPFALLMALVVLSCKSSCSLDCDLPQTHSLGHRRTMMLLAQMRRISLFSCLKDRHDFRFPQEEFDGNQ +FQKAEAISVLHEVIQQTFNLFSTKDSSVAWDERLLDKLYTELYQQLNDLEACVMQEVWVGGTPLMNEDSI +LAVRKYFQRITLYLTEKKYSPCAWEVVRAEIMRSFSSSRNLQERLRRKE +>gi|121714|sp|P14942.2|GSTA4_RAT RecName: Full=Glutathione S-transferase alpha-4; AltName: Full=Glutathione S-transferase Yk; Short=GST Yk; AltName: Full=GST 8-8; AltName: Full=GST K; AltName: Full=GST A4-4 +MEVKPKLYYFQGRGRMESIRWLLATAGVEFEEEFLETREQYEKLQKDGCLLFGQVPLVEIDGMLLTQTRA +ILSYLAAKYNLYGKDLKERVRIDMYADGTQDLMMMIIGAPFKAPQEKEESLALAVKRAKNRYFPVFEKIL +KDHGEAFLVGNQLSWADIQLLEAILMVEEVSAPVLSDFPLLQAFKTRISNIPTIKKFLQPGSQRKPPPDG +HYVDVVRTVLKF +>gi|113390|sp|P07327.2|ADH1A_HUMAN RecName: Full=Alcohol dehydrogenase 1A; AltName: Full=Alcohol dehydrogenase subunit alpha +MSTAGKVIKCKAAVLWELKKPFSIEEVEVAPPKAHEVRIKMVAVGICGTDDHVVSGTMVTPLPVILGHEA +AGIVESVGEGVTTVKPGDKVIPLAIPQCGKCRICKNPESNYCLKNDVSNPQGTLQDGTSRFTCRRKPIHH +FLGISTFSQYTVVDENAVAKIDAASPLEKVCLIGCGFSTGYGSAVNVAKVTPGSTCAVFGLGGVGLSAIM +GCKAAGAARIIAVDINKDKFAKAKELGATECINPQDYKKPIQEVLKEMTDGGVDFSFEVIGRLDTMMASL +LCCHEACGTSVIVGVPPDSQNLSMNPMLLLTGRTWKGAILGGFKSKECVPKLVADFMAKKFSLDALITHV +LPFEKINEGFDLLHSGKSIRTILMF +>gi|113936|sp|P01008.1|ANT3_HUMAN RecName: Full=Antithrombin-III; Short=ATIII; Flags: Precursor +MYSNVIGTVTSGKRKVYLLSLLLIGFWDCVTCHGSPVDICTAKPRDIPMNPMCIYRSPEKKATEDEGSEQ +KIPEATNRRVWELSKANSRFATTFYQHLADSKNDNDNIFLSPLSISTAFAMTKLGACNDTLQQLMEVFKF +DTISEKTSDQIHFFFAKLNCRLYRKANKSSKLVSANRLFGDKSLTFNETYQDISELVYGAKLQPLDFKEN +AEQSRAAINKWVSNKTEGRITDVIPSEAINELTVLVLVNTIYFKGLWKSKFSPENTRKELFYKADGESCS +ASMMYQEGKFRYRRVAEGTQVLELPFKGDDITMVLILPKPEKSLAKVEKELTPEVLQEWLDELEEMMLVV +HMPRFRIEDGFSLKEQLQDMGLVDLFSPEKSKLPGIVAEGRDDLYVSDAFHKAFLEVNEEGSEAAASTAV +VIAGRSLNPNRVTFKANRPFLVFIREVPLNTIIFMGRVANPCVK +>gi|122942|sp|P03435.1|HEMA_I75A3 RecName: Full=Hemagglutinin; Contains: RecName: Full=Hemagglutinin HA1 chain; Contains: RecName: Full=Hemagglutinin HA2 chain; Flags: Precursor +MKTIIALSYIFCLVFAQDLPGNDNNSTATLCLGHHAVPNGTLVKTITNDQIEVTNATELVQSSSTGKICN +NPHRILDGINCTLIDALLGDPHCDGFQNEKWDLFVERSKAFSNCYPYDVPDYASLRSLVASSGTLEFINE +GFNWTGVTQNGGSSACKRGPDSGFFSRLNWLYKSGSTYPVQNVTMPNNDNSDKLYIWGVHHPSTDKEQTN +LYVQASGKVTVSTKRSQQTIIPNVGSRPWVRGLSSRISIYWTIVKPGDILVINSNGNLIAPRGYFKMRTG +KSSIMRSDAPIGTCSSECITPNGSIPNDKPFQNVNKITYGACPKYVKQNTLKLATGMRNVPEKQTRGIFG +AIAGFIENGWEGMIDGWYGFRHQNSEGTGQAADLKSTQAAIDQINGKLNRVIEKTNEKFHQIEKEFSEVE +GRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTRRQLRENAEDMGNGCFKIYHKC +DNACIGSIRNGTYDHDVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVVLLGFIMWACQKGN +IRCNICI +>gi|1170423|sp|P42357.1|HUTH_HUMAN RecName: Full=Histidine ammonia-lyase; Short=Histidase +MPRYTVHVRGEWLAVPCQDAQLTVGWLGREAVRRYIKNKPDNGGFTSVDDAHFLVRRCKGLGLLDNEDRL +EVALENNEFVEVVIEGDAMSPDFIPSQPEGVYLYSKYREPEKYIELDGDRLTTEDLVNLGKGRYKIKLTP +TAEKRVQKSREVIDSIIKEKTVVYGITTGFGKFARTVIPINKLQELQVNLVRSHSSGVGKPLSPERCRML 
+LALRINVLAKGYSGISLETLKQVIEMFNASCLPYVPEKGTVGASGDLAPLSHLALGLVGEGKMWSPKSGW +ADAKYVLEAHGLKPVILKPKEGLALINGTQMITSLGCEAVERASAIARQADIVAALTLEVLKGTTKAFDT +DIHALRPHRGQIEVAFRFRSLLDSDHHPSEIAESHRFCDRVQDAYTLRCCPQVHGVVNDTIAFVKNIITT +ELNSATDNPMVFANRGETVSGGNFHGEYPAKALDYLAIGIHELAAISERRIERLCNPSLSELPAFLVAEG +GLNSGFMIAHCTAAALVSENKALCHPSSVDSLSTSAATEDHVSMGGWAARKALRVIEHVEQVLAIELLAA +CQGIEFLRPLKTTTPLEKVYDLVRSVVRPWIKDRFMAPDIEAAHRLLLEQKVWEVAAPYIEKYRMEHIPE +SRPLSPTAFSLQFLHKKSTKIPESEDL +>gi|119811|sp|P21177.2|FADB_ECOLI RecName: Full=Fatty acid oxidation complex subunit alpha; Includes: RecName: Full=Enoyl-CoA hydratase/Delta(3)-cis-Delta(2)-trans-enoyl-CoA isomerase/3-hydroxybutyryl-CoA epimerase; Includes: RecName: Full=3-hydroxyacyl-CoA dehydrogenase +MLYKGDTLYLDWLEDGIAELVFDAPGSVNKLDTATVASLGEAIGVLEQQSDLKGLLLRSNKAAFIVGADI +TEFLSLFLVPEEQLSQWLHFANSVFNRLEDLPVPTIAAVNGYALGGGCECVLATDYRLATPDLRIGLPET +KLGIMPGFGGSVRMPRMLGADSALEIIAAGKDVGADQALKIGLVDGVVKAEKLVEGAKAVLRQAINGDLD +WKAKRQPKLEPLKLSKIEATMSFTIAKGMVAQTAGKHYPAPITAVKTIEAAARFGREEALNLENKSFVPL +AHTNEARALVGIFLNDQYVKGKAKKLTKDVETPKQAAVLGAGIMGGGIAYQSAWKGVPVVMKDINDKSLT +LGMTEAAKLLNKQLERGKIDGLKLAGVISTIHPTLDYAGFDRVDIVVEAVVENPKVKKAVLAETEQKVRQ +DTVLASNTSTIPISELANALERPENFCGMHFFNPVHRMPLVEIIRGEKSSDETIAKVVAWASKMGKTPIV +VNDCPGFFVNRVLFPYFAGFSQLLRDGADFRKIDKVMEKQFGWPMGPAYLLDVVGIDTAHHAQAVMAAGF +PQRMQKDYRDAIDALFDANRFGQKNGLGFWRYKEDSKGKPKKEEDAAVEDLLAEVSQPKRDFSEEEIIAR +MMIPMVNEVVRCLEEGIIATPAEADMALVYGLGFPPFHGGAFRWLDTLGSAKYLDMAQQYQHLGPLYEVP +EGLRNKARHNEPYYPPVEPARPVGDLKTA +>tr|Q38941|Q38941_ARATH Similarity to protein encoded by GenBank Accession Number U41815 (F23A5.3 protein) OS=Arabidopsis thaliana GN=F23A5.3 PE=4 SV=1 +MSTEISRMARDLDSRKKRRISLDGIAALCEHSKEIIDSLPMLNSPDYFLKPCINELVERE +IESPDYCSRVPDFTIGRIGYGYIRFLGNTDVRRLDLDHIVKFHRHEVIVYDDESSKPVVG +EGLNKAAEVTLVVNIPDLTWGKQQVNHIAYKLKQSTERQGATFISFDPDNGLWKFFVPHF +SRFGLSDDEAEDIAMDDAPGLGDPVGLDGKKVADIDEEDQMETSELELSHSLPAHLGLDP +EKMKEMRMLMFPNEDEDESEDFREQTSHLMTSLTKRNVRPSQKIAQRNSHQDPPPVVRKT +PLALLEYNPGNDKSSPGSILMVQQNKNLAVRKSKTGGFELDISHVTPLTDNYSRNVVDAA +LFMGRSFRAGWGPNGVLFHTGKPICSSSSQMVLSSVINKEKIAIDKVVWDRKGKVQKELI +DSAFEAPLSLHKELNHVEEEVRFGSFSLKLQNVVTDRVVLSDICRSYIGIIEKQLEVAGL +STSAKLFLMHQVMVWELIKVLFSERQSTERLMYAASDNEEDVMQDVKEDSAKIDTEALPL +IRRAEFSCWLQESVSHRVQEDVSDLNGSSYLEHLFFLLTGRELDSAVELAISKGDVRLAC +LLSQAGGSTVNRNDILQQLHLWRRNGLDFNFIEKERIKLYELLAGNIHDALQDFTIDWKR +FLGSYQLLLNQAKAPWPVPIYIDEGPADGFVSDNKHSDILYYLMLLHSKEEEEFGFLQTM +FSAFSSTDDPLDYHMIWHHRGILEAVGAFTSDDLHTLDMGFVAQLLSQGLCHWAIYVVLH +IPFREDHPYLHVTVIREILFQYCETWSSMESQRQFIKDLGIPSEWMHEALVRTPYHSSFL +LLLKVLVAHC +>gi|85681922|sp|P27895.3|CIN8_YEAST RecName: Full=Kinesin-like protein CIN8; AltName: Full=Chromosome instability protein 8 +MPAENQNTGQDRSSNSISKNGNSQVGCHTVPNEELNITVAVRCRGRNEREISMKSSVVVNVPDITGSKEI +SINTTGDTGITAQMNAKRYTVDKVFGPGASQDLIFDEVAGPLFQDFIKGYNCTVLVYGMTSTGKTYTMTG +DEKLYNGELSDAAGIIPRVLLKLFDTLELQQNDYVVKCSFIELYNEELKDLLDSNSNGSSNTGFDGQFMK +KLRIFDSSTANNTTSNSASSSRSNSRNSSPRSLNDLTPKAALLRKRLRTKSLPNTIKQQYQQQQAVNSRN +NSSSNSGSTTNNASSNTNTNNGQRSSMAPNDQTNGIYIQNLQEFHITNAMEGLNLLQKGLKHRQVASTKM +NDFSSRSHTIFTITLYKKHQDELFRISKMNLVDLAGSENINRSGALNQRAKEAGSINQSLLTLGRVINAL +VDKSGHIPFRESKLTRLLQDSLGGNTKTALIATISPAKVTSEETCSTLEYASKAKNIKNKPQLGSFIMKD +ILVKNITMELAKIKSDLLSTKSKEGIYMSQDHYKNLNSDLESYKNEVQECKREIESLTSKNALLVKDKLK +SKETIQSQNCQIESLKTTIDHLRAQLDKQHKTEIEISDFNNKLQKLTEVMQMALHDYKKRELDLNQKFEM +HITKEIKKLKSTLFLQLNTMQQESILQETNIQPNLDMIKNEVLTLMRTMQEKAELMYKDCVKKILNESPK +FFNVVIEKIDIIRVDFQKFYKNIAENLSDISEENNNMKQYLKNHFFKNNHQELLNRHVDSTYENIEKRTN +EFVENFKKVLNDHLDENKKLIMQNLTTATSAVIDQEMDLFEPKRVKWENSFDLINDCDSMNNEFYNSMAA 
+TLSQIKSTVDTSSNSMNESISVMKGQVEESENAISLLKNNTKFNDQFEQLINKHNMLKDNIKNSITSTHS +HITNVDDIYNTIENIMKNYGNKENATKDEMIENILKEIPNLSKKMPLRLSNINSNSVQSVISPKKHAIED +ENKSSENVDNEGSRKMLKIE +>gi|117492|sp|P07756.1|CPSM_RAT RecName: Full=Carbamoyl-phosphate synthase [ammonia], mitochondrial; AltName: Full=Carbamoyl-phosphate synthetase I; Short=CPSase I; Flags: Precursor +MTRILTACKVVKTLKSGFGLANVTSKRQWDFSRPGIRLLSVKAQTAHIVLEDGTKMKGYSFGHPSSVAGE +VVFNTGLGGYSEALTDPAYKGQILTMANPIIGNGGAPDTTARDELGLNKYMESDGIKVAGLLVLNYSHDY +NHWLATKSLGQWLQEEKVPAIYGVDTRMLTKIIRDKGTMLGKIEFEGQSVDFVDPNKQNLIAEVSTKDVK +VFGKGNPTKVVAVDCGIKNNVIRLLVKRGAEVHLVPWNHDFTQMDYDGLLIAGGPGNPALAQPLIQNVKK +ILESDRKEPLFGISTGNIITGLAAGAKSYKMSMANRGQNQPVLNITNRQAFITAQNHGYALDNTLPAGWK +PLFVNVNDQTNEGIMHESKPFFAVQFHPEVSPGPTDTEYLFDSFFSLIKKGKGTTITSVLPKPALVASRV +EVSKVLILGSGGLSIGQAGEFDYSGSQAVKAMKEENVKTVLMNPNIASVQTNEVGLKQADAVYFLPITPQ +FVTEVIKAERPDGLILGMGGQTALNCGVELFKRGVLKEYGVKVLGTSVESIMATEDRQLFSDKLNEINEK +IAPSFAVESMEDALKAADTIGYPVMIRSAYALGGLGSGICPNKETLMDLGTKAFAMTNQILVERSVTGWK +EIEYEVVRDADDNCVTVCNMENVDAMGVHTGDSVVVAPAQTLSNAEFQMLRRTSINVVRHLGIVGECNIQ +FALHPTSMEYCIIEVNARLSRSSALASKATGYPLAFIAAKIALGIPLPEIKNVVSGKTSACFEPSLDYMV +TKIPRWDLDRFHGTSSRIGSSMKSVGEVMAIGRTFEESFQKALRMCHPSVDGFTPRLPMNKEWPANLDLR +KELSEPSSTRIYAIAKALENNMSLDEIVKLTSIDKWFLYKMRDILNMDKTLKGLNSESVTEETLRQAKEI +GFSDKQISKCLGLTEAQTRELRLKKNIHPWVKQIDTLAAEYPSVTNYLYVTYNGQEHDIKFDEHGIMVLG +CGPYHIGSSVEFDWCAVSSIRTLRQLGKKTVVVNCNPETVSTDFDECDKLYFEELSLERILDIYHQEACN +GCIISVGGQIPNNLAVPLYKNGVKIMGTSPLQIDRAEDRSIFSAVLDELKVAQAPWKAVNTLNEALEFAN +SVGYPCLLRPSYVLSGSAMNVVFSEDEMKRFLEEATRVSQEHPVVLTKFIEGAREVEMDAVGKEGRVISH +AISEHVEDAGVHSGDATLMLPTQTISQGAIEKVKDATRKIAKAFAISGPFNVQFLVKGNDVLVIECNLRA +SRSFPFVSKTLGVDFIDVATKVMIGESVDEKHLPTLEQPIIPSDYVAIKAPMFSWPRLRDADPILRCEMA +STGEVACFGEGIHTAFLKAMLSTGFKIPQKGILIGIQQSFRPRFLGVAEQLHNEGFKLFATEATSDWLNA +NNVPATPVAWPSQEGQNPSLSSIRKLIRDGSIDLVINLPNNNTKFVHDNYVIRRTAVDSGIALLTNFQVT +KLFAEAVQKARTVDSKSLFHYRQYSAGKAA +>gi|116448|sp|P04775.1|SCN2A_RAT RecName: Full=Sodium channel protein type 2 subunit alpha; AltName: Full=Sodium channel protein type II subunit alpha; AltName: Full=Voltage-gated sodium channel subunit alpha Nav1.2; AltName: Full=Sodium channel protein, brain II subunit alpha +MARSVLVPPGPDSFRFFTRESLAAIEQRIAEEKAKRPKQERKDEDDENGPKPNSDLEAGKSLPFIYGDIP +PEMVSEPLEDLDPYYINKKTFIVLNKGKAISRFSATSALYILTPFNPIRKLAIKILVHSLFNVLIMCTIL +TNCVFMTMSNPPDWTKNVEYTFTGIYTFESLIKILARGFCLEDFTFLRNPWNWLDFTVITFAYVTEFVNL +GNVSALRTFRVLRALKTISVIPGLKTIVGALIQSVKKLSDVMILTVFCLSVFALIGLQLFMGNLRNKCLQ +WPPDNSTFEINITSFFNNSLDWNGTAFNRTVNMFNWDEYIEDKSHFYFLEGQNDALLCGNSSDAGQCPEG +YICVKAGRNPNYGYTSFDTFSWAFLSLFRLMTQDFWENLYQLTLRAAGKTYMIFFVLVIFLGSFYLINLI +LAVVAMAYEEQNQATLEEAEQKEAEFQQMLEQLKKQQEEAQAAAAAASAESRDFSGAGGIGVFSESSSVA +SKLSSKSEKELKNRRKKKKQKEQAGEEEKEDAVRKSASEDSIRKKGFQFSLEGSRLTYEKRFSSPHQSLL +SIRGSLFSPRRNSRASLFNFKGRVKDIGSENDFADDEHSTFEDNDSRRDSLFVPHRHGERRPSNVSQASR +ASRGIPTLPMNGKMHSAVDCNGVVSLVGGPSALTSPVGQLLPEGTTTETEIRKRRSSSYHVSMDLLEDPS +RQRAMSMASILTNTMEELEESRQKCPPCWYKFANMCLIWDCCKPWLKVKHVVNLVVMDPFVDLAITICIV +LNTLFMAMEHYPMTEQFSSVLSVGNLVFTGIFTAEMFLKIIAMDPYYYFQEGWNIFDGFIVSLSLMELGL +ANVEGLSVLRSFRLLRVFKLAKSWPTLNMLIKIIGNSVGALGNLTLVLAIIVFIFAVVGMQLFGKSYKEC +VCKISNDCELPRWHMHHFFHSFLIVFRVLCGEWIETMWDCMEVAGQTMCLTVFMMVMVIGNLVVLNLFLA +LLLSSFSSDNLAATDDDNEMNNLQIAVGRMQKGIDFVKRKIREFIQKAFVRKQKALDEIKPLEDLNNKKD +SCISNHTTIEIGKDLNYLKDGNGTTSGIGSSVEKYVVDESDYMSFINNPSLTVTVPIALGESDFENLNTE +EFSSESDMEESKEKLNATSSSEGSTVDIGAPAEGEQPEAEPEESLEPEACFTEDCVRKFKCCQISIEEGK +GKLWWNLRKTCYKIVEHNWFETFIVFMILLSSGALAFEDIYIEQRKTIKTMLEYADKVFTYIFILEMLLK +WVAYGFQMYFTNAWCWLDFLIVDVSLVSLTANALGYSELGAIKSLRTLRALRPLRALSRFEGMRVVVNAL 
+LGAIPSIMNVLLVCLIFWLIFSIMGVNLFAGKFYHCINYTTGEMFDVSVVNNYSECQALIESNQTARWKN +VKVNFDNVGLGYLSLLQVATFKGWMDIMYAAVDSRNVELQPKYEDNLYMYLYFVIFIIFGSFFTLNLFIG +VIIDNFNQQKKKFGGQDIFMTEEQKKYYNAMKKLGSKKPQKPIPRPANKFQGMVFDFVTKQVFDISIMIL +ICLNMVTMMVETDDQSQEMTNILYWINLVFIVLFTGECVLKLISLRHYYFTIGWNIFDFVVVILSIVGMF +LAELIEKYFVSPTLFRVIRLARIGRILRLIKGAKGIRTLLFALMMSLPALFNIGLLLFLVMFIYAIFGMS +NFAYVKREVGIDDMFNFETFGNSMICLFQITTSAGWDGLLAPILNSGPPDCDPEKDHPGSSVKGDCGNPS +VGIFFFVSYIIISFLVVVNMYIAVILENFSVATEESAEPLSEDDFEMFYEVWEKFDPDATQFIEFCKLSD +FAAALDPPLLIAKPNKVQLIAMDLPMVSGDRIHCLDILFAFTKRVLGESGEMDALRIQMEERFMASNPSK +VSYEPITTTLKRKQEEVSAIVIQRAYRRYLLKQKVKKVSSIYKKDKGKEDEGTPIKEDIITDKLNENSTP +EKTDVTPSTTSPPSYDSVTKPEKEKFEKDKSEKEDKGKDIRESKK +>gi|54040727|sp|P19096.2|FAS_MOUSE RecName: Full=Fatty acid synthase; Includes: RecName: Full=[Acyl-carrier-protein] S-acetyltransferase; Includes: RecName: Full=[Acyl-carrier-protein] S-malonyltransferase; Includes: RecName: Full=3-oxoacyl-[acyl-carrier-protein] synthase; Includes: RecName: Full=3-oxoacyl-[acyl-carrier-protein] reductase; Includes: RecName: Full=3-hydroxypalmitoyl-[acyl-carrier-protein] dehydratase; Includes: RecName: Full=Enoyl-[acyl-carrier-protein] reductase; Includes: RecName: Full=Oleoyl-[acyl-carrier-protein] hydrolase +MEEVVIAGMSGKLPESENLQEFWANLIGGVDMVTDDDRRWKAGLYGLPKRSGKLKDLSKFDASFFGVHPK +QAHTMDPQLRLLLEVSYEAIVDGGINPASLRGTNTGVWVGVSGSEASEALSRDPETLLGYSMVGCQRAMM +ANRLSFFFDFKGPSIALDTACSSSLLALQNAYQAIRSGECPAALVGGINLLLKPNTSVQFMKLGMLSPDG +TCRSFDDSGSGYCRSEAVVAVLLTKKSLARRVYATILNAGTNTDGSKEQGVTFPSGEVQEQLICSLYQPA +GLAPESLEYIEAHGTGTKVGDPQELNGITRSLCAFRQAPLLIGSTKSNMGHPEPASGLAALTKVLLSLEH +GVWAPNLHFHNPNPEIPALLDGRLQVVDRPLPVRGGNVGINSFGFGGSNVHVILQPNTRQAPAPTAHAAL +PHLLHASGRTLEAVQDLLEQGRQHSQDLAFVSMLNDIAATPTAAMPFRGYTVLGVEGRVQEVQQVSTNKR +PLWFICSGMGTQWRGMGLSLMRLDSFRESILRSDEAVKPLGVKVSDLLLSTDERTFDDIVHAFVSLTAIQ +IALIDLLTSVGLKPDGIIGHSLGEVACGYADGCLSQREAVLAAYWRGQCIKDAHLPPGSMAAVGLSWEEC +KQRCPAGVVPACHNSEDTVTISGPQAAVNEFVEQLKQEGVFAKEVRTGGLAFHSYFMEGIAPTLLQALKK +VIREPRPRSARWLSTSIPEAQWQSSLARTSSAEYNVNNLVSPVLFQEALWHIPEHAVVLEIAPHALLQAV +LKRGVKSSCTIIPLMKRDHKDNLEFFLTNLGKVHLTGINVNPNALFPPVEFPAPRGTPLISPHIKWDHSQ +TWDVPVAEDFPNGSSSSSATVYSIDASPESPDHYLVDHCIDGRVIFPGTGYLCLVWKTLARSLGLSLEET +PVVFENVSFHQATILPKTGTVALEVRLLEASHAFEVSDTGNLIVSGKVYLWEDPNSKLFDHPEVPTPPES +ASVSRLTQGEVYKELRLRGYDYGPQFQGICEATLEGEQGKLLWKDNWVTFMDTMLQVSILGSSQQSLQLP +TRVTAIYIDPATHRQKVYRLKEDTQVADVTTSRCLGITVSGGIHISRLQTTATSRRQQEQLVPTLEKFVF +TPHMEAECLSESTALQKELQLCKGLARALQTKATQQGLKAAMLGQEDPPQHGLPRLLAAACQLQLNGNLQ +LELGEALAQERLLLPEDPLISGLLNSQALKACVDTALENLSTLKMKVAEVLAGEGHLYSRIPALLNTQPM +LQLEYTATDRHPQALKDVQTKLQQHDVAQGQWNPSDPAPSSLGALDLLVCNCALATLGDPALALDNMVAA +LKEGGFLLVHTVLKGHALGETLACLPSEVQPAPSLLSQEEWESLFSRKALHLVGLKRSFYGTALFLCRRA +IPQEKPIFLSVEDTSFQWVDSLKSTLATSSSQPVWLTAMDCPTSGVVGLVNCLRKEPGGHRIRCILLSNL +SNTSHAPKLDPGSPELQQVLKHDLVMNVYRDGAWGAFRHFQLEQDKPKEQTAHAFVNVLTRGDLASIRWV +SSPLKHTQPSSSGAQLCTVYYASLNFRDIMLATGKLSPDAIPGKWASRDCMLGMEFSGRDRCGRRVMGLV +PAEGLATSVLLSSDFLWDVPSSWTLEEAASVPVVYTTAYYSLVVRGRIQRGETVLIHSGSGGVGQAAISI +ALSLGCRVFTTVGSAEKRAYLQARFPQLDDTSFANSRDTSFEQHVLLHTGGKGVDLVLNSLAEEKLQASV +RCLAQHGRFLEIGKFDLSNNHPLGMAIFLKNVTFHGILLDALFEEANDSWREVAALLKAGIRDGVVKPLK +CTVFPKAQVEDAFRYMAQGKHIGKVLVQVREEEPEAVLPGAQPTLISAISKTFCPAHKSYIITGGLGGFG +LELARWLVLRGAQRLVLTSRSGIRTGYQAKHIREWRRQGIQVLVSTSNVSSLEGARALIAEATKLGPVGG +VFNLAMVLRDAMLENQTPELFQDVNKPKYNGTLNLDRATREACPELDYFVAFSSVSCGRGNAGQTNYGFA +NSTMERICEQRRHDGLPGLAVQWGAIGDVGIVLEAMGTNDTVIGGTLPQRISSCMEVLDLFLNQPHAVLS +SFVLAEKKAVAHGDGDTQRDLVKAVAHILGIRDLAGINLDSTLADLGLDSLMGVEVRQILEREHDLVLPM +REVRQLTLRKLQEMSSKTDSATDTTAPKSRSDTSLKQNQLNLSTLLVNPEGPTLTQLNSVQSSERPLFLV 
+HPIEGSTTVFHSLAAKLSVPTYGLQCTQAAPLDSIPNLAAYYIDCIKQVQPEGPYRIAGYSFGACVAFEM +CSQLQAQQGPAPTHNNLFLFDGSHTYVLAYTQSYRAKMTPGCEAEAEAEALCFFIKQFLDVEHSKVLEAL +LPLKSLEDRVAASVDLITKSHHSLDRRELSFAAVSFYHKLRAADQYKPKAKYHGNVTLLRAKTGGTYGED +LGADYNLSQVCDGKVSVHIIEGDHRTLLEGSGLESIINIIHSSLAEPRVSVREG +>gi|48429221|sp|P28167.2|ZFH2_DROME RecName: Full=Zinc finger protein 2; AltName: Full=Zinc finger homeodomain protein 2 +MSSFDVETFNGKIVYNLDGSAHIIATDNTNGGGSGSGQNCYGSTTNSLKNLSKDKGRGQEEKDIEHPSQY +HREQSDNKRQEEAVDNRPGVESLGSACYKSSPKIHSFRVVSAQDANSTCQDQIRAFKIQKPILMCFICKL +SFGNVKSFSLHANTEHRLNLEELDQQLLNREYSSAIIQRNMDEKPQISFLQPLANNDASADTNDTEKLQT +ATEGSDATLPSSPQPVFRNVSELEPENKQETEQNRLLNQDREQEPESDQHTSSSKMAAPSAYIPLSSPKV +AGKLTVKFGSLNSATAKTNNLSKVSSTSSPPSTYASGEVLSPSTDNISNHKSTHCNQETEPPSSSSSEVE +MKIGSMSTSPQTNDSDVPCSGFLQMQHMTTGGAYTPQVSSFHASLAALAANESNDNRVKLITEFLQQQLQ +QHQSSLFPSPCPDHPDLNGVDCKTCELLDIQQRSKSPSSSHHQFSQSLPQLQIQSQPQQTPHRSPCSNSV +ALPVSPSASSVASVGNASTATSSFTIGACSEHINGRPQGVDCARCEMLLNSARLNSGVQMSTRNSCKTLK +CPQCNWHYKYQETLEIHMREKHPDGESACGYCLAGQQHPRLARGESYSCGYKPYRCEICNYSTTTKGNLS +IHMQSDKHLNNMQELNSSQNMVAAAAAAAVTGKLLLSSSSPQVTAACPSNSGSGAGSGSSNIVGGTASLS +GNATPSVTGANSSNANAGSNTNNAGTKPKPSFRCDICSYDTSVARNLRIHMTSEKHTHNMAVLQNNIKHI +QAFNFLQQQQQSGTGNIASHSSGSFMPEVALADLAYNQALMIQLLHQQQQHQQSANTKLSPSSSPVSTPD +QFSFSPKPIKLNHGTGAAMGIGMAMGMGMSHSNEVSCELSGDPHPLTKTDKWPMAFYSCLVCDCYSTNNL +DDLNQHLLLDRSRQSSSASSEIMVIHNNNYICRLCNYKTNLKANFQLHSKTDKHLQKLNFINHIREGGPQ +NEYKMQYQQQQLAANVVQLKCNCCDFHTNSIQKLSLHTQQMRHDTMRMIFQHLLYIVQQSEMHNKSSGSA +EDDPQCACPDEDQQLQLQSSKKLLLCQLCNFTAQNIHEMVQHVKGIRHLQVEQFICLQRRSENQEIPALN +EVFKVTEWVMENEDVSLAPGLNLARTTTNDATTDASYAAASSAAVPAIPDVSMFSPTSPSSCATSCDKNL +SQIVLPNVNNLGSGVPTTVFKCNLCEYFVQSKSEIAAHIETEHSCAESDEFITIPTNTAALQAFQTAVAA +AALAAVHQRCAVINPPTQDTVDEDKDLDTNVSDGPVGIKQERLEQEVDRTTSMDVTKDLASQATDFGAPE +SPKVAETEVGVQCPLCLENHFREKQYLEDHLTSVHSVTRDGLSRLLLLVDQKALKKESTDIACPTDKAPY +ANTNALERAPTPIENTCNVSLIKSTSANPSQSVSLQGLSCQQCEASFKHEEQLLKHAQQNQHFSLQNGEY +LCLAASHISRPCFMTFRTIPTMISHFQDLHMSLIISERHVYKYRCKQCSLAFKTQEKLTTHMLYHSMRDA +TKCSFCQRNFRSTQALQKHMEQAHAEDGTPSTRTNSPQTPMLSTEETHKHLLAESHAVEREVSGSDVSPI +ELETHLNKETRHLSPTPMSLDSQSHQKHLATFAALLKQQQCNSDAGGLHPEALSMSTGEMPPQLQGLQNL +QHIQQHFGAVAAAAGLPINPVDMLNIMQFHHLMSLNFMNLAPPLVFGANAAGNAVSGPSALNNSITTSTA +TSASGLGDTHLTSGVSSIPVDSGKATAVPPQTQLNANANSQQLASNQKRARTRITDDQLKILRAHFDINN +SPSEESIMEMSQKANLPMKVVKHWFRNTLFKERQRNKDSPYNFNNPPSTTLNLEEYERTGQAKVTPLNDT +CSVAVTGPMTSSTISLPPSGNINLSSKENATSKVLAAGKANASGPVTFSATVPVSTPLSRPESTNSSGNI +SDYIGNNIFFGQLGSKEQILPYSLDGQIKSEPQDDMIGATDFAYQTKQHSSFSFLKQQQDLVDPPEQCLT +NQNADTAQDQSLLAGSSLASNCQSQQQINIFETKSESGSSDVLSRPPSPNSGAAGNVYGSMNDLLNQQLE +NMGSNMGPPKKMQIVGKTFEKNVAPMVTSGSVSTQFESNSSNSSSSSSSTSGGKRANRTRFTDYQIKVLQ +EFFENNSYPKDSDLEYLSKLLLLSPRVIVVWFQNARQKQRKIYENQPNNTLFENEETKKQNINYACKKCN +LVFQRYYELIRHQKNHCFKEENNKKSAKAQIAAAQIAQNLSSEDSNSSMDIHHVGICPPGSAVASHTLST +PGSAAPLPGQYTQHSFGALPSPQHLFAKSSSLTDFSPSTTPTPPQRERSNSLDQIQRPPKFDCDKCELNF +NQLEKLREHQLLHLMNPGNICSDVGQNSNPEANFGPFGSILQSLQQAAAQQQQQHHQQPPTKKRKYSDCS +SNADEMQSLSELEASQKKHEYLYKYFMQNETSQEVKQQFLMQQQQKKLEQGNECDFELDFLTNFYQQNEL +KKVSNYDFLLQYYRTHEEAKSSQQHTFSSSKKPTIEFLLQYYQLNESKKFFQLVASPQIIPDVPGYKPSL +RIPKSTSDEAPYIGETSLEQATELQREKQDEQLRIDRPSEENDLSMNKNKVENINNNNINVDQSNLTETN +GGVPSVETKEECTQESSLIAMDDENKYLCTRSKQKDDKEKSHYLHNLEDFLDATMIENNSQTLTFNDDEK +ACQKDELTQNSNAIEKRSSVSPVNVSSKQNKRLRTTILPEQLNFLYECYQSESNPSRKMLEEISKKVNLK +KRVVQVWFQNSRAKDKKSRNQRHYAHISDDNSYDGSSGKEVYSDLRSNGITVDTDLETNLQDCQLCQVTQ +VNIRKHAFSVEHISKMKKLLEQTTELYAQSNGSGSEDNDSDREKRFYNLSKAFLLQHVVTNATSHAIHTA +RQDSDVIAEGNCILNYDTNGGDSKSHVQHNLPNEVVSEDARKIAGNQELMQQLFNRNHITVIGGK +>gi|182676519|sp|P0C6B8.1|SVEP1_RAT RecName: Full=Sushi, von 
Willebrand factor type A, EGF and pentraxin domain-containing protein 1; Flags: Precursor +MWTRLAFCCWALALVSGWTNFQPMAPSLNFSFRLFPEASPGALGRLAVPPRSGEEEAVGSKVERLGRTFR +SRVRRLRELSDRLELVFLVDESSSVGQTNFLNELKFVRKLLSDFPVVSTATRVAIVTFSSKNNVVARVDY +ISTSRAHQHKCALLSREIPAITYRGGGTYTMGAFQQAAQILRHSRENSTKVIFLITDGYSNGGDPRPIAA +SLRDFGVEIFTFGIWQGNIRELNDMASTPKEEHCYLLHSFEEFEALARRALHEDLPSGSFIQEDMAHCSY +LCEAGRDCCDRMASCKCGTHTGQFECICEKGYYGKGLQYECTACPPGTYKPEASPGGISTCIPCPDENHT +SPPGSTAPEDCVCREGYQRSGQTCEVVHCPALKPPENGFFIQNTCKNHFNAACGVRCRPGFDLVGSSIHL +CQPNGLWSGTESFCRVRTCPHLRQPKHGHISCSTVEMSYNTVCLVTCNEGYRLEGHAKLTCQGNAQWDGT +EPRCVERHCATFQKPKGVIISPPSCGKQPAKPGMICQLGCRQGYILSGIREVRCATSGKWSARVQTAVCK +DVEAPQISCPNDIKAKTEGQQDSANVTWQVPTAKDNSGEKVSVHVHPAFSPPYLFPIGEVAITYTATDSS +GNQASCTFYIKVIDVEPPVIDWCRSPPPIQVVEKEHPASWDEPQFSDNSGAELVITSSHTQGDLFPHGET +VVWYTATDPSGNNRTCDIHIVIKGSPCEVPFTPVNGDFICAQDSAGVNCSLTCREGYDFTEGSTEKYYCA +FEDGIWRPPYSTEWPDCAIKRFANHGFKSFEMLYKTTRCDDMDLFKKFSAAFETTLGKMVPSFCSDADDI +DCRLEDLTKKYCIEYNYNYENGFAIGPGGWGAGNRLDYSYDHFLDVVQETPADVGKTRSSRIKRTVPLSD +PQIQLIFNITASVPLPEERNDTVELENQQRLIRTLETITNRLKSTLNKGPMYSFQLASETVVADSNSLET +EKAFLFCRPGSVLRGRMCVNCPLGTSYSLEHSTCESCLMGSYQDEEGQLECKLCPPRTHTEYLHSRSISE +CKAQCKQGTYSSSGLETCESCPLGTYQPDFGSRSCLPCPETTTTVKRGAVDISACGVPCPVGEFSRSGLT +PCYPCPRDYYQPNAGKSFCLACPFYGTTTITGATSITDCSSFSSTFSAAEESIVPLAAPGPTQNKYEVFH +ECFLNPCHNSGTCQQLGRGYVCLCPPGYTGLKCETDIDECSSLPCLNGGICRDKVGGFTCECSSGYTGQI +CEENINECSSSPCLNKGTCTDGLASYRCTCVSGYVGVHCETDVNECQSSPCLNNAVCKDQVGGFSCKCPP +GFLGTRCEKNVDECLSQPCQNGATCKDGANSFRCQCPAGFTGPHCELNINECQSNPCRNQATCVDELNSY +SCKCRPGFSGRRCETEQPSGFNLDFEVSGIYGYVLLDGVLPTLHAITCAFWMKSSDVINYGTPISYALEG +NKDNTFLLTDYNGWVLYVNGKEKITNCPSVNDGIWHHIAITWTSTGGAWRVYIDGELSDSGTGLSVGKAI +PGGGALVLGQEQDKKGEGFNPAESFVGSISQLNLWDYVLSPQQVKSLASSCPEELSRGNVLAWPDFVSGI +TGKVKVDSSSIFCSDCPSLEGSVPHLRPASGDRKPGSKVSLFCDPGFQMVGNPVQYCLNQGQWSQPLPHC +ERIRCGLPPTLENGFYSAEDLHAGSTVTYQCTSGYYLLGDSRMFCTDNGSWNGISPSCLDVDECAVGSDC +SEHASCLNTNGSYICSCKPPYTGDGKNCAEPVKCKAPENPENGHSLGKIYSVGAEVTFSCEEGHQLVGVR +KITCLESGEWDHLRPSCEAISCGAPPVPENGGVDGSAFTYGSKVRYRCDKGYTLAGDEESACLASGSWSH +SSPVCELVKCSQPENINNGKYILSGLTYLSIASYSCEDGYSLQGPSLIECTASGSWDRAPPSCQLVSCGE +PPMVKDALTTGSNFTFGNMVTYTCKEGYTLAGPDTIICQANGKWNSSNHQCLAVSCDEPPNVDHASPETA +HRLFGDTAFYYCADGYSLADNSQLICNAQGNWVPPEGQAVPRCIAHFCEKPPSVSYSILESVSKAKFAAG +SVVSFKCMEGFVLNTSAKIECLRGGQWSPSPLSVQCIPVRCGEPPSITNGYPSGTNYSFGAVVAYSCHKG +FYIKGEKKSTCEATGQWSRPLPTCHPVSCNEPPKVENGFLEHTTGRTFESEARFQCNPGYKAVGSPVFVC +QANRHWHSDAPLSCTPLNCGKPPPIQNGFLRGESFEVGSKVQFVCNEGYELVGDNSWTCQKSGKWSKKPS +PKCVPTKCAEPPLLENQLVLKELTSEVGVMTISCKEGHALQGPSVLKCLPSGQWNGSFPVCKLVLCQSPP +LIPFGVPASSGALHFGSTVKYLCVDGFFLRGNPIILCQVDGTWSSPLPECVPVECPQPEEILNGIIHVQG +LAYLSTTLYTCKPGFELVGNTTTLCGENGQWLGGKPMCRPIECPEPKEILNGQFSSVSFQYGQTITYSCD +RGFRLEGPKSLTCLETGNWDMDAPSCNAIHCSDPQPIENGFVEGADYRYGAMIIYSCFPGFQVVGHAMQT +CEETGWSSSSPTCVPIDCGLPPHIDFGDCTRVSDGQGYFVQEDDMMEVPYLTPHPQHLEATAKASEITEE +SLVPHASQFLYGTTVSYRCEPGYELLGIPVLVCQEDGTWNGTAPSCISIECDLPVAPENGFLHFTQTTMG +SAAQYSCKPGHVLEGSHLRLCLQNKQWSGTVPRCEVISCSEPNPLGNGSIKGNDYSYLGVLHYECDSGYV +LNGTEKRTCQENKQWDGHEPVCLPVDCGSPPVPTNGQVTGEEYTFQKEIAYSCGEGFILEGARSRVCLTN +GSWSGTTPSCVPVRCPAPPQVANGVTDGLDYGFKKEVTFHCLEGYVLQGTPKLTCQSNGTWDAEVPICKP +ATCGPPADLPQGFPNGFSFFHGGHIQYQCFTGYKLHGNPSRRCLPDGSWSGSTPSCLPCTCSTPIIQQGT +VNATDLGCGKTVQIECFKGFKLLGLPEITCDANGQWSDFPLCEHADCGPLPTVPNGIVIKGSPSEDNVVT +YSCRPGYIIQGSSDLICTEKGIWSEPYPTCEPVSCGPPPTVANAVATGEAHTYESKVKLRCLEGYVVDTD +TDTFTCQQDGRWFPERINCSPKTCPVPSNRTRIRVHGDDFQVNRQVSVSCTEGFTYDGADRSTCQPDGTW +EPPLSEESCIPVVCGQPESPEHGFVVGSEYSFGSTVVYQCDPGYELEGNRERVCQENRQWSGRVAVCRES +RCEAPAEFPNGKAVLENTTSGPSLLFSCHRGYTLEGPPEAHCTANGTWSHLAPLCKPNPCPVPFVIPENA 
+LLSEREFYVNQNVSIKCREGFLLKGNGIITCNPDETWTQTNARCEKISCGPPTHVENAIARGVHYQYGDM +VTFSCYSGYMLEGSLRSVCLENGTWTPPPICRAVCRFPCQNGGVCQRPNACSCPDGWMGRLCEEPICILP +CLNGGRCVAPYRCDCPAGWTGSRCHTATCQSPCLNGGKCVRPNRCHCLSSWTGHDCSRKRRSGL +>gi|84028206|sp|P20930.3|FILA_HUMAN RecName: Full=Filaggrin +MSTLLENIFAIINLFKQYSKKDKNTDTLSKKELKELLEKEFRQILKNPDDPDMVDVFMDHLDIDHNKKID +FTEFLLMVFKLAQAYYESTRKENLPISGHKHRKHSHHDKHEDNKQEENKENRKRPSSLERRNNRKGNKGR +SKSPRETGGKRHESSSEKKERKGYSPTHREEEYGKNHHNSSKKEKNKTENTRLGDNRKRLSERLEEKEDN +EEGVYDYENTGRMTQKWIQSGHIATYYTIQDEAYDTTDSLLEENKIYERSRSSDGKSSSQVNRSRHENTS +QVPLQESRTRKRRGSRVSQDRDSEGHSEDSERHSGSASRNHHGSAWEQSRDGSRHPRSHDEDRASHGHSA +DSSRQSGTRHAETSSRGQTASSHEQARSSPGERHGSGHQQSADSSRHSATGRGQASSAVSDRGHRGSSGS +QASDSEGHSENSDTQSVSGHGKAGLRQQSHQESTRGRSGERSGRSGSSLYQVSTHEQPDSAHGRTGTSTG +GRQGSHHEQARDSSRHSASQEGQDTIRGHPGSSRGGRQGSHHEQSVNRSGHSGSHHSHTTSQGRSDASHG +QSGSRSASRQTRNEEQSGDGTRHSGSRHHEASSQADSSRHSQVGQGQSSGPRTSRNQGSSVSQDSDSQGH +SEDSERWSGSASRNHHGSAQEQSRDGSRHPRSHHEDRAGHGHSADSSRKSGTRHTQNSSSGQAASSHEQA +RSSAGERHGSRHQLQSADSSRHSGTGHGQASSAVRDSGHRGSSGSQATDSEGHSEDSDTQSVSGHGQAGH +HQQSHQESARDRSGERSRRSGSFLYQVSTHKQSESSHGWTGPSTGVRQGSHHEQARDNSRHSASQDGQDT +IRGHPGSSRRGRQGSHHEQSVDRSGHSGSHHSHTTSQGRSDASRGQSGSRSASRTTRNEEQSRDGSRHSG +SRHHEASSHADISRHSQAGQGQSEGSRTSRRQGSSVSQDSDSEGHSEDSERWSGSASRNHRGSAQEQSRH +GSRHPRSHHEDRAGHGHSADSSRQSGTPHAETSSGGQAASSHEQARSSPGERHGSRHQQSADSSRHSGIP +RRQASSAVRDSGHWGSSGSQASDSEGHSEESDTQSVSGHGQDGPHQQSHQESARDWSGGRSGRSGSFIYQ +VSTHEQSESAHGRTRTSTGRRQGSHHEQARDSSRHSASQEGQDTIRAHPGSRRGGRQGSHHEQSVDRSGH +SGSHHSHTTSQGRSDASHGQSGSRSASRQTRKDKQSGDGSRHSGSRHHEAASWADSSRHSQVGQEQSSGS +RTSRHQGSSVSQDSDSERHSDDSERLSGSASRNHHGSSREQSRDGSRHPGFHQEDRASHGHSADSSRQSG +THHTESSSHGQAVSSHEQARSSPGERHGSRHQQSADSSRHSGIGHRQASSAVRDSGHRGSSGSQVTNSEG +HSEDSDTQSVSAHGQAGPHQQSHKESARGQSGESSGRSRSFLYQVSSHEQSESTHGQTAPSTGGRQGSRH +EQARNSSRHSASQDGQDTIRGHPGSSRGGRQGSYHEQSVDRSGHSGYHHSHTTPQGRSDASHGQSGPRSA +SRQTRNEEQSGDGSRHSGSRHHEPSTRAGSSRHSQVGQGESAGSKTSRRQGSSVSQDRDSEGHSEDSERR +SESASRNHYGSAREQSRHGSRNPRSHQEDRASHGHSAESSRQSGTRHAETSSGGQAASSQEQARSSPGER +HGSRHQQSADSSTDSGTGRRQDSSVVGDSGNRGSSGSQASDSEGHSEESDTQSVSAHGQAGPHQQSHQES +TRGQSGERSGRSGSFLYQVSTHEQSESAHGRTGPSTGGRQRSRHEQARDSSRHSASQEGQDTIRGHPGSS +RGGRQGSHYEQSVDSSGHSGSHHSHTTSQERSDVSRGQSGSRSVSRQTRNEKQSGDGSRHSGSRHHEASS +RADSSRHSQVGQGQSSGPRTSRNQGSSVSQDSDSQGHSEDSERWSGSASRNHLGSAWEQSRDGSRHPGSH +HEDRAGHGHSADSSRQSGTRHTESSSRGQAASSHEQARSSAGERHGSHHQLQSADSSRHSGIGHGQASSA +VRDSGHRGYSGSQASDSEGHSEDSDTQSVSAQGKAGPHQQSHKESARGQSGESSGRSGSFLYQVSTHEQS +ESTHGQSAPSTGGRQGSHYDQAQDSSRHSASQEGQDTIRGHPGPSRGGRQGSHQEQSVDRSGHSGSHHSH +TTSQGRSDASRGQSGSRSASRKTYDKEQSGDGSRHSGSHHHEASSWADSSRHSLVGQGQSSGPRTSRPRG +SSVSQDSDSEGHSEDSERRSGSASRNHHGSAQEQSRDGSRHPRSHHEDRAGHGHSAESSRQSGTHHAENS +SGGQAASSHEQARSSAGERHGSHHQQSADSSRHSGIGHGQASSAVRDSGHRGSSGSQASDSEGHSEDSDT +QSVSAHGQAGPHQQSHQESTRGRSAGRSGRSGSFLYQVSTHEQSESAHGRTGTSTGGRQGSHHKQARDSS +RHSTSQEGQDTIHGHPGSSSGGRQGSHYEQLVDRSGHSGSHHSHTTSQGRSDASHGHSGSRSASRQTRND +EQSGDGSRHSGSRHHEASSRADSSGHSQVGQGQSEGPRTSRNWGSSFSQDSDSQGHSEDSERWSGSASRN +HHGSAQEQLRDGSRHPRSHQEDRAGHGHSADSSRQSGTRHTQTSSGGQAASSHEQARSSAGERHGSHHQQ +SADSSRHSGIGHGQASSAVRDSGHRGYSGSQASDNEGHSEDSDTQSVSAHGQAGSHQQSHQESARGRSGE +TSGHSGSFLYQVSTHEQSESSHGWTGPSTRGRQGSRHEQAQDSSRHSASQDGQDTIRGHPGSSRGGRQGY +HHEHSVDSSGHSGSHHSHTTSQGRSDASRGQSGSRSASRTTRNEEQSGDGSRHSGSRHHEASTHADISRH +SQAVQGQSEGSRRSRRQGSSVSQDSDSEGHSEDSERWSGSASRNHHGSAQEQLRDGSRHPRSHQEDRAGH +GHSADSSRQSGTRHTQTSSGGQAASSHEQARSSAGERHGSHHQQSADSSRHSGIGHGQASSAVRDSGHRG +YSGSQASDNEGHSEDSDTQSVSAHGQAGSHQQSHQESARGRSGETSGHSGSFLYQVSTHEQSESSHGWTG +PSTRGRQGSRHEQAQDSSRHSASQYGQDTIRGHPGSSRGGRQGYHHEHSVDSSGHSGSHHSHTTSQGRSD 
+ASRGQSGSRSASRTTRNEEQSGDSSRHSVSRHHEASTHADISRHSQAVQGQSEGSRRSRRQGSSVSQDSD +SEGHSEDSERWSGSASRNHRGSVQEQSRHGSRHPRSHHEDRAGHGHSADRSRQSGTRHAETSSGGQAASS +HEQARSSPGERHGSRHQQSADSSRHSGIPRGQASSAVRDSRHWGSSGSQASDSEGHSEESDTQSVSGHGQ +AGPHQQSHQESARDRSGGRSGRSGSFLYQVSTHEQSESAHGRTRTSTGRRQGSHHEQARDSSRHSASQEG +QDTIRGHPGSSRRGRQGSHYEQSVDRSGHSGSHHSHTTSQGRSDASRGQSGSRSASRQTRNDEQSGDGSR +HSWSHHHEASTQADSSRHSQSGQGQSAGPRTSRNQGSSVSQDSDSQGHSEDSERWSGSASRNHRGSAQEQ +SRDGSRHPTSHHEDRAGHGHSAESSRQSGTHHAENSSGGQAASSHEQARSSAGERHGSHHQQSADSSRHS +GIGHGQASSAVRDSGHRGSSGSQASDSEGHSEDSDTQSVSAHGQAGPHQQSHQESTRGRSAGRSGRSGSF +LYQVSTHEQSESAHGRAGPSTGGRQGSRHEQARDSSRHSASQEGQDTIRGHPGSRRGGRQGSYHEQSVDR +SGHSGSHHSHTTSQGRSDASHGQSGSRSASRETRNEEQSGDGSRHSGSRHHEASTQADSSRHSQSGQGES +AGSRRSRRQGSSVSQDSDSEAYPEDSERRSESASRNHHGSSREQSRDGSRHPGSSHRDTASHVQSSPVQS +DSSTAKEHGHFSSLSQDSAYHSGIQSRGSPHSSSSYHYQSEGTERQKGQSGLVWRHGSYGSADYDYGESG +FRHSQHGSVSYNSNPVVFKERSDICKASAFGKDHPRYYATYINKDPGLCGHSSDISKQLGFSQSQRYYYY +E +>gi|114062|sp|P08519.1|APOA_HUMAN RecName: Full=Apolipoprotein(a); Short=Apo(a); Short=Lp(a); Flags: Precursor +MEHKEVVLLLLLFLKSAAPEQSHVVQDCYHGDGQSYRGTYSTTVTGRTCQAWSSMTPHQHNRTTENYPNA +GLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQ +ECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVR +WEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQAW +SSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVP +SLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCR +NPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQ +SYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQ +CSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSH +SRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQ +APTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAP +YCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYST +TVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTA +VAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYP +NAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPG +VQECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPG +VRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQ +AWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTP +VPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNY +CRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGN +GQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNL +TQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPH +SHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPS +EQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVA +APYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTY +STTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEG +TAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEY +YPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQR +PGVQECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRD +PGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRT +CQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTV +TPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIM +NYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYH +GNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYC 
+NLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQAWSSMT +PHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEA +PSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDA +VAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRG +TYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDA +EGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTP +EYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTE +QRPGVQECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYT +RDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTG +RTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPP +TVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGL +IMNYCRNPDAVAAPYCYTRDPGVRWEYCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQEC +YHGNGQSYRGTYSTTVTGRTCQAWSSMTPHSHSRTPEYYPNAGLIMNYCRNPDAVAAPYCYTRDPGVRWE +YCNLTQCSDAEGTAVAPPTVTPVPSLEAPSEQAPTEQRPGVQECYHGNGQSYRGTYSTTVTGRTCQAWSS +MTPHSHSRTPEYYPNAGLIMNYCRNPDPVAAPYCYTRDPSVRWEYCNLTQCSDAEGTAVAPPTITPIPSL +EAPSEQAPTEQRPGVQECYHGNGQSYQGTYFITVTGRTCQAWSSMTPHSHSRTPAYYPNAGLIKNYCRNP +DPVAAPWCYTTDPSVRWEYCNLTRCSDAEWTAFVPPNVILAPSLEAFFEQALTEETPGVQDCYYHYGQSY +RGTYSTTVTGRTCQAWSSMTPHQHSRTPENYPNAGLTRNYCRNPDAEIRPWCYTMDPSVRWEYCNLTQCL +VTESSVLATLTVVPDPSTEASSEEAPTEQSPGVQDCYHGDGQSYRGSFSTTVTGRTCQSWSSMTPHWHQR +TTEYYPNGGLTRNYCRNPDAEISPWCYTMDPNVRWEYCNLTQCPVTESSVLATSTAVSEQAPTEQSPTVQ +DCYHGDGQSYRGSFSTTVTGRTCQSWSSMTPHWHQRTTEYYPNGGLTRNYCRNPDAEIRPWCYTMDPSVR +WEYCNLTQCPVMESTLLTTPTVVPVPSTELPSEEAPTENSTGVQDCYRGDGQSYRGTLSTTITGRTCQSW +SSMTPHWHRRIPLYYPNAGLTRNYCRNPDAEIRPWCYTMDPSVRWEYCNLTRCPVTESSVLTTPTVAPVP +STEAPSEQAPPEKSPVVQDCYHGDGRSYRGISSTTVTGRTCQSWSSMIPHWHQRTPENYPNAGLTENYCR +NPDSGKQPWCYTTDPCVRWEYCNLTQCSETESGVLETPTVVPVPSMEAHSEAAPTEQTPVVRQCYHGNGQ +SYRGTFSTTVTGRTCQSWSSMTPHRHQRTPENYPNDGLTMNYCRNPDADTGPWCFTMDPSIRWEYCNLTR +CSDTEGTVVAPPTVIQVPSLGPPSEQDCMFGNGKGYRGKKATTVTGTPCQEWAAQEPHRHSTFIPGTNKW +AGLEKNYCRNPDGDINGPWCYTMNPRKLFDYCDIPLCASSSFDCGKPQVEPKKCPGSIVGGCVAHPHSWP +WQVSLRTRFGKHFCGGTLISPEWVLTAAHCLKKSSRPSSYKVILGAHQEVNLESHVQEIEVSRLFLEPTQ +ADIALLKLSRPAVITDKVMPACLPSPDYMVTARTECYITGWGETQGTFGTGLLKEAQLLVIENEVCNHYK +YICAEHLARGTDSCQGDSGGPLVCFEKDKYILQGVTSWGLGCARPNKPGVYARVSRFVTWIEGMMRNN +>gi|81894378|sp|Q7TMA5.1|APOB_RAT RecName: Full=Apolipoprotein B-100; Short=Apo B-100; Contains: RecName: Full=Apolipoprotein B-48; Short=Apo B-48; Flags: Precursor +MGPQRPALRAPLLLLFLLLFLDTSVWAQDATRFKHLRKYVYSYEAESSSGVRGTADSRSATKINCKVELE +VPQVCTLIMRTSQCTLKEVYGFNPEGKALMKKTKNSEEFASAMSRYELKLAFPEGKRVALYPDLGEPNYI +LNIKRGIISALLVPPETEEDKQVLFQDTVYGNCSTQVTVNSRKGTVATEMSTERNLQHCDGFQPISTSVS +PLALIKGLVRPLSTLISSSQSCQYTLEPKRKHVSEAICNEQHLFLPFSYKNKYGIMTHVTQKLSLEDTPK +INSRFFRGGINQVGLAFESTKSTSPPKQADAVLKTLQELKKLSISEQNAQRANLFHKLVTELRGLSGEAI +TSLLPQLIEVSSPITLQALIQCGQPECYTHILQWLKTEKAHPLLIDIVTYLMALIPNPSVQRLQEIFNTA +KELQSRATLYALSHAVNSYYAIMDHSRSPVLEDIAGYLMKQIDNECMGDEDRTFLILRVIGNMGRTMERV +MPALKSSVLNCVRSTKPSLQIQKAALQALRKMEMGDEVRTILFDTFVNDVAPVEKRLAAYLLLMRSPSSS +DINKIAKLLQWEQSEQVKNFVASHIANILNSEELYVQDLKNLIKNALVNSRLPTIMDFRKFSRNYQISKS +VSIPLFDPVSAKIEGNLVFDPSSYLPKESMLKTTLTVFGIASLDLFEIGLEGKGFEPTLEALFGKQGFFP +DSVNKALYWVNGQVPDRVSKVLVDHFGYTKDDKHEQDMVNGIMPIVDKLIKELKSKEIPEARAYLRILGK +ELGFVRLQDLQVLGKLLLNGAQTFRGVPQMIVQAIREGSKDDLFLHYIFMENAFELPTGVGLQLQVSSSG +VFTPGIKAGVRLELANIQAELVAKPSVSLEFVTNMGIIIPDFAKSGVQMNTNFFHESGLEARVALKAGQL +KVIIPSPKRPVKLFSGSNTLHLVSTTKTEVIPPLIENRKSWSTCKPFFTGMNYCTTGAYSNASSTESASY +YPLTGDTRYELELKPTGEVEQYSASATYELLKEDKSLVDTLKFLVQAEGVQQSEATAMFKYNRRSRTLSS +EVLIPGFDVNFGTILRVNDESSKDKNTYKLILDIQNKKITEVSVVGHVSYDKKGDGKVKGVVSIPRLQAE 
+ARSEVHTHWSPTKLLFQMDSSATAYGSTISKRVAWRYDNEKIEFDWNTGTNVDTKKVASNFPVDLSRYPR +MVHEYANGLLDHRVPQTDMTFRHMGSKLIVDHLNGLSELNLPKVGLPDFHIPDNLFLKTDGRVKYTLNKN +RIEIDIPLPLGGKSSKDLKVPESVRTPALNFKSVGFHLPSQEVQIPTFTIPKTHQLQVPLLGILDLSTNV +YSNLYNWSVSYTGGNTSRDHFSLQAQYRMKADSVVDLFSYSVQGSGETTYDSKSTFTLSCDGSLHHKFLD +SKFKVSHVEKFGNNPVSKGLLTFETSSALGPQMSATVQLDSKKKQHLYVKDIKVDGQFRVFSLYAQGEYG +LSYERDSMTGQMSGESNMKFNSTYFQGTNQIVGMYQDGMLSVTSTSDLQDGIFKNTASLKYENYELTLKS +DSSGQYENFAASNKLDMTFSKQSALLRSEHQANYKSLRLVTLLSGSLTSQGVELNADILGTDKINTGAHK +STLKIAQDGVSTSATTNLKYSPLLLENELNAELGLSGASMKLSTSGRFKEHHAKFSLDGRAALTEVSLGS +IYQAMILGADSKNVFNFKLSREGLKLSNDMMGSYAEMKLDHTHSLRISGLSLDFFSKMDNIYSGDKFYKQ +NFNLQLQPYSFGITLSNDLKYDALVLTNNGRLRLEPLKLNVGGNFKGTYQNNELKHIYTISYTDLVVASY +RADTVATVQGVEFSHRLNADIEGLASSVDVTTSYSSDPLHFNNVFRFVLAPFTLGVDTHTSGDGKMSLWG +EHTGQMYSKFLLKAEPLALTFSHDYKGSTSHNLLYKNSVSTALEHTLSALLTPAEQTSSWKFKTSLNDKV +YSQEFEAYNTKDKIGIELSGRADLSGLYSPIKVPFFYSEPVNVLNSLEINDAFDEPREFTIDAVVKYDKN +QDVHTISLPFFQSLPDYLERNRRGIISLLEAMKGELQRLSVDQFVRKYRVALSRLPQQIHDYLNASDWER +QVAGAKEKLTSFMENYRITDNDVLIALDSAKINLNEKLSQLETYAIQFDQYIRDNYDAQDLKRTIAQIID +RIIEKLKMLDEQYHIRVNLAKSIHNLYLFVENVDLNQISSSGASWIQNVDTKYQIRIQIQEKLQHLRTQI +HNIDIQQLAAELKQQIEALDVPMHLDQLRTAILFQRISVIIERVKYFVMNLIEDFKVTEKINTFRVIVRE +LIEKYEVDRQIQVLMDKSIELAHRYSLSEPLQKLSNVLQQIEIKDYYDKLVGFIDDTVEWIKAVSFKNII +EELNRLIDMSVKKLKAFDYHQFVDKTNSKIREMTQRINAEIQALELPQKTEALKLWVEDFKTTVSNSLEK +LKDTKVTVVVDWLQDGLAQIKAQFQDALEDVRDRIYQMDIQGELERCLSLVSQVYSTVVTYISDWWTLTA +KNITDFAEQYSTQKWAESVKALVEQGFIVPEIQTFLGTMPAFEVSLHALQEANFQTPDFIVPLTDLRIPS +IWINFKMLKNVKIPLRFSTPEFTLLNTFRVRSFTIDLLEIKAKIIRTIDQMLSSELQWPLPEVYLRDLEM +VNISLARLSLPDFHVPEITIPEFTIPNVNLKDLQVPDLHIPEFQLPHLSCTTEIPAFGKLHSVLKIQSPL +FILDASANIQNITTSENKAEIVASVTARGESKFEALNFDFQAQAQFLELNANPLVLKESVNFSSKHVRME +HEGKILVSGKALEGKSDTVARLHTEKNTVEFNNGIVVKINNQFTLDSQTKYFHKLSVPRLDFSSKASLNN +EIKTLLEAGHMAWTSSGTGSWNWACPNFSDEGIHSSKISFIVDGPIASFGLSNNINGKHLRVVQKLTSES +GFLNYSRFEVESKVESQHVGSSILTAEGRALLGDAKAEMTGEHNANLNGKVIGTLKNSLFFSAQPFEITA +STNNEGNLKVSFPLKLTGKIDFLNNYALFLSPHAQQASWQLSTRFNQYKYNQNFSAINNEHNMEASIVMN +GDANLDFLNIPLTIPEINLPYTRFTTPLLKDFSIWEETGLKEFLKTTKQSFDLSIKAQYKKNRDKHSVVI +PLKMFYEFMLNNVNSWDRKFEKVRDNALHFLTASYNETKIKFDKYKTENSLNQPSRTFQNRGHTIPVLNI +EVSPFAVETLASSHVIPKAIRTPSVTIPGPNIIVPSYRLVLPSLQLPVFHIPRTLFKFSLPDFKKLSTID +NIYIPAMGNFTYDFSFKSSVITLNTNAGLYNQSDLVARFLSSSSFVTDALQYKLEGTSRLMRKKVLKLAT +AVSLTNKFLKGSHDSTISLTKKNMEASVKTTANLHAPIFTMNFKQELNGNTKSKPTVSSSIELNYDFNSS +KLHSAAKGGVDHKFSLESLTSYLSIESFTKGNIKGSFLSQEYSGSVANEANVYLNSKGTRSSVRLQGASN +FAGIWNFEVGENFAGEATLRRIYGTWEHNMINHLQVFSYFDTKGKQTCRATLELSPWTMSTLLQVHVSQP +SPLFDLHHFDQEVILKASTKNQKVSWKSEVQVESQVLQHNAHFSNDQEEVRLDIAGSLEGQLWDLENFFL +PAFGKSLRELLQIDGKRQYLQASTSLHYTKNPNGYLLSLPVQELTDRFIIPGLKLNDFSGIKIYKKLSTS +PFALNLTMLPKVKFPGVDLLTQYSKPEGSSVPTFETTIPEIQLTVSQFTLPKSFPVGNTVFDLNKLTNLI +ADVDLPSITLPEQTIEIPSLEFSVPAGIFIPFFGELTAHVGMASPLYNVTWSTGWKNKADHVETFLDSTC +SSTLQFLEYALKVVGTHRIENDKFIYKIKGTLQHCDFNVKYNEDGIFEGLWDLEGEAHLDITSPALTDFH +LHYKEDKTSVSASAASPAIGTVSLDASTDDQSVRLNVYFRPQSPPDNKLSIFKMEWRDKESDGETYIKIN +WEEEAAFRLLDSLKSNVPKASEAVYDYVKKYHLGHASSELRKSLQNDAEHAIRMVDEMNVNAQRVTRDTY +QSLYKKMLAQESQSIPEKLKKMVLGSLVRITQKYHMAVTWLMDSVIHFLKFNRVQFPGNAGTYTVDELYT +IAMRETKKLLSQLFNGLGHLFSYVQDQVEKSRVINDITFKCPFSPTPCKLKDVLLIFREDLNILSNLGQQ +DINFTTILSDFQSFLERLLDIIEEKIECLKNNESTCVPDHINMFFKTHIPFAFKSLRENIYSVFSEFNDF +VQSILQEGSYKLQQVHQYMKAFREEYFDPSVVGWTVKYYEIEEKMVDLIKTLLAPLRDFYSEYSVTAADF +ASKMSTQVEQFVSRDIREYLSMLADINGKGREKVAELSIVVKERIKSWSTAVAEITSDYLRQLHSKLQDF +SDQLSGYYEKFVAESTRLIDLSIQNYHMFLRYIAELLKKLQVATANNGLLKRGDFEAAVKLGIACLYNEG +LSVSDEAYAEVNGLKASRFFSMDERLNMGSDPFIWLSICPPCFRKLRDFAGKGCWEAQPALAKDCAGGSQ 
+LGLEGKAFSESVCQLFQASQAVNKQQIFSVQKGLSDTVRYILIGWLVEVAPMKDFTSLCLHLTVECVGRY +LQRKLVPRYKLQLLGIACMVICTWFISKEILTIREAVRLTDNTYKYKDLVRVKREIISALEGKIRIPTVV +DYKEVLLTLVPVTPRTQYLCSFLCELTLSVYTPAHLASAALLLARLMHGQTQP +>gi|13124727|sp|P33450.3|FAT_DROME RecName: Full=Cadherin-related tumor suppressor; AltName: Full=Protein fat; Flags: Precursor +MERLLLLFFLLLAGRESLCQTGDTKLELLAPRGRSYATTYEQYAAFPRRRSSSSSPSGEMQSRAVDTSAD +FEVLEGQPRGTTVGFIPTKPKFSYRFNEPPREFTLDPVTGEVKTNVVLDREGMRDHYDLVVLSSQPTYPI +EVRIKVLDVNDNSPEFPEPSIAISFSESATSGTRLLLDAATDADVGENGVTDQYEIVAGNVDNKFRLVTT +ANPSGDTSYLHLETTGNLDRESRGSYQLNISARDGGSPPRFGYLQVNVTILDVNDNPPIFDHSDYNVSLN +ETALPGTPVVTVMASDNDLGDNSKITYYLAETEHQFTVNPETGVISTTERVNCPQQTNVKSSASQKSCVF +TVFARDHGSPRQDGRTYVTVNLLDTNDHDPIISFRFFPDGGKVATVDENAVNGTVVAAVAVKDSDSGLNG +RTSVRIVSGNELGHFRLEEAADLHIVRVNGVLDREEIGKYNLTVVAMDQGTPARTTTAHLIIDVNDVNDH +EPVFEKSEYSAVLSELAPTGSFVASITATDEDTGVNAQVHYDILSGNELKWFSMDPLTGLIVTTGPLDRE +IRDTVELSISARDGGPNPKFAYTQLKVIILDENDEAPQFSQREQNVTLGEDAPPQTIVALMTATDHDQGT +NGSVTFALAPSVERLYPLQFALDALTGQLTTRRPLDREKMSQYEISVIARDQGAPTPQSATATVWLNVAD +VNDNDPQFYPRHYIYSLADDDDDIKLKKEVEKERILLHVTASDKDDGDNALIEYRLESGGEGLFQLDARS +GAISLRGDAPASMHWKPHYKLLVSARDAGQRRSQQDAIVEIVLKSKLEMLECGQAQAGGYEFQMVEDHEQ +QRNSQPNREVGIVQVKSTNGKANSHIEYDIIQGDRAQNFRIDTRSGRITTARPLDREEQANYRLTILASS +SSSSSAAASSVSYGQCIVNIAIIDLNDNAPVFALDRESEPTISLPENAAVGQEIYLSRVRDRDAGVNSRI +SYSLTNNPNQQFRIGPVTGVLYLQRPIRAEPGSLIHVELMATDAGSPPLSSKLSLSVLIADVNDHTPVFD +HTSYETSLPETTKVNTRFFALAATDIDLGDNGRISYEIIEGNTERMFGVFPDGYLFVRAPLDREERDYYA +LTVSCRDAGQPSRSSVVPVVIHVIDENDNAPQFTNSTFTFSIPENAPADTFVGKLTAVDRDIGRNAELSF +TLSSQTQDFTIDTRNGFIKTLRPFDREALVKVSRNAEASGEDGSLRGSMAGNYMLLEATVSDNGIPRLQD +KVKVKVIVTDVNDNAPEFLRAPYHVTISEGASEGTHITHVFTQDADEGLNGDVYYSLAKGNEAGQFNLDS +ATGQLSLGRRLDRESQEIHHLIVVAKDAALKHPLSSNASITIVVLDENDNAPEFTQSSSEVSVLETSPTG +TELMRFRASDADQGVNSQVVFSISAGNRRDTFHIDSITGSLYLHKPLDYEDITSYTLNITASDCGTPSLS +TTVLYNVLVVDDNDNPPIFPSTAIVRQIKEGIPLKTPIVTVTADDPDSGLNGKVSYAISKQEPQLPQGRH +FGINTETGVIHTLREIDRESIDTFRLTVVATDRAQPSERQLSTEKLVTVIVEDINDNAPVFVSMNAAILP +PKFSTSKGSSTAVMQVHAKDADSSSNGLVTYEIVSGPQELFKLQRNTGIITFTPGPQFKQEVRYQLTLKS +TDEAVQSERRSSEVYITIITPGSGGSESSVPQFEQRSKLSGSVYENEPIGTSILTVTAHLASAEIEYFVT +NVTATGSRGQVDRLFDIDAKLGILSTAAELDREAGPEEYEVEVYAIALGGQPRTSRTKVRVTVLDKNDSP +PQFLDTPFVYNVSEDLQIGHTISTLRAHDPDTLGSVTFLLMDGHDGKFLLEPSTGKLILNDTLDRETKSK +YELRIRVSDGVQYTEAYATIQVSDTNDNPPLFEDTVYSFDIPENAQRGYQVGQIVARDADLGQNAQLSYG +VVSDWANDVFSLNPQTGMLTLTARLDYEEVQHYILIVQAQDNGQPSLSTTITVYCNVLDLNDNAPIFDPM +SYSSEVFENVPIATEVVTVSAKDIDSGNNGLIEYSITAGDVDSEFGIDSNGTIRTRRNLDREHRSTYTLT +VTARDCADEFASFSELEETQLKLKYRSPRKYQQTRQEFLAHQKQQRLSSTVKVTILIKDVNDEVPVFISA +NETAIMENVAINTVVIAVKAVDNDEGRNGYIDYLMKEARDEDMGQSDPLPFSLNPTDGQLRVVDALDREL +RSSYLLNITARDRGEPPQSTESQLLIRILDENDNSPVFDPKQYSASVAENASIGAMVLQVSATDVDEGAN +GRIRYSIVLGDQNHDFSISEDTGVVRVAKNLNYERLSRYSLTVRAEDCALENPAGDTAELTINILDINDN +RPTFLDSPYLARVMENTVPPNGGYVLTVNAYDADTPPLNSQVRYFLKEGDSDLFRINASSGDIALLKPLD +REQQSEYTLTLVAMDTGSPPLTGTGIVRVEVQDINDNDPVFELQSYHATVRENLPSGTHVLTPRATDKDE +GLNAKLRFNLLGEHMHRFHIDSETGEISTATTLDREETSVYHLTLMAQDSSITEPRASSVNLTISVSDVN +DNIPKFDSTTYNVAVPERISKGEFVFGARALDLDDGENAVVHYTISGRDQHYFDINTKTGVVSTKLELKT +KTKSHDDLTYTIVISAMDQGEQSLSSKAELTVILRPPELFPTFAYMANSHFAMSEDVRPGKMITKVSATS +PKKGLVGKIRYAIAGGIMGDSLRVDPNSGLLSVGQDGLDYELTHLYEIWIEAADGDTPSLRSVTLITLNV +TDANDNAPVMEQLIYNAEVLEEESPPQLIAVVKASDRDSGDNGNVIYRLQNDFDGTFEITESGEIYTRMR +LDREEIGDYAFVVEAVDQGVPHMTGTASVLLHLLDKNDNPPKFTRLFSLNVTENAEIGSFVIRVTSSDLD +LGANANASYSFSENPGEKFRIEPQSGNITVAGHLDREQQDEYILKVVASDGAWRAETPITITIQDQNDNA +PEFEHSFYSFSFPELQQSIALVGQIIATDRDKQGPNSVISYSLQQPSPMFSIDPATGEVFSKKAVRFKHS 
+QYVRSPENMYALTVLATDNGKPPLYSECLVNINIVDAHNNPPKFEQAEYLAPLPQDAVRGQRIVRVHAND +KQDLGTNEMDYSLMTFNLSSIFSVGRHDGWITLVKPIQVPPNTRYELVVRATDRGVPPQSDETRVVIVVT +GENMDTPRFSVNSYQVIVPENEPVGSTILTVGATDDDTGPNGMLRYSISGGNERQDFSVDERTGGIVIQQ +QLDYDLIQEYHLNITVQDLGYHPLSSVAMLTIILTDVNDNPPVFNHKEYHCYIPENKPVGTFVFQAHAAD +KDSPKNAIIHYAFLPSGPDRHFFIMNQSNGTISSAVSFDYEERRIYTLQIKAKNPDSSMESYANLYVHVL +GVNEFYPQFLQPVFHFDVSETSAVGTRVGAVQATDKDSGEDGRVYYLLVGSSNDKGFRIDTNTGLIYVAR +HLDRETQNRVVLTVMAKNYGSIRGNDTDEAQVIISIQDGNDPPEFIKHYYTSTISEAAPVGTKVTTVKAI +DKDVRTQNNQFSYSIINGNLKQSFKIDVQTGEISTASRLDREETSTYNLVIGAIDTGLPPQTGSATVHIE +LEDVNDNGPTFTPEGLNGYISENEPAGTSIMTLIASDPDLPRNGGPFTYQLIGGKHKSWLSVDRNSGVVR +STTSFDREMTPILEAIIEVEDSGKPKQKSQHLLTITVLDQNDNPSTTRSLHIAVSLFNGDLPSNVKLADV +RPNDIDIVGDYRCRLQKNPAQSQLQLAIPRACDLITTSHTTPIASVFSYTGNDGKHGDVSSKVSVAFQSF +NNETLANSVSIMVRNMTAYHFLANHYRPILEMIKSRMSNEDEVILYSLLEGGSGNSTNLQLLMAVRLAKT +SYQQPKYLIERLREKRSAFSELLQKEVIVGYEPCSEPDVCENGGVCSATMRLLDAHSFVIQDSPALVLSG +PRVVHDYSCQCTSGFSGEQCSRRQDPCLPNPCHSQVQCRRLGSDFQCMCPANRDGKHCEKERSDVCYSKP +CRNGGSCQRSPDGSSYFCLCRPGFRGNQCESVSDSCRPNPCLHGGLCVSLKPGYKCNCTPGRYGRHCERF +SYGFQPLSYMTFPALDVTTNDISIVFATTKPNSLLLYNYGMQSGGRSDFLAIELVHGRAYFSSGGARTAI +STVIAGRNLADGGWHKVTATRNGRVMSLSVAKCADSGDVCTECLPGDSSCYADEVGPVGTLNFNKQPLMI +GGLSSADPILERPGQVHSDDLVGCLHSVHIGGRALNLSLPLQQKGILAGCNRQACQPALAAERCGGFAGQ +CIDRWSSSLCQCGGHLQSPDCSDSLEPITLGEGAFVEFRISEIYRRMQLLDNLYNSKSAWLDNQQMRERR +AVSNFSTASQIYEAPKMLSMLFRTYKDQGQILYAATNQMFTSLSLREGRLVYYSKQHLTINMTVQETSTL +NDGKWHNVSLFSESRSLRLIVDGRQVGDELDIAGVHDFLDPYLTILNVGGEAFVGCLANVTVNNELQPLN +GSGSIFPEVRYHGKIESGCRGDIGQDAAQVADPLSIGFTLVIVFFVILVVAILGSYVIYRFRGKQEKIGS +LSCGVPGFKIKHPGGPVTQSQVDHVLVRNLHPSEAPSPPVGAGDHMRPPVGSHHLVGPELLTKKFKEPTA +EMPQPQQQQQRPQRPDIIERESPLIREDHHLPIPPLHPLPLEHASSVDMGSEYPEHYDLENASSIAPSDI +DIVYHYKGYREAAGLRKYKASVPPVSAYTHHKHQNSGSQQQQQQHRHTAPFVTRNQGGQPPPPPTSASRT +HQSTPLARLSPSSELSSQQPRILTLHDISGKPLQSALLATTSSSGGVGKDVHSNSERSLNSPVMSQLSGQ +SSSASRQKPGVPQQQAQQTSMGLTAEEIERLNGRPRTCSLISTLDAVSSSSEAPRVSSSALHMSLGGDVD +AHSSTSTDESGNDSFTCSEIEYDNNSLSGDGKYSTSKSLLDGRSPVSRALSGGETSRNPPTTVVKTPPIP +PHAYDGFESSFRGSLSTLVASDDDIANHLSGIYRKANGAASPSATTLGWEYLLNWGPSYENLMGVFKDIA +ELPDTNGPSQQQQQQTQVVSTLRMPSSNGPAAPEEYV +>gi|187609692|sp|Q9UKN1.2|MUC12_HUMAN RecName: Full=Mucin-12; Short=MUC-12; AltName: Full=Mucin-11; Short=MUC-11; Flags: Precursor +MLVIWILTLALRLCASVTTVTPEGSAVHKAISQQGTLWTGEVLEKQTVEQGKSTLRRQKNHFHRSAGELR +CRNALKDEGASAGWSVMFAGESVVVLVHLWMTGARVKNLGLVEFASPGDDGDGRAEGFSLGLPLSEQARA +AGAREKERQETVINHSTFSGFSQITGSTVNTSIGGNTTSASTPSSSDPFTTFSDYGVSVTFITGSTATKH +FLDSSTNSGHSEESTVSHSGPGATGTTLFPSHSATSVFVGEPKTSPITSASMETTALPGSTTTAGLSEKS +TTFYSSPRSPDRTLSPARTTSSGVSEKSTTSHSRPGPTHTIAFPDSTTMPGVSQESTASHSIPGSTDTTL +SPGTTTPSSLGPESTTFHSSPGYTKTTRLPDNTTTSGLLEASTPVHSSTGSPHTTLSPSSSTTHEGEPTT +FQSWPSSKDTSPAPSGTTSAFVKLSTTYHSSPSSTPTTHFSASSTTLGHSEESTPVHSSPVATATTPPPA +RSATSGHVEESTAYHRSPGSTQTMHFPESSTTSGHSEESATFHGSTTHTKSSTPSTTAALAHTSYHSSLG +STETTHFRDSSTISGRSEESKASHSSPDAMATTVLPAGSTPSVLVGDSTPSPISSGSMETTALPGSTTKP +GLSEKSTTFYSSPRSPDTTHLPASMTSSGVSEESTTSHSRPGSTHTTAFPGSTTMPGLSQESTASHSSPG +PTDTTLSPGSTTASSLGPEYTTFHSRPGSTETTLLPDNTTASGLLEASMPVHSSTRSPHTTLSPAGSTTR +QGESTTFHSWPSSKDTRPAPPTTTSAFVEPSTTSHGSPSSIPTTHISARSTTSGLVEESTTYHSSPGSTQ +TMHFPESDTTSGRGEESTTSHSSTTHTISSAPSTTSALVEEPTSYHSSPGSTATTHFPDSSTTSGRSEES +TASHSSQDATGTIVLPARSTTSVLLGESTTSPISSGSMETTALPGSTTTPGLSERSTTFHSSPRSPATTL +SPASTTSSGVSEESTTSRSRPGSTHTTAFPDSTTTPGLSRHSTTSHSSPGSTDTTLLPASTTTSGPSQES +TTSHSSSGSTDTALSPGSTTALSFGQESTTFHSNPGSTHTTLFPDSTTSSGIVEASTRVHSSTGSPRTTL +SPASSTSPGLQGESTAFQTHPASTHTTPSPPSTATAPVEESTTYHRSPGSTPTTHFPASSTTSGHSEKST +IFHSSPDASGTTPSSAHSTTSGRGESTTSRISPGSTEITTLPGSTTTPGLSEASTTFYSSPRSPTTTLSP 
+ASMTSLGVGEESTTSRSQPGSTHSTVSPASTTTPGLSEESTTVYSSSRGSTETTVFPHSTTTSVHGEEPT +TFHSRPASTHTTLFTEDSTTSGLTEESTAFPGSPASTQTGLPATLTTADLGEESTTFPSSSGSTGTKLSP +ARSTTSGLVGESTPSRLSPSSTETTTLPGSPTTPSLSEKSTTFYTSPRSPDATLSPATTTSSGVSEESST +SHSQPGSTHTTAFPDSTTTSDLSQEPTTSHSSQGSTEATLSPGSTTASSLGQQSTTFHSSPGDTETTLLP +DDTITSGLVEASTPTHSSTGSLHTTLTPASSTSAGLQEESTTFQSWPSSSDTTPSPPGTTAAPVEVSTTY +HSRPSSTPTTHFSASSTTLGRSEESTTVHSSPGATGTALFPTRSATSVLVGEPTTSPISSGSTETTALPG +STTTAGLSEKSTTFYSSPRSPDTTLSPASTTSSGVSEESTTSHSRPGSTHTTAFPGSTTMPGVSQESTAS +HSSPGSTDTTLSPGSTTASSLGPESTTFHSSPGSTETTLLPDNTTASGLLEASTPVHSSTGSPHTTLSPA +GSTTRQGESTTFQSWPSSKDTMPAPPTTTSAFVELSTTSHGSPSSTPTTHFSASSTTLGRSEESTTVHSS +PVATATTPSPARSTTSGLVEESTAYHSSPGSTQTMHFPESSTASGRSEESRTSHSSTTHTISSPPSTTSA +LVEEPTSYHSSPGSTATTHFPDSSTTSGRSEESTASHSSQDATGTIVLPARSTTSVLLGESTTSPISSGS +METTALPGSTTTPGLSEKSTTFHSSPRSPATTLSPASTTSSGVSEESTTSHSRPGSTHTTAFPDSTTTPG +LSRHSTTSHSSPGSTDTTLLPASTTTSGPSQESTTSHSSPGSTDTALSPGSTTALSFGQESTTFHSSPGS +THTTLFPDSTTSSGIVEASTRVHSSTGSPRTTLSPASSTSPGLQGESTAFQTHPASTHTTPSPPSTATAP +VEESTTYHRSPGSTPTTHFPASSTTSGHSEKSTIFHSSPDASGTTPSSAHSTTSGRGESTTSRISPGSTE +ITTLPGSTTTPGLSEASTTFYSSPRSPTTTLSPASMTSLGVGEESTTSRSQPGSTHSTVSPASTTTPGLS +EESTTVYSSSPGSTETTVFPRTPTTSVRGEEPTTFHSRPASTHTTLFTEDSTTSGLTEESTAFPGSPAST +QTGLPATLTTADLGEESTTFPSSSGSTGTTLSPARSTTSGLVGESTPSRLSPSSTETTTLPGSPTTPSLS +EKSTTFYTSPRSPDATLSPATTTSSGVSEESSTSHSQPGSTHTTAFPDSTTTPGLSRHSTTSHSSPGSTD +TTLLPASTTTSGPSQESTTSHSSPGSTDTALSPGSTTALSFGQESTTFHSSPGSTHTTLFPDSTTSSGIV +EASTRVHSSTGSPRTTLSPASSTSPGLQGESTTFQTHPASTHTTPSPPSTATAPVEESTTYHRSPGSTPT +THFPASSTTSGHSEKSTIFHSSPDASGTTPSSAHSTTSGRGESTTSRISPGSTEITTLPGSTTTPGLSEA +STTFYSSPRSPTTTLSPASMTSLGVGEESTTSRSQPGSTHSTVSPASTTTPGLSEESTTVYSSSPGSTET +TVFPRSTTTSVRGEEPTTFHSRPASTHTTLFTEDSTTSGLTEESTAFPGSPASTQTGLPATLTTADLGEE +STTFPSSSGSTGTTLSPARSTTSGLVGESTPSRLSPSSTETTTLPGSPTTPSLSEKSTTFYTSPRSPDAT +LSPATTTSSGVSEESSTSHSQPGSTHTTAFPDSTTTSGLSQEPTASHSSQGSTEATLSPGSTTASSLGQQ +STTFHSSPGDTETTLLPDDTITSGLVEASTPTHSSTGSLHTTLTPASSTSAGLQEESTTFQSWPSSSDTT +PSPPGTTAAPVEVSTTYHSRPSSTPTTHFSASSTTLGRSEESTTVHSSPGATGTALFPTRSATSVLVGEP +TTSPISSGSTETTALPGSTTTAGLSEKSTTFYSSPRSPDTTLSPASTTSSGVSEESTTSHSRPGSTHTTA +FPGSTTMPGVSQESTASHSSPGSTDTTLSPGSTTASSLGPESTTFHSGPGSTETTLLPDNTTASGLLEAS +TPVHSSTGSPHTTLSPAGSTTRQGESTTFQSWPNSKDTTPAPPTTTSAFVELSTTSHGSPSSTPTTHFSA +SSTTLGRSEESTTVHSSPVATATTPSPARSTTSGLVEESTTYHSSPGSTQTMHFPESDTTSGRGEESTTS +HSSTTHTISSAPSTTSALVEEPTSYHSSPGSTATTHFPDSSTTSGRSEESTASHSSQDATGTIVLPARST +TSVLLGESTTSPISSGSMETTALPGSTTTPGLSEKSTTFHSSPRSPATTLSPASTTSSGVSEESTTSHSR +PGSTHTTAFPDSTTTPGLSRHSTTSHSSPGSTDTTLLPASTTTSGSSQESTTSHSSSGSTDTALSPGSTT +ALSFGQESTTFHSSPGSTHTTLFPDSTTSSGIVEASTRVHSSTGSPRTTLSPASSTSPGLQGESTAFQTH +PASTHTTPSPPSTATAPVEESTTYHRSPGSTPTTHFPASSTTSGHSEKSTIFHSSPDASGTTPSSAHSTT +SGRGESTTSRISPGSTEITTLPGSTTTPGLSEASTTFYSSPRSPTTTLSPASMTSLGVGEESTTSRSQPG +STHSTVSPASTTTPGLSEESTTVYSSSPGSTETTVFPRSTTTSVRREEPTTFHSRPASTHTTLFTEDSTT +SGLTEESTAFPGSPASTQTGLPATLTTADLGEESTTFPSSSGSTGTKLSPARSTTSGLVGESTPSRLSPS +STETTTLPGSPTTPSLSEKSTTFYTSPRSPDATLSPATTTSSGVSEESSTSHSQPGSTHTTAFPDSTTTS +GLSQEPTTSHSSQGSTEATLSPGSTTASSLGQQSTTFHSSPGDTETTLLPDDTITSGLVEASTPTHSSTG +SLHTTLTPASSTSTGLQEESTTFQSWPSSSDTTPSPPSTTAVPVEVSTTYHSRPSSTPTTHFSASSTTLG +RSEESTTVHSSPGATGTALFPTRSATSVLVGEPTTSPISSGSTETTALPGSTTTAGLSEKSTTFYSSPRS +PDTTLSPASTTSSGVSEESTTSHSRPGSMHTTAFPSSTTMPGVSQESTASHSSPGSTDTTLSPGSTTASS +LGPESTTFHSSPGSTETTLLPDNTTASGLLEASTPVHSSTGSPHTTLSPAGSTTRQGESTTFQSWPNSKD +TTPAPPTTTSAFVELSTTSHGSPSSTPTTHFSASSTTLGRSEESTTVHSSPVATATTPSPARSTTSGLVE +ESTTYHSSPGSTQTMHFPESNTTSGRGEESTTSHSSTTHTISSAPSTTSALVEEPTSYHSSPGSTATTHF +PDSSTTSGRSEESTASHSSQDATGTIVLPARSTTSVLLGESTTSPISSGSMETTALPGSTTTPGLSEKST 
+TFHSSPSSTPTTHFSASSTTLGRSEESTTVHSSPVATATTPSPARSTTSGLVEESTAYHSSPGSTQTMHF +PESSTASGRSEESRTSHSSTTHTISSPPSTTSALVEEPTSYHSSPGSIATTHFPESSTTSGRSEESTASH +SSPDTNGITPLPAHFTTSGRIAESTTFYISPGSMETTLASTATTPGLSAKSTILYSSSRSPDQTLSPASM +TSSSISGEPTSLYSQAESTHTTAFPASTTTSGLSQESTTFHSKPGSTETTLSPGSITTSSFAQEFTTPHS +QPGSALSTVSPASTTVPGLSEESTTFYSSPGSTETTAFSHSNTMSIHSQQSTPFPDSPGFTHTVLPATLT +TTDIGQESTAFHSSSDATGTTPLPARSTASDLVGEPTTFYISPSPTYTTLFPASSSTSGLTEESTTFHTS +PSFTSTIVSTESLETLAPGLCQEGQIWNGKQCVCPQGYVGYQCLSPLESFPVETPEKLNATLGMTVKVTY +RNFTEKMNDASSQEYQNFSTLFKNRMDVVLKGDNLPQYRGVNIRRLLNGSIVVKNDVILEADYTLEYEEL +FENLAEIVKAKIMNETRTTLLDPDSCRKAILCYSEEDTFVDSSVTPGFDFQEQCTQKAAEGYTQFYYVDV +LDGKLACVNKCTKGTKSQMNCNLGTCQLQRSGPRCLCPNTNTHWYWGETCEFNIAKSLVYGIVGAVMAVL +LLALIILIILFSLSQRKRHREQYDVPQEWRKEGTPGIFQKTAIWEDQNLRESRFGLENAYNNFRPTLETV +DSGTELHIQRPEMVASTV diff --git a/lib/libmarv/src/CMakeLists.txt b/lib/libmarv/src/CMakeLists.txt new file mode 100644 index 000000000..ad2ecc31b --- /dev/null +++ b/lib/libmarv/src/CMakeLists.txt @@ -0,0 +1,60 @@ +cmake_minimum_required(VERSION 3.8) +project(CUDA_INCLUDE_DIRS LANGUAGES CXX CUDA) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) + +set(LIBRARY_ONLY 0 CACHE BOOL "Do not build executables") + +set(NVCC_FLAGS) +list(APPEND NVCC_FLAGS --extended-lambda) +list(APPEND NVCC_FLAGS --expt-relaxed-constexpr) +list(APPEND NVCC_FLAGS -rdc=true) +string(TOLOWER "${CMAKE_BUILD_TYPE}" BUILD_TYPE_LOWER) +if(BUILD_TYPE_LOWER STREQUAL "debug" OR BUILD_TYPE_LOWER STREQUAL "relwithdebinfo") + list(APPEND NVCC_FLAGS -lineinfo) +endif() + +set(NVCC_FLAGS_OMP ${NVCC_FLAGS}) +list(APPEND NVCC_FLAGS_OMP -Xcompiler -fopenmp) + + +set(ALIGN_SOURCES + kernels.cuh + blosum.cu + pssm.cuh + pssmkernels_gapless.cuh + pssmkernels_gapless_instantiation_dpx.cu + pssmkernels_gapless_instantiation_half2.cu + pssmkernels_gapless_instantiation_dpx_kernelparamzero.cu + pssmkernels_gapless_instantiation_half2_kernelparamzero.cu + pssmkernels_smithwaterman.cuh + pssmkernels_smithwaterman_instantiation_float.cu + pssmkernels_smithwaterman_instantiation_dpx.cu + dbdata.cpp +) + +add_library(marv ${ALIGN_SOURCES} marv.cu marv.h) +target_compile_options(marv PRIVATE $<$:${NVCC_FLAGS}>) +set_target_properties(marv + PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON +) +target_compile_definitions(marv PRIVATE NO_NVTOOLSEXT) + +if (NOT LIBRARY_ONLY) + find_package(CUDAToolkit REQUIRED) + find_package(ZLIB REQUIRED) + find_package(OpenMP REQUIRED) + + add_executable(align ${ALIGN_SOURCES} main.cu options.cpp sequence_io.cpp) + target_compile_options(align PRIVATE $<$:${NVCC_FLAGS_OMP}>) + target_link_libraries(align ZLIB::ZLIB CUDA::nvToolsExt OpenMP::OpenMP_CXX) + set_target_properties(align PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + + add_executable(makedb makedb.cpp sequence_io.cpp dbdata.cpp) + target_compile_options(makedb PRIVATE $<$:${NVCC_FLAGS_OMP}>) + target_link_libraries(makedb ZLIB::ZLIB OpenMP::OpenMP_CXX) + set_target_properties(makedb PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +endif() diff --git a/lib/libmarv/src/base64.h b/lib/libmarv/src/base64.h new file mode 100644 index 000000000..7b9dad068 --- /dev/null +++ b/lib/libmarv/src/base64.h @@ -0,0 +1,148 @@ +/* + + https://github.com/superwills/NibbleAndAHalf + base64.h -- Fast base64 encoding and decoding. + version 1.0.0, April 17, 2013 143a + + Copyright (C) 2013 William Sherif + + This software is provided 'as-is', without any express or implied + warranty. 
In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + William Sherif + will.sherif@gmail.com + + YWxsIHlvdXIgYmFzZSBhcmUgYmVsb25nIHRvIHVz + +*/ +#ifndef BASE64_H +#define BASE64_H + +#include + +const static char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +// maps A=>0,B=>1.. +const static unsigned char unb64[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //10 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //20 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //30 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //40 + 0, 0, 0, 62, 0, 0, 0, 63, 52, 53, //50 + 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, //60 + 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, //70 + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, //80 + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, //90 + 25, 0, 0, 0, 0, 0, 0, 26, 27, 28, //100 + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, //110 + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, //120 + 49, 50, 51, 0, 0, 0, 0, 0, 0, 0, //130 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //140 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //150 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //160 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //170 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //180 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //190 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //200 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //210 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //220 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //230 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //240 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //250 + 0, 0, 0, 0, 0, 0, +}; // This array has 256 elements + +// Converts binary data of length=len to base64 characters. +std::string base64_encode(const void *data, int length) { + const unsigned char *bin = (const unsigned char *) data; + + int modLength = length % 3; + // 2 gives 1 and 1 gives 2, but 0 gives 0. + int padding = ((modLength & 1) << 1) + ((modLength & 2) >> 1); + + std::string res; + res.reserve(4 * (length + padding) / 3); + + int byteNo; + for (byteNo = 0; byteNo <= length - 3; byteNo += 3) { + unsigned char BYTE0 = bin[byteNo]; + unsigned char BYTE1 = bin[byteNo + 1]; + unsigned char BYTE2 = bin[byteNo + 2]; + res.append(1, b64[BYTE0 >> 2]); + res.append(1, b64[((0x3 & BYTE0) << 4) + (BYTE1 >> 4)]); + res.append(1, b64[((0x0f & BYTE1) << 2) + (BYTE2 >> 6)]); + res.append(1, b64[0x3f & BYTE2]); + } + + if (padding == 2) { + res.append(1, b64[bin[byteNo] >> 2]); + res.append(1, b64[(0x3 & bin[byteNo]) << 4]); + res.append(1, '='); + res.append(1, '='); + } else if (padding == 1) { + res.append(1, b64[bin[byteNo] >> 2]); + res.append(1, b64[((0x3 & bin[byteNo]) << 4) + (bin[byteNo + 1] >> 4)]); + res.append(1, b64[(0x0f & bin[byteNo + 1]) << 2]); + res.append(1, '='); + } + + return res; +} + +std::string base64_decode(const char *base64, int length) { + const unsigned char *data = (const unsigned char *) base64; + // 2 accesses below would be OOB. 
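+    // Each group of four base64 characters decodes to three bytes; the trailing
+    // '=' padding read below indicates how many bytes of the final group are
+    // absent and must not be emitted.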
+ if (length < 2) { + return ""; + } + + int padding = 0; + if (data[length - 1] == '=') ++padding; + if (data[length - 2] == '=') ++padding; + + std::string res; + res.reserve(3 * length / 4 - padding); + + int charNo; + for (charNo = 0; charNo <= length - 4 - padding; charNo += 4) { + int A = unb64[data[charNo]]; + int B = unb64[data[charNo + 1]]; + int C = unb64[data[charNo + 2]]; + int D = unb64[data[charNo + 3]]; + + res.append(1, (A << 2) | (B >> 4)); + res.append(1, (B << 4) | (C >> 2)); + res.append(1, (C << 6) | (D)); + } + + if (padding == 1) { + int A = unb64[data[charNo]]; + int B = unb64[data[charNo + 1]]; + int C = unb64[data[charNo + 2]]; + + res.append(1, (A << 2) | (B >> 4)); + res.append(1, (B << 4) | (C >> 2)); + } else if (padding == 2) { + int A = unb64[data[charNo]]; + int B = unb64[data[charNo + 1]]; + + res.append(1, (A << 2) | (B >> 4)); + } + + return res; +} + +#endif diff --git a/lib/libmarv/src/benchmarking.cuh b/lib/libmarv/src/benchmarking.cuh new file mode 100644 index 000000000..0cf8f58d6 --- /dev/null +++ b/lib/libmarv/src/benchmarking.cuh @@ -0,0 +1,1036 @@ +int peakBenchmark(int argc, char* argv[]){ + // { + // std::string tempname = "query128.fasta"; + // std::ofstream tempfile(tempname); + + // const char* letters = "ARNDCQEGHILKMFPSTWYV"; + + // std::mt19937 gen(42); + // std::uniform_int_distribution<> dist(0,19); + + // //for(int l = 8193; l < 4*4096; l += 64){ + // { + // int l=128; + // std::string sequence(l, ' '); + // for(int i = 0; i < l; i++){ + // sequence[i] = letters[dist(gen)]; + // } + // tempfile << ">length " << l << "\n"; + // tempfile << sequence << "\n"; + // } + // tempfile.flush(); + + // options.queryFiles = {tempname}; + // } + + + ProgramOptions options; + bool parseSuccess = parseArgs(argc, argv, options); + + if(!parseSuccess || options.help){ + printHelp(argc, argv); + return 0; + } + + options.usePseudoDB = true; + options.verbose = true; + options.loadFullDBToGpu = true; + options.numTopOutputs = 0; + options.verbose = false; + + printOptions(options); + + std::vector deviceIds; + { + int num = 0; + cudaGetDeviceCount(&num); CUERR + for(int i = 0; i < num; i++){ + deviceIds.push_back(i); + } + if(deviceIds.size() > 0){ + if(options.verbose){ + std::cout << "Will use GPU"; + for(auto x : deviceIds){ + std::cout << " " << x; + } + std::cout << "\n"; + } + }else{ + throw std::runtime_error("No GPU found"); + } + } + + helpers::PeerAccess peerAccess(deviceIds, false); + + using KernelTypeConfig = cudasw4::KernelTypeConfig; + using MemoryConfig = cudasw4::MemoryConfig; + using ScanResult = cudasw4::ScanResult; + + KernelTypeConfig kernelTypeConfig; + kernelTypeConfig.singlePassType = options.singlePassType; + kernelTypeConfig.manyPassType_small = options.manyPassType_small; + kernelTypeConfig.manyPassType_large = options.manyPassType_large; + kernelTypeConfig.overflowType = options.overflowType; + + MemoryConfig memoryConfig; + memoryConfig.maxBatchBytes = options.maxBatchBytes; + memoryConfig.maxBatchSequences = options.maxBatchSequences; + memoryConfig.maxTempBytes = options.maxTempBytes; + memoryConfig.maxGpuMem = options.maxGpuMem; + + cudasw4::CudaSW4 cudaSW4( + deviceIds, + options.numTopOutputs, + options.blosumType, + kernelTypeConfig, + memoryConfig, + options.verbose + ); + + const char* letters = "ARNDCQEGHILKMFPSTWYV"; + + std::mt19937 gen(42); + std::uniform_int_distribution<> dist(0,19); + + for(int l = 1; l <= 65536; l *= 2){ + std::string sequence(l, ' '); + for(int i = 0; i < l; i++){ + sequence[i] = 
letters[dist(gen)]; + } + + options.pseudoDBLength = l; + options.pseudoDBSize = 1024*1024*1024 / l; + + if(options.verbose){ + std::cout << "Generating pseudo db\n"; + } + helpers::CpuTimer timer_read_db("Generate DB"); + auto fullDB_tmp = std::make_shared(cudasw4::loadPseudoDB( + options.pseudoDBSize, + options.pseudoDBLength, + options.pseudoDBSameSequence + )); + if(options.verbose){ + timer_read_db.print(); + } + + cudaSW4.setDatabase(fullDB_tmp); + + + if(options.verbose){ + cudaSW4.printDBInfo(); + if(options.printLengthPartitions){ + cudaSW4.printDBLengthPartitions(); + } + } + + if(options.loadFullDBToGpu){ + cudaSW4.prefetchDBToGpus(); + } + + std::cout << "Processing query length " << sequence.size() << "\n"; + + cudaSW4.totalTimerStart(); + + std::cout.flush(); + + cudasw4::DecodedQueryView queryView(sequence.data(), sequence.size()); + ScanResult scanResult = cudaSW4.scan(queryView, std::nullopt); + + std::cout << "Done. Scan time: " << scanResult.stats.seconds << " s, " << scanResult.stats.gcups << " GCUPS\n"; + + } + + return 0; +} + + + + + + + + + + +int gridsearchPseudo(int argc, char* argv[]){ + ProgramOptions options; + bool parseSuccess = parseArgs(argc, argv, options); + + if(!parseSuccess || options.help){ + printHelp(argc, argv); + return 0; + } + + options.usePseudoDB = true; + options.pseudoDBSameSequence = false; + options.verbose = true; + options.loadFullDBToGpu = true; + options.numTopOutputs = 0; + options.verbose = false; + + printOptions(options); + + std::vector deviceIds; + { + int num = 0; + cudaGetDeviceCount(&num); CUERR + for(int i = 0; i < num; i++){ + deviceIds.push_back(i); + } + if(deviceIds.size() > 0){ + if(options.verbose){ + std::cout << "Will use GPU"; + for(auto x : deviceIds){ + std::cout << " " << x; + } + std::cout << "\n"; + } + }else{ + throw std::runtime_error("No GPU found"); + } + } + + helpers::PeerAccess peerAccess(deviceIds, false); + + using KernelTypeConfig = cudasw4::KernelTypeConfig; + using MemoryConfig = cudasw4::MemoryConfig; + using ScanResult = cudasw4::ScanResult; + + KernelTypeConfig kernelTypeConfig; + kernelTypeConfig.singlePassType = options.singlePassType; + kernelTypeConfig.manyPassType_small = options.manyPassType_small; + kernelTypeConfig.manyPassType_large = options.manyPassType_large; + kernelTypeConfig.overflowType = options.overflowType; + + MemoryConfig memoryConfig; + memoryConfig.maxBatchBytes = options.maxBatchBytes; + memoryConfig.maxBatchSequences = options.maxBatchSequences; + memoryConfig.maxTempBytes = options.maxTempBytes; + memoryConfig.maxGpuMem = options.maxGpuMem; + + cudasw4::CudaSW4 cudaSW4( + deviceIds, + options.numTopOutputs, + options.blosumType, + kernelTypeConfig, + memoryConfig, + options.verbose + ); + + const char* letters = "ARNDCQEGHILKMFPSTWYV"; + + std::mt19937 gen(42); + std::uniform_int_distribution<> dist(0,19); + + auto execute = [&](std::string outputfilename, bool usekernelparamzero){ + + std::ofstream logfile(outputfilename); + + for(int l = 128; l <= 384; l += 8){ + std::string sequence(l, ' '); + for(int i = 0; i < l; i++){ + sequence[i] = letters[dist(gen)]; + } + + options.pseudoDBLength = l; + options.pseudoDBSize = 5000000; + + if(options.verbose){ + std::cout << "Generating pseudo db\n"; + } + helpers::CpuTimer timer_read_db("Generate DB"); + auto fullDB_tmp = std::make_shared(cudasw4::loadPseudoDB( + options.pseudoDBSize, + options.pseudoDBLength, + options.pseudoDBSameSequence + )); + if(options.verbose){ + timer_read_db.print(); + } + + 
cudaSW4.setDatabase(fullDB_tmp); + if(usekernelparamzero){ + cudaSW4.setGaplessHalf2KernelApproach(KernelApproach::kernelparamzero); + cudaSW4.setGaplessDPXKernelApproach(KernelApproach::kernelparamzero); + }else{ + cudaSW4.setGaplessHalf2KernelApproach(KernelApproach::hardcodedzero); + cudaSW4.setGaplessDPXKernelApproach(KernelApproach::hardcodedzero); + } + + if(options.verbose){ + cudaSW4.printDBInfo(); + if(options.printLengthPartitions){ + cudaSW4.printDBLengthPartitions(); + } + } + + if(options.loadFullDBToGpu){ + cudaSW4.prefetchDBToGpus(); + } + + //std::cout << "Processing query length " << sequence.size() << "\n"; + std::cout << "Processing query length " << sequence.size(); + + logfile << sequence.size() << "\n"; + + constexpr int smallestValidGroupSize = 4; + + std::vector> gcupsMatrix; + for(int groupsize = smallestValidGroupSize; groupsize <= 16; groupsize *= 2){ + std::cout.flush(); + + std::vector gcupsVector; + for(int numRegs = 4; numRegs <= 64; numRegs += 4){ + if(groupsize * numRegs * 2 >= int(sequence.size())){ + //if(groupsize >= 4){ + cudaSW4.setGroupConfig(groupsize, numRegs); + + cudasw4::DecodedQueryView queryView(sequence.data(), sequence.size()); + ScanResult scanResult = cudaSW4.scan(queryView, std::nullopt); + gcupsVector.push_back(scanResult.stats.gcups); + }else{ + gcupsVector.push_back(0); + } + } + gcupsMatrix.push_back(gcupsVector); + } + + + + float bestGcups = 0; + int bestgroupsizeIndex = 0; + int bestregsIndex = 0; + for(int r = 0; r < int(gcupsMatrix.size()); r++){ + const auto& vec = gcupsMatrix[r]; + for(int c = 0; c < int(vec.size()); c++){ + logfile << vec[c] << " "; + //std::cout << vec[c] << " "; + //printf("%5f ", vec[c]); + if(vec[c] > bestGcups){ + bestgroupsizeIndex = r; + bestregsIndex = c; + bestGcups = vec[c]; + } + } + logfile << "\n"; + //printf("\n"); + //std::cout << "\n"; + } + logfile << "Best: " << bestGcups << " " << smallestValidGroupSize * (1u << bestgroupsizeIndex) << " " << (4 * (1+bestregsIndex)) << "\n"; + std::cout << ", Best: " << bestGcups << " " << smallestValidGroupSize * (1u << bestgroupsizeIndex) << " " << (4 * (1+bestregsIndex)) << "\n"; + //<< " " << bestgroupsizeIndex << " " << bestregsIndex << "\n"; + + } + + }; + + execute("gridsearch_128_384_8_alldifferentsubjects_withkernelparamzero.txt", true); + execute("gridsearch_128_384_8_alldifferentsubjects_withoutkernelparamzero.txt", false); + + return 0; +} + + +int gridsearchPseudo_SW(int argc, char* argv[]){ + ProgramOptions options; + bool parseSuccess = parseArgs(argc, argv, options); + + if(!parseSuccess || options.help){ + printHelp(argc, argv); + return 0; + } + + options.usePseudoDB = true; + options.pseudoDBSameSequence = false; + options.verbose = true; + options.loadFullDBToGpu = true; + options.numTopOutputs = 0; + options.verbose = false; + + printOptions(options); + + std::vector deviceIds; + { + int num = 0; + cudaGetDeviceCount(&num); CUERR + for(int i = 0; i < num; i++){ + deviceIds.push_back(i); + } + if(deviceIds.size() > 0){ + if(options.verbose){ + std::cout << "Will use GPU"; + for(auto x : deviceIds){ + std::cout << " " << x; + } + std::cout << "\n"; + } + }else{ + throw std::runtime_error("No GPU found"); + } + } + + helpers::PeerAccess peerAccess(deviceIds, false); + + using KernelTypeConfig = cudasw4::KernelTypeConfig; + using MemoryConfig = cudasw4::MemoryConfig; + using ScanResult = cudasw4::ScanResult; + using ScanType = cudasw4::ScanType; + + KernelTypeConfig kernelTypeConfig; + kernelTypeConfig.singlePassType = options.singlePassType; 
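+    // The kernel types and memory limits below are taken from the parsed options,
+    // mirroring the setup used by the other grid-search benchmarks in this file.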
+ kernelTypeConfig.manyPassType_small = options.manyPassType_small; + kernelTypeConfig.manyPassType_large = options.manyPassType_large; + kernelTypeConfig.overflowType = options.overflowType; + + MemoryConfig memoryConfig; + memoryConfig.maxBatchBytes = options.maxBatchBytes; + memoryConfig.maxBatchSequences = options.maxBatchSequences; + memoryConfig.maxTempBytes = options.maxTempBytes; + memoryConfig.maxGpuMem = options.maxGpuMem; + + cudasw4::CudaSW4 cudaSW4( + deviceIds, + options.numTopOutputs, + options.blosumType, + kernelTypeConfig, + memoryConfig, + options.verbose + ); + + cudaSW4.setScanType(ScanType::SW_Endpos); + + const char* letters = "ARNDCQEGHILKMFPSTWYV"; + + std::mt19937 gen(42); + std::uniform_int_distribution<> dist(0,19); + + std::ofstream logfile("gridsearch_SW.txt"); + + for(int l = 1232+16; l <= 2048; l += 16){ + std::string sequence(l, ' '); + for(int i = 0; i < l; i++){ + sequence[i] = letters[dist(gen)]; + } + + options.pseudoDBLength = l; + options.pseudoDBSize = 1024*1024*1024 / l; + + if(options.verbose){ + std::cout << "Generating pseudo db\n"; + } + helpers::CpuTimer timer_read_db("Generate DB"); + auto fullDB_tmp = std::make_shared(cudasw4::loadPseudoDB( + options.pseudoDBSize, + options.pseudoDBLength, + options.pseudoDBSameSequence + )); + if(options.verbose){ + timer_read_db.print(); + } + + cudaSW4.setDatabase(fullDB_tmp); + + + if(options.verbose){ + cudaSW4.printDBInfo(); + if(options.printLengthPartitions){ + cudaSW4.printDBLengthPartitions(); + } + } + + if(options.loadFullDBToGpu){ + cudaSW4.prefetchDBToGpus(); + } + + //std::cout << "Processing query length " << sequence.size() << "\n"; + std::cout << "Processing query length " << sequence.size(); + + logfile << sequence.size() << "\n"; + + std::vector> gcupsMatrix; + for(int groupsize = 4; groupsize <= 32; groupsize *= 2){ + std::cout.flush(); + + std::vector gcupsVector; + for(int numRegs = 4; numRegs <= 64; numRegs += 4){ + if(groupsize * numRegs <= 1024){ + //if(groupsize * numRegs >= int(sequence.size()) && groupsize * numRegs <= 1024){ + //if(groupsize >= 4){ + cudaSW4.setGroupConfig(groupsize, numRegs); + + cudasw4::DecodedQueryView queryView(sequence.data(), sequence.size()); + ScanResult scanResult = cudaSW4.scan(queryView, std::nullopt); + gcupsVector.push_back(scanResult.stats.gcups); + }else{ + gcupsVector.push_back(0); + } + } + gcupsMatrix.push_back(gcupsVector); + } + + + + float bestGcups = 0; + int bestgroupsizeIndex = 0; + int bestregsIndex = 0; + for(int r = 0; r < int(gcupsMatrix.size()); r++){ + const auto& vec = gcupsMatrix[r]; + for(int c = 0; c < int(vec.size()); c++){ + logfile << vec[c] << " "; + //std::cout << vec[c] << " "; + //printf("%5f ", vec[c]); + if(vec[c] > bestGcups){ + bestgroupsizeIndex = r; + bestregsIndex = c; + bestGcups = vec[c]; + } + } + logfile << "\n"; + //printf("\n"); + //std::cout << "\n"; + } + logfile << "Best: " << bestGcups << " " << (1u << bestgroupsizeIndex) << " " << (4 * (1+bestregsIndex)) << "\n"; + std::cout << ", Best: " << bestGcups << " " << (1u << bestgroupsizeIndex) << " " << (4 * (1+bestregsIndex)) << "\n"; + //<< " " << bestgroupsizeIndex << " " << bestregsIndex << "\n"; + + } + + return 0; +} + + + + +int gridsearchReal(int argc, char* argv[]){ + ProgramOptions options; + bool parseSuccess = parseArgs(argc, argv, options); + + if(!parseSuccess || options.help){ + printHelp(argc, argv); + return 0; + } + + options.usePseudoDB = false; + options.pseudoDBSameSequence = false; + options.verbose = true; + options.loadFullDBToGpu = 
true; + options.numTopOutputs = 0; + options.verbose = false; + + printOptions(options); + + std::vector deviceIds; + { + int num = 0; + cudaGetDeviceCount(&num); CUERR + for(int i = 0; i < num; i++){ + deviceIds.push_back(i); + } + if(deviceIds.size() > 0){ + if(options.verbose){ + std::cout << "Will use GPU"; + for(auto x : deviceIds){ + std::cout << " " << x; + } + std::cout << "\n"; + } + }else{ + throw std::runtime_error("No GPU found"); + } + } + + helpers::PeerAccess peerAccess(deviceIds, false); + + using KernelTypeConfig = cudasw4::KernelTypeConfig; + using MemoryConfig = cudasw4::MemoryConfig; + using ScanResult = cudasw4::ScanResult; + + KernelTypeConfig kernelTypeConfig; + kernelTypeConfig.singlePassType = options.singlePassType; + kernelTypeConfig.manyPassType_small = options.manyPassType_small; + kernelTypeConfig.manyPassType_large = options.manyPassType_large; + kernelTypeConfig.overflowType = options.overflowType; + + MemoryConfig memoryConfig; + memoryConfig.maxBatchBytes = options.maxBatchBytes; + memoryConfig.maxBatchSequences = options.maxBatchSequences; + memoryConfig.maxTempBytes = options.maxTempBytes; + memoryConfig.maxGpuMem = options.maxGpuMem; + + cudasw4::CudaSW4 cudaSW4( + deviceIds, + options.numTopOutputs, + options.blosumType, + kernelTypeConfig, + memoryConfig, + options.verbose + ); + + if(options.verbose){ + std::cout << "Reading Database: \n"; + } + try{ + helpers::CpuTimer timer_read_db("Read DB"); + constexpr bool writeAccess = false; + const bool prefetchSeq = options.prefetchDBFile; + auto fullDB_tmp = std::make_shared(cudasw4::loadDB(options.dbPrefix, writeAccess, prefetchSeq)); + if(options.verbose){ + timer_read_db.print(); + } + + cudaSW4.setDatabase(fullDB_tmp); + }catch(cudasw4::LoadDBException& ex){ + if(options.verbose){ + std::cout << "Failed to map db files. Using fallback db. 
Error message: " << ex.what() << "\n"; + } + helpers::CpuTimer timer_read_db("Read DB"); + auto fullDB_tmp = std::make_shared(cudasw4::loadDBWithVectors(options.dbPrefix)); + if(options.verbose){ + timer_read_db.print(); + } + + cudaSW4.setDatabase(fullDB_tmp); + } + + const char* letters = "ARNDCQEGHILKMFPSTWYV"; + + std::mt19937 gen(42); + std::uniform_int_distribution<> dist(0,19); + + std::ofstream logfile("log1.txt", std::ios::app); + + for(int l = 8192+4096; l <= 65536; l += 4096){ + if(l == 16384 || l == 32768 || l == 65536) continue; + + std::string sequence(l, ' '); + for(int i = 0; i < l; i++){ + sequence[i] = letters[dist(gen)]; + } + + if(options.verbose){ + cudaSW4.printDBInfo(); + if(options.printLengthPartitions){ + cudaSW4.printDBLengthPartitions(); + } + } + + if(options.loadFullDBToGpu){ + cudaSW4.prefetchDBToGpus(); + } + + //std::cout << "Processing query length " << sequence.size() << "\n"; + std::cout << "Processing query length " << sequence.size(); + + logfile << sequence.size() << "\n"; + + std::vector> gcupsMatrix; + for(int groupsize = 1; groupsize <= 16; groupsize *= 2){ + std::cout.flush(); + + std::vector gcupsVector; + for(int numRegs = 4; numRegs <= 64; numRegs += 4){ + //if(groupsize * numRegs * 2 >= int(sequence.size())){ + if(groupsize >= 8){ + cudaSW4.setGroupConfig(groupsize, numRegs); + + cudasw4::DecodedQueryView queryView(sequence.data(), sequence.size()); + ScanResult scanResult = cudaSW4.scan(queryView, std::nullopt); + gcupsVector.push_back(scanResult.stats.gcups); + }else{ + gcupsVector.push_back(0); + } + } + gcupsMatrix.push_back(gcupsVector); + } + + + + float bestGcups = 0; + int bestgroupsizeIndex = 0; + int bestregsIndex = 0; + for(int r = 0; r < int(gcupsMatrix.size()); r++){ + const auto& vec = gcupsMatrix[r]; + for(int c = 0; c < int(vec.size()); c++){ + logfile << vec[c] << " "; + //std::cout << vec[c] << " "; + //printf("%5f ", vec[c]); + if(vec[c] > bestGcups){ + bestgroupsizeIndex = r; + bestregsIndex = c; + bestGcups = vec[c]; + } + } + logfile << "\n"; + //printf("\n"); + //std::cout << "\n"; + } + logfile << "Best: " << bestGcups << " " << (1u << bestgroupsizeIndex) << " " << (4 * (1+bestregsIndex)) << "\n"; + std::cout << ", Best: " << bestGcups << " " << (1u << bestgroupsizeIndex) << " " << (4 * (1+bestregsIndex)) << "\n"; + //<< " " << bestgroupsizeIndex << " " << bestregsIndex << "\n"; + + } + + return 0; +} + + + + + + + + + + + + +int lengthbenchmarkReal(int argc, char* argv[], int firstLength, int lastLength, int stepLength){ + ProgramOptions options; + bool parseSuccess = parseArgs(argc, argv, options); + + if(!parseSuccess || options.help){ + printHelp(argc, argv); + return 0; + } + + options.usePseudoDB = false; + options.pseudoDBSameSequence = false; + options.verbose = true; + options.loadFullDBToGpu = true; + options.numTopOutputs = 0; + options.verbose = false; + + printOptions(options); + + std::vector deviceIds; + { + int num = 0; + cudaGetDeviceCount(&num); CUERR + for(int i = 0; i < num; i++){ + deviceIds.push_back(i); + } + if(deviceIds.size() > 0){ + if(options.verbose){ + std::cout << "Will use GPU"; + for(auto x : deviceIds){ + std::cout << " " << x; + } + std::cout << "\n"; + } + }else{ + throw std::runtime_error("No GPU found"); + } + } + + helpers::PeerAccess peerAccess(deviceIds, false); + + using KernelTypeConfig = cudasw4::KernelTypeConfig; + using MemoryConfig = cudasw4::MemoryConfig; + using ScanResult = cudasw4::ScanResult; + + KernelTypeConfig kernelTypeConfig; + kernelTypeConfig.singlePassType = 
options.singlePassType; + kernelTypeConfig.manyPassType_small = options.manyPassType_small; + kernelTypeConfig.manyPassType_large = options.manyPassType_large; + kernelTypeConfig.overflowType = options.overflowType; + + MemoryConfig memoryConfig; + memoryConfig.maxBatchBytes = options.maxBatchBytes; + memoryConfig.maxBatchSequences = options.maxBatchSequences; + memoryConfig.maxTempBytes = options.maxTempBytes; + memoryConfig.maxGpuMem = options.maxGpuMem; + + cudasw4::CudaSW4 cudaSW4( + deviceIds, + options.numTopOutputs, + options.blosumType, + kernelTypeConfig, + memoryConfig, + options.verbose + ); + + if(options.verbose){ + std::cout << "Reading Database: \n"; + } + try{ + helpers::CpuTimer timer_read_db("Read DB"); + constexpr bool writeAccess = false; + const bool prefetchSeq = options.prefetchDBFile; + auto fullDB_tmp = std::make_shared(cudasw4::loadDB(options.dbPrefix, writeAccess, prefetchSeq)); + if(options.verbose){ + timer_read_db.print(); + } + + cudaSW4.setDatabase(fullDB_tmp); + }catch(cudasw4::LoadDBException& ex){ + if(options.verbose){ + std::cout << "Failed to map db files. Using fallback db. Error message: " << ex.what() << "\n"; + } + helpers::CpuTimer timer_read_db("Read DB"); + auto fullDB_tmp = std::make_shared(cudasw4::loadDBWithVectors(options.dbPrefix)); + if(options.verbose){ + timer_read_db.print(); + } + + cudaSW4.setDatabase(fullDB_tmp); + } + + const char* letters = "ARNDCQEGHILKMFPSTWYV"; + + std::mt19937 gen(42); + std::uniform_int_distribution<> dist(0,19); + + for(int l = firstLength; l <= lastLength; l += stepLength){ + + std::string sequence(l, ' '); + for(int i = 0; i < l; i++){ + sequence[i] = letters[dist(gen)]; + } + + if(options.verbose){ + cudaSW4.printDBInfo(); + if(options.printLengthPartitions){ + cudaSW4.printDBLengthPartitions(); + } + } + + if(options.loadFullDBToGpu){ + cudaSW4.prefetchDBToGpus(); + } + + std::cout << "Processing query length " << sequence.size(); + + + cudasw4::DecodedQueryView queryView(sequence.data(), sequence.size()); + ScanResult scanResult = cudaSW4.scan(queryView, std::nullopt); + std::cout << scanResult.stats.gcups << " GCUPS\n"; + + } + + return 0; +} + + + + + + +struct BenchmarkData{ + bool dpx; + int tilesize; + int groupsize; + int numRegs; + float gcups; + KernelApproach kernelApproach; +}; + +void writeBenchmarkDataHeader(std::ostream& os){ + os << "tilesize groupsize numRegs dpx kernelApproach gcups" << "\n"; +} + +std::ostream& operator<<(std::ostream& os, const BenchmarkData& data){ + + os << data.tilesize << " " << data.groupsize << " " << data.numRegs + << " " << data.dpx << " " << int(data.kernelApproach) << " " << data.gcups; + return os; +} + + +int peakbenchmarkAllSingleTileConfigs(int argc, char* argv[]){ + ProgramOptions options; + bool parseSuccess = parseArgs(argc, argv, options); + + if(!parseSuccess || options.help){ + printHelp(argc, argv); + return 0; + } + + options.usePseudoDB = true; + options.pseudoDBSameSequence = false; + options.verbose = true; + options.loadFullDBToGpu = true; + options.numTopOutputs = 0; + options.verbose = false; + + printOptions(options); + + std::vector deviceIds; + { + int num = 0; + cudaGetDeviceCount(&num); CUERR + for(int i = 0; i < num; i++){ + deviceIds.push_back(i); + } + if(deviceIds.size() > 0){ + if(options.verbose){ + std::cout << "Will use GPU"; + for(auto x : deviceIds){ + std::cout << " " << x; + } + std::cout << "\n"; + } + }else{ + throw std::runtime_error("No GPU found"); + } + } + + helpers::PeerAccess peerAccess(deviceIds, false); + + using 
KernelTypeConfig = cudasw4::KernelTypeConfig; + using MemoryConfig = cudasw4::MemoryConfig; + using ScanResult = cudasw4::ScanResult; + + KernelTypeConfig kernelTypeConfig; + kernelTypeConfig.singlePassType = options.singlePassType; + kernelTypeConfig.manyPassType_small = options.manyPassType_small; + kernelTypeConfig.manyPassType_large = options.manyPassType_large; + kernelTypeConfig.overflowType = options.overflowType; + + MemoryConfig memoryConfig; + memoryConfig.maxBatchBytes = options.maxBatchBytes; + memoryConfig.maxBatchSequences = options.maxBatchSequences; + memoryConfig.maxTempBytes = options.maxTempBytes; + memoryConfig.maxGpuMem = options.maxGpuMem; + + const char* letters = "ARNDCQEGHILKMFPSTWYV"; + + std::mt19937 gen(42); + std::uniform_int_distribution<> dist(0,19); + + + + + + std::vector allBenchmarkData; + + writeBenchmarkDataHeader(std::cout); + + auto execute = [&](std::string /*outputfilename*/, KernelApproach kernelApproach, bool useDPX){ + kernelTypeConfig.singlePassType = useDPX ? cudasw4::KernelType::DPXs16 : cudasw4::KernelType::Half2; + + cudasw4::CudaSW4 cudaSW4( + deviceIds, + options.numTopOutputs, + options.blosumType, + kernelTypeConfig, + memoryConfig, + options.verbose + ); + + // std::ofstream logfile(outputfilename); + + std::vector benchmarkDataVec; + + for(int groupsize : {4,8,16}){ + // for(int groupsize : {4}){ + // for(int groupsize : {8,16}){ + for(int numRegs : {4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64}){ + // for(int groupsize : {4,8,16}){ + // for(int numRegs : {32}){ + const int l = groupsize * numRegs * 2; + if(l > 2048) continue; + + BenchmarkData benchmarkData; + benchmarkData.dpx = useDPX; + benchmarkData.tilesize = l; + benchmarkData.groupsize = groupsize; + benchmarkData.numRegs = numRegs; + benchmarkData.kernelApproach = kernelApproach; + + std::string sequence(l, ' '); + for(int i = 0; i < l; i++){ + sequence[i] = letters[dist(gen)]; + } + + options.pseudoDBLength = l; + options.pseudoDBSize = 5000000; + + if(options.verbose){ + std::cout << "Generating pseudo db\n"; + } + helpers::CpuTimer timer_read_db("Generate DB"); + auto fullDB_tmp = std::make_shared(cudasw4::loadPseudoDB( + options.pseudoDBSize, + options.pseudoDBLength, + options.pseudoDBSameSequence + )); + if(options.verbose){ + timer_read_db.print(); + } + + cudaSW4.setDatabase(fullDB_tmp); + cudaSW4.setGaplessHalf2KernelApproach(kernelApproach); + cudaSW4.setGaplessDPXKernelApproach(kernelApproach); + + if(options.verbose){ + cudaSW4.printDBInfo(); + if(options.printLengthPartitions){ + cudaSW4.printDBLengthPartitions(); + } + } + + if(options.loadFullDBToGpu){ + cudaSW4.prefetchDBToGpus(); + } + + // std::cout << "Processing query length " << sequence.size() << "\n"; + // std::cout << "Processing query length " << sequence.size(); + + // logfile << sequence.size() << "\n"; + + cudaSW4.setGroupConfig(groupsize, numRegs); + cudasw4::DecodedQueryView queryView(sequence.data(), sequence.size()); + ScanResult scanResult = cudaSW4.scan(queryView, std::nullopt); + + benchmarkData.gcups = scanResult.stats.gcups; + + benchmarkDataVec.push_back(benchmarkData); + + std::cout << benchmarkData << "\n"; + + // std::cout << l << " " << groupsize << " " << numRegs << " " << scanResult.stats.gcups << " GCUPS\n"; + + + } + } + + return benchmarkDataVec; + }; + + for(auto kernelApproach : {KernelApproach::hardcodedzero, KernelApproach::kernelparamzero}){ + auto resultNoDpx = execute("", kernelApproach, false); + allBenchmarkData.insert(allBenchmarkData.end(), resultNoDpx.begin(), 
resultNoDpx.end()); + + int ccMajor = 0; + cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, 0); + const bool supportsDPX = ccMajor >= 9; + if(supportsDPX){ + auto resultDpx = execute("", kernelApproach, true); + allBenchmarkData.insert(allBenchmarkData.end(), resultDpx.begin(), resultDpx.end()); + } + } + + auto bestConfigs = allBenchmarkData; + std::sort(bestConfigs.begin(), bestConfigs.end(), [](const auto& l, const auto& r){ + if(l.tilesize < r.tilesize) return true; + if(l.tilesize > r.tilesize) return false; + return l.gcups > r.gcups; + }); + + std::cout << "sorted\n"; + std::copy(bestConfigs.begin(), bestConfigs.end(), std::ostream_iterator(std::cout, "\n")); + + //only keep best for each tilesize + bestConfigs.erase( + std::unique(bestConfigs.begin(), bestConfigs.end(), [](const auto& l, const auto& r){ + return l.tilesize == r.tilesize; + }), + bestConfigs.end() + ); + + std::cout << "best\n"; + std::copy(bestConfigs.begin(), bestConfigs.end(), std::ostream_iterator(std::cout, "\n")); + + return 0; +} \ No newline at end of file diff --git a/lib/libmarv/src/blosum.cu b/lib/libmarv/src/blosum.cu new file mode 100644 index 000000000..d68cc0739 --- /dev/null +++ b/lib/libmarv/src/blosum.cu @@ -0,0 +1,121 @@ +#include "blosum.hpp" +#include "util.cuh" + +#include + +namespace cudasw4{ + + #ifdef __CUDACC__ + __constant__ std::int8_t deviceBlosum[25*25]; + __constant__ int deviceBlosumDim; + __constant__ int deviceBlosumDimSquared; + #endif + + std::int8_t hostBlosum[25*25]; + int hostBlosumDim; + int hostBlosumDimSquared; + + //set host and device global variables + + + void setProgramWideBlosum(BlosumType blosumType, const std::vector& deviceIds){ + switch(blosumType){ + case BlosumType::BLOSUM45: + { + const auto blosum = BLOSUM45::get1D(); + const int dim = BLOSUM45::dim; + hostBlosumDim = dim; + hostBlosumDimSquared = dim * dim; + auto it = std::copy(blosum.begin(), blosum.end(), hostBlosum); + assert(std::distance(hostBlosum, it) <= 25 * 25); + } + break; + case BlosumType::BLOSUM50: + { + const auto blosum = BLOSUM50::get1D(); + const int dim = BLOSUM50::dim; + hostBlosumDim = dim; + hostBlosumDimSquared = dim * dim; + auto it = std::copy(blosum.begin(), blosum.end(), hostBlosum); + assert(std::distance(hostBlosum, it) <= 25 * 25); + } + break; + case BlosumType::BLOSUM62: + { + const auto blosum = BLOSUM62::get1D(); + const int dim = BLOSUM62::dim; + hostBlosumDim = dim; + hostBlosumDimSquared = dim * dim; + auto it = std::copy(blosum.begin(), blosum.end(), hostBlosum); + assert(std::distance(hostBlosum, it) <= 25 * 25); + } + break; + case BlosumType::BLOSUM80: + { + const auto blosum = BLOSUM80::get1D(); + const int dim = BLOSUM80::dim; + hostBlosumDim = dim; + hostBlosumDimSquared = dim * dim; + auto it = std::copy(blosum.begin(), blosum.end(), hostBlosum); + assert(std::distance(hostBlosum, it) <= 25 * 25); + } + break; + case BlosumType::BLOSUM45_20: + { + const auto blosum = BLOSUM45_20::get1D(); + const int dim = BLOSUM45_20::dim; + hostBlosumDim = dim; + hostBlosumDimSquared = dim * dim; + auto it = std::copy(blosum.begin(), blosum.end(), hostBlosum); + assert(std::distance(hostBlosum, it) <= 25 * 25); + } + break; + case BlosumType::BLOSUM50_20: + { + const auto blosum = BLOSUM50_20::get1D(); + const int dim = BLOSUM50_20::dim; + hostBlosumDim = dim; + hostBlosumDimSquared = dim * dim; + auto it = std::copy(blosum.begin(), blosum.end(), hostBlosum); + assert(std::distance(hostBlosum, it) <= 25 * 25); + } + break; + case BlosumType::BLOSUM62_20: + { + 
const auto blosum = BLOSUM62_20::get1D(); + const int dim = BLOSUM62_20::dim; + hostBlosumDim = dim; + hostBlosumDimSquared = dim * dim; + auto it = std::copy(blosum.begin(), blosum.end(), hostBlosum); + assert(std::distance(hostBlosum, it) <= 25 * 25); + } + break; + case BlosumType::BLOSUM80_20: + { + const auto blosum = BLOSUM80_20::get1D(); + const int dim = BLOSUM80_20::dim; + hostBlosumDim = dim; + hostBlosumDimSquared = dim * dim; + auto it = std::copy(blosum.begin(), blosum.end(), hostBlosum); + assert(std::distance(hostBlosum, it) <= 25 * 25); + } + break; + default: + assert(false && "unimplemented blosum copy"); + break; + } + #ifdef __CUDACC__ + RevertDeviceId rdi{}; + + int numGpus = deviceIds.size(); + + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + cudaMemcpyToSymbol(deviceBlosum, &(hostBlosum[0]), sizeof(std::int8_t) * hostBlosumDim * hostBlosumDim); CUERR; + cudaMemcpyToSymbol(deviceBlosumDim, &hostBlosumDim, sizeof(int)); CUERR; + cudaMemcpyToSymbol(deviceBlosumDimSquared, &hostBlosumDimSquared, sizeof(int)); CUERR; + } + #endif + } + +} //namespace cudasw4 \ No newline at end of file diff --git a/lib/libmarv/src/blosum.hpp b/lib/libmarv/src/blosum.hpp new file mode 100644 index 000000000..0af66d529 --- /dev/null +++ b/lib/libmarv/src/blosum.hpp @@ -0,0 +1,30 @@ +#ifndef BLOSUM_HPP +#define BLOSUM_HPP + +#include "types.hpp" +#include "util.cuh" +#include +#include +#include +#include + +namespace cudasw4{ + +#ifdef __CUDACC__ + +extern __constant__ std::int8_t deviceBlosum[25*25]; +extern __constant__ int deviceBlosumDim; +extern __constant__ int deviceBlosumDimSquared; + +#endif + +extern std::int8_t hostBlosum[25*25]; +extern int hostBlosumDim; +extern int hostBlosumDimSquared; + +//set host and device global blosum variables +void setProgramWideBlosum(BlosumType blosumType, const std::vector& deviceIds); + +} //namespace cudasw4 + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/config.hpp b/lib/libmarv/src/config.hpp new file mode 100644 index 000000000..180de37a8 --- /dev/null +++ b/lib/libmarv/src/config.hpp @@ -0,0 +1,61 @@ +#ifndef CONFIG_HPP +#define CONFIG_HPP + +#include +#include + +namespace cudasw4{ + +//MODIFY AT OWN RISK + +//data type to enumerate all sequences in the database +using ReferenceIdT = std::int32_t; + +//data type for length of of both query sequences and databases sequences +using SequenceLengthT = std::int32_t; + +static_assert(std::is_same_v, "unexpected reference type"); +static_assert(std::is_same_v, "unexpected sequence length type"); + +struct MaxSequencesInDB{ + static constexpr ReferenceIdT value(){ + return std::numeric_limits::max() - 1; + } +}; + +struct MaxSequenceLength{ + static constexpr SequenceLengthT value(){ + return std::numeric_limits::max() - 128 - 4; + } +}; + +struct MaxNumberOfResults{ + static constexpr int value(){ + return 512*1024; + } +}; + +struct alignas(8) AlignmentEndPosition{ + int x; + int y; + + #ifdef __CUDACC__ + __host__ __device__ + #endif + int getQueryEndInclusive() const{ + return x; + } + + #ifdef __CUDACC__ + __host__ __device__ + #endif + int getSubjectEndInclusive() const{ + return y; + } +}; + + +} //namespace cudasw4 + + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/convert.cuh b/lib/libmarv/src/convert.cuh new file mode 100644 index 000000000..acb297d03 --- /dev/null +++ b/lib/libmarv/src/convert.cuh @@ -0,0 +1,322 @@ +#ifndef CONVERT_CUH +#define CONVERT_CUH + +#include +#include + +namespace cudasw4{ + +struct 
ConvertAA_20{ + #ifdef __CUDACC__ + __host__ __device__ + #endif + char operator()(const char& AA) { + // ORDER of AminoAcids following MMseqs2 + // lower-case has upper-case encoding + switch(AA & ~32) { // bit twiddling to turn all AA to uppercase + case 'A': return 0; + case 'C': return 1; + case 'D': + case 'B': return 2; + case 'Z': + case 'E': return 3; + case 'F': return 4; + case 'G': return 5; + case 'H': return 6; + case 'I': return 7; + case 'K': return 8; + case 'J': + case 'L': return 9; + case 'M': return 10; + case 'N': return 11; + case 'P': return 12; + case 'Q': return 13; + case 'R': return 14; + case 'S': return 15; + case 'T': return 16; + case 'V': return 17; + case 'W': return 18; + case 'Y': return 19; + default: return 20; + } + } +}; + +struct InverseConvertAA_20{ + #ifdef __CUDACC__ + __host__ __device__ + #endif + char operator()(const char& AA) { + switch(AA) { + case 0: return 'A'; + case 1: return 'C'; + case 2: return 'D'; + case 3: return 'E'; + case 4: return 'F'; + case 5: return 'G'; + case 6: return 'H'; + case 7: return 'I'; + case 8: return 'K'; + case 9: return 'L'; + case 10: return 'M'; + case 11: return 'N'; + case 12: return 'P'; + case 13: return 'Q'; + case 14: return 'R'; + case 15: return 'S'; + case 16: return 'T'; + case 17: return 'V'; + case 18: return 'W'; + case 19: return 'Y'; + default: return 'X'; + } + } +}; + + +struct ConvertAA_20_CaseSensitive{ + #ifdef __CUDACC__ + __host__ __device__ + #endif + char operator()(const char& AA) { + switch (AA) { + case 'A': return 0; + case 'C': return 1; + case 'B': + case 'D': return 2; + case 'Z': + case 'E': return 3; + case 'F': return 4; + case 'G': return 5; + case 'H': return 6; + case 'I': return 7; + case 'K': return 8; + case 'J': + case 'L': return 9; + case 'M': return 10; + case 'N': return 11; + case 'P': return 12; + case 'Q': return 13; + case 'R': return 14; + case 'S': return 15; + case 'T': return 16; + case 'V': return 17; + case 'W': return 18; + case 'Y': return 19; + + case 'a': return 32; + case 'c': return 33; + case 'b': + case 'd': return 34; + case 'z': + case 'e': return 35; + case 'f': return 36; + case 'g': return 37; + case 'h': return 38; + case 'i': return 39; + case 'k': return 40; + case 'j': + case 'l': return 41; + case 'm': return 42; + case 'n': return 43; + case 'p': return 44; + case 'q': return 45; + case 'r': return 46; + case 's': return 47; + case 't': return 48; + case 'v': return 49; + case 'w': return 50; + case 'y': return 51; + default: return 20; + } + } +}; + +struct InverseConvertAA_20_CaseSensitive{ + #ifdef __CUDACC__ + __host__ __device__ + #endif + char operator()(const char& AA) { + switch (AA) { + case 0: return 'A'; + case 1: return 'C'; + case 2: return 'D'; + case 3: return 'E'; + case 4: return 'F'; + case 5: return 'G'; + case 6: return 'H'; + case 7: return 'I'; + case 8: return 'K'; + case 9: return 'L'; + case 10: return 'M'; + case 11: return 'N'; + case 12: return 'P'; + case 13: return 'Q'; + case 14: return 'R'; + case 15: return 'S'; + case 16: return 'T'; + case 17: return 'V'; + case 18: return 'W'; + case 19: return 'Y'; + case 32: return 'a'; + case 33: return 'c'; + case 34: return 'd'; + case 35: return 'e'; + case 36: return 'f'; + case 37: return 'g'; + case 38: return 'h'; + case 39: return 'i'; + case 40: return 'k'; + case 41: return 'l'; + case 42: return 'm'; + case 43: return 'n'; + case 44: return 'p'; + case 45: return 'q'; + case 46: return 'r'; + case 47: return 's'; + case 48: return 't'; + case 49: 
return 'v'; + case 50: return 'w'; + case 51: return 'y'; + default: return '-'; + } + } +}; + +struct CaseSensitive_to_CaseInsensitive{ + #ifdef __CUDACC__ + __host__ __device__ + #endif + char operator()(const char& AA) { + return AA % 32; + } + + //vectorized for 4 values packed in a single int + #ifdef __CUDACC__ + __host__ __device__ + #endif + unsigned int operator()(const unsigned int& packed4) { + constexpr unsigned int mod32mask = 0x1F1F1F1F; + return packed4 & mod32mask; + } +}; + +/* + Map lower-case encoded letters to "invalid letter" +*/ +struct ClampToInvalid{ + #ifdef __CUDACC__ + __host__ __device__ + #endif + char operator()(const char& AA) { + return AA < 20 ? AA : 20; + } + + //vectorized for 4 values packed in a single int + #ifdef __CUDACC__ + __device__ + #endif + unsigned int operator()(const unsigned int& packed4) { + #ifdef __CUDA_ARCH__ + + constexpr unsigned int mask20 = 0x14141414; // decimal 20 per byte + return __vminu4(packed4, mask20); + + #else + + char asChar[4]; + std::memcpy(&asChar[0], &packed4, sizeof(unsigned int)); + asChar[0] = operator()(asChar[0]); + asChar[1] = operator()(asChar[1]); + asChar[2] = operator()(asChar[2]); + asChar[3] = operator()(asChar[3]); + + unsigned int result; + std::memcpy(&result, &asChar[0], sizeof(unsigned int)); + return result; + + #endif + } + +}; + + + + +struct ConvertAA_20_mmseqs_to_ncbi{ + #ifdef __CUDACC__ + __host__ __device__ + #endif + char operator()(const char& encodedAA) { + // A R N D C Q E G H I L K M F P S T W Y V + // (NCBI) 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 + // (mmseqs) 0 14 11 2 1 13 3 5 6 7 9 8 10 4 12 15 16 18 19 17 + constexpr std::array mmseqsToNcbi{ + 0, //A + 4, //C + 3, //D + 6, //E + 13, //F + 7, //G + 8, //H + 9, //I + 11, //K + 10, //L + 12, //M + 2, //N + 14, //P + 5, //Q + 1, //R + 15, //S + 16, //T + 19, //V + 17, //W + 18, //Y + 20, //else + }; + + return mmseqsToNcbi[encodedAA]; + } +}; + +struct ConvertAA_20_ncbi_to_mmseqs{ + #ifdef __CUDACC__ + __host__ __device__ + #endif + char operator()(const char& encodedAA) { + // A R N D C Q E G H I L K M F P S T W Y V + // (NCBI) 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 + // (mmseqs) 0 14 11 2 1 13 3 5 6 7 9 8 10 4 12 15 16 18 19 17 + constexpr std::array ncbiToMMseqs{ + 0, + 14, + 11, + 2, + 1, + 13, + 3, + 5, + 6, + 7, + 9, + 8, + 10, + 4, + 12, + 15, + 16, + 18, + 19, + 17, + 20, //else + }; + + return ncbiToMMseqs[encodedAA]; + } +}; + + + +} //namespace cudasw4 + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/cudasw4.cuh b/lib/libmarv/src/cudasw4.cuh new file mode 100644 index 000000000..378d017e4 --- /dev/null +++ b/lib/libmarv/src/cudasw4.cuh @@ -0,0 +1,4407 @@ +#ifndef CUDASW4_CUH +#define CUDASW4_CUH + +#include "hpc_helpers/cuda_raiiwrappers.cuh" +#include "hpc_helpers/all_helpers.cuh" +#include "hpc_helpers/nvtx_markers.cuh" +#include "hpc_helpers/simple_allocation.cuh" + +#include "config.hpp" +#include "dbdata.hpp" +#include "length_partitions.hpp" +#include "util.cuh" +#include "kernels.cuh" +#include "blosum.hpp" +#include "types.hpp" +#include "dbbatching.cuh" +#include "convert.cuh" +#include "target_subject_ids.cuh" +#include "gapless_kernel_config.cuh" +#include "smithwaterman_kernel_config.cuh" +#include "types.hpp" + +#include "pssm.cuh" +#include "pssmkernels_gapless.cuh" +#include "pssmkernels_smithwaterman.cuh" + +#include "gpudatabaseallocation.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include 
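+// Overview: this header provides the host-side driver types (QueryView, ScanResult,
+// MemoryConfig, host/device GPU partition offsets) and the CudaSW4 class with its
+// per-GPU working sets that cache database batches in device memory.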
+#include +#include + +namespace cudasw4{ + + template + struct RoundToNextMultiple{ + __host__ __device__ + T operator()(const T& value){ + return SDIV(value, factor) * factor; + } + }; + + struct CompareScoresDescendingRefIdsAscending{ + template + __host__ __device__ + bool operator()(const Tuple1& lhs, const Tuple2& rhs) const{ + const auto scoreL = thrust::get<0>(lhs); + const auto refIdL = thrust::get<1>(lhs); + const auto scoreR = thrust::get<0>(rhs); + const auto refIdR = thrust::get<1>(rhs); + if(scoreL < scoreR) return false; + if(scoreL > scoreR) return true; + //scores are equal + return refIdL < refIdR; + } + }; + + __global__ + void addKernel(int* output, const int* input1, const int* input2){ + *output = *input1 + *input2; + } + + + __global__ + void sumNumOverflowsKernel(int* output, const int* input, int numGpus){ + int sum = input[0]; + for(int gpu = 1; gpu < numGpus; gpu++){ + sum += input[gpu]; + } + output[0] = sum; + } + + template + __global__ + void transformLocalSequenceIndicesToGlobalIndices( + int gpu, + int N, + PartitionOffsets partitionOffsets, + Indices maxReduceArrayIndices + ){ + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + if(tid < N){ + maxReduceArrayIndices[tid] = partitionOffsets.getGlobalIndex(gpu, maxReduceArrayIndices[tid]); + } + } + + template + struct QueryView{ + static constexpr bool isEncoded = isEncoded_; + + QueryView(const char* ptr_, SequenceLengthT length_) : ptr(ptr_), length(length_){ + if(ptr == nullptr) throw std::runtime_error("QueryView constructed from nullptr"); + } + QueryView(const QueryView&) = default; + QueryView& operator=(const QueryView&) = default; + + const char* ptr{}; + SequenceLengthT length{}; + }; + using DecodedQueryView = QueryView; + using EncodedQueryView = QueryView; + + struct KernelConfigFilenames{ + std::optional gapless; + std::optional sw; + }; + + struct BenchmarkStats{ + int numOverflows{}; + double seconds{}; + double gcups{}; + }; + + struct ScanResult{ + std::vector scores{}; + std::vector referenceIds{}; + std::vector endPositions{}; + BenchmarkStats stats{}; + }; + + struct MemoryConfig{ + size_t maxBatchBytes = 128ull * 1024ull * 1024ull; + size_t maxBatchSequences = 10'000'000; + size_t maxTempBytes = 4ull * 1024ull * 1024ull * 1024ull; + size_t maxGpuMem = std::numeric_limits::max(); + }; + + struct HostGpuPartitionOffsets{ + int numGpus; + int numLengthPartitions; + std::vector partitionSizes; + std::vector horizontalPS; + std::vector verticalPS; + std::vector totalPerLengthPartitionPS; + + HostGpuPartitionOffsets() = default; + + HostGpuPartitionOffsets(int numGpus_, int numLengthpartitions_, std::vector partitionSizes_) + : numGpus(numGpus_), + numLengthPartitions(numLengthpartitions_), + partitionSizes(std::move(partitionSizes_)), + horizontalPS(numGpus * numLengthPartitions, 0), + verticalPS(numGpus * numLengthPartitions, 0), + totalPerLengthPartitionPS(numLengthPartitions, 0) + { + assert(partitionSizes.size() == numGpus * numLengthPartitions); + + for(int gpu = 0; gpu < numGpus; gpu++){ + for(int l = 1; l < numLengthPartitions; l++){ + horizontalPS[gpu * numLengthPartitions + l] = horizontalPS[gpu * numLengthPartitions + l-1] + partitionSizes[gpu * numLengthPartitions + l-1]; + } + } + for(int l = 0; l < numLengthPartitions; l++){ + for(int gpu = 1; gpu < numGpus; gpu++){ + verticalPS[gpu * numLengthPartitions + l] = verticalPS[(gpu-1) * numLengthPartitions + l] + partitionSizes[(gpu-1) * numLengthPartitions + l]; + } + } + for(int l = 1; l < numLengthPartitions; l++){ + 
totalPerLengthPartitionPS[l] = totalPerLengthPartitionPS[l-1] + + (verticalPS[(numGpus - 1) * numLengthPartitions + (l-1)] + partitionSizes[(numGpus-1) * numLengthPartitions + (l-1)]); + } + } + + size_t getGlobalIndex(int gpu, size_t localIndex) const { + const size_t* const myHorizontalPS = horizontalPS.data() + gpu * numLengthPartitions; + const auto it = std::lower_bound(myHorizontalPS, myHorizontalPS + numLengthPartitions, localIndex+1); + const int whichPartition = std::distance(myHorizontalPS, it) - 1; + const size_t occurenceInPartition = localIndex - myHorizontalPS[whichPartition]; + const size_t globalPartitionBegin = totalPerLengthPartitionPS[whichPartition]; + const size_t elementsOfOtherPreviousGpusInPartition = verticalPS[gpu * numLengthPartitions + whichPartition]; + //std::cout << "whichPartition " << whichPartition << ", occurenceInPartition " << occurenceInPartition + // << ", globalPartitionBegin " << globalPartitionBegin << ", elementsOfOtherPreviousGpusInPartition " << elementsOfOtherPreviousGpusInPartition << "\n"; + return globalPartitionBegin + elementsOfOtherPreviousGpusInPartition + occurenceInPartition; + }; + + void print(std::ostream& os){ + os << "numGpus " << numGpus << "\n"; + os << "numLengthPartitions " << numLengthPartitions << "\n"; + os << "partitionSizes\n"; + for(int gpu = 0; gpu < numGpus; gpu++){ + for(int l = 0; l < numLengthPartitions; l++){ + os << partitionSizes[gpu * numLengthPartitions + l] << " "; + } + os << "\n"; + } + os << "\n"; + os << "horizontalPS\n"; + for(int gpu = 0; gpu < numGpus; gpu++){ + for(int l = 0; l < numLengthPartitions; l++){ + os << horizontalPS[gpu * numLengthPartitions + l] << " "; + } + os << "\n"; + } + os << "\n"; + os << "verticalPS\n"; + for(int gpu = 0; gpu < numGpus; gpu++){ + for(int l = 0; l < numLengthPartitions; l++){ + os << verticalPS[gpu * numLengthPartitions + l] << " "; + } + os << "\n"; + } + os << "\n"; + os << "totalPerLengthPartitionPS\n"; + for(int l = 0; l < numLengthPartitions; l++){ + os << totalPerLengthPartitionPS[l] << " "; + } + os << "\n"; + } + }; + + struct DeviceGpuPartitionOffsets{ + template + using MyDeviceBuffer = helpers::SimpleAllocationDevice; + + int numGpus; + int numLengthPartitions; + MyDeviceBuffer partitionSizes; + MyDeviceBuffer horizontalPS; + MyDeviceBuffer verticalPS; + MyDeviceBuffer totalPerLengthPartitionPS; + + struct View{ + int numGpus; + int numLengthPartitions; + const size_t* partitionSizes; + const size_t* horizontalPS; + const size_t* verticalPS; + const size_t* totalPerLengthPartitionPS; + + __device__ + size_t getGlobalIndex(int gpu, size_t localIndex) const { + const size_t* const myHorizontalPS = horizontalPS + gpu * numLengthPartitions; + const auto it = thrust::lower_bound(thrust::seq, myHorizontalPS, myHorizontalPS + numLengthPartitions, localIndex+1); + const int whichPartition = thrust::distance(myHorizontalPS, it) - 1; + const size_t occurenceInPartition = localIndex - myHorizontalPS[whichPartition]; + const size_t globalPartitionBegin = totalPerLengthPartitionPS[whichPartition]; + const size_t elementsOfOtherPreviousGpusInPartition = verticalPS[gpu * numLengthPartitions + whichPartition]; + return globalPartitionBegin + elementsOfOtherPreviousGpusInPartition + occurenceInPartition; + }; + }; + + DeviceGpuPartitionOffsets() = default; + DeviceGpuPartitionOffsets(const HostGpuPartitionOffsets& hostData) + : numGpus(hostData.numGpus), + numLengthPartitions(hostData.numLengthPartitions), + partitionSizes(numGpus * numLengthPartitions), + 
horizontalPS(numGpus * numLengthPartitions), + verticalPS(numGpus * numLengthPartitions), + totalPerLengthPartitionPS(numLengthPartitions) + { + cudaMemcpyAsync(partitionSizes.data(), hostData.partitionSizes.data(), sizeof(size_t) * numGpus * numLengthPartitions, H2D, cudaStreamLegacy); CUERR; + cudaMemcpyAsync(horizontalPS.data(), hostData.horizontalPS.data(), sizeof(size_t) * numGpus * numLengthPartitions, H2D, cudaStreamLegacy); CUERR; + cudaMemcpyAsync(verticalPS.data(), hostData.verticalPS.data(), sizeof(size_t) * numGpus * numLengthPartitions, H2D, cudaStreamLegacy); CUERR; + cudaMemcpyAsync(totalPerLengthPartitionPS.data(), hostData.totalPerLengthPartitionPS.data(), sizeof(size_t) * numLengthPartitions, H2D, cudaStreamLegacy); CUERR; + } + + View getDeviceView() const{ + View view; + view.numGpus = numGpus; + view.numLengthPartitions = numLengthPartitions; + view.partitionSizes = partitionSizes.data(); + view.horizontalPS = horizontalPS.data(); + view.verticalPS = verticalPS.data(); + view.totalPerLengthPartitionPS = totalPerLengthPartitionPS.data(); + return view; + } + }; + + + class CudaSW4{ + public: + template + using MyPinnedBuffer = helpers::SimpleAllocationPinnedHost; + template + using MyDeviceBuffer = helpers::SimpleAllocationDevice; + + struct GpuWorkingSet{ + + //using MaxReduceArray = TopNMaximaArray; + using MaxReduceArray = TopNMaximaArray; + using MaxReduceArrayWithEndPositions = TopNMaximaArrayWithExtra; + + GpuWorkingSet( + size_t gpumemlimit, + size_t maxBatchBytes, + size_t maxBatchSequences, + size_t maxTempBytes, + const std::vector& dbPartitions, + const std::vector& dbBatches, + bool needsPinnedStagingBuffers, + int maxReduceArraySize_ = 512 * 1024 + ) : maxReduceArraySize(maxReduceArraySize_) + { + cudaGetDevice(&deviceId); + + size_t numSubjects = 0; + size_t numSubjectBytes = 0; + for(const auto& p : dbPartitions){ + numSubjects += p.numSequences(); + numSubjectBytes += p.numChars(); + } + + d_query.resize(1024*1024); CUERR + gpuFullQueryPSSM.resize(10000, 21); + + numTempBytes = std::min(maxTempBytes, gpumemlimit); + d_tempStorageHE.resize(numTempBytes); + d_maxReduceArrayScores.resize(maxReduceArraySize); + d_maxReduceArrayIndices.resize(maxReduceArraySize); + d_maxReduceArrayExtras.resize(maxReduceArraySize); + + size_t usedGpuMem = 0; + usedGpuMem += numTempBytes; + usedGpuMem += sizeof(float) * maxReduceArraySize; // d_maxReduceArrayScores + usedGpuMem += sizeof(ReferenceIdT) * maxReduceArraySize; // d_maxReduceArrayIndices + usedGpuMem += sizeof(AlignmentEndPosition) * maxReduceArraySize; //d_maxReduceArrayExtras + + if(usedGpuMem > gpumemlimit){ + throw std::runtime_error("Out of memory working set"); + } + + + //devAlignmentScoresFloat.resize(numSubjects); + + forkStreamEvent = CudaEvent{cudaEventDisableTiming}; CUERR; + numWorkStreamsWithoutTemp = 1; + workstreamIndex = 0; + workStreamsWithoutTemp.resize(numWorkStreamsWithoutTemp); + + numCopyBuffers = 2; + + h_chardata_vec.resize(numCopyBuffers); + h_lengthdata_vec.resize(numCopyBuffers); + h_offsetdata_vec.resize(numCopyBuffers); + d_chardata_vec.resize(numCopyBuffers); + d_lengthdata_vec.resize(numCopyBuffers); + d_offsetdata_vec.resize(numCopyBuffers); + copyStreams.resize(numCopyBuffers); + pinnedBufferEvents.resize(numCopyBuffers); + deviceBufferEvents.resize(numCopyBuffers); + //d_total_overflow_number.resize(1); + //d_overflow_number.resize(numCopyBuffers); + //h_overflow_number.resize(numCopyBuffers); + //d_overflow_positions_vec.resize(numCopyBuffers); + + size_t memoryRequiredForFullDB 
= 0; + memoryRequiredForFullDB += numSubjectBytes; // d_fulldb_chardata + memoryRequiredForFullDB += sizeof(SequenceLengthT) * numSubjects; //d_fulldb_lengthdata + memoryRequiredForFullDB += sizeof(size_t) * (numSubjects+1); //d_fulldb_offsetdata + //memoryRequiredForFullDB += sizeof(ReferenceIdT) * numSubjects * 2; //d_overflow_positions_vec + + + + if(memoryRequiredForFullDB <= gpumemlimit){ + numBatchesInCachedDB = dbBatches.size(); + charsOfBatches = numSubjectBytes; + subjectsOfBatches = numSubjects; + d_cacheddb = std::make_shared(numSubjectBytes, numSubjects); + + for(int i = 0; i < numCopyBuffers; i++){ + if(needsPinnedStagingBuffers){ + h_chardata_vec[i].resize(maxBatchBytes); + h_lengthdata_vec[i].resize(maxBatchSequences); + h_offsetdata_vec[i].resize(maxBatchSequences+1); + } + pinnedBufferEvents[i] = CudaEvent{cudaEventDisableTiming}; CUERR; + deviceBufferEvents[i] = CudaEvent{cudaEventDisableTiming}; CUERR; + //d_overflow_positions_vec[i].resize(numSubjects); + } + }else{ + //allocate a double buffer for batch transfering + size_t memoryRequiredForBatchedProcessing = 0; + memoryRequiredForBatchedProcessing += maxBatchBytes * 2; // d_chardata_vec + memoryRequiredForBatchedProcessing += sizeof(SequenceLengthT) * maxBatchSequences * 2; //d_lengthdata_vec + memoryRequiredForBatchedProcessing += sizeof(size_t) * (maxBatchSequences+1) * 2; //d_offsetdata_vec + usedGpuMem += memoryRequiredForBatchedProcessing; + if(usedGpuMem > gpumemlimit){ + throw std::runtime_error("Out of memory working set"); + } + + for(int i = 0; i < numCopyBuffers; i++){ + if(needsPinnedStagingBuffers){ + h_chardata_vec[i].resize(maxBatchBytes); + h_lengthdata_vec[i].resize(maxBatchSequences); + h_offsetdata_vec[i].resize(maxBatchSequences+1); + } + d_chardata_vec[i].resize(maxBatchBytes); + d_lengthdata_vec[i].resize(maxBatchSequences); + d_offsetdata_vec[i].resize(maxBatchSequences+1); + pinnedBufferEvents[i] = CudaEvent{cudaEventDisableTiming}; CUERR; + deviceBufferEvents[i] = CudaEvent{cudaEventDisableTiming}; CUERR; + //d_overflow_positions_vec[i].resize(numSubjects); + } + + //count how many batches fit into remaining gpu memory + + numBatchesInCachedDB = 0; + charsOfBatches = 0; + subjectsOfBatches = 0; + size_t totalRequiredMemForBatches = sizeof(size_t); + for(; numBatchesInCachedDB < dbBatches.size(); numBatchesInCachedDB++){ + const auto& batch = dbBatches[numBatchesInCachedDB]; + const size_t requiredMemForBatch = batch.usedSeq * sizeof(SequenceLengthT) + batch.usedSeq * sizeof(size_t) + batch.usedBytes; + if(usedGpuMem + totalRequiredMemForBatches + requiredMemForBatch <= gpumemlimit){ + //ok, fits + totalRequiredMemForBatches += requiredMemForBatch; + charsOfBatches += batch.usedBytes; + subjectsOfBatches += batch.usedSeq; + }else{ + //does not fit + break; + } + } + assert(numBatchesInCachedDB < dbBatches.size()); + + //std::cout << "numBatchesInCachedDB " << numBatchesInCachedDB << ", charsOfBatches " << charsOfBatches << ", subjectsOfBatches " << subjectsOfBatches << "\n"; + + assert(usedGpuMem + totalRequiredMemForBatches <= gpumemlimit); + d_cacheddb = std::make_shared(charsOfBatches, subjectsOfBatches); + } + } + + GpuWorkingSet( + size_t gpumemlimit, + size_t maxBatchBytes, + size_t maxBatchSequences, + size_t maxTempBytes, + const std::vector& dbPartitions, + const std::vector& dbBatches, + std::shared_ptr existingGpuDBAllocation, + bool needsPinnedStagingBuffers, + int maxReduceArraySize_ = 512 * 1024 + ) : maxReduceArraySize(maxReduceArraySize_) + { + cudaGetDevice(&deviceId); + + 
assert(existingGpuDBAllocation != nullptr); + + size_t numSubjects = 0; + size_t numSubjectBytes = 0; + for(const auto& p : dbPartitions){ + numSubjects += p.numSequences(); + numSubjectBytes += p.numChars(); + } + + d_query.resize(1024*1024); CUERR + gpuFullQueryPSSM.resize(10000, 21); + + numTempBytes = std::min(maxTempBytes, gpumemlimit); + d_tempStorageHE.resize(numTempBytes); + d_maxReduceArrayScores.resize(maxReduceArraySize); + d_maxReduceArrayIndices.resize(maxReduceArraySize); + d_maxReduceArrayExtras.resize(maxReduceArraySize); + + size_t usedGpuMem = 0; + usedGpuMem += numTempBytes; + usedGpuMem += sizeof(float) * maxReduceArraySize; // d_maxReduceArrayScores + usedGpuMem += sizeof(ReferenceIdT) * maxReduceArraySize; // d_maxReduceArrayIndices + usedGpuMem += sizeof(AlignmentEndPosition) * maxReduceArraySize; //d_maxReduceArrayExtras + + if(usedGpuMem > gpumemlimit){ + throw std::runtime_error("Out of memory working set"); + } + + + //devAlignmentScoresFloat.resize(numSubjects); + + forkStreamEvent = CudaEvent{cudaEventDisableTiming}; CUERR; + numWorkStreamsWithoutTemp = 10; + workstreamIndex = 0; + workStreamsWithoutTemp.resize(numWorkStreamsWithoutTemp); + + numCopyBuffers = 2; + + h_chardata_vec.resize(numCopyBuffers); + h_lengthdata_vec.resize(numCopyBuffers); + h_offsetdata_vec.resize(numCopyBuffers); + d_chardata_vec.resize(numCopyBuffers); + d_lengthdata_vec.resize(numCopyBuffers); + d_offsetdata_vec.resize(numCopyBuffers); + copyStreams.resize(numCopyBuffers); + pinnedBufferEvents.resize(numCopyBuffers); + deviceBufferEvents.resize(numCopyBuffers); + //d_total_overflow_number.resize(1); + //d_overflow_number.resize(numCopyBuffers); + //h_overflow_number.resize(numCopyBuffers); + //d_overflow_positions_vec.resize(numCopyBuffers); + + d_cacheddb = existingGpuDBAllocation; + + if(d_cacheddb->getNumChars() >= numSubjectBytes && d_cacheddb->getNumSubjects() >= numSubjects){ + numBatchesInCachedDB = dbBatches.size(); + charsOfBatches = numSubjectBytes; + subjectsOfBatches = numSubjects; + + for(int i = 0; i < numCopyBuffers; i++){ + if(needsPinnedStagingBuffers){ + h_chardata_vec[i].resize(maxBatchBytes); + h_lengthdata_vec[i].resize(maxBatchSequences); + h_offsetdata_vec[i].resize(maxBatchSequences+1); + } + pinnedBufferEvents[i] = CudaEvent{cudaEventDisableTiming}; CUERR; + deviceBufferEvents[i] = CudaEvent{cudaEventDisableTiming}; CUERR; + //d_overflow_positions_vec[i].resize(numSubjects); + } + }else{ + //allocate a double buffer for batch transfering + size_t memoryRequiredForBatchedProcessing = 0; + memoryRequiredForBatchedProcessing += maxBatchBytes * 2; // d_chardata_vec + memoryRequiredForBatchedProcessing += sizeof(SequenceLengthT) * maxBatchSequences * 2; //d_lengthdata_vec + memoryRequiredForBatchedProcessing += sizeof(size_t) * (maxBatchSequences+1) * 2; //d_offsetdata_vec + usedGpuMem += memoryRequiredForBatchedProcessing; + + //std::cout << "usedGpuMem " << usedGpuMem << ", gpumemlimit " << gpumemlimit << "\n"; + + //cached db is already accounted for because gpumemlimit was obtained after cached db was allocated + + // if(usedGpuMem > gpumemlimit){ + // throw std::runtime_error("Out of memory working set"); + // } + + for(int i = 0; i < numCopyBuffers; i++){ + if(needsPinnedStagingBuffers){ + h_chardata_vec[i].resize(maxBatchBytes); + h_lengthdata_vec[i].resize(maxBatchSequences); + h_offsetdata_vec[i].resize(maxBatchSequences+1); + } + d_chardata_vec[i].resize(maxBatchBytes); + d_lengthdata_vec[i].resize(maxBatchSequences); + 
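// one extra offset entry per batch: the offsets delimit the character range
// of each sequence, so a batch of maxBatchSequences sequences needs
// maxBatchSequences + 1 offsets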
d_offsetdata_vec[i].resize(maxBatchSequences+1); + pinnedBufferEvents[i] = CudaEvent{cudaEventDisableTiming}; CUERR; + deviceBufferEvents[i] = CudaEvent{cudaEventDisableTiming}; CUERR; + //d_overflow_positions_vec[i].resize(numSubjects); + } + + //count how many batches fit into d_cacheddb + + numBatchesInCachedDB = 0; + charsOfBatches = 0; + subjectsOfBatches = 0; + for(; numBatchesInCachedDB < dbBatches.size(); numBatchesInCachedDB++){ + const auto& batch = dbBatches[numBatchesInCachedDB]; + if(subjectsOfBatches + batch.usedSeq <= d_cacheddb->getNumSubjects() && charsOfBatches + batch.usedBytes <= d_cacheddb->getNumChars()){ + //ok, fits + charsOfBatches += batch.usedBytes; + subjectsOfBatches += batch.usedSeq; + }else{ + //does not fit + break; + } + } + assert(charsOfBatches <= d_cacheddb->getNumChars()); + assert(subjectsOfBatches <= d_cacheddb->getNumSubjects()); + assert(numBatchesInCachedDB < dbBatches.size()); + + + } + } + + MaxReduceArray getMaxReduceArray(size_t offset){ + return MaxReduceArray( + d_maxReduceArrayScores.data(), + d_maxReduceArrayIndices.data(), + offset, + maxReduceArraySize + ); + } + + void resetMaxReduceArray(cudaStream_t stream){ + thrust::fill(thrust::cuda::par_nosync.on(stream), + d_maxReduceArrayScores.data(), + d_maxReduceArrayScores.data() + maxReduceArraySize, + -1.f + ); + cudaMemsetAsync(d_maxReduceArrayIndices.data(), 0, sizeof(ReferenceIdT) * maxReduceArraySize, stream); CUERR; + } + + MaxReduceArrayWithEndPositions getMaxReduceArrayWithEndPositions(size_t offset){ + return MaxReduceArrayWithEndPositions( + d_maxReduceArrayScores.data(), + d_maxReduceArrayIndices.data(), + d_maxReduceArrayExtras.data(), + offset, + maxReduceArraySize + ); + } + + void resetMaxReduceArrayWithEndPositions(cudaStream_t stream){ + resetMaxReduceArray(stream); + cudaMemsetAsync(d_maxReduceArrayExtras.data(), 0, sizeof(AlignmentEndPosition) * maxReduceArraySize, stream); CUERR; + } + + void resetTopNArrays(cudaStream_t stream){ + thrust::fill(thrust::cuda::par_nosync.on(stream), + d_topN_scores.data(), + d_topN_scores.data() + d_topN_scores.size(), + -1.f + ); + cudaMemsetAsync(d_topN_refIds.data(), 0, sizeof(ReferenceIdT) * d_topN_refIds.size(), stream); CUERR; + cudaMemsetAsync(d_topN_alignmentEndPositions.data(), 0, sizeof(AlignmentEndPosition) * d_topN_alignmentEndPositions.size(), stream); CUERR; + } + + void setPartitionOffsets(const HostGpuPartitionOffsets& offsets){ + deviceGpuPartitionOffsets = DeviceGpuPartitionOffsets(offsets); + } + + size_t getNumCharsInCachedDB() const{ + return charsOfBatches; + } + + size_t getNumSequencesInCachedDB() const{ + return subjectsOfBatches; + } + + size_t getNumBatchesInCachedDB() const{ + return numBatchesInCachedDB; + } + + void setTopNSize(size_t topN){ + d_topN_scores.resize(2*topN); + d_topN_refIds.resize(2*topN); + d_topN_alignmentEndPositions.resize(2*topN); + d_topN_scores_tmp.resize(2*topN); + d_topN_refIds_tmp.resize(2*topN); + d_topN_alignmentEndPositions_tmp.resize(2*topN); + } + + + int deviceId; + int numCopyBuffers; + int numWorkStreamsWithoutTemp = 1; + int workstreamIndex; + int copyBufferIndex = 0; + int maxReduceArraySize = 512 * 1024; + size_t numTempBytes; + size_t numBatchesInCachedDB = 0; + size_t charsOfBatches = 0; + size_t subjectsOfBatches = 0; + + MyDeviceBuffer d_maxReduceArrayScores; + MyDeviceBuffer d_maxReduceArrayIndices; + MyDeviceBuffer d_maxReduceArrayExtras; + + MyDeviceBuffer d_topN_scores; + MyDeviceBuffer d_topN_refIds; + MyDeviceBuffer d_topN_scores_tmp; + MyDeviceBuffer 
d_topN_refIds_tmp; + MyDeviceBuffer d_topN_alignmentEndPositions; + MyDeviceBuffer d_topN_alignmentEndPositions_tmp; + + MyDeviceBuffer d_query; + MyDeviceBuffer d_tempStorageHE; + //MyDeviceBuffer devAlignmentScoresFloat; + // MyDeviceBuffer d_selectedPositions; + //MyDeviceBuffer d_total_overflow_number; + //MyDeviceBuffer d_overflow_number; + //MyPinnedBuffer h_overflow_number; + CudaStream hostFuncStream; + CudaStream workStreamForTempUsage; + CudaEvent forkStreamEvent; + + size_t maxNumBatchesInCachedDB = 0; + std::shared_ptr d_cacheddb; + + + std::vector> h_chardata_vec; + std::vector> h_lengthdata_vec; + std::vector> h_offsetdata_vec; + std::vector> d_chardata_vec; + std::vector> d_lengthdata_vec; + std::vector> d_offsetdata_vec; + std::vector copyStreams; + std::vector pinnedBufferEvents; + std::vector deviceBufferEvents; + std::vector workStreamsWithoutTemp; + //std::vector> d_overflow_positions_vec; + + DeviceGpuPartitionOffsets deviceGpuPartitionOffsets; + + GpuPSSM gpuFullQueryPSSM; + GpuPermutedPSSMforGapless gpuPermutedPSSMforGapless; + GpuPermutedPSSMforSW gpuPermutedPSSMforSW; + }; + + struct SequenceLengthStatistics{ + SequenceLengthT max_length = 0; + SequenceLengthT min_length = std::numeric_limits::max(); + size_t sumOfLengths = 0; + }; + private: + struct BatchDstInfo{ + bool isUploaded{}; + char* charsPtr{}; + SequenceLengthT* lengthsPtr{}; + size_t* offsetsPtr{}; + }; + + public: + + CudaSW4( + std::vector deviceIds_, + int numTop, + BlosumType blosumType, + const MemoryConfig& memoryConfig, + bool verbose_, + const KernelConfigFilenames& kernelConfigFilenames + ) : deviceIds(std::move(deviceIds_)), verbose(verbose_) + { + #ifdef CUDASW_DEBUG_CHECK_CORRECTNESS + blosumType = BlosumType::BLOSUM62_20; + #endif + if(deviceIds.size() == 0){ + throw std::runtime_error("No device selected"); + + } + RevertDeviceId rdi{}; + + initializeGpus(); + + //resultNumOverflows.resize(1); + + const int numGpus = deviceIds.size(); + cudaSetDevice(deviceIds[0]); + + //d_resultNumOverflows.resize(numGpus); + scanTimer = std::make_unique("Scan"); + totalTimer = std::make_unique("Total"); + + setBlosum(blosumType); + setNumTop(numTop); + setMemoryConfig(memoryConfig); + + initializeListOfAvailableKernelConfigs(kernelConfigFilenames); + + dbIsReady = false; + } + + CudaSW4() = delete; + CudaSW4(const CudaSW4&) = delete; + CudaSW4(CudaSW4&&) = default; + CudaSW4& operator=(const CudaSW4&) = delete; + CudaSW4& operator=(CudaSW4&&) = default; + + void setGapOpenScore(int score){ + if(verbose && score >= 0){ + std::cout << "Warning, gap open score set to non-negative value. Is this intended?\n"; + } + gop = score; + } + void setGapExtendScore(int score){ + if(verbose && score >= 0){ + std::cout << "Warning, gap extend score set to non-negative value. 
Is this intended?\n"; + } + gex = score; + } + + void setDatabase(std::shared_ptr dbPtr){ + RevertDeviceId rdi{}; + fullDB = AnyDBWrapper(dbPtr); + makeReady(); + } + + void setDatabase(std::shared_ptr dbPtr){ + RevertDeviceId rdi{}; + fullDB = AnyDBWrapper(dbPtr); + makeReady(); + } + + void setDatabase(std::shared_ptr dbPtr){ + RevertDeviceId rdi{}; + fullDB = AnyDBWrapper(dbPtr); + makeReady(); + } + + void setDatabase(std::shared_ptr dbPtr){ + RevertDeviceId rdi{}; + fullDB = AnyDBWrapper(dbPtr); + makeReady(); + } + + void setDatabase(std::shared_ptr dbPtr){ + RevertDeviceId rdi{}; + fullDB = AnyDBWrapper(dbPtr); + makeReady(); + } + + void setDatabase(std::shared_ptr dbPtr, const std::vector>& existingFullGpuDBAllocations){ + RevertDeviceId rdi{}; + fullDB = AnyDBWrapper(dbPtr); + makeReadyWithExistingFullGpuDB(existingFullGpuDBAllocations); + } + + void setDatabase(std::shared_ptr dbPtr, const std::vector>& existingFullGpuDBAllocations){ + RevertDeviceId rdi{}; + fullDB = AnyDBWrapper(dbPtr); + makeReadyWithExistingFullGpuDB(existingFullGpuDBAllocations); + } + + void setDatabase(std::shared_ptr dbPtr, const std::vector>& existingFullGpuDBAllocations){ + RevertDeviceId rdi{}; + fullDB = AnyDBWrapper(dbPtr); + makeReadyWithExistingFullGpuDB(existingFullGpuDBAllocations); + } + + void setDatabase(std::shared_ptr dbPtr, const std::vector>& existingFullGpuDBAllocations){ + RevertDeviceId rdi{}; + fullDB = AnyDBWrapper(dbPtr); + makeReadyWithExistingFullGpuDB(existingFullGpuDBAllocations); + } + + void setDatabase(std::shared_ptr dbPtr, const std::vector>& existingFullGpuDBAllocations){ + RevertDeviceId rdi{}; + fullDB = AnyDBWrapper(dbPtr); + makeReadyWithExistingFullGpuDB(existingFullGpuDBAllocations); + } + + void setBlosum(BlosumType blosumType){ + setProgramWideBlosum(blosumType, deviceIds); + } + + void setNumTop(int value){ + if(value > MaxNumberOfResults::value()){ + throw std::runtime_error("setNumTop: value too large"); + } + if(value >= 0){ + numTop = value; + updateNumResultsPerQuery(); + + cub::SwitchDevice sd(deviceIds[0]); + const int numGpus = deviceIds.size(); + + h_finalAlignmentScores.resize(results_per_query); + h_finalReferenceIds.resize(results_per_query); + h_finalEndPositions.resize(results_per_query); + d_finalAlignmentScores_allGpus.resize(results_per_query * numGpus); + d_finalReferenceIds_allGpus.resize(results_per_query * numGpus); + d_finalEndPositions_allGpus.resize(results_per_query * numGpus); + } + } + + void setMemoryConfig(const MemoryConfig& val){ + memoryConfig = val; + } + + std::vector> getFullGpuDBAllocations(){ + if(!dbIsReady) return {}; + + prefetchDBToGpus(); + + std::vector> result; + + const int numGpus = deviceIds.size(); + for(int gpu = 0; gpu < numGpus; gpu++){ + auto& ws = *workingSets[gpu]; + + result.push_back(ws.d_cacheddb); + } + + return result; + } + + std::string_view getReferenceHeader(ReferenceIdT referenceId) const{ + const auto& data = fullDB.getData(); + const char* const headerBegin = data.headers() + data.headerOffsets()[referenceId]; + const char* const headerEnd = data.headers() + data.headerOffsets()[referenceId+1]; + return std::string_view(headerBegin, std::distance(headerBegin, headerEnd)); + } + + int getReferenceLength(ReferenceIdT referenceId) const{ + const auto& data = fullDB.getData(); + return data.lengths()[referenceId]; + } + + std::string getReferenceSequence(ReferenceIdT referenceId) const{ + const auto& data = fullDB.getData(); + const char* const begin = data.chars() + data.offsets()[referenceId]; + const 
char* const end = begin + getReferenceLength(referenceId); + + std::string sequence(end - begin, '\0'); + std::transform( + begin, + end, + sequence.begin(), + InverseConvertAA_20{} + ); + + return sequence; + } + + void markCachedDBBatchesAsUploaded(int gpu){ + auto& ws = *workingSets[gpu]; + if(ws.getNumBatchesInCachedDB() > 0){ + batchPlansDstInfoVec_cachedDB[gpu][0].isUploaded = true; + for(size_t i = 0; i < ws.getNumBatchesInCachedDB(); i++){ + batchPlansDstInfoVec[gpu][i].isUploaded = true; + } + } + } + + void prefetchDBToGpus(){ + nvtx::ScopedRange sr("prefetchDBToGpus", 1); + RevertDeviceId rdi{}; + + const int numGpus = deviceIds.size(); + std::vector copyIds; + + helpers::CpuTimer copyTimer("transfer DB to GPUs"); + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); + auto& ws = *workingSets[gpu]; + + if(ws.getNumBatchesInCachedDB() > 0 && !batchPlansDstInfoVec_cachedDB[gpu][0].isUploaded){ + const auto& plan = batchPlans_cachedDB[gpu][0]; + const int currentBuffer = 0; + cudaStream_t H2DcopyStream = ws.copyStreams[currentBuffer]; + + executeCopyPlanH2DDirect( + plan, + ws.d_cacheddb->getCharData(), + ws.d_cacheddb->getLengthData(), + ws.d_cacheddb->getOffsetData(), + subPartitionsForGpus[gpu], + H2DcopyStream + ); + + copyIds.push_back(gpu); + + markCachedDBBatchesAsUploaded(gpu); + } + } + for(int gpu : copyIds){ + cudaSetDevice(deviceIds[gpu]); + cudaDeviceSynchronize(); CUERR; + } + copyTimer.stop(); + if(copyIds.size() > 0){ + if(verbose){ + std::cout << "Transferred DB data in advance to GPU(s) "; + for(auto x : copyIds){ + std::cout << x << " "; + } + std::cout << "\n"; + copyTimer.print(); + } + } + } + + template + ScanResult scan(QueryView queryView, std::optional precomputedPssmOpt){ + nvtx::ScopedRange sr("scan", 6); + if(!dbIsReady){ + throw std::runtime_error("DB not set correctly"); + } + RevertDeviceId rdi{}; + + // if(queryView.length <= getMaxSingleTileQueryLength_Gapless()){ + // return ScanResult{}; + // } + + const int masterDeviceId = deviceIds[0]; + cudaSetDevice(masterDeviceId); + + scanTimer->reset(); + scanTimer->start(); + + setQuery(queryView, precomputedPssmOpt); + + if(verbose && scanType == ScanType::GaplessPlusSW_Endpos && results_per_query == 0){ + std::cout << "Warning. 
Gapless+SW_Endpos selected, but results_per_query == 0\n"; + } + + switch(scanType){ + case ScanType::GaplessPlusSW_Endpos: + { + auto scanTypeOld = scanType; + + scanType = ScanType::Gapless; + scanDatabaseForQuery_gapless(); + + std::vector vec(h_finalReferenceIds.begin(), h_finalReferenceIds.begin() + results_per_query); + // for(auto x : vec){ + // std::cout << x << " : " << getReferenceSequence(x) << "\n"; + // } + // std::cout << "\n"; + + auto topIds = std::make_shared( + std::move(vec) + ); + setTargetSubjectIds(topIds); + + scanType = ScanType::SW_Endpos; + scanDatabaseForQuery_sw_endpos(); + setTargetSubjectIds(nullptr); + + scanType = scanTypeOld; + }; break; + case ScanType::SW_Endpos: scanDatabaseForQuery_sw_endpos(); break; + case ScanType::Gapless: //fall-through + default: scanDatabaseForQuery_gapless(); break; + } + + + scanTimer->stop(); + + totalProcessedQueryLengths += queryView.length; + //totalNumOverflows += resultNumOverflows[0]; + + const auto& sequenceLengthStatistics = getSequenceLengthStatistics(); + + ScanResult result; + size_t computedCells = 0; + if(targetSubjectIds){ + for(const auto& x : targetSubjectIds->subjectIds){ + computedCells += getReferenceLength(x); + } + computedCells *= queryView.length; + }else{ + computedCells = sequenceLengthStatistics.sumOfLengths * queryView.length; + } + result.stats = makeBenchmarkStats( + scanTimer->elapsed() / 1000, + computedCells, + 0 //resultNumOverflows[0] + ); + + #ifdef CUDASW_DEBUG_CHECK_CORRECTNESS + + if(true || queryView.length > getMaxSingleTileQueryLength_Gapless()){ + std::vector convertedQuery; + const char* query = queryView.ptr; + if constexpr(QueryView::isEncoded){ + convertedQuery.resize(queryView.length); + std::transform( + queryView.ptr, + queryView.ptr + queryView.length, + convertedQuery.data(), + InverseConvertAA_20{} + ); + query = convertedQuery.data(); + } + + std::vector cpuScores = computeAllScoresCPU_gaplessfilter_blosum62(query, queryView.length); + //std::vector cpuScoresExact = computeAllScoresCPU_exact_affine_blosum62(query, queryView.length); + size_t numToCheck = cpuScores.size(); + + //auto boundaries = getLengthPartitionBoundaries(); + + //bool checkOk = true; + int numErrors = 0; + int numGreater2048 = 0; + for(size_t i = 0; i < numToCheck; i++){ + const auto refId = h_finalReferenceIds[i]; + int gpu = h_finalAlignmentScores[i]; + const int cpu = cpuScores[refId]; + + + //if(getReferenceLength(refId) <= boundaries[boundaries.size() - 2]){ + //gpu scores for all but last length partition are computed in half precision. don't report errors caused by rounding. + if(gpu > 2048){ + gpu = cpu; + numGreater2048++; + } + //} + if(cpu != gpu){ + if(numErrors == 0){ + std::cout << "error. i " << i << ", sequence id " << refId + << ", cpu score " << cpu << ", gpu score " << gpu + //<< ", exact cpu score " << cpuScoresExact[refId] + << "."; + std::cout << "Query:\n"; + std::copy(query, query + queryView.length, std::ostream_iterator(std::cout, "")); + std::cout << "\n"; + std::cout << "db sequence:\n"; + std::cout << getReferenceSequence(refId) << "\n"; + + // if(refId > 5 && refId < cpuScores.size() - 5){ + // for(int x = -5; x < 5; x++){ + // std::cout << getReferenceHeader(refId + x) << "\n"; + // std::cout << getReferenceSequence(refId + x) << "\n"; + // } + // } + } + numErrors++; + } + } + if(numErrors == 0){ + std::cout << "Check ok, cpu and gpu produced same results. > 2048: " << numGreater2048 << " / " << numToCheck << "\n"; + }else{ + std::cout << "Check not ok!!! 
" << numErrors << " sequences produced different results\n"; + } + } + + //#endif + #else + + result.scores.insert(result.scores.end(), h_finalAlignmentScores.begin(), h_finalAlignmentScores.begin() + results_per_query); + result.referenceIds.insert(result.referenceIds.end(), h_finalReferenceIds.begin(), h_finalReferenceIds.begin() + results_per_query); + result.endPositions.insert(result.endPositions.end(), h_finalEndPositions.begin(), h_finalEndPositions.begin() + results_per_query); + + #endif + + return result; + } + + std::vector computeAllScoresCPU_exact_affine_blosum62(const char* query, SequenceLengthT queryLength){ + const auto& view = fullDB.getData(); + size_t numSequences = view.numSequences(); + std::vector result(numSequences); + + std::vector convertedQuery(queryLength); + auto convertQuery = [&](auto c){ + ConvertAA_20 charToMMseqs; + ConvertAA_20_mmseqs_to_ncbi mmseqsToNcbi; + return mmseqsToNcbi(charToMMseqs(c)); + }; + std::transform( + query, + query + queryLength, + convertedQuery.data(), + convertQuery + ); + #pragma omp parallel for + for(size_t i = 0; i < numSequences; i++){ + size_t offset = view.offsets()[i]; + int length = view.lengths()[i]; + const char* seq = view.chars() + offset; + + #if 1 + std::vector convertedSubject(length); + std::transform( + seq, + seq + length, + convertedSubject.data(), + ConvertAA_20_mmseqs_to_ncbi{} + ); + seq = convertedSubject.data(); + #endif + + int score = affine_local_DP_host_protein_blosum62_converted( + convertedQuery.data(), + seq, + queryLength, + length, + gop, + gex + ); + result[i] = score; + } + return result; + } + + std::vector computeAllScoresCPU_gaplessfilter_blosum62(const char* query, SequenceLengthT queryLength){ + const auto& view = fullDB.getData(); + size_t numSequences = view.numSequences(); + std::vector result(numSequences); + + std::vector convertedQuery(queryLength); + auto convertQuery = [&](auto c){ + ConvertAA_20 charToMMseqs; + ConvertAA_20_mmseqs_to_ncbi mmseqsToNcbi; + return mmseqsToNcbi(charToMMseqs(c)); + }; + std::transform( + query, + query + queryLength, + convertedQuery.data(), + convertQuery + ); + #pragma omp parallel for + for(size_t i = 0; i < numSequences; i++){ + size_t offset = view.offsets()[i]; + int length = view.lengths()[i]; + const char* seq = view.chars() + offset; + + #if 1 + std::vector convertedSubject(length); + std::transform( + seq, + seq + length, + convertedSubject.data(), + ConvertAA_20_mmseqs_to_ncbi{} + ); + seq = convertedSubject.data(); + #endif + + int score = GaplessFilter_host_protein_converted_blosum62( + convertedQuery.data(), + seq, + queryLength, + length + ); + result[i] = score; + } + return result; + } + + + void printDBInfo() const{ + nvtx::ScopedRange sr("printDBInfo", 0); + const size_t numSequences = fullDB.getData().numSequences(); + std::cout << numSequences << " sequences, " << fullDB.getData().numChars() << " characters\n"; + + SequenceLengthStatistics stats = getSequenceLengthStatistics(); + + std::cout << "Min length " << stats.min_length << ", max length " << stats.max_length + << ", avg length " << stats.sumOfLengths / numSequences << "\n"; + } + + void printDBLengthPartitions() const{ + auto lengthBoundaries = getLengthPartitionBoundaries(); + const int numLengthPartitions = getLengthPartitionBoundaries().size(); + + for(int i = 0; i < numLengthPartitions; i++){ + std::cout << "<= " << lengthBoundaries[i] << ": " << fullDB_numSequencesPerLengthPartition[i] << "\n"; + } + } + + void totalTimerStart(){ + RevertDeviceId rdi{}; + 
cudaSetDevice(deviceIds[0]); + totalProcessedQueryLengths = 0; + //totalNumOverflows = 0; + totalTimer->start(); + } + + BenchmarkStats totalTimerStop(){ + RevertDeviceId rdi{}; + cudaSetDevice(deviceIds[0]); + totalTimer->stop(); + + const auto& sequenceLengthStatistics = getSequenceLengthStatistics(); + BenchmarkStats stats = makeBenchmarkStats( + totalTimer->elapsed() / 1000, + totalProcessedQueryLengths * sequenceLengthStatistics.sumOfLengths, + 0 //totalNumOverflows + ); + + return stats; + } + + void setScanType(ScanType type){ + if(verbose){ + std::cout << "Set scan type to " << to_string(type) << "\n"; + } + scanType = type; + } + + void setTargetSubjectIds(std::shared_ptr ptr){ + targetSubjectIds = ptr; + + if(targetSubjectIds){ + if(dbIsReady){ + targetSubjectIds->removeOutOfBoundsTargets(fullDB.getData().numSequences()); + } + } + } + + private: + void initializeGpus(){ + const int numGpus = deviceIds.size(); + + for(int i = 0; i < numGpus; i++){ + cudaSetDevice(deviceIds[i]); CUERR + helpers::init_cuda_context(); CUERR + cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);CUERR + + cudaMemPool_t mempool; + cudaDeviceGetDefaultMemPool(&mempool, deviceIds[i]); CUERR + uint64_t threshold = UINT64_MAX; + cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);CUERR + + gpuStreams.emplace_back(); + gpuEvents.emplace_back(cudaEventDisableTiming); + } + } + + void makeReady(){ + nvtx::ScopedRange sr("makeReady", 0); + #ifdef CUDASW_DEBUG_CHECK_CORRECTNESS + const auto& dbData = fullDB.getData(); + size_t numDBSequences = dbData.numSequences(); + if(numDBSequences > size_t(std::numeric_limits::max())) + throw std::runtime_error("cannot check correctness for this db size"); + + maxReduceArraySize = numDBSequences; + results_per_query = maxReduceArraySize; + setNumTopNoCheck(maxReduceArraySize); + #endif + + dbSequenceLengthStatistics = nullptr; + + computeTotalNumSequencePerLengthPartition(); + partitionDBAmongstGpus(); + + createDBBatchesForGpus(); + allocateGpuWorkingSets(); + assignBatchesToGpuMem(); + + + dbIsReady = true; + updateNumResultsPerQuery(); + + if(targetSubjectIds){ + targetSubjectIds->removeOutOfBoundsTargets(fullDB.getData().numSequences()); + } + } + + void makeReadyWithExistingFullGpuDB(const std::vector>& existingFullGpuDBAllocations){ + nvtx::ScopedRange sr("makeReadyWithExistingFullGpuDB", 0); + #ifdef CUDASW_DEBUG_CHECK_CORRECTNESS + const auto& dbData = fullDB.getData(); + size_t numDBSequences = dbData.numSequences(); + if(numDBSequences > size_t(std::numeric_limits::max())) + throw std::runtime_error("cannot check correctness for this db size"); + + maxReduceArraySize = numDBSequences; + results_per_query = maxReduceArraySize; + setNumTopNoCheck(maxReduceArraySize); + #endif + + dbSequenceLengthStatistics = nullptr; + + computeTotalNumSequencePerLengthPartition(); + partitionDBAmongstGpus(); + + createDBBatchesForGpus(); + allocateGpuWorkingSetsWithExistingFullGpuDB(existingFullGpuDBAllocations); + assignBatchesToGpuMem(); + + const int numGpus = deviceIds.size(); + for(int gpu = 0; gpu < numGpus; gpu++){ + markCachedDBBatchesAsUploaded(gpu); + } + + + dbIsReady = true; + updateNumResultsPerQuery(); + + if(targetSubjectIds){ + targetSubjectIds->removeOutOfBoundsTargets(fullDB.getData().numSequences()); + } + + + // const auto& data = fullDB.getData(); + + // int pageableMemoryAccessUsesHostPageTables = 0; + // int readOnlyHostRegisterSupported = 0; + // cudaDeviceGetAttribute(&pageableMemoryAccessUsesHostPageTables, 
cudaDevAttrPageableMemoryAccessUsesHostPageTables, 0); CUERR; + // std::cout << "pageableMemoryAccessUsesHostPageTables " << pageableMemoryAccessUsesHostPageTables << "\n"; + // //cudaDeviceGetAttribute(&readOnlyHostRegisterSupported, cudaDeviceAttrReadOnlyHostRegisterSupported, 0); CUERR; + // // std::cout << "readOnlyHostRegisterSupported " << readOnlyHostRegisterSupported << "\n"; + + // cudaDeviceProp prop; + // cudaGetDeviceProperties(&prop, 0); CUERR; + + // std::cout << "prop.pageableMemoryAccess " << prop.pageableMemoryAccess << "\n"; + // std::cout << "prop.pageableMemoryAccessUsesHostPageTables " << prop.pageableMemoryAccessUsesHostPageTables << "\n"; + // std::cout << "prop.hostRegisterReadOnlySupported " << prop.hostRegisterReadOnlySupported << "\n"; + // std::cout << "prop.hostRegisterSupported " << prop.hostRegisterSupported << "\n"; + + + // cudaHostRegister((void*)data.chars(), sizeof(char) * data.numChars(), cudaHostRegisterDefault); CUERR; + // cudaHostRegister((void*)data.lengths(), sizeof(SequenceLengthT) * data.numSequences(), cudaHostRegisterDefault); CUERR; + // cudaHostRegister((void*)data.offsets(), sizeof(size_t) * data.numSequences(), cudaHostRegisterDefault); CUERR; + } + + void computeTotalNumSequencePerLengthPartition(){ + nvtx::ScopedRange sr("computeTotalNumSequencePerLengthPartition", 1); + auto lengthBoundaries = getLengthPartitionBoundaries(); + const int numLengthPartitions = getLengthPartitionBoundaries().size(); + + fullDB_numSequencesPerLengthPartition.resize(numLengthPartitions); + + const auto& dbData = fullDB.getData(); + auto partitionBegin = dbData.lengths(); + for(int i = 0; i < numLengthPartitions; i++){ + //length k is in partition i if boundaries[i-1] < k <= boundaries[i] + SequenceLengthT searchFor = lengthBoundaries[i]; + if(searchFor < std::numeric_limits::max()){ + searchFor += 1; + } + auto partitionEnd = std::lower_bound( + partitionBegin, + dbData.lengths() + dbData.numSequences(), + searchFor + ); + fullDB_numSequencesPerLengthPartition[i] = std::distance(partitionBegin, partitionEnd); + partitionBegin = partitionEnd; + } + } + + void partitionDBAmongstGpus(){ + nvtx::ScopedRange sr("partitionDBAmongstGpus", 2); + const int numGpus = deviceIds.size(); + const int numLengthPartitions = getLengthPartitionBoundaries().size(); + + numSequencesPerLengthPartitionPrefixSum.clear(); + dbPartitionsByLengthPartitioning.clear(); + subPartitionsForGpus.clear(); + lengthPartitionIdsForGpus.clear(); + numSequencesPerGpu.clear(); + numSequencesPerGpuPrefixSum.clear(); + + const auto& data = fullDB.getData(); + + subPartitionsForGpus.resize(numGpus); + lengthPartitionIdsForGpus.resize(numGpus); + numSequencesPerGpu.resize(numGpus, 0); + numSequencesPerGpuPrefixSum.resize(numGpus, 0); + + numSequencesPerLengthPartitionPrefixSum.resize(numLengthPartitions, 0); + for(int i = 0; i < numLengthPartitions-1; i++){ + numSequencesPerLengthPartitionPrefixSum[i+1] = numSequencesPerLengthPartitionPrefixSum[i] + fullDB_numSequencesPerLengthPartition[i]; + } + + for(int i = 0; i < numLengthPartitions; i++){ + size_t begin = numSequencesPerLengthPartitionPrefixSum[i]; + size_t end = begin + fullDB_numSequencesPerLengthPartition[i]; + dbPartitionsByLengthPartitioning.emplace_back(data, begin, end); + } + + for(int lengthPartitionId = 0; lengthPartitionId < numLengthPartitions; lengthPartitionId++){ + const auto& lengthPartition = dbPartitionsByLengthPartitioning[lengthPartitionId]; + const auto partitionedByGpu = partitionDBdata_by_numberOfChars(lengthPartition, 
lengthPartition.numChars() / numGpus); + + assert(int(partitionedByGpu.size()) <= numGpus); + for(int gpu = 0; gpu < numGpus; gpu++){ + if(gpu < int(partitionedByGpu.size())){ + subPartitionsForGpus[gpu].push_back(partitionedByGpu[gpu]); + lengthPartitionIdsForGpus[gpu].push_back(lengthPartitionId); + }else{ + //add empty partition + subPartitionsForGpus[gpu].push_back(DBdataView(data, 0, 0)); + lengthPartitionIdsForGpus[gpu].push_back(0); + } + } + } + + for(int i = 0; i < numGpus; i++){ + for(const auto& p : subPartitionsForGpus[i]){ + numSequencesPerGpu[i] += p.numSequences(); + } + } + for(int i = 0; i < numGpus-1; i++){ + numSequencesPerGpuPrefixSum[i+1] = numSequencesPerGpuPrefixSum[i] + numSequencesPerGpu[i]; + } + + numSequencesPerGpu_total.resize(numGpus); + numSequencesPerGpuPrefixSum_total.resize(numGpus); + numSequencesPerGpuPrefixSum_total[0] = 0; + + + for(int i = 0; i < numGpus; i++){ + size_t num = numSequencesPerGpu[i]; + numSequencesPerGpu_total[i] = num; + if(i < numGpus - 1){ + numSequencesPerGpuPrefixSum_total[i+1] = numSequencesPerGpuPrefixSum_total[i] + num; + } + } + + std::vector sequencesInPartitions(numGpus * numLengthPartitions); + for(int gpu = 0; gpu < numGpus; gpu++){ + assert(subPartitionsForGpus[gpu].size() == numLengthPartitions); + for(int i = 0; i < numLengthPartitions; i++){ + sequencesInPartitions[gpu * numLengthPartitions + i] = subPartitionsForGpus[gpu][i].numSequences(); + } + } + hostGpuPartitionOffsets = HostGpuPartitionOffsets(numGpus, numLengthPartitions, std::move(sequencesInPartitions)); + } + + void allocateGpuWorkingSets(){ + nvtx::ScopedRange sr("allocateGpuWorkingSets", 3); + const int numGpus = deviceIds.size(); + workingSets.clear(); + workingSets.resize(numGpus); + + if(verbose){ + std::cout << "Allocate Memory: \n"; + } + //nvtx::push_range("ALLOC_MEM", 0); + helpers::CpuTimer allocTimer("ALLOC_MEM"); + + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); + + size_t freeMem, totalMem; + cudaMemGetInfo(&freeMem, &totalMem); + constexpr size_t safety = 256*1024*1024; + size_t memlimit = std::min(freeMem, memoryConfig.maxGpuMem); + if(memlimit > safety){ + memlimit -= safety; + } + + if(verbose){ + std::cout << "gpu " << gpu << " may use " << memlimit << " bytes. "; + } + + const bool needsPinnedStagingBuffers = numGpus > 1; + + workingSets[gpu] = std::make_unique( + memlimit, + memoryConfig.maxBatchBytes, + memoryConfig.maxBatchSequences, + memoryConfig.maxTempBytes, + subPartitionsForGpus[gpu], + batchPlans[gpu], + needsPinnedStagingBuffers, + maxReduceArraySize + ); + + if(verbose){ + std::cout << "Using " << workingSets[gpu]->numTempBytes << " temp bytes. 
"; + } + if(verbose){ + std::cout << workingSets[gpu]->getNumBatchesInCachedDB() << " out of " << batchPlans[gpu].size() << " DB batches will be cached in gpu memory\n"; + } + + //set gpu partition table + workingSets[gpu]->setPartitionOffsets(hostGpuPartitionOffsets); + + const bool usesCallbackThread = numGpus > 1; + + if(usesCallbackThread){ + //spin up the host callback thread + auto noop = [](void*){}; + cudaLaunchHostFunc( + gpuStreams[gpu], + noop, + nullptr + ); CUERR + } + + workingSets[gpu]->setTopNSize(results_per_query); + } + + if(verbose){ + allocTimer.print(); + } + } + + void allocateGpuWorkingSetsWithExistingFullGpuDB(const std::vector>& existingFullGpuDBAllocations){ + nvtx::ScopedRange sr("allocateGpuWorkingSetsWithExistingFullGpuDB", 3); + const int numGpus = deviceIds.size(); + workingSets.clear(); + workingSets.resize(numGpus); + + if(verbose){ + std::cout << "Allocate Memory: \n"; + } + //nvtx::push_range("ALLOC_MEM", 0); + helpers::CpuTimer allocTimer("ALLOC_MEM"); + + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); + + size_t freeMem, totalMem; + cudaMemGetInfo(&freeMem, &totalMem); + constexpr size_t safety = 256*1024*1024; + size_t memlimit = std::min(freeMem, memoryConfig.maxGpuMem); + if(memlimit > safety){ + memlimit -= safety; + } + + if(verbose){ + std::cout << "gpu " << gpu << " may use " << memlimit << " bytes. "; + } + + const bool needsPinnedStagingBuffers = numGpus > 1; + + workingSets[gpu] = std::make_unique( + memlimit, + memoryConfig.maxBatchBytes, + memoryConfig.maxBatchSequences, + memoryConfig.maxTempBytes, + subPartitionsForGpus[gpu], + batchPlans[gpu], + existingFullGpuDBAllocations[gpu], + needsPinnedStagingBuffers, + maxReduceArraySize + ); + + if(verbose){ + std::cout << "Using " << workingSets[gpu]->numTempBytes << " temp bytes. 
"; + } + if(verbose){ + std::cout << workingSets[gpu]->getNumBatchesInCachedDB() << " out of " << batchPlans[gpu].size() << " DB batches will be cached in gpu memory\n"; + } + + //set gpu partition table + workingSets[gpu]->setPartitionOffsets(hostGpuPartitionOffsets); + + const bool usesCallbackThread = numGpus > 1; + + if(usesCallbackThread){ + //spin up the host callback thread + auto noop = [](void*){}; + cudaLaunchHostFunc( + gpuStreams[gpu], + noop, + nullptr + ); CUERR + } + + workingSets[gpu]->setTopNSize(results_per_query); + } + + if(verbose){ + allocTimer.print(); + } + } + + + + void createDBBatchesForGpus(){ + nvtx::ScopedRange sr("createDBBatchesForGpus", 4); + const int numGpus = deviceIds.size(); + + batchPlans.clear(); + batchPlans.resize(numGpus); + batchPlans_cachedDB.clear(); + batchPlans_cachedDB.resize(numGpus); + + for(int gpu = 0; gpu < numGpus; gpu++){ + batchPlans[gpu] = computeDbCopyPlan( + subPartitionsForGpus[gpu], + lengthPartitionIdsForGpus[gpu], + memoryConfig.maxBatchBytes, + memoryConfig.maxBatchSequences + ); + if(verbose){ + std::cout << "Batch plan gpu " << gpu << ": " << batchPlans[gpu].size() << " batches\n"; + } + } + } + + void assignBatchesToGpuMem(){ + nvtx::ScopedRange sr("createDBBatchesForGpus", 5); + const int numGpus = deviceIds.size(); + batchPlansDstInfoVec.clear(); + batchPlansDstInfoVec.resize(numGpus); + batchPlansDstInfoVec_cachedDB.clear(); + batchPlansDstInfoVec_cachedDB.resize(numGpus); + + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); + auto& ws = *workingSets[gpu]; + if(ws.getNumBatchesInCachedDB() > 0){ + //can cache full db in gpu mem + + auto plansForCachedDB = computeDbCopyPlan( + subPartitionsForGpus[gpu], + lengthPartitionIdsForGpus[gpu], + sizeof(char) * ws.getNumCharsInCachedDB(), + ws.getNumSequencesInCachedDB() + ); + assert(plansForCachedDB.size() >= 1); + plansForCachedDB.erase(plansForCachedDB.begin() + 1, plansForCachedDB.end()); + batchPlans_cachedDB[gpu] = plansForCachedDB; + // if(verbose){ + // std::cout << "Cached db single batch plan " << plansForCachedDB[0] << "\n"; + // } + + BatchDstInfo dstInfo; + dstInfo.isUploaded = false; + dstInfo.charsPtr = ws.d_cacheddb->getCharData(); + dstInfo.lengthsPtr = ws.d_cacheddb->getLengthData(); + dstInfo.offsetsPtr = ws.d_cacheddb->getOffsetData(); + batchPlansDstInfoVec_cachedDB[gpu].push_back(dstInfo); + } + + { + BatchDstInfo dstInfo; + dstInfo.isUploaded = false; + dstInfo.charsPtr = ws.d_cacheddb->getCharData(); + dstInfo.lengthsPtr = ws.d_cacheddb->getLengthData(); + dstInfo.offsetsPtr = ws.d_cacheddb->getOffsetData(); + + for(size_t i = 0; i < ws.getNumBatchesInCachedDB(); i++){ + batchPlansDstInfoVec[gpu].push_back(dstInfo); + const auto& plan = batchPlans[gpu][i]; + dstInfo.charsPtr += plan.usedBytes; + dstInfo.lengthsPtr += plan.usedSeq; + dstInfo.offsetsPtr += plan.usedSeq; + } + + for(size_t i = ws.getNumBatchesInCachedDB(), buf = 0; i < batchPlans[gpu].size(); i++, buf = (buf+1)%ws.numCopyBuffers){ + dstInfo.charsPtr = ws.d_chardata_vec[buf].data(); + dstInfo.lengthsPtr = ws.d_lengthdata_vec[buf].data(); + dstInfo.offsetsPtr = ws.d_offsetdata_vec[buf].data(); + batchPlansDstInfoVec[gpu].push_back(dstInfo); + } + } + } + } + + void printDBDataView(const DBdataView& view) const{ + std::cout << "Sequences: " << view.numSequences() << "\n"; + std::cout << "Chars: " << view.offsets()[0] << " - " << view.offsets()[view.numSequences()] << " (" << (view.offsets()[view.numSequences()] - view.offsets()[0]) << ")" + << " " << view.numChars() << 
"\n"; + } + + void printDBDataViews(const std::vector& views) const { + size_t numViews = views.size(); + for(size_t p = 0; p < numViews; p++){ + const DBdataView& view = views[p]; + + std::cout << "View " << p << "\n"; + printDBDataView(view); + } + } + + SequenceLengthStatistics getSequenceLengthStatistics() const{ + if(dbSequenceLengthStatistics == nullptr){ + dbSequenceLengthStatistics = std::make_unique(); + const auto& data = fullDB.getData(); + size_t numSeq = data.numSequences(); + + for (size_t i=0; i < numSeq; i++) { + if (data.lengths()[i] > dbSequenceLengthStatistics->max_length) dbSequenceLengthStatistics->max_length = data.lengths()[i]; + if (data.lengths()[i] < dbSequenceLengthStatistics->min_length) dbSequenceLengthStatistics->min_length = data.lengths()[i]; + dbSequenceLengthStatistics->sumOfLengths += data.lengths()[i]; + } + } + return *dbSequenceLengthStatistics; + } + + std::vector computeDbCopyPlan( + const std::vector& dbPartitions, + const std::vector& lengthPartitionIds, + size_t MAX_CHARDATA_BYTES, + size_t MAX_SEQ + ) const { + std::vector result; + + size_t currentCopyPartition = 0; + size_t currentCopySeqInPartition = 0; + + //size_t processedSequences = 0; + while(currentCopyPartition < dbPartitions.size()){ + + size_t usedBytes = 0; + size_t usedSeq = 0; + + DeviceBatchCopyToPinnedPlan plan; + + while(currentCopyPartition < dbPartitions.size()){ + if(dbPartitions[currentCopyPartition].numSequences() == 0){ + currentCopyPartition++; + continue; + } + + //figure out how many sequences to copy to pinned + size_t remainingBytes = MAX_CHARDATA_BYTES - usedBytes; + + auto dboffsetsBegin = dbPartitions[currentCopyPartition].offsets() + currentCopySeqInPartition; + auto dboffsetsEnd = dbPartitions[currentCopyPartition].offsets() + dbPartitions[currentCopyPartition].numSequences() + 1; + + auto searchFor = dbPartitions[currentCopyPartition].offsets()[currentCopySeqInPartition] + remainingBytes + 1; // +1 because remainingBytes is inclusive + auto it = std::lower_bound( + dboffsetsBegin, + dboffsetsEnd, + searchFor + ); + + size_t numToCopyByBytes = 0; + if(it != dboffsetsBegin){ + numToCopyByBytes = std::distance(dboffsetsBegin, it) - 1; + } + if(numToCopyByBytes == 0 && currentCopySeqInPartition == 0){ + std::cout << "Warning. copy buffer size too small. 
skipped a db portion\n"; + break; + } + + size_t remainingSeq = MAX_SEQ - usedSeq; + size_t numToCopyBySeq = std::min(dbPartitions[currentCopyPartition].numSequences() - currentCopySeqInPartition, remainingSeq); + size_t numToCopy = std::min(numToCopyByBytes,numToCopyBySeq); + + if(numToCopy > 0){ + DeviceBatchCopyToPinnedPlan::CopyRange copyRange; + copyRange.lengthPartitionId = lengthPartitionIds[currentCopyPartition]; + copyRange.currentCopyPartition = currentCopyPartition; + copyRange.currentCopySeqInPartition = currentCopySeqInPartition; + copyRange.numToCopy = numToCopy; + plan.copyRanges.push_back(copyRange); + + if(usedSeq == 0){ + plan.h_partitionIds.push_back(lengthPartitionIds[currentCopyPartition]); + plan.h_numPerPartition.push_back(numToCopy); + }else{ + //if is same length partition as previous copy + if(plan.h_partitionIds.back() == lengthPartitionIds[currentCopyPartition]){ + plan.h_numPerPartition.back() += numToCopy; + }else{ + //new length partition + plan.h_partitionIds.push_back(lengthPartitionIds[currentCopyPartition]); + plan.h_numPerPartition.push_back(numToCopy); + } + } + usedBytes += (dbPartitions[currentCopyPartition].offsets()[currentCopySeqInPartition+numToCopy] + - dbPartitions[currentCopyPartition].offsets()[currentCopySeqInPartition]); + usedSeq += numToCopy; + + currentCopySeqInPartition += numToCopy; + if(currentCopySeqInPartition == dbPartitions[currentCopyPartition].numSequences()){ + currentCopySeqInPartition = 0; + currentCopyPartition++; + } + }else{ + break; + } + } + + plan.usedBytes = usedBytes; + plan.usedSeq = usedSeq; + + if(usedSeq == 0 && currentCopyPartition < dbPartitions.size() && dbPartitions[currentCopyPartition].numSequences() > 0){ + std::cout << "Warning. copy buffer size too small. skipped a db portion. 
stop\n"; + break; + } + + if(plan.usedSeq > 0){ + result.push_back(plan); + } + } + + return result; + } + + template + void setQuery(QueryView queryView, std::optional precomputedPssmOpt){ + nvtx::ScopedRange sr("setQuery", 0); + + const int queryLength = queryView.length; + // const char* query = queryView.ptr; + + if(queryLength > MaxSequenceLength::value()){ + std::string msg = "Query length is " + std::to_string(queryLength) + + ", but config allows only lengths <= " + std::to_string(MaxSequenceLength::value()); + throw std::runtime_error(msg); + } + + currentQueryLength = queryLength; + //pad query to multiple of 4 for char4 access + //add sizeof(char4) * warpsize for unguarded accesses outside of the DP matrix + currentQueryLengthWithPadding = SDIV(queryLength, 4) * 4 + sizeof(char4) * 32; + + PSSM hostFullQueryPSSM = [&](){ + if(precomputedPssmOpt.has_value()){ + const int8_t* precomputedPssm = precomputedPssmOpt.value(); + if(precomputedPssm == nullptr) throw std::runtime_error("setQuery pssm is nullptr"); + return PSSM::fromPSSM(queryView.ptr, queryView.length, precomputedPssm, 21); + }else{ + if constexpr(QueryView::isEncoded){ + return PSSM::fromBlosum(blosumType, queryView.ptr, queryView.length); + }else{ + std::vector currentQueryEncodedHost(queryView.length); + std::transform( + queryView.ptr, + queryView.ptr + queryView.length, + currentQueryEncodedHost.begin(), + ConvertAA_20{} + ); + return PSSM::fromBlosum(blosumType, currentQueryEncodedHost.data(), currentQueryEncodedHost.size()); + } + } + }(); + + // std::cout << "hostFullQueryPSSM\n"; + // for(int r = 0; r < 21; r++){ + // for(int l = 0; l < queryLength; l++){ + // std::cout << hostFullQueryPSSM[r][l] << " "; + // } + // std::cout << "\n"; + // } + + const int numGpus = deviceIds.size(); + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + auto& ws = *workingSets[gpu]; + ws.d_query.resize(currentQueryLengthWithPadding); + cudaMemsetAsync(ws.d_query.data() + currentQueryLength, 20, currentQueryLengthWithPadding - currentQueryLength, gpuStreams[gpu]); + cudaMemcpyAsync(ws.d_query.data(), queryView.ptr, currentQueryLength, cudaMemcpyDefault, gpuStreams[gpu]); CUERR + + if constexpr(!QueryView::isEncoded){ + thrust::transform( + thrust::cuda::par_nosync.on(gpuStreams[gpu]), + ws.d_query.data(), + ws.d_query.data() + currentQueryLength, + ws.d_query.data(), + ConvertAA_20{} + ); + } + + // std::vector tmpvec(currentQueryLength); + // cudaMemcpy(tmpvec.data(), ws.d_query.data(), sizeof(char) * currentQueryLength, cudaMemcpyDeviceToHost); + // std::transform( + // tmpvec.data(), + // tmpvec.data() + queryView.length, + // tmpvec.data(), + // ConvertAA_20_mmseqs_to_ncbi{} + // ); + // std::cout << "ws.d_query: "; + // for(auto x : tmpvec){ + // std::cout << int(x) << " "; + // } + // std::cout << "\n"; + + ws.gpuFullQueryPSSM.upload(hostFullQueryPSSM, gpuStreams[gpu]); + + auto makeGaplessPSSM = [&](){ + if(currentQueryLength <= getMaxSingleTileQueryLength_Gapless()){ + auto config = getSingleTileGroupRegConfigForPSSM_Gapless(currentQueryLength); + if(verbose){ + std::cout << "Query length " << currentQueryLength << ". Set up PSSM for single-tile processing. 
" + << "Tilesize " << (config.groupsize * config.numRegs * 2) << " = " << config.groupsize << " * " << config.numRegs << " * 2" + ", dpx: " << config.dpx << ", approach: " << to_string(config.approach) << "\n"; + } + constexpr int accessSize = 16; //kernel uses float4 for pssm access + if(!config.dpx){ + ws.gpuPermutedPSSMforGapless.fromGpuPSSMView(ws.gpuFullQueryPSSM.makeView(), config.groupsize, config.numRegs, gpuStreams[gpu]); + }else{ + ws.gpuPermutedPSSMforGapless.fromGpuPSSMView(ws.gpuFullQueryPSSM.makeView(), config.groupsize, config.numRegs, gpuStreams[gpu]); + } + }else{ + auto config = getMultiTileGroupRegConfigForPSSM_Gapless(currentQueryLength); + if(verbose){ + std::cout << "Query length " << currentQueryLength << ". Set up PSSM for multi-tile processing. " + << "Tilesize " << (config.groupsize * config.numRegs * 2) << " = " << config.groupsize << " * " << config.numRegs << " * 2" + ", dpx: " << config.dpx << ", approach: " << to_string(config.approach) << "\n"; + } + constexpr int accessSize = 16; //kernel uses float4 for pssm access + if(!config.dpx){ + ws.gpuPermutedPSSMforGapless.fromGpuPSSMView(ws.gpuFullQueryPSSM.makeView(), config.groupsize, config.numRegs, gpuStreams[gpu]); + }else{ + ws.gpuPermutedPSSMforGapless.fromGpuPSSMView(ws.gpuFullQueryPSSM.makeView(), config.groupsize, config.numRegs, gpuStreams[gpu]); + } + } + }; + + auto makeSWPSSM = [&](){ + if(currentQueryLength <= getMaxSingleTileQueryLength_SW()){ + auto config = getSingleTileGroupRegConfigForPSSM_SW(currentQueryLength); + if(verbose){ + std::cout << "Query length " << currentQueryLength << ". Set up PSSM for single-tile processing. " + << "Tilesize " << (config.groupsize * config.numRegs) << " = " << config.groupsize << " * " << config.numRegs + << ", dpx: " << config.dpx << ", approach: " << to_string(config.approach) << "\n"; + } + constexpr int accessSize = 16; //kernel uses float4 for pssm access + if(!config.dpx){ + ws.gpuPermutedPSSMforSW.fromGpuPSSMView(ws.gpuFullQueryPSSM.makeView(), config.groupsize, config.numRegs, gpuStreams[gpu]); + }else{ + ws.gpuPermutedPSSMforSW.fromGpuPSSMView(ws.gpuFullQueryPSSM.makeView(), config.groupsize, config.numRegs, gpuStreams[gpu]); + } + }else{ + auto config = getMultiTileGroupRegConfigForPSSM_SW(currentQueryLength); + if(verbose){ + std::cout << "Query length " << currentQueryLength << ". Set up PSSM for single-tile processing. " + << "Tilesize " << (config.groupsize * config.numRegs) << " = " << config.groupsize << " * " << config.numRegs + << ", dpx: " << config.dpx << ", approach: " << to_string(config.approach) << "\n"; + } + constexpr int accessSize = 16; //kernel uses float4 for pssm access + if(!config.dpx){ + ws.gpuPermutedPSSMforSW.fromGpuPSSMView(ws.gpuFullQueryPSSM.makeView(), config.groupsize, config.numRegs, gpuStreams[gpu]); + }else{ + ws.gpuPermutedPSSMforSW.fromGpuPSSMView(ws.gpuFullQueryPSSM.makeView(), config.groupsize, config.numRegs, gpuStreams[gpu]); + } + } + }; + + if(scanType == ScanType::Gapless){ + makeGaplessPSSM(); + }else if(scanType == ScanType::SW_Endpos){ + makeSWPSSM(); + }else if(scanType == ScanType::GaplessPlusSW_Endpos){ + makeGaplessPSSM(); + makeSWPSSM(); + } + + //THIS cudaMemcpyToSymbolAsync IS ONLY REQUIRED FOR THE NON-PSSM KERNELS + + //TODO leave query in gmem, dont use cmem ??? 
+ // cudaMemcpyToSymbolAsync(constantQuery4, ws.d_query.data(), currentQueryLength, 0, cudaMemcpyDeviceToDevice, gpuStreams[gpu]); CUERR + + } + + + // for(int subjectLetter = 0; subjectLetter < hostFullQueryPSSM.alphabetSize; subjectLetter++){ + // for(int queryLetter = 0; queryLetter < hostFullQueryPSSM.queryLength; queryLetter++){ + // std::cout << hostFullQueryPSSM[subjectLetter][queryLetter] << " "; + // } + // std::cout << "\n"; + // } + // std::cout << "\n"; + // std::exit(0); + } + + void scanDatabaseForQuery_gapless(){ + nvtx::ScopedRange sr("scanDatabaseForQuery_gapless", 0); + const int numGpus = deviceIds.size(); + const int masterDeviceId = deviceIds[0]; + const auto& masterStream1 = gpuStreams[0]; + auto& masterevent1 = gpuEvents[0]; + + cudaSetDevice(masterDeviceId); + // scanTimer->reset(); + // scanTimer->start(); + + thrust::fill( + thrust::cuda::par_nosync.on(masterStream1), + d_finalAlignmentScores_allGpus.begin(), + d_finalAlignmentScores_allGpus.end(), + 0 + ); + + cudaSetDevice(masterDeviceId); + + cudaEventRecord(masterevent1, masterStream1); CUERR; + + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + cudaStreamWaitEvent(gpuStreams[gpu], masterevent1, 0); CUERR; + } + + if(!targetSubjectIds){ + processQueryOnGpus(); + }else{ + processQueryOnGpusWithTargetSubjectIds(); + } + + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + auto& ws = *workingSets[gpu]; + + if(numGpus > 1){ + //transform per gpu local sequence indices into global sequence indices + if(results_per_query > 0){ + transformLocalSequenceIndicesToGlobalIndices<<>>( + gpu, + results_per_query, + ws.deviceGpuPartitionOffsets.getDeviceView(), + ws.d_topN_refIds.data() + ); CUERR; + } + } + + cudaMemcpyAsync( + d_finalAlignmentScores_allGpus.data() + results_per_query*gpu, + ws.d_topN_scores.data(), + sizeof(float) * results_per_query, + cudaMemcpyDeviceToDevice, + gpuStreams[gpu] + ); CUERR; + cudaMemcpyAsync( + d_finalReferenceIds_allGpus.data() + results_per_query*gpu, + ws.d_topN_refIds.data(), + sizeof(ReferenceIdT) * results_per_query, + cudaMemcpyDeviceToDevice, + gpuStreams[gpu] + ); CUERR; + // cudaMemcpyAsync( + // d_resultNumOverflows.data() + gpu, + // ws.d_total_overflow_number.data(), + // sizeof(int), + // cudaMemcpyDeviceToDevice, + // gpuStreams[gpu] + // ); CUERR; + + cudaEventRecord(ws.forkStreamEvent, gpuStreams[gpu]); CUERR; + + cudaSetDevice(masterDeviceId); + cudaStreamWaitEvent(masterStream1, ws.forkStreamEvent, 0); CUERR; + } + + cudaSetDevice(masterDeviceId); + + if(numGpus > 1){ + //sort per-gpu top results to find overall top results + auto sortInput = thrust::make_zip_iterator( + d_finalAlignmentScores_allGpus.begin(), + d_finalReferenceIds_allGpus.begin() + ); + thrust::sort( + thrust::cuda::par_nosync(thrust_async_allocator(masterStream1)).on(masterStream1), + sortInput, + sortInput + results_per_query * numGpus, + CompareScoresDescendingRefIdsAscending{} + ); + + // thrust::sort_by_key( + // thrust::cuda::par_nosync(thrust_async_allocator(masterStream1)).on(masterStream1), + // d_finalAlignmentScores_allGpus.begin(), + // d_finalAlignmentScores_allGpus.begin() + results_per_query * numGpus, + // d_finalReferenceIds_allGpus.begin(), + // thrust::greater() + // ); + + + //sum the overflows per gpu + //sumNumOverflowsKernel<<<1,1,0,masterStream1>>>(d_resultNumOverflows.data(), d_resultNumOverflows.data(), numGpus); CUERR; + } + + cudaMemcpyAsync( + h_finalAlignmentScores.data(), + 
d_finalAlignmentScores_allGpus.data(), + sizeof(float) * results_per_query, + cudaMemcpyDeviceToHost, + masterStream1 + ); CUERR + cudaMemcpyAsync( + h_finalReferenceIds.data(), + d_finalReferenceIds_allGpus.data(), + sizeof(ReferenceIdT) * results_per_query, + cudaMemcpyDeviceToHost, + masterStream1 + ); CUERR + // cudaMemcpyAsync( + // resultNumOverflows.data(), + // d_resultNumOverflows.data(), + // sizeof(int), + // cudaMemcpyDeviceToHost, + // masterStream1 + // ); CUERR + + cudaStreamSynchronize(masterStream1); CUERR; + + if(targetSubjectIds){ + //h_finalReferenceIds will contain numbers from 0 to num target subject ids. convert to proper target subject ids + for(int i = 0; i < results_per_query; i++){ + h_finalReferenceIds[i] = targetSubjectIds->subjectIds[h_finalReferenceIds[i]]; + } + } + } + + + void scanDatabaseForQuery_sw_endpos(){ + nvtx::ScopedRange sr("scanDatabaseForQuery_sw_endpos", 0); + const int numGpus = deviceIds.size(); + const int masterDeviceId = deviceIds[0]; + const auto& masterStream1 = gpuStreams[0]; + auto& masterevent1 = gpuEvents[0]; + + cudaSetDevice(masterDeviceId); + // scanTimer->reset(); + // scanTimer->start(); + + thrust::fill( + thrust::cuda::par_nosync.on(masterStream1), + d_finalAlignmentScores_allGpus.begin(), + d_finalAlignmentScores_allGpus.end(), + 0 + ); + + cudaSetDevice(masterDeviceId); + + cudaEventRecord(masterevent1, masterStream1); CUERR; + + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + cudaStreamWaitEvent(gpuStreams[gpu], masterevent1, 0); CUERR; + } + + if(!targetSubjectIds){ + processQueryOnGpus(); + }else{ + processQueryOnGpusWithTargetSubjectIds(); + } + + if(!targetSubjectIds){ + + + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + auto& ws = *workingSets[gpu]; + + if(numGpus > 1){ + //transform per gpu local sequence indices into global sequence indices + if(results_per_query > 0){ + transformLocalSequenceIndicesToGlobalIndices<<>>( + gpu, + results_per_query, + ws.deviceGpuPartitionOffsets.getDeviceView(), + ws.d_topN_refIds.data() + ); CUERR; + } + } + + cudaMemcpyAsync( + d_finalAlignmentScores_allGpus.data() + results_per_query*gpu, + ws.d_topN_scores.data(), + sizeof(float) * results_per_query, + cudaMemcpyDeviceToDevice, + gpuStreams[gpu] + ); CUERR; + cudaMemcpyAsync( + d_finalReferenceIds_allGpus.data() + results_per_query*gpu, + ws.d_topN_refIds.data(), + sizeof(ReferenceIdT) * results_per_query, + cudaMemcpyDeviceToDevice, + gpuStreams[gpu] + ); CUERR; + cudaMemcpyAsync( + d_finalEndPositions_allGpus.data() + results_per_query*gpu, + ws.d_topN_alignmentEndPositions.data(), + sizeof(AlignmentEndPosition) * results_per_query, + cudaMemcpyDeviceToDevice, + gpuStreams[gpu] + ); CUERR; + // cudaMemcpyAsync( + // d_resultNumOverflows.data() + gpu, + // ws.d_total_overflow_number.data(), + // sizeof(int), + // cudaMemcpyDeviceToDevice, + // gpuStreams[gpu] + // ); CUERR; + + cudaEventRecord(ws.forkStreamEvent, gpuStreams[gpu]); CUERR; + + cudaSetDevice(masterDeviceId); + cudaStreamWaitEvent(masterStream1, ws.forkStreamEvent, 0); CUERR; + } + }else{ + //processQueryOnGpusWithTargetSubjectIds currently does not utilize multiple gpus + + for(int gpu = 0; gpu < 1; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + auto& ws = *workingSets[gpu]; + + cudaMemcpyAsync( + d_finalAlignmentScores_allGpus.data() + results_per_query*gpu, + ws.d_topN_scores.data(), + sizeof(float) * results_per_query, + cudaMemcpyDeviceToDevice, + gpuStreams[gpu] + ); CUERR; + 
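// reference ids and alignment end positions follow; no cross-GPU merge sort
// is performed in this branch because the target-subject scan only ran on
// deviceIds[0]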
cudaMemcpyAsync( + d_finalReferenceIds_allGpus.data() + results_per_query*gpu, + ws.d_topN_refIds.data(), + sizeof(ReferenceIdT) * results_per_query, + cudaMemcpyDeviceToDevice, + gpuStreams[gpu] + ); CUERR; + cudaMemcpyAsync( + d_finalEndPositions_allGpus.data() + results_per_query*gpu, + ws.d_topN_alignmentEndPositions.data(), + sizeof(AlignmentEndPosition) * results_per_query, + cudaMemcpyDeviceToDevice, + gpuStreams[gpu] + ); CUERR; + // cudaMemcpyAsync( + // d_resultNumOverflows.data() + gpu, + // ws.d_total_overflow_number.data(), + // sizeof(int), + // cudaMemcpyDeviceToDevice, + // gpuStreams[gpu] + // ); CUERR; + + cudaEventRecord(ws.forkStreamEvent, gpuStreams[gpu]); CUERR; + + cudaSetDevice(masterDeviceId); + cudaStreamWaitEvent(masterStream1, ws.forkStreamEvent, 0); CUERR; + } + } + + cudaSetDevice(masterDeviceId); + + if(!targetSubjectIds){ + if(numGpus > 1){ + //sort per-gpu top results to find overall top results + auto sortInputKeys = thrust::make_zip_iterator( + d_finalAlignmentScores_allGpus.begin(), + d_finalReferenceIds_allGpus.begin() + ); + thrust::sort_by_key( + thrust::cuda::par_nosync(thrust_async_allocator(masterStream1)).on(masterStream1), + sortInputKeys, + sortInputKeys + results_per_query * numGpus, + d_finalEndPositions_allGpus.begin(), + CompareScoresDescendingRefIdsAscending{} + ); + + // thrust::sort_by_key( + // thrust::cuda::par_nosync(thrust_async_allocator(masterStream1)).on(masterStream1), + // d_finalAlignmentScores_allGpus.begin(), + // d_finalAlignmentScores_allGpus.begin() + results_per_query * numGpus, + // thrust::make_zip_iterator( + // d_finalReferenceIds_allGpus.begin(), + // d_finalEndPositions_allGpus.begin() + // ), + // thrust::greater() + // ); + + + //sum the overflows per gpu + //sumNumOverflowsKernel<<<1,1,0,masterStream1>>>(d_resultNumOverflows.data(), d_resultNumOverflows.data(), numGpus); CUERR; + } + } + + cudaMemcpyAsync( + h_finalAlignmentScores.data(), + d_finalAlignmentScores_allGpus.data(), + sizeof(float) * results_per_query, + cudaMemcpyDeviceToHost, + masterStream1 + ); CUERR + cudaMemcpyAsync( + h_finalReferenceIds.data(), + d_finalReferenceIds_allGpus.data(), + sizeof(ReferenceIdT) * results_per_query, + cudaMemcpyDeviceToHost, + masterStream1 + ); CUERR + cudaMemcpyAsync( + h_finalEndPositions.data(), + d_finalEndPositions_allGpus.data(), + sizeof(AlignmentEndPosition) * results_per_query, + cudaMemcpyDeviceToHost, + masterStream1 + ); CUERR + // cudaMemcpyAsync( + // resultNumOverflows.data(), + // d_resultNumOverflows.data(), + // sizeof(int), + // cudaMemcpyDeviceToHost, + // masterStream1 + // ); CUERR + + cudaStreamSynchronize(masterStream1); CUERR; + + if(targetSubjectIds){ + //h_finalReferenceIds will contain numbers from 0 to num target subject ids. 
convert to proper target subject ids + for(int i = 0; i < results_per_query; i++){ + h_finalReferenceIds[i] = targetSubjectIds->subjectIds[h_finalReferenceIds[i]]; + } + } + } + + void processQueryOnGpus(){ + + // std::cout << "ProcessQueryOnGpus: dstinfos isUploaded\n"; + // for(size_t i = 0; i < batchPlans[0].size(); i++){ + // std::cout << batchPlansDstInfoVec[0][i].isUploaded << " "; + // } + // std::cout << "\n"; + + const std::vector>& dbPartitionsPerGpu = subPartitionsForGpus; + + // constexpr auto boundaries = getLengthPartitionBoundaries(); + // constexpr int numLengthPartitions = boundaries.size(); + const int numGpus = deviceIds.size(); + const bool useExtraThreadForBatchTransfer = numGpus > 1; + + size_t totalNumberOfSequencesToProcess = std::reduce(numSequencesPerGpu.begin(), numSequencesPerGpu.end()); + + size_t totalNumberOfProcessedSequences = 0; + + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + auto& ws = *workingSets[gpu]; + + //cudaMemsetAsync(ws.d_total_overflow_number.data(), 0, sizeof(int), gpuStreams[gpu]); + + ws.resetMaxReduceArray(gpuStreams[gpu]); + ws.resetTopNArrays(gpuStreams[gpu]); + + //create dependency on mainStream + cudaEventRecord(ws.forkStreamEvent, gpuStreams[gpu]); CUERR; + cudaStreamWaitEvent(ws.workStreamForTempUsage, ws.forkStreamEvent, 0); CUERR; + for(auto& stream : ws.workStreamsWithoutTemp){ + cudaStreamWaitEvent(stream, ws.forkStreamEvent, 0); CUERR; + } + cudaStreamWaitEvent(ws.hostFuncStream, ws.forkStreamEvent, 0); CUERR; + } + + //variables per gpu to keep between loops + struct Variables{ + int currentBuffer = 0; + int previousBuffer = 0; + cudaStream_t H2DcopyStream = cudaStreamLegacy; + char* h_inputChars = nullptr; + SequenceLengthT* h_inputLengths = nullptr; + size_t* h_inputOffsets = nullptr; + char* d_inputChars = nullptr; + SequenceLengthT* d_inputLengths = nullptr; + size_t* d_inputOffsets = nullptr; + //int* d_overflow_number = nullptr; + //ReferenceIdT* d_overflow_positions = nullptr; + const std::vector* batchPlansPtr; + const std::vector* batchPlansCachedDBPtr; + const DeviceBatchCopyToPinnedPlan* currentPlanPtr; + size_t processedSequences = 0; + size_t processedBatches = 0; + }; + + std::vector variables_vec(numGpus); + //init variables + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + const auto& ws = *workingSets[gpu]; + auto& variables = variables_vec[gpu]; + variables.processedSequences = 0; + variables.processedBatches = 0; + variables.batchPlansPtr = &batchPlans[gpu]; + variables.batchPlansCachedDBPtr = &batchPlans_cachedDB[gpu]; + } + + while(totalNumberOfProcessedSequences < totalNumberOfSequencesToProcess){ + //set up gpu variables for current iteration + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + auto& ws = *workingSets[gpu]; + auto& variables = variables_vec[gpu]; + if(variables.processedBatches < variables.batchPlansPtr->size()){ + + if(variables.processedBatches < ws.getNumBatchesInCachedDB()){ + //will process a batch that could be cached in gpu memory + if(batchPlansDstInfoVec[gpu][variables.processedBatches].isUploaded == false){ + //it is not cached, need upload + variables.currentBuffer = ws.copyBufferIndex; + if(variables.currentBuffer == 0){ + variables.previousBuffer = ws.numCopyBuffers - 1; + }else{ + variables.previousBuffer = (variables.currentBuffer - 1); + } + variables.H2DcopyStream = ws.copyStreams[variables.currentBuffer]; + if(ws.h_chardata_vec[variables.currentBuffer].size() > 0){ + 
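+ //pinned host staging buffers were allocated for this copy buffer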
variables.h_inputChars = ws.h_chardata_vec[variables.currentBuffer].data(); + }else{ + variables.h_inputChars = nullptr; + } + if(ws.h_lengthdata_vec[variables.currentBuffer].size() > 0){ + variables.h_inputLengths = ws.h_lengthdata_vec[variables.currentBuffer].data(); + }else{ + variables.h_inputLengths = nullptr; + } + if(ws.h_offsetdata_vec[variables.currentBuffer].size() > 0){ + variables.h_inputOffsets = ws.h_offsetdata_vec[variables.currentBuffer].data(); + }else{ + variables.h_inputOffsets = nullptr; + } + variables.d_inputChars = batchPlansDstInfoVec[gpu][variables.processedBatches].charsPtr; + variables.d_inputLengths = batchPlansDstInfoVec[gpu][variables.processedBatches].lengthsPtr; + variables.d_inputOffsets = batchPlansDstInfoVec[gpu][variables.processedBatches].offsetsPtr; + //variables.d_overflow_number = ws.d_overflow_number.data() + variables.currentBuffer; + //variables.d_overflow_positions = ws.d_overflow_positions_vec[variables.currentBuffer].data(); + }else{ + //already uploaded. process all batches for cached db together + assert(variables.processedBatches == 0); + variables.currentBuffer = 0; + variables.previousBuffer = 0; + variables.H2DcopyStream = ws.copyStreams[0]; + variables.h_inputChars = nullptr; + variables.h_inputLengths = nullptr; + variables.h_inputOffsets = nullptr; + variables.d_inputChars = ws.d_cacheddb->getCharData(); + variables.d_inputLengths = ws.d_cacheddb->getLengthData(); + variables.d_inputOffsets = ws.d_cacheddb->getOffsetData(); + + } + }else{ + //will process batch that cannot be cached + //upload to double buffer + variables.currentBuffer = ws.copyBufferIndex; + if(variables.currentBuffer == 0){ + variables.previousBuffer = ws.numCopyBuffers - 1; + }else{ + variables.previousBuffer = (variables.currentBuffer - 1); + } + variables.H2DcopyStream = ws.copyStreams[variables.currentBuffer]; + if(ws.h_chardata_vec[variables.currentBuffer].size() > 0){ + variables.h_inputChars = ws.h_chardata_vec[variables.currentBuffer].data(); + }else{ + variables.h_inputChars = nullptr; + } + if(ws.h_lengthdata_vec[variables.currentBuffer].size() > 0){ + variables.h_inputLengths = ws.h_lengthdata_vec[variables.currentBuffer].data(); + }else{ + variables.h_inputLengths = nullptr; + } + if(ws.h_offsetdata_vec[variables.currentBuffer].size() > 0){ + variables.h_inputOffsets = ws.h_offsetdata_vec[variables.currentBuffer].data(); + }else{ + variables.h_inputOffsets = nullptr; + } + variables.d_inputChars = batchPlansDstInfoVec[gpu][variables.processedBatches].charsPtr; + variables.d_inputLengths = batchPlansDstInfoVec[gpu][variables.processedBatches].lengthsPtr; + variables.d_inputOffsets = batchPlansDstInfoVec[gpu][variables.processedBatches].offsetsPtr; + //variables.d_overflow_number = ws.d_overflow_number.data() + variables.currentBuffer; + //variables.d_overflow_positions = ws.d_overflow_positions_vec[variables.currentBuffer].data(); + } + } + } + //upload batch + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + auto& ws = *workingSets[gpu]; + auto& variables = variables_vec[gpu]; + if(variables.processedBatches < variables.batchPlansPtr->size()){ + const bool needsUpload = !batchPlansDstInfoVec[gpu][variables.processedBatches].isUploaded; + + variables.currentPlanPtr = [&](){ + if(variables.processedBatches < ws.getNumBatchesInCachedDB()){ + if(!needsUpload){ + return &(*variables.batchPlansCachedDBPtr)[0]; + //return &(*variables.batchPlansPtr)[variables.processedBatches]; + }else{ + return 
&(*variables.batchPlansPtr)[variables.processedBatches]; + } + }else{ + return &(*variables.batchPlansPtr)[variables.processedBatches]; + } + }(); + + + if(needsUpload){ + //transfer data + //can only overwrite device buffer if it is no longer in use on workstream + cudaStreamWaitEvent(variables.H2DcopyStream, ws.deviceBufferEvents[variables.currentBuffer], 0); CUERR; + + if(useExtraThreadForBatchTransfer){ + assert(variables.h_inputChars != nullptr); + assert(variables.h_inputLengths != nullptr); + assert(variables.h_inputOffsets != nullptr); + + cudaStreamWaitEvent(ws.hostFuncStream, ws.pinnedBufferEvents[variables.currentBuffer]); CUERR; + executePinnedCopyPlanWithHostCallback( + *variables.currentPlanPtr, + variables.h_inputChars, + variables.h_inputLengths, + variables.h_inputOffsets, + dbPartitionsPerGpu[gpu], + ws.hostFuncStream + ); + cudaEventRecord(ws.forkStreamEvent, ws.hostFuncStream); CUERR; + cudaStreamWaitEvent(variables.H2DcopyStream, ws.forkStreamEvent, 0); + + cudaMemcpyAsync( + variables.d_inputChars, + variables.h_inputChars, + variables.currentPlanPtr->usedBytes, + H2D, + variables.H2DcopyStream + ); CUERR; + cudaMemcpyAsync( + variables.d_inputLengths, + variables.h_inputLengths, + sizeof(SequenceLengthT) * variables.currentPlanPtr->usedSeq, + H2D, + variables.H2DcopyStream + ); CUERR; + cudaMemcpyAsync( + variables.d_inputOffsets, + variables.h_inputOffsets, + sizeof(size_t) * (variables.currentPlanPtr->usedSeq+1), + H2D, + variables.H2DcopyStream + ); CUERR; + }else{ + //synchronize to avoid overwriting pinned buffer of target before it has been fully transferred + cudaEventSynchronize(ws.pinnedBufferEvents[variables.currentBuffer]); CUERR; + + executeCopyPlanH2DDirect( + *variables.currentPlanPtr, + variables.d_inputChars, + variables.d_inputLengths, + variables.d_inputOffsets, + dbPartitionsPerGpu[gpu], + variables.H2DcopyStream + ); + + // assert(variables.h_inputChars != nullptr); + // assert(variables.h_inputLengths != nullptr); + // assert(variables.h_inputOffsets != nullptr); + + // executePinnedCopyPlanSerialAndTransferToGpu( + // *variables.currentPlanPtr, + // variables.h_inputChars, + // variables.h_inputLengths, + // variables.h_inputOffsets, + // variables.d_inputChars, + // variables.d_inputLengths, + // variables.d_inputOffsets, + // dbPartitionsPerGpu[gpu], + // variables.H2DcopyStream + // ); + } + + cudaEventRecord(ws.pinnedBufferEvents[variables.currentBuffer], variables.H2DcopyStream); CUERR; + } + } + } + + //all data is ready for alignments. 
create dependencies for work streams + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + auto& ws = *workingSets[gpu]; + auto& variables = variables_vec[gpu]; + + if(variables.processedBatches < variables.batchPlansPtr->size()){ + + cudaEventRecord(ws.forkStreamEvent, variables.H2DcopyStream); CUERR; + cudaStreamWaitEvent(ws.workStreamForTempUsage, ws.forkStreamEvent, 0); CUERR; + for(auto& stream : ws.workStreamsWithoutTemp){ + cudaStreamWaitEvent(stream, ws.forkStreamEvent, 0); CUERR; + } + //wait for previous batch to finish + cudaStreamWaitEvent(ws.workStreamForTempUsage, ws.deviceBufferEvents[variables.previousBuffer], 0); CUERR; + for(auto& stream : ws.workStreamsWithoutTemp){ + cudaStreamWaitEvent(stream, ws.deviceBufferEvents[variables.previousBuffer], 0); CUERR; + } + + } + } + + //determine maximum number of sequences to process over all gpus + size_t maxNumSequencesInBatchForGpus = 0; + for(int gpu = 0; gpu < numGpus; gpu++){ + auto& variables = variables_vec[gpu]; + if(variables.processedBatches < variables.batchPlansPtr->size()){ + maxNumSequencesInBatchForGpus = std::max(maxNumSequencesInBatchForGpus, variables.currentPlanPtr->usedSeq); + } + } + const size_t seqsPerPass = maxReduceArraySize; + + for(size_t sequencePassOffset = 0; sequencePassOffset < maxNumSequencesInBatchForGpus; sequencePassOffset += seqsPerPass){ + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + auto& ws = *workingSets[gpu]; + auto& variables = variables_vec[gpu]; + + if(variables.processedBatches < variables.batchPlansPtr->size()){ + const size_t numSequencesInBatch = variables.currentPlanPtr->usedSeq; + + if(sequencePassOffset < numSequencesInBatch){ + const char* const inputChars = variables.d_inputChars; + const SequenceLengthT* const inputLengths = variables.d_inputLengths; + const size_t* const inputOffsets = variables.d_inputOffsets; + auto d_selectedPositions = thrust::make_counting_iterator(sequencePassOffset); + const size_t numInPass = std::min(numSequencesInBatch - sequencePassOffset, seqsPerPass); + const cudaStream_t stream = ws.workStreamsWithoutTemp[0]; + + if(scanType == ScanType::Gapless){ + auto maxReduceArray = ws.getMaxReduceArray(variables.processedSequences + sequencePassOffset); + + runGaplessFilterKernels_PSSM( + maxReduceArray, + ws.gpuPermutedPSSMforGapless, + inputChars, + inputLengths, + inputOffsets, + d_selectedPositions, + numInPass, + ws.d_tempStorageHE.data(), + ws.numTempBytes, + stream + ); + + //db sequences are processed in ascending order. 
stable sort ensures that sequences with same score are sorted by ascending id without a custom comparator + thrust::stable_sort_by_key( + thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + ws.d_maxReduceArrayScores.data(), + ws.d_maxReduceArrayScores.data() + numInPass, + ws.d_maxReduceArrayIndices.data(), + thrust::greater() + ); + + if(sequencePassOffset > 0 || totalNumberOfProcessedSequences > 0){ + auto mergeInput1 = thrust::make_zip_iterator( + ws.d_maxReduceArrayScores.data(), + ws.d_maxReduceArrayIndices.data() + ); + auto mergeInput2 = thrust::make_zip_iterator( + ws.d_topN_scores.data(), + ws.d_topN_refIds.data() + ); + auto mergeOutput = thrust::make_zip_iterator( + ws.d_topN_scores_tmp.data(), + ws.d_topN_refIds_tmp.data() + ); + thrust::merge( + thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + mergeInput1, + mergeInput1 + std::min(numInPass, size_t(results_per_query)), + mergeInput2, + mergeInput2 + results_per_query, + mergeOutput, + CompareScoresDescendingRefIdsAscending{} + ); + + std::swap(ws.d_topN_scores, ws.d_topN_scores_tmp); + std::swap(ws.d_topN_refIds, ws.d_topN_refIds_tmp); + }else{ + cudaMemcpyAsync( + ws.d_topN_scores.data(), + ws.d_maxReduceArrayScores.data(), + sizeof(float) * results_per_query, + cudaMemcpyDeviceToDevice, + stream + ); CUERR; + cudaMemcpyAsync( + ws.d_topN_refIds.data(), + ws.d_maxReduceArrayIndices.data(), + sizeof(ReferenceIdT) * results_per_query, + cudaMemcpyDeviceToDevice, + stream + ); CUERR; + } + }else if(scanType == ScanType::SW_Endpos){ + constexpr bool subjectIsCaseSensitive = true; + constexpr bool withEndPosition = true; + + auto maxReduceArray = ws.getMaxReduceArrayWithEndPositions(variables.processedSequences + sequencePassOffset); + + run_SW_endposition_kernels_PSSM( + maxReduceArray, + ws.gpuPermutedPSSMforSW, + inputChars, + inputLengths, + inputOffsets, + d_selectedPositions, + numInPass, + ws.d_tempStorageHE.data(), + ws.numTempBytes, + stream + ); + + thrust::stable_sort_by_key( + thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + ws.d_maxReduceArrayScores.data(), + ws.d_maxReduceArrayScores.data() + numInPass, + thrust::make_zip_iterator( + ws.d_maxReduceArrayIndices.data(), + ws.d_maxReduceArrayExtras.data() + ), + thrust::greater() + ); + + if(sequencePassOffset > 0 || totalNumberOfProcessedSequences > 0){ + auto mergeInput1 = thrust::make_zip_iterator( + ws.d_maxReduceArrayScores.data(), + ws.d_maxReduceArrayIndices.data(), + ws.d_maxReduceArrayExtras.data() + ); + auto mergeInput2 = thrust::make_zip_iterator( + ws.d_topN_scores.data(), + ws.d_topN_refIds.data(), + ws.d_topN_alignmentEndPositions.data() + ); + auto mergeOutput = thrust::make_zip_iterator( + ws.d_topN_scores_tmp.data(), + ws.d_topN_refIds_tmp.data(), + ws.d_topN_alignmentEndPositions_tmp.data() + ); + thrust::merge( + thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + mergeInput1, + mergeInput1 + std::min(numInPass, size_t(results_per_query)), + mergeInput2, + mergeInput2 + results_per_query, + mergeOutput, + CompareScoresDescendingRefIdsAscending{} + ); + + // thrust::merge_by_key(thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + // ws.d_maxReduceArrayScores.data(), + // ws.d_maxReduceArrayScores.data() + std::min(numInPass, size_t(results_per_query)), + // ws.d_topN_scores.data(), + // ws.d_topN_scores.data() + results_per_query, + // thrust::make_zip_iterator( + // ws.d_maxReduceArrayIndices.data(), + // ws.d_maxReduceArrayExtras.data() + // ), + // 
thrust::make_zip_iterator( + // ws.d_topN_refIds.data(), + // ws.d_topN_alignmentEndPositions.data() + // ), + // ws.d_topN_scores_tmp.data(), + // thrust::make_zip_iterator( + // ws.d_topN_refIds_tmp.data(), + // ws.d_topN_alignmentEndPositions_tmp.data() + // ), + // thrust::greater() + // ); + + std::swap(ws.d_topN_scores, ws.d_topN_scores_tmp); + std::swap(ws.d_topN_refIds, ws.d_topN_refIds_tmp); + std::swap(ws.d_topN_alignmentEndPositions, ws.d_topN_alignmentEndPositions_tmp); + }else{ + cudaMemcpyAsync( + ws.d_topN_scores.data(), + ws.d_maxReduceArrayScores.data(), + sizeof(float) * results_per_query, + cudaMemcpyDeviceToDevice, + stream + ); CUERR; + cudaMemcpyAsync( + ws.d_topN_refIds.data(), + ws.d_maxReduceArrayIndices.data(), + sizeof(ReferenceIdT) * results_per_query, + cudaMemcpyDeviceToDevice, + stream + ); CUERR; + cudaMemcpyAsync( + ws.d_topN_alignmentEndPositions.data(), + ws.d_maxReduceArrayExtras.data(), + sizeof(AlignmentEndPosition) * results_per_query, + cudaMemcpyDeviceToDevice, + stream + ); CUERR; + } + } + } + } + } + } + + //alignments are done in workstreams. now, join all workstreams + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + auto& ws = *workingSets[gpu]; + auto& variables = variables_vec[gpu]; + + if(variables.processedBatches < variables.batchPlansPtr->size()){ + for(auto& stream : ws.workStreamsWithoutTemp){ + cudaEventRecord(ws.forkStreamEvent, stream); CUERR; + cudaStreamWaitEvent(ws.workStreamForTempUsage, ws.forkStreamEvent, 0); CUERR; + } + } + } + + //finish processing of batch + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + auto& ws = *workingSets[gpu]; + const auto& variables = variables_vec[gpu]; + if(variables.processedBatches < variables.batchPlansPtr->size()){ + + //the batch is done and its data can be resused + cudaEventRecord(ws.deviceBufferEvents[variables.currentBuffer], ws.workStreamForTempUsage); CUERR; + + //let other workstreams depend on temp usage stream + for(auto& stream : ws.workStreamsWithoutTemp){ + cudaStreamWaitEvent(ws.workStreamForTempUsage, ws.deviceBufferEvents[variables.currentBuffer], 0); CUERR; + } + + ws.copyBufferIndex = (ws.copyBufferIndex+1) % ws.numCopyBuffers; + } + } + + //update running numbers + for(int gpu = 0; gpu < numGpus; gpu++){ + auto& variables = variables_vec[gpu]; + if(variables.processedBatches < variables.batchPlansPtr->size()){ + + variables.processedSequences += variables.currentPlanPtr->usedSeq; + if(batchPlansDstInfoVec[gpu][variables.processedBatches].isUploaded){ + variables.processedBatches += workingSets[gpu]->getNumBatchesInCachedDB(); + //variables.processedBatches++; + }else{ + variables.processedBatches++; + } + //std::cout << "variables.processedBatches: " << variables.processedBatches << "\n"; + + totalNumberOfProcessedSequences += variables.currentPlanPtr->usedSeq; + } + } + + } //while not done + + + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + auto& ws = *workingSets[gpu]; + + if(batchPlansDstInfoVec[gpu].size() > 0){ + if(!batchPlansDstInfoVec[gpu][0].isUploaded){ + //all batches for cached db are now resident in gpu memory. update the flags + if(ws.getNumBatchesInCachedDB() > 0){ + markCachedDBBatchesAsUploaded(gpu); + + // current offsets in cached db store the offsets for each batch, i.e. 
for each batch the offsets will start again at 0 + // compute prefix sum to obtain the single-batch offsets + + cudaMemsetAsync(ws.d_cacheddb->getOffsetData(), 0, sizeof(size_t), ws.workStreamForTempUsage); CUERR; + + auto d_paddedLengths = thrust::make_transform_iterator( + ws.d_cacheddb->getLengthData(), + RoundToNextMultiple{} + ); + + thrust::inclusive_scan( + thrust::cuda::par_nosync(thrust_async_allocator(ws.workStreamForTempUsage)).on(ws.workStreamForTempUsage), + d_paddedLengths, + d_paddedLengths + ws.getNumSequencesInCachedDB(), + ws.d_cacheddb->getOffsetData() + 1 + ); + } + } + } + } + + + + for(int gpu = 0; gpu < numGpus; gpu++){ + cudaSetDevice(deviceIds[gpu]); CUERR; + auto& ws = *workingSets[gpu]; + //create dependency for gpuStreams[gpu] + cudaEventRecord(ws.forkStreamEvent, ws.workStreamForTempUsage); CUERR; + cudaStreamWaitEvent(gpuStreams[gpu], ws.forkStreamEvent, 0); CUERR; + + for(auto& stream : ws.workStreamsWithoutTemp){ + cudaEventRecord(ws.forkStreamEvent, stream); CUERR; + cudaStreamWaitEvent(gpuStreams[gpu], ws.forkStreamEvent, 0); CUERR; + } + + // for(auto& stream : ws.copyStreams){ + // cudaEventRecord(ws.forkStreamEvent, stream); CUERR; + // cudaStreamWaitEvent(gpuStreams[gpu], ws.forkStreamEvent, 0); CUERR; + // } + + cudaEventRecord(ws.forkStreamEvent, ws.hostFuncStream); CUERR; + cudaStreamWaitEvent(gpuStreams[gpu], ws.forkStreamEvent, 0); CUERR; + } + + processingTheFirstQuery = false; + } + + void processQueryOnGpusWithTargetSubjectIds(){ + + + assert(targetSubjectIds); + const int numGpus = deviceIds.size(); + + //todo: for proper multi-gpu we need to find the correct gpu for each target subject to re-use the cached data + //for now, multi-gpu will always gather the data on the host and transfer it to gpu 0 + + prefetchDBToGpus(); + cudaSetDevice(deviceIds[0]); CUERR; + workingSets[0]->resetMaxReduceArray(gpuStreams[0]); + workingSets[0]->resetTopNArrays(gpuStreams[0]); + const size_t numCachedSubjects = workingSets[0]->d_cacheddb->getNumSubjects(); + auto cachedTargetSubjectIdsEnd = targetSubjectIds->begin(); + + if(numGpus == 1){ + cachedTargetSubjectIdsEnd = std::lower_bound(targetSubjectIds->begin(), targetSubjectIds->end(), numCachedSubjects); + } + + // [targetSubjectIds->begin() , cachedTargetSubjectIdsEnd) are in gpu mem and can be accessed directly via index, + // [cachedTargetSubjectIdsEnd, targetSubjectIds->end()) are in cpu mem and need to be processed in batches + + const size_t numCachedTargetSubjects = std::distance(targetSubjectIds->begin(), cachedTargetSubjectIdsEnd); + const size_t numUncachedTargetSubjects = std::distance(cachedTargetSubjectIdsEnd, targetSubjectIds->end()); + if(verbose){ + std::cout << "numCachedTargetSubjects " << numCachedTargetSubjects << "\n"; + std::cout << "numUncachedTargetSubjects " << numUncachedTargetSubjects << "\n"; + } + + // size_t max300000 = 0; + // for(size_t i = 0; i < 20000; i++){ + // const auto& data = fullDB.getData(); + // size_t index = fullDB.getData().numSequences() - 300000 + i; + // SequenceLengthT length = data.lengths()[index]; + // max300000 += SDIV(length,4) * 4; + // } + // std::cout << "max300000 " << max300000 << "\n"; + + + //process subjects which reside in gpu memory + if(numCachedTargetSubjects > 0){ + //cudaStream_t stream = ws.workStreamsWithoutTemp[0]; + cudaStream_t stream = gpuStreams[0]; + + auto& ws = *workingSets[0]; + const char* const inputChars = ws.d_cacheddb->getCharData(); + const SequenceLengthT* const inputLengths = ws.d_cacheddb->getLengthData(); + const 
size_t* const inputOffsets = ws.d_cacheddb->getOffsetData(); + + ReferenceIdT* d_selectedPositions; + char* d_availableTempStorage = ws.d_tempStorageHE.data(); + size_t availableTempStorageBytes = ws.numTempBytes; + + size_t numBytesForSelectedPositions = SDIV(sizeof(ReferenceIdT) * numCachedTargetSubjects, 512) * 512; + if(numBytesForSelectedPositions < availableTempStorageBytes * 0.4){ + d_selectedPositions = (ReferenceIdT*)d_availableTempStorage; + d_availableTempStorage = ((char*)d_availableTempStorage) + numBytesForSelectedPositions; + availableTempStorageBytes -= numBytesForSelectedPositions; + }else{ + cudaMallocAsync(&d_selectedPositions, sizeof(ReferenceIdT) * numCachedTargetSubjects, stream); CUERR; + } + + cudaMemcpyAsync( + d_selectedPositions, + targetSubjectIds->subjectIds.data(), + sizeof(ReferenceIdT) * numCachedTargetSubjects, + cudaMemcpyHostToDevice, + stream + ); CUERR; + + if(scanType == ScanType::Gapless){ + const size_t seqsPerPass = maxReduceArraySize; + for(size_t sequencePassOffset = 0; sequencePassOffset < numCachedTargetSubjects; sequencePassOffset += seqsPerPass){ + const size_t numInPass = std::min(numCachedTargetSubjects - sequencePassOffset, seqsPerPass); + + auto maxReduceArray = ws.getMaxReduceArray(sequencePassOffset); + + runGaplessFilterKernels_PSSM( + maxReduceArray, + ws.gpuPermutedPSSMforGapless, + inputChars, + inputLengths, + inputOffsets, + d_selectedPositions + sequencePassOffset, + numInPass, + d_availableTempStorage, + availableTempStorageBytes, + stream + ); + + thrust::stable_sort_by_key( + thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + ws.d_maxReduceArrayScores.data(), + ws.d_maxReduceArrayScores.data() + numInPass, + ws.d_maxReduceArrayIndices.data(), + thrust::greater() + ); + + if(sequencePassOffset > 0){ + auto mergeInput1 = thrust::make_zip_iterator( + ws.d_maxReduceArrayScores.data(), + ws.d_maxReduceArrayIndices.data() + ); + auto mergeInput2 = thrust::make_zip_iterator( + ws.d_topN_scores.data(), + ws.d_topN_refIds.data() + ); + auto mergeOutput = thrust::make_zip_iterator( + ws.d_topN_scores_tmp.data(), + ws.d_topN_refIds_tmp.data() + ); + thrust::merge( + thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + mergeInput1, + mergeInput1 + std::min(numInPass, size_t(results_per_query)), + mergeInput2, + mergeInput2 + results_per_query, + mergeOutput, + CompareScoresDescendingRefIdsAscending{} + ); + // thrust::merge_by_key(thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + // ws.d_maxReduceArrayScores.data(), + // ws.d_maxReduceArrayScores.data() + std::min(numInPass, size_t(results_per_query)), + // ws.d_topN_scores.data(), + // ws.d_topN_scores.data() + results_per_query, + // ws.d_maxReduceArrayIndices.data(), + // ws.d_topN_refIds.data(), + // ws.d_topN_scores_tmp.data(), + // ws.d_topN_refIds_tmp.data(), + // thrust::greater() + // ); + + std::swap(ws.d_topN_scores, ws.d_topN_scores_tmp); + std::swap(ws.d_topN_refIds, ws.d_topN_refIds_tmp); + }else{ + cudaMemcpyAsync( + ws.d_topN_scores.data(), + ws.d_maxReduceArrayScores.data(), + sizeof(float) * results_per_query, + cudaMemcpyDeviceToDevice, + stream + ); CUERR; + cudaMemcpyAsync( + ws.d_topN_refIds.data(), + ws.d_maxReduceArrayIndices.data(), + sizeof(ReferenceIdT) * results_per_query, + cudaMemcpyDeviceToDevice, + stream + ); CUERR; + } + + } + }else if(scanType == ScanType::SW_Endpos){ + constexpr bool subjectIsCaseSensitive = true; + constexpr bool withEndPosition = true; + + const size_t seqsPerPass = 
maxReduceArraySize; + for(size_t sequencePassOffset = 0; sequencePassOffset < numCachedTargetSubjects; sequencePassOffset += seqsPerPass){ + const size_t numInPass = std::min(numCachedTargetSubjects - sequencePassOffset, seqsPerPass); + + auto maxReduceArray = ws.getMaxReduceArrayWithEndPositions(sequencePassOffset); + + run_SW_endposition_kernels_PSSM( + maxReduceArray, + ws.gpuPermutedPSSMforSW, + inputChars, + inputLengths, + inputOffsets, + d_selectedPositions + sequencePassOffset, + numInPass, + d_availableTempStorage, + availableTempStorageBytes, + stream + ); + + thrust::stable_sort_by_key( + thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + ws.d_maxReduceArrayScores.data(), + ws.d_maxReduceArrayScores.data() + numInPass, + thrust::make_zip_iterator( + ws.d_maxReduceArrayIndices.data(), + ws.d_maxReduceArrayExtras.data() + ), + thrust::greater() + ); + + if(sequencePassOffset > 0){ + auto mergeInput1 = thrust::make_zip_iterator( + ws.d_maxReduceArrayScores.data(), + ws.d_maxReduceArrayIndices.data(), + ws.d_maxReduceArrayExtras.data() + ); + auto mergeInput2 = thrust::make_zip_iterator( + ws.d_topN_scores.data(), + ws.d_topN_refIds.data(), + ws.d_topN_alignmentEndPositions.data() + ); + auto mergeOutput = thrust::make_zip_iterator( + ws.d_topN_scores_tmp.data(), + ws.d_topN_refIds_tmp.data(), + ws.d_topN_alignmentEndPositions_tmp.data() + ); + thrust::merge( + thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + mergeInput1, + mergeInput1 + std::min(numInPass, size_t(results_per_query)), + mergeInput2, + mergeInput2 + results_per_query, + mergeOutput, + CompareScoresDescendingRefIdsAscending{} + ); + // thrust::merge_by_key(thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + // ws.d_maxReduceArrayScores.data(), + // ws.d_maxReduceArrayScores.data() + std::min(numInPass, size_t(results_per_query)), + // ws.d_topN_scores.data(), + // ws.d_topN_scores.data() + results_per_query, + // thrust::make_zip_iterator( + // ws.d_maxReduceArrayIndices.data(), + // ws.d_maxReduceArrayExtras.data() + // ), + // thrust::make_zip_iterator( + // ws.d_topN_refIds.data(), + // ws.d_topN_alignmentEndPositions.data() + // ), + // ws.d_topN_scores_tmp.data(), + // thrust::make_zip_iterator( + // ws.d_topN_refIds_tmp.data(), + // ws.d_topN_alignmentEndPositions_tmp.data() + // ), + // thrust::greater() + // ); + + std::swap(ws.d_topN_scores, ws.d_topN_scores_tmp); + std::swap(ws.d_topN_refIds, ws.d_topN_refIds_tmp); + std::swap(ws.d_topN_alignmentEndPositions, ws.d_topN_alignmentEndPositions_tmp); + }else{ + cudaMemcpyAsync( + ws.d_topN_scores.data(), + ws.d_maxReduceArrayScores.data(), + sizeof(float) * results_per_query, + cudaMemcpyDeviceToDevice, + stream + ); CUERR; + cudaMemcpyAsync( + ws.d_topN_refIds.data(), + ws.d_maxReduceArrayIndices.data(), + sizeof(ReferenceIdT) * results_per_query, + cudaMemcpyDeviceToDevice, + stream + ); CUERR; + cudaMemcpyAsync( + ws.d_topN_alignmentEndPositions.data(), + ws.d_maxReduceArrayExtras.data(), + sizeof(AlignmentEndPosition) * results_per_query, + cudaMemcpyDeviceToDevice, + stream + ); CUERR; + } + + } + } + + if(numBytesForSelectedPositions >= availableTempStorageBytes * 0.4){ + cudaFreeAsync(d_selectedPositions, stream); CUERR; + } + } + + //process subjects which reside in host memory + if(numUncachedTargetSubjects > 0){ + auto& ws = *workingSets[0]; + //cudaStream_t stream = ws.workStreamsWithoutTemp[0]; + cudaStream_t stream = gpuStreams[0]; + + helpers::CpuTimer 
targetGatherTimer("targetGatherTimer"); + std::vector targetChars; + std::vector targetOffsets(numUncachedTargetSubjects+1, 0); + std::vector targetLengths(numUncachedTargetSubjects); + for(size_t i = 0; i < numUncachedTargetSubjects; i++){ + const auto& data = fullDB.getData(); + const ReferenceIdT subjectId = *(cachedTargetSubjectIdsEnd + i); + const size_t offsetBegin = data.offsets()[subjectId]; + const size_t offsetEnd = data.offsets()[subjectId+1]; + SequenceLengthT length = data.lengths()[subjectId]; + targetChars.insert(targetChars.end(), data.chars() + offsetBegin, data.chars() + offsetEnd); + targetOffsets[i+1] = targetChars.size(); + targetLengths[i] = length; + } + if(verbose){ + targetGatherTimer.print(); + } + + std::vector targetDBPartition{DBdataView( + 0, + numUncachedTargetSubjects, + 0, + targetChars.data(), + targetLengths.data(), + targetOffsets.data(), + nullptr, + nullptr + )}; + std::vector targetBatchPlans = computeDbCopyPlan( + targetDBPartition, + {0}, + memoryConfig.maxBatchBytes, + memoryConfig.maxBatchSequences + ); + + + char* d_targetBasePtr; + char* d_targetChars; + size_t* d_targetOffsets; + SequenceLengthT* d_targetLengths; + char* d_availableTempStorage = ws.d_tempStorageHE.data(); + size_t availableTempStorageBytes = ws.numTempBytes; + + size_t bytes[3]{ + SDIV(std::min(memoryConfig.maxBatchBytes, targetChars.size()), 512) * 512, //chars + SDIV(sizeof(size_t) * std::min(memoryConfig.maxBatchSequences+1, targetLengths.size()+1), 512) * 512, //offsets + SDIV(sizeof(SequenceLengthT) * std::min(memoryConfig.maxBatchSequences, targetLengths.size()), 512) * 512, //lengths + }; + size_t bytesSum = bytes[0] + bytes[1] + bytes[2]; + + if(bytesSum < availableTempStorageBytes * 0.5){ + d_targetChars = (char*)d_availableTempStorage; + d_availableTempStorage = ((char*)d_availableTempStorage) + bytes[0]; + availableTempStorageBytes -= bytes[0]; + d_targetOffsets = (size_t*)d_availableTempStorage; + d_availableTempStorage = ((char*)d_availableTempStorage) + bytes[1]; + availableTempStorageBytes -= bytes[1]; + d_targetLengths = (SequenceLengthT*)d_availableTempStorage; + d_availableTempStorage = ((char*)d_availableTempStorage) + bytes[2]; + availableTempStorageBytes -= bytes[2]; + }else{ + cudaMallocAsync(&d_targetBasePtr, bytesSum, stream); CUERR; + d_targetChars = (char*)d_targetBasePtr; + d_targetOffsets = (size_t*)(((char*)d_targetChars) + bytes[0]); + d_targetLengths = (SequenceLengthT*)(((char*)d_targetOffsets) + bytes[1]); + } + + size_t numProcessed = 0; + for(const auto& batchPlan : targetBatchPlans){ + + executeCopyPlanH2DDirect( + batchPlan, + d_targetChars, + d_targetLengths, + d_targetOffsets, + targetDBPartition, + stream + ); + + auto d_selectedPositions = thrust::make_counting_iterator(0); + + if(scanType == ScanType::Gapless){ + + const size_t seqsPerPass = maxReduceArraySize; + for(size_t sequencePassOffset = 0; sequencePassOffset < numUncachedTargetSubjects; sequencePassOffset += seqsPerPass){ + const size_t numInPass = std::min(numUncachedTargetSubjects - sequencePassOffset, seqsPerPass); + + auto maxReduceArray = ws.getMaxReduceArray(numCachedTargetSubjects + numProcessed + sequencePassOffset); + + runGaplessFilterKernels_PSSM( + maxReduceArray, + ws.gpuPermutedPSSMforGapless, + d_targetChars, + d_targetLengths, + d_targetOffsets, + d_selectedPositions + sequencePassOffset, + numInPass, + d_availableTempStorage, + availableTempStorageBytes, + stream + ); + + thrust::stable_sort_by_key( + 
thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + ws.d_maxReduceArrayScores.data(), + ws.d_maxReduceArrayScores.data() + numInPass, + ws.d_maxReduceArrayIndices.data(), + thrust::greater() + ); + + //merge kernel results with previous results + if(sequencePassOffset > 0 || numCachedTargetSubjects > 0){ + auto mergeInput1 = thrust::make_zip_iterator( + ws.d_maxReduceArrayScores.data(), + ws.d_maxReduceArrayIndices.data() + ); + auto mergeInput2 = thrust::make_zip_iterator( + ws.d_topN_scores.data(), + ws.d_topN_refIds.data() + ); + auto mergeOutput = thrust::make_zip_iterator( + ws.d_topN_scores_tmp.data(), + ws.d_topN_refIds_tmp.data() + ); + thrust::merge( + thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + mergeInput1, + mergeInput1 + std::min(numInPass, size_t(results_per_query)), + mergeInput2, + mergeInput2 + results_per_query, + mergeOutput, + CompareScoresDescendingRefIdsAscending{} + ); + // thrust::merge_by_key(thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + // ws.d_maxReduceArrayScores.data(), + // ws.d_maxReduceArrayScores.data() + std::min(numInPass, size_t(results_per_query)), + // ws.d_topN_scores.data(), + // ws.d_topN_scores.data() + results_per_query, + // ws.d_maxReduceArrayIndices.data(), + // ws.d_topN_refIds.data(), + // ws.d_topN_scores_tmp.data(), + // ws.d_topN_refIds_tmp.data(), + // thrust::greater() + // ); + + std::swap(ws.d_topN_scores, ws.d_topN_scores_tmp); + std::swap(ws.d_topN_refIds, ws.d_topN_refIds_tmp); + }else{ + cudaMemcpyAsync( + ws.d_topN_scores.data(), + ws.d_maxReduceArrayScores.data(), + sizeof(float) * results_per_query, + cudaMemcpyDeviceToDevice, + stream + ); CUERR; + cudaMemcpyAsync( + ws.d_topN_refIds.data(), + ws.d_maxReduceArrayIndices.data(), + sizeof(ReferenceIdT) * results_per_query, + cudaMemcpyDeviceToDevice, + stream + ); CUERR; + } + + } + }else if(scanType == ScanType::SW_Endpos){ + constexpr bool subjectIsCaseSensitive = true; + constexpr bool withEndPosition = true; + + const size_t seqsPerPass = maxReduceArraySize; + for(size_t sequencePassOffset = 0; sequencePassOffset < numUncachedTargetSubjects; sequencePassOffset += seqsPerPass){ + const size_t numInPass = std::min(numUncachedTargetSubjects - sequencePassOffset, seqsPerPass); + + auto maxReduceArray = ws.getMaxReduceArrayWithEndPositions(numCachedTargetSubjects + numProcessed + sequencePassOffset); + + run_SW_endposition_kernels_PSSM( + maxReduceArray, + ws.gpuPermutedPSSMforSW, + d_targetChars, + d_targetLengths, + d_targetOffsets, + d_selectedPositions + sequencePassOffset, + numInPass, + d_availableTempStorage, + availableTempStorageBytes, + stream + ); + + thrust::stable_sort_by_key( + thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + ws.d_maxReduceArrayScores.data(), + ws.d_maxReduceArrayScores.data() + numInPass, + thrust::make_zip_iterator( + ws.d_maxReduceArrayIndices.data(), + ws.d_maxReduceArrayExtras.data() + ), + thrust::greater() + ); + + if(sequencePassOffset > 0 || numCachedTargetSubjects > 0){ + auto mergeInput1 = thrust::make_zip_iterator( + ws.d_maxReduceArrayScores.data(), + ws.d_maxReduceArrayIndices.data(), + ws.d_maxReduceArrayExtras.data() + ); + auto mergeInput2 = thrust::make_zip_iterator( + ws.d_topN_scores.data(), + ws.d_topN_refIds.data(), + ws.d_topN_alignmentEndPositions.data() + ); + auto mergeOutput = thrust::make_zip_iterator( + ws.d_topN_scores_tmp.data(), + ws.d_topN_refIds_tmp.data(), + ws.d_topN_alignmentEndPositions_tmp.data() + ); + thrust::merge( + 
thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + mergeInput1, + mergeInput1 + std::min(numInPass, size_t(results_per_query)), + mergeInput2, + mergeInput2 + results_per_query, + mergeOutput, + CompareScoresDescendingRefIdsAscending{} + ); + + // thrust::merge_by_key(thrust::cuda::par_nosync(thrust_async_allocator(stream)).on(stream), + // ws.d_maxReduceArrayScores.data(), + // ws.d_maxReduceArrayScores.data() + std::min(numInPass, size_t(results_per_query)), + // ws.d_topN_scores.data(), + // ws.d_topN_scores.data() + results_per_query, + // thrust::make_zip_iterator( + // ws.d_maxReduceArrayIndices.data(), + // ws.d_maxReduceArrayExtras.data() + // ), + // thrust::make_zip_iterator( + // ws.d_topN_refIds.data(), + // ws.d_topN_alignmentEndPositions.data() + // ), + // ws.d_topN_scores_tmp.data(), + // thrust::make_zip_iterator( + // ws.d_topN_refIds_tmp.data(), + // ws.d_topN_alignmentEndPositions_tmp.data() + // ), + // thrust::greater() + // ); + + std::swap(ws.d_topN_scores, ws.d_topN_scores_tmp); + std::swap(ws.d_topN_refIds, ws.d_topN_refIds_tmp); + std::swap(ws.d_topN_alignmentEndPositions, ws.d_topN_alignmentEndPositions_tmp); + }else{ + cudaMemcpyAsync( + ws.d_topN_scores.data(), + ws.d_maxReduceArrayScores.data(), + sizeof(float) * results_per_query, + cudaMemcpyDeviceToDevice, + stream + ); CUERR; + cudaMemcpyAsync( + ws.d_topN_refIds.data(), + ws.d_maxReduceArrayIndices.data(), + sizeof(ReferenceIdT) * results_per_query, + cudaMemcpyDeviceToDevice, + stream + ); CUERR; + cudaMemcpyAsync( + ws.d_topN_alignmentEndPositions.data(), + ws.d_maxReduceArrayExtras.data(), + sizeof(AlignmentEndPosition) * results_per_query, + cudaMemcpyDeviceToDevice, + stream + ); CUERR; + } + + } + } + + numProcessed += batchPlan.usedSeq; + } + + if(bytesSum >= availableTempStorageBytes * 0.5){ + cudaFreeAsync(d_targetBasePtr, stream); CUERR; + } + } + + // const int numGpus = deviceIds.size(); + // for(int gpu = 0; gpu < numGpus; gpu++){ + // cudaSetDevice(deviceIds[gpu]); CUERR; + // auto& ws = *workingSets[gpu]; + // //create dependency for gpuStreams[gpu] + // cudaEventRecord(ws.forkStreamEvent, ws.workStreamForTempUsage); CUERR; + // cudaStreamWaitEvent(gpuStreams[gpu], ws.forkStreamEvent, 0); CUERR; + + // for(auto& stream : ws.workStreamsWithoutTemp){ + // cudaEventRecord(ws.forkStreamEvent, stream); CUERR; + // cudaStreamWaitEvent(gpuStreams[gpu], ws.forkStreamEvent, 0); CUERR; + // } + + // // for(auto& stream : ws.copyStreams){ + // // cudaEventRecord(ws.forkStreamEvent, stream); CUERR; + // // cudaStreamWaitEvent(gpuStreams[gpu], ws.forkStreamEvent, 0); CUERR; + // // } + + // cudaEventRecord(ws.forkStreamEvent, ws.hostFuncStream); CUERR; + // cudaStreamWaitEvent(gpuStreams[gpu], ws.forkStreamEvent, 0); CUERR; + // } + + processingTheFirstQuery = false; + + } + + BenchmarkStats makeBenchmarkStats(double seconds, double cells, int overflows) const{ + BenchmarkStats stats; + stats.seconds = seconds; + stats.gcups = cells / 1000. / 1000. 
/ 1000.; + stats.gcups = stats.gcups / stats.seconds; + stats.numOverflows = overflows; + return stats; + } + + void updateNumResultsPerQuery(){ + + results_per_query = std::min(size_t(numTop), size_t(maxReduceArraySize)); + if(dbIsReady){ + results_per_query = std::min(size_t(results_per_query), fullDB.getData().numSequences()); + } + } + + int getMaxSingleTileQueryLength_Gapless() const{ + auto largestconfig = *std::max_element(availableKernelConfigs_gapless_singletile.begin(), + availableKernelConfigs_gapless_singletile.end(), + [](const auto& l, const auto& r){ + return l.tilesize < r.tilesize; + } + ); + return largestconfig.tilesize; + + //must be a multiple of 64 + // return 2048; + //return 0; + } + + int getMaxSingleTileQueryLength_SW() const{ + auto largestconfig = *std::max_element(availableKernelConfigs_sw_singletile.begin(), + availableKernelConfigs_sw_singletile.end(), + [](const auto& l, const auto& r){ + return l.tilesize < r.tilesize; + } + ); + return largestconfig.tilesize; + + //must be a multiple of 32 + // return 1408; + //return 0; + } + + + + + + public: + GaplessKernelConfig customKernelConfig_Gapless; + bool useCustomKernelConfig_Gapless = false; + + SmithWatermanKernelConfig customKernelConfig_SW; + bool useCustomKernelConfig_SW = false; + + void setCustomKernelConfig_Gapless(GaplessKernelConfig config){ + customKernelConfig_Gapless = config; + useCustomKernelConfig_Gapless = true; + } + + void setCustomKernelConfig_SW(SmithWatermanKernelConfig config){ + customKernelConfig_SW = config; + useCustomKernelConfig_SW = true; + } + private: + + + + GaplessKernelConfig getSingleTileGroupRegConfigForPSSM_Gapless(int queryLength){ + if(useCustomKernelConfig_Gapless){ + return customKernelConfig_Gapless; + } + const auto& configs = availableKernelConfigs_gapless_singletile; + auto it = std::lower_bound(configs.begin(), configs.end(), queryLength, + [](const GaplessKernelConfig& l, int r){ + return l.tilesize < r; + } + ); + if(it == configs.end()){ + throw std::runtime_error("kernel config does not exist"); + }else{ + return *it; + } + } + + GaplessKernelConfig getMultiTileGroupRegConfigForPSSM_Gapless(int queryLength){ + if(useCustomKernelConfig_Gapless){ + return customKernelConfig_Gapless; + } + const auto& configs = availableKernelConfigs_gapless_multitile; + + //find the config which best utilizes the last tile. larger tile sizes are preferred + auto selectedConfig = configs[0]; + const int remainderInLastTile0 = queryLength % selectedConfig.tilesize; + double utilization = remainderInLastTile0 == 0 ? 1.0 : double(remainderInLastTile0) / selectedConfig.tilesize; + for(size_t i = 1; i < configs.size(); i++){ + const auto& newConfig = configs[i]; + const int remainderInLastTile = queryLength % newConfig.tilesize; + const double newUtilization = remainderInLastTile == 0 ? 
1.0 : double(remainderInLastTile) / newConfig.tilesize; + if(newUtilization >= utilization){ + utilization = newUtilization; + selectedConfig = newConfig; + } + } + + return selectedConfig; + } + + SmithWatermanKernelConfig getSingleTileGroupRegConfigForPSSM_SW(int queryLength){ + if(useCustomKernelConfig_SW){ + return customKernelConfig_SW; + } + const auto& configs = availableKernelConfigs_sw_singletile; + auto it = std::lower_bound(configs.begin(), configs.end(), queryLength, + [](const SmithWatermanKernelConfig& l, int r){ + return l.tilesize < r; + } + ); + if(it == configs.end()){ + throw std::runtime_error("kernel config does not exist"); + }else{ + return *it; + } + } + + SmithWatermanKernelConfig getMultiTileGroupRegConfigForPSSM_SW(int queryLength){ + if(useCustomKernelConfig_SW){ + return customKernelConfig_SW; + } + const auto& configs = availableKernelConfigs_sw_multitile; + + //find the config which best utilizes the last tile. larger tile sizes are preferred + auto selectedConfig = configs[0]; + const int remainderInLastTile0 = queryLength % selectedConfig.tilesize; + double utilization = remainderInLastTile0 == 0 ? 1.0 : double(remainderInLastTile0) / selectedConfig.tilesize; + for(size_t i = 1; i < configs.size(); i++){ + const auto& newConfig = configs[i]; + const int remainderInLastTile = queryLength % newConfig.tilesize; + const double newUtilization = remainderInLastTile == 0 ? 1.0 : double(remainderInLastTile) / newConfig.tilesize; + if(newUtilization >= utilization){ + utilization = newUtilization; + selectedConfig = newConfig; + } + } + + return selectedConfig; + } + + std::vector> getSupportedGroupRegConfigs_gapless_singletile() const{ + std::vector> validRegConfigs; + #define X(g,r)\ + validRegConfigs.push_back(std::make_tuple(g,r)); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + return validRegConfigs; + } + + std::vector> getSupportedGroupRegConfigs_gapless_multitile() const{ + std::vector> validRegConfigs; + #define X(g,r)\ + validRegConfigs.push_back(std::make_tuple(g,r)); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + return validRegConfigs; + } + + std::vector> getSupportedGroupRegConfigs_swendpos_singletile() const{ + std::vector> validRegConfigs; + #define X(g,r)\ + validRegConfigs.push_back(std::make_tuple(g,r)); + + PSSM_SW_ENDPOS_SINGLETILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + return validRegConfigs; + } + + std::vector> getSupportedGroupRegConfigs_swendpos_multitile() const{ + std::vector> validRegConfigs; + #define X(g,r)\ + validRegConfigs.push_back(std::make_tuple(g,r)); + + PSSM_SW_ENDPOS_MULTITILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + return validRegConfigs; + } + + void initializeListOfAvailableKernelConfigs(const KernelConfigFilenames& kernelConfigFilenames){ + + const auto configsGapless = [&](){ + if(kernelConfigFilenames.gapless){ + return loadKernelConfigsFromFile_gapless(kernelConfigFilenames.gapless.value()); + }else{ + return getOptimalKernelConfigs_gapless(deviceIds[0]); + } + }(); + + { + const auto supported = getSupportedGroupRegConfigs_gapless_singletile(); + + for(const auto& config : configsGapless){ + for(const auto& tup : supported){ + if(config.groupsize == std::get<0>(tup) && config.numRegs == std::get<1>(tup)){ + availableKernelConfigs_gapless_singletile.push_back(config); + break; + } + } + } + if(availableKernelConfigs_gapless_singletile.empty()){ + throw std::runtime_error("availableKernelConfigs_gapless_singletile is empty"); + } + } + { + 
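+ //keep only those gapless configs whose (groupsize, numRegs) pair has a compiled multi-tile kernel variant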
const auto supported = getSupportedGroupRegConfigs_gapless_multitile(); + + for(const auto& config : configsGapless){ + for(const auto& tup : supported){ + if(config.groupsize == std::get<0>(tup) && config.numRegs == std::get<1>(tup)){ + availableKernelConfigs_gapless_multitile.push_back(config); + break; + } + } + } + if(availableKernelConfigs_gapless_multitile.empty()){ + throw std::runtime_error("availableKernelConfigs_gapless_multitile is empty"); + } + } + + const auto configsSW = [&](){ + if(kernelConfigFilenames.sw){ + return loadKernelConfigsFromFile_sw(kernelConfigFilenames.sw.value()); + }else{ + return getOptimalKernelConfigs_SW(deviceIds[0]); + } + }(); + + { + const auto supported = getSupportedGroupRegConfigs_swendpos_singletile(); + + for(const auto& config : configsSW){ + for(const auto& tup : supported){ + if(config.groupsize == std::get<0>(tup) && config.numRegs == std::get<1>(tup)){ + availableKernelConfigs_sw_singletile.push_back(config); + break; + } + } + } + if(availableKernelConfigs_sw_singletile.empty()){ + throw std::runtime_error("availableKernelConfigs_sw_singletile is empty"); + } + } + { + const auto supported = getSupportedGroupRegConfigs_swendpos_multitile(); + + for(const auto& config : configsSW){ + for(const auto& tup : supported){ + if(config.groupsize == std::get<0>(tup) && config.numRegs == std::get<1>(tup)){ + availableKernelConfigs_sw_multitile.push_back(config); + break; + } + } + } + if(availableKernelConfigs_sw_multitile.empty()){ + throw std::runtime_error("availableKernelConfigs_sw_multitile is empty"); + } + } + } + + std::vector loadKernelConfigsFromFile_gapless(const std::string& filename){ + std::ifstream is(filename); + if(!is){ + throw std::runtime_error("could not open file " + filename); + } + + auto split = [](const std::string& str, char c){ + std::vector result; + + std::stringstream ss(str); + std::string s; + + while (std::getline(ss, s, c)) { + result.emplace_back(s); + } + + return result; + }; + + std::vector result; + + std::string line; + while(std::getline(is, line)){ + if(line.size() > 0){ + if(line[0] == '#') continue; + auto tokens = split(line, ' '); + if(tokens.size() < 5) throw std::runtime_error("error parsing kernel configs file"); + + GaplessKernelConfig config; + config.tilesize = std::stoi(tokens[0]); + config.groupsize = std::stoi(tokens[1]); + config.numRegs = std::stoi(tokens[2]); + config.dpx = std::stoi(tokens[3]); + config.approach = GaplessKernelConfig::Approach(std::stoi(tokens[4])); + result.push_back(config); + } + } + + return result; + } + + std::vector loadKernelConfigsFromFile_sw(const std::string& filename){ + std::ifstream is(filename); + if(!is){ + throw std::runtime_error("could not open file " + filename); + } + + auto split = [](const std::string& str, char c){ + std::vector result; + + std::stringstream ss(str); + std::string s; + + while (std::getline(ss, s, c)) { + result.emplace_back(s); + } + + return result; + }; + + std::vector result; + + std::string line; + while(std::getline(is, line)){ + if(line.size() > 0){ + if(line[0] == '#') continue; + auto tokens = split(line, ' '); + if(tokens.size() < 5) throw std::runtime_error("error parsing kernel configs file"); + + SmithWatermanKernelConfig config; + config.tilesize = std::stoi(tokens[0]); + config.groupsize = std::stoi(tokens[1]); + config.numRegs = std::stoi(tokens[2]); + config.dpx = std::stoi(tokens[3]); + config.approach = SmithWatermanKernelConfig::Approach(std::stoi(tokens[4])); + result.push_back(config); + } + } + + return result; 
+ } + + int affine_local_DP_host_protein_blosum62( + const char* seq1, + const char* seq2, + const int length1, + const int length2, + const int gap_open, + const int gap_extend + ) { + const int NEGINFINITY = -10000; + std::vector penalty_H(2*(length2+1)); + std::vector penalty_F(2*(length2+1)); + + int E, F, maxi = 0, result; + penalty_H[0] = 0; + penalty_F[0] = NEGINFINITY; + for (int index = 1; index <= length2; index++) { + penalty_H[index] = 0; + penalty_F[index] = NEGINFINITY; + } + + auto convert_AA = cudasw4::ConvertAA_20{}; + + auto BLOSUM = cudasw4::BLOSUM62_20::get2D(); + + for (int row = 1; row <= length1; row++) { + char seq1_char = seq1[row-1]; + char seq2_char; + + const int target_row = row & 1; + const int source_row = !target_row; + penalty_H[target_row*(length2+1)] = 0; //gap_open + (row-1)*gap_extend; + penalty_F[target_row*(length2+1)] = gap_open + (row-1)*gap_extend; + E = NEGINFINITY; + for (int col = 1; col <= length2; col++) { + const int diag = penalty_H[source_row*(length2+1)+col-1]; + const int abve = penalty_H[source_row*(length2+1)+col+0]; + const int left = penalty_H[target_row*(length2+1)+col-1]; + seq2_char = seq2[col-1]; + const int residue = BLOSUM[convert_AA(seq1_char)][convert_AA(seq2_char)]; + E = std::max(E+gap_extend, left+gap_open); + F = std::max(penalty_F[source_row*(length2+1)+col+0]+gap_extend, abve+gap_open); + result = std::max(0, std::max(diag + residue, std::max(E, F))); + penalty_H[target_row*(length2+1)+col] = result; + if (result > maxi) maxi = result; + penalty_F[target_row*(length2+1)+col] = F; + } + } + return maxi; + } + + //sequences must be in to ncbi converted format + int affine_local_DP_host_protein_blosum62_converted( + const char* seq1, + const char* seq2, + const int length1, + const int length2, + const int gap_open, + const int gap_extend + ) { + const int NEGINFINITY = -10000; + std::vector penalty_H(2*(length2+1)); + std::vector penalty_F(2*(length2+1)); + + // std::cout << "length1 " << length1 << ", length2 " << length2 << "\n"; + + // for(int i = 0; i < length1; i++){ + // std::cout << int(seq1[i]) << " "; + // } + // std::cout << "\n"; + + // for(int i = 0; i < length2; i++){ + // std::cout << int(seq2[i]) << " "; + // } + // std::cout << "\n"; + + int E, F, maxi = 0, result; + penalty_H[0] = 0; + penalty_F[0] = NEGINFINITY; + for (int index = 1; index <= length2; index++) { + penalty_H[index] = 0; + penalty_F[index] = NEGINFINITY; + } + + auto BLOSUM = cudasw4::BLOSUM62_20::get2D(); + + for (int row = 1; row <= length1; row++) { + int seq1_char = seq1[row-1]; + int seq2_char; + + const int target_row = row & 1; + const int source_row = !target_row; + penalty_H[target_row*(length2+1)] = 0; //gap_open + (row-1)*gap_extend; + penalty_F[target_row*(length2+1)] = gap_open + (row-1)*gap_extend; + E = NEGINFINITY; + for (int col = 1; col <= length2; col++) { + const int diag = penalty_H[source_row*(length2+1)+col-1]; + const int abve = penalty_H[source_row*(length2+1)+col+0]; + const int left = penalty_H[target_row*(length2+1)+col-1]; + seq2_char = seq2[col-1]; + const int residue = BLOSUM[seq1_char][seq2_char]; + E = std::max(E+gap_extend, left+gap_open); + F = std::max(penalty_F[source_row*(length2+1)+col+0]+gap_extend, abve+gap_open); + result = std::max(0, std::max(diag + residue, std::max(E, F))); + penalty_H[target_row*(length2+1)+col] = result; + if (result > maxi) maxi = result; + penalty_F[target_row*(length2+1)+col] = F; + + //std::cout << maxi << " "; + } + //std::cout << "\n"; + } + return maxi; + } + + 
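+ //For comparison with the affine-gap recurrence above, the gapless filter below keeps only the
+ //diagonal term of the DP, H[i][j] = max(0, H[i-1][j-1] + BLOSUM62[seq1[i-1]][seq2[j-1]]); the E and F
+ //gap states are dropped entirely. Illustrative host-side use of the two reference scorers on
+ //NCBI-converted buffers (variable names are placeholders; gop/gex are the gap penalties used by the
+ //gpu kernels in this class, assuming the same sign convention as gap_open/gap_extend above):
+ //  const int swScore = affine_local_DP_host_protein_blosum62_converted(query, subject, queryLength, subjectLength, gop, gex);
+ //  const int gaplessScore = GaplessFilter_host_protein_converted_blosum62(query, subject, queryLength, subjectLength);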
//sequences must be in to ncbi converted format + int GaplessFilter_host_protein_converted_blosum62( + const char* seq1, + const char* seq2, + const int length1, + const int length2 + ) { + + //const int NEGINFINITY = -10000; + std::vector penalty_H(2*(length2+1)); + + int maxi = 0, result; + for (int index = 0; index <= length2; index++) { + penalty_H[index] = 0; + } + + auto BLOSUM = cudasw4::BLOSUM62_20::get2D(); + + //std::cout << "CPU:\n"; + for (int row = 1; row <= length1; row++) { + char seq1_char = seq1[row-1]; + char seq2_char; + + const int target_row = row & 1; + const int source_row = !target_row; + penalty_H[target_row*(length2+1)] = 0; //gap_open + (row-1)*gap_extend; + for (int col = 1; col <= length2; col++) { + const int diag = penalty_H[source_row*(length2+1)+col-1]; + seq2_char = seq2[col-1]; + + const int residue = BLOSUM[seq1_char][seq2_char]; + result = std::max(0, diag + residue); + penalty_H[target_row*(length2+1)+col] = result; + if (result > maxi) maxi = result; + } + + // for (int col = 1; col <= length2; col++) { + // printf("%2d ", penalty_H[target_row*(length2+1)+col]); + // } + // printf(", max %2d\n", maxi); + } + + return maxi; + } + + template + void runGaplessFilterKernels_PSSM( + OutputScores& d_scores, + GpuPermutedPSSMforGapless& permutedPSSM, + const char* d_inputChars, + const SequenceLengthT* d_inputLengths, + const size_t* d_inputOffsets, + SelectedPositions d_selectedPositions, + size_t numSequences, + char* d_tempStorage, + size_t tempStorageBytes, + cudaStream_t stream + ){ + if(currentQueryLength <= getMaxSingleTileQueryLength_Gapless()){ + auto config = getSingleTileGroupRegConfigForPSSM_Gapless(currentQueryLength); + + if(!config.dpx){ + if(config.approach == GaplessKernelConfig::Approach::hardcodedzero){ + PSSM_2D_View strided_PSSM = permutedPSSM.makeHalf2View(); + hardcodedzero::call_GaplessFilter_strided_PSSM_singletile_kernel( + config.groupsize, config.numRegs, d_inputChars, + d_scores, d_inputOffsets, d_inputLengths, + d_selectedPositions, numSequences, + currentQueryLength, strided_PSSM, stream + ); + }else{ + PSSM_2D_View strided_PSSM = permutedPSSM.makeHalf2View(); + kernelparamzero::call_GaplessFilter_strided_PSSM_singletile_kernel( + config.groupsize, config.numRegs, d_inputChars, + d_scores, d_inputOffsets, d_inputLengths, + d_selectedPositions, numSequences, + currentQueryLength, strided_PSSM, stream + ); + } + }else{ + if(config.approach == GaplessKernelConfig::Approach::hardcodedzero){ + PSSM_2D_View strided_PSSM = permutedPSSM.makeShort2View(); + hardcodedzero::call_GaplessFilter_strided_PSSM_singletile_kernel( + config.groupsize, config.numRegs, d_inputChars, + d_scores, d_inputOffsets, d_inputLengths, + d_selectedPositions, numSequences, + currentQueryLength, strided_PSSM, stream + ); + }else{ + PSSM_2D_View strided_PSSM = permutedPSSM.makeShort2View(); + kernelparamzero::call_GaplessFilter_strided_PSSM_singletile_kernel( + config.groupsize, config.numRegs, d_inputChars, + d_scores, d_inputOffsets, d_inputLengths, + d_selectedPositions, numSequences, + currentQueryLength, strided_PSSM, stream + ); + } + } + }else{ + + + auto config = getMultiTileGroupRegConfigForPSSM_Gapless(currentQueryLength); + + int deviceId = 0; + int numSMs = 0; + cudaGetDevice(&deviceId); + cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, deviceId); + + constexpr int threadBlockSize = 512; + const int numGroupsPerBlock = threadBlockSize / config.groupsize; + + auto dbview = fullDB.getData(); //TODO only consider length of gpu 
partition, not full db + const int maxSubjectLength = dbview.lengths()[dbview.numSequences()-1]; + + //need to store 1 half value per subject position. kernel uses float2 for vectorized stores + //4 halfs per float2 + const size_t tempStorageElementsPerGroup = SDIV(maxSubjectLength, 4); + + const size_t tempStorageElementsPerBlock = tempStorageElementsPerGroup * numGroupsPerBlock; + const size_t tempStorageElementsAvailable = tempStorageBytes / sizeof(float2); + + const size_t maxNumBlocks = tempStorageElementsAvailable / tempStorageElementsPerBlock; + if(maxNumBlocks == 0){ + std::cout << "query with length " << currentQueryLength << " cannot be processed. "; + std::cout << "Not enough temp storage for a single threadblock. setting all scores to 0\n"; + d_scores.setAllScoresToZero(stream); + }else{ + const int numThreadBlocks = numSMs; //std::min(maxNumBlocks, numSequences); + + assert(sizeof(float2) * numThreadBlocks * tempStorageElementsPerBlock <= tempStorageBytes); + + // std::cout << "maxSubjectLength: " << maxSubjectLength << ", numThreadBlocks: " << numThreadBlocks + // << ", tempStorageElementsPerBlock " << tempStorageElementsPerBlock + // << ", tempStorage used: " << sizeof(float2) * size_t(numThreadBlocks) * tempStorageElementsPerBlock + // << " bytes" << "\n"; + + float2* const multiTileTempStorage = (float2*)d_tempStorage; + + if(!config.dpx){ + if(config.approach == GaplessKernelConfig::Approach::hardcodedzero){ + PSSM_2D_View strided_PSSM = permutedPSSM.makeHalf2View(); + hardcodedzero::call_GaplessFilter_strided_PSSM_multitile_kernel( + numThreadBlocks, config.groupsize, config.numRegs, + d_inputChars, + d_scores, d_inputOffsets, d_inputLengths, + d_selectedPositions, numSequences, + currentQueryLength, strided_PSSM, + multiTileTempStorage, tempStorageElementsPerGroup, stream + ); + }else{ + PSSM_2D_View strided_PSSM = permutedPSSM.makeHalf2View(); + kernelparamzero::call_GaplessFilter_strided_PSSM_multitile_kernel( + numThreadBlocks, config.groupsize, config.numRegs, + d_inputChars, + d_scores, d_inputOffsets, d_inputLengths, + d_selectedPositions, numSequences, + currentQueryLength, strided_PSSM, + multiTileTempStorage, tempStorageElementsPerGroup, stream + ); + } + }else{ + if(config.approach == GaplessKernelConfig::Approach::hardcodedzero){ + PSSM_2D_View strided_PSSM = permutedPSSM.makeShort2View(); + hardcodedzero::call_GaplessFilter_strided_PSSM_multitile_kernel( + numThreadBlocks, config.groupsize, config.numRegs, + d_inputChars, + d_scores, d_inputOffsets, d_inputLengths, + d_selectedPositions, numSequences, + currentQueryLength, strided_PSSM, + multiTileTempStorage, tempStorageElementsPerGroup, stream + ); + }else{ + PSSM_2D_View strided_PSSM = permutedPSSM.makeShort2View(); + kernelparamzero::call_GaplessFilter_strided_PSSM_multitile_kernel( + numThreadBlocks, config.groupsize, config.numRegs, + d_inputChars, + d_scores, d_inputOffsets, d_inputLengths, + d_selectedPositions, numSequences, + currentQueryLength, strided_PSSM, + multiTileTempStorage, tempStorageElementsPerGroup, stream + ); + } + } + } + } + } + + + template + void run_SW_endposition_kernels_PSSM( + OutputScores& d_scores, + GpuPermutedPSSMforSW& permutedPSSM, + const char* d_inputChars, + const SequenceLengthT* d_inputLengths, + const size_t* d_inputOffsets, + SelectedPositions d_selectedPositions, + size_t numSequences, + char* d_tempStorage, + size_t tempStorageBytes, + cudaStream_t stream + ){ + + if(currentQueryLength <= getMaxSingleTileQueryLength_SW()){ + auto config = 
getSingleTileGroupRegConfigForPSSM_SW(currentQueryLength); + + int deviceId = 0; + int numSMs = 0; + cudaGetDevice(&deviceId); + cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, deviceId); + + const int numBlocks = std::min(size_t(numSMs), numSequences); + + if(!config.dpx){ + PSSM_2D_View strided_PSSM = permutedPSSM.makeView(); + call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_singletile( + numBlocks, + config.groupsize, config.numRegs, + d_inputChars, + d_scores, d_inputOffsets, d_inputLengths, + d_selectedPositions, numSequences, + currentQueryLength, strided_PSSM, + gop, + gex, + stream + ); + }else{ + PSSM_2D_View strided_PSSM = permutedPSSM.makeView(); + call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_singletile( + numBlocks, + config.groupsize, config.numRegs, + d_inputChars, + d_scores, d_inputOffsets, d_inputLengths, + d_selectedPositions, numSequences, + currentQueryLength, strided_PSSM, + gop, + gex, + stream + ); + } + }else{ + auto config = getMultiTileGroupRegConfigForPSSM_SW(currentQueryLength); + + int deviceId = 0; + int numSMs = 0; + cudaGetDevice(&deviceId); + cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, deviceId); + + constexpr int threadBlockSize = 512; + const int numGroupsPerBlock = threadBlockSize / config.groupsize; + + auto dbview = fullDB.getData(); //TODO only consider length of gpu partition, not full db + const int maxSubjectLength = dbview.lengths()[dbview.numSequences()-1]; + + const int maxSubjectLengthPadded = maxSubjectLength + config.groupsize; + const size_t tempBytesPerGroup = sizeof(float2) * maxSubjectLengthPadded; + const size_t tempBytesPerBlock = tempBytesPerGroup * numGroupsPerBlock; + + //const size_t maxActiveGroups = std::min(numSequences, tempStorageBytes / tempBytesPerGroup); + const size_t maxSimultaneousBlocks = tempStorageBytes / tempBytesPerBlock; + if(maxSimultaneousBlocks == 0){ + std::cout << "query with length " << currentQueryLength << " cannot be processed. "; + std::cout << "Not enough temp storage for a single threadblock. 
setting all scores to 0\n"; + d_scores.setAllScoresToZero(stream); + }else{ + const int numBlocks = std::min(numSequences, std::min(size_t(numSMs), maxSimultaneousBlocks)); + //const int numGroupsInGrid = numBlocks * numGroupsPerBlock; + + // std::cout << "maxSubjectLengthPadded " << maxSubjectLengthPadded << "\n"; + // std::cout << "tempBytesPerGroup " << tempBytesPerGroup << "\n"; + // std::cout << "tempBytesPerBlock " << tempBytesPerBlock << "\n"; + // std::cout << "maxSimultaneousBlocks " << maxSimultaneousBlocks << "\n"; + // std::cout << "numBlocks " << numBlocks << "\n"; + // std::cout << "numGroupsInGrid " << numGroupsInGrid << "\n"; + + if(!config.dpx){ + PSSM_2D_View strided_PSSM = permutedPSSM.makeView(); + call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_multitile( + numBlocks, + config.groupsize, + config.numRegs, + d_inputChars, + d_scores, + d_inputOffsets, + d_inputLengths, + d_selectedPositions, + numSequences, + currentQueryLength, + strided_PSSM, + gop, + gex, + d_tempStorage, + tempBytesPerGroup, + stream + ); + }else{ + PSSM_2D_View strided_PSSM = permutedPSSM.makeView(); + call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_multitile( + numBlocks, + config.groupsize, + config.numRegs, + d_inputChars, + d_scores, + d_inputOffsets, + d_inputLengths, + d_selectedPositions, + numSequences, + currentQueryLength, + strided_PSSM, + gop, + gex, + d_tempStorage, + tempBytesPerGroup, + stream + ); + } + } + } + } + + void setNumTopNoCheck(int value){ + if(value >= 0){ + numTop = value; + updateNumResultsPerQuery(); + + cub::SwitchDevice sd(deviceIds[0]); + const int numGpus = deviceIds.size(); + + h_finalAlignmentScores.resize(results_per_query); + h_finalReferenceIds.resize(results_per_query); + h_finalEndPositions.resize(results_per_query); + d_finalAlignmentScores_allGpus.resize(results_per_query * numGpus); + d_finalReferenceIds_allGpus.resize(results_per_query * numGpus); + d_finalEndPositions_allGpus.resize(results_per_query * numGpus); + } + } + + + std::vector fullDB_numSequencesPerLengthPartition; + std::vector numSequencesPerGpu_total; + std::vector numSequencesPerGpuPrefixSum_total; + + //partition chars of whole DB amongst the gpus + std::vector numSequencesPerLengthPartitionPrefixSum; + std::vector dbPartitionsByLengthPartitioning; + std::vector> subPartitionsForGpus; + std::vector> lengthPartitionIdsForGpus; + std::vector numSequencesPerGpu; + std::vector numSequencesPerGpuPrefixSum; + std::vector gpuStreams; + std::vector gpuEvents; + std::vector> workingSets; + + std::vector> batchPlans; + std::vector> batchPlansDstInfoVec; + + std::vector> batchPlans_cachedDB; + std::vector> batchPlansDstInfoVec_cachedDB; + + bool processingTheFirstQuery = true; + int results_per_query; + SequenceLengthT currentQueryLength; + SequenceLengthT currentQueryLengthWithPadding; + + bool dbIsReady{}; + AnyDBWrapper fullDB; + + mutable std::unique_ptr dbSequenceLengthStatistics; + + //final scan results. 
device data resides on gpu deviceIds[0] + MyPinnedBuffer h_finalAlignmentScores; + MyPinnedBuffer h_finalReferenceIds; + MyPinnedBuffer h_finalEndPositions; + + //MyPinnedBuffer resultNumOverflows; + MyDeviceBuffer d_finalAlignmentScores_allGpus; + MyDeviceBuffer d_finalReferenceIds_allGpus; + MyDeviceBuffer d_finalEndPositions_allGpus; + //MyDeviceBuffer d_resultNumOverflows; + std::unique_ptr scanTimer; + + size_t totalProcessedQueryLengths{}; + size_t totalNumOverflows{}; + std::unique_ptr totalTimer; + + HostGpuPartitionOffsets hostGpuPartitionOffsets; + + std::shared_ptr targetSubjectIds; + + std::vector availableKernelConfigs_gapless_singletile; + std::vector availableKernelConfigs_gapless_multitile; + std::vector availableKernelConfigs_sw_singletile; + std::vector availableKernelConfigs_sw_multitile; + + //-------------------------------------- + bool verbose = false; + int gop = -11; + int gex = -1; + int numTop = 10; + BlosumType blosumType = BlosumType::BLOSUM62_20; + ScanType scanType = ScanType::Gapless; + int maxReduceArraySize = MaxNumberOfResults::value(); + + MemoryConfig memoryConfig; + + std::vector deviceIds; + + }; + + +} //namespace cudasw4 + +#endif + + diff --git a/lib/libmarv/src/dbbatching.cuh b/lib/libmarv/src/dbbatching.cuh new file mode 100644 index 000000000..8e46a7dfa --- /dev/null +++ b/lib/libmarv/src/dbbatching.cuh @@ -0,0 +1,280 @@ +#ifndef DBBATCHING_CUH +#define DBBATCHING_CUH + +#include "config.hpp" + +#include +#include +#include + +#include +#include +#include + +namespace cudasw4{ + + struct DeviceBatchCopyToPinnedPlan{ + struct CopyRange{ + int lengthPartitionId; + int currentCopyPartition; + int currentCopySeqInPartition; + int numToCopy; + }; + size_t usedBytes = 0; + size_t usedSeq = 0; + std::vector h_partitionIds; + std::vector h_numPerPartition; + std::vector copyRanges; + + friend std::ostream& operator<<(std::ostream& os, const DeviceBatchCopyToPinnedPlan& plan){ + os << "usedBytes " << plan.usedBytes << ", usedSeq " << plan.usedSeq << " "; + for(int i = 0; i < int(plan.h_partitionIds.size()); i++){ + os << "(" << plan.h_partitionIds[i] << "," << plan.h_numPerPartition[i] << ") "; + } + + return os; + } + }; + + struct ExecutePinnedCopyCallbackData{ + const DeviceBatchCopyToPinnedPlan* planPtr; + char* h_chardata; + SequenceLengthT* h_lengthdata; + size_t* h_offsetdata; + const std::vector* dbPartitionsPtr; + }; + + void executeCopyPlanH2DDirect( + const DeviceBatchCopyToPinnedPlan& plan, + char* d_chardata, + SequenceLengthT* d_lengthdata, + size_t* d_offsetdata, + const std::vector& dbPartitions, + cudaStream_t stream + ){ + size_t usedBytes = 0; + size_t usedSeq = 0; + for(const auto& copyRange : plan.copyRanges){ + const auto& dbPartition = dbPartitions[copyRange.currentCopyPartition]; + const auto& firstSeq = copyRange.currentCopySeqInPartition; + const auto& numToCopy = copyRange.numToCopy; + size_t numBytesToCopy = dbPartition.offsets()[firstSeq + numToCopy] - dbPartition.offsets()[firstSeq]; + + cudaMemcpyAsync( + d_chardata + usedBytes, + dbPartition.chars() + dbPartition.offsets()[firstSeq], + numBytesToCopy, + H2D, + stream + ); CUERR; + cudaMemcpyAsync( + d_lengthdata + usedSeq, + dbPartition.lengths() + firstSeq, + sizeof(SequenceLengthT) * numToCopy, + H2D, + stream + ); CUERR; + cudaMemcpyAsync( + d_offsetdata + usedSeq, + dbPartition.offsets() + firstSeq, + sizeof(size_t) * (numToCopy+1), + H2D, + stream + ); CUERR; + thrust::for_each( + thrust::cuda::par_nosync.on(stream), + d_offsetdata + usedSeq, + d_offsetdata + usedSeq 
+ (numToCopy+1), + [ + usedBytes, + firstOffset = dbPartition.offsets()[firstSeq] + ] __device__ (size_t& off){ + off = off - firstOffset + usedBytes; + } + ); + + usedBytes += numBytesToCopy; + usedSeq += numToCopy; + } + }; + + void executePinnedCopyPlanSerial( + const DeviceBatchCopyToPinnedPlan& plan, + char* h_chardata, + SequenceLengthT* h_lengthdata, + size_t* h_offsetdata, + const std::vector& dbPartitions + ){ + size_t usedBytes = 0; + size_t usedSeq = 0; + for(const auto& copyRange : plan.copyRanges){ + const auto& dbPartition = dbPartitions[copyRange.currentCopyPartition]; + const auto& firstSeq = copyRange.currentCopySeqInPartition; + const auto& numToCopy = copyRange.numToCopy; + size_t numBytesToCopy = dbPartition.offsets()[firstSeq + numToCopy] - dbPartition.offsets()[firstSeq]; + + auto end = std::copy( + dbPartition.chars() + dbPartition.offsets()[firstSeq], + dbPartition.chars() + dbPartition.offsets()[firstSeq + numToCopy], + h_chardata + usedBytes + ); + std::copy( + dbPartition.lengths() + firstSeq, + dbPartition.lengths() + firstSeq+numToCopy, + h_lengthdata + usedSeq + ); + std::transform( + dbPartition.offsets() + firstSeq, + dbPartition.offsets() + firstSeq + (numToCopy+1), + h_offsetdata + usedSeq, + [&](size_t off){ + return off - dbPartition.offsets()[firstSeq] + usedBytes; + } + ); + usedBytes += std::distance(h_chardata + usedBytes, end); + usedSeq += numToCopy; + } + }; + + void executePinnedCopyPlanSerialAndTransferToGpu( + const DeviceBatchCopyToPinnedPlan& plan, + char* h_chardata, + SequenceLengthT* h_lengthdata, + size_t* /*h_offsetdata*/, + char* d_chardata, + SequenceLengthT* d_lengthdata, + size_t* d_offsetdata, + const std::vector& dbPartitions, + cudaStream_t H2DcopyStream + ){ + + size_t usedBytes = 0; + for(const auto& copyRange : plan.copyRanges){ + const auto& dbPartition = dbPartitions[copyRange.currentCopyPartition]; + const auto& firstSeq = copyRange.currentCopySeqInPartition; + const auto& numToCopy = copyRange.numToCopy; + size_t numBytesToCopy = dbPartition.offsets()[firstSeq + numToCopy] - dbPartition.offsets()[firstSeq]; + constexpr size_t maxTransferBatchSize = 8 * 1024 * 1024; + for(size_t i = 0; i < numBytesToCopy; i += maxTransferBatchSize){ + const size_t x = std::min(numBytesToCopy - i, maxTransferBatchSize); + + std::copy_n( + dbPartition.chars() + dbPartition.offsets()[firstSeq] + i, + x, + h_chardata + usedBytes + i + ); + cudaMemcpyAsync( + d_chardata + usedBytes + i, + h_chardata + usedBytes + i, + x, + H2D, + H2DcopyStream + ); CUERR; + } + + // auto end = std::copy( + // dbPartition.chars() + dbPartition.offsets()[firstSeq], + // dbPartition.chars() + dbPartition.offsets()[firstSeq + numToCopy], + // h_chardata + usedBytes + // ); + // cudaMemcpyAsync( + // d_chardata + usedBytes, + // h_chardata + usedBytes, + // numBytesToCopy, + // H2D, + // H2DcopyStream + // ); CUERR; + + usedBytes += numBytesToCopy; + } + + size_t usedSeq = 0; + for(const auto& copyRange : plan.copyRanges){ + const auto& dbPartition = dbPartitions[copyRange.currentCopyPartition]; + const auto& firstSeq = copyRange.currentCopySeqInPartition; + const auto& numToCopy = copyRange.numToCopy; + + std::copy( + dbPartition.lengths() + firstSeq, + dbPartition.lengths() + firstSeq+numToCopy, + h_lengthdata + usedSeq + ); + // cudaMemcpyAsync( + // d_lengthdata + usedSeq, + // h_lengthdata + usedSeq, + // sizeof(size_t) * numToCopy, + // H2D, + // H2DcopyStream + // ); CUERR; + + usedSeq += numToCopy; + } + cudaMemcpyAsync( + d_lengthdata, + h_lengthdata, + 
sizeof(SequenceLengthT) * plan.usedSeq, + H2D, + H2DcopyStream + ); CUERR; + + cudaMemsetAsync(d_offsetdata, 0, sizeof(size_t), H2DcopyStream); CUERR; + + auto d_paddedLengths = thrust::make_transform_iterator( + d_lengthdata, + [] __host__ __device__ (const SequenceLengthT& length){ + return size_t(SDIV(length, 4) * 4); + } + ); + + thrust::inclusive_scan( + thrust::cuda::par_nosync(thrust_async_allocator(H2DcopyStream)).on(H2DcopyStream), + d_paddedLengths, + d_paddedLengths + plan.usedSeq, + d_offsetdata + 1 + ); + + }; + + void executePinnedCopyPlanCallback(void* args){ + ExecutePinnedCopyCallbackData* callbackData = (ExecutePinnedCopyCallbackData*)args; + const auto& plan = *callbackData->planPtr; + auto& dbPartitions = *callbackData->dbPartitionsPtr; + + + executePinnedCopyPlanSerial( + plan, + callbackData->h_chardata, + callbackData->h_lengthdata, + callbackData->h_offsetdata, + dbPartitions + ); + + delete callbackData; + } + + void executePinnedCopyPlanWithHostCallback( + const DeviceBatchCopyToPinnedPlan& plan, + char* h_chardata, + SequenceLengthT* h_lengthdata, + size_t* h_offsetdata, + const std::vector& dbPartitions, + cudaStream_t stream + ){ + ExecutePinnedCopyCallbackData* data = new ExecutePinnedCopyCallbackData; + + data->planPtr = &plan; + data->h_chardata = h_chardata, + data->h_lengthdata = h_lengthdata, + data->h_offsetdata = h_offsetdata, + data->dbPartitionsPtr = &dbPartitions; + + cudaLaunchHostFunc( + stream, + executePinnedCopyPlanCallback, + (void*)data + ); CUERR; + } + +} + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/dbdata.cpp b/lib/libmarv/src/dbdata.cpp new file mode 100644 index 000000000..d87391100 --- /dev/null +++ b/lib/libmarv/src/dbdata.cpp @@ -0,0 +1,353 @@ +#include "dbdata.hpp" +#include "length_partitions.hpp" + +#include "hpc_helpers/all_helpers.cuh" + +#include +#include +#include +#include +#include + +namespace cudasw4{ + + //write vector to file, overwrites existing file + template + void writeTrivialVectorToFile(const std::vector& vec, const std::string& filename){ + static_assert(std::is_trivially_copyable::value, "writeTrivialVectorToFile: type not trivially copyable"); + + std::ofstream out(filename, std::ios::binary); + if(!out) throw std::runtime_error("Cannot open output file " + filename); + out.write((const char*)vec.data(), sizeof(T) * vec.size()); + } + + template + void loadTrivialVectorFromFile(std::vector& vec, const std::string& filename){ + static_assert(std::is_trivially_copyable::value, "writeTrivialVectorToFile: type not trivially copyable"); + + auto getFileSizeInBytes = [](const std::string& filename) -> size_t{ + struct stat stat_buf; + int rc = stat(filename.c_str(), &stat_buf); + if(rc == 0){ + return stat_buf.st_size; + }else{ + throw std::runtime_error("Could not determine file size of file " + filename); + } + }; + + size_t bytes = getFileSizeInBytes(filename); + vec.resize(SDIV(bytes, sizeof(T))); + + std::ifstream inputstream(filename, std::ios::binary); + if(!inputstream) throw std::runtime_error("Cannot open file " + filename); + inputstream.read((char*)vec.data(), bytes); + } + +void loadDBdata(const std::string& inputPrefix, DBdata& result, bool writeAccess, bool prefetchSeq, size_t globalSequenceOffset){ + + + MappedFile::Options headerOptions; + headerOptions.readaccess = true; + headerOptions.writeaccess = writeAccess; + headerOptions.prefault = false; + + result.mappedFileHeaders = std::make_unique(inputPrefix + DBdataIoConfig::headerfilename(), headerOptions); + 
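// (editorial sketch, not part of the original patch logic) The prefix names a set of flat files
+    // sharing one index space; with the DBdataIoConfig file names and the DBdata accessors,
+    // sequence i occupies chars()[ offsets()[i] .. offsets()[i] + lengths()[i] ) and its header
+    // occupies headers()[ headerOffsets()[i] .. headerOffsets()[i+1] ), offsets() holding
+    // numSequences()+1 entries.
+    // The partition loop at the end of this function bins sequences by length via std::lower_bound,
+    // which relies on the lengths array being sorted in ascending order: a length k belongs to
+    // partition i iff boundaries[i-1] < k <= boundaries[i], and searching for boundaries[i]+1 keeps
+    // k == boundaries[i] inside partition i. Worked example with hypothetical boundaries {64, 128}
+    // (real values come from getLengthPartitionBoundaries()):
+    //   lengths = {10, 64, 64, 70, 128}  ->  partition 0 gets 3 sequences, partition 1 gets 2. +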
result.mappedFileHeaderOffsets = std::make_unique(inputPrefix + DBdataIoConfig::headeroffsetsfilename(), headerOptions); + + + MappedFile::Options sequenceOptions; + sequenceOptions.readaccess = true; + sequenceOptions.writeaccess = writeAccess; + sequenceOptions.prefault = prefetchSeq; + + result.mappedFileSequences = std::make_unique(inputPrefix + DBdataIoConfig::sequencesfilename(), sequenceOptions); + result.mappedFileLengths = std::make_unique(inputPrefix + DBdataIoConfig::sequencelengthsfilename(), sequenceOptions); + result.mappedFileOffsets = std::make_unique(inputPrefix + DBdataIoConfig::sequenceoffsetsfilename(), sequenceOptions); + + result.globalSequenceOffset = globalSequenceOffset; + + // std::ifstream metadatain(inputPrefix + DBdataIoConfig::metadatafilename(), std::ios::binary); + // if(!metadatain) throw std::runtime_error("Cannot open file " + inputPrefix + DBdataIoConfig::metadatafilename()); + + // int numPartitions = 0; + // metadatain.read((char*)&numPartitions, sizeof(int)); + + // result.metaData.lengthBoundaries.resize(numPartitions); + // result.metaData.numSequencesPerLengthPartition.resize(numPartitions); + // metadatain.read((char*)result.metaData.lengthBoundaries.data(), sizeof(int) * numPartitions); + // metadatain.read((char*)result.metaData.numSequencesPerLengthPartition.data(), sizeof(size_t) * numPartitions); + // + // auto expectedBoundaries = getLengthPartitionBoundaries(); + // if(expectedBoundaries.size() != result.metaData.lengthBoundaries.size()){ + // throw std::runtime_error("Invalid partition info in metadata."); + // } + // for(int i = 0; i < numPartitions; i++){ + // if(expectedBoundaries[i] != result.metaData.lengthBoundaries[i]){ + // throw std::runtime_error("Invalid partition info in metadata."); + // } + // } + + + auto lengthBoundaries = getLengthPartitionBoundaries(); + // std::vector lengthBoundaries; + // for(int l = 64; l <= 8192; l += 64){ + // lengthBoundaries.push_back(l); + // } + const int numPartitions = lengthBoundaries.size(); + result.metaData.lengthBoundaries.resize(numPartitions); + result.metaData.numSequencesPerLengthPartition.resize(numPartitions); + + auto partitionBegin = result.lengths(); + for(int i = 0; i < numPartitions; i++){ + //length k is in partition i if boundaries[i-1] < k <= boundaries[i] + SequenceLengthT searchFor = lengthBoundaries[i]; + if(searchFor < std::numeric_limits::max()){ + searchFor += 1; + } + auto partitionEnd = std::lower_bound( + partitionBegin, + result.lengths() + result.numSequences(), + searchFor + ); + result.metaData.lengthBoundaries[i] = lengthBoundaries[i]; + result.metaData.numSequencesPerLengthPartition[i] = std::distance(partitionBegin, partitionEnd); + partitionBegin = partitionEnd; + } +} + +void loadDBdata(const std::string& inputPrefix, DBdataWithVectors& result, size_t globalSequenceOffset){ + + loadTrivialVectorFromFile(result.vecFileHeaders, inputPrefix + DBdataIoConfig::headerfilename()); + loadTrivialVectorFromFile(result.vecFileHeaderOffsets, inputPrefix + DBdataIoConfig::headeroffsetsfilename()); + loadTrivialVectorFromFile(result.vecFileSequences, inputPrefix + DBdataIoConfig::sequencesfilename()); + loadTrivialVectorFromFile(result.vecFileLengths, inputPrefix + DBdataIoConfig::sequencelengthsfilename()); + loadTrivialVectorFromFile(result.vecFileOffsets, inputPrefix + DBdataIoConfig::sequenceoffsetsfilename()); + + // size_t headerBytes = getFileSizeInBytes(inputPrefix + DBdataIoConfig::headerfilename()); + // size_t headerOffsetsBytes = 
getFileSizeInBytes(inputPrefix + DBdataIoConfig::headeroffsetsfilename()); + // size_t sequenceBytes = getFileSizeInBytes(inputPrefix + DBdataIoConfig::sequencesfilename()); + // size_t lengthBytes = getFileSizeInBytes(inputPrefix + DBdataIoConfig::sequencelengthsfilename()); + // size_t offsetsBytes = getFileSizeInBytes(inputPrefix + DBdataIoConfig::sequenceoffsetsfilename()); + + // result.vecFileHeaders.resize(headerBytes); + // result.vecFileHeaderOffsets.resize(SDIV(headerOffsetsBytes, sizeof(size_t))); + // result.vecFileSequences.resize(sequenceBytes); + // result.vecFileLengths.resize(SDIV(lengthBytes, sizeof(SequenceLengthT))); + // result.vecFileOffsets.resize(SDIV(offsetsBytes, sizeof(size_t))); + + + + // MappedFile::Options headerOptions; + // headerOptions.readaccess = true; + // headerOptions.writeaccess = writeAccess; + // headerOptions.prefault = false; + + // result.mappedFileHeaders = std::make_unique(inputPrefix + DBdataIoConfig::headerfilename(), headerOptions); + // result.mappedFileHeaderOffsets = std::make_unique(inputPrefix + DBdataIoConfig::headeroffsetsfilename(), headerOptions); + + + // MappedFile::Options sequenceOptions; + // sequenceOptions.readaccess = true; + // sequenceOptions.writeaccess = writeAccess; + // sequenceOptions.prefault = prefetchSeq; + + // result.mappedFileSequences = std::make_unique(inputPrefix + DBdataIoConfig::sequencesfilename(), sequenceOptions); + // result.mappedFileLengths = std::make_unique(inputPrefix + DBdataIoConfig::sequencelengthsfilename(), sequenceOptions); + // result.mappedFileOffsets = std::make_unique(inputPrefix + DBdataIoConfig::sequenceoffsetsfilename(), sequenceOptions); + + result.globalSequenceOffset = globalSequenceOffset; + + auto lengthBoundaries = getLengthPartitionBoundaries(); + + const int numPartitions = lengthBoundaries.size(); + result.metaData.lengthBoundaries.resize(numPartitions); + result.metaData.numSequencesPerLengthPartition.resize(numPartitions); + + auto partitionBegin = result.lengths(); + for(int i = 0; i < numPartitions; i++){ + //length k is in partition i if boundaries[i-1] < k <= boundaries[i] + SequenceLengthT searchFor = lengthBoundaries[i]; + if(searchFor < std::numeric_limits::max()){ + searchFor += 1; + } + auto partitionEnd = std::lower_bound( + partitionBegin, + result.lengths() + result.numSequences(), + searchFor + ); + result.metaData.lengthBoundaries[i] = lengthBoundaries[i]; + result.metaData.numSequencesPerLengthPartition[i] = std::distance(partitionBegin, partitionEnd); + partitionBegin = partitionEnd; + } +} + + + + + + + + + +void writeGlobalDbInfo(const std::string& outputPrefix, const DBGlobalInfo& /*info*/){ + //write info data to metadata file + std::ofstream metadataout(outputPrefix + DBdataIoConfig::metadatafilename(), std::ios::binary); + if(!metadataout) throw std::runtime_error("Cannot open output file " + outputPrefix + DBdataIoConfig::metadatafilename()); + +} + +void readGlobalDbInfo(const std::string& prefix, DBGlobalInfo& /*info*/){ + //write info data to metadata file + std::ifstream metadatain(prefix + DBdataIoConfig::metadatafilename(), std::ios::binary); + if(!metadatain) throw std::runtime_error("Cannot open file " + prefix + DBdataIoConfig::metadatafilename()); + +} + + +DB loadDB(const std::string& prefix, bool writeAccess, bool prefetchSeq){ + + try{ + DB result; + readGlobalDbInfo(prefix, result.info); + + const std::string chunkPrefix = prefix + std::to_string(0); + result.data = DBdata(chunkPrefix, writeAccess, prefetchSeq, 0); + + return result; + 
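// (editorial note) Failures from the memory-mapping layer are translated below into a single
+        // LoadDBException, so callers only need to handle one exception type. Hypothetical usage:
+        //   try { DB db = loadDB(prefix, /*writeAccess=*/false, /*prefetchSeq=*/true); }
+        //   catch (const LoadDBException& e) { std::cerr << e.what() << '\n'; } +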
}catch(const MappedFileException& ex){ + throw LoadDBException(ex.what()); + }catch(...){ + throw LoadDBException(); + } +} + +DBWithVectors loadDBWithVectors(const std::string& prefix){ + try{ + DBWithVectors result; + readGlobalDbInfo(prefix, result.info); + + const std::string chunkPrefix = prefix + std::to_string(0); + result.data = DBdataWithVectors(chunkPrefix, 0); + + return result; + }catch(...){ + throw LoadDBException(); + } +} + + +PseudoDB loadPseudoDB(size_t num, size_t length, bool allSameSequences, int randomseed){ + PseudoDB result; + result.data = PseudoDBdata(num, length, allSameSequences, randomseed); + + return result; +} + +MMseqsDB loadMMseqsDB(size_t numSeqs, char * data, size_t *offset, int32_t* length, size_t dbCharSize ){ + MMseqsDB result; + result.data = MMseqsDBdata(numSeqs, data, offset, length, dbCharSize); + + return result; +} + +ExternalDB loadExternalDB(size_t numSeqs, size_t dbCharSize, DBdataMetaData metaData){ + ExternalDB result; + result.data = ExternalDBdata(numSeqs, dbCharSize, metaData); + + return result; +} + +std::vector partitionDBdata_by_numberOfSequences(const DBdataView& parent, size_t maxNumSequencesPerPartition){ + + const size_t numSequences = parent.numSequences(); + std::vector result; + + for(size_t i = 0; i < numSequences; i += maxNumSequencesPerPartition){ + const size_t numInPartition = std::min(maxNumSequencesPerPartition, numSequences - i); + result.emplace_back(parent, i, i + numInPartition); + } + + return result; +} + +//partitions have the smallest number of chars such that is at least numCharsPerPartition. (with the exception of last partition) +std::vector partitionDBdata_by_numberOfChars(const DBdataView& parent, size_t numCharsPerPartition){ + + const size_t numChars = parent.numChars(); + + std::vector bucketLimits(1,0); + size_t currentBegin = parent.offsets()[0]; + const size_t end = parent.offsets()[0] + numChars; + while(currentBegin < end){ + const size_t searchBegin = currentBegin + numCharsPerPartition; + const auto it = std::upper_bound(parent.offsets(), parent.offsets() + parent.numSequences()+1, searchBegin); + if(it == parent.offsets() + parent.numSequences()+1){ + bucketLimits.push_back(parent.numSequences()); + currentBegin = end; + }else{ + const size_t dist = std::distance(parent.offsets(), it); + bucketLimits.push_back(dist); + currentBegin = parent.offsets()[dist]; + } + } + + const size_t numPartitions = bucketLimits.size()-1; + std::vector result; + for(size_t p = 0; p < numPartitions; p++){ + result.emplace_back(parent, bucketLimits[p], bucketLimits[p+1]); + } + + return result; +} + + + +void assertValidPartitioning(const std::vector& views, const DBdataView& parent){ + const int numPartitions = views.size(); + + std::vector partitionOffsets(numPartitions+1, 0); + for(int p = 1; p <= numPartitions; p++){ + partitionOffsets[p] = partitionOffsets[p-1] + views[p-1].numSequences(); + } + + + const size_t totalNumSequencesInViews = partitionOffsets.back(); + // const size_t totalNumSequencesInViews = std::reduce(views.begin(), views.end(), + // [](const auto& v){return v.numSequences();} + // ); + + assert(parent.numSequences() == totalNumSequencesInViews); + + #pragma omp parallel for + for(int p = 0; p < numPartitions; p++){ + const DBdataView& view = views[p]; + + for(size_t i = 0; i < view.numSequences(); i++){ + assert(view.lengths()[i] == parent.lengths()[partitionOffsets[p] + i]); + assert(view.offsets()[i] == parent.offsets()[partitionOffsets[p] + i]); + assert(view.headerOffsets()[i] == 
parent.headerOffsets()[partitionOffsets[p] + i]); + + const char* const viewSeqEnd = view.chars() + view.offsets()[i] + view.lengths()[i]; + const char* const dbSeqEnd = parent.chars() + parent.offsets()[partitionOffsets[p] + i] + parent.lengths()[i]; + auto mismatchSeq = std::mismatch( + view.chars() + view.offsets()[i], + viewSeqEnd, + parent.chars() + parent.offsets()[partitionOffsets[p] + i], + dbSeqEnd + ); + assert(mismatchSeq.first == viewSeqEnd || mismatchSeq.second == dbSeqEnd); + + const char* const viewHeaderEnd = view.headers() + view.headerOffsets()[i+1]; + const char* const dbHeaderEnd = parent.headers() + parent.headerOffsets()[partitionOffsets[p] + i+1]; + auto mismatchHeader = std::mismatch( + view.headers() + view.headerOffsets()[i], + viewHeaderEnd, + parent.headers() + parent.headerOffsets()[partitionOffsets[p] + i], + dbHeaderEnd + ); + assert(mismatchHeader.first == viewHeaderEnd || mismatchHeader.second == dbHeaderEnd); + } + } +} + +} //namespace cudasw4 \ No newline at end of file diff --git a/lib/libmarv/src/dbdata.hpp b/lib/libmarv/src/dbdata.hpp new file mode 100644 index 000000000..8b1f6517e --- /dev/null +++ b/lib/libmarv/src/dbdata.hpp @@ -0,0 +1,807 @@ +#ifndef DB_DATA_HPP +#define DB_DATA_HPP + +#include "mapped_file.hpp" +#include "sequence_io.h" +#include "length_partitions.hpp" +#include "convert.cuh" + +#include "config.hpp" + +#include +#include +#include +#include +#include +#include +#include + +namespace cudasw4{ + +struct DBdataIoConfig{ + static const std::string metadatafilename(){ return "metadata"; } + static const std::string headerfilename(){ return "headers"; } + static const std::string headeroffsetsfilename(){ return "headeroffsets"; } + static const std::string sequencesfilename(){ return "chars"; } + static const std::string sequenceoffsetsfilename(){ return "offsets"; } + static const std::string sequencelengthsfilename(){ return "lengths"; } +}; + +class LoadDBException : public std::exception{ + std::string message; +public: + LoadDBException() : LoadDBException("LoadDBException"){} + LoadDBException(const std::string& msg) : message(msg){} + + const char* what() const noexcept override { + return message.c_str(); + } +}; + + +struct DBGlobalInfo{ + +}; + + +struct DBdataMetaData{ + std::vector lengthBoundaries; + std::vector numSequencesPerLengthPartition; +}; + +struct DBdata{ + friend void loadDBdata(const std::string& inputPrefix, DBdata& result, bool writeAccess, bool prefetchSeq, size_t globalSequenceOffset); + friend struct DB; + + DBdata(const std::string& inputPrefix, bool writeAccess, bool prefetchSeq, size_t globalSequenceOffset = 0){ + loadDBdata(inputPrefix, *this, writeAccess, prefetchSeq, globalSequenceOffset); + } + + DBdata(const DBdata&) = delete; + DBdata(DBdata&&) = default; + DBdata& operator=(const DBdata&) = delete; + DBdata& operator=(DBdata&&) = default; + + size_t getGlobalSequenceOffset() const noexcept{ + return globalSequenceOffset; + } + + size_t numSequences() const noexcept{ + return mappedFileLengths->numElements(); + } + + size_t numChars() const noexcept{ + return mappedFileSequences->numElements(); + } + + const char* chars() const noexcept{ + return mappedFileSequences->data(); + } + + const SequenceLengthT* lengths() const noexcept{ + return reinterpret_cast(mappedFileLengths->data()); + } + + const size_t* offsets() const noexcept{ + return reinterpret_cast(mappedFileOffsets->data()); + } + + const char* headers() const noexcept{ + return mappedFileHeaders->data(); + } + + const size_t* 
headerOffsets() const noexcept{ + return reinterpret_cast(mappedFileHeaderOffsets->data()); + } + + const DBdataMetaData& getMetaData() const noexcept{ + return metaData; + } + + char* chars() noexcept{ + return mappedFileSequences->data(); + } + + SequenceLengthT* lengths() noexcept{ + return reinterpret_cast(mappedFileLengths->data()); + } + + size_t* offsets() noexcept{ + return reinterpret_cast(mappedFileOffsets->data()); + } + + char* headers() noexcept{ + return mappedFileHeaders->data(); + } + + size_t* headerOffsets() noexcept{ + return reinterpret_cast(mappedFileHeaderOffsets->data()); + } + + +private: + DBdata() = default; + + size_t globalSequenceOffset; + std::unique_ptr mappedFileSequences; + std::unique_ptr mappedFileLengths; + std::unique_ptr mappedFileOffsets; + std::unique_ptr mappedFileHeaders; + std::unique_ptr mappedFileHeaderOffsets; + DBdataMetaData metaData; +}; + + + + +struct DBdataWithVectors{ + friend void loadDBdata(const std::string& inputPrefix, DBdataWithVectors& result, size_t globalSequenceOffset); + friend struct DBWithVectors; + + DBdataWithVectors(const std::string& inputPrefix, size_t globalSequenceOffset = 0){ + loadDBdata(inputPrefix, *this, globalSequenceOffset); + } + + DBdataWithVectors(const DBdataWithVectors&) = delete; + DBdataWithVectors(DBdataWithVectors&&) = default; + DBdataWithVectors& operator=(const DBdataWithVectors&) = delete; + DBdataWithVectors& operator=(DBdataWithVectors&&) = default; + + size_t getGlobalSequenceOffset() const noexcept{ + return globalSequenceOffset; + } + + size_t numSequences() const noexcept{ + return vecFileLengths.size(); + } + + size_t numChars() const noexcept{ + return vecFileSequences.size(); + } + + const char* chars() const noexcept{ + return vecFileSequences.data(); + } + + const SequenceLengthT* lengths() const noexcept{ + return vecFileLengths.data(); + } + + const size_t* offsets() const noexcept{ + return vecFileOffsets.data(); + } + + const char* headers() const noexcept{ + return vecFileHeaders.data(); + } + + const size_t* headerOffsets() const noexcept{ + return vecFileHeaderOffsets.data(); + } + + const DBdataMetaData& getMetaData() const noexcept{ + return metaData; + } + + char* chars() noexcept{ + return vecFileSequences.data(); + } + + SequenceLengthT* lengths() noexcept{ + return vecFileLengths.data(); + } + + size_t* offsets() noexcept{ + return vecFileOffsets.data(); + } + + char* headers() noexcept{ + return vecFileHeaders.data(); + } + + size_t* headerOffsets() noexcept{ + return vecFileHeaderOffsets.data(); + } + + +private: + DBdataWithVectors() = default; + + size_t globalSequenceOffset; + std::vector vecFileSequences; + std::vector vecFileLengths; + std::vector vecFileOffsets; + std::vector vecFileHeaders; + std::vector vecFileHeaderOffsets; + DBdataMetaData metaData; +}; + +struct PseudoDBdata{ + friend struct PseudoDB; + + PseudoDBdata(size_t num, SequenceLengthT length, bool allSameSequences, int randomseed = 42) + : lengthRounded(((length + 3) / 4) * 4), + charvec(num * lengthRounded), + lengthvec(num), + offsetvec(num+1), + headervec(num), //headers will be only 1 letter + headeroffsetvec(num+1) + { + const char* letters = "ARNDCQEGHILKMFPSTWYV"; + + std::mt19937 gen(randomseed); + std::uniform_int_distribution<> dist(0,19); + + if(allSameSequences){ + std::string dummyseq(length, ' '); + for(SequenceLengthT i = 0; i < length; i++){ + dummyseq[i] = letters[dist(gen)]; + } + //std::cout << "PseudoDBdata: num " << num << ", length " << length << ", sequence " << dummyseq << 
"\n"; + + for(size_t i = 0; i < num; i++){ + offsetvec[i] = i * lengthRounded; + std::copy(dummyseq.begin(), dummyseq.end(), charvec.begin() + i * lengthRounded); + } + offsetvec[num] = num * lengthRounded; + }else{ + for(char& c : charvec){ + c = letters[dist(gen)]; + } + for(size_t i = 0; i < num; i++){ + offsetvec[i] = i * lengthRounded; + } + offsetvec[num] = num * lengthRounded; + } + + std::fill(lengthvec.begin(), lengthvec.end(), length); + + std::fill(headervec.begin(), headervec.end(), 'H'); + std::iota(headeroffsetvec.begin(), headeroffsetvec.end(), size_t(0)); + + //convert amino acids to integers + std::transform(charvec.begin(), charvec.end(), charvec.begin(), ConvertAA_20{}); + + + auto boundaries = getLengthPartitionBoundaries(); + + metaData.lengthBoundaries.insert(metaData.lengthBoundaries.end(), boundaries.begin(), boundaries.end()); + metaData.numSequencesPerLengthPartition.resize(boundaries.size()); + + for(int i = 0; i < int(boundaries.size()); i++){ + SequenceLengthT lower = i == 0 ? 0 : boundaries[i-1]; + SequenceLengthT upper = boundaries[i]; + + if(lower < length && length <= upper){ + metaData.numSequencesPerLengthPartition[i] = num; + }else{ + metaData.numSequencesPerLengthPartition[i] = 0; + } + } + } + + PseudoDBdata(const PseudoDBdata&) = delete; + PseudoDBdata(PseudoDBdata&&) = default; + PseudoDBdata& operator=(const PseudoDBdata&) = delete; + PseudoDBdata& operator=(PseudoDBdata&&) = default; + + size_t getGlobalSequenceOffset() const noexcept{ + return 0; + } + + size_t numSequences() const noexcept{ + return lengthvec.size(); + } + + size_t numChars() const noexcept{ + return charvec.size(); + } + + const char* chars() const noexcept{ + return charvec.data(); + } + + const SequenceLengthT* lengths() const noexcept{ + return lengthvec.data(); + } + + const size_t* offsets() const noexcept{ + return offsetvec.data(); + } + + const char* headers() const noexcept{ + return headervec.data(); + } + + const size_t* headerOffsets() const noexcept{ + return headeroffsetvec.data(); + } + + const DBdataMetaData& getMetaData() const noexcept{ + return metaData; + } + +private: + + PseudoDBdata() = default; + + size_t lengthRounded; + std::vector charvec; + std::vector lengthvec; + std::vector offsetvec; + std::vector headervec; + std::vector headeroffsetvec; + DBdataMetaData metaData; +}; + + +struct MMseqsDBdata{ + friend struct MMseqsDB; + + MMseqsDBdata(size_t numSeqs, char *data, size_t *offset, int32_t *length, size_t dbCharSize) : + numSeqs(numSeqs), + data(data), + offset(offset), + length(length), + dbCharSize(dbCharSize) + { + auto lengthBoundaries = getLengthPartitionBoundaries(); + const int numPartitions = lengthBoundaries.size(); + + metaData.lengthBoundaries.insert(metaData.lengthBoundaries.end(), lengthBoundaries.begin(), lengthBoundaries.end()); + metaData.numSequencesPerLengthPartition.resize(lengthBoundaries.size()); + + auto partitionBegin = lengths(); + for (int i = 0; i < numPartitions; i++) { + //length k is in partition i if boundaries[i-1] < k <= boundaries[i] + SequenceLengthT searchFor = lengthBoundaries[i]; + if (searchFor < std::numeric_limits::max()) { + searchFor += 1; + } + auto partitionEnd = std::lower_bound(partitionBegin, lengths() + numSequences(), searchFor); + metaData.lengthBoundaries[i] = lengthBoundaries[i]; + metaData.numSequencesPerLengthPartition[i] = std::distance(partitionBegin, partitionEnd); + partitionBegin = partitionEnd; + } + } + + MMseqsDBdata(const MMseqsDBdata&) = delete; + MMseqsDBdata(MMseqsDBdata&&) = default; 
+ MMseqsDBdata& operator=(const MMseqsDBdata&) = delete; + MMseqsDBdata& operator=(MMseqsDBdata&&) = default; + + size_t getGlobalSequenceOffset() const noexcept{ + return 0; + } + + size_t numSequences() const noexcept{ + return numSeqs; + } + + size_t numChars() const noexcept{ + return dbCharSize; + } + + const char* chars() const noexcept{ + return data; + } + + const SequenceLengthT* lengths() const noexcept{ + return length; + } + + const size_t* offsets() const noexcept{ + return offset; + } + + const char* headers() const noexcept{ + return data; + } + + const size_t* headerOffsets() const noexcept{ + return offset; + } + + const DBdataMetaData& getMetaData() const noexcept{ + return metaData; + } + +private: + + MMseqsDBdata() = default; + + size_t numSeqs; + char * data; + size_t *offset; + SequenceLengthT *length; + size_t dbCharSize; + std::vector headervec; + std::vector headeroffsetvec; + DBdataMetaData metaData; +}; + +struct ExternalDBdata { + friend struct ExternalDB; + + ExternalDBdata(size_t numSeqs, size_t dbCharSize, DBdataMetaData metaData) : + numSeqs(numSeqs), + dbCharSize(dbCharSize), + metaData(metaData) + {} + + ExternalDBdata(const ExternalDBdata&) = delete; + ExternalDBdata(ExternalDBdata&&) = default; + ExternalDBdata& operator=(const ExternalDBdata&) = delete; + ExternalDBdata& operator=(ExternalDBdata&&) = default; + + size_t getGlobalSequenceOffset() const noexcept{ + return 0; + } + + size_t numSequences() const noexcept{ + return numSeqs; + } + + size_t numChars() const noexcept{ + return dbCharSize; + } + + const char* chars() const noexcept{ + return NULL; + } + + const SequenceLengthT* lengths() const noexcept{ + return NULL; + } + + const size_t* offsets() const noexcept{ + return NULL; + } + + const char* headers() const noexcept{ + return NULL; + } + + const size_t* headerOffsets() const noexcept{ + return NULL; + } + + const DBdataMetaData& getMetaData() const noexcept{ + return metaData; + } + +private: + ExternalDBdata() = default; + size_t numSeqs; + size_t dbCharSize; + DBdataMetaData metaData; +}; + + +struct DB{ + friend DB loadDB(const std::string& prefix, bool writeAccess, bool prefetchSeq); + + + DB(const DB&) = delete; + DB(DB&&) = default; + DB& operator=(const DB&) = delete; + DB& operator=(DB&&) = default; + + DBGlobalInfo getInfo() const{ + return info; + } + + const DBdata& getData() const{ + return data; + } + + DBdata& getModyfiableData(){ + return data; + } + +private: + DB() = default; + + DBGlobalInfo info; + DBdata data; +}; + +struct DBWithVectors{ + friend DBWithVectors loadDBWithVectors(const std::string& prefix); + + + DBWithVectors(const DBWithVectors&) = delete; + DBWithVectors(DBWithVectors&&) = default; + DBWithVectors& operator=(const DBWithVectors&) = delete; + DBWithVectors& operator=(DBWithVectors&&) = default; + + DBGlobalInfo getInfo() const{ + return info; + } + + const DBdataWithVectors& getData() const{ + return data; + } + + DBdataWithVectors& getModyfiableData(){ + return data; + } + +private: + DBWithVectors() = default; + + DBGlobalInfo info; + DBdataWithVectors data; +}; + +struct PseudoDB{ + friend PseudoDB loadPseudoDB(size_t num, size_t length, bool allSameSequences, int randomseed); + + PseudoDB() = default; + PseudoDB(const PseudoDB&) = delete; + PseudoDB(PseudoDB&&) = default; + PseudoDB& operator=(const PseudoDB&) = delete; + PseudoDB& operator=(PseudoDB&&) = default; + + DBGlobalInfo getInfo() const{ + return info; + } + + const PseudoDBdata& getData() const{ + return data; + } + +private: + 
DBGlobalInfo info; + PseudoDBdata data; +}; + + +struct MMseqsDB{ + friend MMseqsDB loadMMseqsDB(size_t numSeqs, char * data, size_t *offset, int32_t *length, size_t dbCharSize ); + + MMseqsDB() = default; + MMseqsDB(const MMseqsDB&) = delete; + MMseqsDB(MMseqsDB&&) = default; + MMseqsDB& operator=(const MMseqsDB&) = delete; + MMseqsDB& operator=(MMseqsDB&&) = default; + + DBGlobalInfo getInfo() const{ + return info; + } + + const MMseqsDBdata& getData() const{ + return data; + } + +private: + DBGlobalInfo info; + MMseqsDBdata data; +}; + +struct ExternalDB { + friend ExternalDB loadExternalDB(size_t numSeqs, size_t dbCharSize, DBdataMetaData metaData); + + ExternalDB() = default; + ExternalDB(const ExternalDB&) = delete; + ExternalDB(ExternalDB&&) = default; + ExternalDB& operator=(const ExternalDB&) = delete; + ExternalDB& operator=(ExternalDB&&) = default; + + DBGlobalInfo getInfo() const { + return info; + } + + const ExternalDBdata& getData() const { + return data; + } + +private: + DBGlobalInfo info; + ExternalDBdata data; +}; + +void writeGlobalDbInfo(const std::string& outputPrefix, const DBGlobalInfo& info); +void readGlobalDbInfo(const std::string& prefix, DBGlobalInfo& info); + +DB loadDB(const std::string& prefix, bool writeAccess, bool prefetchSeq); +DBWithVectors loadDBWithVectors(const std::string& prefix); +PseudoDB loadPseudoDB(size_t num, size_t length, bool allSameSequences, int randomseed = 42); +MMseqsDB loadMMseqsDB(size_t numSeqs, char * data, size_t *offset, int32_t *length, size_t dbCharSize); +ExternalDB loadExternalDB(size_t numSeqs, size_t dbCharSize, DBdataMetaData metaData); + + + + + + +/* + A view of a partion of DBdata. + + The i-th sequence data in the partition begins at chars() + offsets()[i]. + It has length lengths[i]. + Its header begins at headers() + headerOffsets()[i] + + Important note!: + This view currently simply modifies the pointers to the original dbData arrays. + It does not contain a copy of access offsets that begin with 0, and chars() returns the original dbData chars() ptr. + This means when copying the view data to the device, the host sequence src pointer + must be chars() + offsets()[0], not chars(). !, i.e. 
cudaMemcpy(d_chars, view.chars() + offsets()[0], sizeof(char) * view.numChars()) + + Because offsets are stored unmodified, offsets()[0] must be substracted from d_offsets after copying to obatin the correct offsets into d_chars + + The same applies to header offsets if they were to be used on the gpu + + +*/ +struct DBdataView{ + DBdataView(): firstSequence(0), + lastSequence_excl(0), + globalSequenceOffset(0), + parentChars(nullptr), + parentLengths(nullptr), + parentOffsets(nullptr), + parentHeaders(nullptr), + parentHeaderOffsets(nullptr) + { + + } + + DBdataView( + size_t firstSequence_, + size_t lastSequence_excl_, + size_t globalSequenceOffset_, + const char* parentChars_, + const SequenceLengthT* parentLengths_, + const size_t* parentOffsets_, + const char* parentHeaders_, + const size_t* parentHeaderOffsets_ + ) : + firstSequence(firstSequence_), + lastSequence_excl(lastSequence_excl_), + globalSequenceOffset(globalSequenceOffset_), + parentChars(parentChars_), + parentLengths(parentLengths_), + parentOffsets(parentOffsets_), + parentHeaders(parentHeaders_), + parentHeaderOffsets(parentHeaderOffsets_) + {} + + template + DBdataView(const Data& parent, size_t globalSequenceOffset_ = 0) + : firstSequence(0), + lastSequence_excl(parent.numSequences()), + globalSequenceOffset(globalSequenceOffset_), + parentChars(parent.chars()), + parentLengths(parent.lengths()), + parentOffsets(parent.offsets()), + parentHeaders(parent.headers()), + parentHeaderOffsets(parent.headerOffsets()) + { + + } + + DBdataView(const DBdataView&, size_t) = delete; + + + DBdataView(const DBdataView& parent, size_t first_, size_t last_) + : firstSequence(first_), + lastSequence_excl(last_), + globalSequenceOffset(parent.getGlobalSequenceOffset() + firstSequence), + parentChars(parent.chars()), + parentLengths(parent.lengths()), + parentOffsets(parent.offsets()), + parentHeaders(parent.headers()), + parentHeaderOffsets(parent.headerOffsets()) + { + + } + + size_t getGlobalSequenceOffset() const noexcept{ + return globalSequenceOffset; + } + + size_t numSequences() const noexcept{ + return lastSequence_excl - firstSequence; + } + + size_t numChars() const noexcept{ + return parentOffsets[lastSequence_excl] - parentOffsets[firstSequence]; + } + + const char* chars() const noexcept{ + return parentChars; + } + + const SequenceLengthT* lengths() const noexcept{ + return parentLengths + firstSequence; + } + + const size_t* offsets() const noexcept{ + return parentOffsets + firstSequence; + } + + const char* headers() const noexcept{ + return parentHeaders; + } + + const size_t* headerOffsets() const noexcept{ + return parentHeaderOffsets + firstSequence; + } + +private: + size_t firstSequence; + size_t lastSequence_excl; + size_t globalSequenceOffset; //index of firstSequence at the top level, i.e. 
in the full db + + const char* parentChars; + const SequenceLengthT* parentLengths; + const size_t* parentOffsets; + const char* parentHeaders; + const size_t* parentHeaderOffsets; +}; + +struct AnyDBWrapper{ + AnyDBWrapper() = default; + + AnyDBWrapper(std::shared_ptr db){ + setDB(*db); + dbPtr = db; + } + + AnyDBWrapper(std::shared_ptr db){ + setDB(*db); + dbWithVectorsPtr = db; + } + + AnyDBWrapper(std::shared_ptr db){ + setDB(*db); + pseudoDBPtr = db; + } + + AnyDBWrapper(std::shared_ptr db){ + setDB(*db); + mmseqsDBPtr = db; + } + + AnyDBWrapper(std::shared_ptr db){ + setDB(*db); + externalDBPtr = db; + } + + DBGlobalInfo getInfo() const{ + return info; + } + + const DBdataView& getData() const{ + return data; + } + +private: + template + void setDB(const DB& db){ + info = db.getInfo(); + data = DBdataView(db.getData()); + } + std::shared_ptr dbPtr = nullptr; + std::shared_ptr dbWithVectorsPtr = nullptr; + std::shared_ptr pseudoDBPtr = nullptr; + std::shared_ptr mmseqsDBPtr = nullptr; + std::shared_ptr externalDBPtr = nullptr; + + DBGlobalInfo info; + DBdataView data; + +}; + + + +std::vector partitionDBdata_by_numberOfSequences(const DBdataView& parent, size_t maxNumSequencesPerPartition); +//partitions have the smallest number of chars that is at least numCharsPerPartition. (with the exception of last partition) +std::vector partitionDBdata_by_numberOfChars(const DBdataView& parent, size_t numCharsPerPartition); + +void assertValidPartitioning(const std::vector& views, const DBdataView& parent); + +} //namespace cudasw4 + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/gapless_kernel_config.cuh b/lib/libmarv/src/gapless_kernel_config.cuh new file mode 100644 index 000000000..5ad8c1db2 --- /dev/null +++ b/lib/libmarv/src/gapless_kernel_config.cuh @@ -0,0 +1,243 @@ +#ifndef GAPLESS_KERNEL_CONFIG_CUH +#define GAPLESS_KERNEL_CONFIG_CUH + +#include +#include +#include + +namespace cudasw4{ + + + struct GaplessKernelConfig{ + enum class Approach : int{ + hardcodedzero = 0, + kernelparamzero = 1 + }; + bool dpx; + int tilesize; + int groupsize; + int numRegs; + Approach approach; + + GaplessKernelConfig() = default; + GaplessKernelConfig(int tilesize_, int groupsize_, int numRegs_, int dpx_, Approach approach_) + : dpx(dpx_), tilesize(tilesize_), groupsize(groupsize_), numRegs(numRegs_), approach(approach_) + {} + + GaplessKernelConfig(const GaplessKernelConfig&) = default; + GaplessKernelConfig& operator=(const GaplessKernelConfig&) = default; + }; + + __inline__ + std::string to_string(GaplessKernelConfig::Approach approach){ + switch(approach){ + case GaplessKernelConfig::Approach::hardcodedzero: return "hardcodedzero"; + case GaplessKernelConfig::Approach::kernelparamzero: return "kernelparamzero"; + } + return "to_string: missing case for GaplessKernelConfig::Approach"; + } + + __inline__ + std::ostream& operator<<(std::ostream& os, const GaplessKernelConfig& data){ + + os << data.tilesize << " " << data.groupsize << " " << data.numRegs + << " " << data.dpx << " " << int(data.approach); + return os; + } + + + //T4 + __inline__ + std::vector getOptimalKernelConfigs_gapless_sm75(){ + std::vector configs{ + {32,4,4,0,GaplessKernelConfig::Approach::hardcodedzero}, + {64,4,8,0,GaplessKernelConfig::Approach::hardcodedzero}, + {96,4,12,0,GaplessKernelConfig::Approach::hardcodedzero}, + {128,4,16,0,GaplessKernelConfig::Approach::hardcodedzero}, + {160,4,20,0,GaplessKernelConfig::Approach::hardcodedzero}, + {192,8,12,0,GaplessKernelConfig::Approach::hardcodedzero}, + 
{224,4,28,0,GaplessKernelConfig::Approach::hardcodedzero}, + {256,8,16,0,GaplessKernelConfig::Approach::hardcodedzero}, + {288,4,36,0,GaplessKernelConfig::Approach::hardcodedzero}, + {320,8,20,0,GaplessKernelConfig::Approach::hardcodedzero}, + {352,4,44,0,GaplessKernelConfig::Approach::hardcodedzero}, + {384,16,12,0,GaplessKernelConfig::Approach::hardcodedzero}, + {416,4,52,0,GaplessKernelConfig::Approach::hardcodedzero}, + {448,8,28,0,GaplessKernelConfig::Approach::hardcodedzero}, + {480,4,60,0,GaplessKernelConfig::Approach::hardcodedzero}, + {512,16,16,0,GaplessKernelConfig::Approach::hardcodedzero}, + {576,8,36,0,GaplessKernelConfig::Approach::hardcodedzero}, + {640,16,20,0,GaplessKernelConfig::Approach::hardcodedzero}, + {704,8,44,0,GaplessKernelConfig::Approach::hardcodedzero}, + {768,16,24,0,GaplessKernelConfig::Approach::hardcodedzero}, + {832,8,52,0,GaplessKernelConfig::Approach::hardcodedzero}, + {896,16,28,0,GaplessKernelConfig::Approach::hardcodedzero}, + {960,8,60,0,GaplessKernelConfig::Approach::hardcodedzero}, + {1024,16,32,0,GaplessKernelConfig::Approach::hardcodedzero}, + {1152,16,36,0,GaplessKernelConfig::Approach::hardcodedzero}, + {1280,16,40,0,GaplessKernelConfig::Approach::hardcodedzero}, + {1408,16,44,0,GaplessKernelConfig::Approach::hardcodedzero}, + {1536,16,48,0,GaplessKernelConfig::Approach::hardcodedzero}, + //larger tiles are not supported because shared memory size is too small + }; + + return configs; + } + + + //A100 + __inline__ + std::vector getOptimalKernelConfigs_gapless_sm80(){ + std::vector configs{ + {32,4,4,0, GaplessKernelConfig::Approach::kernelparamzero}, + {64,4,8,0, GaplessKernelConfig::Approach::hardcodedzero}, + {96,4,12,0, GaplessKernelConfig::Approach::kernelparamzero}, + {128,4,16,0, GaplessKernelConfig::Approach::kernelparamzero}, + {160,4,20,0, GaplessKernelConfig::Approach::kernelparamzero}, + {192,4,24,0, GaplessKernelConfig::Approach::kernelparamzero}, + {224,4,28,0, GaplessKernelConfig::Approach::kernelparamzero}, + {256,4,32,0, GaplessKernelConfig::Approach::kernelparamzero}, + {288,4,36,0, GaplessKernelConfig::Approach::kernelparamzero}, + {320,4,40,0, GaplessKernelConfig::Approach::kernelparamzero}, + {352,4,44,0, GaplessKernelConfig::Approach::kernelparamzero}, + {384,4,48,0, GaplessKernelConfig::Approach::kernelparamzero}, + {416,4,52,0, GaplessKernelConfig::Approach::kernelparamzero}, + {448,4,56,0, GaplessKernelConfig::Approach::hardcodedzero}, + {480,4,60,0, GaplessKernelConfig::Approach::hardcodedzero}, + {512,4,64,0, GaplessKernelConfig::Approach::hardcodedzero}, + {576,8,36,0, GaplessKernelConfig::Approach::kernelparamzero}, + {640,8,40,0, GaplessKernelConfig::Approach::hardcodedzero}, + {704,8,44,0, GaplessKernelConfig::Approach::hardcodedzero}, + {768,8,48,0, GaplessKernelConfig::Approach::kernelparamzero}, + {832,8,52,0, GaplessKernelConfig::Approach::hardcodedzero}, + {896,8,56,0, GaplessKernelConfig::Approach::kernelparamzero}, + {960,8,60,0, GaplessKernelConfig::Approach::hardcodedzero}, + {1024,8,64,0, GaplessKernelConfig::Approach::hardcodedzero}, + {1152,16,36,0, GaplessKernelConfig::Approach::kernelparamzero}, + {1280,16,40,0, GaplessKernelConfig::Approach::hardcodedzero}, + {1408,16,44,0, GaplessKernelConfig::Approach::kernelparamzero}, + {1536,16,48,0, GaplessKernelConfig::Approach::kernelparamzero}, + {1664,16,52,0, GaplessKernelConfig::Approach::hardcodedzero}, + {1792,16,56,0, GaplessKernelConfig::Approach::hardcodedzero}, + {1920,16,60,0, GaplessKernelConfig::Approach::kernelparamzero}, + {2048,16,64,0, 
GaplessKernelConfig::Approach::kernelparamzero}, + }; + + return configs; + } + + //L40S + __inline__ + std::vector getOptimalKernelConfigs_gapless_sm89(){ + std::vector configs{ + {32,4,4,0, GaplessKernelConfig::Approach::kernelparamzero}, + {64,4,8,0, GaplessKernelConfig::Approach::kernelparamzero}, + {96,4,12,0, GaplessKernelConfig::Approach::kernelparamzero}, + {128,4,16,0, GaplessKernelConfig::Approach::kernelparamzero}, + {160,4,20,0, GaplessKernelConfig::Approach::hardcodedzero}, + {192,4,24,0, GaplessKernelConfig::Approach::kernelparamzero}, + {224,4,28,0, GaplessKernelConfig::Approach::hardcodedzero}, + {256,4,32,0, GaplessKernelConfig::Approach::hardcodedzero}, + {288,4,36,0, GaplessKernelConfig::Approach::hardcodedzero}, + {320,4,40,0, GaplessKernelConfig::Approach::hardcodedzero}, + {352,4,44,0, GaplessKernelConfig::Approach::hardcodedzero}, + {384,4,48,0, GaplessKernelConfig::Approach::kernelparamzero}, + {416,4,52,0, GaplessKernelConfig::Approach::hardcodedzero}, + {448,4,56,0, GaplessKernelConfig::Approach::kernelparamzero}, + {480,4,60,0, GaplessKernelConfig::Approach::kernelparamzero}, + {512,4,64,0, GaplessKernelConfig::Approach::hardcodedzero}, + {576,8,36,0, GaplessKernelConfig::Approach::kernelparamzero}, + {640,8,40,0, GaplessKernelConfig::Approach::kernelparamzero}, + {704,8,44,0, GaplessKernelConfig::Approach::kernelparamzero}, + {768,8,48,0, GaplessKernelConfig::Approach::kernelparamzero}, + {832,8,52,0, GaplessKernelConfig::Approach::kernelparamzero}, + {896,8,56,0, GaplessKernelConfig::Approach::kernelparamzero}, + {960,8,60,0, GaplessKernelConfig::Approach::hardcodedzero}, + {1024,8,64,0, GaplessKernelConfig::Approach::hardcodedzero}, + {1152,16,36,0, GaplessKernelConfig::Approach::hardcodedzero}, + {1280,16,40,0, GaplessKernelConfig::Approach::hardcodedzero}, + {1408,16,44,0, GaplessKernelConfig::Approach::kernelparamzero}, + {1536,16,48,0, GaplessKernelConfig::Approach::kernelparamzero}, + {1664,16,52,0, GaplessKernelConfig::Approach::hardcodedzero}, + {1792,16,56,0, GaplessKernelConfig::Approach::hardcodedzero}, + {1920,16,60,0, GaplessKernelConfig::Approach::kernelparamzero}, + {2048,16,64,0, GaplessKernelConfig::Approach::hardcodedzero}, + }; + + return configs; + } + + //H100 SXM + __inline__ + std::vector getOptimalKernelConfigs_gapless_sm90(){ + std::vector configs{ + {32, 4, 4, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {64, 4, 8, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {96, 4, 12, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {128, 4, 16, 0, GaplessKernelConfig::Approach::kernelparamzero}, + {160, 4, 20, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {192, 4, 24, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {224, 4, 28, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {256, 4, 32, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {288, 4, 36, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {320, 4, 40, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {352, 4, 44, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {384, 4, 48, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {416, 4, 52, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {448, 4, 56, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {480, 4, 60, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {512, 8, 32, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {576, 8, 36, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {640, 8, 40, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {704, 8, 
44, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {768, 8, 48, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {832, 8, 52, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {896, 8, 56, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {960, 8, 60, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {1024, 16, 32, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {1152, 16, 36, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {1280, 16, 40, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {1408, 16, 44, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {1536, 16, 48, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {1664, 16, 52, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {1792, 16, 56, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {1920, 16, 60, 1, GaplessKernelConfig::Approach::kernelparamzero}, + {2048, 16, 64, 0, GaplessKernelConfig::Approach::hardcodedzero}, + }; + + return configs; + } + + __inline__ + std::vector getOptimalKernelConfigs_gapless_default(){ + return getOptimalKernelConfigs_gapless_sm89(); + } + + __inline__ + std::vector getOptimalKernelConfigs_gapless(int deviceId){ + int ccMajor = 0; + int ccMinor = 0; + cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, deviceId); + cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, deviceId); + + std::vector configs; + + if(ccMajor == 7 && ccMinor == 5){ + configs = getOptimalKernelConfigs_gapless_sm75(); + }else if(ccMajor == 8 && ccMinor == 0){ + configs = getOptimalKernelConfigs_gapless_sm80(); + }else if(ccMajor == 8 && ccMinor == 9){ + configs = getOptimalKernelConfigs_gapless_sm89(); + }else if(ccMajor == 9 && ccMinor == 0){ + configs = getOptimalKernelConfigs_gapless_sm90(); + }else{ + configs = getOptimalKernelConfigs_gapless_default(); + } + + return configs; + } + + +} + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/gpudatabaseallocation.cuh b/lib/libmarv/src/gpudatabaseallocation.cuh new file mode 100644 index 000000000..dbaeb2dcf --- /dev/null +++ b/lib/libmarv/src/gpudatabaseallocation.cuh @@ -0,0 +1,113 @@ +#ifndef GPU_DATABASE_ALLOCATION_CUH +#define GPU_DATABASE_ALLOCATION_CUH + +#include "config.hpp" + +#include "hpc_helpers/simple_allocation.cuh" + +struct GpuDatabaseAllocationBase{ + virtual const char* getCharData() const = 0; + virtual const SequenceLengthT* getLengthData() const = 0; + virtual const size_t* getOffsetData() const = 0; + + virtual char* getCharData() = 0; + virtual SequenceLengthT* getLengthData() = 0; + virtual size_t* getOffsetData() = 0; + + virtual size_t getNumChars() const = 0; + virtual size_t getNumSubjects() const = 0; +}; + + +struct GpuDatabaseAllocation : public GpuDatabaseAllocationBase{ + + GpuDatabaseAllocation() : GpuDatabaseAllocation(0,0) {} + GpuDatabaseAllocation(size_t numChars, size_t numSubjects){ + d_fulldb_chardata.resize(numChars); + d_fulldb_lengthdata.resize(numSubjects); + d_fulldb_offsetdata.resize(numSubjects+1); + } + + const char* getCharData() const override{ + return d_fulldb_chardata.data(); + } + const SequenceLengthT* getLengthData() const override{ + return d_fulldb_lengthdata.data(); + } + const size_t* getOffsetData() const override{ + return d_fulldb_offsetdata.data(); + } + char* getCharData() override{ + return d_fulldb_chardata.data(); + } + SequenceLengthT* getLengthData() override{ + return d_fulldb_lengthdata.data(); + } + size_t* getOffsetData() override{ + return d_fulldb_offsetdata.data(); + } + + size_t getNumChars() const 
override{ + return d_fulldb_chardata.size(); + } + + size_t getNumSubjects() const override{ + return d_fulldb_lengthdata.size(); + } + + helpers::SimpleAllocationDevice d_fulldb_chardata; + helpers::SimpleAllocationDevice d_fulldb_lengthdata; + helpers::SimpleAllocationDevice d_fulldb_offsetdata; +}; + +struct GpuDatabaseAllocationView : public GpuDatabaseAllocationBase{ + + GpuDatabaseAllocationView() = default; + GpuDatabaseAllocationView( + char* chardata_, + SequenceLengthT* lengthdata_, + size_t* offsetdata_, + size_t numChars_, + size_t numSubjects_ + ): chardata(chardata_), lengthdata(lengthdata_), offsetdata(offsetdata_),numChars(numChars_), numSubjects(numSubjects_){ + + } + + const char* getCharData() const override{ + return chardata; + } + const SequenceLengthT* getLengthData() const override{ + return lengthdata; + } + const size_t* getOffsetData() const override{ + return offsetdata; + } + char* getCharData() override{ + return chardata; + } + SequenceLengthT* getLengthData() override{ + return lengthdata; + } + size_t* getOffsetData() override{ + return offsetdata; + } + + size_t getNumChars() const override{ + return numChars; + } + + size_t getNumSubjects() const override{ + return numSubjects; + } + + char* chardata; //numChars + SequenceLengthT* lengthdata; // numSubjects + size_t* offsetdata; //numSubjects + 1 + + size_t numChars; + size_t numSubjects; +}; + + + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/hpc_helpers/all_helpers.cuh b/lib/libmarv/src/hpc_helpers/all_helpers.cuh new file mode 100644 index 000000000..430740c78 --- /dev/null +++ b/lib/libmarv/src/hpc_helpers/all_helpers.cuh @@ -0,0 +1,5 @@ +#include "cuda_helpers.cuh" +#include "hpc_helpers.h" +#include "io_helpers.h" +#include "type_helpers.h" +#include "timers.cuh" \ No newline at end of file diff --git a/lib/libmarv/src/hpc_helpers/coop_group_helpers.cuh b/lib/libmarv/src/hpc_helpers/coop_group_helpers.cuh new file mode 100644 index 000000000..25238b334 --- /dev/null +++ b/lib/libmarv/src/hpc_helpers/coop_group_helpers.cuh @@ -0,0 +1,92 @@ +#ifndef HELPERS_COOP_GROUP_HELPERS_CUH +#define HELPERS_COOP_GROUP_HELPERS_CUH + +#include "cuda_helpers.cuh" + +/* + Like cg::thread_block_tile<1>, but usable from the host. 
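GpuDatabaseAllocation owns its device buffers, while GpuDatabaseAllocationView only wraps pointers managed elsewhere; both expose the same char/length/offset accessors. Below is a minimal sketch of filling an owning allocation from host data, assuming the offsets are the exclusive prefix sum over the sequence lengths (matching the numSubjects+1 offset entries allocated by the constructor); the helper itself is not code from this diff.

#include <cuda_runtime.h>
#include <vector>

GpuDatabaseAllocation uploadDatabase(
    const std::vector<char>& h_chars,
    const std::vector<SequenceLengthT>& h_lengths
){
    // offsets[i] is the start of sequence i, offsets[numSubjects] is the total size
    std::vector<size_t> h_offsets(h_lengths.size() + 1, 0);
    for(size_t i = 0; i < h_lengths.size(); i++){
        h_offsets[i + 1] = h_offsets[i] + h_lengths[i];
    }

    GpuDatabaseAllocation db(h_chars.size(), h_lengths.size());
    cudaMemcpy(db.getCharData(), h_chars.data(), h_chars.size() * sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(db.getLengthData(), h_lengths.data(), h_lengths.size() * sizeof(SequenceLengthT), cudaMemcpyHostToDevice);
    cudaMemcpy(db.getOffsetData(), h_offsets.data(), h_offsets.size() * sizeof(size_t), cudaMemcpyHostToDevice);
    return db;
}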
+*/ +struct SingleThreadGroup{ +public: + + HOSTDEVICEQUALIFIER + constexpr void sync() const noexcept{ + ; //no-op + } + + HOSTDEVICEQUALIFIER + constexpr unsigned long long thread_rank() const noexcept{ + return 0; + } + + HOSTDEVICEQUALIFIER + constexpr unsigned long long size() const noexcept{ + return 1; + } + + HOSTDEVICEQUALIFIER + constexpr unsigned long long meta_group_size() const noexcept{ + return 1; + } + + HOSTDEVICEQUALIFIER + constexpr unsigned long long meta_group_rank() const noexcept{ + return 0; + } + + template + HOSTDEVICEQUALIFIER + constexpr T shfl(T var, unsigned int src_rank) const noexcept{ + return var; + } + + template + HOSTDEVICEQUALIFIER + constexpr T shfl_up(T var, int /*delta*/) const noexcept{ + return var; + } + + template + HOSTDEVICEQUALIFIER + constexpr T shfl_down(T var, int /*delta*/) const noexcept{ + return var; + } + + template + HOSTDEVICEQUALIFIER + constexpr T shfl_xor(T var, int /*delta*/) const noexcept{ + return var; + } + + HOSTDEVICEQUALIFIER + constexpr int any(int predicate) const noexcept{ + return predicate > 0; + } + + HOSTDEVICEQUALIFIER + constexpr int all(int predicate) const noexcept{ + return predicate > 0; + } + + HOSTDEVICEQUALIFIER + constexpr unsigned int ballot(int predicate) const noexcept{ + return predicate > 0; + } + + template + HOSTDEVICEQUALIFIER + constexpr unsigned int match_any(T /*val*/) const noexcept{ + return 1u; + } + + template + HOSTDEVICEQUALIFIER + constexpr unsigned int match_all(T /*val*/, int &pred) const noexcept{ + pred = 1; + return 1u; + } + + +}; + +#endif /* HELPERS_COOP_GROUP_HELPERS_CUH */ diff --git a/lib/libmarv/src/hpc_helpers/cuda_helpers.cuh b/lib/libmarv/src/hpc_helpers/cuda_helpers.cuh new file mode 100644 index 000000000..6ade8354f --- /dev/null +++ b/lib/libmarv/src/hpc_helpers/cuda_helpers.cuh @@ -0,0 +1,339 @@ +#ifndef HELPERS_CUDA_HELPERS_CUH +#define HELPERS_CUDA_HELPERS_CUH + +#include +#include +#include +#include +#include + +// #if CUDART_VERSION >= 9000 +// #include +// #endif + +// #if !defined(CUDA_HELPERS_DONT_INCLUDE_V11_GROUP_HEADERS) && CUDART_VERSION >= 11000 +// #include +// #include +// #endif + +#include "hpc_helpers.h" + +// error checking +#ifdef __CUDACC__ + #define CUERR { \ + cudaError_t err; \ + if ((err = cudaGetLastError()) != cudaSuccess) { \ + std::cout << "CUDA error: " << cudaGetErrorString(err) << " : " \ + << __FILE__ << ", line " << __LINE__ << std::endl; \ + exit(1); \ + } \ + } +#endif + +// common CUDA constants +#define WARPSIZE (32) +#define MAXBLOCKSIZE (1024) +#define MAXSMEMBYTES (49152) +#define MAXCONSTMEMBYTES (65536) +#define H2D (cudaMemcpyHostToDevice) +#define D2H (cudaMemcpyDeviceToHost) +#define H2H (cudaMemcpyHostToHost) +#define D2D (cudaMemcpyDeviceToDevice) + +// cross platform classifiers +#ifdef __CUDACC__ + #define HOSTDEVICEQUALIFIER __host__ __device__ +#else + #define HOSTDEVICEQUALIFIER +#endif + +#ifdef __CUDACC__ + #define INLINEQUALIFIER __forceinline__ +#else + #define INLINEQUALIFIER inline +#endif + +#ifdef __CUDACC__ + #define GLOBALQUALIFIER __global__ +#else + #define GLOBALQUALIFIER +#endif + +#ifdef __CUDACC__ + #define DEVICEQUALIFIER __device__ +#else + #define DEVICEQUALIFIER +#endif + +#ifdef __CUDACC__ + #define HOSTQUALIFIER __host__ +#else + #define HOSTQUALIFIER +#endif + +#ifdef __CUDACC__ + #define HD_WARNING_DISABLE #pragma hd_warning_disable +#else + #define HD_WARNING_DISABLE +#endif + +// redefinition of CUDA atomics for common cstdint types +#ifdef __CUDACC__ + // CAS + DEVICEQUALIFIER 
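Because SingleThreadGroup mirrors the cooperative-groups tile interface, group-generic code can be compiled for both host and device and exercised on the host with a group of size one. A minimal sketch; the reduction helper below is hypothetical, not part of this diff.

template<class Group>
HOSTDEVICEQUALIFIER
int groupMax(Group g, int value){
    // for SingleThreadGroup, size() == 1 and the loop body never executes;
    // for a warp tile, this is the usual shuffle-based tree reduction
    for(unsigned int offset = g.size() / 2; offset > 0; offset /= 2){
        const int other = g.shfl_down(value, offset);
        if(other > value) value = other;
    }
    return value;
}

// host:   SingleThreadGroup g;  groupMax(g, 42);                        // returns 42
// device: auto tile = cg::tiled_partition<32>(cg::this_thread_block());
//         groupMax(tile, value);                                        // warp maximum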
INLINEQUALIFIER + std::uint64_t atomicCAS( + std::uint64_t* address, + std::uint64_t compare, + std::uint64_t val) + { + return atomicCAS( + reinterpret_cast(address), + static_cast(compare), + static_cast(val)); + } + + // Add + DEVICEQUALIFIER INLINEQUALIFIER + std::uint64_t atomicAdd(std::uint64_t* address, std::uint64_t val) + { + return atomicAdd( + reinterpret_cast(address), + static_cast(val)); + } + + // Exch + DEVICEQUALIFIER INLINEQUALIFIER + std::uint64_t atomicExch(std::uint64_t* address, std::uint64_t val) + { + return atomicExch( + reinterpret_cast(address), + static_cast(val)); + } + + #if __CUDA_ARCH__ > 300 + + // Min + DEVICEQUALIFIER INLINEQUALIFIER + std::uint64_t atomicMin(std::uint64_t* address, std::uint64_t val) + { + return atomicMin( + reinterpret_cast(address), + static_cast(val)); + } + + // Max + DEVICEQUALIFIER INLINEQUALIFIER + std::uint64_t atomicMax(std::uint64_t* address, std::uint64_t val) + { + return atomicMax( + reinterpret_cast(address), + static_cast(val)); + } + + // AND + DEVICEQUALIFIER INLINEQUALIFIER + std::uint64_t atomicAnd(std::uint64_t* address, std::uint64_t val) + { + return atomicAnd( + reinterpret_cast(address), + static_cast(val)); + } + + // OR + DEVICEQUALIFIER INLINEQUALIFIER + std::uint64_t atomicOr(std::uint64_t* address, std::uint64_t val) + { + return atomicOr( + reinterpret_cast(address), + static_cast(val)); + } + + // XOR + DEVICEQUALIFIER INLINEQUALIFIER + std::uint64_t atomicXor(std::uint64_t* address, uint64_t val) + { + return atomicXor( + reinterpret_cast(address), + static_cast(val)); + } + + #endif + + DEVICEQUALIFIER INLINEQUALIFIER + int ffs(std::uint32_t x) + { + return __ffs(x); + } + + DEVICEQUALIFIER INLINEQUALIFIER + int ffs(std::uint64_t x) + { + return __ffsll(x); + } + + namespace helpers { + + #ifdef __CUDACC_EXTENDED_LAMBDA__ + template + GLOBALQUALIFIER void lambda_kernel(T f) + { + f(); + } + #endif + + // only valid for linear kernel i.e. 
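The overloads above let device code use std::uint64_t counters with the familiar atomic names regardless of whether the platform defines std::uint64_t as unsigned long or unsigned long long. A minimal sketch of a counting kernel and its launcher; both are hypothetical, not part of this diff.

GLOBALQUALIFIER
void countPositives(const int* values, int n, std::uint64_t* counter){
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n && values[i] > 0){
        atomicAdd(counter, std::uint64_t(1)); // resolves to the std::uint64_t overload above
    }
}

void launchCountPositives(const int* d_values, int n, std::uint64_t* d_counter, cudaStream_t stream){
    if(n <= 0) return;
    cudaMemsetAsync(d_counter, 0, sizeof(std::uint64_t), stream); CUERR;
    const int block = 256;
    const int grid = (n + block - 1) / block;
    countPositives<<<grid, block, 0, stream>>>(d_values, n, d_counter); CUERR;
}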
y = z = 0 + DEVICEQUALIFIER INLINEQUALIFIER + std::uint64_t global_thread_id() noexcept + { + return + std::uint64_t(blockDim.x) * std::uint64_t(blockIdx.x) + + std::uint64_t(threadIdx.x); + } + + DEVICEQUALIFIER INLINEQUALIFIER + unsigned int lane_id() + { + unsigned int lane; + asm volatile("mov.u32 %0, %%laneid;" : "=r"(lane)); + return lane; + } + + HOSTQUALIFIER INLINEQUALIFIER + void init_cuda_context() + { + cudaFree(0); + } + + HOSTQUALIFIER INLINEQUALIFIER + std::uint64_t available_gpu_memory(float security_factor = 1.0) + { + assert(security_factor >= 1.0 && "invalid security factor"); + + std::uint64_t free; + std::uint64_t total; + + cudaMemGetInfo(&free, &total); + + return free / security_factor; + } + + HOSTQUALIFIER INLINEQUALIFIER + std::vector available_gpu_memory( + std::vector device_ids, + float security_factor = 1.0) + { + std::vector available; + + for(auto id : device_ids) + { + cudaSetDevice(id); + available.push_back(available_gpu_memory(security_factor)); + } + + return available; + } + + HOSTQUALIFIER INLINEQUALIFIER + std::uint64_t aggregated_available_gpu_memory( + std::vector device_ids, + float security_factor = 1.0, + bool uniform = false) + { + std::sort(device_ids.begin(), device_ids.end()); + device_ids.erase( + std::unique(device_ids.begin(), device_ids.end()), device_ids.end()); + + std::vector available = + available_gpu_memory(device_ids, security_factor); + + if(uniform) + { + std::uint64_t min_bytes = + *std::min_element(available.begin(), available.end()); + + return min_bytes * device_ids.size(); + } + else + { + std::uint64_t total = 0; + + for(auto bytes : available) + { + total += bytes; + } + + return total; + } + } + + // #if CUDART_VERSION >= 9000 + // template + // DEVICEQUALIFIER INLINEQUALIFIER index_t atomicAggInc(index_t * ctr) + // { + // using namespace cooperative_groups; + // coalesced_group g = coalesced_threads(); + // index_t prev; + // if (g.thread_rank() == 0) { + // prev = atomicAdd(ctr, g.size()); + // } + // prev = g.thread_rank() + g.shfl(prev, 0); + // return prev; + // } + + // template + // DEVICEQUALIFIER INLINEQUALIFIER index_t atomicAggAdd(index_t * ctr, index_t x) + // { + // namespace cg = cooperative_groups; + + // // error case + // assert(x > 0); + + // const auto g = cg::coalesced_threads(); + + // //inclusive prefix-sum + // index_t psum = x; + // for(std::uint32_t i = 1; i < g.size(); i <<= 1) + // { + // const auto s = g.shfl_up(psum, i); + + // if(g.thread_rank() >= i) psum += s; + // } + + // // last active lane increments ctr + // index_t offset; + // if(g.thread_rank() == g.size() - 1) + // { + // offset = atomicAdd(ctr, psum); + // } + + // // broadcast offset to group members + // offset = g.shfl(offset, g.size() - 1); + + // return offset + psum - x; + // } + // #else + // template + // DEVICEQUALIFIER INLINEQUALIFIER index_t atomicAggInc(index_t * ctr) + // { + // int lane = lane_id(); + // //check if thread is active + // int mask = __ballot(1); + // //determine first active lane for atomic add + // int leader = __ffs(mask) - 1; + // index_t res; + // if (lane == leader) res = atomicAdd(ctr, __popc(mask)); + // //broadcast to warp + // res = __shfl(res, leader); + // //compute index for each thread + // return res + __popc(mask & ((1 << lane) -1)); + // } + // #endif + + DEVICEQUALIFIER INLINEQUALIFIER + void die() { assert(0); } // mharris style + + } // namespace helpers + +#endif + +#endif /* HELPERS_CUDA_HELPERS_CUH */ diff --git a/lib/libmarv/src/hpc_helpers/cuda_raiiwrappers.cuh 
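With nvcc's --extended-lambda flag, helpers::lambda_kernel can launch ad-hoc device work without writing a dedicated kernel, and global_thread_id() supplies the linear index for such 1D launches. A minimal sketch; the scaling function is hypothetical.

void scaleAsync(float* d_data, int n, float factor, cudaStream_t stream){
    if(n <= 0) return;
    const int block = 256;
    const int grid = (n + block - 1) / block;
    helpers::lambda_kernel<<<grid, block, 0, stream>>>(
        [=] __device__ (){
            const std::uint64_t i = helpers::global_thread_id(); // 1D launches only
            if(i < std::uint64_t(n)){
                d_data[i] *= factor;
            }
        }
    ); CUERR;
}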
b/lib/libmarv/src/hpc_helpers/cuda_raiiwrappers.cuh new file mode 100644 index 000000000..04afacad1 --- /dev/null +++ b/lib/libmarv/src/hpc_helpers/cuda_raiiwrappers.cuh @@ -0,0 +1,174 @@ +#ifndef HELPERS_CUDA_RAII_WRAPPERS_CUH +#define HELPERS_CUDA_RAII_WRAPPERS_CUH + + +#ifdef __NVCC__ + +#include "cuda_helpers.cuh" + +class CudaEvent{ +public: + CudaEvent(){ + cudaGetDevice(&deviceId); CUERR; + cudaEventCreate(&event); CUERR; + } + CudaEvent(unsigned int flags){ + cudaGetDevice(&deviceId); CUERR; + cudaEventCreateWithFlags(&event, flags); CUERR; + } + + CudaEvent(const CudaEvent&) = delete; + CudaEvent(CudaEvent&& rhs){ + destroy(); + deviceId = rhs.deviceId; + event = std::exchange(rhs.event, nullptr); + } + + ~CudaEvent(){ + destroy(); + } + + void destroy(){ + if(event != nullptr){ + int d; + cudaGetDevice(&d); CUERR; + cudaSetDevice(deviceId); CUERR; + + cudaEventDestroy(event); CUERR; + event = nullptr; + + cudaSetDevice(d); CUERR; + } + } + + CudaEvent& operator=(const CudaEvent&) = delete; + + CudaEvent& operator=(CudaEvent&& rhs){ + swap(*this, rhs); + + return *this; + } + + friend void swap(CudaEvent& l, CudaEvent& r) noexcept + { + std::swap(l.deviceId, r.deviceId); + std::swap(l.event, r.event); + } + + cudaError_t query() const{ + return cudaEventQuery(event); + } + + cudaError_t record(cudaStream_t stream = 0) const{ + return cudaEventRecord(event, stream); + } + + cudaError_t synchronize() const{ + return cudaEventSynchronize(event); + } + + cudaError_t elapsedTime(float* ms, cudaEvent_t end) const{ + return cudaEventElapsedTime(ms, event, end); + } + + operator cudaEvent_t() const{ + return event; + } + + int getDeviceId() const{ + return deviceId; + } + + cudaEvent_t getEvent() const{ + return event; + } +private: + + int deviceId{}; + cudaEvent_t event{}; +}; + + + + +class CudaStream{ +public: + CudaStream(){ + cudaGetDevice(&deviceId); CUERR; + cudaStreamCreate(&stream); CUERR; + } + CudaStream(unsigned int flags){ + cudaGetDevice(&deviceId); CUERR; + cudaStreamCreateWithFlags(&stream, flags); CUERR; + } + + CudaStream(const CudaStream&) = delete; + CudaStream(CudaStream&& rhs){ + destroy(); + deviceId = rhs.deviceId; + stream = std::exchange(rhs.stream, nullptr); + } + + ~CudaStream(){ + destroy(); + } + + void destroy(){ + if(stream != nullptr){ + int d; + cudaGetDevice(&d); CUERR; + cudaSetDevice(deviceId); CUERR; + + cudaStreamDestroy(stream); CUERR; + stream = nullptr; + + cudaSetDevice(d); CUERR; + } + } + + CudaStream& operator=(const CudaStream&) = delete; + + CudaStream& operator=(CudaStream&& rhs){ + swap(*this, rhs); + + return *this; + } + + friend void swap(CudaStream& l, CudaStream& r) noexcept + { + std::swap(l.deviceId, r.deviceId); + std::swap(l.stream, r.stream); + } + + cudaError_t query() const{ + return cudaStreamQuery(stream); + } + + cudaError_t synchronize() const{ + return cudaStreamSynchronize(stream); + } + + cudaError_t waitEvent(cudaEvent_t event, unsigned int flags) const{ + return cudaStreamWaitEvent(stream, event, flags); + } + + operator cudaStream_t() const{ + return stream; + } + + int getDeviceId() const{ + return deviceId; + } + + cudaStream_t getStream() const{ + return stream; + } +private: + + int deviceId{}; + cudaStream_t stream{}; +}; + +#endif + +#endif /* HELPERS_CUDA_RAII_WRAPPERS_CUH */ diff --git a/lib/libmarv/src/hpc_helpers/custom_thrust_allocators.cuh b/lib/libmarv/src/hpc_helpers/custom_thrust_allocators.cuh new file mode 100644 index 000000000..adf80991d --- /dev/null +++ 
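CudaEvent and CudaStream remember the device they were created on and release their resources in the destructor, so they can live as locals or class members without explicit cleanup. A minimal timing sketch; the callable-based helper is hypothetical, not part of this diff.

template<class LaunchFunc>
float measureKernelMs(LaunchFunc&& launchOnStream){
    CudaStream stream;                    // cudaStreamCreate / cudaStreamDestroy
    CudaEvent begin(cudaEventDefault);
    CudaEvent end(cudaEventDefault);

    begin.record(stream);                 // implicit conversion to cudaStream_t
    launchOnStream(stream.getStream());   // caller enqueues its kernels here
    end.record(stream);
    end.synchronize();

    float ms = 0.0f;
    begin.elapsedTime(&ms, end);          // implicit conversion to cudaEvent_t
    return ms;
}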
b/lib/libmarv/src/hpc_helpers/custom_thrust_allocators.cuh @@ -0,0 +1,116 @@ +#ifndef HELPERS_CUSTOM_THRUST_ALLOCATORS_HPP +#define HELPERS_CUSTOM_THRUST_ALLOCATORS_HPP + +#include +#include +#include + +#ifdef __NVCC__ + +#include + +namespace helpers { + +template +struct ThrustUninitializedDeviceAllocator : thrust::device_malloc_allocator{ + + __host__ __device__ + void construct(T *p) + { + // no-op + } +}; + +template +struct ThrustFallbackDeviceAllocator; + +template +struct ThrustFallbackDeviceAllocator : thrust::device_malloc_allocator { + using value_type = T; + using super_t = thrust::device_malloc_allocator; + + using pointer = typename super_t::pointer; + using size_type = typename super_t::size_type; + using reference = typename super_t::reference; + using const_reference = typename super_t::const_reference; + + pointer allocate(size_type n){ + //std::cerr << "alloc" << std::endl; + + T* ptr = nullptr; + cudaError_t status = cudaMalloc(&ptr, n * sizeof(T)); + if(status == cudaSuccess){ + //std::cerr << "cudaMalloc\n"; + }else{ + cudaGetLastError(); //reset the error of failed allocation + + status = cudaMallocManaged(&ptr, n * sizeof(T)); + if(status != cudaSuccess){ + throw std::bad_alloc(); + } + int deviceId = 0; + status = cudaGetDevice(&deviceId); + if(status != cudaSuccess){ + throw std::bad_alloc(); + } + status = cudaMemAdvise(ptr, n * sizeof(T), cudaMemAdviseSetAccessedBy, deviceId); + if(status != cudaSuccess){ + throw std::bad_alloc(); + } + //std::cerr << "cudaMallocManaged\n"; + } + return thrust::device_pointer_cast(ptr); + } + + void deallocate(pointer ptr, size_type /*n*/){ + //std::cerr << "dealloc" << std::endl; + + cudaError_t status = cudaFree(ptr.get()); + if(status != cudaSuccess){ + throw std::bad_alloc(); + } + } +}; + +template +struct ThrustFallbackDeviceAllocator : thrust::device_malloc_allocator{ + using value_type = T; + + using super_t = thrust::device_malloc_allocator; + + using pointer = typename super_t::pointer; + using size_type = typename super_t::size_type; + using reference = typename super_t::reference; + using const_reference = typename super_t::const_reference; + + pointer allocate(size_type n){ + //std::cerr << "alloc" << std::endl; + + T* ptr = nullptr; + cudaError_t status = cudaMalloc(&ptr, n * sizeof(T)); + if(status == cudaSuccess){ + //std::cerr << "cudaMalloc\n"; + }else{ + cudaGetLastError(); //reset the error of failed allocation + + throw std::bad_alloc(); + } + return thrust::device_pointer_cast(ptr); + } + + void deallocate(pointer ptr, size_type /*n*/){ + //std::cerr << "dealloc" << std::endl; + + cudaError_t status = cudaFree(ptr.get()); + if(status != cudaSuccess){ + throw std::bad_alloc(); + } + } +}; + +} // namespace helpers + +#endif + +#endif /* HELPERS_CUSTOM_THRUST_ALLOCATORS_HPP */ + diff --git a/lib/libmarv/src/hpc_helpers/hashers.cuh b/lib/libmarv/src/hpc_helpers/hashers.cuh new file mode 100644 index 000000000..cf6abd7d8 --- /dev/null +++ b/lib/libmarv/src/hpc_helpers/hashers.cuh @@ -0,0 +1,175 @@ +#ifndef HELPERS_HASHERS_CUH +#define HELPERS_HASHERS_CUH + +#include +#include +#include "cuda_helpers.cuh" + +/*! \brief hash functions +*/ +namespace hashers +{ + +/*! \brief hash function proposed by NVIDIA +*/ +class NvidiaHash +{ + +public: + using key_type = std::uint32_t; + using hash_type = std::uint32_t; + + /*! \brief deleted hash function for types other than explicitly defined + * \tparam T key type + */ + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static hash_type hash(T) = delete; + + /*! 
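Both allocators plug into thrust containers. A minimal sketch, assuming the second template parameter of ThrustFallbackDeviceAllocator is the boolean that enables the cudaMallocManaged fallback (verify against the actual template declaration); the aliases and function below are illustrations only.

#include <thrust/device_vector.h>
#include <thrust/fill.h>

// falls back to managed memory when plain cudaMalloc fails (assumed meaning of 'true')
using FallbackVector = thrust::device_vector<int, helpers::ThrustFallbackDeviceAllocator<int, true>>;
// skips value-initialization of the elements on construction/resize
using UninitializedVector = thrust::device_vector<float, helpers::ThrustUninitializedDeviceAllocator<float>>;

void allocatorExample(std::size_t n){
    FallbackVector counts(n);
    UninitializedVector scores(n);
    thrust::fill(counts.begin(), counts.end(), 0); // thrust algorithms work as usual
}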
\brief hash function + * \param[in] x key to be hashed + * \return hash of \c x + */ + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static hash_type hash(key_type x) noexcept + { + x = (x + 0x7ed55d16) + (x << 12); + x = (x ^ 0xc761c23c) ^ (x >> 19); + x = (x + 0x165667b1) + (x << 5); + x = (x + 0xd3a2646c) ^ (x << 9); + x = (x + 0xfd7046c5) + (x << 3); + x = (x ^ 0xb55a4f09) ^ (x >> 16); + + return x; + } + +}; // class NvidiaHash + +/*! \brief hash function proposed by Mueller +*/ +class MuellerHash +{ + +public: + using key_type = std::uint32_t; + using hash_type = std::uint32_t; + + /*! \brief deleted hash function for types other than explicitly defined + * \tparam T key type + */ + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr hash_type hash(T) = delete; + + /*! \brief hash function + * \param[in] x key to be hashed + * \return hash of \c x + */ + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr hash_type hash(key_type x) noexcept + { + x = ((x >> 16) ^ x) * 0x45d9f3b; + x = ((x >> 16) ^ x) * 0x45d9f3b; + x = ((x >> 16) ^ x); + + return x; + } + +}; // class MuellerHash + + +/*! \brief murmur integer finalizer + * \tparam K key type (\c std::uint32_t or std::uint64_t) +*/ +template +class MurmurHash +{ + +public: + using key_type = K; + using hash_type = K; + + /*! \brief hash function + * \tparam T key type + * \param[in] x key to be hashed + * \return hash of \c x + */ + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr T hash(T x) noexcept + { + static_assert( + std::is_same::value, + "invalid key type"); + + return hash_(x); + } + +private: + /*! \brief hash function + * \param[in] x key to be hashed + * \return hash of \c x + */ + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr std::uint32_t hash_(std::uint32_t x) noexcept + { + x ^= x >> 16; + x *= 0x85ebca6b; + x ^= x >> 13; + x *= 0xc2b2ae35; + x ^= x >> 16; + return x; + } + + /*! \brief hash function + * \param[in] x key to be hashed + * \return hash of \c x + */ + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr std::uint64_t hash_(std::uint64_t x) noexcept + { + x ^= x >> 33; + x *= 0xff51afd7ed558ccd; + x ^= x >> 33; + x *= 0xc4ceb9fe1a85ec53; + x ^= x >> 33; + return x; + } + +}; // class MurmurHash + +/*! \brief identity hash + * \tparam K key type + * \tparam H hash type +*/ +template +class IdentityMap +{ + +public: + using key_type = K; + using hash_type = H; + + static_assert( + std::is_same::value || + std::is_same::value, + "invalid hash type"); + static_assert( + std::is_convertible::value, + "key type not convertible to hash type"); + + /*! \brief hash function + * \param[in] x key to be hashed + * \return hash of \c x + */ + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr hash_type hash(key_type x) noexcept + { + return hash_type{x}; + } + +}; // class IdentityMap + +} // namespace hashers + +#endif /* HELPERS_HASHERS_CUH */ \ No newline at end of file diff --git a/lib/libmarv/src/hpc_helpers/hpc_helpers.h b/lib/libmarv/src/hpc_helpers/hpc_helpers.h new file mode 100644 index 000000000..3c9ca2410 --- /dev/null +++ b/lib/libmarv/src/hpc_helpers/hpc_helpers.h @@ -0,0 +1,50 @@ +#ifndef HELPERS_HPC_HELPERS_H +#define HELPERS_HPC_HELPERS_H + +#include + +// helper for gcc version check +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) + +// debug prinf +#ifndef NDEBUG + #define STRINGIZE_DETAIL(x) #x + #define STRINGIZE(x) STRINGIZE_DETAIL(x) + #define debug_printf(fmt, ...) 
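All hashers are usable from host and device code and are constexpr where possible, which makes them convenient for mapping keys to hash-table slots. A minimal sketch; the k-mer slot helper is hypothetical.

HOSTDEVICEQUALIFIER INLINEQUALIFIER
constexpr std::uint64_t slotForKmer(std::uint64_t kmerCode, std::uint64_t tableSize) noexcept{
    // the murmur finalizer scrambles the bits before the modulo reduces them to a slot
    return hashers::MurmurHash<std::uint64_t>::hash(kmerCode) % tableSize;
}

// slotForKmer(0x1234abcdULL, std::uint64_t(1) << 20) -> slot in [0, 2^20)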
\ + printf("[DEBUG] file " STRINGIZE(__FILE__) \ + ", line " STRINGIZE(__LINE__) ": " STRINGIZE(fmt) "\n", \ + ##__VA_ARGS__); +#else + #define debug_printf(fmt, ...) +#endif + +// safe division +#ifndef SDIV + #define SDIV(x,y)(((x)+(y)-1)/(y)) +#endif + +namespace helpers { + + inline + float B2KB(std::size_t bytes) noexcept { return float(bytes)/1024.0; } + + inline + float B2MB(std::size_t bytes) noexcept { return float(bytes)/1048576.0; } + + inline + float B2GB(std::size_t bytes) noexcept { return float(bytes)/1073741824.0; } + + inline + std::size_t KB2B(float kb) noexcept { return std::size_t(kb*1024); } + + inline + std::size_t MB2B(float mb) noexcept { return std::size_t(mb*1048576); } + + inline + std::size_t GB2B(float gb) noexcept { return std::size_t(gb*1073741824); } + +} // namespace helpers + +#endif /* HELPERS_HPC_HELPERS_H */ diff --git a/lib/libmarv/src/hpc_helpers/io_helpers.h b/lib/libmarv/src/hpc_helpers/io_helpers.h new file mode 100644 index 000000000..aa5ebd6eb --- /dev/null +++ b/lib/libmarv/src/hpc_helpers/io_helpers.h @@ -0,0 +1,113 @@ +#ifndef HELPERS_IO_HELPERS_H +#define HELPERS_IO_HELPERS_H + +#include +#include +#include +#include + +namespace helpers { + +constexpr std::size_t binary_dump_magic_number() noexcept +{ + return 0xAAAAAAAA55555555; +} + +template +void dump_binary( + const std::vector& data, + const std::string& filename) noexcept +{ + std::ofstream ofile(filename, std::ios::binary); + + if(ofile.good()) + { + const std::size_t magic_number = binary_dump_magic_number(); + const std::size_t t_bytes = sizeof(T); + const std::size_t size = data.size(); + + ofile.write((char *) &magic_number, sizeof(std::size_t)); + ofile.write((char *) &t_bytes, sizeof(std::size_t)); + ofile.write((char *) &size, sizeof(std::size_t)); + + ofile.write((char *) data.data(), sizeof(T) * size); + } + else + { + std::cerr << "Unable to open file." << std::endl; + } + + ofile.close(); +} + +template +std::vector load_binary( + const std::string& filename, + std::size_t end = 0, + std::size_t begin = 0) noexcept +{ + std::vector data; + std::ifstream ifile(filename, std::ios::binary); + + if(ifile.is_open()) + { + std::size_t magic_number; + + ifile.read((char *) &magic_number, sizeof(std::size_t)); + + if(magic_number == binary_dump_magic_number()) + { + std::size_t t_bytes; + + ifile.read((char* ) &t_bytes, sizeof(std::size_t)); + + if(t_bytes == sizeof(T)) + { + std::size_t size; + + ifile.read((char* ) &size, sizeof(std::size_t)); + + const std::size_t end_ = (end == 0) ? size : end; + + if(begin <= end_ && end_ <= size) + { + ifile.seekg(ifile.tellg() + static_cast(sizeof(T) * begin)); + + const std::size_t diff = end_ - begin; + + data.resize(diff); + + ifile.read((char *) data.data(), sizeof(T) * diff); + } + else + { + std::cerr << "Invalid file offsets." << std::endl; + data.resize(0); + } + } + else + { + std::cerr << "Type mismatch." << std::endl; + data.resize(0); + } + } + else + { + std::cerr << "Invalid file format." << std::endl; + data.resize(0); + } + } + else + { + std::cerr << "Unable to open file." 
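dump_binary writes a small header (magic number, element size, element count) followed by the raw elements, and load_binary can re-read either the whole vector or an element range. A minimal round-trip sketch; the file name is a placeholder.

#include <cstdint>
#include <vector>

void binaryRoundTrip(){
    const std::vector<std::uint32_t> scores{10, 20, 30, 40, 50};
    helpers::dump_binary(scores, "scores.bin");

    // whole file
    const auto all = helpers::load_binary<std::uint32_t>("scores.bin");

    // elements [1, 4): {20, 30, 40}  (note the argument order: end before begin)
    const auto part = helpers::load_binary<std::uint32_t>("scores.bin", 4, 1);
}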
<< std::endl; + data.resize(0); + } + + ifile.close(); + + return data; +} + +} // namespace helpers + +#endif /* HELPERS_IO_HELPERS_H */ diff --git a/lib/libmarv/src/hpc_helpers/nvtx_markers.cuh b/lib/libmarv/src/hpc_helpers/nvtx_markers.cuh new file mode 100644 index 000000000..d951a6913 --- /dev/null +++ b/lib/libmarv/src/hpc_helpers/nvtx_markers.cuh @@ -0,0 +1,70 @@ +#ifndef HELPERS_NVTX_MARKERS_CUH +#define HELPERS_NVTX_MARKERS_CUH + +/* + Need to link with -lnvToolsExt to use this +*/ + +#ifdef __NVCC__ + +#ifndef NO_NVTOOLSEXT +#include +#endif + +#include +#include + +namespace nvtx { + + inline + void push_range(const std::string& name, int cid){ +#ifndef NO_NVTOOLSEXT + const uint32_t colors_[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff, 0xdeadbeef, 0x12345678, 0xabcdef42 }; + const int num_colors_ = sizeof(colors_)/sizeof(uint32_t); + + int color_id = cid; + color_id = color_id%num_colors_; + nvtxEventAttributes_t eventAttrib; + std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t)); + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = colors_[color_id]; + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.message.ascii = name.c_str(); + nvtxRangePushEx(&eventAttrib); + //std::cout << "push " << name << std::endl; +#endif + } + + inline + void pop_range(const std::string& /*name*/){ +#ifndef NO_NVTOOLSEXT + nvtxRangePop(); + //std::cerr << "pop " << name << std::endl; +#endif + } + + inline + void pop_range(){ +#ifndef NO_NVTOOLSEXT + nvtxRangePop(); + //std::cerr << "pop " << std::endl; +#endif + } + + struct ScopedRange{ + ScopedRange() : ScopedRange("unnamed", 0){} + ScopedRange(const std::string& name, int cid){ + push_range(name, cid); + } + ~ScopedRange(){ + pop_range(); + } + }; + +} // namespace nvtx + +#endif + +#endif /* HELPERS_NVTX_MARKERS_CUH */ diff --git a/lib/libmarv/src/hpc_helpers/packed_types.cuh b/lib/libmarv/src/hpc_helpers/packed_types.cuh new file mode 100644 index 000000000..38f168f8a --- /dev/null +++ b/lib/libmarv/src/hpc_helpers/packed_types.cuh @@ -0,0 +1,611 @@ +#ifndef HELPERS_PACKED_TYPES_CUH +#define HELPERS_PACKED_TYPES_CUH + +#include +#include +#include +#include + +#include "cuda_helpers.cuh" +#include "type_helpers.h" + +namespace packed_types { + +using helpers::uint_t; + +// INFO you can find the actual types as using statements at the end of this file + +// bit-wise reinterpret one fundamental type as another fundamental type +template +HOSTDEVICEQUALIFIER INLINEQUALIFIER +constexpr To reinterpret_as(From from) noexcept +{ + static_assert( + (std::is_fundamental::value || std::is_enum::value), + "Target type must be fundamental enum."); + + static_assert( + (std::is_fundamental::value || std::is_enum::value), + "Input type must be fundamental or enum."); + + union reinterpreter_t + { + From from; + To to; + + HOSTDEVICEQUALIFIER + constexpr reinterpreter_t() noexcept : to(To()) {} + } reinterpreter; + + // TODO add warning for narrowing conversions if desired + reinterpreter.from = from; + return reinterpreter.to; +} + +namespace detail +{ + +template< + std::uint8_t FirstBits, + std::uint8_t SecondBits, + std::uint8_t ThirdBits = 0, + std::uint8_t FourthBits = 0> +class Pack +{ + // memory layout: MSB->padding|fourth|third|second|first<-LSB + +public: + using base_type = uint_t; + +private: + static_assert( + FirstBits != 0 && SecondBits != 0, + "FirstBits and SecondBits both 
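The ranges appear in Nsight Systems timelines when the binary is linked with -lnvToolsExt; with NO_NVTOOLSEXT defined they compile to no-ops, so they can stay in release builds. A minimal sketch; the function is hypothetical.

void processBatch(){
    nvtx::ScopedRange batchRange("processBatch", 0); // popped automatically on scope exit

    nvtx::push_range("upload", 1);
    // cudaMemcpyAsync(...);
    nvtx::pop_range();

    nvtx::push_range("align", 2);
    // kernel launches ...
    nvtx::pop_range();
}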
may not be zero."); + + static_assert( + !(ThirdBits == 0 && FourthBits != 0), + "Third type cannot be zero-width if fourth type has non-zero width."); + + // leftover bits are padding + static constexpr base_type PaddingBits = + (sizeof(base_type) * CHAR_BIT) - (FirstBits + SecondBits + ThirdBits + FourthBits); + + // bit masks for each individual field + static constexpr base_type first_mask = ((base_type{1} << FirstBits) - base_type{1}); + + static constexpr base_type second_mask = + ((base_type{1} << SecondBits) - base_type{1}) << + (FirstBits); + + static constexpr base_type third_mask = + (ThirdBits == 0) ? + base_type{0} : + ((base_type{1} << ThirdBits) - base_type{1}) << + (FirstBits + SecondBits); + + static constexpr base_type fourth_mask = + (FourthBits == 0) ? + base_type{0} : + ((base_type{1} << FourthBits) - base_type{1}) << + (FirstBits + SecondBits + ThirdBits); + + static constexpr base_type padding_mask = + (PaddingBits == 0) ? + base_type{0} : + ((base_type{1} << PaddingBits) - base_type{1}) << + (FirstBits + SecondBits + ThirdBits + FourthBits); + +public: + // number of bits per field + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr std::uint8_t padding_bits() noexcept { return PaddingBits; } + + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr std::uint8_t first_bits() noexcept { return FirstBits; } + + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr std::uint8_t second_bits() noexcept { return SecondBits; } + + template< + base_type B = ThirdBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr std::uint8_t third_bits() noexcept { return ThirdBits; } + + template< + base_type B = FourthBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr std::uint8_t fourth_bits() noexcept { return FourthBits; } + + // CONSTRUCTORS + HOSTDEVICEQUALIFIER + constexpr explicit Pack() noexcept : base_{empty().base_} {} + + template< + class FirstType, + class SecondType, + std::uint8_t B1 = ThirdBits, + std::uint8_t B2 = FourthBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER + constexpr explicit Pack( + FirstType first_, + SecondType second_) noexcept : base_{empty().base_} + { + first(first_); + second(second_); + } + + template< + class FirstType, + class SecondType, + class ThirdType, + std::uint8_t B1 = ThirdBits, + std::uint8_t B2 = FourthBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER + constexpr explicit Pack( + FirstType first_, + SecondType second_, + ThirdType third_) noexcept : base_{empty().base_} + { + first(first_); + second(second_); + third(third_); + } + + template< + class FirstType, + class SecondType, + class ThirdType, + class FourthType, + std::uint8_t B1 = ThirdBits, + std::uint8_t B2 = FourthBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER + constexpr explicit Pack( + FirstType first_, + SecondType second_, + ThirdType third_, + FourthType fourth_) noexcept : base_{empty().base_} + { + first(first_); + second(second_); + third(third_); + fourth(fourth_); + } + + constexpr Pack(const Pack&) noexcept = default; + constexpr Pack(Pack&& pair) noexcept = default; + + // returns an empty pack + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr Pack empty() noexcept + { + return Pack(base_type{0}); + } + + // SETTERS + // by field name + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr void first(First first_) noexcept + { + // TODO find a better solution to prevent truncation + //static_assert( + // sizeof(First) <= sizeof(base_type), + // 
"Input type too wide. Truncation imminent."); + + first(reinterpret_as(first_)); + } + + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr void first(base_type first_) noexcept + { + assert(is_valid_first(first_)); + base_ = (base_ & ~first_mask) + (first_ & first_mask); + } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr void second(Second second_) noexcept + { + static_assert( + sizeof(Second) <= sizeof(base_type), + "Input type too wide. Truncation imminent."); + + second(reinterpret_as(second_)); + } + + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr void second(base_type second_) noexcept + { + assert(is_valid_second(second_)); + constexpr auto shift = FirstBits; + base_ = (base_ & ~second_mask) + ((second_ << shift) & second_mask); + } + + template< + class Third, + std::uint8_t B = ThirdBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr void third(Third third_) noexcept + { + //static_assert( + // sizeof(Third) <= sizeof(base_type), + // "Input type too wide. Truncation imminent."); + + third(reinterpret_as(third_)); + } + + template< + std::uint8_t B = ThirdBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr void third(base_type third_) noexcept + { + assert(is_valid_third(third_)); + constexpr auto shift = FirstBits + SecondBits; + base_ = (base_ & ~third_mask) + ((third_ << shift) & third_mask); + } + + template< + class Fourth, + std::uint8_t B = FourthBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr void fourth(Fourth fourth_) noexcept + { + //static_assert( + // sizeof(Fourth) <= sizeof(base_type), + // "Input type too wide. Truncation imminent."); + + fourth(reinterpret_as(fourth_)); + } + + template< + std::uint8_t B = FourthBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr void fourth(base_type fourth_) noexcept + { + assert(is_valid_fourth(fourth_)); + constexpr auto shift = FirstBits + SecondBits + ThirdBits; + base_ = (base_ & ~fourth_mask) + ((fourth_ << shift) & fourth_mask); + } + + // GETTERS + // by field name + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr T first_as() const noexcept + { + return reinterpret_as(first()); + } + + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr base_type first() const noexcept + { + return (base_ & first_mask); + } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr T second_as() const noexcept + { + return reinterpret_as(second()); + } + + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr base_type second() const noexcept + { + return ((base_ & second_mask) >> (FirstBits)); + } + + template< + class T, + std::uint8_t B = ThirdBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr T third_as() const noexcept + { + return reinterpret_as(third()); + } + + template< + std::uint8_t B = ThirdBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr base_type third() const noexcept + { + return ((base_ & third_mask) >> (FirstBits + SecondBits)); + } + + template< + class T, + std::uint8_t B = FourthBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr T fourth_as() const noexcept + { + return reinterpret_as(fourth()); + } + + template< + std::uint8_t B = FourthBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr base_type fourth() const noexcept + { + return ((base_ & fourth_mask) >> (FirstBits + SecondBits + ThirdBits)); + } + + HOSTDEVICEQUALIFIER 
INLINEQUALIFIER + constexpr base_type base() const noexcept + { + return (base_ & ~padding_mask); + } + + // SETTERS + // set(value) + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + set(T first_) noexcept { first(first_); } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + set(base_type first_) noexcept { first(first_); } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + set(T second_) noexcept { second(second_); } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + set(base_type second_) noexcept { second(second_); } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + set(T third_) noexcept { third(third_); } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + set(base_type third_) noexcept { third(third_); } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + set(T fourth_) noexcept { fourth(fourth_); } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + set(base_type fourth_) noexcept { fourth(fourth_); } + + // GETTERS + // get() + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + get() const noexcept { return first_as(); } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + get() const noexcept { return first(); } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + get() const noexcept { return second_as(); } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + get() const noexcept { return second(); } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + get() const noexcept { return third_as(); } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + get() const noexcept { return third(); } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + get() const noexcept { return fourth_as(); } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr typename std::enable_if_t + get() const noexcept { return fourth(); } + + // INPUT VALIDATORS + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr bool is_valid_first(T first_) noexcept + { + return is_valid_first(reinterpret_as(first_)); + } + + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr bool is_valid_first(base_type first_) noexcept + { + return !(first_ & ~((base_type{1} << FirstBits) - base_type{1})); + } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr bool is_valid_second(T second_) noexcept + { + return is_valid_second(reinterpret_as(second_)); + } + + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr bool is_valid_second(base_type second_) noexcept + { + return !(second_ & ~((base_type{1} << SecondBits) - base_type{1})); + } + + template< + class T, + std::uint8_t B = ThirdBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr bool is_valid_third(T third_) noexcept + { + return is_valid_third(reinterpret_as(third_)); + } + + template< + std::uint8_t B = ThirdBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr bool is_valid_third(base_type third_) noexcept + { + return !(third_ & ~((base_type{1} << ThirdBits) - base_type{1})); + } + + 
template< + class T, + std::uint8_t B = FourthBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr bool is_valid_fourth(T fourth_) noexcept + { + return is_valid_fourth(reinterpret_as(fourth_)); + } + + template< + std::uint8_t B = FourthBits, + class = std::enable_if_t> + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr bool is_valid_fourth(base_type fourth_) noexcept + { + return !(fourth_ & ~((base_type{1} << FourthBits) - base_type{1})); + } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr typename std::enable_if_t + is_valid(T first_) noexcept + { + return is_valid_first(first_); + } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr typename std::enable_if_t + is_valid(T second_) noexcept + { + return is_valid_second(second_); + } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr typename std::enable_if_t + is_valid(T third_) noexcept + { + return is_valid_third(third_); + } + + template + HOSTDEVICEQUALIFIER INLINEQUALIFIER + static constexpr typename std::enable_if_t + is_valid(T fourth_) noexcept + { + return is_valid_fourth(fourth_); + } + + // OPERATORS + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr Pack& operator=(const Pack& pack_) noexcept + { + base_ = (pack_.base_ & ~padding_mask); + return *this; + } + + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr bool operator==(const Pack& pack_) const noexcept + { + return (base_ & ~padding_mask) == (pack_.base_ & ~padding_mask); + } + + HOSTDEVICEQUALIFIER INLINEQUALIFIER + constexpr bool operator!=(const Pack& pack_) const noexcept + { + return (base_ & ~padding_mask) != (pack_.base_ & ~padding_mask); + } + + // CUDA ATOMICS + DEVICEQUALIFIER INLINEQUALIFIER + friend typename std::enable_if_t< + (std::is_same::value || + std::is_same::value), + Pack> atomicCAS( + Pack * address_, + Pack compare_, + Pack val_) noexcept + { + return Pack(atomicCAS(&(address_->base_), compare_.base_, val_.base_)); + } + + DEVICEQUALIFIER INLINEQUALIFIER + friend typename std::enable_if_t< + (std::is_same::value || + std::is_same::value), + Pack> atomicExch( + Pack * address_, + Pack val_) noexcept + { + return Pack(atomicExch(&(address_->base_), val_.base_)); + } + +private: + HOSTDEVICEQUALIFIER + explicit constexpr Pack(base_type base) noexcept : base_{base} {} + + base_type base_; + +}; // class Pack + +} // namespace detail + +// std::get support +template< + std::size_t I, + std::uint8_t B1, + std::uint8_t B2, + std::uint8_t B3, + std::uint8_t B4> +HOSTDEVICEQUALIFIER INLINEQUALIFIER +constexpr uint_t get(detail::Pack pack) noexcept +{ + return pack.template get(); +} + +// packed type aliases +template +using PackedPair = detail::Pack; + +template +using PackedTriple = detail::Pack; + +template +using PackedQuadruple = detail::Pack; + +} // namespace packed_types + +#endif /* HELPERS_PACKED_TYPES_CUH */ diff --git a/lib/libmarv/src/hpc_helpers/peer_access.cuh b/lib/libmarv/src/hpc_helpers/peer_access.cuh new file mode 100644 index 000000000..9e5c27150 --- /dev/null +++ b/lib/libmarv/src/hpc_helpers/peer_access.cuh @@ -0,0 +1,220 @@ +#ifndef HELPERS_PEER_ACCESS_CUH +#define HELPERS_PEER_ACCESS_CUH + +#ifdef __NVCC__ + + #include + #include + #include + + #include "cuda_helpers.cuh" + + namespace helpers { + + enum class PeerAccessDebugMode {Enabled, Disabled}; + + template + struct PeerAccessBase{ + static constexpr PeerAccessDebugMode debugmode = dbg; + + bool resetOnDestruction; + int numGpus; + std::vector deviceIds; + std::vector 
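The pack aliases defined at the end of the file bundle several narrow bit fields into one 32- or 64-bit word while keeping typed accessors and CUDA atomic support. A minimal sketch with made-up bit widths and field meanings (a 22-bit reference index and a 10-bit score bucket); nothing below is taken from this diff.

using RefAndBucket = packed_types::PackedPair<22, 10>; // base type uint32_t

HOSTDEVICEQUALIFIER INLINEQUALIFIER
RefAndBucket makeEntry(std::uint32_t refIndex, std::uint32_t bucket){
    return RefAndBucket{refIndex, bucket}; // asserts in debug builds that both values fit their fields
}

// read back:
//   entry.first()  -> refIndex
//   entry.second() -> bucket
// device code can update packed entries atomically via the atomicCAS/atomicExch overloads above.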
accessMatrix; + std::vector oldEnabledPeerAccesses; + + PeerAccessBase(){ + int numDevices = 0; + cudaGetDeviceCount(&numDevices); + + std::vector ids(numDevices); + for(int i = 0; i < numDevices; i++){ + ids[i] = i; + } + + init(std::move(ids), true); + } + + PeerAccessBase(std::vector deviceIds_, bool resetOnDestruction_){ + init(std::move(deviceIds_), resetOnDestruction_); + } + + void init(std::vector deviceIds_, bool resetOnDestruction_){ + deviceIds = std::move(deviceIds_); + resetOnDestruction = resetOnDestruction_; + cudaGetDeviceCount(&numGpus); + + accessMatrix.resize(numGpus * numGpus); + const int numIds = deviceIds.size(); + for(int i = 0; i < numIds; i++){ + for(int k = 0; k < numIds; k++){ + //device i can access device k? + const int dev1 = deviceIds[i]; + const int dev2 = deviceIds[k]; + cudaDeviceCanAccessPeer(&accessMatrix[dev1 * numGpus + dev2], dev1, dev2); CUERR; + if(debugmode == PeerAccessDebugMode::Enabled){ + std::cerr << "Peer access possible for " << dev1 << " -> " << dev2 << "\n"; + } + } + } + + if(resetOnDestruction){ + //save current enabled peer accesses + oldEnabledPeerAccesses = getEnabledPeerAccesses(); + } + } + + ~PeerAccessBase(){ + if(resetOnDestruction && int(oldEnabledPeerAccesses.size()) == numGpus * numGpus){ + setEnabledPeerAccesses(oldEnabledPeerAccesses); + } + } + + PeerAccessBase(const PeerAccessBase&) = default; + PeerAccessBase(PeerAccessBase&&) = default; + PeerAccessBase& operator=(const PeerAccessBase&) = default; + PeerAccessBase& operator=(PeerAccessBase&&) = default; + + bool canAccessPeer(int device, int peerDevice) const{ + assert(device < numGpus); + assert(peerDevice < numGpus); + + return accessMatrix[device * numGpus + peerDevice] == 1; + } + + void enablePeerAccess(int device, int peerDevice) const{ + if(!canAccessPeer(device, peerDevice)){ + if(debugmode == PeerAccessDebugMode::Enabled){ + std::cerr << "Peer access from " << device << " to " << peerDevice << " is not available and cannot be enabled.\n"; + } + return; + } + + int oldId; cudaGetDevice(&oldId); CUERR; + cudaSetDevice(device); CUERR; + cudaError_t status = cudaDeviceEnablePeerAccess(peerDevice, 0); + if(status != cudaSuccess){ + if(status == cudaErrorPeerAccessAlreadyEnabled){ + if(debugmode == PeerAccessDebugMode::Enabled){ + std::cerr << "Peer access from " << device << " to " << peerDevice << " has already been enabled. This is not a program error\n"; + } + cudaGetLastError(); //reset error state; + }else{ + CUERR; + } + } + cudaSetDevice(oldId); CUERR; + } + + void disablePeerAccess(int device, int peerDevice) const{ + if(!canAccessPeer(device, peerDevice)){ + if(debugmode == PeerAccessDebugMode::Enabled){ + std::cerr << "Peer access from " << device << " to " << peerDevice << " is not available and cannot be disabled.\n"; + } + return; + } + + int oldId; cudaGetDevice(&oldId); CUERR; + cudaSetDevice(device); CUERR; + cudaError_t status = cudaDeviceDisablePeerAccess(peerDevice); + if(status != cudaSuccess){ + if(status == cudaErrorPeerAccessNotEnabled){ + if(debugmode == PeerAccessDebugMode::Enabled){ + std::cerr << "Peer access from " << device << " to " << peerDevice << " has not yet been enabled. 
This is not a program error\n"; + } + cudaGetLastError(); //reset error state; + }else{ + CUERR; + } + } + cudaSetDevice(oldId); CUERR; + } + + void enableAllPeerAccesses(){ + for(int i = 0; i < numGpus; i++){ + for(int k = 0; k < numGpus; k++){ + if(canAccessPeer(i, k)){ + enablePeerAccess(i, k); + } + } + } + } + + void disableAllPeerAccesses(){ + for(int i = 0; i < numGpus; i++){ + for(int k = 0; k < numGpus; k++){ + if(canAccessPeer(i, k)){ + disablePeerAccess(i, k); + } + } + } + } + + std::vector getEnabledPeerAccesses() const{ + int numGpus = 0; + cudaGetDeviceCount(&numGpus); CUERR; + + std::vector result(numGpus * numGpus, 0); + + if(numGpus > 0){ + int oldId; cudaGetDevice(&oldId); CUERR; + + for(int i = 0; i < numGpus; i++){ + cudaSetDevice(i); CUERR; + for(int k = 0; k < numGpus; k++){ + if(canAccessPeer(i,k)){ + cudaError_t status = cudaDeviceDisablePeerAccess(k); + if(status == cudaSuccess){ + //if device i can disable access to device k, it must have been enabled + result[i * numGpus + k] = 1; + //enable again + cudaDeviceEnablePeerAccess(k, 0); CUERR; + }else{ + if(status != cudaErrorPeerAccessNotEnabled){ + CUERR; //error + } + cudaGetLastError(); //reset error state; + } + } + } + } + + cudaSetDevice(oldId); + } + + return result; + } + + std::vector getDisabledPeerAccesses() const{ + std::vector result = getEnabledPeerAccesses(); + for(auto& i : result){ + i = (i == 0) ? 1 : 0; // 0->1, 1->0 + } + return result; + } + + void setEnabledPeerAccesses(const std::vector& vec){ + for(int i = 0; i < numGpus; i++){ + for(int k = 0; k < numGpus; k++){ + if(canAccessPeer(i,k)){ + int flag = vec[i * numGpus + k]; + if(flag == 1){ + enablePeerAccess(i,k); + }else{ + disablePeerAccess(i,k); + } + } + } + } + } + }; + + using PeerAccess = PeerAccessBase; + using PeerAccessDebug = PeerAccessBase; + + } // namespace helpers + +#endif + +#endif /* HELPERS_PEER_ACCESS_CUH */ + diff --git a/lib/libmarv/src/hpc_helpers/simple_allocation.cuh b/lib/libmarv/src/hpc_helpers/simple_allocation.cuh new file mode 100644 index 000000000..a2ab02f31 --- /dev/null +++ b/lib/libmarv/src/hpc_helpers/simple_allocation.cuh @@ -0,0 +1,281 @@ +#ifndef HELPERS_SIMPLE_ALLOCATION_CUH +#define HELPERS_SIMPLE_ALLOCATION_CUH + +#ifdef __NVCC__ + + #include + #include + #include + + #include "cuda_helpers.cuh" + + namespace helpers { + + enum class DataLocation {Host, PinnedHost, Device}; + + template + struct SimpleAllocator; + + template + struct SimpleAllocator{ + T* allocate(size_t elements){ + T* ptr{}; + ptr = new T[elements]; + + assert(ptr != nullptr); + return ptr; + } + + void deallocate(T* ptr){ + delete [] ptr; + } + }; + + template + struct SimpleAllocator{ + T* allocate(size_t elements){ + T* ptr{}; + cudaError_t err = cudaMallocHost(&ptr, elements * sizeof(T)); + if(err != cudaSuccess){ + std::cerr << "SimpleAllocator: Failed to allocate " << (elements) << " * " << sizeof(T) + << " = " << (elements * sizeof(T)) + << " bytes using cudaMallocHost!\n"; + + throw std::bad_alloc(); + } + + assert(ptr != nullptr); + + return ptr; + } + + void deallocate(T* ptr){ + cudaFreeHost(ptr); CUERR; + } + }; + + template + struct SimpleAllocator{ + T* allocate(size_t elements){ + T* ptr; + cudaError_t err = cudaMalloc(&ptr, elements * sizeof(T)); + if(err != cudaSuccess){ + std::cerr << "SimpleAllocator: Failed to allocate " << (elements) << " * " << sizeof(T) + << " = " << (elements * sizeof(T)) + << " bytes using cudaMalloc!\n"; + + throw std::bad_alloc(); + } + + assert(ptr != nullptr); + + return ptr; + } + + 
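PeerAccess enumerates which device pairs can reach each other and can enable or disable the peer mappings in bulk; with resetOnDestruction left enabled, the previous state is restored when the helper is destroyed. A minimal setup sketch; the function is hypothetical.

#include <vector>

void setupPeerAccess(std::vector<int> deviceIds){
    // keep the mappings enabled after this helper goes out of scope
    helpers::PeerAccess peerAccess(std::move(deviceIds), /*resetOnDestruction_=*/false);
    peerAccess.enableAllPeerAccesses();
    // afterwards, cudaMemcpyPeerAsync and direct peer reads between these devices
    // can use the peer mapping where the hardware supports it
}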
void deallocate(T* ptr){ + cudaFree(ptr); CUERR; + } + }; + + + template + struct SimpleAllocation{ + using Allocator = SimpleAllocator; + + static_assert(overprovisioningPercent >= 0, "overprovisioningPercent < 0"); + + static constexpr size_t getOverprovisionedSize(size_t requiredSize){ + if(overprovisioningPercent <= 0){ + return requiredSize; + }else{ + const double onePercent = requiredSize / 100.0; + const size_t extra = onePercent * overprovisioningPercent; + return requiredSize + std::max(std::size_t(1), extra); + } + } + + T* data_{}; + size_t size_{}; + size_t capacity_{}; + + SimpleAllocation() : SimpleAllocation(0){} + SimpleAllocation(size_t size){ + resize(size); + } + + SimpleAllocation(const SimpleAllocation&) = delete; + SimpleAllocation& operator=(const SimpleAllocation&) = delete; + + SimpleAllocation(SimpleAllocation&& rhs) noexcept{ + *this = std::move(rhs); + } + + SimpleAllocation& operator=(SimpleAllocation&& rhs) noexcept{ + if(data_ != nullptr){ + Allocator alloc; + alloc.deallocate(data_); + } + + data_ = rhs.data_; + size_ = rhs.size_; + capacity_ = rhs.capacity_; + + rhs.data_ = nullptr; + rhs.size_ = 0; + rhs.capacity_ = 0; + + return *this; + } + + ~SimpleAllocation(){ + destroy(); + } + + friend void swap(SimpleAllocation& l, SimpleAllocation& r) noexcept{ + using std::swap; + + swap(l.data_, r.data_); + swap(l.size_, r.size_); + swap(l.capacity_, r.capacity_); + } + + void destroy(){ + if(data_ != nullptr){ + Allocator alloc; + alloc.deallocate(data_); + data_ = nullptr; + } + size_ = 0; + capacity_ = 0; + } + + T& operator[](size_t i){ + return get()[i]; + } + + const T& operator[](size_t i) const{ + return get()[i]; + } + + T& at(size_t i){ + if(i < size()){ + return operator[](i); + }else{ + throw std::out_of_range("SimpleAllocation::at out-of-bounds access."); + } + } + + const T& at(size_t i) const{ + if(i < size()){ + return operator[](i); + }else{ + throw std::out_of_range("SimpleAllocation::at out-of-bounds access."); + } + } + + T* operator+(size_t i) const{ + return get() + i; + } + + operator T*(){ + return get(); + } + + operator const T*() const{ + return get(); + } + + + //size is number of elements of type T + //return true if reallocation occured + bool resize(size_t newsize){ + size_ = newsize; + + if(capacity_ < newsize){ + Allocator alloc; + alloc.deallocate(data_); + const size_t newCapacity = getOverprovisionedSize(newsize); + data_ = alloc.allocate(newCapacity); + capacity_ = newCapacity; + + return true; + }else{ + return false; + } + } + + //reserve enough memory for at least max(newCapacity,newSize) elements, and set size to newSize + //return true if reallocation occured + bool reserveAndResize(size_t newCapacity, size_t newSize){ + size_ = newSize; + + newCapacity = std::max(newCapacity, newSize); + + if(capacity_ < newCapacity){ + Allocator alloc; + alloc.deallocate(data_); + data_ = alloc.allocate(newCapacity); + capacity_ = newCapacity; + + return true; + }else{ + return false; + } + } + + T* get() const{ + return data_; + } + + size_t size() const{ + return size_; + } + + size_t& sizeRef(){ + return size_; + } + + size_t sizeInBytes() const{ + return size() * sizeof(T); + } + + size_t capacity() const{ + return capacity_; + } + + size_t capacityInBytes() const{ + return capacity() * sizeof(T); + } + + T* data() const noexcept{ + return data_; + } + + T* begin() const noexcept{ + return data(); + } + + T* end() const noexcept{ + return data() + size(); + } + + bool empty() const noexcept{ + return size() == 0; + } + }; + + 
template + using SimpleAllocationHost = SimpleAllocation; + + template + using SimpleAllocationPinnedHost = SimpleAllocation; + + template + using SimpleAllocationDevice = SimpleAllocation; + + } // namespace helpers + +#endif + +#endif /* HELPERS_SIMPLE_ALLOCATION_CUH */ + diff --git a/lib/libmarv/src/hpc_helpers/timers.cuh b/lib/libmarv/src/hpc_helpers/timers.cuh new file mode 100644 index 000000000..25e83a958 --- /dev/null +++ b/lib/libmarv/src/hpc_helpers/timers.cuh @@ -0,0 +1,288 @@ +#ifndef HELPERS_TIMERS_CUH +#define HELPERS_TIMERS_CUH + +#include +#include +#include + + +namespace helpers { + + class CpuTimer{ + public: + + CpuTimer() : CpuTimer("anonymous timer", std::cout) + { + } + + CpuTimer(const std::string& label) : CpuTimer(label, std::cout) + { + } + + CpuTimer(const std::string& label, std::ostream& outputstream) + : ongoing(true), calculatedDelta(true), elapsedTime(0), + begin(std::chrono::system_clock::now()), + end(std::chrono::system_clock::now()), + os(outputstream), + name(label) + { + } + + ~CpuTimer(){ + if(ongoing){ + stop(); + } + } + + void start(){ + if(!calculatedDelta){ + const std::chrono::duration delta = end - begin; + elapsedTime += delta.count(); + calculatedDelta = true; + } + + begin = std::chrono::system_clock::now(); + end = std::chrono::system_clock::now(); + ongoing = true; + } + + void stop(){ + end = std::chrono::system_clock::now(); + ongoing = false; + calculatedDelta = false; + } + + void reset(){ + ongoing = false; + calculatedDelta = true; + elapsedTime = 0; + } + + double elapsed(){ + if(ongoing){ + stop(); + } + + if(!calculatedDelta){ + const std::chrono::duration delta = end - begin; + elapsedTime += delta.count(); + calculatedDelta = true; + } + + return elapsedTime; + } + + void print(){ + os << "# elapsed time ("<< name <<"): " << elapsed() << "s\n"; + } + + void printGCUPS(double cells){ + double gcups = cells / 1000. / 1000. 
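The aliases above give typed host, pinned-host, and device buffers whose resize() only reallocates when the requested size exceeds the current (overprovisioned) capacity, so buffers can be reused across batches of similar size. A minimal staging sketch; the function and parameter names are hypothetical.

#include <algorithm>

void stageAndUpload(
    helpers::SimpleAllocationPinnedHost<int>& h_staging,
    helpers::SimpleAllocationDevice<int>& d_data,
    const int* h_src, size_t n, cudaStream_t stream
){
    h_staging.resize(n); // returns true only if it had to reallocate
    d_data.resize(n);

    std::copy(h_src, h_src + n, h_staging.begin());
    cudaMemcpyAsync(d_data.data(), h_staging.data(), n * sizeof(int),
                    cudaMemcpyHostToDevice, stream); CUERR;
}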
/ 1000.; + gcups = gcups / (elapsed() / 1000); + os << "# elapsed time ("<< name <<"): " << elapsed() << "s " << gcups << " GCUPS (" << name << ")\n"; + } + + void print_throughput(std::size_t bytes, int num){ + + const double delta = elapsed(); //seconds + + const double gb = ((bytes)*(num))/1073741824.0; + const double throughput = gb/delta; + const double ops = (num)/delta; + + os << "THROUGHPUT: " << delta << " s @ " << gb << " GB " + << "-> " << ops << " elements/s or " << + throughput << " GB/s (" << name << ")\n"; + } + + private: + + bool ongoing; + bool calculatedDelta; + double elapsedTime; + std::chrono::time_point begin; + std::chrono::time_point end; + std::ostream& os; + std::string name; + }; + + + #ifdef __CUDACC__ + + class GpuTimer{ + public: + + GpuTimer() + : GpuTimer(0, "anonymous timer", std::cout) + { + //default stream, current device + } + + GpuTimer(const std::string& label) + : GpuTimer(0, label, std::cout) + { + //default stream, current device + } + + + GpuTimer(cudaStream_t stream, const std::string& label) + : GpuTimer(stream, label, std::cout) + { + //user-defined stream, current device + } + + GpuTimer(cudaStream_t stream, const std::string& label, std::ostream& outputstream) + : calculatedDelta(true), elapsedTime(0), os(outputstream) + { + //user-defined stream, current device + + int curGpu = 0; + cudaGetDevice(&curGpu); + + init(stream, label, curGpu); + start(); + } + + GpuTimer(cudaStream_t stream, const std::string& label, int deviceId) + : GpuTimer(stream, label, deviceId, std::cout) + { + //user-defined stream, user-defined device + } + + GpuTimer(cudaStream_t stream, const std::string& label, int deviceId, std::ostream& outputstream) + : calculatedDelta(true), elapsedTime(0), os(outputstream) + { + //user-defined stream, user-defined device + + init(stream, label, deviceId); + start(); + } + + ~GpuTimer(){ + if(ongoing){ + stop(); + } + + int curGpu = 0; + cudaGetDevice(&curGpu); + cudaSetDevice(gpu); + + cudaEventDestroy(begin); + cudaEventDestroy(end); + + cudaSetDevice(curGpu); + } + + void start(){ + if(!calculatedDelta){ + float delta = 0.0f; + cudaEventSynchronize(end); + cudaEventElapsedTime(&delta, begin, end); + elapsedTime += delta; + calculatedDelta = true; + } + + ongoing = true; + + int curGpu = 0; + cudaGetDevice(&curGpu); + cudaSetDevice(gpu); + + cudaEventRecord(begin, timedstream); + + cudaSetDevice(curGpu); + } + + void stop(){ + int curGpu = 0; + cudaGetDevice(&curGpu); + cudaSetDevice(gpu); + + cudaEventRecord(end, timedstream); + ongoing = false; + calculatedDelta = false; + + cudaSetDevice(curGpu); + } + + void reset(){ + ongoing = false; + calculatedDelta = true; + elapsedTime = 0; + } + + float elapsed(){ + if(ongoing){ + stop(); + } + + if(!calculatedDelta){ + float delta = 0.0f; + cudaEventSynchronize(end); + cudaEventElapsedTime(&delta, begin, end); + elapsedTime += delta; + calculatedDelta = true; + } + + return elapsedTime; + } + + void print(){ + os << "TIMING: " << elapsed() << " ms (" << name << ")\n"; + } + + void printGCUPS(double cells){ + double gcups = cells / 1000. / 1000. 
/ 1000.; + gcups = gcups / (elapsed() / 1000); + os << "TIMING: " << elapsed() << " ms " << gcups << " GCUPS (" << name << ")\n"; + } + + void print_throughput(std::size_t bytes, int num){ + const float delta = elapsed(); + + const double gb = ((bytes)*(num))/1073741824.0; + const double throughput = gb/((delta)/1000.0); + const double ops = (num)/((delta)/1000.0); + + os << "THROUGHPUT: " << delta << " ms @ " << gb << " GB " + << "-> " << ops << " elements/s or " << + throughput << " GB/s (" << name << ")\n"; + } + + private: + + void init(cudaStream_t stream, const std::string& label, int deviceId){ + gpu = deviceId; + timedstream = stream; + name = label; + + int curGpu = 0; + cudaGetDevice(&curGpu); + cudaSetDevice(gpu); + + cudaEventCreate(&begin); + cudaEventCreate(&end); + + cudaSetDevice(curGpu); + } + + + bool ongoing; + bool calculatedDelta; + int gpu; + float elapsedTime; + cudaStream_t timedstream; + cudaEvent_t begin; + cudaEvent_t end; + std::ostream& os; + std::string name; + }; + + + + #endif + +} //namespace helpers + + +#endif /* HELPERS_TIMERS_CUH */ diff --git a/lib/libmarv/src/hpc_helpers/type_helpers.h b/lib/libmarv/src/hpc_helpers/type_helpers.h new file mode 100644 index 000000000..f942424fd --- /dev/null +++ b/lib/libmarv/src/hpc_helpers/type_helpers.h @@ -0,0 +1,48 @@ +#ifndef HELPERS_TYPE_HELPERS_H +#define HELPERS_TYPE_HELPERS_H + +#include +#include + +namespace helpers { + +template +using uint_t = + typename std::conditional< + (Bits > 64), + std::false_type, + typename std::conditional< + (Bits > 32), + std::uint64_t, + typename std::conditional< + (Bits > 16), + std::uint32_t, + typename std::conditional< + (Bits > 8), + std::uint16_t, + std::uint8_t>::type>::type>::type>::type; + +template +class no_init_t +{ +public: + static_assert(std::is_fundamental::value && + std::is_arithmetic::value, + "wrapped type must be a fundamental, numeric type"); + + //do nothing + constexpr no_init_t() noexcept {} + + //convertible from a T + constexpr no_init_t(T value) noexcept: v_(value) {} + + //act as a T in all conversion contexts + constexpr operator T () const noexcept { return v_; } + +private: + T v_; +}; + +} // namespace helpers + +#endif /* HELPERS_TYPE_HELPERS_H */ diff --git a/lib/libmarv/src/hpc_helpers/utility_kernels.cuh b/lib/libmarv/src/hpc_helpers/utility_kernels.cuh new file mode 100644 index 000000000..b33a5fbaf --- /dev/null +++ b/lib/libmarv/src/hpc_helpers/utility_kernels.cuh @@ -0,0 +1,227 @@ +#ifndef HELPERS_UTILITY_KERNELS_CUH +#define HELPERS_UTILITY_KERNELS_CUH + +#include "cuda_helpers.cuh" +#include "hpc_helpers.h" + +#ifdef __NVCC__ + +namespace helpers { + +/* + Assigns value to the first nElements elements of data +*/ +template +GLOBALQUALIFIER +void fill_kernel(T* data, int nElements, T value){ + int index = threadIdx.x + blockDim.x * blockIdx.x; + const int stride = blockDim.x * gridDim.x; + + for(; index < nElements; index += stride){ + data[index] = value; + } +} + +template +void call_fill_kernel_async(T* d_data, int elements, const T& value, cudaStream_t stream){ + if(elements == 0){ + return; + } + + const int blocksize = 128; + const int blocks = SDIV(elements, blocksize); + dim3 block(blocksize,1,1); + dim3 grid(blocks,1,1); + + fill_kernel<<>>(d_data, elements, value); CUERR; +} + +/* + Assign value to data[index] +*/ +template +GLOBALQUALIFIER +void set_kernel(T* data, int index, T value){ + data[index] = value; +} + +template +void call_set_kernel_async(T* d_data, int index, const T& value, cudaStream_t stream){ + 
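// The single-thread launch below writes one device element in stream order without a
// separate memcpy. Illustrative calls (d_counter, d_scores, n and stream are
// hypothetical names):
//   helpers::call_set_kernel_async(d_counter, 0, 0, stream);      // d_counter[0] = 0
//   helpers::call_fill_kernel_async(d_scores, n, 0.0f, stream);   // zero first n floats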
set_kernel<<<1, 1, 0, stream>>>(d_data, index, value); CUERR; +} + +/* + Gather input elements at positions given by indices in output array. + input and output must not overlap + n is the number of indices. +*/ +template +GLOBALQUALIFIER +void compact_kernel(Iter1 out, Iter2 in, IndexIter indices, int n){ + + for(int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x){ + const int srcindex = *(indices + i); + *(out + i) = *(in + srcindex); + } +} + +template +GLOBALQUALIFIER +void compact_kernel_nptr(Iter1 out, Iter2 in, IndexIter indices, const int* Nptr){ + + for(int i = threadIdx.x + blockIdx.x * blockDim.x; i < *Nptr; i += blockDim.x * gridDim.x){ + const int srcindex = *(indices + i); + *(out + i) = *(in + srcindex); + } +} + +template +void call_compact_kernel_async(Iter1 d_out, Iter2 d_in, IndexIter d_indices, int n, cudaStream_t stream){ + if(n <= 0){ + return; + } + + dim3 block(128,1,1); + dim3 grid(SDIV(n, block.x),1,1); + + compact_kernel<<>>(d_out, d_in, d_indices, n); CUERR; +} + +template +void call_compact_kernel_async(Iter1 d_out, Iter2 d_in, IndexIter d_indices, const int* Nptr, int maxN, cudaStream_t stream){ + if(maxN <= 0){ + return; + } + + dim3 block(128,1,1); + dim3 grid(SDIV(maxN, block.x),1,1); + + compact_kernel_nptr<<>>(d_out, d_in, d_indices, Nptr); CUERR; +} + + +template +GLOBALQUALIFIER +void transpose_kernel(T* __restrict__ output, const T* __restrict__ input, int numRows, int numColumns, int columnpitchelements){ + constexpr int tilesize = 32; + __shared__ T tile[tilesize][tilesize+1]; + + const int requiredTilesX = SDIV(numColumns, tilesize); + const int requiredTilesY = SDIV(numRows, tilesize); + const int dstNumRows = numColumns; + const int dstNumColumns = numRows; + + for(int blockId = blockIdx.x; blockId < requiredTilesX * requiredTilesY; blockId += gridDim.x){ + const int tile_id_x = blockId % requiredTilesX; + const int tile_id_y = blockId / requiredTilesX; + + for(int tile_x = threadIdx.x; tile_x < tilesize; tile_x += blocksize_x){ + for(int tile_y = threadIdx.y; tile_y < tilesize; tile_y += blocksize_y){ + const int srcColumn = tile_id_x * tilesize + tile_x; + const int srcRow = tile_id_y * tilesize + tile_y; + + if(srcColumn < numColumns && srcRow < numRows){ + tile[tile_y][tile_x] = input[srcRow * columnpitchelements + srcColumn]; + } + } + } + + __syncthreads(); //wait for tile to be loaded + + for(int tile_x = threadIdx.x; tile_x < tilesize; tile_x += blocksize_x){ + for(int tile_y = threadIdx.y; tile_y < tilesize; tile_y += blocksize_y){ + const int dstColumn = tile_id_y * tilesize + tile_x; + const int dstRow = tile_id_x * tilesize + tile_y; + + if(dstRow < dstNumRows && dstColumn < dstNumColumns){ + output[dstRow * dstNumColumns + dstColumn] = tile[tile_x][tile_y]; + } + } + } + + __syncthreads(); //wait before reusing shared memory + } +} + +/* + Transpose input and save to output. 
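    The kernel stages 32x32 tiles through shared memory (declared as tile[32][33];
    the extra column avoids shared-memory bank conflicts when the tile is read back
    transposed). The result has numColumns rows of numRows elements each.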
+ The size in bytes of each row in input must be columnpitchelements * sizeof(T) +*/ +template +void call_transpose_kernel(T* d_output, const T* d_input, int numRows, int numColumns, int columnpitchelements, cudaStream_t stream){ + if(numRows == 0 || numColumns == 0){ + return; + } + + dim3 block(32,8); + const int blocks_x = SDIV(numColumns, block.x); + const int blocks_y = SDIV(numRows, block.y); + dim3 grid(min(65535, blocks_x * blocks_y), 1, 1); + + transpose_kernel<32,8><<>>(d_output, + d_input, + numRows, + numColumns, + columnpitchelements); CUERR; +} + + +//copy n elements from range beginning at inputiter to range beginning at outputiter +//ranges must not overlap +template +__global__ +void copy_n_kernel(Iter1 inputiter, int N, Iter2 outputiter){ + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int stride = blockDim.x * gridDim.x; + + for(int i = tid; i < N; i += stride){ + *(outputiter + i) = *(inputiter + i); + } +} + +template +__global__ +void copy_n_kernel(Iter1 inputiter, LimitIter Nptr, Iter2 outputiter){ + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int stride = blockDim.x * gridDim.x; + + const int N = *Nptr; + + for(int i = tid; i < N; i += stride){ + *(outputiter + i) = *(inputiter + i); + } +} + +template +void call_copy_n_kernel(Iter1 d_inputiter, int N, Iter2 d_outputiter, cudaStream_t stream){ + if(N <= 0) return; + + dim3 block(256, 1, 1); + dim3 grid(SDIV(N, block.x), 1, 1); + + copy_n_kernel<<>>( + d_inputiter, + N, + d_outputiter + ); CUERR; +} + +template +void call_copy_n_kernel(Iter1 d_inputiter, LimitIter d_Nptr, Iter2 d_outputiter, int maxN, cudaStream_t stream){ + if(maxN <= 0) return; + + dim3 block(256, 1, 1); + dim3 grid(SDIV(maxN, block.x), 1, 1); + + copy_n_kernel<<>>( + d_inputiter, + d_Nptr, + d_outputiter + ); CUERR; +} + +} // namespace helpers + +#endif + +#endif /* HELPERS_UTILITY_KERNELS_CUH */ diff --git a/lib/libmarv/src/kernelhelpers.cuh b/lib/libmarv/src/kernelhelpers.cuh new file mode 100644 index 000000000..c738f9e86 --- /dev/null +++ b/lib/libmarv/src/kernelhelpers.cuh @@ -0,0 +1,94 @@ +#include + +namespace cudasw4{ + + template + __host__ __device__ + SequenceLengthT getPaddedQueryLength(SequenceLengthT queryLength){ + //pad query length to char4, add warpsize char4 border. + return ((queryLength + 4 - 1) / 4) * 4 + sizeof(char4) * 32; + } + + typedef union + { + int32_t i; + uint32_t u; + short2 s2; + } data_pack; + + typedef short2 score_type; + typedef data_pack score_pack; + + template = true> + __device__ + T warp_max_reduce_broadcast(unsigned int mask, T val){ + #if __CUDA_ARCH__ >= 800 + return __reduce_max_sync(mask, val); + #else + for (int offset = 16; offset > 0; offset /= 2){ + T tmp = __shfl_down_sync(mask, val, offset); + val = tmp > val ? tmp : val; + } + return __shfl_sync(mask, val, 0); + #endif + } + + template 4), bool> = true> + __device__ + T warp_max_reduce_broadcast(unsigned int mask, T val){ + for (int offset = 16; offset > 0; offset /= 2){ + T tmp = __shfl_down_sync(mask, val, offset); + val = tmp > val ? 
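// Warp-level max reduction: the shuffle offset halves each round (16, 8, 4, 2, 1),
// so five __shfl_down_sync steps cover a full 32-lane warp, and the final __shfl_sync
// broadcasts lane 0's maximum to all lanes in the mask. The <=4-byte overload above
// can instead use the __reduce_max_sync intrinsic on compute capability 8.0 and newer.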
tmp : val; + } + return __shfl_sync(mask, val, 0); + } + + inline __device__ short2 viaddmax(const short2 a_in, const short2 b_in, const short2 c_in) { + score_pack a, b, c, d; + a.s2 = a_in; + b.s2 = b_in; + c.s2 = c_in; + d.u = __viaddmax_s16x2(a.u, b.u, c.u); + return(d.s2); + } + + inline __device__ short2 viadd(const short2 a_in, const short2 b_in) { + score_pack a, b, d; + a.s2 = a_in; + b.s2 = b_in; + d.u = __vadd2(a.u, b.u); + return(d.s2); + } + + inline __device__ short2 vimax(const short2 a_in, const short2 b_in) { + score_pack a, b, d; + a.s2 = a_in; + b.s2 = b_in; + d.u = __vmaxs2(a.u, b.u); + return(d.s2); + } + + inline __device__ short2 vimax3(const short2 a_in, const short2 b_in, const short2 c_in) { + score_pack a, b, c, d; + a.s2 = a_in; + b.s2 = b_in; + c.s2 = c_in; + d.u = __vimax3_s16x2_relu(a.u, b.u, c.u); + return(d.s2); + } + + inline __device__ short2 shfl_up_2xint16(const uint32_t bitmap, const short2 value, const int lane, const int group_size) { + score_pack v, res; + v.s2 = value; + res.u =__shfl_up_sync(bitmap, v.u, lane, group_size); + return(res.s2); + } + + inline __device__ short2 shfl_down_2xint16(const uint32_t bitmap, const short2 value, const int lane, const int group_size) { + score_pack v, res; + v.s2 = value; + res.u =__shfl_down_sync(bitmap, v.u, lane, group_size); + return(res.s2); + } + +} //namespace cudasw4 \ No newline at end of file diff --git a/lib/libmarv/src/kernels.cuh b/lib/libmarv/src/kernels.cuh new file mode 100644 index 000000000..a492a8c15 --- /dev/null +++ b/lib/libmarv/src/kernels.cuh @@ -0,0 +1,1158 @@ +#ifndef KERNELS_CUH +#define KERNELS_CUH + +#include "blosum.hpp" +#include "kernelhelpers.cuh" + +#define cBLOSUM62_dev deviceBlosum + +using namespace cudasw4; + +//THIS IS ONLY USED BY THE NON-PSSM KERNELS. 
INCREASE TO SUPPORT LONGER QUERIES.MAX ALLOWED QUERY LENGTH IS 4 * (length of constantQuery4) +//REQUIRES cudaMemcpyToSymbolAsync IN cudasw4.cuh +__constant__ char4 constantQuery4[2048]; + + +//################################################################################################################## +// MANY PASS HALF2 +//################################################################################################################## + +template __global__ +void __launch_bounds__(256,2) GaplessFilter_row_wise_many_pass_half2( + __grid_constant__ const char * const devChars, + __grid_constant__ ScoreOutputIterator const devAlignmentScores, + __grid_constant__ __half2 * const devTempHcol2, + __grid_constant__ const size_t* const devOffsets, + __grid_constant__ const SequenceLengthT* const devLengths, + __grid_constant__ PositionsIterator const d_positions_of_selected_lengths, + __grid_constant__ const int numSelected, + __grid_constant__ const SequenceLengthT queryLength +) { + static_assert(blocksize % group_size == 0); + static_assert(group_size == 32); + + __builtin_assume(blockDim.x == blocksize); + __builtin_assume(blockDim.x % group_size == 0); + __builtin_assume(group_size == 32); + + __shared__ __half2 shared_BLOSUM62[21][21*21]; + int subject[numRegs]; + + const int blid = blockIdx.x; + const int thid = threadIdx.x; + const int group_id = thid%group_size; + + int check_last = blockDim.x/group_size; + int check_last2 = 0; + if (blid == gridDim.x-1) { + if (numSelected % (2*blockDim.x/group_size)) { + check_last = (numSelected/2) % (blockDim.x/group_size); + check_last2 = numSelected%2; + check_last = check_last + check_last2; + } + } + check_last = check_last * group_size; + + const SequenceLengthT length_S0 = devLengths[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*((thid%check_last)/group_size)]]; + const size_t base_S0 = devOffsets[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*((thid%check_last)/group_size)]]-devOffsets[0]; + + SequenceLengthT length_S1 = devLengths[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*((thid%check_last)/group_size)+1]]; + size_t base_S1 = devOffsets[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*((thid%check_last)/group_size)+1]]-devOffsets[0]; + + if (blid == gridDim.x-1) + if (check_last2) + if (((thid%check_last) >= check_last-group_size) && ((thid%check_last) < check_last)) { + length_S1 = length_S0; + base_S1 = base_S0; + } + + + const SequenceLengthT length = max(length_S0, length_S1); + //const int length = __reduce_max_sync(0xFFFFFFFF, temp_length); + + __half2 H_temp_out{}, H_temp_in{}; + __half2 penalty_temp0, penalty_temp1; + __half2 penalty_here_array[numRegs]; + __half2 maximum = __float2half2_rn(0.0); + const __half2 ZERO = __float2half2_rn(0.0); + __half2 penalty_diag = ZERO; + H_temp_out.x = -1000; H_temp_out.y = -1000; + char4 new_query_letter4 = constantQuery4[0]; + char query_letter = new_query_letter4.x; + + + const size_t numGroupsPerBlock = blockDim.x / group_size; + const size_t groupIdInBlock = threadIdx.x / group_size; + const size_t groupIdInGrid = numGroupsPerBlock * size_t(blockIdx.x) + groupIdInBlock; + const size_t base_3 = groupIdInGrid * size_t(getPaddedQueryLength(queryLength)); //temp of both subjects is packed into half2 + + __half2 * devTempHcol = (half2*)(&devTempHcol2[base_3]); + + const int passes = (length + (group_size*numRegs) - 1) / (group_size*numRegs); + + int offset_out = group_id; + int offset_in = group_id; + + + auto 
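// Each thread lane scores two database sequences at once: their residues are packed
// into a single index (s0 + 21*s1), and shared_BLOSUM62 stores, per query letter, all
// 21*21 residue pairs as __half2 so that one lookup yields both substitution scores.
// The per-sequence score maxima accumulate in the .x/.y halves of 'maximum'.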
init_local_score_profile = [&]() { + for (int i=thid; i<21*21; i+=blockDim.x) { + __half2 temp0; + temp0.x = cBLOSUM62_dev[21*(i/21)+(i%21)]; + for (int j=0; j<21; j++) { + temp0.y = cBLOSUM62_dev[21*(i/21)+j]; + shared_BLOSUM62[i/21][21*(i%21)+j]=temp0; + } + } + __syncthreads(); + }; + + auto load_subject_regs = [&](const auto& offset_isc) { + for (int i=0; i= length_S0) subject[i] = 20; + else subject[i] = devChars[offset_isc+base_S0+numRegs*(thid%group_size)+i]; + + if (offset_isc+numRegs*(thid%group_size)+i >= length_S1) subject[i] += 20*21; + else subject[i] += 21*devChars[offset_isc+base_S1+numRegs*(thid%group_size)+i]; + } + }; + + + auto calc32_local_affine_float = [&](){ + const __half2* const sbt_row = shared_BLOSUM62[query_letter]; + + const __half2 score2_0 = sbt_row[subject[0]]; + penalty_temp0 = penalty_here_array[0]; + penalty_here_array[0] = __hmax2(__hadd2(penalty_diag,score2_0),ZERO); + + const __half2 score2_1 = sbt_row[subject[1]]; + penalty_temp1 = penalty_here_array[1]; + penalty_here_array[1] = __hmax2(__hadd2(penalty_temp0,score2_1),ZERO); + + maximum = __hmax2(maximum, __hmax2(penalty_here_array[1],penalty_here_array[0])); + + #pragma unroll + for (int i=1; i= 0); + // assert(x < currentQueryLengthWithPadding); + // // if(x >= currentQueryLengthWithPadding){ + // // if(groupIdInBlock == 0){ + // // printf("error tid %d, x %d len %d, paddedlen %d, line %d\n", + // // threadIdx.x, x, queryLength, currentQueryLengthWithPadding, line); + // // } + // // } + }; + + + // FIRST PASS (of many passes) + // Note first pass has always full seqeunce length + for (int i=0; i0) shuffle_H_temp_out(); + set_H_temp_out(); + query_letter = new_query_letter4.y; + shuffle_affine_penalty(ZERO); + + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out(); + query_letter = new_query_letter4.z; + shuffle_affine_penalty(ZERO); + + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out(); + query_letter = new_query_letter4.w; + shuffle_affine_penalty(ZERO); + + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out(); + + //each k iteration computes 4 rows. 
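// (one char4 of the query, i.e. four query letters, is consumed per k iteration)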
after 32 rows have been computed, store those 32 values of right border to temp storage + if((k+4) % 32 == 0){ + checkHindex(offset_out, queryLength, __LINE__); + devTempHcol[offset_out]=H_temp_out; + offset_out += group_size; + } + new_query_letter4 = constantQuery4[(k/4)+1]; + query_letter = new_query_letter4.x; + shuffle_affine_penalty(ZERO); + + } + + if (queryLength%4 >= 1) { + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out(); + query_letter = new_query_letter4.y; + shuffle_affine_penalty(ZERO); + } + + if (queryLength%4 >= 2) { + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out(); + query_letter = new_query_letter4.z; + shuffle_affine_penalty(ZERO); + } + if (queryLength%4 >= 3) { + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out(); + } + int final_out = queryLength % 32; + int from_thread_id = 32 - final_out; + + if (thid>=from_thread_id) { + checkHindex(offset_out, queryLength, __LINE__); + devTempHcol[offset_out-from_thread_id]=H_temp_out; + } + + + //middle passes + for (int pass = 1; pass < passes-1; pass++) { + + H_temp_out.x = -1000; H_temp_out.y = -1000; + new_query_letter4 = constantQuery4[0]; + query_letter = new_query_letter4.x; + + offset_out = group_id; + offset_in = group_id; + checkHindex(offset_in, queryLength, __LINE__); + H_temp_in = devTempHcol[offset_in]; + offset_in += group_size; + + penalty_diag = ZERO; + for (int i=0; i0) shuffle_H_temp_out(); + set_H_temp_out(); + query_letter = new_query_letter4.y; + shuffle_affine_penalty(H_temp_in); + shuffle_H_temp_in(); + + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out(); + query_letter = new_query_letter4.z; + shuffle_affine_penalty(H_temp_in); + shuffle_H_temp_in(); + + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out(); + query_letter = new_query_letter4.w; + shuffle_affine_penalty(H_temp_in); + shuffle_H_temp_in(); + + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out(); + shuffle_affine_penalty(H_temp_in); + shuffle_H_temp_in(); + + //each k iteration computes 4 rows. 
after 32 rows have been computed, store those 32 values of right border to temp storage + //and load the temp values of the previous pass + if((k+4) % 32 == 0){ + checkHindex(offset_out, queryLength, __LINE__); + devTempHcol[offset_out]=H_temp_out; + offset_out += group_size; + + checkHindex(offset_in, queryLength, __LINE__); + H_temp_in = devTempHcol[offset_in]; + offset_in += group_size; + } + new_query_letter4 = constantQuery4[(k/4)+1]; + query_letter = new_query_letter4.x; + } + + if (queryLength%4 >= 1) { + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out(); + query_letter = new_query_letter4.y; + shuffle_affine_penalty(H_temp_in); + shuffle_H_temp_in(); + } + + if (queryLength%4 >= 2) { + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out(); + query_letter = new_query_letter4.z; + shuffle_affine_penalty(H_temp_in); + shuffle_H_temp_in(); + } + if (queryLength%4 >= 3) { + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out(); + } + + int final_out = queryLength % 32; + int from_thread_id = 32 - final_out; + + if (thid>=from_thread_id) { + checkHindex(offset_out, queryLength, __LINE__); + devTempHcol[offset_out-from_thread_id]=H_temp_out; + } + } + + // Final pass + H_temp_out.x = -1000; H_temp_out.y = -1000; + new_query_letter4 = constantQuery4[0]; + query_letter = new_query_letter4.x; + + offset_in = group_id; + checkHindex(offset_in, queryLength, __LINE__); + H_temp_in = devTempHcol[offset_in]; + offset_in += group_size; + + + penalty_diag = ZERO; + for (int i=0; i= 1) { + calc32_local_affine_float(); + query_letter = new_query_letter4.y; + shuffle_affine_penalty(H_temp_in); + shuffle_H_temp_in(); + } + + if (queryLength%4 >= 2) { + calc32_local_affine_float(); + query_letter = new_query_letter4.z; + shuffle_affine_penalty(H_temp_in); + shuffle_H_temp_in(); + } + if (queryLength%4 >= 3) { + calc32_local_affine_float(); + } + + + //group-wide max-reduce + for (int offset=group_size/2; offset>0; offset/=2){ + maximum = __hmax2(maximum,__shfl_down_sync(0xFFFFFFFF,maximum,offset,group_size)); + } + + if (!group_id) { + if (blid < gridDim.x-1) { + devAlignmentScores[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*(thid/group_size)]] = maximum.y; + devAlignmentScores[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*(thid/group_size)+1]] = maximum.x; + } else { + devAlignmentScores[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*((thid%check_last)/group_size)]] = maximum.y; + if (!check_last2 || (thid%check_last) < check_last-group_size) devAlignmentScores[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*((thid%check_last)/group_size)+1]] = maximum.x; + } + } +} + + + +template +void call_GaplessFilter_row_wise_many_pass_half2( + const char * const devChars, + ScoreOutputIterator const devAlignmentScores, + __half2* const devTempHcol2, + const size_t* const devOffsets, + const SequenceLengthT* const devLengths, + PositionsIterator const d_positions_of_selected_lengths, + const int numSelected, + const int queryLength, + cudaStream_t stream +){ + constexpr int groupsPerBlock = blocksize / group_size; + constexpr int alignmentsPerGroup = 2; + constexpr int alignmentsPerBlock = groupsPerBlock * alignmentsPerGroup; + + //int smem = sizeof(__half2) * hostBlosumDim * hostBlosumDim * hostBlosumDim; + int smem = 0; + auto kernel = GaplessFilter_row_wise_many_pass_half2; + //cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); + + dim3 grid = 
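// Each block hosts blocksize/group_size groups and every group aligns two subjects
// (half2 packing), so the grid covers ceil(numSelected / alignmentsPerBlock) blocks;
// e.g. blocksize 256 with group_size 32 gives 8 groups, i.e. 16 alignments per block.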
(numSelected + alignmentsPerBlock - 1) / alignmentsPerBlock; + + //std::cout << "call_GaplessFilter_row_wise_many_pass_half2 gridsize " << grid.x << " blocksize " << blocksize << " groupsize " << group_size << " numRegs " << numRegs << "\n"; + + kernel<<>>( + devChars, + devAlignmentScores, + devTempHcol2, + devOffsets, + devLengths, + d_positions_of_selected_lengths, + numSelected, + queryLength + ); +} + + + + + + + + + + + + +//################################################################################################################## +// SINGLE PASS HALF2 +//################################################################################################################## + + + +template __global__ +void __launch_bounds__(256,2) GaplessFilter_row_wise_half2( + __grid_constant__ const char * const devChars, + __grid_constant__ ScoreOutputIterator const devAlignmentScores, + __grid_constant__ const size_t* const devOffsets, + __grid_constant__ const SequenceLengthT* const devLengths, + __grid_constant__ PositionsIterator const d_positions_of_selected_lengths, + __grid_constant__ const int numSelected, + __grid_constant__ const SequenceLengthT queryLength +) { + + static_assert(blocksize % group_size == 0); + __builtin_assume(blockDim.x == blocksize); + __builtin_assume(blockDim.x % group_size == 0); + + + + __shared__ __half2 shared_BLOSUM62[21][21*21]; + int subject[numRegs]; + + const int blid = blockIdx.x; + const int thid = threadIdx.x; + const int group_id = thid%group_size; + //int offset = group_id + group_size; + + int check_last = blockDim.x/group_size; + int check_last2 = 0; + if (blid == gridDim.x-1) { + if (numSelected % (2*blockDim.x/group_size)) { + check_last = (numSelected/2) % (blockDim.x/group_size); + check_last2 = numSelected%2; + check_last = check_last + check_last2; + } + } + check_last = check_last * group_size; + + const SequenceLengthT length_S0 = devLengths[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*((thid%check_last)/group_size)]]; + const size_t base_S0 = devOffsets[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*((thid%check_last)/group_size)]]-devOffsets[0]; + + SequenceLengthT length_S1 = length_S0; + size_t base_S1 = base_S0; + if ((blid < gridDim.x-1) || (!check_last2) || ((thid%check_last) < check_last-group_size) || ((thid%check_last) >= check_last)) { + length_S1 = devLengths[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*((thid%check_last)/group_size)+1]]; + base_S1 = devOffsets[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*((thid%check_last)/group_size)+1]]-devOffsets[0]; + } + + SequenceLengthT temp_length = max(length_S0, length_S1); + const SequenceLengthT length = warp_max_reduce_broadcast(0xFFFFFFFF, temp_length); + + __half2 penalty_temp0, penalty_temp1; + __half2 penalty_here_array[numRegs]; + __half2 maximum = __float2half2_rn(0.0); + const __half2 ZERO = __float2half2_rn(0.0); + __half2 penalty_diag = ZERO; + + + auto init_local_score_profile = [&]() { + for (int i=thid; i<21*21; i+=blockDim.x) { + __half2 temp0; + temp0.x = cBLOSUM62_dev[21*(i/21)+(i%21)]; + for (int j=0; j<21; j++) { + temp0.y = cBLOSUM62_dev[21*(i/21)+j]; + shared_BLOSUM62[i/21][21*(i%21)+j]=temp0; + } + } + __syncthreads(); + }; + + auto load_subject_regs = [&](const auto& offset_isc) { + for (int i=0; i= length_S0) subject[i] = 20; + else subject[i] = devChars[offset_isc+base_S0+numRegs*(thid%group_size)+i]; + + if (offset_isc+numRegs*(thid%group_size)+i >= length_S1) subject[i] += 20*21; 
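// letter index 20 addresses the extra row/column of the 21x21 substitution table;
// because the packed index is s0 + 21*s1, adding 20*21 marks a position that lies
// past the end of the second sequence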
+ else subject[i] += 21*devChars[offset_isc+base_S1+numRegs*(thid%group_size)+i]; + } + }; + + + char4 new_query_letter4 = constantQuery4[0]; + char query_letter = new_query_letter4.x; + + auto calc32_local_affine_float = [&](){ + const __half2* const sbt_row = shared_BLOSUM62[query_letter]; + + const __half2 score2_0 = sbt_row[subject[0]]; + penalty_temp0 = penalty_here_array[0]; + penalty_here_array[0] = __hmax2(__hadd2(penalty_diag,score2_0),ZERO); + + const __half2 score2_1 = sbt_row[subject[1]]; + penalty_temp1 = penalty_here_array[1]; + penalty_here_array[1] = __hmax2(__hadd2(penalty_temp0,score2_1),ZERO); + + maximum = __hmax2(maximum, __hmax2(penalty_here_array[1],penalty_here_array[0])); + + #pragma unroll + for (int i=1; i= 1) { + calc32_local_affine_float(); // .x + query_letter = new_query_letter4.y; + shuffle_affine_penalty(); + } + + if (queryLength%4 >= 2) { + calc32_local_affine_float(); // .y + query_letter = new_query_letter4.z; + shuffle_affine_penalty(); + } + + if (queryLength%4 >= 3) { + calc32_local_affine_float(); // .z + } + + //group-wide max-reduce + for (int offset=group_size/2; offset>0; offset/=2){ + maximum = __hmax2(maximum,__shfl_down_sync(0xFFFFFFFF,maximum,offset,group_size)); + } + + if (!group_id) { + if (blid < gridDim.x-1) { + devAlignmentScores[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*(thid/group_size)]] = maximum.y; + devAlignmentScores[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*(thid/group_size)+1]] = maximum.x; + } else { + devAlignmentScores[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*((thid%check_last)/group_size)]] = maximum.y; + if (!check_last2 || (thid%check_last) < check_last-group_size) devAlignmentScores[d_positions_of_selected_lengths[2*(blockDim.x/group_size)*blid+2*((thid%check_last)/group_size)+1]] = maximum.x; + } + } +} + + +template +void call_GaplessFilter_row_wise_half2( + const char * const devChars, + ScoreOutputIterator const devAlignmentScores, + const size_t* const devOffsets, + const SequenceLengthT* const devLengths, + PositionsIterator const d_positions_of_selected_lengths, + const int numSelected, + const int queryLength, + cudaStream_t stream +){ + constexpr int groupsPerBlock = blocksize / group_size; + constexpr int alignmentsPerGroup = 2; + constexpr int alignmentsPerBlock = groupsPerBlock * alignmentsPerGroup; + + //int smem = sizeof(__half2) * hostBlosumDim * hostBlosumDim * hostBlosumDim; + int smem = 0; + auto kernel = GaplessFilter_row_wise_half2; + //cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); + + dim3 grid = (numSelected + alignmentsPerBlock - 1) / alignmentsPerBlock; + //std::cout << "call_GaplessFilter_row_wise_half2 gridsize " << grid.x << " blocksize " << blocksize << " groupsize " << group_size << " numRegs " << numRegs << "\n"; + + kernel<<>>( + devChars, + devAlignmentScores, + devOffsets, + devLengths, + d_positions_of_selected_lengths, + numSelected, + queryLength + ); +} + + + + + + +//################################################################################################################## +// MANY PASS FLOAT +//################################################################################################################## + + +template __global__ +void __launch_bounds__(32,16) GaplessFilter_float_many_pass( + __grid_constant__ const char * const devChars, + __grid_constant__ ScoreOutputIterator const devAlignmentScores, + __grid_constant__ float2* const devTempHcol2, + __grid_constant__ 
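// Float variant: one subject per 32-thread block, with scores kept in single
// precision, so results are not constrained by the limited range of 16-bit __half
// arithmetic.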
const size_t* const devOffsets, + __grid_constant__ const SequenceLengthT* const devLengths, + __grid_constant__ PositionsIterator const d_positions_of_selected_lengths, + __grid_constant__ const SequenceLengthT queryLength +) { + __builtin_assume(blockDim.x == 32); + + constexpr int group_size = 32; + + __shared__ float shared_BLOSUM62[21][21]; + int subject[numRegs]; + + const int blid = blockIdx.x; + const int thid = threadIdx.x; + const int group_id = thid%group_size; + //int offset = group_id + group_size; + + SequenceLengthT test_length = devLengths[d_positions_of_selected_lengths[blid]]; + const SequenceLengthT length = abs(test_length); + const size_t base = devOffsets[d_positions_of_selected_lengths[blid]]-devOffsets[0]; + + float2 H_temp_out{}, H_temp_in{}; + float penalty_temp0, penalty_temp1; + float penalty_here_array[numRegs]; + const float ZERO = 0; + float maximum = ZERO; + float penalty_diag = ZERO; + + for (int i=0; i= 0); + // assert(x < currentQueryLengthWithPadding); + // // if(x >= currentQueryLengthWithPadding){ + // // if(groupIdInBlock == 0){ + // // printf("error tid %d, x %d len %d, paddedlen %d, line %d\n", + // // threadIdx.x, x, queryLength, currentQueryLengthWithPadding, line); + // // } + // // } + }; + + auto init_local_score_profile = [&]() { + for (int i=thid; i<21*21; i+=32) shared_BLOSUM62[i/21][i%21]=cBLOSUM62_dev[i]; + __syncwarp(); + }; + + auto load_subject_regs = [&](const auto& offset_isc) { + for (int i=0; i= length) subject[i] = 20; + else subject[i] = devChars[offset_isc+base+numRegs*(thid%group_size)+i]; + } + }; + + + auto calc32_local_affine_float = [&](){ + const float* const sbt_row = shared_BLOSUM62[query_letter]; + + const float score_0 = sbt_row[subject[0]]; + penalty_temp0 = penalty_here_array[0]; + penalty_here_array[0] = max(penalty_diag + score_0,ZERO); + + const float score_1 = sbt_row[subject[1]]; + penalty_temp1 = penalty_here_array[1]; + penalty_here_array[1] = max(penalty_temp0 + score_1,ZERO); + + maximum = max(maximum, max(penalty_here_array[1],penalty_here_array[0])); + + #pragma unroll + for (int i=1; i= 1) { + calc32_local_affine_float(); + query_letter = new_query_letter4.y; + shuffle_affine_penalty(ZERO); + } + + if (queryLength%4 >= 2) { + calc32_local_affine_float(); + query_letter = new_query_letter4.z; + shuffle_affine_penalty(ZERO); + } + if (queryLength%4 >= 3) calc32_local_affine_float(); + + } + else { + + // first pass (of multiple passes) + for (int k = 0; k < queryLength-3; k+=4) { + + calc32_local_affine_float(); + if (k>0) shuffle_H_temp_out(); + set_H_temp_out_x(); + query_letter = new_query_letter4.y; + shuffle_affine_penalty(ZERO); + + calc32_local_affine_float(); + //shuffle_H_temp_out(); + set_H_temp_out_y(); + query_letter = new_query_letter4.z; + shuffle_affine_penalty(ZERO); + + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out_x(); + query_letter = new_query_letter4.w; + shuffle_affine_penalty(ZERO); + + calc32_local_affine_float(); + //shuffle_H_temp_out(); + set_H_temp_out_y(); + + //each k iteration computes 4 rows. 
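// (the float kernel packs the border H values of two consecutive rows into one
//  float2, which is why the temp column is flushed only every 64 rows)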
after 64 rows have been computed, store those 32 float2 values of right border to temp storage + if((k+4) % 64 == 0){ + checkHindex(offset_out, queryLength, __LINE__); + devTempHcol[offset_out]=H_temp_out; + offset_out += group_size; + } + new_query_letter4 = constantQuery4[(k/4)+1]; + query_letter = new_query_letter4.x; + shuffle_affine_penalty(ZERO); + } + + + if (queryLength%4 >= 1) { + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out_x(); + query_letter = new_query_letter4.y; + shuffle_affine_penalty(ZERO); + } + + if (queryLength%4 >= 2) { + calc32_local_affine_float(); + //shuffle_H_temp_out(); + set_H_temp_out_y(); + query_letter = new_query_letter4.z; + shuffle_affine_penalty(ZERO); + } + if (queryLength%4 >= 3) { + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out_x(); + } + if (queryLength%2 == 1) set_H_temp_out_y(); + + int final_out = queryLength % 64; + int from_thread_id = 32 - ((final_out+1)/2); + + if (thid>=from_thread_id) { + devTempHcol[offset_out-from_thread_id]=H_temp_out; + } + + + // Middle passes + + for (int pass = 1; pass < passes-1; pass++) { + + H_temp_out.x = -1000; H_temp_out.y = -1000; + new_query_letter4 = constantQuery4[0]; + query_letter = new_query_letter4.x; + + offset_out = group_id; + offset_in = group_id; + checkHindex(offset_in, queryLength, __LINE__); + H_temp_in = devTempHcol[offset_in]; + offset_in += group_size; + + penalty_diag = ZERO; + for (int i=0; i0) shuffle_H_temp_out(); + set_H_temp_out_x(); + query_letter = new_query_letter4.y; + shuffle_affine_penalty(H_temp_in.x); + //shuffle_H_temp_in(); + + calc32_local_affine_float(); + //shuffle_H_temp_out(); + set_H_temp_out_y(); + query_letter = new_query_letter4.z; + shuffle_affine_penalty(H_temp_in.y); + shuffle_H_temp_in(); + + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out_x(); + query_letter = new_query_letter4.w; + shuffle_affine_penalty(H_temp_in.x); + //shuffle_H_temp_in(); + + calc32_local_affine_float(); + //shuffle_H_temp_out(); + set_H_temp_out_y(); + shuffle_affine_penalty(H_temp_in.y); + shuffle_H_temp_in(); + + //each k iteration computes 4 rows. 
after 64 rows have been computed, store those 32 float2 values of right border to temp storage + //and load the temp values of the previous pass + if((k+4) % 64 == 0){ + checkHindex(offset_out, queryLength, __LINE__); + devTempHcol[offset_out]=H_temp_out; + offset_out += group_size; + + checkHindex(offset_in, queryLength, __LINE__); + H_temp_in = devTempHcol[offset_in]; + offset_in += group_size; + } + new_query_letter4 = constantQuery4[(k/4)+1]; + query_letter = new_query_letter4.x; + } + + if (queryLength%4 >= 1) { + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out_x(); + query_letter = new_query_letter4.y; + shuffle_affine_penalty(H_temp_in.x); + //shuffle_H_temp_in(); + } + + if (queryLength%4 >= 2) { + calc32_local_affine_float(); + //shuffle_H_temp_out(); + set_H_temp_out_y(); + query_letter = new_query_letter4.z; + shuffle_affine_penalty(H_temp_in.y); + shuffle_H_temp_in(); + } + if (queryLength%4 >= 3) { + calc32_local_affine_float(); + shuffle_H_temp_out(); + set_H_temp_out_x(); + } + if (queryLength%4 >= 3) set_H_temp_out_y(); + + int final_out = queryLength % 64; + int from_thread_id = 32 - ((final_out+1)/2); + + if (thid>=from_thread_id) { + devTempHcol[offset_out-from_thread_id]=H_temp_out; + } + } + + // Final pass + H_temp_out.x = -1000; H_temp_out.y = -1000; + new_query_letter4 = constantQuery4[0]; + query_letter = new_query_letter4.x; + + offset_in = group_id; + checkHindex(offset_in, queryLength, __LINE__); + H_temp_in = devTempHcol[offset_in]; + offset_in += group_size; + + penalty_diag = ZERO; + for (int i=0; i= 1) { + calc32_local_affine_float(); + query_letter = new_query_letter4.y; + shuffle_affine_penalty(H_temp_in.x); + //shuffle_H_temp_in(); + } + + if (queryLength%4 >= 2) { + calc32_local_affine_float(); + query_letter = new_query_letter4.z; + shuffle_affine_penalty(H_temp_in.y); + shuffle_H_temp_in(); + } + if (queryLength%4 >= 3) { + calc32_local_affine_float(); + } + + } + + //group-wide max-reduce + for (int offset=group_size/2; offset>0; offset/=2){ + maximum = max(maximum,__shfl_down_sync(0xFFFFFFFF,maximum,offset,group_size)); + } + if (!group_id){ + devAlignmentScores[d_positions_of_selected_lengths[blid]] = maximum; + } +} + + + + +template +void call_GaplessFilter_float_many_pass( + const char * const devChars, + ScoreOutputIterator const devAlignmentScores, + float2* const devTempHcol2, + const size_t* const devOffsets, + const SequenceLengthT* const devLengths, + PositionsIterator const d_positions_of_selected_lengths, + const int numSelected, + const int queryLength, + cudaStream_t stream +){ + dim3 block = 32; + dim3 grid = numSelected; + + //int smem = sizeof(__half2) * hostBlosumDim * hostBlosumDim * hostBlosumDim; + //int smem = 0; + auto kernel = GaplessFilter_float_many_pass; + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, 0); + + kernel<<>>( + devChars, + devAlignmentScores, + devTempHcol2, + devOffsets, + devLengths, + d_positions_of_selected_lengths, + //numSelected, + queryLength + ); +} + + + +#undef cBLOSUM62_dev + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/kseqpp/filereader.hpp b/lib/libmarv/src/kseqpp/filereader.hpp new file mode 100644 index 000000000..bc4eb126c --- /dev/null +++ b/lib/libmarv/src/kseqpp/filereader.hpp @@ -0,0 +1,330 @@ +#ifndef FILEREADER_HPP +#define FILEREADER_HPP + +#include "gziphelpers.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace kseqpp{ + + +class FileReader{ 
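// Abstract interface for the input back ends defined below: RawReader/AsyncRawReader
// read plain files, ZlibReader and GzReaderBase handle gzip-compressed input. Callers
// only use read(buffer, size), which returns the number of bytes actually produced.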
+public: + virtual ~FileReader() = default; + + int read(char* outputbuffer, int outputsize){ + return readImpl(outputbuffer, outputsize); + } +private: + virtual int readImpl(char* outputbuffer, int outputsize) = 0; +}; + +class RawReader : public FileReader{ +public: + RawReader(std::string filename_) + : filename(std::move(filename_)), + inputstream(filename){ + + if(!bool(inputstream)){ + throw std::runtime_error("Cannot open file " + filename); + } + } + +private: + int readImpl(char* outputbuffer, int outputsize) override{ + inputstream.read(outputbuffer, outputsize); + return inputstream.gcount(); + } + + std::string filename; + std::ifstream inputstream; +}; + +class AsyncRawReader : public FileReader{ +public: + AsyncRawReader(std::string filename_) + : + filename(std::move(filename_)), + inputstream(filename), + inputthread([&](){inputthreadfunc();}), + isRunning(true){ + + if(!bool(inputstream)){ + throw std::runtime_error("Cannot open file " + filename); + } + } + + ~AsyncRawReader(){ + cancel(); + inputthread.join(); + } + + AsyncRawReader(const AsyncRawReader&) = delete; + AsyncRawReader(AsyncRawReader&&) = delete; + AsyncRawReader& operator=(const AsyncRawReader&) = delete; + AsyncRawReader& operator=(AsyncRawReader&&) = delete; + +private: + + struct Buffer{ + static constexpr int bufferSize = 1024 * 64; + Buffer() : data(std::vector(bufferSize)){} + + int numBytes = 0; + int begin = 0; + std::vector data; + }; + + void cancel(){ + std::unique_lock ul(commMutex); + canContinue = false; + cv_producer.notify_one(); + } + + void inputthreadfunc(){ + Buffer tempbuffer; + + while(canContinue && bool(inputstream)){ + inputstream.read(tempbuffer.data.data(), tempbuffer.data.size()); + tempbuffer.numBytes = inputstream.gcount(); + tempbuffer.begin = 0; + + std::unique_lock ul(commMutex); + //std::cerr << "tfunc consumerNeedsNext " << consumerNeedsNext << "\n"; + if(!consumerNeedsNext){ + //std::cerr << "cv_producer.wait\n"; + cv_producer.wait(ul, [&](){return consumerNeedsNext || !canContinue;}); + } + if(!consumerNeedsNext){ + assert(!canContinue); + std::cerr << "!canContinue break\n"; + break; + } + std::swap(tempbuffer, buffer); + nextBufferIsReady = true; + consumerNeedsNext = false; + //std::cerr << "cv_consumer.notify_one\n"; + cv_consumer.notify_one(); + } + + //std::cerr << canContinue << " " << bool(inputstream) << "\n"; + + std::unique_lock ul(commMutex); + isRunning = false; + cv_consumer.notify_one(); + } + + int readImpl(char* outputbuffer, int outputsize) override{ + const int oldOutputsize = outputsize; + + while(outputsize > 0 && (buffer.numBytes > 0 || isRunning)){ + if(buffer.numBytes == 0 && isRunning){ + std::unique_lock ul(commMutex); + consumerNeedsNext = true; + //std::cerr << "set consumerNeedsNext = true\n"; + //std::cerr << "cv_producer.notify_one\n"; + cv_producer.notify_one(); + + if(!nextBufferIsReady){ + //std::cerr << "cv_consumer.wait\n"; + cv_consumer.wait(ul, [&](){return nextBufferIsReady || !isRunning;}); + } + if(!nextBufferIsReady){ + std::cerr << "!nextBufferIsReady\n"; + assert(!isRunning); + break; + } + nextBufferIsReady = false; + + assert(buffer.begin == 0); + //std::string s(buffer.data.begin() + buffer.begin, buffer.data.end()); + //std::cout << s ; + } + + const int bytesToCopy = std::min(outputsize, buffer.numBytes); + assert(buffer.begin + bytesToCopy <= int(buffer.data.size())); + std::copy_n(buffer.data.data() + buffer.begin, + bytesToCopy, + outputbuffer + oldOutputsize - outputsize); + outputsize -= bytesToCopy; + 
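// AsyncRawReader double-buffers: a background thread reads ahead into a spare Buffer,
// and readImpl() swaps it in under commMutex once the current buffer is drained, so
// file I/O overlaps with the caller's parsing work.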
buffer.numBytes -= bytesToCopy; + buffer.begin += bytesToCopy; + } + + if(isRunning){ + assert(oldOutputsize - outputsize > 0); + } + + return oldOutputsize - outputsize; + } + + std::string filename; + std::ifstream inputstream; + std::thread inputthread; + bool isRunning = false; + bool canContinue = true; + bool nextBufferIsReady = false; + bool consumerNeedsNext = false; + Buffer buffer; + std::mutex commMutex; + std::condition_variable cv_producer; + std::condition_variable cv_consumer; +}; + +class ZlibReader : public FileReader{ +public: + ZlibReader(std::string filename_) + : filename(std::move(filename_)){ + + fh = gzopen(filename.c_str(), "r"); + + if(fh == NULL){ + throw std::runtime_error("Cannot open file " + filename); + } + + } + + ~ZlibReader(){ + gzclose(fh); + } +private: + int readImpl(char* outputbuffer, int outputsize) override{ + return gzread(fh, outputbuffer, outputsize); + } + + std::string filename; + gzFile fh; +}; + +template +class GzReaderBase : public FileReader{ +public: + GzReaderBase(std::string filename_) + : + filename(std::move(filename_)), + rawReader(filename), + compressedBuffer(Buffer(compressedBufferSize)), + decompressedBuffer(Buffer(decompressedBufferSize)){ + + zstream.zalloc = Z_NULL; + zstream.zfree = Z_NULL; + zstream.opaque = Z_NULL; + zstream.avail_in = 0; + zstream.next_in = Z_NULL; + int initstatus = inflateInit2(&zstream, 16+MAX_WBITS); + if (initstatus != Z_OK){ + std::cerr << "Error, inflateInit2 returned " << initstatus << '\n'; + } + assert(initstatus == Z_OK); + } + + GzReaderBase(const GzReaderBase&) = delete; + GzReaderBase(GzReaderBase&&) = delete; + GzReaderBase& operator=(const GzReaderBase&) = delete; + GzReaderBase& operator=(GzReaderBase&&) = delete; + +private: + + static constexpr int compressedBufferSize = 1024 * 64; + static constexpr int decompressedBufferSize = 6 * compressedBufferSize; + + struct Buffer{ + Buffer() = default; + Buffer(int size){ + data.resize(size); + } + + int numBytes = 0; + int begin = 0; + std::vector data; + }; + + int readImpl(char* outputbuffer, int outputsize) override{ + const int oldOutputsize = outputsize; + + auto iszstreamerror = [&](){ + return decompResult.statuscode < 0; + }; + + auto iszstreamend = [&](){ + return decompResult.statuscode == Z_STREAM_END; + }; + + //can serve request if no error occured, if uncompressed data is available, or if there is compressed data to uncompress + while(outputsize > 0 && !iszstreamerror() && !(iszstreamend() && decompressedBuffer.numBytes == 0)){ + + //if no uncompressed data is available, try to decompress some + if(decompressedBuffer.numBytes == 0 && !iszstreamerror() && !iszstreamend()){ + + if(decompResult.mustContinue){ + //current compressedBuffer is not fully processed yet, continue decompressing the data. 
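// (mustContinue is set by the gzip helpers when inflate() filled the entire output
//  buffer, i.e. zs.avail_out reached 0, meaning more data can still be produced from
//  the same compressed chunk before the next file read)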
+ decompResult = continueDecompressInMemory(zstream, + decompressedBuffer.data.data(), + decompressedBuffer.data.size()); + }else{ + //current compressedBuffer is fully processed, read next chunk from file, then start decompression + //TIMERSTARTCPU(fileread); + compressedBuffer.numBytes = rawReader.read(reinterpret_cast(compressedBuffer.data.data()), + compressedBuffer.data.size()); + //TIMERSTOPCPU(fileread); + //TIMERSTARTCPU(decomp); + if(compressedBuffer.numBytes == 0){ + decompResult.statuscode = Z_STREAM_END; + decompResult.writtenBytes = 0; + }else{ + decompResult = decompressInMemory(zstream, compressedBuffer.data.data(), + compressedBuffer.numBytes, + decompressedBuffer.data.data(), + decompressedBuffer.data.size()); + } + //TIMERSTOPCPU(decomp); + } + if(decompResult.statuscode < 0){ + return decompResult.statuscode; + } + decompressedBuffer.numBytes = decompResult.writtenBytes; + decompressedBuffer.begin = 0; + } + + const int bytesToCopy = std::min(outputsize, decompressedBuffer.numBytes); + std::copy_n(decompressedBuffer.data.data() + decompressedBuffer.begin, + bytesToCopy, + outputbuffer + oldOutputsize - outputsize); + outputsize -= bytesToCopy; + decompressedBuffer.numBytes -= bytesToCopy; + decompressedBuffer.begin += bytesToCopy; + } + + return oldOutputsize - outputsize; + } + + std::string filename; + RawReader_t rawReader; + z_stream zstream; + + Buffer compressedBuffer; + Buffer decompressedBuffer; + DecompressResult decompResult; +}; + + +using GzReader = GzReaderBase; +using AsyncGzReader = GzReaderBase; + + +} // namespace kseqpp + + + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/kseqpp/gziphelpers.hpp b/lib/libmarv/src/kseqpp/gziphelpers.hpp new file mode 100644 index 000000000..33897ff23 --- /dev/null +++ b/lib/libmarv/src/kseqpp/gziphelpers.hpp @@ -0,0 +1,79 @@ +#ifndef GZIP_HELPERS_HPP +#define GZIP_HELPERS_HPP + +#include +#include +#include +#include + + +namespace kseqpp{ + + inline + bool hasGzipHeader(const std::string& filename){ + std::ifstream is(filename, std::ios_base::binary); + if(!bool(is)){ + throw std::runtime_error("Cannot open file " + filename); + } + unsigned char buf[2]; + is.read(reinterpret_cast(&buf[0]), 2); + + if(buf[0] == 0x1f && buf[1] == 0x8b){ + return true; + }else{ + return false; + } + } + + struct DecompressResult{ + bool mustContinue = false; + int writtenBytes = 0; + int statuscode = 0; + }; + + inline + DecompressResult decompressInMemoryCore(z_stream& zs, unsigned char* output, int outputsize){ + DecompressResult result; + + zs.avail_out = outputsize; + zs.next_out = output; + result.statuscode = inflate(&zs, Z_NO_FLUSH); + assert(result.statuscode != Z_STREAM_ERROR); + if(result.statuscode < 0){ + return result; + } + + result.writtenBytes = outputsize - zs.avail_out; + result.mustContinue = (zs.avail_out == 0); + + return result; + } + + inline + DecompressResult decompressInMemory(z_stream& zs, unsigned char* input, int inputsize, unsigned char* output, int outputsize){ + zs.avail_in = inputsize; + zs.next_in = input; + return decompressInMemoryCore(zs, output, outputsize); + } + + inline + DecompressResult continueDecompressInMemory(z_stream& zs, unsigned char* output, int outputsize){ + return decompressInMemoryCore(zs, output, outputsize); + } + + inline + DecompressResult decompressInMemory(z_stream& zs, char* input, int inputsize, char* output, int outputsize){ + return decompressInMemory(zs, + reinterpret_cast(input), inputsize, + reinterpret_cast(output), outputsize); + } + + inline + 
DecompressResult continueDecompressInMemory(z_stream& zs, char* output, int outputsize){ + return continueDecompressInMemory(zs, + reinterpret_cast(output), outputsize); + } + +} // namespace kseqpp + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/kseqpp/kseqpp.hpp b/lib/libmarv/src/kseqpp/kseqpp.hpp new file mode 100644 index 000000000..df8898c4b --- /dev/null +++ b/lib/libmarv/src/kseqpp/kseqpp.hpp @@ -0,0 +1,536 @@ +#ifndef kseq_pp_h +#define kseq_pp_h + +#include "gziphelpers.hpp" +#include "filereader.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace kseqpp{ + +#ifdef KSEQPP_ASYNC_READER + using RawReader_t = AsyncRawReader; +#else + using RawReader_t = RawReader; +#endif + + +#ifdef KSEQPP_ASYNC_READER + using CompressedReader_t = AsyncGzReader; +#else + using CompressedReader_t = ZlibReader; +#endif + +/* + The following code for parsing sequence files is adapted from klib/kseq.h which is available under MIT license +*/ + +struct KseqPP{ + +public: + + + KseqPP() = default; + + KseqPP(const std::string& filename) + : f(std::make_unique(filename)){ + // std::cerr << "KseqPP(" << filename << ")\n"; + header.reserve(256); + seq.reserve(256); + qual.reserve(256); + } + + int next(){ + int c = 0; + int r = 0; + + if (last_char == 0) { /* then jump to the next header line */ + while ((c = f->ks_getc()) >= 0 && c != '>' && c != '@'){ + ; + } + if (c < 0){ + f->cancel(); + return c; /* end of file or error*/ + } + last_char = c; + } /* else: the first header char has been read in the previous call */ + + seq.clear(); + qual.clear(); + + if ((r=f->ks_getline(header, &c, 0)) < 0){ + f->cancel(); + return r; /* normal exit: EOF or error */ + } + + while ((c = f->ks_getc()) >= 0 && c != '>' && c != '+' && c != '@') { + if (c == '\n'){ + continue; /* skip empty lines */ + } + seq.push_back(c); + f->ks_getline(seq, 0, 1); /* read the rest of the line */ + } + + if (c == '>' || c == '@'){ + last_char = c; /* the first header char has been read */ + } + if (c != '+'){ + return seq.length(); /* FASTA */ + } + if(qual.capacity() < seq.capacity()){ + qual.reserve(seq.capacity()); /* allocate memory for qual in case insufficient */ + } + + while ((c = f->ks_getc()) >= 0 && c != '\n'){ + ; /* skip the rest of '+' line */ + } + + if (c == -1){ + f->cancel(); + return -2; /* error: no quality string */ + } + + while ((c = f->ks_getline(qual, 0, 1) >= 0 && qual.length() < seq.length())){ + ; + } + if (c == -3){ + f->cancel(); + return -3; /* stream error */ + } + last_char = 0; /* we have not come to the next header line */ + if(seq.length() != qual.length()){ + std::cerr << "got seq " << seq << "\n got qual " << qual << "\n"; + f->cancel(); + return -2; /* error: qual string is of a different length */ + } + + return seq.length(); + } + + void kseq_rewind(){ + last_char = 0; + f->is_eof = 0; + f->begin = 0; + f->end = 0; + } + + const std::string& getCurrentHeader() const{ + return header; + } + + const std::string& getCurrentSequence() const{ + return seq; + } + + const std::string& getCurrentQuality() const{ + return qual; + } + + std::string& getCurrentHeader(){ + return header; + } + + std::string& getCurrentSequence(){ + return seq; + } + + std::string& getCurrentQuality(){ + return qual; + } + +private: + + struct kstream_t { + static constexpr std::size_t bufsize = 16384; + + static constexpr int KS_SEP_SPACE = 0; + static constexpr int KS_SEP_TAB = 1; + static constexpr int KS_SEP_LINE = 2; + static 
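// kstream_t keeps a 16 KiB window of the file in buf; begin/end delimit the unread
// part, and ks_getc()/ks_getline() refill the window through the FileReader on demand.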
constexpr int KS_SEP_MAX = 2; + + bool running = false; + bool canContinue = true; + int begin; + int end; + int is_eof; + std::vector buf; + std::unique_ptr filereader; + + kstream_t() = default; + + kstream_t(const std::string& filename) : begin(0), end(0), is_eof(0){ + + if(hasGzipHeader(filename)){ + //std::cerr << "assume gz file\n"; + filereader.reset(new CompressedReader_t(filename)); + }else{ + //std::cerr << "assume raw file\n"; + filereader.reset(new RawReader_t(filename)); + } + + buf.resize(bufsize); + } + + kstream_t(kstream_t&& rhs){ + *this = std::move(rhs); + } + + kstream_t& operator=(kstream_t&& rhs){ + buf = std::move(rhs.buf); + begin = std::move(rhs.begin); + end = std::move(rhs.end); + is_eof = std::move(rhs.is_eof); + canContinue = std::move(rhs.canContinue); + filereader = std::move(rhs.filereader); + + return *this; + } + + void cancel(){} + + bool ks_err() const{ + return end == -1; + } + + bool ks_eof() const{ + return is_eof && begin >= end; + } + + void ks_rewind(){ + is_eof = 0; + begin = 0; + end = 0; + } + + int fillBuffer(){ + return filereader->read(buf.data(), bufsize); + } + + int ks_getc(){ + if (ks_err()) return -3; + if (ks_eof()) return -1; + if (begin >= end) { + begin = 0; + end = fillBuffer(); + if (end == 0){ + is_eof = 1; + return -1; + } + if (end == -1){ + is_eof = 1; + return -3; + } + } + return (int)buf[begin++]; + } + + int ks_getline(std::string& str, int* dret, int append){ + + int gotany = 0; + if (dret) *dret = 0; + + if(!append){ + str.clear(); + } + for (;;) { + + if (ks_err()) return -3; + if (begin >= end) { + if (!is_eof) { + begin = 0; + end = fillBuffer(); + if (end == 0) { + is_eof = 1; + break; + }else if (end == -1){ + is_eof = 1; + return -3; + } + + }else{ + break; + } + } + + const unsigned char* const sep = (unsigned char*)memchr(buf.data() + begin, '\n', end - begin); + // if seperator was not found, set lineEnd to buffer end + const int i = (sep != 0) ? 
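// i is the index of the newline found by memchr in the unconsumed window, or the
// window end if none was found; the loop appends [begin, i) to str and refills the
// buffer until a newline or EOF is reached, finally stripping a trailing '\r'.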
(sep - (unsigned char*)buf.data()) : end; + + gotany = 1; + + str.append(buf.data() + begin, i - begin); + + begin = i + 1; + if (i < end) { + if (dret){ + *dret = buf[i]; + } + break; + } + } + if (!gotany && ks_eof()){ + return -1; + } + if(str.length() > 1 && str.back() == '\r'){ + str.pop_back(); + } + + return str.length(); + } + + }; + + struct asynckstream_t { + static constexpr std::size_t bufsize = 16384; + + static constexpr int KS_SEP_SPACE = 0; + static constexpr int KS_SEP_TAB = 1; + static constexpr int KS_SEP_LINE = 2; + static constexpr int KS_SEP_MAX = 2; + + struct ThreadSyncData{ + std::mutex m; + std::condition_variable cv_producer; + std::condition_variable cv_consumer; + }; + + bool running = false; + bool canContinue = true; + bool tempBufferFilled = false; + int tempReadBytes; + int begin; + int end; + int is_eof; + std::vector buf; + std::vector tempbuf; + std::unique_ptr filereader; + std::unique_ptr threadSyncData = std::make_unique(); + std::thread fillerThread{}; + + asynckstream_t() = default; + + asynckstream_t(const std::string& filename) : begin(0), end(0), is_eof(0){ + + if(hasGzipHeader(filename)){ + //std::cerr << "assume gz file\n"; + filereader.reset(new CompressedReader_t(filename)); + }else{ + //std::cerr << "assume raw file\n"; + filereader.reset(new RawReader_t(filename)); + } + + buf.resize(bufsize); + tempbuf.resize(bufsize); + + fillerThread = std::move(std::thread([&](){fillerthreadfunc();})); + + running = true; + } + + ~asynckstream_t(){ + if(running){ + cancel(); + fillerThread.join(); + } + } + + asynckstream_t(const asynckstream_t&) = delete; + asynckstream_t(asynckstream_t&&) = delete; + asynckstream_t& operator=(const asynckstream_t&) = delete; + asynckstream_t& operator=(asynckstream_t&&) = delete; + + void fillerthreadfunc(){ + //std::cerr << "launched thread\n"; + std::vector threadBuffer; + threadBuffer.resize(bufsize); + + int n = 0; + do{ + n = filereader->read(threadBuffer.data(), bufsize); + + std::unique_lock ul(threadSyncData->m); + if(!canContinue){ + break; + } + if(tempBufferFilled){ + //std::cerr << "filereaderThread: temp buffer still filled\n"; + threadSyncData->cv_producer.wait(ul, [&](){return !tempBufferFilled || !canContinue;}); + } + + tempBufferFilled = true; + if(canContinue){ + std::swap(threadBuffer, tempbuf); + tempReadBytes = n; + //std::cerr << "filereaderThread: temp buffer filled\n"; + }else{ + tempReadBytes = 0; + } + + threadSyncData->cv_consumer.notify_one(); + }while(n > 0 && canContinue); + + std::unique_lock ul(threadSyncData->m); + tempBufferFilled = true; + tempReadBytes = 0; + threadSyncData->cv_consumer.notify_one(); + + //std::cerr << "finished thread\n"; + } + + void cancel(){ + std::unique_lock ul(threadSyncData->m); + canContinue = false; + threadSyncData->cv_producer.notify_one(); + } + + bool ks_err() const{ + return end == -1; + } + + bool ks_eof() const{ + return is_eof && begin >= end; + } + + void ks_rewind(){ + is_eof = 0; + begin = 0; + end = 0; + } + + int fillBuffer(){ + std::unique_lock ul(threadSyncData->m); + if(!tempBufferFilled){ + //std::cerr << "main thread: temp buffer still not filled\n"; + threadSyncData->cv_consumer.wait(ul, [&](){return tempBufferFilled;}); + } + + std::swap(buf, tempbuf); + int numRead = tempReadBytes; + tempBufferFilled = false; + //std::cerr << "main thread: temp buffer not filled\n"; + threadSyncData->cv_producer.notify_one(); + + return numRead; + } + + int ks_getline(std::string& str, int* dret, int append){ + + int gotany = 0; + if (dret) *dret = 0; 
+ + if(!append){ + str.clear(); + } + for (;;) { + + if (ks_err()) return -3; + if (begin >= end) { + if (!is_eof) { + begin = 0; + end = fillBuffer(); + if (end == 0) { + is_eof = 1; + break; + }else if (end == -1){ + is_eof = 1; + return -3; + } + + }else{ + break; + } + } + + const unsigned char* const sep = (unsigned char*)memchr(buf.data() + begin, '\n', end - begin); + // if seperator was not found, set lineEnd to buffer end + const int i = (sep != 0) ? (sep - (unsigned char*)buf.data()) : end; + + gotany = 1; + + str.append(buf.data() + begin, i - begin); + + begin = i + 1; + if (i < end) { + if (dret){ + *dret = buf[i]; + } + break; + } + } + if (!gotany && ks_eof()){ + return -1; + } + if(str.length() > 1 && str.back() == '\r'){ + str.pop_back(); + } + + return str.length(); + } + + int ks_getc(){ + if (ks_err()) return -3; + if (ks_eof()) return -1; + if (begin >= end) { + begin = 0; + end = fillBuffer(); + if (end == 0){ + is_eof = 1; + return -1; + } + if (end == -1){ + is_eof = 1; + return -3; + } + } + return (int)buf[begin++]; + } + + }; + +#ifdef KSEQPP_ASYNC_PARSER + using Stream = asynckstream_t; +#else + using Stream = kstream_t; +#endif + + std::string header{}; + std::string seq{}; + std::string qual{}; + + int last_char{}; + std::unique_ptr f{}; + +}; + + + + + +} // namespace kseqpp + + + + + + + + + + + + + + + +#endif diff --git a/lib/libmarv/src/length_partitions.hpp b/lib/libmarv/src/length_partitions.hpp new file mode 100644 index 000000000..ad0a297ea --- /dev/null +++ b/lib/libmarv/src/length_partitions.hpp @@ -0,0 +1,62 @@ +#ifndef LENGTH_PARTITIONS_HPP +#define LENGTH_PARTITIONS_HPP + +#include "config.hpp" + +#include +#include + +namespace cudasw4{ + +//length k is in partition i if boundaries[i-1] < k <= boundaries[i] + +constexpr auto getLengthPartitionBoundaries(){ + + constexpr int numLengthPartitions = 36; + std::array boundaries{ + 48, + 64, + 80, + 96, + 112, + 128, + 144, + 160, + 176, + 192, + 208, + 224, + 240, + 256, + 288, + 320, + 352, + 384, + 416, + 448, + 480, + 512, + 576, + 640, + 704, + 768, + 832, + 896, + 960, + 1024, + 1088, + 1152, + 1216, + 1280, + 8000, + std::numeric_limits::max()-1 + }; + + + return boundaries; +} + + +} //namespace cudasw4 + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/main.cu b/lib/libmarv/src/main.cu new file mode 100644 index 000000000..bdfd7a9db --- /dev/null +++ b/lib/libmarv/src/main.cu @@ -0,0 +1,485 @@ + + + +#include +#include +#include +#include +#include + +#include "hpc_helpers/all_helpers.cuh" +#include "hpc_helpers/peer_access.cuh" + +#include "kseqpp/kseqpp.hpp" +#include "sequence_io.h" +#include "options.hpp" +#include "dbdata.hpp" +#include "cudasw4.cuh" +#include "config.hpp" +#include "target_subject_ids.cuh" + +// #include "benchmarking.cuh" + +std::vector split(const std::string& str, char c){ + std::vector result; + + std::stringstream ss(str); + std::string s; + + while (std::getline(ss, s, c)) { + result.emplace_back(s); + } + + return result; +} + +void printScanResultPlain(std::ostream& os, const cudasw4::ScanResult& scanResult, const cudasw4::CudaSW4& cudaSW4){ + const int n = scanResult.scores.size(); + for(int i = 0; i < n; i++){ + const auto referenceId = scanResult.referenceIds[i]; + os << "Result " << i << "."; + os << " Score: " << scanResult.scores[i] << "."; + os << " Length: " << cudaSW4.getReferenceLength(referenceId) << "."; + os << " Header " << cudaSW4.getReferenceHeader(referenceId) << "."; + os << " referenceId " << referenceId << "."; + os << " 
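[Editor's note] getLengthPartitionBoundaries() above defines 36 increasing limits with the stated convention that length k belongs to partition i when boundaries[i-1] < k <= boundaries[i]. Under that convention the partition index of a length can be found with std::lower_bound; this helper is a sketch and not part of the patch (int stands in for the sequence length type):

#include <algorithm>

// Returns i such that boundaries[i-1] < length <= boundaries[i].
template <class Boundaries>
int lengthPartitionIndex(const Boundaries& boundaries, int length) {
    // first boundary that is >= length
    auto it = std::lower_bound(boundaries.begin(), boundaries.end(), length);
    return int(it - boundaries.begin());
}

// Example with the boundaries above: length 48 -> partition 0,
// length 49 -> partition 1, length 2000 -> the partition bounded by 8000.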
Alignment_end_query " << scanResult.endPositions[i].getQueryEndInclusive() << "."; + os << " Alignment_end_ref " << scanResult.endPositions[i].getSubjectEndInclusive(); + os << "\n"; + //std::cout << " Sequence " << cudaSW4.getReferenceSequence(referenceId) << "\n"; + + } +} + +void printTSVHeader(std::ostream& os){ + constexpr char sep = '\t'; + + os << "Query number" << sep + << "Query length" << sep + << "Query header" << sep + << "Result number" << sep + << "Result score" << sep + << "Reference length" << sep + << "Reference header" << sep + << "Reference ID in DB" << sep + << "Alignment_end_query" << sep + << "Alignment_end_ref" << sep + << "\n"; +} + +void printScanResultTSV( + std::ostream& os, + const cudasw4::ScanResult& scanResult, + const cudasw4::CudaSW4& cudaSW4, + int64_t queryId, + cudasw4::SequenceLengthT queryLength, + std::string_view queryHeader +){ + constexpr char sep = '\t'; + + const int n = scanResult.scores.size(); + for(int i = 0; i < n; i++){ + const auto referenceId = scanResult.referenceIds[i]; + + os << queryId << sep + << queryLength << sep + << queryHeader << sep + << i << sep + << scanResult.scores[i] << sep + << cudaSW4.getReferenceLength(referenceId) << sep + << cudaSW4.getReferenceHeader(referenceId) << sep + << referenceId << sep + << scanResult.endPositions[i].getQueryEndInclusive() << sep + << scanResult.endPositions[i].getSubjectEndInclusive() + << "\n"; + + //std::cout << " Sequence " << cudaSW4.getReferenceSequence(referenceId) << "\n"; + } +} + +struct BatchOfQueries{ + std::vector chars; + std::vector offsets; + std::vector lengths; + std::vector headers; +}; + + + +int main(int argc, char* argv[]) +{ + ProgramOptions options; + bool parseSuccess = parseArgs(argc, argv, options); + + if(!parseSuccess || options.help){ + printHelp(argc, argv); + return 0; + } + + // peakbenchmarkAllSingleTileConfigs(argc, argv); + + // peakBenchmark(argc, argv); + // gridsearchPseudo(argc, argv); + //gridsearchReal(argc, argv); + + // lengthbenchmarkReal(argc, argv, 32, 4096, 32); + // lengthbenchmarkReal(argc, argv, 4096+1024, 16384, 1024); + // lengthbenchmarkReal(argc, argv, 16384+16384, 65536, 16384); + // return 0; + + // gridsearchPseudo_SW(argc, argv); + // return 0; + + + + + printOptions(options); + + std::vector deviceIds; + { + int num = 0; + cudaGetDeviceCount(&num); CUERR + for(int i = 0; i < num; i++){ + deviceIds.push_back(i); + } + if(deviceIds.size() > 0){ + if(options.verbose){ + std::cout << "Will use GPU"; + for(auto x : deviceIds){ + std::cout << " " << x; + } + std::cout << "\n"; + } + }else{ + throw std::runtime_error("No GPU found"); + } + } + + helpers::PeerAccess peerAccess(deviceIds, false); + peerAccess.enableAllPeerAccesses(); + + using MemoryConfig = cudasw4::MemoryConfig; + using ScanResult = cudasw4::ScanResult; + using ScanType = cudasw4::ScanType; + + MemoryConfig memoryConfig; + memoryConfig.maxBatchBytes = options.maxBatchBytes; + memoryConfig.maxBatchSequences = options.maxBatchSequences; + memoryConfig.maxTempBytes = options.maxTempBytes; + memoryConfig.maxGpuMem = options.maxGpuMem; + + //ScanType scanType = ScanType::Gapless; + //ScanType scanType = ScanType::SW_Endpos; + + std::shared_ptr targetSubjectIds; + if(options.subjectIdsFilename.has_value()){ + targetSubjectIds = std::make_shared(options.subjectIdsFilename.value()); + // for(auto x : targetSubjectIds->subjectIds){ + // std::cout << x << ", "; + // } + // std::cout << "\n"; + } + + + std::ofstream outputfile(options.outputfile); + if(!bool(outputfile)){ + throw 
std::runtime_error("Cannot open file " + options.outputfile); + } + if(options.outputMode == ProgramOptions::OutputMode::TSV){ + printTSVHeader(outputfile); + } + + int numTopOutputs = options.numTopOutputs; + if(targetSubjectIds){ + numTopOutputs = targetSubjectIds->subjectIds.size(); + } + numTopOutputs = std::min(numTopOutputs, cudasw4::MaxNumberOfResults::value()); + // std::cout << "Will output up to " << numTopOutputs << " results\n"; + + KernelConfigFilenames kernelConfigFilenames; + kernelConfigFilenames.gapless = options.kernelConfigsFile_gapless; + kernelConfigFilenames.sw = options.kernelConfigsFile_sw; + + cudasw4::CudaSW4 cudaSW4( + deviceIds, + numTopOutputs, + options.blosumType, + memoryConfig, + options.verbose, + kernelConfigFilenames + ); + + cudaSW4.setScanType(options.scanType); + if(targetSubjectIds){ + cudaSW4.setTargetSubjectIds(targetSubjectIds); + } + + if(!options.usePseudoDB){ + if(options.verbose){ + std::cout << "Reading Database: \n"; + } + try{ + helpers::CpuTimer timer_read_db("Read DB"); + constexpr bool writeAccess = false; + const bool prefetchSeq = options.prefetchDBFile; + auto fullDB_tmp = std::make_shared(cudasw4::loadDB(options.dbPrefix, writeAccess, prefetchSeq)); + if(options.verbose){ + timer_read_db.print(); + } + + cudaSW4.setDatabase(fullDB_tmp); + }catch(cudasw4::LoadDBException& ex){ + if(options.verbose){ + std::cout << "Failed to map db files. Using fallback db. Error message: " << ex.what() << "\n"; + } + helpers::CpuTimer timer_read_db("Read DB"); + auto fullDB_tmp = std::make_shared(cudasw4::loadDBWithVectors(options.dbPrefix)); + if(options.verbose){ + timer_read_db.print(); + } + + cudaSW4.setDatabase(fullDB_tmp); + } + }else{ + if(options.verbose){ + std::cout << "Generating pseudo db\n"; + } + helpers::CpuTimer timer_read_db("Generate DB"); + auto fullDB_tmp = std::make_shared(cudasw4::loadPseudoDB( + options.pseudoDBSize, + options.pseudoDBLength, + options.pseudoDBSameSequence + )); + if(options.verbose){ + timer_read_db.print(); + } + + cudaSW4.setDatabase(fullDB_tmp); + } + + if(options.verbose){ + cudaSW4.printDBInfo(); + if(options.printLengthPartitions){ + cudaSW4.printDBLengthPartitions(); + } + } + + if(options.loadFullDBToGpu){ + cudaSW4.prefetchDBToGpus(); + } + + if(!options.interactive){ + + for(const auto& queryFile : options.queryFiles){ + std::cout << "Processing query file " << queryFile << "\n"; + // 0 load all queries into memory, then process. + // 1 load and process queries one after another + #if 1 + kseqpp::KseqPP reader(queryFile); + int64_t query_num = 0; + + cudaSW4.totalTimerStart(); + + while(reader.next() >= 0){ + std::cout << "Processing query " << query_num << " ... "; + std::cout.flush(); + const std::string& header = reader.getCurrentHeader(); + const std::string& sequence = reader.getCurrentSequence(); + + cudasw4::DecodedQueryView queryView(sequence.data(), sequence.size()); + + ScanResult scanResult = cudaSW4.scan(queryView, std::nullopt); + if(options.verbose){ + std::cout << "Done. 
Scan time: " << scanResult.stats.seconds << " s, " << scanResult.stats.gcups << " GCUPS\n"; + }else{ + std::cout << "Done.\n"; + } + + if(numTopOutputs > 0){ + if(options.outputMode == ProgramOptions::OutputMode::Plain){ + outputfile << "Query " << query_num << ", header" << header + << ", length " << sequence.size() + << ", num overflows " << scanResult.stats.numOverflows << "\n"; + + printScanResultPlain(outputfile, scanResult, cudaSW4); + }else{ + printScanResultTSV(outputfile, scanResult, cudaSW4, query_num, sequence.size(), header); + } + outputfile.flush(); + } + + query_num++; + } + + auto totalBenchmarkStats = cudaSW4.totalTimerStop(); + if(options.verbose){ + std::cout << "Total time: " << totalBenchmarkStats.seconds << " s, " << totalBenchmarkStats.gcups << " GCUPS\n"; + } + + #else + + BatchOfQueries batchOfQueries; + { + + constexpr int ALIGN = 4; + kseqpp::KseqPP reader(queryFile); + batchOfQueries.offsets.push_back(0); + while(reader.next() >= 0){ + const std::string& header = reader.getCurrentHeader(); + const std::string& sequence = reader.getCurrentSequence(); + //we ignore quality + //const std::string& quality = reader.getCurrentQuality(); + + batchOfQueries.chars.insert(batchOfQueries.chars.end(), sequence.begin(), sequence.end()); + //padding + if(batchOfQueries.chars.size() % ALIGN != 0){ + batchOfQueries.chars.insert(batchOfQueries.chars.end(), ALIGN - batchOfQueries.chars.size() % ALIGN, ' '); + } + + batchOfQueries.offsets.push_back(batchOfQueries.chars.size()); + batchOfQueries.lengths.push_back(sequence.size()); + batchOfQueries.headers.push_back(header); + } + } + + int64_t numQueries = batchOfQueries.lengths.size(); + const char* maxNumQueriesString = std::getenv("ALIGNER_MAX_NUM_QUERIES"); + if(maxNumQueriesString != nullptr){ + int64_t maxNumQueries = std::atoi(maxNumQueriesString); + numQueries = std::min(numQueries, maxNumQueries); + } + + std::vector scanResults(numQueries); + + cudaSW4.totalTimerStart(); + + for(int64_t query_num = 0; query_num < numQueries; ++query_num) { + std::cout << "Processing query " << query_num << " ... "; + std::cout.flush(); + const size_t offset = batchOfQueries.offsets[query_num]; + const cudasw4::SequenceLengthT length = batchOfQueries.lengths[query_num]; + const char* sequence = batchOfQueries.chars.data() + offset; + cudasw4::DecodedQueryView queryView(sequence, length); + ScanResult scanResult = cudaSW4.scan(queryView, std::nullopt); + scanResults[query_num] = scanResult; + if(options.verbose){ + std::cout << "Done. 
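[Editor's note] Both query-loading paths above pad the concatenated character buffer to a multiple of ALIGN = 4, the same rule makedb applies per sequence. A tiny sketch of the padding arithmetic (illustrative only):

#include <cstddef>

// Number of pad characters needed to reach the next multiple of ALIGN.
constexpr std::size_t paddingFor(std::size_t length, std::size_t ALIGN = 4) {
    return (length % ALIGN == 0) ? 0 : ALIGN - (length % ALIGN);
}
// paddingFor(6) == 2  -> padded length 8
// paddingFor(8) == 0  -> padded length 8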
Scan time: " << scanResult.stats.seconds << " s, " << scanResult.stats.gcups << " GCUPS\n"; + }else{ + std::cout << "Done.\n"; + } + } + + auto totalBenchmarkStats = cudaSW4.totalTimerStop(); + + if(options.verbose){ + std::cout << "Total time: " << totalBenchmarkStats.seconds << " s, " << totalBenchmarkStats.gcups << " GCUPS\n"; + } + if(numTopOutputs > 0){ + for(int64_t query_num = 0; query_num < numQueries; ++query_num) { + const ScanResult& scanResult = scanResults[query_num]; + + if(options.outputMode == ProgramOptions::OutputMode::Plain){ + outputfile << "Query " << query_num << ", header" << batchOfQueries.headers[query_num] + << ", length " << batchOfQueries.lengths[query_num] + << ", num overflows " << scanResult.stats.numOverflows << "\n"; + printScanResultPlain(outputfile, scanResult, cudaSW4); + }else{ + printScanResultTSV(outputfile, scanResult, cudaSW4, query_num, batchOfQueries.lengths[query_num], batchOfQueries.headers[query_num]); + } + } + } + #endif + + } + }else{ + std::cout << "Interactive mode ready\n"; + std::cout << "Use 's inputsequence' to query inputsequence against the database. Press ENTER twice to begin.\n"; + std::cout << "Use 'f inputfile' to query all sequences in inputfile\n"; + std::cout << "Use 'exit' to terminate\n"; + std::cout << "Waiting for command...\n"; + + std::string line; + while(std::getline(std::cin, line)){ + auto tokens = split(line, ' '); + if(tokens.size() == 0) continue; + + const auto& command = tokens[0]; + if(command == "exit"){ + break; + }else if(command == "s"){ + if(tokens.size() > 1){ + auto& sequence = tokens[1]; + + //read the remaining lines to catch multi-line sequence input (for example copy&paste fasta sequence) + while(std::getline(std::cin, line)){ + if(line.empty()) break; + sequence += line; + } + + std::cout << "sequence: " << sequence << "\n"; + std::cout << "Processing query " << 0 << " ... "; + std::cout.flush(); + cudasw4::DecodedQueryView queryView(sequence.data(), sequence.size()); + ScanResult scanResult = cudaSW4.scan(queryView, std::nullopt); + if(options.verbose){ + std::cout << "Done. Scan time: " << scanResult.stats.seconds << " s, " << scanResult.stats.gcups << " GCUPS\n"; + }else{ + std::cout << "Done.\n"; + } + + if(options.outputMode == ProgramOptions::OutputMode::Plain){ + printScanResultPlain(outputfile, scanResult, cudaSW4); + }else{ + printScanResultTSV(outputfile, scanResult, cudaSW4, -1, sequence.size(), "-"); + } + }else{ + std::cout << "Missing argument for command 's'\n"; + } + }else if(command == "f"){ + if(tokens.size() > 1){ + const auto& filename = tokens[1]; + try{ + kseqpp::KseqPP reader(filename); + int64_t query_num = 0; + + while(reader.next() >= 0){ + std::cout << "Processing query " << query_num << " ... "; + std::cout.flush(); + const std::string& header = reader.getCurrentHeader(); + const std::string& sequence = reader.getCurrentSequence(); + + cudasw4::DecodedQueryView queryView(sequence.data(), sequence.size()); + ScanResult scanResult = cudaSW4.scan(queryView, std::nullopt); + if(options.verbose){ + std::cout << "Done. 
Scan time: " << scanResult.stats.seconds << " s, " << scanResult.stats.gcups << " GCUPS\n"; + }else{ + std::cout << "Done.\n"; + } + + if(options.outputMode == ProgramOptions::OutputMode::Plain){ + std::cout << "Query " << query_num << ", header" << header + << ", length " << sequence.size() + << ", num overflows " << scanResult.stats.numOverflows << "\n"; + + printScanResultPlain(outputfile, scanResult, cudaSW4); + }else{ + printScanResultTSV(outputfile, scanResult, cudaSW4, -1, sequence.size(), "-"); + } + + query_num++; + } + }catch(...){ + std::cout << "Error\n"; + } + }else{ + std::cout << "Missing argument for command 'f' \n"; + } + }else{ + std::cout << "Unrecognized command: " << command << "\n"; + } + + std::cout << "Waiting for command...\n"; + } + + } + +} diff --git a/lib/libmarv/src/makedb.cpp b/lib/libmarv/src/makedb.cpp new file mode 100644 index 000000000..64ef00a14 --- /dev/null +++ b/lib/libmarv/src/makedb.cpp @@ -0,0 +1,389 @@ + +#define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_OMP + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "config.hpp" + +#include "hpc_helpers/all_helpers.cuh" +#include "sequence_io.h" +#include "dbdata.hpp" +#include "convert.cuh" +#include "kseqpp/kseqpp.hpp" +#include "length_partitions.hpp" +#include "mmapbuffer.hpp" + + +std::size_t getAvailableMemoryInKB_linux(){ + //https://stackoverflow.com/questions/349889/how-do-you-determine-the-amount-of-linux-system-ram-in-c + std::string token; + std::ifstream file("/proc/meminfo"); + assert(bool(file)); + while(file >> token) { + if(token == "MemAvailable:") { + std::size_t mem; + if(file >> mem) { + return mem; + } else { + return 0; + } + } + file.ignore(std::numeric_limits::max(), '\n'); + } + return 0; +} + + +std::size_t getCurrentRSS_linux(){ + std::ifstream in("/proc/self/statm"); + std::size_t tmp, rss; + in >> tmp >> rss; + + return rss * sysconf(_SC_PAGESIZE); +} + +std::size_t getRSSLimit_linux(){ + rlimit rlim; + int ret = getrlimit(RLIMIT_RSS, &rlim); + if(ret != 0){ + std::perror("Could not get RSS limit!"); + return 0; + } + return rlim.rlim_cur; +} + + +std::size_t getAvailableMemoryInKB(){ + return std::min(getAvailableMemoryInKB_linux(), (getRSSLimit_linux() - getCurrentRSS_linux()) / 1024); +} + + +struct InMemoryBatch{ + std::vector chars; + std::vector offsets; + std::vector lengths; + std::vector headers; + std::vector headerOffsets; +}; + +struct HybridBatch{ + static constexpr size_t charweight = 50; + static constexpr size_t offsetweight = 7; + static constexpr size_t lengthweight = 7; + static constexpr size_t headersweight = 29; + static constexpr size_t headeroffsetweight = 7; + + static_assert(charweight + offsetweight + lengthweight + headersweight + headeroffsetweight == 100); + + HybridBatch(const std::string& temppath, size_t memoryLimit) : + chars(0, (memoryLimit / 100) * charweight, temppath + "_cudasw4tmpchars"), + offsets(0, (memoryLimit / 100) * offsetweight, temppath + "_cudasw4tmpoffsets"), + lengths(0, (memoryLimit / 100) * lengthweight, temppath + "_cudasw4tmplengths"), + headers(0, (memoryLimit / 100) * headersweight, temppath + "_cudasw4tmpheaders"), + headerOffsets(0, (memoryLimit / 100) * headeroffsetweight, temppath + "_cudasw4tmpheaderOffsets") + { + + } + cudasw4::FileBackedUVector chars; + cudasw4::FileBackedUVector offsets; + cudasw4::FileBackedUVector lengths; + cudasw4::FileBackedUVector headers; + cudasw4::FileBackedUVector headerOffsets; +}; + + +template +void 
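[Editor's note] HybridBatch above splits the makedb memory limit across its five file-backed vectors using fixed percentage weights (50/7/7/29/7, asserted to sum to 100). The sketch below only illustrates how a given --mem value is divided; the 10 GiB figure is an example, not output of the tool:

#include <cstddef>
#include <cstdio>

int main() {
    const std::size_t memoryLimit = std::size_t(10) << 30;  // example: 10 GiB
    const std::size_t unit = memoryLimit / 100;
    std::printf("chars:         %zu bytes\n", unit * 50);
    std::printf("offsets:       %zu bytes\n", unit * 7);
    std::printf("lengths:       %zu bytes\n", unit * 7);
    std::printf("headers:       %zu bytes\n", unit * 29);
    std::printf("headerOffsets: %zu bytes\n", unit * 7);
}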
loadWholeFileIntoBatch_withPaddedSequences(const std::string& inputfilename, Batch& batch){ + constexpr int ALIGN = 4; + + batch.chars.clear(); + batch.offsets.clear(); + batch.lengths.clear(); + batch.headers.clear(); + batch.headerOffsets.clear(); + batch.offsets.push_back(0); + batch.headerOffsets.push_back(0); + + kseqpp::KseqPP reader(inputfilename); + while(reader.next() >= 0){ + const std::string& header = reader.getCurrentHeader(); + const std::string& sequence = reader.getCurrentSequence(); + + const size_t sequencepadding = (sequence.size() % ALIGN == 0) ? 0 : ALIGN - sequence.size() % ALIGN; + + batch.chars.insert(batch.chars.end(), sequence.begin(), sequence.end()); + batch.chars.insert(batch.chars.end(), sequencepadding, ' '); + batch.offsets.push_back(batch.chars.size()); + batch.lengths.push_back(sequence.size()); + + batch.headers.insert(batch.headers.end(), header.begin(), header.end()); + batch.headerOffsets.push_back(batch.headers.size()); + } +} + +template +void loadWholeFileIntoBatch_withPaddedConvertedSequences(const std::string& inputfilename, Batch& batch, bool allowLowerCase){ + constexpr int ALIGN = 4; + + batch.chars.clear(); + batch.offsets.clear(); + batch.lengths.clear(); + batch.headers.clear(); + batch.headerOffsets.clear(); + batch.offsets.push_back(0); + batch.headerOffsets.push_back(0); + + kseqpp::KseqPP reader(inputfilename); + while(reader.next() >= 0){ + if(batch.lengths.size() > cudasw4::MaxSequencesInDB::value()){ + std::string msg = "File contains at least " + std::to_string(batch.lengths.size()+1) + + " sequences, but config allows at most " + std::to_string(cudasw4::MaxSequencesInDB::value()); + throw std::runtime_error(msg); + } + + const std::string& header = reader.getCurrentHeader(); + const std::string& sequence = reader.getCurrentSequence(); + + size_t sequenceLength = sequence.size(); + if(sequenceLength > cudasw4::MaxSequenceLength::value()){ + std::string msg = "Got sequence of length " + std::to_string(sequenceLength) + + ", but config allows only lengths <= " + std::to_string(cudasw4::MaxSequenceLength::value()); + throw std::runtime_error(msg); + } + + + const size_t sequencepadding = (sequence.size() % ALIGN == 0) ? 
0 : ALIGN - sequence.size() % ALIGN; + + const size_t oldCharsSize = batch.chars.size(); + const size_t newCharsSize = oldCharsSize + sequence.size() + sequencepadding; + batch.chars.resize(newCharsSize); + if(!allowLowerCase){ + auto convert = cudasw4::ConvertAA_20{}; + auto it = std::transform(sequence.begin(), sequence.end(), batch.chars.begin() + oldCharsSize, convert); + std::fill(it, batch.chars.end(), convert(' ')); // add converted padding + }else{ + auto convert = cudasw4::ConvertAA_20_CaseSensitive{}; + auto it = std::transform(sequence.begin(), sequence.end(), batch.chars.begin() + oldCharsSize, convert); + std::fill(it, batch.chars.end(), convert(' ')); // add converted padding + } + batch.offsets.push_back(newCharsSize); + batch.lengths.push_back(sequence.size()); + + batch.headers.insert(batch.headers.end(), header.begin(), header.end()); + batch.headerOffsets.push_back(batch.headers.size()); + } +} + +template +void createDBfilesFromSequenceBatch(const std::string& outputPrefix, const Batch& batch){ + using cudasw4::DBdataIoConfig; + + const size_t numSequences = batch.lengths.size(); + + std::vector indices(numSequences); + std::iota(indices.begin(), indices.end(), cudasw4::ReferenceIdT(0)); + + auto compareIndicesByLength = [&](const auto& l, const auto& r){ + return batch.lengths[l] < batch.lengths[r]; + }; + + std::sort(indices.begin(), indices.end(), compareIndicesByLength); + + auto lengthBoundaries = cudasw4::getLengthPartitionBoundaries(); + const int numPartitions = lengthBoundaries.size(); + + std::vector numSequencesPerPartition(numPartitions); + + auto partitionBegin = indices.begin(); + for(int i = 0; i < numPartitions; i++){ + //length k is in partition i if boundaries[i-1] < k <= boundaries[i] + int searchFor = lengthBoundaries[i]; + if(searchFor < std::numeric_limits::max()){ + searchFor += 1; + } + auto partitionEnd = std::lower_bound( + partitionBegin, + indices.end(), + searchFor, + [&](const auto& l, const auto& r){ + return batch.lengths[l] < r; + } + ); + numSequencesPerPartition[i] = std::distance(partitionBegin, partitionEnd); + partitionBegin = partitionEnd; + } + // for(int i = 0; i < numPartitions; i++){ + // std::cout << "numInPartition " << i << " (<= " << lengthBoundaries[i] << " ) : " << numSequencesPerPartition[i] << "\n"; + // } + + //write partition data to metadata file + std::ofstream metadataout(outputPrefix + DBdataIoConfig::metadatafilename(), std::ios::binary); + if(!metadataout) throw std::runtime_error("Cannot open output file " + outputPrefix + DBdataIoConfig::metadatafilename()); + + metadataout.write((const char*)&numPartitions, sizeof(int)); + for(int i = 0; i < numPartitions; i++){ + const int limit = lengthBoundaries[i]; + metadataout.write((const char*)&limit, sizeof(int)); + } + metadataout.write((const char*)numSequencesPerPartition.data(), sizeof(size_t) * numPartitions); + + + //write db files with sequences sorted by length + + std::ofstream headersout(outputPrefix + DBdataIoConfig::headerfilename(), std::ios::binary); + if(!headersout) throw std::runtime_error("Cannot open output file " + outputPrefix + DBdataIoConfig::headerfilename()); + std::ofstream headersoffsetsout(outputPrefix + DBdataIoConfig::headeroffsetsfilename(), std::ios::binary); + if(!headersoffsetsout) throw std::runtime_error("Cannot open output file " + outputPrefix + DBdataIoConfig::headeroffsetsfilename()); + std::ofstream charsout(outputPrefix + DBdataIoConfig::sequencesfilename(), std::ios::binary); + if(!charsout) throw std::runtime_error("Cannot 
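[Editor's note] The metadata file written above has a simple fixed layout: an int with the number of partitions, then one int limit per partition, then one size_t sequence count per partition. A hedged sketch of reading it back (the path argument is an assumption; the patch's own reader lives elsewhere):

#include <cstddef>
#include <fstream>
#include <string>
#include <vector>

void readMetadata(const std::string& path) {
    std::ifstream in(path, std::ios::binary);
    int numPartitions = 0;
    in.read((char*)&numPartitions, sizeof(int));
    std::vector<int> limits(numPartitions);            // upper length bound per partition
    in.read((char*)limits.data(), sizeof(int) * numPartitions);
    std::vector<std::size_t> counts(numPartitions);    // sequences per partition
    in.read((char*)counts.data(), sizeof(std::size_t) * numPartitions);
}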
open output file " + outputPrefix + DBdataIoConfig::sequencesfilename()); + std::ofstream offsetsout(outputPrefix + DBdataIoConfig::sequenceoffsetsfilename(), std::ios::binary); + if(!offsetsout) throw std::runtime_error("Cannot open output file " + outputPrefix + DBdataIoConfig::sequenceoffsetsfilename()); + std::ofstream lengthsout(outputPrefix + DBdataIoConfig::sequencelengthsfilename(), std::ios::binary); + if(!lengthsout) throw std::runtime_error("Cannot open output file " + outputPrefix + DBdataIoConfig::sequencelengthsfilename()); + + size_t currentHeaderOffset = 0; + size_t currentCharOffset = 0; + headersoffsetsout.write((const char*)¤tHeaderOffset, sizeof(size_t)); + offsetsout.write((const char*)¤tCharOffset, sizeof(size_t)); + for(size_t i = 0; i < numSequences; i++){ + const size_t sortedIndex = indices[i]; + + const char* const header = batch.headers.data() + batch.headerOffsets[sortedIndex]; + const int headerLength = batch.headerOffsets[sortedIndex+1] - batch.headerOffsets[sortedIndex]; + + headersout.write(header, headerLength); + currentHeaderOffset += headerLength; + headersoffsetsout.write((const char*)¤tHeaderOffset, sizeof(size_t)); + + const size_t numChars = batch.offsets[sortedIndex+1] - batch.offsets[sortedIndex]; + const cudasw4::SequenceLengthT length = batch.lengths[sortedIndex]; + const char* const sequence = batch.chars.data() + batch.offsets[sortedIndex]; + + + charsout.write(sequence, numChars); + lengthsout.write((const char*)&length, sizeof(cudasw4::SequenceLengthT)); + currentCharOffset += numChars; + offsetsout.write((const char*)¤tCharOffset, sizeof(size_t)); + } +} + + +int main(int argc, char* argv[]) +{ + + + if(argc < 3) { + std::cout << "Usage:\n " << argv[0] << " pathtodb/dbname [options]\n"; + std::cout << "Input file may be gzip'ed. pathtodb must exist.\n"; + std::cout << "Options:\n"; + std::cout << " --mem val : Memory limit. Can use suffix K,M,G. If makedb requires more memory, temp files in temp directory will be used. Default all available memory.\n"; + std::cout << " --tempdir val : Temp directory for temporary files. Must exist. Default is db output directory.\n"; + std::cout << " --allowLowerCase : Convert lower-case letters to distinct numbers. 
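[Editor's note] The write loop above emits flat arrays in which both offset files start with 0 and contain numSequences + 1 entries, so each record is delimited by consecutive offsets. An illustrative accessor for that layout (int stands in for cudasw4::SequenceLengthT; this is a sketch, not the patch's reader):

#include <cstddef>

struct SequenceView {
    const char* data;
    std::size_t paddedChars;  // offsets[i+1] - offsets[i], includes the alignment padding
    int length;               // the value stored in the lengths file
};

SequenceView getSequence(const char* chars, const std::size_t* offsets,
                         const int* lengths, std::size_t i) {
    return { chars + offsets[i], offsets[i + 1] - offsets[i], lengths[i] };
}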
Without this option, lower-case letters are treated as upper-case.\n"; + return 0; + } + + auto parseMemoryString = [](const std::string& string){ + std::size_t result = 0; + if(string.length() > 0){ + std::size_t factor = 1; + bool foundSuffix = false; + switch(string.back()){ + case 'K':{ + factor = std::size_t(1) << 10; + foundSuffix = true; + }break; + case 'M':{ + factor = std::size_t(1) << 20; + foundSuffix = true; + }break; + case 'G':{ + factor = std::size_t(1) << 30; + foundSuffix = true; + }break; + } + if(foundSuffix){ + const auto numberString = string.substr(0, string.size()-1); + result = factor * std::stoull(numberString); + }else{ + result = std::stoull(string); + } + }else{ + result = 0; + } + return result; + }; + + const std::string fastafilename = argv[1]; + const std::string outputPrefix = argv[2]; + std::string temppath = outputPrefix; + size_t availableMem = getAvailableMemoryInKB() * 1024; + constexpr size_t GB = 1024*1024*1024; + if(availableMem > 1*GB){ + availableMem -= 1*GB; + } + + bool allowLowerCase = false; + + for(int i = 3; i < argc; i++){ + const std::string arg = argv[i]; + if(arg == "--mem"){ + availableMem = parseMemoryString(argv[++i]); + }else if(arg == "--tempdir"){ + temppath = argv[++i]; + if(temppath.back() != '/'){ + temppath += '/'; + } + }else if(arg == "--allowLowerCase"){ + allowLowerCase = true; + }else{ + std::cout << "Unexpected arg " << arg << "\n"; + } + } + std::cout << "availableMem: " << availableMem << "\n"; + + //InMemoryBatch batch; + HybridBatch batch(temppath, availableMem); + + std::cout << "Parsing file\n"; + helpers::CpuTimer timer1("file parsing"); + //loadWholeFileIntoBatch_withPaddedConvertedSequences(fastafilename, batch, allowLowerCase); + loadWholeFileIntoBatch_withPaddedSequences(fastafilename, batch); + timer1.print(); + + std::cout << "Number of input sequences: " << batch.offsets.size() - 1 << '\n'; + std::cout << "Number of input characters: " << batch.chars.size() << '\n'; + + std::cout << "Converting amino acids\n"; + helpers::CpuTimer timer2("amino conversion"); + if(!allowLowerCase){ + #pragma omp parallel for + for(size_t i = 0; i < batch.chars.size(); i++){ + batch.chars[i] = cudasw4::ConvertAA_20{}(batch.chars[i]); + } + }else{ + #pragma omp parallel for + for(size_t i = 0; i < batch.chars.size(); i++){ + batch.chars[i] = cudasw4::ConvertAA_20_CaseSensitive{}(batch.chars[i]); + } + } + timer2.print(); + + std::cout << "Creating DB files\n"; + const std::string batchOutputPrefix = outputPrefix + std::to_string(0); + helpers::CpuTimer timer3("db creation"); + createDBfilesFromSequenceBatch(batchOutputPrefix, batch); + timer3.print(); + + cudasw4::DBGlobalInfo info; + + cudasw4::writeGlobalDbInfo(outputPrefix, info); + +} diff --git a/lib/libmarv/src/mapped_file.hpp b/lib/libmarv/src/mapped_file.hpp new file mode 100644 index 000000000..87a7e9ce9 --- /dev/null +++ b/lib/libmarv/src/mapped_file.hpp @@ -0,0 +1,121 @@ +#ifndef MAPPED_FILE_HPP +#define MAPPED_FILE_HPP + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace cudasw4{ + +class MappedFileException : public std::exception{ + std::string message; +public: + MappedFileException() : MappedFileException("MMapException"){} + MappedFileException(const std::string& msg) : message(msg){} + + const char* what() const noexcept override{ + return message.c_str(); + } +}; + +struct MappedFile{ + struct Options{ + bool readaccess = true; + bool writeaccess = false; + bool prefault = false; + }; + + int fd; 
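[Editor's note] For reference, parseMemoryString above treats an optional trailing K, M, or G as a binary multiplier and otherwise hands the whole string to std::stoull. Illustrative expected values (comments only, since the function is a local lambda in makedb's main):

// parseMemoryString("512")  == 512
// parseMemoryString("64K")  == 64  * 1024
// parseMemoryString("512M") == 512 * 1024 * 1024
// parseMemoryString("16G")  == 16ull * 1024 * 1024 * 1024
// Lower-case suffixes are not recognized: "16g" falls through to std::stoull,
// which stops at the 'g' and yields 16 bytes.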
+ size_t filesize; + void* rawMmapPtr; + std::string filename; + Options options; + + MappedFile(const std::string& filename_, MappedFile::Options options_) + : filesize(getFileSizeInBytes(filename_)), + filename(filename_), + options(options_){ + + int openflags = 0; + if(options.readaccess && options.writeaccess){ + openflags = O_RDWR; + }else if(options.readaccess && !options.writeaccess){ + openflags = O_RDONLY; + }else if(!options.readaccess && options.writeaccess){ + openflags = O_WRONLY; + }else{ + throw MappedFileException("Invalid options for MappedFile"); + } + + fd = open(filename.c_str(), openflags); + if(fd == -1){ + perror("open"); + throw MappedFileException("Could not open file " + filename); + } + + int mmapprot = 0; + if(options.readaccess){ + mmapprot |= PROT_READ; + } + if(options.writeaccess){ + mmapprot |= PROT_WRITE; + } + + int mmapflags = MAP_PRIVATE; + if(options.writeaccess){ + mmapflags = MAP_SHARED; + } + if(options.prefault){ + mmapflags |= MAP_POPULATE; //load the file into memory immediately + } + + rawMmapPtr = mmap(nullptr, filesize, mmapprot, mmapflags, fd, 0); + if(rawMmapPtr == MAP_FAILED){ + close(fd); + throw MappedFileException("Could not map file " + filename); + } + } + + ~MappedFile(){ + munmap(rawMmapPtr, filesize); + close(fd); + } + + char* data() noexcept{ + return (char*)rawMmapPtr; + } + const char* data() const noexcept{ + return (const char*)rawMmapPtr; + } + size_t size() const noexcept{ + return filesize; + } + + template + size_t numElements() const noexcept{ + return size() / sizeof(T); + } +private: + size_t getFileSizeInBytes(const std::string& filename){ + struct stat stat_buf; + int rc = stat(filename.c_str(), &stat_buf); + if(rc == 0){ + return stat_buf.st_size; + }else{ + throw MappedFileException("Could not determine file size of file " + filename); + } + } +}; + + +} //namespace cudasw4 + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/marv.cu b/lib/libmarv/src/marv.cu new file mode 100644 index 000000000..1a3d4e8db --- /dev/null +++ b/lib/libmarv/src/marv.cu @@ -0,0 +1,253 @@ +#include +#include +#include + +#include "hpc_helpers/all_helpers.cuh" +#include "hpc_helpers/peer_access.cuh" + +#include "dbdata.hpp" +#include "cudasw4.cuh" +#include "config.hpp" +namespace b64 { +#include "base64.h" +} +#include "marv.h" + + +size_t getMaxTempBytes(int maxSubjectLength) { + int deviceId = 0; + int numSMs = 0; + cudaGetDevice(&deviceId); CUERR + cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, deviceId); CUERR + + constexpr int maxGroupSize = 8; // from getMultiTileGroupRegConfigForPSSM_Gapless + constexpr int threadBlockSize = 512; + const int numGroupsPerBlock = threadBlockSize / maxGroupSize; + const size_t tempStorageElementsPerGroup = SDIV(maxSubjectLength, 4); + const size_t tempStorageElementsPerBlock = tempStorageElementsPerGroup * numGroupsPerBlock; + return tempStorageElementsPerBlock * sizeof(float2) * numSMs; +} + +cudasw4::ScanType mapCswScanType(Marv::AlignmentType type) { + switch (type) { + case Marv::AlignmentType::GAPLESS: + return cudasw4::ScanType::Gapless; + case Marv::AlignmentType::SMITH_WATERMAN: + return cudasw4::ScanType::SW_Endpos; + case Marv::AlignmentType::GAPLESS_SMITH_WATERMAN: + return cudasw4::ScanType::GaplessPlusSW_Endpos; + default: + return cudasw4::ScanType::Gapless; + } +} + +Marv::Marv(size_t dbEntries, int alphabetSize, int maxSeqLength, size_t maxSeqs, Marv::AlignmentType alignmentType) : dbEntries(dbEntries), alphabetSize(alphabetSize), dbmanager(NULL), 
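[Editor's note] getMaxTempBytes above sizes the per-SM temporary buffer from the kernel configuration: 512 threads per block, groups of 8, and one float2 per 4 subject characters per group. A worked example, assuming 108 SMs (an A100-class GPU, not something the code queries here) and maxSubjectLength = 65536:

// elementsPerGroup = SDIV(65536, 4)        = 16384
// groupsPerBlock   = 512 / 8               = 64
// elementsPerBlock = 16384 * 64            = 1'048'576
// bytes            = 1'048'576 * sizeof(float2) * 108 SMs
//                  = 8 MiB per SM * 108    = 864 MiB of temp storage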
alignmentType(alignmentType) { + std::vector deviceIds = getDeviceIds(); + helpers::PeerAccess peerAccess(deviceIds, false); + peerAccess.enableAllPeerAccesses(); + + cudasw4::MemoryConfig memoryConfig; + memoryConfig.maxBatchBytes = 128ull * 1024ull * 1024ull; + memoryConfig.maxBatchSequences = 10'000'000; + // memoryConfig.maxTempBytes = 4ull * 1024ull * 1024ull * 1024ull; + memoryConfig.maxTempBytes = getMaxTempBytes(maxSeqLength); + memoryConfig.maxGpuMem = std::numeric_limits::max(); + + const int maxResults = std::min((int)maxSeqs, cudasw4::MaxNumberOfResults::value()); + const bool verbose = false; + KernelConfigFilenames kernelConfigFilenames; + //set the following to overwrite the hardcoded config + // kernelConfigFilenames.gapless = "configfileA.txt"; + // kernelConfigFilenames.sw = "configfileB.txt"; + + cudasw = static_cast(new cudasw4::CudaSW4( + deviceIds, + maxResults, + cudasw4::BlosumType::BLOSUM62_20, + memoryConfig, + verbose, + kernelConfigFilenames + )); + cudasw4::CudaSW4* sw = static_cast(cudasw); + sw->setScanType(mapCswScanType(alignmentType)); +} +std::vector> allocations_all; + +Marv::~Marv() { + allocations_all.clear(); + delete static_cast(cudasw); + // delete static_cast(db); +} + +std::vector Marv::getDeviceIds() { + std::vector deviceIds; + int num = 0; + cudaGetDeviceCount(&num); CUERR + for (int i = 0; i < num; i++) { + deviceIds.push_back(i); + } + return deviceIds; +} + +void* Marv::loadDb(char* data, size_t* offset, int32_t* length, size_t dbByteSize) { + return static_cast(new cudasw4::MMseqsDB(cudasw4::loadMMseqsDB( + dbEntries, data, offset, length, dbByteSize + ))); +} + +void* Marv::loadDb(char* data, size_t dbByteSize, void* otherdb) { + cudasw4::MMseqsDB* db = static_cast(otherdb); + const cudasw4::DBdataMetaData& meta = db->getData().getMetaData(); + return static_cast(new cudasw4::ExternalDB(cudasw4::loadExternalDB( + dbEntries, dbByteSize, meta + ))); +} + +std::vector split(const std::string &str, const std::string &sep) { + std::vector arr; + + char *cstr = strdup(str.c_str()); + const char* csep = sep.c_str(); + char *rest; + char *current = strtok_r(cstr, csep, &rest); + while (current != NULL) { + arr.emplace_back(current); + current = strtok_r(NULL, csep, &rest); + } + free(cstr); + + return arr; +} + +void Marv::setDbWithAllocation(void* dbhandle, const std::string& allocationinfo) { + auto parts = split(allocationinfo, ":"); + + cudaIpcMemHandle_t h1, h2, h3; + char* charData; + cudasw4::SequenceLengthT* lengths; + size_t* offsets; + size_t numChars, numSubjects; + + std::string decode; + + decode = b64::base64_decode(parts[0].data(), parts[0].length()); + memcpy((unsigned char *)(&h1), (unsigned char *)decode.data(), decode.length()); + cudaIpcOpenMemHandle((void **)&charData, h1, cudaIpcMemLazyEnablePeerAccess); + CUERR + decode = b64::base64_decode(parts[1].data(), parts[1].length()); + memcpy((unsigned char *)(&h2), (unsigned char *)decode.data(), decode.length()); + cudaIpcOpenMemHandle((void **)&lengths, h2, cudaIpcMemLazyEnablePeerAccess); + CUERR + decode = b64::base64_decode(parts[2].data(), parts[2].length()); + memcpy((unsigned char *)(&h3), (unsigned char *)decode.data(), decode.length()); + cudaIpcOpenMemHandle((void **)&offsets, h3, cudaIpcMemLazyEnablePeerAccess); + CUERR + numChars = strtoull(parts[3].c_str(), NULL, 10); + numSubjects = strtoull(parts[4].c_str(), NULL, 10); + std::vector> allocations_remote; + allocations_remote.emplace_back(std::make_shared(GpuDatabaseAllocationView(charData, lengths, offsets, 
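[Editor's note] The allocationinfo string consumed here is produced by getDbMemoryHandle() further below; documenting its shape as comments for readers of this part of the patch:

// allocationinfo layout (fields joined with ':'):
//   base64(cudaIpcMemHandle_t of the GPU character data)
//   base64(cudaIpcMemHandle_t of the GPU length array)
//   base64(cudaIpcMemHandle_t of the GPU offset array)
//   numChars (decimal)
//   numSubjects (decimal)
// The three handles are opened with cudaIpcOpenMemHandle so that a second
// process can reuse the GPU-resident database without copying it.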
numChars, numSubjects))); + + cudasw4::MMseqsDB* db = static_cast(dbhandle); + auto doNothingDeleter = [](cudasw4::MMseqsDB* ptr){ /* do nothing */ }; + std::shared_ptr dbPtr(static_cast(db), doNothingDeleter); + + cudasw4::CudaSW4* sw = static_cast(cudasw); + // OpaqueAllocationManager* manager = static_cast(allocationhandle); + sw->setDatabase(dbPtr, allocations_remote); +} + +void Marv::setDb(void* dbhandle) { + cudasw4::MMseqsDB* db = static_cast(dbhandle); + auto doNothingDeleter = [](cudasw4::MMseqsDB* ptr){ /* do nothing */ }; + std::shared_ptr dbPtr(static_cast(db), doNothingDeleter); + cudasw4::CudaSW4* sw = static_cast(cudasw); + sw->setDatabase(dbPtr); +} + +std::string Marv::getDbMemoryHandle() { + cudasw4::CudaSW4* sw = static_cast(cudasw); + // sw->printDBInfo(); + // sw->printDBLengthPartitions(); + sw->prefetchDBToGpus(); + allocations_all = sw->getFullGpuDBAllocations(); + cudaIpcMemHandle_t h1, h2, h3; + // char* charData; + // cudasw4::SequenceLengthT* lengths; + // size_t* offsets; + size_t numChars, numSubjects; + // for(const auto& alloc : allocations_all){ + const auto& alloc = allocations_all[0]; + cudaIpcGetMemHandle(&h1, alloc->getCharData()); + CUERR + cudaIpcGetMemHandle(&h2, alloc->getLengthData()); + CUERR + cudaIpcGetMemHandle(&h3, alloc->getOffsetData()); + CUERR + // charData = alloc->getCharData(); + // lengths = alloc->getLengthData(); + // offsets = alloc->getOffsetData(); + numChars = alloc->getNumChars(); + numSubjects = alloc->getNumSubjects(); + // } + + std::vector handles; + std::string enc1 = b64::base64_encode(&h1, sizeof(cudaIpcMemHandle_t)); + std::string enc2 = b64::base64_encode(&h2, sizeof(cudaIpcMemHandle_t)); + std::string enc3 = b64::base64_encode(&h3, sizeof(cudaIpcMemHandle_t)); + + std::string res; + res.append(enc1); + res.append(1, ':'); + res.append(enc2); + res.append(1, ':'); + res.append(enc3); + res.append(1, ':'); + res.append(std::to_string(numChars)); + res.append(1, ':'); + res.append(std::to_string(numSubjects)); + res.append(1, '\n'); + + return res; +} + +void Marv::printInfo() { + cudasw4::CudaSW4* sw = static_cast(cudasw); + sw->printDBInfo(); + sw->printDBLengthPartitions(); +} + +void Marv::prefetch() { + cudasw4::CudaSW4* sw = static_cast(cudasw); + sw->prefetchDBToGpus(); +} + +void Marv::startTimer() { + cudasw4::CudaSW4* sw = static_cast(cudasw); + sw->totalTimerStart(); +} + +void Marv::stopTimer() { + cudasw4::CudaSW4* sw = static_cast(cudasw); + auto totalBenchmarkStats = sw->totalTimerStop(); +} + +//sequence must be encoded +Marv::Stats Marv::scan(const char* sequence, size_t sequenceLength, int8_t* pssm, Result* results) { + cudasw4::CudaSW4* sw = static_cast(cudasw); + cudasw4::EncodedQueryView queryView(sequence, sequenceLength); + cudasw4::ScanResult scanResult = sw->scan(queryView, pssm); + for (size_t i = 0; i < scanResult.scores.size(); i++) { + results[i] = Result( + scanResult.referenceIds[i], + scanResult.scores[i], + alignmentType != GAPLESS ? scanResult.endPositions[i].x : -1, + alignmentType != GAPLESS ? 
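[Editor's note] A minimal, hedged usage sketch of the Marv facade's scan call as declared in marv.h below. It assumes the query is already encoded, that the caller supplies a PSSM pointer appropriate for the chosen alignment type, and that a buffer of maxSeqs results (the value passed to the Marv constructor) is large enough for the hits that are written back:

#include <cstddef>
#include <cstdint>
#include <vector>
#include "marv.h"   // header added by this patch

void exampleScan(Marv& marv, const char* encodedQuery, std::size_t queryLength,
                 int8_t* pssm, std::size_t maxSeqs) {
    // Room for up to maxSeqs hits; Result has no default constructor.
    std::vector<Marv::Result> results(maxSeqs, Marv::Result(0, 0, -1, -1));
    Marv::Stats stats = marv.scan(encodedQuery, queryLength, pssm, results.data());
    results.resize(stats.results);  // only the first stats.results entries were written
    // stats.numOverflows, stats.seconds and stats.gcups describe the scan itself.
}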
scanResult.endPositions[i].y : -1 + ); + } + Stats stats; + stats.results = scanResult.scores.size(); + stats.numOverflows = scanResult.stats.numOverflows; + stats.seconds = scanResult.stats.seconds; + stats.gcups = scanResult.stats.gcups; + return stats; +} diff --git a/lib/libmarv/src/marv.h b/lib/libmarv/src/marv.h new file mode 100644 index 000000000..4fcd69749 --- /dev/null +++ b/lib/libmarv/src/marv.h @@ -0,0 +1,60 @@ +#ifndef MARV_H +#define MARV_H + +#include + +class Marv { +public: + enum AlignmentType { + GAPLESS, + SMITH_WATERMAN, + GAPLESS_SMITH_WATERMAN + }; + + Marv(size_t dbEntries, int alphabetSize, int maxSeqLength, size_t maxSeqs, AlignmentType alignmentType = AlignmentType::GAPLESS); + ~Marv(); + + static std::vector getDeviceIds(); + void* loadDb(char* data, size_t* offset, int32_t* length, size_t dbByteSize); + void* loadDb(char* data, size_t dbByteSize, void* otherdb); + void setDb(void* dbhandle); + void setDbWithAllocation(void* dbhandle, const std::string& allocationinfo); + std::string getDbMemoryHandle(); + + void printInfo(); + void prefetch(); + + void startTimer(); + void stopTimer(); + + struct Stats { + size_t results; + int numOverflows; + double seconds; + double gcups; + }; + + struct Result { + unsigned int id; + int score; + int qEndPos; + int dbEndPos; + + Result(unsigned int id, int score, int qEndPos, int dbEndPos) : + id(id), score(score), qEndPos(qEndPos), dbEndPos(dbEndPos) {}; + }; + + //sequence must be encoded + Stats scan(const char* sequence, size_t sequenceLength, int8_t* pssm, Result* results); + +private: + size_t dbEntries; + int alphabetSize; + + void* cudasw; + // void* db; + void* dbmanager; + AlignmentType alignmentType; +}; + +#endif diff --git a/lib/libmarv/src/mathops.cuh b/lib/libmarv/src/mathops.cuh new file mode 100644 index 000000000..a64af4cda --- /dev/null +++ b/lib/libmarv/src/mathops.cuh @@ -0,0 +1,435 @@ +#ifndef MATH_OPS_CUH +#define MATH_OPS_CUH + +#include +#include +#include + +// from https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS.html +#ifndef CUDART_ZERO_FP16 +#define CUDART_ZERO_FP16 __ushort_as_half((unsigned short)0x0000U) +#endif + +namespace cudasw4{ + + template + struct MathOps{}; + + template<> + struct MathOps{ + using Type = half; + using VecType = half2; + + __host__ __device__ + static VecType zero_score(){ + return (make_half2(0,0)); + } + + __device__ + static VecType add(const VecType& a, const VecType& b){ + return __hadd2(a,b); + } + + //max(a,b) + __device__ + static Type max(const Type& a, const Type& b){ + return __hmax(a, b); + } + + //max(a,b) + __device__ + static VecType max(const VecType& a, const VecType& b){ + return __hmax2(a, b); + } + + //max(a,b) + __device__ + static VecType max(const VecType& a, const VecType& b, bool* a_xIsMax, bool* a_yIsMax){ + #if 0 + //uses prmt to extract x and y + VecType result; + if(a.x >= b.x){ + result.x = a.x; + *a_xIsMax = true; + }else{ + result.x = b.x; + *a_xIsMax = false; + } + if(a.y >= b.y){ + result.y = a.y; + *a_yIsMax = true; + }else{ + result.y = b.y; + *a_yIsMax = false; + } + return result; + #else + VecType result = max(a,b); + if(a.x >= b.x){ + *a_xIsMax = true; + }else{ + *a_xIsMax = false; + } + if(a.y >= b.y){ + *a_yIsMax = true; + }else{ + *a_yIsMax = false; + } + return result; + #endif + } + + + //max(a,b,c) + __device__ + static VecType max3(const VecType& a, const VecType& b, const VecType& c){ + return __hmax2(a, __hmax2(b,c)); + } + + //max(a+b,c) + __device__ + static VecType 
add_max(const VecType& a, const VecType& b, const VecType& c){ + return __hmax2(__hadd2(a,b), c); + } + + //max(a+b,c) + __device__ + static VecType add_max(const VecType& a, const VecType& b, const VecType& c, bool* sum_xIsMax, bool* sum_yIsMax){ + VecType sum = add(a,b); + return max(sum, c, sum_xIsMax, sum_yIsMax); + } + + //max(a+b,0) + __device__ + static VecType add_relu(const VecType& a, const VecType& b){ + return add_max(a,b, make_half2(0,0)); + } + + //max(a+b,0) + __device__ + static VecType add_relu(const VecType& a, const VecType& b, const VecType& zero){ + return add_max(a,b, zero); + } + + //max(max(a + b, c), 0) + __device__ + static VecType add_max_relu(const VecType& a, const VecType& b, const VecType& c){ + return max(add_max(a,b,c), make_half2(0,0)); + } + + template + __device__ + static VecType reduce_max(Group& group, VecType val){ + return cooperative_groups::reduce(group, val, [](const auto& l, const auto& r){return __hmax2(l,r);}); + //return cooperative_groups::reduce(group, val, cooperative_groups::greater{}); + } + }; + + template<> + struct MathOps{ + using Type = short; + using VecType = short2; + + __host__ __device__ + static VecType zero_score(){ + return (make_short2(0,0)); + } + + __device__ + static unsigned int asUint(const short2& s){ + unsigned int u; + memcpy(&u, &s, sizeof(unsigned int)); + return u; + } + + __device__ + static VecType asVec(unsigned int u){ + VecType v; + memcpy(&v, &u, sizeof(unsigned int)); + return v; + } + + __device__ + static VecType add(const VecType& a, const VecType& b){ + return asVec(__vadd2(asUint(a), asUint(b))); + } + + //max(a,b) + __device__ + static Type max(const Type& a, const Type& b){ + return ::max(a, b); + } + + //max(a,b) + __device__ + static VecType max(const VecType& a, const VecType& b){ + return asVec(__vmaxs2(asUint(a), asUint(b))); + } + + //max(a,b) + __device__ + static VecType max(const VecType& a, const VecType& b, bool* a_xIsMax, bool* a_yIsMax){ + return asVec(__vibmax_u16x2(asUint(a), asUint(b), a_xIsMax, a_yIsMax)); + } + + //max(a,b,c) + __device__ + static VecType max3(const VecType& a, const VecType& b, const VecType& c){ + return asVec(__vimax3_s16x2(asUint(a), asUint(b), asUint(c))); + } + + //max(a+b,c) + __device__ + static VecType add_max(const VecType& a, const VecType& b, const VecType& c){ + return asVec(__viaddmax_s16x2(asUint(a), asUint(b), asUint(c))); + } + + //max(a+b,c) + __device__ + static VecType add_max(const VecType& a, const VecType& b, const VecType& c, bool* sum_xIsMax, bool* sum_yIsMax){ + VecType sum = add(a,b); + return max(sum, c, sum_xIsMax, sum_yIsMax); + } + + //max(a+b,0) + __device__ + static VecType add_relu(const VecType& a, const VecType& b){ + return add_max(a,b, make_short2(0,0)); + } + + //max(a+b,0) + __device__ + static VecType add_relu(const VecType& a, const VecType& b, const VecType& zero){ + return asVec(__viaddmax_s16x2(asUint(a), asUint(b), asUint(zero))); + } + + //max(max(a + b, c), 0) + __device__ + static VecType add_max_relu(const VecType& a, const VecType& b, const VecType& c){ + return asVec(__viaddmax_s16x2_relu(asUint(a), asUint(b), asUint(c))); + } + + template + __device__ + static VecType reduce_max(Group& group, VecType val){ + return asVec(cooperative_groups::reduce(group, asUint(val), [](const auto& l, const auto& r){return __vmaxs2(l,r);})); + //return cooperative_groups::reduce(group, val, cooperative_groups::greater{}); + } + }; + + + template<> + struct MathOps{ + using Type = float; + + __device__ + static Type add(const 
Type& a, const Type& b){ + return a + b; + } + + //max(a,b) + __device__ + static Type max(const Type& a, const Type& b){ + return ::max(a,b); + } + + //max(a,b) + __device__ + static Type max(const Type& a, const Type& b, bool* firstIsMax){ + if(a >= b){ + *firstIsMax = true; + return a; + }else{ + *firstIsMax = false; + return b; + } + } + + //max(a,b,c) + __device__ + static Type max3(const Type& a, const Type& b, const Type& c){ + return max(a,max(b,c)); + } + + //max(a+b,c) + __device__ + static Type add_max(const Type& a, const Type& b, const Type& c){ + return max(a+b,c); + } + + //max(a+b,c) + __device__ + static Type add_max(const Type& a, const Type& b, const Type& c, bool* firstIsMax){ + if(a+b >= c){ + *firstIsMax = true; + return a+b; + }else{ + *firstIsMax = false; + return c; + } + } + + //max(a+b,0) + __device__ + static Type add_relu(const Type& a, const Type& b){ + return add_max(a,b,0.0f); + } + + //max(max(a + b, c), 0) + __device__ + static Type add_max_relu(const Type& a, const Type& b, const Type& c){ + return max(add_max(a,b,c), 0.0f); + } + + template + __device__ + static Type reduce_max(Group& group, Type val){ + return cooperative_groups::reduce(group, val, cooperative_groups::greater{}); + } + }; + + + template<> + struct MathOps{ + using Type = int; + + __device__ + static Type add(const Type& a, const Type& b){ + return a + b; + } + + //max(a,b) + __device__ + static Type max(const Type& a, const Type& b){ + return ::max(a,b); + } + + //max(a,b) + __device__ + static Type max(const Type& a, const Type& b, bool* firstIsMax){ + return __vibmax_s32(a, b, firstIsMax); + } + + //max(a,b,c) + __device__ + static Type max3(const Type& a, const Type& b, const Type& c){ + return __vimax3_s32(a,b,c); + } + + //max(a+b,c) + __device__ + static Type add_max(const Type& a, const Type& b, const Type& c){ + return __viaddmax_s32(a,b,c); + } + + //max(a+b,c) + __device__ + static Type add_max(const Type& a, const Type& b, const Type& c, bool* firstIsMax){ + return __vibmax_s32(a+b, c, firstIsMax); + } + + //max(a+b,0) + __device__ + static Type add_relu(const Type& a, const Type& b){ + return add_max(a,b, 0); + } + + //max(max(a + b, c), 0) + __device__ + static Type add_max_relu(const Type& a, const Type& b, const Type& c){ + return __viaddmax_s32_relu(a,b,c); + } + + template + __device__ + static Type reduce_max(Group& group, Type val){ + return cooperative_groups::reduce(group, val, cooperative_groups::greater{}); + } + }; + + template<> + struct MathOps{ + using Type = half; + + //max(a,b) + __device__ + static Type max(const Type& a, const Type& b){ + return __hmax(a,b); + } + + //max(a,b,c) + __device__ + static Type max3(const Type& a, const Type& b, const Type& c){ + return max(a,max(b,c)); + } + + //max(a+b,c) + __device__ + static Type add_max(const Type& a, const Type& b, const Type& c){ + return max(a+b,c); + } + + //max(a+b,0) + __device__ + static Type add_relu(const Type& a, const Type& b){ + return add_max(a,b, CUDART_ZERO_FP16); + } + + //max(max(a + b, c), 0) + __device__ + static Type add_max_relu(const Type& a, const Type& b, const Type& c){ + return max(add_max(a,b,c), CUDART_ZERO_FP16); + } + + template + __device__ + static Type reduce_max(Group& group, Type val){ + return cooperative_groups::reduce(group, val, cooperative_groups::greater{}); + } + }; + + template<> + struct MathOps{ + using Type = short; + + //max(a,b) + __device__ + static Type max(const Type& a, const Type& b){ + return ::max(a,b); + } + + //max(a,b,c) + __device__ + static Type 
max3(const Type& a, const Type& b, const Type& c){ + return max(a,max(b,c)); + } + + //max(a+b,c) + __device__ + static Type add_max(const Type& a, const Type& b, const Type& c){ + return max(a+b,c); + } + + //max(a+b,0) + __device__ + static Type add_relu(const Type& a, const Type& b){ + return add_max(a,b,0); + } + + //max(max(a + b, c), 0) + __device__ + static Type add_max_relu(const Type& a, const Type& b, const Type& c){ + return max(add_max(a,b,c), 0); + } + + template + __device__ + static Type reduce_max(Group& group, Type val){ + return cooperative_groups::reduce(group, val, cooperative_groups::greater{}); + } + }; + + +} //namespace cudasw4 + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/mmapbuffer.hpp b/lib/libmarv/src/mmapbuffer.hpp new file mode 100644 index 000000000..82566e904 --- /dev/null +++ b/lib/libmarv/src/mmapbuffer.hpp @@ -0,0 +1,508 @@ +#ifndef MMAP_BUFFER_HPP +#define MMAP_BUFFER_HPP + +#include "hpc_helpers/all_helpers.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudasw4{ + +class FileBackedMMapBuffer{ +private: + class OpenCFile{ + private: + FILE* file = nullptr; + + public: + OpenCFile() = default; + + OpenCFile(const char* filename, const char* mode){ + file = fopen(filename, mode); + if(file == nullptr){ + perror("OpenCFile fopen"); + throw std::runtime_error("Cannot open file " + std::string(filename)); + } + } + + OpenCFile(const OpenCFile&) = delete; + OpenCFile(OpenCFile&& rhs){ + file = std::exchange(rhs.file, nullptr); + } + + OpenCFile& operator=(OpenCFile rhs){ + std::swap(*this, rhs); + return *this; + } + + ~OpenCFile(){ + if(file != nullptr){ + fclose(file); + } + } + + friend void swap(OpenCFile& l, OpenCFile& r) noexcept{ + using std::swap; + + swap(l.file, r.file); + } + + FILE* getFile() const noexcept{ + return file; + } + + int getFd() const noexcept{ + int ret = fileno(file); + if(ret == -1){ + perror("OpenCFile fileno"); + } + return ret; + } + }; + + void* rawtotaldata = nullptr; + std::size_t size = 0; + std::size_t capacity = 0; + + std::size_t memoryCapacity = 0; + std::size_t fileCapacity = 0; + + std::size_t memoryLimit = 0; + + OpenCFile filehandle{}; + std::string filename{}; + + //undo all mappings + int unmapMemoryAndFile(){ + int ret = unmapFile(); + if(ret != 0){ + return ret; + } + + if(memoryCapacity > 0){ + ret = munmap(rawtotaldata, memoryCapacity); + if(ret == 0){ + rawtotaldata = nullptr; + } + }; + return ret; + } + + //undo mappings of file + int unmapFile(){ + if(fileCapacity > 0){ + int ret = msync(((char*)rawtotaldata) + memoryCapacity, fileCapacity, MS_SYNC); + if(ret != 0){ + return ret; + } + + ret = munmap(((char*)rawtotaldata) + memoryCapacity, fileCapacity); + if(ret != 0){ + return ret; + } + if(memoryCapacity == 0){ + rawtotaldata = nullptr; + } + } + + return 0; + } + + //create virtual address range of size newcapacity which is backed by anonymous mapping + void* remap(std::size_t newcapacity){ + void* newptr = nullptr; + + if(rawtotaldata != nullptr){ + newptr = mremap(rawtotaldata, memoryCapacity, newcapacity, MREMAP_MAYMOVE); + }else{ + //neither memory mapping nor file mapping. 
make a fresh range + newptr = mmap(0, newcapacity, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0); + } + + return newptr; + } + + //create virtual address range of size newcapacity which is backed by anonymous mapping for at most memoryLimit bytes + //After memoryLimit bytes, the adress range is backed by a file mapping + void remapWithFile(std::size_t newcapacity){ + //remove file mappings, if any + int ret = unmapFile(); + if(ret != 0){ + perror("FileBackedMMapBuffer::remapWithFile unmapFile"); + throw std::runtime_error("remapWithFile failed"); + } + + #ifndef NDEBUG + const std::size_t pagesize = getpagesize(); + assert(newcapacity % pagesize == 0); + #endif + + //get new virtual adress range of size newcapacity with anonymous mapping + void* newptr = remap(newcapacity); + if(newptr == MAP_FAILED){ + perror("FileBackedMMapBuffer::reserve remap"); + throw std::runtime_error("Reserve failed"); + } + + rawtotaldata = newptr; + + memoryCapacity = std::min(newcapacity, memoryLimit); + fileCapacity = newcapacity - memoryCapacity; + + #ifndef NDEBUG + assert(memoryCapacity % pagesize == 0); + assert(fileCapacity % pagesize == 0); + #endif + + if(fileCapacity > 0){ + //update file size + int ret = ftruncate(filehandle.getFd(), fileCapacity); + if(ret != 0){ + perror("FileBackedMMapBuffer::reserve ftruncate"); + throw std::runtime_error("Reserve failed"); + } + + //update file mappings to virtual adress range + //this overwrites the previous anonymous mapping. + newptr = mmap(((char*)rawtotaldata) + memoryCapacity, fileCapacity, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED | MAP_POPULATE | MAP_NONBLOCK, filehandle.getFd(), 0); + if(newptr == MAP_FAILED){ + perror("FileBackedMMapBuffer::reserve file mmap"); + throw std::runtime_error("Reserve failed"); + } + } + } + +public: + + FileBackedMMapBuffer() = delete; + + FileBackedMMapBuffer(std::size_t size_, std::size_t memoryLimit_, std::string filename_) + : memoryLimit(memoryLimit_), filehandle(filename_.c_str(), "a+"), filename(filename_){ + + const std::size_t pagesize = getpagesize(); + //round down memoryLimit to pagesize + memoryLimit = (memoryLimit / pagesize) * pagesize; + + std::size_t numPagesForSize = SDIV(size_, pagesize); + + reserve(numPagesForSize * pagesize); + + size = size_; + } + + FileBackedMMapBuffer(const FileBackedMMapBuffer&) = delete; + FileBackedMMapBuffer(FileBackedMMapBuffer&& rhs){ + rawtotaldata = std::exchange(rhs.rawtotaldata, nullptr); + size = std::exchange(rhs.size, 0); + capacity = std::exchange(rhs.capacity, 0); + memoryCapacity = std::exchange(rhs.memoryCapacity, 0); + fileCapacity = std::exchange(rhs.fileCapacity, 0); + memoryLimit = std::exchange(rhs.memoryLimit, 0); + filehandle = std::move(rhs.filehandle); + filename = std::exchange(rhs.filename, ""); + } + + ~FileBackedMMapBuffer(){ + int ret = unmapMemoryAndFile(); + if(ret != 0){ + perror("~FileBackedMMapBuffer unmapMemoryAndFile"); + }else{ + rawtotaldata = nullptr; + size = 0; + capacity = 0; + memoryCapacity = 0; + memoryLimit = 0; + } + + remove(filename.c_str()); + } + + friend void swap(FileBackedMMapBuffer& l, FileBackedMMapBuffer& r) noexcept{ + using std::swap; + + std::swap(l.rawtotaldata, r.rawtotaldata); + std::swap(l.size, r.size); + std::swap(l.capacity, r.capacity); + std::swap(l.memoryCapacity, r.memoryCapacity); + std::swap(l.fileCapacity, r.fileCapacity); + std::swap(l.memoryLimit, r.memoryLimit); + std::swap(l.filehandle, r.filehandle); + std::swap(l.filename, r.filename); + } + + void destroy(){ + int 
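[Editor's note] remap/remapWithFile above keep one contiguous virtual address range whose first memoryLimit bytes stay anonymous (RAM-backed) while the tail is replaced by a MAP_FIXED view of the backing file. A stripped-down sketch of that overlay idea, without the growth, mremap, or error-reporting logic of the real class:

#include <cstddef>
#include <sys/mman.h>
#include <unistd.h>

// Sketch: contiguous range, RAM-backed head + file-backed tail.
// memBytes and fileBytes are assumed to be page-aligned.
void* overlayMapping(std::size_t memBytes, std::size_t fileBytes, int fd) {
    const std::size_t total = memBytes + fileBytes;
    void* base = mmap(nullptr, total, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
    if (base == MAP_FAILED) return nullptr;
    if (fileBytes > 0) {
        if (ftruncate(fd, (off_t)fileBytes) != 0) {   // size the backing file
            munmap(base, total);
            return nullptr;
        }
        // Replace the tail of the anonymous mapping with the file mapping.
        void* tail = mmap((char*)base + memBytes, fileBytes, PROT_READ | PROT_WRITE,
                          MAP_SHARED | MAP_FIXED, fd, 0);
        if (tail == MAP_FAILED) {
            munmap(base, total);
            return nullptr;
        }
    }
    return base;
}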
ret = unmapMemoryAndFile(); + if(ret != 0){ + perror("FileBackedMMapBuffer::destroy unmapMemoryAndFile"); + }else{ + rawtotaldata = nullptr; + size = 0; + capacity = 0; + memoryCapacity = 0; + memoryLimit = 0; + } + } + + void reserve(std::size_t newcapacity){ + if(newcapacity > capacity){ + const std::size_t pagesize = getpagesize(); + newcapacity = SDIV(newcapacity, pagesize) * pagesize; + remapWithFile(newcapacity); + capacity = newcapacity; + } + } + + void resize(std::size_t newsize){ + if(newsize <= capacity){ + size = newsize; + }else{ + reserve(newsize); + size = newsize; + } + } + + void shrink_to_fit(){ + if(size == 0){ + int ret = unmapMemoryAndFile(); + if(ret != 0){ + perror("FileBackedMMapBuffer::shrink_to_fit unmapMemoryAndFile"); + throw std::runtime_error("shrink_to_fit failed"); + }else{ + rawtotaldata = nullptr; + size = 0; + capacity = 0; + memoryCapacity = 0; + memoryLimit = 0; + } + }else{ + if(size < capacity){ + const std::size_t pagesize = getpagesize(); + std::size_t newcapacity = SDIV(size, pagesize) * pagesize; + remapWithFile(newcapacity); + capacity = newcapacity; + } + } + } + + void clear(){ + size = 0; + } + + void* get() noexcept{ + return rawtotaldata; + } + + const void* get() const noexcept{ + return rawtotaldata; + } + + std::size_t getSize() const noexcept{ + return size; + } + + std::size_t getCapacity() const noexcept{ + return capacity; + } + + std::size_t getCapacityInMemory() const noexcept{ + return memoryCapacity; + } + + std::size_t getCapacityInFile() const noexcept{ + return fileCapacity; + } + + void printStatus(std::ostream& os) const{ + os << "size: " << getSize() << ", capacity: " << getCapacity(); + os << ", memoryCapacity: " << getCapacityInMemory() << ", fileCapacity: " << getCapacityInFile(); + os << ", memoryLimit: " << memoryLimit; + } +}; + + +template +class FileBackedUVector{ + static_assert(std::is_trivial::value, "FileBackedUVector: T must be trivial"); + static_assert(growth >= 100, "growth must be >= 100"); + +private: + FileBackedMMapBuffer buffer; + std::size_t size_ = 0; + std::size_t capacity_ = 0; + + void grow(std::size_t mincapacity){ + constexpr double growthFactor = double(growth) / 100.0; + + const size_t growthCapacity = std::max(capacity() + 1, size_t(capacity() * growthFactor)); + if(mincapacity <= growthCapacity){ + buffer.resize(sizeof(T) * growthCapacity); + capacity_ = growthCapacity; + }else{ + buffer.resize(sizeof(T) * mincapacity); + capacity_ = mincapacity; + } + + } +public: + using value_type = T; + using pointer = value_type*; + using const_pointer = const value_type*; + using iterator = value_type*; + using const_iterator = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + + + FileBackedUVector(std::size_t elements, std::size_t maxBytesInMemory, const std::string& backingfile) + : buffer(sizeof(T) * elements, maxBytesInMemory, backingfile){ + + size_ = elements; + capacity_ = elements; + } + + void push_back(T obj){ + if(size() >= capacity()){ + reserve(size() + 1); + } + + data()[size_++] = std::move(obj); + } + + template + iterator insert(const_iterator pos, InputIt first, InputIt last){ + const std::size_t insertsize = std::distance(first, last); + const std::size_t newsize = size() + insertsize; + const std::size_t where = std::distance(cbegin(), pos); + assert(where <= size()); + + if(newsize > capacity()){ + reserve(newsize); + } + + std::copy_backward(cbegin() + where, cend(), begin() + newsize); + std::copy(first, last, begin() + where); + + 
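+        //Illustrative walk-through (not from the original patch): inserting {x,y} at index 1 of
+        //{a,b,c} gives where=1, insertsize=2, newsize=5. copy_backward moves the old tail [1,3)
+        //to [3,5), then copy writes the new elements into [1,3), yielding {a,x,y,b,c}.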
size_ = newsize; + + return begin() + where; + } + + iterator insert(const_iterator pos, size_t count, const T& value){ + const std::size_t insertsize = count; + const std::size_t newsize = size() + insertsize; + const std::size_t where = std::distance(cbegin(), pos); + assert(where <= size()); + + if(newsize > capacity()){ + reserve(newsize); + } + + std::copy_backward(cbegin() + where, cend(), begin() + newsize); + std::fill(begin() + where, begin() + where + insertsize, value); + + size_ = newsize; + + return begin() + where; + } + + std::size_t size() const noexcept{ + return size_; + } + + std::size_t capacity() const noexcept{ + return capacity_; + } + + + pointer data() noexcept{ + return reinterpret_cast(buffer.get()); + } + + const_pointer data() const noexcept{ + return reinterpret_cast(buffer.get()); + } + + iterator begin() noexcept{ + return data(); + } + + const_iterator cbegin() const noexcept{ + return data(); + } + + iterator end() noexcept{ + return data() + size(); + } + + const_iterator cend() const noexcept{ + return data() + size(); + } + + bool empty() const noexcept{ + return size() == 0; + } + + reference operator[](size_t i){ + return data()[i]; + } + + const_reference operator[](size_t i) const{ + return data()[i]; + } + + reference front(){ + return data()[0]; + } + + const_reference front() const{ + return data()[0]; + } + + reference back(){ + return data()[size() - 1]; + } + + const_reference back() const{ + return data()[size() - 1]; + } + + std::size_t getCapacityInMemoryInBytes() const noexcept{ + return buffer.getCapacityInMemory(); + } + + std::size_t getCapacityInFileInBytes() const noexcept{ + return buffer.getCapacityInFile(); + } + + void clear(){ + buffer.clear(); + } + + void resize(std::size_t newsize){ + if(newsize > capacity()){ + grow(newsize); + } + size_ = newsize; + } + + void reserve(std::size_t newcapacity){ + if(newcapacity > capacity()){ + grow(newcapacity); + } + } +}; + +} //namespace cudasw4 + + + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/options.cpp b/lib/libmarv/src/options.cpp new file mode 100644 index 000000000..951049f4f --- /dev/null +++ b/lib/libmarv/src/options.cpp @@ -0,0 +1,268 @@ +#include "options.hpp" +#include "types.hpp" +#include "hpc_helpers/all_helpers.cuh" + +#include +#include + + + +void printOptions(const ProgramOptions& options){ + std::cout << "Selected options:\n"; + std::cout << "verbose: " << options.verbose << "\n"; + std::cout << "interactive: " << options.interactive << "\n"; + std::cout << "loadFullDBToGpu: " << options.loadFullDBToGpu << "\n"; + std::cout << "prefetchDBFile: " << options.prefetchDBFile << "\n"; + std::cout << "numTopOutputs: " << options.numTopOutputs << "\n"; + std::cout << "gop: " << options.gop << "\n"; + std::cout << "gex: " << options.gex << "\n"; + std::cout << "maxBatchBytes: " << options.maxBatchBytes << "\n"; + std::cout << "maxBatchSequences: " << options.maxBatchSequences << "\n"; + std::cout << "maxTempBytes: " << options.maxTempBytes << "\n"; + for(size_t i = 0; i < options.queryFiles.size(); i++){ + std::cout << "queryFile " << i << " : " << options.queryFiles[i] << "\n"; + } + #ifdef CAN_USE_FULL_BLOSUM + std::cout << "blosum: " << to_string(options.blosumType) << "\n"; + #else + std::cout << "blosum: " << to_string_nodim(options.blosumType) << "\n"; + #endif + if(options.usePseudoDB){ + std::cout << "Using built-in pseudo db with " << options.pseudoDBSize + << " sequences of length " << options.pseudoDBLength << ". 
"; + if(options.pseudoDBSameSequence){ + std::cout << "All sequences are identical\n"; + }else{ + std::cout << "All sequences are different\n"; + } + }else{ + std::cout << "Using db file: " << options.dbPrefix << "\n"; + } + std::cout << "memory limit per gpu: " << (options.maxGpuMem == std::numeric_limits::max() ? + "unlimited" : std::to_string(options.maxGpuMem)) << "\n"; + + std::cout << "Output mode: " << options.outputModeString() << "\n"; + std::cout << "Output file: " << options.outputfile << "\n"; + std::cout << "Scan type: " << to_string(options.scanType) << "\n"; + std::cout << "File with subject Ids: " << (options.subjectIdsFilename.has_value() ? options.subjectIdsFilename.value() : " unspecified") << "\n"; + std::cout << "kernelConfigsFile_gapless: " << options.kernelConfigsFile_gapless.value_or("unspecified") << "\n"; + std::cout << "kernelConfigsFile_sw: " << options.kernelConfigsFile_sw.value_or("unspecified") << "\n"; +} + +bool parseArgs(int argc, char** argv, ProgramOptions& options){ + + auto parseMemoryString = [](const std::string& string){ + std::size_t result = 0; + if(string.length() > 0){ + std::size_t factor = 1; + bool foundSuffix = false; + switch(string.back()){ + case 'K':{ + factor = std::size_t(1) << 10; + foundSuffix = true; + }break; + case 'M':{ + factor = std::size_t(1) << 20; + foundSuffix = true; + }break; + case 'G':{ + factor = std::size_t(1) << 30; + foundSuffix = true; + }break; + } + if(foundSuffix){ + const auto numberString = string.substr(0, string.size()-1); + result = factor * std::stoull(numberString); + }else{ + result = std::stoull(string); + } + }else{ + result = 0; + } + return result; + }; + + auto stringToScanType = [&](const std::string& string){ + if(string == "Gapless") return cudasw4::ScanType::Gapless; + if(string == "SW_Endpos") return cudasw4::ScanType::SW_Endpos; + if(string == "Gapless+SW_Endpos") return cudasw4::ScanType::GaplessPlusSW_Endpos; + std::cout << "Unknown scan type " << string << ". 
Using Gapless.\n"; + return cudasw4::ScanType::Gapless; + }; + + bool gotQuery = false; + bool gotDB = false; + bool gotGex = false; + bool gotGop = false; + + options.queryFiles.clear(); + + for(int i = 1; i < argc; i++){ + const std::string arg = argv[i]; + if(arg == "--help"){ + options.help = true; + }else if(arg == "--uploadFull"){ + options.loadFullDBToGpu = true; + }else if(arg == "--verbose"){ + options.verbose = true; + }else if(arg == "--interactive"){ + options.interactive = true; + }else if(arg == "--printLengthPartitions"){ + options.printLengthPartitions = true; + }else if(arg == "--prefetchDBFile"){ + options.prefetchDBFile = true; + }else if(arg == "--top"){ + options.numTopOutputs = std::atoi(argv[++i]); + }else if(arg == "--gop"){ + options.gop = std::atoi(argv[++i]); + gotGop = true; + }else if(arg == "--gex"){ + options.gex = std::atoi(argv[++i]); + gotGex = true; + }else if(arg == "--maxBatchBytes"){ + options.maxBatchBytes = parseMemoryString(argv[++i]); + }else if(arg == "--maxBatchSequences"){ + options.maxBatchSequences = std::atoi(argv[++i]); + }else if(arg == "--maxTempBytes"){ + options.maxTempBytes = parseMemoryString(argv[++i]); + }else if(arg == "--maxGpuMem"){ + options.maxGpuMem = parseMemoryString(argv[++i]); + }else if(arg == "--query"){ + options.queryFiles.push_back(argv[++i]); + gotQuery = true; + }else if(arg == "--db"){ + options.dbPrefix = argv[++i]; + gotDB = true; + }else if(arg == "--mat"){ + const std::string val = argv[++i]; + #ifdef CAN_USE_FULL_BLOSUM + if(val == "blosum45") options.blosumType = cudasw4::BlosumType::BLOSUM45; + if(val == "blosum50") options.blosumType = cudasw4::BlosumType::BLOSUM50; + if(val == "blosum62") options.blosumType = cudasw4::BlosumType::BLOSUM62; + if(val == "blosum80") options.blosumType = cudasw4::BlosumType::BLOSUM80; + if(val == "blosum45_20") options.blosumType = cudasw4::BlosumType::BLOSUM45_20; + if(val == "blosum50_20") options.blosumType = cudasw4::BlosumType::BLOSUM50_20; + if(val == "blosum62_20") options.blosumType = cudasw4::BlosumType::BLOSUM62_20; + if(val == "blosum80_20") options.blosumType = cudasw4::BlosumType::BLOSUM80_20; + #else + if(val == "blosum45") options.blosumType = cudasw4::BlosumType::BLOSUM45_20; + if(val == "blosum50") options.blosumType = cudasw4::BlosumType::BLOSUM50_20; + if(val == "blosum62") options.blosumType = cudasw4::BlosumType::BLOSUM62_20; + if(val == "blosum80") options.blosumType = cudasw4::BlosumType::BLOSUM80_20; + if(val == "blosum45_20") options.blosumType = cudasw4::BlosumType::BLOSUM45_20; + if(val == "blosum50_20") options.blosumType = cudasw4::BlosumType::BLOSUM50_20; + if(val == "blosum62_20") options.blosumType = cudasw4::BlosumType::BLOSUM62_20; + if(val == "blosum80_20") options.blosumType = cudasw4::BlosumType::BLOSUM80_20; + #endif + }else if(arg == "--pseudodb"){ + options.usePseudoDB = true; + options.pseudoDBSize = std::atoi(argv[++i]); + options.pseudoDBLength = std::atoi(argv[++i]); + int val = std::atoi(argv[++i]); + options.pseudoDBSameSequence = val != 0; + gotDB = true; + }else if(arg == "--tsv"){ + options.outputMode = ProgramOptions::OutputMode::TSV; + }else if(arg == "--of"){ + options.outputfile = argv[++i]; + }else if(arg == "--scanType"){ + options.scanType = stringToScanType(argv[++i]); + }else if(arg == "--subjectIdsFile"){ + options.subjectIdsFilename = argv[++i]; + }else if(arg == "--kernelconfigsGapless"){ + options.kernelConfigsFile_gapless = argv[++i]; + }else if(arg == "--kernelconfigsSW"){ + options.kernelConfigsFile_sw = 
argv[++i]; + }else{ + std::cout << "Unexpected arg " << arg << "\n"; + } + } + + //set specific gop gex for blosum if no gop gex was set + if(options.blosumType == cudasw4::BlosumType::BLOSUM45 || options.blosumType == cudasw4::BlosumType::BLOSUM45_20){ + if(!gotGop) options.gop = -13; + if(!gotGex) options.gex = -2; + } + if(options.blosumType == cudasw4::BlosumType::BLOSUM50 || options.blosumType == cudasw4::BlosumType::BLOSUM50_20){ + if(!gotGop) options.gop = -13; + if(!gotGex) options.gex = -2; + } + if(options.blosumType == cudasw4::BlosumType::BLOSUM62 || options.blosumType == cudasw4::BlosumType::BLOSUM62_20){ + if(!gotGop) options.gop = -11; + if(!gotGex) options.gex = -1; + } + if(options.blosumType == cudasw4::BlosumType::BLOSUM80 || options.blosumType == cudasw4::BlosumType::BLOSUM80_20){ + if(!gotGop) options.gop = -10; + if(!gotGex) options.gex = -1; + } + + if(!gotQuery){ + std::cout << "Query is missing\n"; + return false; + } + if(!gotDB){ + std::cout << "DB prefix is missing\n"; + return false; + } + + return true; +} + +void printHelp(int /*argc*/, char** argv){ + ProgramOptions defaultoptions; + + std::cout << "Usage: " << argv[0] << " [options]\n"; + std::cout << "The GPUs to use are set via CUDA_VISIBLE_DEVICES environment variable.\n"; + std::cout << "Options: \n"; + + std::cout << " Mandatory\n"; + std::cout << " --query queryfile : Mandatory. Fasta or Fastq. Can be gzip'ed. Repeat this option for multiple query files\n"; + std::cout << " --db dbPrefix : Mandatory. The DB to query against. The same dbPrefix as used for makedb\n"; + std::cout << "\n"; + + std::cout << " Scoring\n"; + std::cout << " --top val : Output the val best scores. Default val = " << defaultoptions.numTopOutputs << "\n"; + std::cout << " --gop val : Gap open score. Overwrites our blosum-dependent default score.\n"; + std::cout << " --gex val : Gap extend score. Overwrites our blosum-dependent default score.\n"; + #ifdef CAN_USE_FULL_BLOSUM + std::cout << " --mat val: Set substitution matrix. Supported values: blosum45, blosum50, blosum62, blosum80, blosum45_20, blosum50_20, blosum62_20, blosum80_20. " + "Default: " << "blosum62_20" << "\n"; + #else + std::cout << " --mat val: Set substitution matrix. Supported values: blosum45, blosum50, blosum62, blosum80. " + "Default: " << "blosum62" << "\n"; + #endif + std::cout << " --scanType val : Set scan type. Supported values = {Gapless, SW_Endpos, Gapless+SW_Endpos}.\n"; + std::cout << " Gapless: Scan whole DB with gapless alignment. \n"; + std::cout << " SW_Endpos: Scan whole DB with Smith Waterman Alignment, output score and end position.\n"; + std::cout << " Gapless+SW_Endpos: Scan whole DB with gapless alignment, then re-scan top results with Smith Waterman. Default val = " << to_string(defaultoptions.scanType) << "\n"; + std::cout << " --subjectIdsFile val : Only consider database sequences with index specified in file. Must be a text file, one index per line.\n"; + std::cout << " Do not use together with scanType Gapless+SW_Endpos. When --subjectIdsFile is set, option --top is ignored.\n"; + std::cout << "\n"; + + std::cout << " Memory\n"; + std::cout << " --maxGpuMem val : Try not to use more than val bytes of gpu memory per gpu. Uses all available gpu memory by default\n"; + std::cout << " --maxTempBytes val : Size of temp storage in GPU memory. Can use suffix K,M,G. Default val = " << defaultoptions.maxTempBytes << "\n"; + std::cout << " --maxBatchBytes val : Process DB in batches of at most val bytes. Can use suffix K,M,G. 
Default val = " << defaultoptions.maxBatchBytes << "\n"; + std::cout << " --maxBatchSequences val : Process DB in batches of at most val sequences. Default val = " << defaultoptions.maxBatchSequences << "\n"; + std::cout << "\n"; + + std::cout << " Misc\n"; + std::cout << " --of val: Result output file. Parent directory must exist. Default: console output (/dev/stdout)\n"; + std::cout << " --tsv : Print results as tab-separated values instead of plain text. \n"; + std::cout << " --verbose : More console output. Shows timings. \n"; + std::cout << " --printLengthPartitions : Print number of sequences per length partition in db.\n"; + std::cout << " --interactive : Loads DB, then waits for sequence input by user\n"; + std::cout << " --help : Print this message\n"; + std::cout << "\n"; + + std::cout << " Performance and benchmarking\n"; + std::cout << " --prefetchDBFile : Load DB into RAM immediately at program start instead of waiting for the first access.\n"; + std::cout << " --uploadFull : If enough GPU memory is available to store full db, copy full DB to GPU before processing queries.\n"; + std::cout << " --pseudodb num length sameSeq: Use a generated DB which contains `num` equal sequences of length `length`." + "sameSeq can be 0 or 1. If `sameSeq`!=0, all sequences in DB will be identical\n"; + std::cout << " --kernelconfigsGapless filename\n"; + std::cout << " --kernelconfigsSW filename\n"; + std::cout << "\n"; + + +} \ No newline at end of file diff --git a/lib/libmarv/src/options.hpp b/lib/libmarv/src/options.hpp new file mode 100644 index 000000000..72a8e333f --- /dev/null +++ b/lib/libmarv/src/options.hpp @@ -0,0 +1,62 @@ +#ifndef OPTIONS_HPP +#define OPTIONS_HPP + +#include "types.hpp" +#include +#include +#include + +struct ProgramOptions{ + enum class OutputMode{ + Plain, + TSV + }; + + bool help = false; + bool loadFullDBToGpu = false; + bool usePseudoDB = false; + bool printLengthPartitions = false; + bool interactive = false; + bool verbose = false; + bool prefetchDBFile = false; + bool pseudoDBSameSequence = true; + int numTopOutputs = 10; + int gop = -11; + int gex = -1; + int pseudoDBLength = 0; + int pseudoDBSize = 0; + cudasw4::BlosumType blosumType = cudasw4::BlosumType::BLOSUM62_20; + OutputMode outputMode = OutputMode::Plain; + + cudasw4::ScanType scanType = cudasw4::ScanType::Gapless; + + size_t maxBatchBytes = 128ull * 1024ull * 1024ull; + size_t maxBatchSequences = 10'000'000; + size_t maxTempBytes = 4ull * 1024ull * 1024ull * 1024ull; + + size_t maxGpuMem = std::numeric_limits::max(); + + std::optional subjectIdsFilename; + std::string outputfile = "/dev/stdout"; + std::string dbPrefix; + std::vector queryFiles; + + std::optional kernelConfigsFile_gapless; + std::optional kernelConfigsFile_sw; + + std::string outputModeString() const{ + switch(outputMode){ + case OutputMode::Plain: return "Plain"; + case OutputMode::TSV: return "TSV"; + default: return "Unnamed output mode"; + } + } +}; + +void printOptions(const ProgramOptions& options); + +bool parseArgs(int argc, char** argv, ProgramOptions& options); + +void printHelp(int argc, char** argv); + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/pssm.cuh b/lib/libmarv/src/pssm.cuh new file mode 100644 index 000000000..36e5dcb18 --- /dev/null +++ b/lib/libmarv/src/pssm.cuh @@ -0,0 +1,414 @@ +#ifndef PSSM_CUH +#define PSSM_CUH + +#include "config.hpp" +#include "types.hpp" +#include "convert.cuh" +#include "hpc_helpers/all_helpers.cuh" +#include "hpc_helpers/simple_allocation.cuh" + +#include 
+#include + +namespace cudasw4{ + +template +struct PSSM_2D_View{ + int numRows; + int numColumns; + int stride; + const T* data; + + __host__ __device__ + const T* operator[](int encodedSubjectLetter) const{ + return data + encodedSubjectLetter * stride; + } +}; + +template +struct PSSM_2D_ModifiableView{ + int numRows; + int numColumns; + int stride; + T* data; + + __host__ __device__ + const T* operator[](int encodedSubjectLetter) const{ + return data + encodedSubjectLetter * stride; + } + + __host__ __device__ + T* operator[](int encodedSubjectLetter){ + return data + encodedSubjectLetter * stride; + } +}; + +struct PSSM{ + int alphabetSize; + SequenceLengthT queryLength; + std::vector data; + + PSSM(int queryLength_, int alphabetSize_) : + alphabetSize(alphabetSize_), + queryLength(queryLength_), + data(alphabetSize * queryLength){ + + } + + int* operator[](int encodedSubjectLetter){ + return data.data() + encodedSubjectLetter * queryLength; + } + const int* operator[](int encodedSubjectLetter) const{ + return data.data() + encodedSubjectLetter * queryLength; + } + + PSSM_2D_View makeView(int startQueryPos = 0) const{ + assert(startQueryPos < queryLength); + PSSM_2D_View result; + result.numRows = alphabetSize; + result.numColumns = queryLength - startQueryPos; + result.stride = queryLength; + result.data = data.data() + startQueryPos; + return result; + } + + //Generator::operator()(int queryPosition, int alphabetIndex) should return the pssm score + //for specific query position and alphabet letter + template + static PSSM fromGenerator(int alphabetSize, int queryLength, Generator generator){ + PSSM pssm(queryLength, alphabetSize); + + for (int subjectLetter = 0; subjectLetter < alphabetSize; subjectLetter++) { + for (int col = 0; col < queryLength; col++){ + pssm[subjectLetter][col] = generator(col, subjectLetter); + } + } + + return pssm; + } + + static PSSM fromPSSM(const char* /*encodedQuery*/, int queryLength, const int8_t * pssm, int alphabetSize){ + PSSM retPssm(queryLength, alphabetSize); + for (int subjectLetter = 0; subjectLetter < alphabetSize; subjectLetter++) { + for (int col = 0; col < queryLength; col++){ + retPssm[subjectLetter][col] = static_cast(pssm[subjectLetter * queryLength + col]); + } + } + return retPssm; + } + + // query must have been encoded with ConvertAA_20 + static PSSM fromBlosum(BlosumType blosumType, const char* encodedQuery, int queryLength){ + auto make = [](const auto& blosum2D, const char* encodedQuery, int queryLength){ + const int alphabetSize = blosum2D.size(); + + + // auto generator = [&](int queryPosition, int alphabetIndex){ + // const int queryLetter = encodedQuery[queryPosition]; + // return blosum2D[queryLetter][alphabetIndex]; + // }; + + auto generator_mmseqs_conversion = [&](int queryPosition, int alphabetIndex){ + //blosum layout is for ncbi encoded letters, but input is mmseqs encoded. 
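+                //In other words (illustrative): score(queryPosition, alphabetIndex) =
+                //  blosum2D[ncbi(query[queryPosition])][ncbi(alphabetIndex)],
+                //where ncbi() is ConvertAA_20_mmseqs_to_ncbi. Only the blosum lookup is
+                //translated; the resulting PSSM stays indexed in mmseqs encoding.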
+ //convert both queryletter and alphabetIndex to ncbi format + const int queryLetter_ncbi = ConvertAA_20_mmseqs_to_ncbi{}(encodedQuery[queryPosition]); + const int alphabetIndex_ncbi = ConvertAA_20_mmseqs_to_ncbi{}(alphabetIndex); + return blosum2D[queryLetter_ncbi][alphabetIndex_ncbi]; + }; + return fromGenerator(alphabetSize, queryLength, generator_mmseqs_conversion); + }; + + switch(blosumType){ + case BlosumType::BLOSUM45_20: { + BLOSUM45_20 blosum; + return make(blosum.get2D(), encodedQuery, queryLength); + } + case BlosumType::BLOSUM50_20: { + BLOSUM50_20 blosum; + return make(blosum.get2D(), encodedQuery, queryLength); + } + case BlosumType::BLOSUM62_20: { + BLOSUM62_20 blosum; + return make(blosum.get2D(), encodedQuery, queryLength); + } + case BlosumType::BLOSUM80_20: { + BLOSUM80_20 blosum; + return make(blosum.get2D(), encodedQuery, queryLength); + } + default: + throw std::runtime_error("PSSM::fromBlosum invalid blosum type"); + } + + } + +}; + + +struct GpuPSSM{ + int alphabetSize; + SequenceLengthT queryLength; + helpers::SimpleAllocationDevice data; + + GpuPSSM() = default; + + GpuPSSM(int queryLength_, int alphabetSize_) : + alphabetSize(alphabetSize_), + queryLength(queryLength_), + data(alphabetSize * queryLength){ + + } + + GpuPSSM(const PSSM& rhs, cudaStream_t stream){ + upload(rhs, stream); + } + + void resize(int queryLength_, int alphabetSize_){ + alphabetSize = alphabetSize_; + queryLength = queryLength_; + data.resize(alphabetSize * queryLength); + } + + void upload(const PSSM& rhs, cudaStream_t stream){ + alphabetSize = rhs.alphabetSize; + queryLength = rhs.queryLength; + data.resize(rhs.data.size()); + cudaMemcpyAsync( + data.data(), + rhs.data.data(), + sizeof(int) * rhs.data.size(), + cudaMemcpyHostToDevice, + stream + ); CUERR + } + + PSSM_2D_View makeView(int startQueryPos = 0) const{ + assert(startQueryPos < queryLength); + PSSM_2D_View result; + result.numRows = alphabetSize; + result.numColumns = queryLength - startQueryPos; + result.stride = queryLength; + result.data = data.data() + startQueryPos; + return result; + } +}; + + +//PSSM alignment kernel will use vectorized loads to load multiple half elements +// accessSizeBytes specifies the vector size in bytes, e.g. 
16 for float4 +template +__global__ +void permute_PSSM_for_gapless_kernel( + PSSM_2D_ModifiableView resultView, + PSSM_2D_View inputView, + const int numRegs, + const int group_size +) { + static_assert(accessSizeBytes == 4 || accessSizeBytes == 8 || accessSizeBytes == 16); //float, float2, float4 + constexpr int numFloatsPerAccess = accessSizeBytes / 4; + constexpr int numHalfsPerAccess = numFloatsPerAccess * 2; + + const int thid = threadIdx.x + blockIdx.x*blockDim.x; + + const int numColumns = inputView.numColumns; + const int numRows = inputView.numRows; + + const int tileSize = (numRegs * group_size * 2); + + for(int inputRow = blockIdx.y; inputRow < numRows; inputRow += gridDim.y){ + + for (int inputCol = thid; inputCol < numColumns; inputCol += blockDim.x*gridDim.x) { + const int tileId = inputCol / tileSize; + const int tileColumnOffset = tileId * tileSize; + const int columnInTile = inputCol - tileColumnOffset; + + const int l_2 = tileSize/2; + const int offset = (2*(columnInTile%l_2) + columnInTile/l_2)%numHalfsPerAccess; + const int thread = (columnInTile%l_2)/numRegs; + const int part = (columnInTile%numRegs)/numFloatsPerAccess; + const int resultCol = numHalfsPerAccess*thread + offset + part*numHalfsPerAccess*group_size; + + resultView[inputRow][tileColumnOffset + resultCol] = inputView[inputRow][inputCol]; + } + } +} + +template +__global__ +void permute_PSSM_for_SW_kernel( + PSSM_2D_ModifiableView resultView, + PSSM_2D_View inputView, + int elementsPerThread, + int groupsize +) { + static_assert(accessSizeBytes == 4 || accessSizeBytes == 8 || accessSizeBytes == 16); //float, float2, float4 + static_assert(accessSizeBytes % sizeof(OutputT) == 0); + constexpr int numElementsPerAccess = accessSizeBytes / sizeof(OutputT); + assert(elementsPerThread % numElementsPerAccess == 0); + + const int tileSize = (elementsPerThread * groupsize); + const int numAccesses = elementsPerThread / numElementsPerAccess; + + const int numColumns = inputView.numColumns; + const int numRows = inputView.numRows; + + for(int inputRow = blockIdx.y; inputRow < numRows; inputRow += gridDim.y){ + + for (int inputCol = threadIdx.x + blockIdx.x * blockDim.x; inputCol < numColumns; inputCol += blockDim.x * gridDim.x) { + const int tileId = inputCol / tileSize; + const int tileColumnOffset = tileId * tileSize; + const int columnInTile = inputCol - tileColumnOffset; + + const int accessChunk = columnInTile / numElementsPerAccess; + const int elementIdInAccessChunk = columnInTile % numElementsPerAccess; + const int accessChunkIdInThread = accessChunk % numAccesses; + const int threadId = accessChunk / numAccesses; + + const int outputAccessChunk = accessChunkIdInThread * groupsize + threadId; + const int outputCol = outputAccessChunk * numElementsPerAccess + elementIdInAccessChunk; + resultView[inputRow][tileColumnOffset + outputCol] = inputView[inputRow][inputCol]; + } + } +} + +struct GpuPermutedPSSMforGapless{ + int alphabetSize; + int numRegs; + int group_size; + int columnstride; + SequenceLengthT queryLength; + helpers::SimpleAllocationDevice data; + + template + void resize(int group_size_, int numRegs_, int alphabetSize_, int queryLength_, cudaStream_t stream){ + assert(512 % sizeof(T) == 0); + + if((group_size_ * numRegs_) % 2 == 1){ + throw std::runtime_error("GpuPermutedPSSMforGapless resize error. 
elements per row must be even"); + } + group_size = group_size_; + numRegs = numRegs_; + alphabetSize = alphabetSize_; + queryLength = queryLength_; + + const int tileSize = (numRegs * group_size * 2); + const int numTiles = SDIV(queryLength, tileSize); + columnstride = numTiles * tileSize; + //numPaddedColumns = (SDIV(groupsize * numItems * sizeof(PssmScoreType), 512) * 512) / sizeof(PssmScoreType); + + data.resize(sizeof(T) * alphabetSize * tileSize * numTiles); + //init with 0 so oob elements won't contribute to the score + cudaMemsetAsync(data.data(), 0, sizeof(char) * data.size(), stream); + } + + + PSSM_2D_View makeHalf2View() const{ + assert(columnstride % 2 == 0); + + PSSM_2D_View view; + view.numRows = alphabetSize; + view.numColumns = columnstride / 2; + view.stride = columnstride / 2; + view.data = reinterpret_cast(data.data()); + + return view; + } + + PSSM_2D_View makeShort2View() const{ + assert(columnstride % 2 == 0); + + PSSM_2D_View view; + view.numRows = alphabetSize; + view.numColumns = columnstride / 2; + view.stride = columnstride / 2; + view.data = reinterpret_cast(data.data()); + + return view; + } + + template + void fromGpuPSSMView(PSSM_2D_View inputView, int group_size_, int numRegs_, cudaStream_t stream){ + resize(group_size_, numRegs_, inputView.numRows, inputView.numColumns, stream); + + PSSM_2D_ModifiableView resultView; + resultView.numRows = alphabetSize; + resultView.numColumns = queryLength; + resultView.stride = columnstride; + resultView.data = reinterpret_cast(data.data()); + + dim3 block(128,1,1); + dim3 grid(SDIV(inputView.numColumns, block.x), inputView.numRows, 1); + + permute_PSSM_for_gapless_kernel<<>>( + resultView, + inputView, + numRegs, + group_size + ); CUERR; + } +}; + +struct GpuPermutedPSSMforSW{ + int alphabetSize; + int numRegs; + int group_size; + int columnstride; + SequenceLengthT queryLength; + helpers::SimpleAllocationDevice data; + + template + void resize(int group_size_, int numRegs_, int alphabetSize_, int queryLength_, cudaStream_t stream){ + assert(512 % sizeof(T) == 0); + + group_size = group_size_; + numRegs = numRegs_; + alphabetSize = alphabetSize_; + queryLength = queryLength_; + + const int tileSize = (numRegs * group_size); + const int numTiles = SDIV(queryLength, tileSize); + columnstride = numTiles * tileSize; + //numPaddedColumns = (SDIV(groupsize * numItems * sizeof(PssmScoreType), 512) * 512) / sizeof(PssmScoreType); + + data.resize(sizeof(T) * alphabetSize * tileSize * numTiles); + //init with 0 so oob elements won't contribute to the score + cudaMemsetAsync(data.data(), 0, sizeof(char) * data.size(), stream); + } + + template + PSSM_2D_View makeView() const{ + PSSM_2D_View view; + view.numRows = alphabetSize; + view.numColumns = columnstride; + view.stride = columnstride; + view.data = reinterpret_cast(data.data()); + + return view; + } + + template + void fromGpuPSSMView(PSSM_2D_View inputView, int group_size_, int numRegs_, cudaStream_t stream){ + resize(group_size_, numRegs_, inputView.numRows, inputView.numColumns, stream); + + PSSM_2D_ModifiableView resultView; + resultView.numRows = alphabetSize; + resultView.numColumns = queryLength; + resultView.stride = columnstride; + resultView.data = reinterpret_cast(data.data()); + + dim3 block(128,1,1); + dim3 grid(SDIV(inputView.numColumns, block.x), inputView.numRows, 1); + + permute_PSSM_for_SW_kernel<<>>( + resultView, + inputView, + numRegs, + group_size + ); CUERR; + } +}; + + +} + + +#endif diff --git a/lib/libmarv/src/pssmkernels_gapless.cuh 
b/lib/libmarv/src/pssmkernels_gapless.cuh new file mode 100644 index 000000000..7e178a791 --- /dev/null +++ b/lib/libmarv/src/pssmkernels_gapless.cuh @@ -0,0 +1,2229 @@ +#ifndef PSSM_KERNELS_CUH +#define PSSM_KERNELS_CUH + +#include + +#include + +#include "pssm.cuh" +#include "convert.cuh" +#include "mathops.cuh" +#include "util.cuh" + +#include +#include +namespace cg = cooperative_groups; + + +#define USE_IMPROVED_SMEM + + +#define PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X \ + X(4,4) X(4,8) X(4,12) X(4,16) X(4,20) X(4,24) X(4,28) X(4,32) \ + X(4,36) X(4,40) X(4,44) X(4,48) X(4,52) X(4,56) X(4,60) X(4,64) \ + X(8,4) X(8,8) X(8,12) X(8,16) X(8,20) X(8,24) X(8,28) X(8,32) \ + X(8,36) X(8,40) X(8,44) X(8,48) X(8,52) X(8,56) X(8,60) X(8,64) \ + X(16,4) X(16,8) X(16,12) X(16,16) X(16,20) X(16,24) X(16,28) X(16,32) \ + X(16,36) X(16,40) X(16,44) X(16,48) X(16,52) X(16,56) X(16,60) X(16,64) + +#define PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X \ + X(8,4) X(8,8) X(8,12) X(8,16) X(8,20) X(8,24) X(8,28) X(8,32) \ + X(8,36) X(8,40) X(8,44) X(8,48) X(8,52) X(8,56) X(8,60) X(8,64) \ + X(16,4) X(16,8) X(16,12) X(16,16) X(16,20) X(16,24) X(16,28) X(16,32) \ + X(16,36) X(16,40) X(16,44) X(16,48) X(16,52) X(16,56) X(16,60) X(16,64) + + +namespace cudasw4{ + + + + +namespace hardcodedzero{ + + template struct ScalarScoreType{}; + template<> struct ScalarScoreType{ using type = half; }; + template<> struct ScalarScoreType{ using type = short; }; + template<> struct ScalarScoreType{ using type = int; }; + template<> struct ScalarScoreType{ using type = float; }; + + template + struct GaplessPSSMState{ + using Scalar = typename ScalarScoreType::type; + using MathOps = MathOps; + + ScoreType penalty_here_array[numRegs]; + ScoreType maximum{}; //0 + ScoreType penalty_diag{}; //0 + SharedPSSM& shared_strided_PSSM; + Group& group; + + __device__ + GaplessPSSMState(SharedPSSM& s, Group& g) : shared_strided_PSSM(s), group(g) {} + + __device__ + void resetScores(){ + #pragma unroll + for(int i = 0; i < numRegs; i++){ + penalty_here_array[i] = ScoreType{}; + } + + penalty_diag = ScoreType{}; + } + + __device__ + void resetMaximum(){ + maximum = ScoreType{}; + } + + __device__ + void relax(int subject_letter){ + SmemIndexCalculator smemIndexCalculator; + + ScoreType score2; + ScoreType penalty_temp0; + ScoreType penalty_temp1; + + const auto* row = &shared_strided_PSSM.data[subject_letter][0]; + + float4 foo = *((float4*)&row[smemIndexCalculator.getIndex(0)]); + memcpy(&score2, &foo.x, sizeof(ScoreType)); + penalty_temp0 = penalty_here_array[0]; + penalty_here_array[0] = MathOps::add_relu(penalty_diag, score2); + + memcpy(&score2, &foo.y, sizeof(ScoreType)); + penalty_temp1 = penalty_here_array[1]; + penalty_here_array[1] = MathOps::add_relu(penalty_temp0, score2); + maximum = MathOps::max3(maximum, penalty_here_array[1], penalty_here_array[0]); + + memcpy(&score2, &foo.z, sizeof(ScoreType)); + penalty_temp0 = penalty_here_array[2]; + penalty_here_array[2] = MathOps::add_relu(penalty_temp1, score2); + + memcpy(&score2, &foo.w, sizeof(ScoreType)); + penalty_temp1 = penalty_here_array[3]; + penalty_here_array[3] = MathOps::add_relu(penalty_temp0, score2); + maximum = MathOps::max3(maximum, penalty_here_array[3], penalty_here_array[2]); + + + #pragma unroll + for (int i=1; i + __global__ + __launch_bounds__(512,1) + void GaplessFilter_strided_PSSM_singletile_kernel( + __grid_constant__ const char * const devChars, + __grid_constant__ ScoreOutputIterator const devAlignmentScores, + __grid_constant__ const 
size_t* const devOffsets, + __grid_constant__ const SequenceLengthT* const devLengths, + __grid_constant__ PositionsIterator const d_positions_of_selected_lengths, + __grid_constant__ const int numSelected, + __grid_constant__ const SequenceLengthT queryLength, + __grid_constant__ const PSSM_2D_View strided_PSSM + ) { + if constexpr (std::is_same_v) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 900 + return; + #endif + } + static_assert(numRegs % 4 == 0); + static_assert(blocksize % group_size == 0); + __builtin_assume(blockDim.x == blocksize); + __builtin_assume(blockDim.x % group_size == 0); + + constexpr int numRowsPSSM = 21; + #ifdef USE_IMPROVED_SMEM + constexpr int numColumnsPSSM = std::max(group_size,8) * numRegs; + #else + constexpr int numColumnsPSSM = group_size * numRegs; + #endif + + using SharedPSSM = SharedPSSM_singletile; + using MathOps = MathOps; + + extern __shared__ char externalSmem[]; + + SharedPSSM& shared_strided_PSSM = *((SharedPSSM*)externalSmem); + + + auto group = cg::tiled_partition(cg::this_thread_block()); + const int idOfGroupInGrid = (threadIdx.x + blockIdx.x * blockDim.x) / group_size; + //const int numGroupsInGrid = (blockDim.x * gridDim.x) / group_size; + + #ifdef USE_IMPROVED_SMEM + using SmemIndexCalculator = typename std::conditional< + group_size == 4, + SmemIndexCalculator, + SmemIndexCalculator + >::type; + #else + using SmemIndexCalculator = SmemIndexCalculator; + #endif + GaplessPSSMState state(shared_strided_PSSM, group); + + auto load_PSSM_single = [&]() { + for (int i=threadIdx.x; i<21*group_size*numRegs; i+=blockDim.x) { + const int letter = i/(group_size*numRegs); + const int col = i%(group_size*numRegs); + shared_strided_PSSM.data[letter][col] = strided_PSSM[letter][col]; + } + __syncthreads(); + }; + + auto load_PSSM_double = [&]() { + for (int i=threadIdx.x; i<21*group_size*numRegs; i+=blockDim.x) { + const int letter = i/(group_size*numRegs); + const int col = i%(group_size*numRegs); + auto value = strided_PSSM[letter][col]; + + const int float4Index = col / 4; + const int offsetWithinFloat4 = col % 4; + + const int ithChunkOfFour = float4Index / group_size; + const int float4PositionInChunkOfFour = float4Index % group_size; + + const int outputFloat4Index0 = (ithChunkOfFour*2*group_size + 0*group_size) + float4PositionInChunkOfFour; + const int outputFloat4Index1 = (ithChunkOfFour*2*group_size + 1*group_size) + float4PositionInChunkOfFour; + + shared_strided_PSSM.data[letter][4*outputFloat4Index0 + offsetWithinFloat4] = value; + shared_strided_PSSM.data[letter][4*outputFloat4Index1 + offsetWithinFloat4] = value; + } + __syncthreads(); + + // for (int i=threadIdx.x; i<21*group_size*numRegs*2; i+=blockDim.x) { + // const int letter = i/(group_size*numRegs*2); + // const int outputCol = i%(group_size*numRegs*2); + + // const int outputFloat4Index = outputCol / 4; + // const int offsetWithinFloat4 = outputCol % 4; + // const int inputIthChunkOfFour = outputFloat4Index / (2*group_size); + // const int remainder1 = outputFloat4Index % (2*group_size); // 0*group_size + float4PositionInChunkOfFour or 1*group_size + float4PositionInChunkOfFour + // const int float4PositionInChunkOfFour = remainder1 % group_size; + // const int float4Index = 4*inputIthChunkOfFour+float4PositionInChunkOfFour; + + // // const int inputIthChunkOfFour = outputCol / 2*group_size; + // // const int remainder1 = outputCol % 2*group_size; // 0*group_size + float4PositionInChunkOfFour or 1*group_size + float4PositionInChunkOfFour + // // const int 
float4PositionInChunkOfFour = remainder1 % group_size; + // // const int float4Index = 4*inputIthChunkOfFour+float4PositionInChunkOfFour; + // // const int offsetWithinFloat4 = col % 4; + + // const int inputCol = 4*float4Index + offsetWithinFloat4; + // shared_strided_PSSM.data[letter][outputCol] = strided_PSSM[letter][inputCol]; + // } + // __syncthreads(); + }; + + auto load_PSSM = [&](){ + if constexpr(SmemIndexCalculator::factor == 2){ + load_PSSM_double(); + }else{ + load_PSSM_single(); + } + }; + + const char4* subjectAsChar4; + char4 new_subject_letter4; + + auto makeCaseInsensitive4 = [](char4 encoded4){ + unsigned int asUint; + memcpy(&asUint, &encoded4, sizeof(unsigned int)); + + if constexpr(subjectIsCaseSensitive){ + // asUint = CaseSensitive_to_CaseInsensitive{}(asUint); + asUint = ClampToInvalid{}(asUint); + } + + memcpy(&encoded4, &asUint, sizeof(unsigned int)); + return encoded4; + }; + + load_PSSM(); + + //for(int alignmentId = idOfGroupInGrid; alignmentId < numSelected; alignmentId += numGroupsInGrid){ + const int alignmentId = idOfGroupInGrid; + if(alignmentId < numSelected){ + const auto subjectId = d_positions_of_selected_lengths[alignmentId]; + const SequenceLengthT subjectLength = devLengths[subjectId]; + const size_t base_S = devOffsets[subjectId]-devOffsets[0]; + + state.resetScores(); + state.resetMaximum(); + + subjectAsChar4 = reinterpret_cast(&devChars[base_S]); + + int k; + for (k=0; k= 1) { + new_subject_letter4 = makeCaseInsensitive4(subjectAsChar4[k/4]); + state.stepSingleTile(new_subject_letter4.x); + } + + if (subjectLength%4 >= 2) { + state.stepSingleTile(new_subject_letter4.y); + } + + if (subjectLength%4 >= 3) { + state.stepSingleTile(new_subject_letter4.z); + } + + state.reduceMaximumScore(); + const float overall_max = MathOps::max(state.maximum.x, state.maximum.y); + + if(group.thread_rank() == 0){ + devAlignmentScores[alignmentId] = overall_max; + } + } + } + + + + /* + PSSM kernel for a query of max length (2 * group_size * numRegs) + */ + template< + class ScoreType, + int blocksize, + int group_size, + int numRegs, + bool subjectIsCaseSensitive, + class ScoreOutputIterator, + class PositionsIterator + > + void call_GaplessFilter_strided_PSSM_singletile_kernel( + const char * const devChars, + ScoreOutputIterator const devAlignmentScores, + const size_t* const devOffsets, + const SequenceLengthT* const devLengths, + PositionsIterator const d_positions_of_selected_lengths, + const int numSelected, + const SequenceLengthT queryLength, + const PSSM_2D_View& strided_PSSM, + cudaStream_t stream + ){ + constexpr int groupsPerBlock = blocksize / group_size; + constexpr int alignmentsPerGroup = 1; + constexpr int alignmentsPerBlock = groupsPerBlock * alignmentsPerGroup; + // std::cout << "blocksize " << blocksize << ", group_size " << group_size + // << ", alignmentsPerBlock " << alignmentsPerBlock << ", numSelected " << numSelected << "\n"; + + constexpr int numRowsPSSM = 21; + #ifdef USE_IMPROVED_SMEM + constexpr int numColumnsPSSM = std::max(group_size,8) * numRegs; + #else + constexpr int numColumnsPSSM = group_size * numRegs; + #endif + using SharedPSSM = SharedPSSM_singletile; + + int smem = sizeof(SharedPSSM); + auto kernel = GaplessFilter_strided_PSSM_singletile_kernel< + ScoreType, + blocksize, + group_size, + numRegs, + subjectIsCaseSensitive, + ScoreOutputIterator, + PositionsIterator>; + + auto setSmemKernelAttribute = [&](){ + static std::map isSet; + if(smem > 48*1024){ + int deviceId; + cudaGetDevice(&deviceId); CUERR; + 
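+                //Requesting more than the default 48KB of dynamic shared memory needs an explicit
+                //opt-in via cudaFuncAttributeMaxDynamicSharedMemorySize, done at most once per device
+                //(tracked in isSet). Rough size, ignoring padding:
+                //  smem ~ 21 * numColumnsPSSM * sizeof(ScoreType)
+                //e.g. half2 scores with group_size=16, numRegs=64: 21 * 1024 * 4 B ~ 84KB > 48KB.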
if(!isSet[deviceId]){ + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); CUERR; + isSet[deviceId] = true; + } + } + }; + setSmemKernelAttribute(); + + dim3 grid = (numSelected + alignmentsPerBlock - 1) / alignmentsPerBlock; + + kernel<<>>( + devChars, + devAlignmentScores, + devOffsets, + devLengths, + d_positions_of_selected_lengths, + numSelected, + queryLength, + strided_PSSM + ); CUERR; + } + + + + + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + + template + void call_GaplessFilter_strided_PSSM_singletile_kernel( + int group_size, + int numRegs, + const char * const devChars, + ScoreOutputIterator const devAlignmentScores, + const size_t* const devOffsets, + const SequenceLengthT* const devLengths, + PositionsIterator const d_positions_of_selected_lengths, + const int numSelected, + const SequenceLengthT queryLength, + const PSSM_2D_View& strided_PSSM, + cudaStream_t stream + ){ + constexpr bool subjectIsCaseSensitive = true; + + #define X(g,r) \ + if(group_size == g && numRegs == r){ \ + 
call_GaplessFilter_strided_PSSM_singletile_kernel( \ + devChars, devAlignmentScores, devOffsets, devLengths, d_positions_of_selected_lengths, \ + numSelected, queryLength, strided_PSSM, stream \ + ); \ + } else + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + { throw std::runtime_error("invalid groupsize/numregs config");} + + #undef X + } + + + + + + + + + + + + + + + + /* + PSSM kernel for arbitrary query length + */ + template< + class ScoreType, + int blocksize, + int group_size, + int numRegs, + bool subjectIsCaseSensitive, + class ScoreOutputIterator, + class PositionsIterator + > + __global__ + __launch_bounds__(512,1) + void GaplessFilter_strided_PSSM_multitile_kernel( + __grid_constant__ const char * const devChars, + __grid_constant__ ScoreOutputIterator const devAlignmentScores, + __grid_constant__ const size_t* const devOffsets, + __grid_constant__ const SequenceLengthT* const devLengths, + __grid_constant__ PositionsIterator const d_positions_of_selected_lengths, + __grid_constant__ const int numSelected, + __grid_constant__ const SequenceLengthT queryLength, + __grid_constant__ const PSSM_2D_View strided_PSSM, + __grid_constant__ float2* const multiTileTempStorage, + __grid_constant__ const size_t tempStorageElementsPerGroup + ) { + if constexpr (std::is_same_v) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 900 + return; + #endif + } + static_assert(numRegs % 4 == 0); + static_assert(blocksize % group_size == 0); + __builtin_assume(blockDim.x == blocksize); + __builtin_assume(blockDim.x % group_size == 0); + + extern __shared__ char externalSmem[]; + constexpr int numRowsPSSM = 21; + #ifdef USE_IMPROVED_SMEM + constexpr int numColumnsPSSM = std::max(group_size,8) * numRegs; + #else + constexpr int numColumnsPSSM = group_size * numRegs; + #endif + using SharedPSSM = SharedPSSM_singletile; + + SharedPSSM& shared_strided_PSSM = *((SharedPSSM*)externalSmem); + + using MathOps = MathOps; + using Scalar = typename ScalarScoreType::type; + + auto group = cg::tiled_partition(cg::this_thread_block()); + const int numGroupsInBlock = blockDim.x / group_size; + const int idOfGroupInGrid = (threadIdx.x + blockIdx.x * blockDim.x) / group_size; + const int numGroupsInGrid = (blockDim.x * gridDim.x) / group_size; + + const size_t groupTempStorageOffset = idOfGroupInGrid * tempStorageElementsPerGroup; + float2* const groupTempStorage = multiTileTempStorage + groupTempStorageOffset; + + const int numTiles = SDIV(queryLength, 2 * group_size * numRegs); + + #ifdef USE_IMPROVED_SMEM + using SmemIndexCalculator = typename std::conditional< + group_size == 4, + SmemIndexCalculator, + SmemIndexCalculator + >::type; + #else + using SmemIndexCalculator = SmemIndexCalculator; + #endif + GaplessPSSMState state(shared_strided_PSSM, group); + + alignas(8) Scalar penalty_in[4]; + alignas(8) Scalar penalty_out[4]; + + auto load_PSSM_single = [&](int tileNr) { + const int columnOffset = tileNr * group_size * numRegs; + __syncthreads(); //wait for all groups before overwriting pssm + + for (int i=threadIdx.x; i<21*group_size*numRegs; i+=blockDim.x) { + int letter = i/(group_size*numRegs); + int col = i%(group_size*numRegs); + //shared_strided_PSSM.data[letter][col] = strided_PSSM_1d[i]; + //shared_strided_PSSM.data[letter][col] = strided_PSSM.data[i]; + shared_strided_PSSM.data[letter][col] = strided_PSSM[letter][columnOffset + col]; + } + __syncthreads(); + }; + + auto load_PSSM_double = [&](int tileNr) { + const int columnOffset = tileNr * group_size * numRegs; + __syncthreads(); //wait for all 
groups before overwriting pssm + + for (int i=threadIdx.x; i<21*group_size*numRegs; i+=blockDim.x) { + const int letter = i/(group_size*numRegs); + const int col = i%(group_size*numRegs); + auto value = strided_PSSM[letter][columnOffset + col]; + + const int float4Index = col / 4; + const int offsetWithinFloat4 = col % 4; + + const int ithChunkOfFour = float4Index / group_size; + const int float4PositionInChunkOfFour = float4Index % group_size; + + const int outputFloat4Index0 = (ithChunkOfFour*2*group_size + 0*group_size) + float4PositionInChunkOfFour; + const int outputFloat4Index1 = (ithChunkOfFour*2*group_size + 1*group_size) + float4PositionInChunkOfFour; + + shared_strided_PSSM.data[letter][4*outputFloat4Index0 + offsetWithinFloat4] = value; + shared_strided_PSSM.data[letter][4*outputFloat4Index1 + offsetWithinFloat4] = value; + } + __syncthreads(); + }; + + auto load_PSSM = [&](int tileNr){ + if constexpr(SmemIndexCalculator::factor == 2){ + load_PSSM_double(tileNr); + }else{ + load_PSSM_single(tileNr); + } + }; + + char4 new_subject_letter4; + + auto makeCaseInsensitive4 = [](char4 encoded4){ + unsigned int asUint; + memcpy(&asUint, &encoded4, sizeof(unsigned int)); + + if constexpr(subjectIsCaseSensitive){ + //asUint = CaseSensitive_to_CaseInsensitive{}(asUint); + asUint = ClampToInvalid{}(asUint); + } + + memcpy(&encoded4, &asUint, sizeof(unsigned int)); + return encoded4; + }; + + + //need to round up to blocks because loading pssm is a block-wide operation + const int numSelectedRoundedUp = SDIV(numSelected, numGroupsInBlock) * numGroupsInBlock; + + for(int alignmentId = idOfGroupInGrid; alignmentId < numSelectedRoundedUp; alignmentId += numGroupsInGrid){ + + size_t subjectId; + SequenceLengthT subjectLength; + size_t base_S; + const char4* subjectAsChar4; + + //first tile + { + /* + ----------------------- + Process tile 0 + ----------------------- + */ + + //load pssm for tile 0. blockwide operation + load_PSSM(0); + + if(alignmentId < numSelected){ + subjectId = d_positions_of_selected_lengths[alignmentId]; + subjectLength = devLengths[subjectId]; + base_S = devOffsets[subjectId]-devOffsets[0]; + + state.resetScores(); + state.resetMaximum(); + subjectAsChar4 = reinterpret_cast(&devChars[base_S]); + + int k; + + //process rows in chunks of 4 rows + for (k=0; k= 1) { + new_subject_letter4 = makeCaseInsensitive4(subjectAsChar4[k/4]); + state.stepFirstTile(new_subject_letter4.x, penalty_out[0]); + } + + if (subjectLength%4 >= 2) { + state.stepFirstTile(new_subject_letter4.y, penalty_out[1]); + } + + if (subjectLength%4 >= 3) { + state.stepFirstTile(new_subject_letter4.z, penalty_out[2]); + } + + //if there were remaining rows, update temp storage + if(subjectLength % 4 > 0){ + if(group.thread_rank() == group.size() - 1){ + groupTempStorage[k/4] = *((float2*)&penalty_out[0]); + } + } + } + } + + //intermediate tiles + for(int tileNr = 1; tileNr < numTiles - 1; tileNr++){ + /* + ----------------------- + Process tile tileNr + ----------------------- + */ + + //load pssm for tile tileNr. 
blockwide operation + load_PSSM(tileNr); + + if(alignmentId < numSelected){ + state.resetScores(); + + int k; + + //process rows in chunks of 4 rows + for (k=0; k= 1) { + new_subject_letter4 = makeCaseInsensitive4(subjectAsChar4[k/4]); + //load input penalty for remaining rows + if (group.thread_rank() == 0){ + *((float2*)&penalty_in[0]) = groupTempStorage[k/4]; + } + state.stepIntermediateTile(new_subject_letter4.x, penalty_in[0], penalty_out[0]); + } + + if (subjectLength%4 >= 2) { + state.stepIntermediateTile(new_subject_letter4.y, penalty_in[1], penalty_out[1]); + } + + if (subjectLength%4 >= 3) { + state.stepIntermediateTile(new_subject_letter4.z, penalty_in[2], penalty_out[2]); + } + + //if there were remaining rows, update temp storage + if(subjectLength % 4 > 0){ + if(group.thread_rank() == group.size() - 1){ + groupTempStorage[k/4] = *((float2*)&penalty_out[0]); + } + } + } + } + + //last tile + if(numTiles > 1){ + /* + ----------------------- + Process last tile (numTiles-1) + ----------------------- + */ + + //load pssm for tile (numTiles-1). blockwide operation + load_PSSM(numTiles-1); + + if(alignmentId < numSelected){ + + state.resetScores(); + + int k; + + //process rows in chunks of 4 rows + for (k=0; k= 1) { + new_subject_letter4 = makeCaseInsensitive4(subjectAsChar4[k/4]); + //load input penalty for remaining rows + if (group.thread_rank() == 0){ + *((float2*)&penalty_in[0]) = groupTempStorage[k/4]; + } + state.stepLastTile(new_subject_letter4.x, penalty_in[0]); + } + + if (subjectLength%4 >= 2) { + state.stepLastTile(new_subject_letter4.y, penalty_in[1]); + } + + if (subjectLength%4 >= 3) { + state.stepLastTile(new_subject_letter4.z, penalty_in[2]); + } + } + } + + if(alignmentId < numSelected){ + state.reduceMaximumScore(); + const float overall_max = MathOps::max(state.maximum.x, state.maximum.y); + + if(group.thread_rank() == 0){ + devAlignmentScores[alignmentId] = overall_max; + } + } + } + + } + + + /* + PSSM kernel for a query of max length (2 * group_size * numRegs) + */ + template< + class ScoreType, + int blocksize, + int group_size, + int numRegs, + bool subjectIsCaseSensitive, + class ScoreOutputIterator, + class PositionsIterator + > + void call_GaplessFilter_strided_PSSM_multitile_kernel( + int numThreadBlocks, + const char * const devChars, + ScoreOutputIterator const devAlignmentScores, + const size_t* const devOffsets, + const SequenceLengthT* const devLengths, + PositionsIterator const d_positions_of_selected_lengths, + const int numSelected, + const SequenceLengthT queryLength, + const PSSM_2D_View& strided_PSSM, + float2* const multiTileTempStorage, + size_t tempStorageElementsPerGroup, //number of float2s per group + cudaStream_t stream + ){ + //constexpr int groupsPerBlock = blocksize / group_size; + //constexpr int alignmentsPerGroup = 1; + //constexpr int alignmentsPerBlock = groupsPerBlock * alignmentsPerGroup; + // std::cout << "blocksize " << blocksize << ", group_size " << group_size + // << ", alignmentsPerBlock " << alignmentsPerBlock << ", numSelected " << numSelected << "\n"; + + constexpr int numRowsPSSM = 21; + #ifdef USE_IMPROVED_SMEM + constexpr int numColumnsPSSM = std::max(group_size,8) * numRegs; + #else + constexpr int numColumnsPSSM = group_size * numRegs; + #endif + using SharedPSSM = SharedPSSM_singletile; + + int smem = sizeof(SharedPSSM); + auto kernel = GaplessFilter_strided_PSSM_multitile_kernel; + + auto setSmemKernelAttribute = [&](){ + static std::map isSet; + if(smem > 48*1024){ + int deviceId; + cudaGetDevice(&deviceId); 
CUERR; + if(!isSet[deviceId]){ + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); CUERR; + isSet[deviceId] = true; + } + } + }; + setSmemKernelAttribute(); + + dim3 grid = std::min(numSelected, numThreadBlocks); + + kernel<<>>( + devChars, + devAlignmentScores, + devOffsets, + devLengths, + d_positions_of_selected_lengths, + numSelected, + queryLength, + strided_PSSM, + multiTileTempStorage, + tempStorageElementsPerGroup + ); CUERR; + } + + + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + template< + class ScoreType, + int blocksize, + class ScoreOutputIterator, + class PositionsIterator + > + void call_GaplessFilter_strided_PSSM_multitile_kernel( + int numThreadBlocks, + int group_size, + int numRegs, + const char * const devChars, + ScoreOutputIterator const devAlignmentScores, + const size_t* const devOffsets, + const SequenceLengthT* const devLengths, + PositionsIterator const d_positions_of_selected_lengths, + 
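+        //(The remaining parameters mirror the templated launcher above. multiTileTempStorage holds
+        //the per-row boundary penalties handed from one query tile to the next; with 16-bit scores
+        //one float2 covers 4 subject rows, so, as an illustrative lower bound,
+        //tempStorageElementsPerGroup should be at least ceil(max subject length / 4) per group.)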
const int numSelected, + const SequenceLengthT queryLength, + const PSSM_2D_View& strided_PSSM, + float2* const multiTileTempStorage, + size_t tempStorageElementsPerGroup, + cudaStream_t stream + ){ + constexpr bool subjectIsCaseSensitive = true; + + #define X(g,r) \ + if(group_size == g && numRegs == r){ \ + call_GaplessFilter_strided_PSSM_multitile_kernel( \ + numThreadBlocks, devChars, devAlignmentScores, devOffsets, devLengths, \ + d_positions_of_selected_lengths, numSelected, queryLength, strided_PSSM, \ + multiTileTempStorage, tempStorageElementsPerGroup, stream \ + ); \ + } else + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + { throw std::runtime_error("invalid groupsize/numregs config");} + + #undef X + } + +} //namespace hardcodedzero + + +namespace kernelparamzero{ + + template struct ScalarScoreType{}; + template<> struct ScalarScoreType{ using type = half; }; + template<> struct ScalarScoreType{ using type = short; }; + template<> struct ScalarScoreType{ using type = int; }; + template<> struct ScalarScoreType{ using type = float; }; + + template + struct GaplessPSSMState{ + using Scalar = typename ScalarScoreType::type; + using MathOps = MathOps; + + ScoreType penalty_here_array[numRegs]; + ScoreType maximum{}; //0 + ScoreType penalty_diag{}; //0 + SharedPSSM& shared_strided_PSSM; + Group& group; + + __device__ + GaplessPSSMState(SharedPSSM& s, Group& g) : shared_strided_PSSM(s), group(g) {} + + __device__ + void resetScores(){ + #pragma unroll + for(int i = 0; i < numRegs; i++){ + penalty_here_array[i] = ScoreType{}; + } + + penalty_diag = ScoreType{}; + } + + __device__ + void resetMaximum(){ + maximum = ScoreType{}; + } + + __device__ + void relax(int subject_letter, ScoreType zero){ + SmemIndexCalculator smemIndexCalculator; + + ScoreType score2; + ScoreType penalty_temp0; + ScoreType penalty_temp1; + + const auto* row = &shared_strided_PSSM.data[subject_letter][0]; + + float4 foo = *((float4*)&row[smemIndexCalculator.getIndex(0)]); + memcpy(&score2, &foo.x, sizeof(ScoreType)); + penalty_temp0 = penalty_here_array[0]; + penalty_here_array[0] = MathOps::add_relu(penalty_diag, score2, zero); + + memcpy(&score2, &foo.y, sizeof(ScoreType)); + penalty_temp1 = penalty_here_array[1]; + penalty_here_array[1] = MathOps::add_relu(penalty_temp0, score2, zero); + maximum = MathOps::max3(maximum, penalty_here_array[1], penalty_here_array[0]); + + memcpy(&score2, &foo.z, sizeof(ScoreType)); + penalty_temp0 = penalty_here_array[2]; + penalty_here_array[2] = MathOps::add_relu(penalty_temp1, score2, zero); + + memcpy(&score2, &foo.w, sizeof(ScoreType)); + penalty_temp1 = penalty_here_array[3]; + penalty_here_array[3] = MathOps::add_relu(penalty_temp0, score2, zero); + maximum = MathOps::max3(maximum, penalty_here_array[3], penalty_here_array[2]); + + + #pragma unroll + for (int i=1; i + __global__ + __launch_bounds__(512,1) + void GaplessFilter_strided_PSSM_singletile_kernel( + __grid_constant__ const char * const devChars, + __grid_constant__ ScoreOutputIterator const devAlignmentScores, + __grid_constant__ const size_t* const devOffsets, + __grid_constant__ const SequenceLengthT* const devLengths, + __grid_constant__ PositionsIterator const d_positions_of_selected_lengths, + __grid_constant__ const int numSelected, + __grid_constant__ const SequenceLengthT queryLength, + __grid_constant__ const PSSM_2D_View strided_PSSM, + __grid_constant__ const ScoreType zero + ) { + if constexpr (std::is_same_v) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 900 + return; + #endif + } + 
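// Editorial note: the `if constexpr` block above compiles the kernel body away
// (immediate return) when the score type relies on DPX instructions but the target
// architecture is below sm_90. The extraction lost the template argument of the
// std::is_same_v check, but the pssmkernels_gapless_instantiation_dpx*.cu files added
// later in this diff suggest it is the DPX-based score type; the half2-based
// instantiations are unaffected by this guard.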
static_assert(numRegs % 4 == 0); + static_assert(blocksize % group_size == 0); + __builtin_assume(blockDim.x == blocksize); + __builtin_assume(blockDim.x % group_size == 0); + + constexpr int numRowsPSSM = 21; + #ifdef USE_IMPROVED_SMEM + constexpr int numColumnsPSSM = std::max(group_size,8) * numRegs; + #else + constexpr int numColumnsPSSM = group_size * numRegs; + #endif + + using SharedPSSM = SharedPSSM_singletile; + using MathOps = MathOps; + + extern __shared__ char externalSmem[]; + + SharedPSSM& shared_strided_PSSM = *((SharedPSSM*)externalSmem); + + + auto group = cg::tiled_partition(cg::this_thread_block()); + const int idOfGroupInGrid = (threadIdx.x + blockIdx.x * blockDim.x) / group_size; + //const int numGroupsInGrid = (blockDim.x * gridDim.x) / group_size; + + #ifdef USE_IMPROVED_SMEM + using SmemIndexCalculator = typename std::conditional< + group_size == 4, + SmemIndexCalculator, + SmemIndexCalculator + >::type; + #else + using SmemIndexCalculator = SmemIndexCalculator; + #endif + GaplessPSSMState state(shared_strided_PSSM, group); + + auto load_PSSM_single = [&]() { + for (int i=threadIdx.x; i<21*group_size*numRegs; i+=blockDim.x) { + const int letter = i/(group_size*numRegs); + const int col = i%(group_size*numRegs); + shared_strided_PSSM.data[letter][col] = strided_PSSM[letter][col]; + } + __syncthreads(); + }; + + auto load_PSSM_double = [&]() { + for (int i=threadIdx.x; i<21*group_size*numRegs; i+=blockDim.x) { + const int letter = i/(group_size*numRegs); + const int col = i%(group_size*numRegs); + auto value = strided_PSSM[letter][col]; + + const int float4Index = col / 4; + const int offsetWithinFloat4 = col % 4; + + const int ithChunkOfFour = float4Index / group_size; + const int float4PositionInChunkOfFour = float4Index % group_size; + + const int outputFloat4Index0 = (ithChunkOfFour*2*group_size + 0*group_size) + float4PositionInChunkOfFour; + const int outputFloat4Index1 = (ithChunkOfFour*2*group_size + 1*group_size) + float4PositionInChunkOfFour; + + shared_strided_PSSM.data[letter][4*outputFloat4Index0 + offsetWithinFloat4] = value; + shared_strided_PSSM.data[letter][4*outputFloat4Index1 + offsetWithinFloat4] = value; + } + __syncthreads(); + }; + + auto load_PSSM = [&](){ + if constexpr(SmemIndexCalculator::factor == 2){ + load_PSSM_double(); + }else{ + load_PSSM_single(); + } + }; + + const char4* subjectAsChar4; + char4 new_subject_letter4; + + auto makeCaseInsensitive4 = [](char4 encoded4){ + unsigned int asUint; + memcpy(&asUint, &encoded4, sizeof(unsigned int)); + + if constexpr(subjectIsCaseSensitive){ + // asUint = CaseSensitive_to_CaseInsensitive{}(asUint); + asUint = ClampToInvalid{}(asUint); + } + + memcpy(&encoded4, &asUint, sizeof(unsigned int)); + return encoded4; + }; + + load_PSSM(); + + //for(int alignmentId = idOfGroupInGrid; alignmentId < numSelected; alignmentId += numGroupsInGrid){ + const int alignmentId = idOfGroupInGrid; + if(alignmentId < numSelected){ + const auto subjectId = d_positions_of_selected_lengths[alignmentId]; + const SequenceLengthT subjectLength = devLengths[subjectId]; + const size_t base_S = devOffsets[subjectId]-devOffsets[0]; + + state.resetScores(); + state.resetMaximum(); + + subjectAsChar4 = reinterpret_cast(&devChars[base_S]); + + int k; + for (k=0; k= 1) { + new_subject_letter4 = makeCaseInsensitive4(subjectAsChar4[k/4]); + state.stepSingleTile(new_subject_letter4.x, zero); + } + + if (subjectLength%4 >= 2) { + state.stepSingleTile(new_subject_letter4.y, zero); + } + + if (subjectLength%4 >= 3) { + 
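// third leftover subject letter: the main loop above consumes the subject four
// encoded letters (one char4) per iteration, so at most three guarded single
// steps remain when the subject length is not a multiple of 4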
state.stepSingleTile(new_subject_letter4.z, zero); + } + + state.reduceMaximumScore(); + const float overall_max = MathOps::max(state.maximum.x, state.maximum.y); + + if(group.thread_rank() == 0){ + devAlignmentScores[alignmentId] = overall_max; + } + } + } + + + + /* + PSSM kernel for a query of max length (2 * group_size * numRegs) + */ + template< + class ScoreType, + int blocksize, + int group_size, + int numRegs, + bool subjectIsCaseSensitive, + class ScoreOutputIterator, + class PositionsIterator + > + void call_GaplessFilter_strided_PSSM_singletile_kernel( + const char * const devChars, + ScoreOutputIterator const devAlignmentScores, + const size_t* const devOffsets, + const SequenceLengthT* const devLengths, + PositionsIterator const d_positions_of_selected_lengths, + const int numSelected, + const SequenceLengthT queryLength, + const PSSM_2D_View& strided_PSSM, + cudaStream_t stream + ){ + constexpr int groupsPerBlock = blocksize / group_size; + constexpr int alignmentsPerGroup = 1; + constexpr int alignmentsPerBlock = groupsPerBlock * alignmentsPerGroup; + // std::cout << "blocksize " << blocksize << ", group_size " << group_size + // << ", alignmentsPerBlock " << alignmentsPerBlock << ", numSelected " << numSelected << "\n"; + + constexpr int numRowsPSSM = 21; + #ifdef USE_IMPROVED_SMEM + constexpr int numColumnsPSSM = std::max(group_size,8) * numRegs; + #else + constexpr int numColumnsPSSM = group_size * numRegs; + #endif + using SharedPSSM = SharedPSSM_singletile; + + int smem = sizeof(SharedPSSM); + auto kernel = GaplessFilter_strided_PSSM_singletile_kernel< + ScoreType, + blocksize, + group_size, + numRegs, + subjectIsCaseSensitive, + ScoreOutputIterator, + PositionsIterator>; + + auto setSmemKernelAttribute = [&](){ + static std::map isSet; + if(smem > 48*1024){ + int deviceId; + cudaGetDevice(&deviceId); CUERR; + if(!isSet[deviceId]){ + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); CUERR; + isSet[deviceId] = true; + } + } + }; + setSmemKernelAttribute(); + + dim3 grid = (numSelected + alignmentsPerBlock - 1) / alignmentsPerBlock; + + kernel<<>>( + devChars, + devAlignmentScores, + devOffsets, + devLengths, + d_positions_of_selected_lengths, + numSelected, + queryLength, + strided_PSSM, + MathOps::zero_score() + ); CUERR; + } + + + + + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + 
#undef PositionsIterator + #undef ScoreOutputIterator + + + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + + template + void call_GaplessFilter_strided_PSSM_singletile_kernel( + int group_size, + int numRegs, + const char * const devChars, + ScoreOutputIterator const devAlignmentScores, + const size_t* const devOffsets, + const SequenceLengthT* const devLengths, + PositionsIterator const d_positions_of_selected_lengths, + const int numSelected, + const SequenceLengthT queryLength, + const PSSM_2D_View& strided_PSSM, + cudaStream_t stream + ){ + constexpr bool subjectIsCaseSensitive = true; + + #define X(g,r) \ + if(group_size == g && numRegs == r){ \ + call_GaplessFilter_strided_PSSM_singletile_kernel( \ + devChars, devAlignmentScores, devOffsets, devLengths, d_positions_of_selected_lengths, \ + numSelected, queryLength, strided_PSSM, stream \ + ); \ + } else + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + { throw std::runtime_error("invalid groupsize/numregs config");} + + #undef X + } + + + + + + + + + + + + + + + + /* + PSSM kernel for arbitrary query length + */ + template< + class ScoreType, + int blocksize, + int group_size, + int numRegs, + bool subjectIsCaseSensitive, + class ScoreOutputIterator, + class PositionsIterator + > + __global__ + __launch_bounds__(512,1) + void GaplessFilter_strided_PSSM_multitile_kernel( + __grid_constant__ const char * const devChars, + __grid_constant__ ScoreOutputIterator const devAlignmentScores, + __grid_constant__ const size_t* const devOffsets, + __grid_constant__ const SequenceLengthT* const devLengths, + __grid_constant__ PositionsIterator const d_positions_of_selected_lengths, + __grid_constant__ const int numSelected, + __grid_constant__ const SequenceLengthT queryLength, + __grid_constant__ const PSSM_2D_View strided_PSSM, + __grid_constant__ float2* const multiTileTempStorage, + __grid_constant__ const size_t tempStorageElementsPerGroup, + __grid_constant__ const ScoreType zero + ) { + if constexpr (std::is_same_v) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 900 + return; + #endif + } + static_assert(numRegs % 4 == 0); + static_assert(blocksize % group_size == 0); + __builtin_assume(blockDim.x == blocksize); + __builtin_assume(blockDim.x % group_size == 0); + + extern __shared__ char externalSmem[]; + constexpr int numRowsPSSM = 21; + #ifdef USE_IMPROVED_SMEM + 
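// Editorial note: with USE_IMPROVED_SMEM the shared tile is padded to at least
// 8 * numRegs columns so that group_size == 4 configurations can duplicate every
// PSSM column into two slots (SmemIndexCalculator with factor 2, filled by
// load_PSSM_double); group sizes of 8 and above keep the plain
// group_size * numRegs layout and use the single-copy load.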
constexpr int numColumnsPSSM = std::max(group_size,8) * numRegs; + #else + constexpr int numColumnsPSSM = group_size * numRegs; + #endif + using SharedPSSM = SharedPSSM_singletile; + + SharedPSSM& shared_strided_PSSM = *((SharedPSSM*)externalSmem); + + using MathOps = MathOps; + using Scalar = typename ScalarScoreType::type; + + auto group = cg::tiled_partition(cg::this_thread_block()); + const int numGroupsInBlock = blockDim.x / group_size; + const int idOfGroupInGrid = (threadIdx.x + blockIdx.x * blockDim.x) / group_size; + const int numGroupsInGrid = (blockDim.x * gridDim.x) / group_size; + + const size_t groupTempStorageOffset = idOfGroupInGrid * tempStorageElementsPerGroup; + float2* const groupTempStorage = multiTileTempStorage + groupTempStorageOffset; + + const int numTiles = SDIV(queryLength, 2 * group_size * numRegs); + + #ifdef USE_IMPROVED_SMEM + using SmemIndexCalculator = typename std::conditional< + group_size == 4, + SmemIndexCalculator, + SmemIndexCalculator + >::type; + #else + using SmemIndexCalculator = SmemIndexCalculator; + #endif + GaplessPSSMState state(shared_strided_PSSM, group); + + alignas(8) Scalar penalty_in[4]; + alignas(8) Scalar penalty_out[4]; + + auto load_PSSM_single = [&](int tileNr) { + const int columnOffset = tileNr * group_size * numRegs; + __syncthreads(); //wait for all groups before overwriting pssm + + for (int i=threadIdx.x; i<21*group_size*numRegs; i+=blockDim.x) { + int letter = i/(group_size*numRegs); + int col = i%(group_size*numRegs); + //shared_strided_PSSM.data[letter][col] = strided_PSSM_1d[i]; + //shared_strided_PSSM.data[letter][col] = strided_PSSM.data[i]; + shared_strided_PSSM.data[letter][col] = strided_PSSM[letter][columnOffset + col]; + } + __syncthreads(); + }; + + auto load_PSSM_double = [&](int tileNr) { + const int columnOffset = tileNr * group_size * numRegs; + __syncthreads(); //wait for all groups before overwriting pssm + + for (int i=threadIdx.x; i<21*group_size*numRegs; i+=blockDim.x) { + const int letter = i/(group_size*numRegs); + const int col = i%(group_size*numRegs); + auto value = strided_PSSM[letter][columnOffset + col]; + + const int float4Index = col / 4; + const int offsetWithinFloat4 = col % 4; + + const int ithChunkOfFour = float4Index / group_size; + const int float4PositionInChunkOfFour = float4Index % group_size; + + const int outputFloat4Index0 = (ithChunkOfFour*2*group_size + 0*group_size) + float4PositionInChunkOfFour; + const int outputFloat4Index1 = (ithChunkOfFour*2*group_size + 1*group_size) + float4PositionInChunkOfFour; + + shared_strided_PSSM.data[letter][4*outputFloat4Index0 + offsetWithinFloat4] = value; + shared_strided_PSSM.data[letter][4*outputFloat4Index1 + offsetWithinFloat4] = value; + } + __syncthreads(); + }; + + auto load_PSSM = [&](int tileNr){ + if constexpr(SmemIndexCalculator::factor == 2){ + load_PSSM_double(tileNr); + }else{ + load_PSSM_single(tileNr); + } + }; + + char4 new_subject_letter4; + + auto makeCaseInsensitive4 = [](char4 encoded4){ + unsigned int asUint; + memcpy(&asUint, &encoded4, sizeof(unsigned int)); + + if constexpr(subjectIsCaseSensitive){ + //asUint = CaseSensitive_to_CaseInsensitive{}(asUint); + asUint = ClampToInvalid{}(asUint); + } + + memcpy(&encoded4, &asUint, sizeof(unsigned int)); + return encoded4; + }; + + + //need to round up to blocks because loading pssm is a block-wide operation + const int numSelectedRoundedUp = SDIV(numSelected, numGroupsInBlock) * numGroupsInBlock; + + for(int alignmentId = idOfGroupInGrid; alignmentId < numSelectedRoundedUp; 
alignmentId += numGroupsInGrid){ + + size_t subjectId; + SequenceLengthT subjectLength; + size_t base_S; + const char4* subjectAsChar4; + + //first tile + { + /* + ----------------------- + Process tile 0 + ----------------------- + */ + + //load pssm for tile 0. blockwide operation + load_PSSM(0); + + if(alignmentId < numSelected){ + subjectId = d_positions_of_selected_lengths[alignmentId]; + subjectLength = devLengths[subjectId]; + base_S = devOffsets[subjectId]-devOffsets[0]; + + state.resetScores(); + state.resetMaximum(); + subjectAsChar4 = reinterpret_cast(&devChars[base_S]); + + int k; + + //process rows in chunks of 4 rows + for (k=0; k= 1) { + new_subject_letter4 = makeCaseInsensitive4(subjectAsChar4[k/4]); + state.stepFirstTile(new_subject_letter4.x, penalty_out[0], zero); + } + + if (subjectLength%4 >= 2) { + state.stepFirstTile(new_subject_letter4.y, penalty_out[1], zero); + } + + if (subjectLength%4 >= 3) { + state.stepFirstTile(new_subject_letter4.z, penalty_out[2], zero); + } + + //if there were remaining rows, update temp storage + if(subjectLength % 4 > 0){ + if(group.thread_rank() == group.size() - 1){ + groupTempStorage[k/4] = *((float2*)&penalty_out[0]); + } + } + } + } + + //intermediate tiles + for(int tileNr = 1; tileNr < numTiles - 1; tileNr++){ + /* + ----------------------- + Process tile tileNr + ----------------------- + */ + + //load pssm for tile tileNr. blockwide operation + load_PSSM(tileNr); + + if(alignmentId < numSelected){ + state.resetScores(); + + int k; + + //process rows in chunks of 4 rows + for (k=0; k= 1) { + new_subject_letter4 = makeCaseInsensitive4(subjectAsChar4[k/4]); + //load input penalty for remaining rows + if (group.thread_rank() == 0){ + *((float2*)&penalty_in[0]) = groupTempStorage[k/4]; + } + state.stepIntermediateTile(new_subject_letter4.x, penalty_in[0], penalty_out[0], zero); + } + + if (subjectLength%4 >= 2) { + state.stepIntermediateTile(new_subject_letter4.y, penalty_in[1], penalty_out[1], zero); + } + + if (subjectLength%4 >= 3) { + state.stepIntermediateTile(new_subject_letter4.z, penalty_in[2], penalty_out[2], zero); + } + + //if there were remaining rows, update temp storage + if(subjectLength % 4 > 0){ + if(group.thread_rank() == group.size() - 1){ + groupTempStorage[k/4] = *((float2*)&penalty_out[0]); + } + } + } + } + + //last tile + if(numTiles > 1){ + /* + ----------------------- + Process last tile (numTiles-1) + ----------------------- + */ + + //load pssm for tile (numTiles-1). 
blockwide operation + load_PSSM(numTiles-1); + + if(alignmentId < numSelected){ + + state.resetScores(); + + int k; + + //process rows in chunks of 4 rows + for (k=0; k= 1) { + new_subject_letter4 = makeCaseInsensitive4(subjectAsChar4[k/4]); + //load input penalty for remaining rows + if (group.thread_rank() == 0){ + *((float2*)&penalty_in[0]) = groupTempStorage[k/4]; + } + state.stepLastTile(new_subject_letter4.x, penalty_in[0], zero); + } + + if (subjectLength%4 >= 2) { + state.stepLastTile(new_subject_letter4.y, penalty_in[1], zero); + } + + if (subjectLength%4 >= 3) { + state.stepLastTile(new_subject_letter4.z, penalty_in[2], zero); + } + } + } + + if(alignmentId < numSelected){ + state.reduceMaximumScore(); + const float overall_max = MathOps::max(state.maximum.x, state.maximum.y); + + if(group.thread_rank() == 0){ + devAlignmentScores[alignmentId] = overall_max; + } + } + } + + } + + + /* + PSSM kernel for a query of max length (2 * group_size * numRegs) + */ + template< + class ScoreType, + int blocksize, + int group_size, + int numRegs, + bool subjectIsCaseSensitive, + class ScoreOutputIterator, + class PositionsIterator + > + void call_GaplessFilter_strided_PSSM_multitile_kernel( + int numThreadBlocks, + const char * const devChars, + ScoreOutputIterator const devAlignmentScores, + const size_t* const devOffsets, + const SequenceLengthT* const devLengths, + PositionsIterator const d_positions_of_selected_lengths, + const int numSelected, + const SequenceLengthT queryLength, + const PSSM_2D_View& strided_PSSM, + float2* const multiTileTempStorage, + size_t tempStorageElementsPerGroup, //number of float2s per group + cudaStream_t stream + ){ + //constexpr int groupsPerBlock = blocksize / group_size; + //constexpr int alignmentsPerGroup = 1; + //constexpr int alignmentsPerBlock = groupsPerBlock * alignmentsPerGroup; + // std::cout << "blocksize " << blocksize << ", group_size " << group_size + // << ", alignmentsPerBlock " << alignmentsPerBlock << ", numSelected " << numSelected << "\n"; + + constexpr int numRowsPSSM = 21; + #ifdef USE_IMPROVED_SMEM + constexpr int numColumnsPSSM = std::max(group_size,8) * numRegs; + #else + constexpr int numColumnsPSSM = group_size * numRegs; + #endif + using SharedPSSM = SharedPSSM_singletile; + + int smem = sizeof(SharedPSSM); + auto kernel = GaplessFilter_strided_PSSM_multitile_kernel; + + auto setSmemKernelAttribute = [&](){ + static std::map isSet; + if(smem > 48*1024){ + int deviceId; + cudaGetDevice(&deviceId); CUERR; + if(!isSet[deviceId]){ + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); CUERR; + isSet[deviceId] = true; + } + } + }; + setSmemKernelAttribute(); + + dim3 grid = std::min(numSelected, numThreadBlocks); + + kernel<<>>( + devChars, + devAlignmentScores, + devOffsets, + devLengths, + d_positions_of_selected_lengths, + numSelected, + queryLength, + strided_PSSM, + multiTileTempStorage, + tempStorageElementsPerGroup, + MathOps::zero_score() + ); CUERR; + } + + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + 
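// Editorial note: the X-macro blocks in this header only *declare* every valid
// configuration via `extern template`, and the pssmkernels_gapless_instantiation_*.cu
// files added later in this diff repeat the same blocks without `extern` to provide
// the definitions, so the expensive kernel instantiation happens in a handful of
// translation units instead of in every includer. A minimal, self-contained
// illustration of the idiom follows; all names here are illustrative and not part of
// the library, and the snippet assumes a .cu translation unit.

#include <cuda_runtime.h>

template<class T>
__global__ void fillKernelSketch(T* data, int n, T value){
    // trivial stand-in kernel: one element per thread
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n) data[i] = value;
}

template<class T>
void callFillSketch(T* d_data, int n, T value, cudaStream_t stream){
    fillKernelSketch<T><<<(n + 255) / 256, 256, 0, stream>>>(d_data, n, value);
}

// in the header: declare, but do not instantiate, the supported configuration
// extern template void callFillSketch<float>(float*, int, float, cudaStream_t);

// in the matching instantiation .cu file: provide the single explicit instantiation
template void callFillSketch<float>(float*, int, float, cudaStream_t);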
PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + extern template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + template< + class ScoreType, + int blocksize, + class ScoreOutputIterator, + class PositionsIterator + > + void call_GaplessFilter_strided_PSSM_multitile_kernel( + int numThreadBlocks, + int group_size, + int numRegs, + const char * const devChars, + ScoreOutputIterator const devAlignmentScores, + const size_t* const devOffsets, + const SequenceLengthT* const devLengths, + PositionsIterator const d_positions_of_selected_lengths, + const int numSelected, + const SequenceLengthT queryLength, + const PSSM_2D_View& strided_PSSM, + float2* const multiTileTempStorage, + size_t tempStorageElementsPerGroup, + cudaStream_t stream + ){ + constexpr bool subjectIsCaseSensitive = true; + + #define X(g,r) \ + if(group_size == g && numRegs == r){ \ + call_GaplessFilter_strided_PSSM_multitile_kernel( \ + numThreadBlocks, devChars, devAlignmentScores, devOffsets, devLengths, \ + d_positions_of_selected_lengths, numSelected, queryLength, strided_PSSM, \ + multiTileTempStorage, tempStorageElementsPerGroup, stream \ + ); \ + } else + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + { throw std::runtime_error("invalid groupsize/numregs config");} + + #undef X + } + + +} //namespace kernelparamzero + + +} //namespace cudasw4 + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/pssmkernels_gapless_instantiation_dpx.cu b/lib/libmarv/src/pssmkernels_gapless_instantiation_dpx.cu new file mode 100644 index 
000000000..88c58856d --- /dev/null +++ b/lib/libmarv/src/pssmkernels_gapless_instantiation_dpx.cu @@ -0,0 +1,111 @@ +#include "pssmkernels_gapless.cuh" + +namespace cudasw4{ + +namespace hardcodedzero{ + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + +} //namespace hardcodedzero + + +} //namespace cudasw4 \ No newline at end of file diff --git a/lib/libmarv/src/pssmkernels_gapless_instantiation_dpx_kernelparamzero.cu b/lib/libmarv/src/pssmkernels_gapless_instantiation_dpx_kernelparamzero.cu new file mode 100644 index 000000000..21535d159 --- /dev/null +++ b/lib/libmarv/src/pssmkernels_gapless_instantiation_dpx_kernelparamzero.cu @@ -0,0 +1,111 @@ +#include "pssmkernels_gapless.cuh" + +namespace cudasw4{ + +namespace kernelparamzero{ + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * const, \ + ScoreOutputIterator const, \ + const 
size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + +} //namespace hardcodedzero + + +} //namespace cudasw4 \ No newline at end of file diff --git a/lib/libmarv/src/pssmkernels_gapless_instantiation_half2.cu b/lib/libmarv/src/pssmkernels_gapless_instantiation_half2.cu new file mode 100644 index 000000000..77836855f --- /dev/null +++ b/lib/libmarv/src/pssmkernels_gapless_instantiation_half2.cu @@ -0,0 +1,112 @@ +#include "pssmkernels_gapless.cuh" + +namespace cudasw4{ + +namespace hardcodedzero{ + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * 
const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + +} //namespace hardcodedzero + + + +} //namespace cudasw4 \ No newline at end of file diff --git a/lib/libmarv/src/pssmkernels_gapless_instantiation_half2_kernelparamzero.cu b/lib/libmarv/src/pssmkernels_gapless_instantiation_half2_kernelparamzero.cu new file mode 100644 index 000000000..725e8ff17 --- /dev/null +++ b/lib/libmarv/src/pssmkernels_gapless_instantiation_half2_kernelparamzero.cu @@ -0,0 +1,112 @@ +#include "pssmkernels_gapless.cuh" + +namespace cudasw4{ + +namespace kernelparamzero{ + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_singletile_kernel( \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator decltype(thrust::make_counting_iterator(0)) + #define 
subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + + #define ScoreOutputIterator TopNMaximaArray + #define PositionsIterator ReferenceIdT* + #define subjectIsCaseSensitive true + #define X(g,r) \ + template void call_GaplessFilter_strided_PSSM_multitile_kernel( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + float2*, \ + size_t, \ + cudaStream_t \ + ); + + PSSM_GAPLESS_MULTITILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + #undef subjectIsCaseSensitive + #undef PositionsIterator + #undef ScoreOutputIterator + + +} //namespace hardcodedzero + + + +} //namespace cudasw4 \ No newline at end of file diff --git a/lib/libmarv/src/pssmkernels_smithwaterman.cuh b/lib/libmarv/src/pssmkernels_smithwaterman.cuh new file mode 100644 index 000000000..bd1e76980 --- /dev/null +++ b/lib/libmarv/src/pssmkernels_smithwaterman.cuh @@ -0,0 +1,1931 @@ +#ifndef PSSM_KERNELS_SMITH_WATERMAN_CUH +#define PSSM_KERNELS_SMITH_WATERMAN_CUH + +#include + +#include + +//#include "validtileconfigs.hpp" +#include "config.hpp" +#include "pssm.cuh" +#include "convert.cuh" +#include "mathops.cuh" +#include "util.cuh" + +#include +#include +namespace cg = cooperative_groups; + +namespace cudasw4{ + + + +#define PSSM_SW_ENDPOS_SINGLETILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X \ + X(4,4) X(4,8) X(4,12) X(4,16) X(4,20) X(4,24) X(4,28) X(4,32) \ + X(4,36) X(4,40) X(4,44) \ + X(8,4) X(8,8) X(8,12) X(8,16) X(8,20) X(8,24) X(8,28) X(8,32) \ + X(8,36) X(8,40) X(8,44) \ + X(16,4) X(16,8) X(16,12) X(16,16) X(16,20) X(16,24) X(16,28) X(16,32) \ + X(16,36) X(16,40) X(16,44) \ + X(32,4) X(32,8) X(32,12) X(32,16) X(32,20) X(32,24) X(32,28) X(32,32) + + +#if 0 +#define PSSM_SW_ENDPOS_MULTITILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X \ + X(4,4) X(4,8) X(4,12) X(4,16) X(4,20) X(4,24) X(4,28) X(4,32) \ + X(4,36) X(4,40) X(4,44) X(4,48) X(4,52) X(4,56) X(4,60) X(4,64) \ + X(8,4) X(8,8) X(8,12) X(8,16) X(8,20) X(8,24) X(8,28) X(8,32) \ + X(8,36) X(8,40) X(8,44) X(8,48) X(8,52) X(8,56) X(8,60) X(8,64) \ + X(16,4) X(16,8) X(16,12) X(16,16) X(16,20) X(16,24) X(16,28) X(16,32) \ + X(16,36) X(16,40) X(16,44) X(16,48) X(16,52) X(16,56) X(16,60) X(16,64) \ + X(32,4) X(32,8) X(32,12) X(32,16) X(32,20) X(32,24) X(32,28) X(32,32) +#else +#define PSSM_SW_ENDPOS_MULTITILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X \ + X(8,24) X(8,28) X(8,32) X(8,36) \ + X(16,20) X(16,24) X(16,28) X(16,32) \ + X(32,20) X(32,24) X(32,28) +#endif + + + + + +template< + class ScoreType, + int blocksize, + int groupsize, + int numItems, + bool withEndPosition, + bool subjectIsCaseSensitive, + class ScoreOutputIterator, + class PositionsIterator +> +__global__ +__launch_bounds__(blocksize,1) +void amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_singletile( + __grid_constant__ const char * const devChars, + __grid_constant__ ScoreOutputIterator const devAlignmentScores, + __grid_constant__ const size_t* const devOffsets, + 
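// subject sequences are stored back to back in devChars; devOffsets gives each
// subject's start (relative to devOffsets[0]) and devLengths its length, while
// d_indices selects which subjects this kernel call scores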
__grid_constant__ const SequenceLengthT* const devLengths, + __grid_constant__ PositionsIterator const d_indices, + __grid_constant__ const int numAlignments, + __grid_constant__ const SequenceLengthT queryLength, + __grid_constant__ const PSSM_2D_View strided_PSSM, + __grid_constant__ const ScoreType gapopenscore, + __grid_constant__ const ScoreType gapextendscore +){ + if constexpr (std::is_same_v) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 900 + return; + #endif + } + static_assert(std::is_same_v || std::is_same_v); + + static_assert(groupsize >= 4); + static_assert(groupsize <= 32); + static_assert(blocksize % groupsize == 0); + + __builtin_assume(blockDim.x == blocksize); + __builtin_assume(blockDim.x % groupsize == 0); + __builtin_assume(groupsize >= 4); + __builtin_assume(groupsize <= 32); + + auto group = cg::tiled_partition(cg::this_thread_block()); + + const int groupIdInGrid = (threadIdx.x + blockIdx.x * blockDim.x) / groupsize; + const int numGroupsInGrid = (blockDim.x * gridDim.x) / groupsize; + + constexpr ScoreType oobscore = -9999999; + constexpr int badLetter = 20; + + constexpr int numRowsPSSM = 21; + constexpr int numColumnsPSSM = groupsize * numItems; + + using MathOps = MathOps; + + using SPSSM = SharedPSSM_singletile; + extern __shared__ float4 externalSharedMem[]; + SPSSM& shared_pssm = *((SPSSM*)((char*)&externalSharedMem[0])); + + auto load_PSSM = [&](){ + for(int i = threadIdx.x; i < 21 * groupsize * numItems; i += blockDim.x){ + const int row = i / (groupsize * numItems); + const int col = i % (groupsize * numItems); + shared_pssm.data[row][col] = strided_PSSM[row][col]; + } + __syncthreads(); + }; + + auto makeCaseInsensitive4 = [](char4 encoded4){ + unsigned int asUint; + memcpy(&asUint, &encoded4, sizeof(unsigned int)); + + if constexpr(subjectIsCaseSensitive){ + asUint = CaseSensitive_to_CaseInsensitive{}(asUint); + // asUint = ClampToInvalid{}(asUint); + } + + memcpy(&encoded4, &asUint, sizeof(unsigned int)); + return encoded4; + }; + + //load PSSM to smem + for(int i = threadIdx.x; i < 21 * groupsize * numItems; i += blockDim.x){ + const int row = i / (groupsize * numItems); + const int col = i % (groupsize * numItems); + shared_pssm.data[row][col] = strided_PSSM[row][col]; + } + __syncthreads(); + + + for(int alignmentId = groupIdInGrid; alignmentId < numAlignments; alignmentId += numGroupsInGrid){ + + ScoreType scoresF[numItems]{}; + ScoreType scoresM[numItems]{}; + ScoreType scoreLeft; + ScoreType scoreDiag; + ScoreType E; + ScoreType maxObserved = oobscore; + int positionOfMaxObserved_y = 0; + int positionOfMaxObserved_itemIndex = 0; + + + auto printState = [&](int row){ + // for(int g = 0; g < 1; g++){ + // if(groupIdInGrid == g){ + // printf("printstate row %d, groupIdInGrid %d\n", row, groupIdInGrid); + // if(group.thread_rank() == 0){ + // printf("M\n"); + // } + // for(int t = 0; t < groupsize; t++){ + // if(t == group.thread_rank()){ + // for(int i = 0; i < numItems; i++){ + // printf("%3f ", scoresM[i]); + // } + // printf("\n"); + // } + // group.sync(); + // } + // if(group.thread_rank() == 0){ + // printf("\n"); + // } + + // if(group.thread_rank() == 0){ + // printf("F\n"); + // } + // for(int t = 0; t < groupsize; t++){ + // if(t == group.thread_rank()){ + // for(int i = 0; i < numItems; i++){ + // printf("%3f ", scoresF[i]); + // } + // printf("\n"); + // } + // group.sync(); + // } + // if(group.thread_rank() == 0){ + // printf("\n"); + // } + // } + // } + }; + + const auto globalIndex = d_indices[alignmentId]; + SequenceLengthT 
subjectLength = devLengths[globalIndex]; + const auto charOffset = devOffsets[globalIndex]-devOffsets[0]; + const char4* groupSubjectChar4 = reinterpret_cast(&devChars[charOffset]); + + int loadOffsetLimit = SDIV(subjectLength, 4); + int subjectLoadOffset = group.thread_rank(); + char4 current4Letters; + int currentLetter = badLetter; + + auto loadNext4Letters = [&](){ + if(subjectLoadOffset < loadOffsetLimit){ + current4Letters = makeCaseInsensitive4(groupSubjectChar4[subjectLoadOffset]); + subjectLoadOffset += group.size(); + }else{ + current4Letters = makeCaseInsensitive4(make_char4(badLetter, badLetter, badLetter, badLetter)); + } + }; + + auto shuffleCurrentLetter = [&](){ + currentLetter = group.shfl_up(currentLetter, 1); + }; + + auto shuffle4Letters = [&](){ + static_assert(sizeof(char4) == sizeof(int)); + int temp; + memcpy(&temp, ¤t4Letters, sizeof(char4)); + temp = group.shfl_down(temp, 1); + memcpy(¤t4Letters, &temp, sizeof(int)); + }; + + auto relaxFirstDiagonal = [&](int row, bool isFirstTile){ + static_assert(numItems % 4 == 0); + + using Vec4T = typename Vectorized4::type; + const Vec4T* const pssmRow4 = reinterpret_cast(&shared_pssm.data[currentLetter][0]); + //const Vec4T* const pssmRow4 = reinterpret_cast(&strided_PSSM[currentLetter][0]); + Vec4T foo = pssmRow4[0 * groupsize + group.thread_rank()]; + ScoreType fooArray[4]; + memcpy(&fooArray[0], &foo, sizeof(Vec4T)); + + //in the first tile E is always computed. In succeeding tiles, E is already computed for the first thread (loaded from temp storage) + if(isFirstTile){ + E = MathOps::add_max(scoreLeft, gapopenscore, MathOps::add(E, gapextendscore)); + }else{ + if(group.thread_rank() > 0){ + E = MathOps::add_max(scoreLeft, gapopenscore, MathOps::add(E, gapextendscore)); + } + } + + scoresF[0] = MathOps::add_max(scoresM[0], gapopenscore, MathOps::add(scoresF[0], gapextendscore)); + ScoreType upTempScore = scoresM[0]; + scoresM[0] = MathOps::add_max_relu(scoreDiag, fooArray[0], MathOps::max(E, scoresF[0])); + scoreDiag = upTempScore; + if constexpr (withEndPosition){ + if(maxObserved < scoresM[0]){ + maxObserved = scoresM[0]; + positionOfMaxObserved_itemIndex = 0; + positionOfMaxObserved_y = row; + } + }else{ + maxObserved = MathOps::max(maxObserved, scoresM[0]); + } + + + #pragma unroll + for(int k = 1; k < 4; k++){ + E = MathOps::add_max(scoresM[k-1], gapopenscore, MathOps::add(E, gapextendscore)); + scoresF[k] = MathOps::add_max(scoresM[k], gapopenscore, MathOps::add(scoresF[k], gapextendscore)); + ScoreType upTempScore = scoresM[k]; + scoresM[k] = MathOps::add_max_relu(scoreDiag, fooArray[k], MathOps::max(E, scoresF[k])); + scoreDiag = upTempScore; + if constexpr (withEndPosition){ + if(maxObserved < scoresM[k]){ + maxObserved = scoresM[k]; + positionOfMaxObserved_itemIndex = k; + positionOfMaxObserved_y = row; + } + }else{ + maxObserved = MathOps::max(maxObserved, scoresM[k]); + } + } + + #pragma unroll + for(int i = 1; i < numItems/4; i++){ + foo = pssmRow4[i * group.size() + group.thread_rank()]; + memcpy(&fooArray[0], &foo, sizeof(ScoreType) * 4); + + #pragma unroll + for(int k = 0; k < 4; k++){ + E = MathOps::add_max(scoresM[4*i + k-1], gapopenscore, MathOps::add(E, gapextendscore)); + scoresF[4*i + k] = MathOps::add_max(scoresM[4*i + k], gapopenscore, MathOps::add(scoresF[4*i + k], gapextendscore)); + ScoreType upTempScore = scoresM[4*i + k]; + scoresM[4*i + k] = MathOps::add_max_relu(scoreDiag, fooArray[k], MathOps::max(E, scoresF[4*i + k])); + scoreDiag = upTempScore; + if constexpr (withEndPosition){ + 
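// when withEndPosition is requested, the running maximum also records which
// item (query column handled by this thread) and which row produced it, so the
// group reduction at the end of the kernel can report the alignment end
// coordinates; otherwise only the score itself is tracked via MathOps::max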
if(maxObserved < scoresM[4*i + k]){ + maxObserved = scoresM[4*i + k]; + positionOfMaxObserved_itemIndex = 4*i + k; + positionOfMaxObserved_y = row; + } + }else{ + maxObserved = MathOps::max(maxObserved, scoresM[4*i + k]); + } + } + } + + //advance E by 1 column and F by 1 row to allow for optimized computations of remaining diagonals + E = MathOps::add_max(scoresM[numItems-1], gapopenscore, MathOps::add(E, gapextendscore)); + for(int k = 0; k < numItems; k++){ + scoresF[k] = MathOps::add_max(scoresM[k], gapopenscore, MathOps::add(scoresF[k], gapextendscore)); + } + + //printState(row); + }; + + auto relax = [&](int row){ + static_assert(numItems % 4 == 0); + + using Vec4T = typename Vectorized4::type; + const Vec4T* const pssmRow4 = reinterpret_cast(&shared_pssm.data[currentLetter][0]); + //const Vec4T* const pssmRow4 = reinterpret_cast(&strided_PSSM[currentLetter][0]); + Vec4T foo = pssmRow4[0 * groupsize + group.thread_rank()]; + ScoreType fooArray[4]; + memcpy(&fooArray[0], &foo, sizeof(Vec4T)); + + // E of current column and scoresF of current row are already computed + + ScoreType tempM = scoresM[0]; + scoresM[0] = MathOps::add_max_relu(scoreDiag, fooArray[0], MathOps::max(E, scoresF[0])); + if constexpr (withEndPosition){ + if(maxObserved < scoresM[0]){ + maxObserved = scoresM[0]; + positionOfMaxObserved_itemIndex = 0; + positionOfMaxObserved_y = row; + } + }else{ + maxObserved = MathOps::max(maxObserved, scoresM[0]); + } + E = MathOps::add_max(scoresM[0], gapopenscore, MathOps::add(E, gapextendscore)); + scoresF[0] = MathOps::add_max(scoresM[0], gapopenscore, MathOps::add(scoresF[0],gapextendscore)); //this computes F of the next row ! + scoreDiag = tempM; + + #pragma unroll + for(int i = 1; i < 4; i++){ + tempM = scoresM[i]; + scoresM[i] = MathOps::add_max_relu(scoreDiag, fooArray[i], MathOps::max(E, scoresF[i])); + if constexpr (withEndPosition){ + if(maxObserved < scoresM[i]){ + maxObserved = scoresM[i]; + positionOfMaxObserved_itemIndex = i; + positionOfMaxObserved_y = row; + } + }else{ + maxObserved = MathOps::max(maxObserved, scoresM[i]); + } + E = MathOps::add_max(scoresM[i], gapopenscore, MathOps::add(E, gapextendscore)); + scoresF[i] = MathOps::add_max(scoresM[i], gapopenscore, MathOps::add(scoresF[i], gapextendscore)); //this computes F of the next row ! + scoreDiag = tempM; + } + + #pragma unroll + for(int k = 1; k < numItems/4; k++){ + foo = pssmRow4[k * groupsize + group.thread_rank()]; + memcpy(&fooArray[0], &foo, sizeof(Vec4T)); + + #pragma unroll + for(int i = 0; i < 4; i++){ + const int index = k*4+i; + tempM = scoresM[index]; + scoresM[index] = MathOps::add_max_relu(scoreDiag, fooArray[i], MathOps::max(E, scoresF[index])); + if constexpr (withEndPosition){ + if(maxObserved < scoresM[index]){ + maxObserved = scoresM[index]; + positionOfMaxObserved_itemIndex = index; + positionOfMaxObserved_y = row; + } + }else{ + maxObserved = MathOps::max(maxObserved, scoresM[index]); + } + E = MathOps::add_max(scoresM[index], gapopenscore, MathOps::add(E, gapextendscore)); + scoresF[index] = MathOps::add_max(scoresM[index], gapopenscore, MathOps::add(scoresF[index], gapextendscore)); //this computes F of the next row ! 
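// Editorial note: written out as the usual affine-gap local-alignment recurrence,
// the statements above appear to compute (gap penalties are negative and added):
//   E(i,j) = max( H(i,j-1) + gapopen,  E(i,j-1) + gapextend )   // horizontal gap state, along the query
//   F(i,j) = max( H(i-1,j) + gapopen,  F(i-1,j) + gapextend )   // vertical gap state, along the subject
//   H(i,j) = max( 0,  H(i-1,j-1) + PSSM(subject_i, j),  E(i,j),  F(i,j) )
// i.e. MathOps::add_max(a,b,c) behaves like max(a+b, c) and
// MathOps::add_max_relu(a,b,c) like max(a+b, c, 0); scoresF is advanced one row
// early so that the next row can consume it directly.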
+ scoreDiag = tempM; + } + + } + + //printState(row); + }; + + auto initScores = [&](){ + if(group.thread_rank() == 0){ + #pragma unroll + for (int i=0; i < numItems; i++) { + scoresM[i] = 0; + scoresF[i] = oobscore; + } + scoreDiag = 0; + scoreLeft = 0; + E = oobscore; + }else{ + #pragma unroll + for (int i=0; i < numItems; i++) { + scoresM[i] = oobscore; + scoresF[i] = oobscore; + } + scoreDiag = oobscore; + scoreLeft = group.thread_rank() == 1 ? 0 : oobscore; + E = oobscore; + } + }; + + auto shuffleScores = [&](){ + scoreDiag = scoreLeft; + const ScoreType newscoreLeft = group.shfl_up(scoresM[numItems-1], 1); + const ScoreType newE = group.shfl_up(E, 1); + if(group.thread_rank() == 0){ + //scoreLeft is only modified in this function and is initialized with 0 for thread 0 + // assert(scoreLeft == 0); + //scoreLeft = 0; + + // E = oobscore; + E = gapopenscore; // After first diagonal was processed, thread 0 needs E of matrix column 1, not -infty + }else{ + scoreLeft = newscoreLeft; + E = newE; + } + }; + + loadNext4Letters(); + initScores(); + + const int outputThreadRank = (queryLength-1) / numItems; + const int numRows = subjectLength + outputThreadRank + 1; + + + //printState(0); + + //process 4 letters per iteration + int r = 1; + if(group.thread_rank() == 0){ currentLetter = current4Letters.x; } + constexpr bool isFirstTile = true; + relaxFirstDiagonal(r, isFirstTile); //x + shuffleScores(); + + if(r+1 < numRows){ + shuffleCurrentLetter(); + if(group.thread_rank() == 0){ currentLetter = current4Letters.y; } + relax(r+1); //y + shuffleScores(); + } + if(r+2 < numRows){ + shuffleCurrentLetter(); + if(group.thread_rank() == 0){ currentLetter = current4Letters.z; } + relax(r+2); //z + shuffleScores(); + } + if(r+3 < numRows){ + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.w; } + relax(r+3); //w + shuffleScores(); + + shuffleCurrentLetter(); + if((r + 3) % (4*group.size()) == 0){ + //used up all query letters stored across the group. reload + loadNext4Letters(); + }else{ + //get next 4 letters from neighbor + shuffle4Letters(); + } + } + r = 5; + for(; r < numRows - 3; r += 4){ + + if(group.thread_rank() == 0){ currentLetter = current4Letters.x; } + relax(r); //x + shuffleScores(); + + shuffleCurrentLetter(); + if(group.thread_rank() == 0){ currentLetter = current4Letters.y; } + relax(r+1); //y + shuffleScores(); + + shuffleCurrentLetter(); + if(group.thread_rank() == 0){ currentLetter = current4Letters.z; } + relax(r+2); //z + shuffleScores(); + + shuffleCurrentLetter(); + if(group.thread_rank() == 0){ currentLetter = current4Letters.w; } + relax(r+3); //w + shuffleScores(); + + shuffleCurrentLetter(); + if((r + 3) % (4*group.size()) == 0){ + //used up all query letters stored across the group. 
reload + loadNext4Letters(); + }else{ + //get next 4 letters from neighbor + shuffle4Letters(); + } + } + + //can have at most 3 remaining rows + if(r < numRows){ + if(group.thread_rank() == 0){ currentLetter = current4Letters.x; } + relax(r); //x + shuffleScores(); + shuffleCurrentLetter(); + + } + if(r+1 < numRows){ + if(group.thread_rank() == 0){ currentLetter = current4Letters.y; } + relax(r+1); //y + shuffleScores(); + shuffleCurrentLetter(); + + } + if(r+2 < numRows){ + if(group.thread_rank() == 0){ currentLetter = current4Letters.z; } + relax(r+2); //z + } + + if constexpr (withEndPosition){ + if(alignmentId < numAlignments){ + const int3 packed = make_int3(maxObserved, + group.thread_rank() * numItems + positionOfMaxObserved_itemIndex, + positionOfMaxObserved_y - group.thread_rank() - 1); + const int3 maxPacked = cg::reduce(group, packed, [](int3 l, int3 r){ + if(l.x > r.x){ + return l; + }else{ + return r; + } + }); + + if(group.thread_rank() == 0){ + ScoreWithExtra res(maxPacked.x, AlignmentEndPosition{maxPacked.y, maxPacked.z}); + + devAlignmentScores[alignmentId] = res; + //devAlignmentScores[alignmentId] = maxPacked.x; + //endPositionOutput[alignmentId] = make_int2(maxPacked.y, maxPacked.z); + } + } + }else{ + if(alignmentId < numAlignments){ + maxObserved = cg::reduce(group, maxObserved, cg::greater{}); + + if(group.thread_rank() == 0){ + ScoreWithExtra res(maxObserved, AlignmentEndPosition{0, 0}); + + devAlignmentScores[alignmentId] = res; + //devAlignmentScores[alignmentId] = maxObserved; + } + } + } + } + +} + + +template< + class ScoreType, + int blocksize, + int groupsize, + int numItems, + bool withEndPosition, + bool subjectIsCaseSensitive, + class ScoreOutputIterator, + class PositionsIterator +> +void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_singletile( + int numThreadBlocks, + const char * const devChars, + ScoreOutputIterator const devAlignmentScores, + const size_t* const devOffsets, + const SequenceLengthT* const devLengths, + PositionsIterator const d_indices, + const int numAlignments, + const SequenceLengthT queryLength, + const PSSM_2D_View& strided_PSSM, + const ScoreType gapopenscore, + const ScoreType gapextendscore, + cudaStream_t stream +){ + //constexpr int groupsPerBlock = blocksize / group_size; + //constexpr int alignmentsPerGroup = 1; + //constexpr int alignmentsPerBlock = groupsPerBlock * alignmentsPerGroup; + // std::cout << "blocksize " << blocksize << ", group_size " << group_size + // << ", alignmentsPerBlock " << alignmentsPerBlock << ", numAlignments " << numAlignments << "\n"; + + constexpr int numRowsPSSM = 21; + constexpr int numColumnsPSSM = groupsize * numItems; + using SPSSM = SharedPSSM_singletile; + int smem = sizeof(SPSSM); + auto kernel = amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_singletile< + ScoreType, + blocksize, + groupsize, + numItems, + withEndPosition, + subjectIsCaseSensitive, + ScoreOutputIterator, + PositionsIterator>; + + auto setSmemKernelAttribute = [&](){ + static std::map isSet; + if(smem > 48*1024){ + int deviceId; + cudaGetDevice(&deviceId); CUERR; + if(!isSet[deviceId]){ + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); CUERR; + isSet[deviceId] = true; + } + } + }; + setSmemKernelAttribute(); + + dim3 grid = std::min(numAlignments, numThreadBlocks); + + kernel<<>>( + devChars, + devAlignmentScores, + devOffsets, + devLengths, + d_indices, + numAlignments, + queryLength, + strided_PSSM, + gapopenscore, + gapextendscore + ); CUERR; +} + + + + + + 
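// Editorial note: as a readability aid, the DP that the kernels above parallelize can
// be written as a plain scalar CPU routine. The following is an illustrative reference
// sketch, not part of the library: gap penalties are assumed to be negative and are
// added, matching the add_max usage in the kernels, and pssm[letter][j] is assumed to
// hold the substitution score of subject letter `letter` at query column j.

#include <vector>
#include <algorithm>

inline float localAlignAffinePssmReference(const std::vector<int>& subject,
                                           const std::vector<std::vector<float>>& pssm,
                                           int queryLength,
                                           float gapopen, float gapextend){
    const float NEG = -1e30f;
    std::vector<float> H(queryLength + 1, 0.0f);   // H of the previous row, updated in place
    std::vector<float> F(queryLength + 1, NEG);    // F of the previous row, per query column
    float best = 0.0f;
    for(int letter : subject){
        float E = NEG;          // horizontal gap state, carried along the row
        float Hdiag = 0.0f;     // H(i-1, j-1); column 0 of a local alignment is always 0
        for(int j = 1; j <= queryLength; j++){
            E    = std::max(H[j - 1] + gapopen, E + gapextend);     // H[j-1] already holds H(i, j-1)
            F[j] = std::max(H[j]     + gapopen, F[j] + gapextend);  // H[j] still holds H(i-1, j)
            const float h = std::max({0.0f, Hdiag + pssm[letter][j - 1], E, F[j]});
            Hdiag = H[j];       // becomes H(i-1, j-1) for the next column
            H[j]  = h;
            best  = std::max(best, h);
        }
    }
    return best;                // maximal local alignment score, as reduced per group in the kernels
}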
+#define ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator decltype(thrust::make_counting_iterator(0)) +#define withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + extern template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_singletile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const float, \ + const float, \ + cudaStream_t \ + ); + +PSSM_SW_ENDPOS_SINGLETILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef subjectIsCaseSensitive +#undef withEndPosition +#undef PositionsIterator +#undef ScoreOutputIterator + +#define ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator ReferenceIdT* +#define withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + extern template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_singletile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const float, \ + const float, \ + cudaStream_t \ + ); + +PSSM_SW_ENDPOS_SINGLETILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef subjectIsCaseSensitive +#undef withEndPosition +#undef PositionsIterator +#undef ScoreOutputIterator + +#define ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator decltype(thrust::make_counting_iterator(0)) +#define withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + extern template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_singletile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const int, \ + const int, \ + cudaStream_t \ + ); + +PSSM_SW_ENDPOS_SINGLETILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef subjectIsCaseSensitive +#undef withEndPosition +#undef PositionsIterator +#undef ScoreOutputIterator + +#define ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator ReferenceIdT* +#define withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + extern template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_singletile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const int, \ + const int, \ + cudaStream_t \ + ); + +PSSM_SW_ENDPOS_SINGLETILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef subjectIsCaseSensitive +#undef withEndPosition +#undef PositionsIterator +#undef ScoreOutputIterator + + + + +template< + class ScoreType, + int blocksize, + bool withEndPosition, + bool subjectIsCaseSensitive, + class ScoreOutputIterator, + class PositionsIterator +> +void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_singletile( + int numThreadBlocks, + int groupsize, + int numItems, + const char * const devChars, + ScoreOutputIterator const devAlignmentScores, + const size_t* const devOffsets, + const SequenceLengthT* const devLengths, + PositionsIterator const d_indices, + const int 
numAlignments, + const SequenceLengthT queryLength, + const PSSM_2D_View& strided_PSSM, + const ScoreType gapopenscore, + const ScoreType gapextendscore, + cudaStream_t stream +){ + #define X(g,r) \ + if(groupsize == g && numItems == r){ \ + call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_singletile( \ + numThreadBlocks, devChars, devAlignmentScores, devOffsets, devLengths, \ + d_indices, numAlignments, queryLength, strided_PSSM, \ + gapopenscore, gapextendscore, stream \ + ); \ + } else + + PSSM_SW_ENDPOS_SINGLETILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + { throw std::runtime_error("invalid groupsize/numregs config");} + + #undef X + +} + + + + + + +template< + class ScoreType, + int blocksize, + int groupsize, + int numItems, + bool withEndPosition, + bool subjectIsCaseSensitive, + class ScoreOutputIterator, + class PositionsIterator +> +__global__ +__launch_bounds__(blocksize,1) +void amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_multitile( + __grid_constant__ const char * const devChars, + __grid_constant__ ScoreOutputIterator const devAlignmentScores, + __grid_constant__ const size_t* const devOffsets, + __grid_constant__ const SequenceLengthT* const devLengths, + __grid_constant__ PositionsIterator const d_indices, + __grid_constant__ const int numAlignments, + __grid_constant__ const SequenceLengthT queryLength, + __grid_constant__ const PSSM_2D_View strided_PSSM, + __grid_constant__ const ScoreType gapopenscore, + __grid_constant__ const ScoreType gapextendscore, + __grid_constant__ char* const tempStorage, + __grid_constant__ const size_t tempBytesPerGroup +){ + if constexpr (std::is_same_v) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 900 + return; + #endif + } + static_assert(std::is_same_v || std::is_same_v); + + static_assert(groupsize >= 4); + static_assert(groupsize <= 32); + static_assert(blocksize % groupsize == 0); + + __builtin_assume(blockDim.x == blocksize); + __builtin_assume(blockDim.x % groupsize == 0); + __builtin_assume(groupsize >= 4); + __builtin_assume(groupsize <= 32); + + auto group = cg::tiled_partition(cg::this_thread_block()); + + const int groupIdInGrid = (threadIdx.x + blockIdx.x * blockDim.x) / groupsize; + const int numGroupsInGrid = (blockDim.x * gridDim.x) / groupsize; + constexpr int numGroupsPerBlock = blocksize / groupsize; + + constexpr ScoreType oobscore = -9999999; + constexpr int badLetter = 20; + + constexpr int numRowsPSSM = 21; + constexpr int numColumnsPSSM = groupsize * numItems; + + using MathOps = MathOps; + using SPSSM = SharedPSSM_singletile; + extern __shared__ float4 externalSharedMem[]; + SPSSM& shared_pssm = *((SPSSM*)((char*)&externalSharedMem[0])); + + auto load_PSSM = [&](int tileNr){ + __syncthreads(); + // if(threadIdx.x == 0){ + // printf("load_PSSM tileNr %d\n", tileNr); + // } + const int columnOffset = tileNr * groupsize * numItems; + for(int i = threadIdx.x; i < 21 * groupsize * numItems; i += blockDim.x){ + const int row = i / (groupsize * numItems); + const int col = i % (groupsize * numItems); + shared_pssm.data[row][col] = strided_PSSM[row][columnOffset + col]; + } + __syncthreads(); + // if(blockIdx.x == 0 && threadIdx.x == 0){ + // printf("in gmem:\n"); + // for(int i = 0; i < groupsize * numItems; i++){ + // printf("%3f ", strided_PSSM[0][columnOffset + i]); + // } + // printf("\n"); + + // printf("in smem:\n"); + // for(int i = 0; i < groupsize * numItems; i++){ + // printf("%3f ", shared_pssm.data[0][i]); + // } + // printf("\n"); + // } + // __syncthreads(); + }; + + auto 
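// The load_PSSM lambda above is a block-wide strided copy of one query tile of the strided
// PSSM into shared memory, fenced by __syncthreads() on both sides. The same pattern in
// isolation; tile sizes, the flat row-major layout and the kernel name are assumptions made
// for this sketch only:

#include <vector>
#include <cuda_runtime.h>

constexpr int PSSM_ROWS = 21;     // amino-acid rows, as in the kernels
constexpr int TILE_COLS = 64;     // one tile of query columns (assumed)

__global__ void loadPssmTileDemo(const float* __restrict__ globalPssm, int pitch,
                                 int tileNr, float* out){
    __shared__ float tile[PSSM_ROWS][TILE_COLS];
    const int columnOffset = tileNr * TILE_COLS;
    __syncthreads();                                        // previous tile no longer in use
    for(int i = threadIdx.x; i < PSSM_ROWS * TILE_COLS; i += blockDim.x){
        const int row = i / TILE_COLS;
        const int col = i % TILE_COLS;
        tile[row][col] = globalPssm[row * pitch + columnOffset + col];
    }
    __syncthreads();                                        // tile visible to every thread
    if(threadIdx.x == 0) out[blockIdx.x] = tile[0][0];      // keep the copy observable
}

int main(){
    const int pitch = 4 * TILE_COLS;                        // room for 4 tiles
    std::vector<float> h(PSSM_ROWS * pitch, 1.0f);
    float *d_pssm = nullptr, *d_out = nullptr;
    cudaMalloc(&d_pssm, h.size() * sizeof(float));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemcpy(d_pssm, h.data(), h.size() * sizeof(float), cudaMemcpyHostToDevice);
    loadPssmTileDemo<<<1, 128>>>(d_pssm, pitch, /*tileNr=*/1, d_out);
    cudaDeviceSynchronize();
    cudaFree(d_pssm); cudaFree(d_out);
    return 0;
}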
makeCaseInsensitive4 = [](char4 encoded4){ + unsigned int asUint; + memcpy(&asUint, &encoded4, sizeof(unsigned int)); + + if constexpr(subjectIsCaseSensitive){ + asUint = CaseSensitive_to_CaseInsensitive{}(asUint); + // asUint = ClampToInvalid{}(asUint); + } + + memcpy(&encoded4, &asUint, sizeof(unsigned int)); + return encoded4; + }; + + using Vec2T = typename Vectorized2::type; + + Vec2T* groupTempStorage = (Vec2T*)(((char*)tempStorage) + tempBytesPerGroup * groupIdInGrid); + + auto clearOutOfTileTempStorage = [&](int subjectLength){ + if(group.thread_rank() < group.size() - 1){ + groupTempStorage[subjectLength + group.thread_rank()] = Vec2T{}; + } + }; + + const int numAlignmentsRounded = SDIV(numAlignments, numGroupsPerBlock) * numGroupsPerBlock; + const int numTiles = SDIV(queryLength, groupsize * numItems); + + for(int alignmentId = groupIdInGrid; alignmentId < numAlignmentsRounded; alignmentId += numGroupsInGrid){ + + ScoreType scoresF[numItems]{}; + ScoreType scoresM[numItems]{}; + ScoreType scoreLeft; + ScoreType scoreDiag; + ScoreType E; + ScoreType maxObserved = oobscore; + int positionOfMaxObserved_y = 0; + int positionOfMaxObserved_tileNr = 0; + int positionOfMaxObserved_itemIndex = 0; + + Vec2T tileLastColumnM_E; + Vec2T leftBorderM_E; + + // #define PRINT_WRITE + // #define PRINT_LOAD + + auto printState = [&](int row){ + // for(int g = 0; g < 1; g++){ + // if(groupIdInGrid == g){ + // printf("printstate row %d, groupIdInGrid %d\n", row, groupIdInGrid); + // if(group.thread_rank() == 0){ + // printf("M\n"); + // } + // for(int t = 0; t < groupsize; t++){ + // if(t == group.thread_rank()){ + // for(int i = 0; i < numItems; i++){ + // printf("%3f ", scoresM[i]); + // } + // printf("\n"); + // } + // group.sync(); + // } + // if(group.thread_rank() == 0){ + // printf("\n"); + // } + + // if(group.thread_rank() == 0){ + // printf("F\n"); + // } + // for(int t = 0; t < groupsize; t++){ + // if(t == group.thread_rank()){ + // for(int i = 0; i < numItems; i++){ + // printf("%3f ", scoresF[i]); + // } + // printf("\n"); + // } + // group.sync(); + // } + // if(group.thread_rank() == 0){ + // printf("\n"); + // } + // } + // } + }; + + SequenceLengthT subjectLength = 0; + const char4* groupSubjectChar4 = nullptr; + int loadOffsetLimit = 0; + int subjectLoadOffset = 0; + char4 current4Letters; + int currentLetter; + int tempLoadOffset = 0; + int tempWriteOffset = 0; + + auto loadNext4Letters = [&](){ + if(subjectLoadOffset < loadOffsetLimit){ + current4Letters = makeCaseInsensitive4(groupSubjectChar4[subjectLoadOffset]); + subjectLoadOffset += group.size(); + }else{ + current4Letters = makeCaseInsensitive4(make_char4(badLetter, badLetter, badLetter, badLetter)); + } + }; + + auto shuffleCurrentLetter = [&](){ + currentLetter = group.shfl_up(currentLetter, 1); + }; + + auto shuffle4Letters = [&](){ + static_assert(sizeof(char4) == sizeof(int)); + int temp; + memcpy(&temp, ¤t4Letters, sizeof(char4)); + temp = group.shfl_down(temp, 1); + memcpy(¤t4Letters, &temp, sizeof(int)); + }; + + auto setTileLastColumn = [&](){ + if(group.thread_rank() == group.size() - 1){ + tileLastColumnM_E.x = scoresM[numItems-1]; + tileLastColumnM_E.y = E; + } + }; + + auto shuffleTileLastColumn = [&](){ + tileLastColumnM_E = group.shfl_down(tileLastColumnM_E, 1); + }; + auto shuffleLeftBorder = [&](){ + leftBorderM_E = group.shfl_down(leftBorderM_E, 1); + }; + + auto relaxFirstDiagonal = [&](int row, int tileNr){ + static_assert(numItems % 4 == 0); + + using Vec4T = typename Vectorized4::type; + const Vec4T* 
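// The subject is fetched as packed char4 values (four residues per 32-bit load) and the
// letters travel between lanes with warp shuffles: shfl_up hands the current letter up the
// anti-diagonal (shuffleCurrentLetter) while shfl_down rotates the next packed four letters
// toward thread 0 (shuffle4Letters). A minimal sketch of those two idioms with an assumed
// 8-thread tile; kernel name and the toy data are illustrative only:

#include <cstdio>
#include <cstring>
#include <cuda_runtime.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void letterShuffleDemo(const char4* subject){
    auto group = cg::tiled_partition<8>(cg::this_thread_block());
    char4 my4 = subject[group.thread_rank()];        // four consecutive letters per thread

    // rotate the packed letters one lane toward thread 0, as shuffle4Letters does
    int packed;
    memcpy(&packed, &my4, sizeof(int));
    packed = group.shfl_down(packed, 1);
    memcpy(&my4, &packed, sizeof(int));

    // pass a single letter one lane up the diagonal, as shuffleCurrentLetter does
    int letter = my4.x;
    letter = group.shfl_up(letter, 1);
    if(group.thread_rank() == 1) printf("thread 1 now holds letter %d\n", letter);
}

int main(){
    char4 h[8];
    for(int i = 0; i < 8; i++) h[i] = make_char4(4*i, 4*i+1, 4*i+2, 4*i+3);
    char4* d = nullptr;
    cudaMalloc(&d, sizeof(h));
    cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);
    letterShuffleDemo<<<1, 8>>>(d);
    cudaDeviceSynchronize();
    cudaFree(d);
    return 0;
}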
const pssmRow4 = reinterpret_cast(&shared_pssm.data[currentLetter][0]); + //const Vec4T* const pssmRow4 = reinterpret_cast(&strided_PSSM[currentLetter][0]); + Vec4T foo = pssmRow4[0 * groupsize + group.thread_rank()]; + ScoreType fooArray[4]; + memcpy(&fooArray[0], &foo, sizeof(Vec4T)); + + //in the first tile E is always computed. In succeeding tiles, E is already computed for the first thread (loaded from temp storage) + if(tileNr == 0){ + E = MathOps::add_max(scoreLeft, gapopenscore, MathOps::add(E, gapextendscore)); + }else{ + if(group.thread_rank() > 0){ + E = MathOps::add_max(scoreLeft, gapopenscore, MathOps::add(E, gapextendscore)); + } + } + + scoresF[0] = MathOps::add_max(scoresM[0], gapopenscore, MathOps::add(scoresF[0], gapextendscore)); + ScoreType upTempScore = scoresM[0]; + scoresM[0] = MathOps::add_max_relu(scoreDiag, fooArray[0], MathOps::max(E, scoresF[0])); + scoreDiag = upTempScore; + if constexpr (withEndPosition){ + if(maxObserved < scoresM[0]){ + maxObserved = scoresM[0]; + positionOfMaxObserved_tileNr = tileNr; + positionOfMaxObserved_itemIndex = 0; + positionOfMaxObserved_y = row; + } + }else{ + maxObserved = MathOps::max(maxObserved, scoresM[0]); + } + + + #pragma unroll + for(int k = 1; k < 4; k++){ + E = MathOps::add_max(scoresM[k-1], gapopenscore, MathOps::add(E, gapextendscore)); + scoresF[k] = MathOps::add_max(scoresM[k], gapopenscore, MathOps::add(scoresF[k], gapextendscore)); + ScoreType upTempScore = scoresM[k]; + scoresM[k] = MathOps::add_max_relu(scoreDiag, fooArray[k], MathOps::max(E, scoresF[k])); + scoreDiag = upTempScore; + if constexpr (withEndPosition){ + if(maxObserved < scoresM[k]){ + maxObserved = scoresM[k]; + positionOfMaxObserved_tileNr = tileNr; + positionOfMaxObserved_itemIndex = k; + positionOfMaxObserved_y = row; + } + }else{ + maxObserved = MathOps::max(maxObserved, scoresM[k]); + } + } + + #pragma unroll + for(int i = 1; i < numItems/4; i++){ + foo = pssmRow4[i * group.size() + group.thread_rank()]; + memcpy(&fooArray[0], &foo, sizeof(ScoreType) * 4); + + #pragma unroll + for(int k = 0; k < 4; k++){ + E = MathOps::add_max(scoresM[4*i + k-1], gapopenscore, MathOps::add(E, gapextendscore)); + scoresF[4*i + k] = MathOps::add_max(scoresM[4*i + k], gapopenscore, MathOps::add(scoresF[4*i + k], gapextendscore)); + ScoreType upTempScore = scoresM[4*i + k]; + scoresM[4*i + k] = MathOps::add_max_relu(scoreDiag, fooArray[k], MathOps::max(E, scoresF[4*i + k])); + scoreDiag = upTempScore; + if constexpr (withEndPosition){ + if(maxObserved < scoresM[4*i + k]){ + maxObserved = scoresM[4*i + k]; + positionOfMaxObserved_tileNr = tileNr; + positionOfMaxObserved_itemIndex = 4*i + k; + positionOfMaxObserved_y = row; + } + }else{ + maxObserved = MathOps::max(maxObserved, scoresM[4*i + k]); + } + } + } + + //advance E by 1 column and F by 1 row to allow for optimized computations of remaining diagonals + E = MathOps::add_max(scoresM[numItems-1], gapopenscore, MathOps::add(E, gapextendscore)); + for(int k = 0; k < numItems; k++){ + scoresF[k] = MathOps::add_max(scoresM[k], gapopenscore, MathOps::add(scoresF[k], gapextendscore)); + } + + //printState(row); + }; + + auto relax = [&](int row, int tileNr){ + static_assert(numItems % 4 == 0); + + using Vec4T = typename Vectorized4::type; + const Vec4T* const pssmRow4 = reinterpret_cast(&shared_pssm.data[currentLetter][0]); + //const Vec4T* const pssmRow4 = reinterpret_cast(&strided_PSSM[currentLetter][0]); + Vec4T foo = pssmRow4[0 * groupsize + group.thread_rank()]; + ScoreType fooArray[4]; + memcpy(&fooArray[0], 
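// relaxFirstDiagonal/relax above evaluate, per cell, the Gotoh affine-gap local alignment
// recurrence: E tracks the gap state while moving along the query (a gap in the subject),
// F while moving along the subject (a gap in the query), and M the best zero-clamped score.
// A plain scalar reference of the same recurrence, where gapopen is the score of the first
// gap character, matching the kernel's add_max calls; substitution function and the test
// sequences are illustrative assumptions:

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

static int subScore(char a, char b){ return a == b ? 2 : -1; }   // toy substitution score

int gotohLocalScore(const std::string& q, const std::string& s,
                    int gapopen, int gapextend){                 // both negative
    const int n = static_cast<int>(q.size());
    const int NEG = -(1 << 28);
    std::vector<int> H(n + 1, 0), F(n + 1, NEG);                 // previous row of M and F
    int best = 0;
    for(size_t i = 1; i <= s.size(); i++){
        int E = NEG;                                             // no gap left of column 0
        int diag = 0, Hleft = 0;                                 // M[i-1][0] and M[i][0]
        for(int j = 1; j <= n; j++){
            E    = std::max(Hleft + gapopen, E + gapextend);     // gap in the subject
            F[j] = std::max(H[j] + gapopen, F[j] + gapextend);   // gap in the query
            const int m = std::max({0, diag + subScore(q[j-1], s[i-1]), E, F[j]});
            diag  = H[j];                                        // diagonal for column j+1
            H[j]  = m;
            Hleft = m;
            best  = std::max(best, m);
        }
    }
    return best;
}

int main(){
    std::printf("%d\n", gotohLocalScore("HEAGAWGHEE", "PAWHEAE", -3, -1));
    return 0;
}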
&foo, sizeof(Vec4T)); + + // E of current column and scoresF of current row are already computed + + ScoreType tempM = scoresM[0]; + scoresM[0] = MathOps::add_max_relu(scoreDiag, fooArray[0], MathOps::max(E, scoresF[0])); + if constexpr (withEndPosition){ + if(maxObserved < scoresM[0]){ + maxObserved = scoresM[0]; + positionOfMaxObserved_tileNr = tileNr; + positionOfMaxObserved_itemIndex = 0; + positionOfMaxObserved_y = row; + } + }else{ + maxObserved = MathOps::max(maxObserved, scoresM[0]); + } + E = MathOps::add_max(scoresM[0],gapopenscore, MathOps::add(E, gapextendscore)); + scoresF[0] = MathOps::add_max(scoresM[0], gapopenscore, MathOps::add(scoresF[0], gapextendscore)); //this computes F of the next row ! + scoreDiag = tempM; + + #pragma unroll + for(int i = 1; i < 4; i++){ + tempM = scoresM[i]; + scoresM[i] = MathOps::add_max_relu(scoreDiag, fooArray[i], MathOps::max(E, scoresF[i])); + if constexpr (withEndPosition){ + if(maxObserved < scoresM[i]){ + maxObserved = scoresM[i]; + positionOfMaxObserved_tileNr = tileNr; + positionOfMaxObserved_itemIndex = i; + positionOfMaxObserved_y = row; + } + }else{ + maxObserved = MathOps::max(maxObserved, scoresM[i]); + } + E = MathOps::add_max(scoresM[i], gapopenscore, MathOps::add(E, gapextendscore)); + scoresF[i] = MathOps::add_max(scoresM[i], gapopenscore, MathOps::add(scoresF[i], gapextendscore)); //this computes F of the next row ! + scoreDiag = tempM; + } + + #pragma unroll + for(int k = 1; k < numItems/4; k++){ + foo = pssmRow4[k * groupsize + group.thread_rank()]; + memcpy(&fooArray[0], &foo, sizeof(Vec4T)); + + #pragma unroll + for(int i = 0; i < 4; i++){ + const int index = k*4+i; + tempM = scoresM[index]; + scoresM[index] = MathOps::add_max_relu(scoreDiag, fooArray[i], MathOps::max(E, scoresF[index])); + if constexpr (withEndPosition){ + if(maxObserved < scoresM[index]){ + maxObserved = scoresM[index]; + positionOfMaxObserved_tileNr = tileNr; + positionOfMaxObserved_itemIndex = index; + positionOfMaxObserved_y = row; + } + }else{ + maxObserved = MathOps::max(maxObserved, scoresM[index]); + } + E = MathOps::add_max(scoresM[index], gapopenscore, MathOps::add(E, gapextendscore)); + scoresF[index] = MathOps::add_max(scoresM[index], gapopenscore, MathOps::add(scoresF[index], gapextendscore)); //this computes F of the next row ! + scoreDiag = tempM; + } + + } + + //printState(row); + }; + + auto initScoresFirstTile = [&](){ + if(group.thread_rank() == 0){ + #pragma unroll + for (int i=0; i < numItems; i++) { + scoresM[i] = 0; + scoresF[i] = oobscore; + } + scoreDiag = 0; + scoreLeft = 0; + E = oobscore; + }else{ + #pragma unroll + for (int i=0; i < numItems; i++) { + scoresM[i] = oobscore; + scoresF[i] = oobscore; + } + scoreDiag = oobscore; + scoreLeft = group.thread_rank() == 1 ? 
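// relax() above reads each thread's slice of the PSSM row four scores at a time as one
// 128-bit Vec4T load and unpacks it with memcpy, so all numItems columns stay in registers.
// The load idiom in isolation; kernel name, the groupsize/numItems values and the dummy
// reduction are assumptions for this sketch:

#include <cstring>
#include <cuda_runtime.h>

template<int numItems>
__global__ void vectorizedRowLoadDemo(const float* __restrict__ pssmRow, float* out){
    static_assert(numItems % 4 == 0, "columns are processed in chunks of 4");
    const float4* row4 = reinterpret_cast<const float4*>(pssmRow);
    float regs[numItems];
    #pragma unroll
    for(int k = 0; k < numItems / 4; k++){
        const float4 v = row4[k * blockDim.x + threadIdx.x];   // one load instead of four
        memcpy(&regs[4 * k], &v, sizeof(float4));
    }
    float sum = 0.0f;
    #pragma unroll
    for(int k = 0; k < numItems; k++) sum += regs[k];          // keep the loads observable
    out[threadIdx.x] = sum;
}

int main(){
    constexpr int groupsize = 32, numItems = 8;
    float *d_row = nullptr, *d_out = nullptr;
    cudaMalloc(&d_row, groupsize * numItems * sizeof(float));  // cudaMalloc is 16-byte aligned
    cudaMalloc(&d_out, groupsize * sizeof(float));
    cudaMemset(d_row, 0, groupsize * numItems * sizeof(float));
    vectorizedRowLoadDemo<numItems><<<1, groupsize>>>(d_row, d_out);
    cudaDeviceSynchronize();
    cudaFree(d_row); cudaFree(d_out);
    return 0;
}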
0 : oobscore; + E = oobscore; + } + }; + + auto shuffleScoresFirstTile = [&](){ + scoreDiag = scoreLeft; + const ScoreType newscoreLeft = group.shfl_up(scoresM[numItems-1], 1); + const ScoreType newE = group.shfl_up(E, 1); + if(group.thread_rank() == 0){ + //scoreLeft is only modified in this function and is initialized with 0 for thread 0 + // assert(scoreLeft == 0); + //scoreLeft = 0; + + // E = oobscore; + E = gapopenscore; // After first diagonal was processed, thread 0 needs E of matrix column 1, not -infty + }else{ + scoreLeft = newscoreLeft; + E = newE; + } + }; + + auto initScoresNotFirstTile = [&](int tileNr){ + if(group.thread_rank() == 0){ + #pragma unroll + for (int i=0; i < numItems; i++) { + scoresM[i] = 0; + scoresF[i] = oobscore; + } + scoreDiag = 0; + scoreLeft = leftBorderM_E.x; + E = leftBorderM_E.y; + }else{ + #pragma unroll + for (int i=0; i < numItems; i++) { + scoresM[i] = oobscore; + scoresF[i] = oobscore; + } + scoreDiag = oobscore; + scoreLeft = group.thread_rank() == 1 ? 0 : oobscore; + E = oobscore; + } + }; + + auto shuffleScoresNotFirstTile = [&](){ + scoreDiag = scoreLeft; + const ScoreType newscoreLeft = group.shfl_up(scoresM[numItems-1], 1); + const ScoreType newE = group.shfl_up(E, 1); + if(group.thread_rank() == 0){ + scoreLeft = leftBorderM_E.x; + E = leftBorderM_E.y; + }else{ + scoreLeft = newscoreLeft; + E = newE; + } + }; + + //first tile + { + /* + ----------------------- + Process tile 0 + ----------------------- + */ + constexpr int tileNr = 0; + + //load pssm for tile 0. blockwide operation + load_PSSM(0); + + if(alignmentId < numAlignments){ + + const auto globalIndex = d_indices[alignmentId]; + subjectLength = devLengths[globalIndex]; + const auto charOffset = devOffsets[globalIndex]-devOffsets[0]; + groupSubjectChar4 = reinterpret_cast(&devChars[charOffset]); + clearOutOfTileTempStorage(subjectLength); + + // if(threadIdx.x == 0){ + // printf("subjectLength %d, queryLength %d, globalIndex %d, offset %lu\n", subjectLength, queryLength, globalIndex, charOffset); + // } + + loadOffsetLimit = SDIV(subjectLength, 4); + subjectLoadOffset = group.thread_rank(); + loadNext4Letters(); + currentLetter = badLetter; + + tempWriteOffset = group.thread_rank(); + + initScoresFirstTile(); + + const int numRows = (subjectLength + 1) + (groupsize-1); + int r = 1; + + //process first groupsize - 1 diagonals which contain out-of-bound threads + { + if(group.thread_rank() == 0){ currentLetter = current4Letters.x; } + relaxFirstDiagonal(r, tileNr); //x + shuffleScoresFirstTile(); + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.y; } + relax(r+1, tileNr); //y + shuffleScoresFirstTile(); + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.z; } + relax(r+2, tileNr); //z + shuffleScoresFirstTile(); + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.w; } + shuffle4Letters(); + + r = 4; + for(; r < groupsize - 1; r += 4){ + relax(r, tileNr); //w + shuffleScoresFirstTile(); + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.x; } + relax(r+1, tileNr); //x + shuffleScoresFirstTile(); + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.y; } + relax(r+2, tileNr); //y + shuffleScoresFirstTile(); + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.z; } + relax(r+3, tileNr); //z + shuffleScoresFirstTile(); + shuffleCurrentLetter(); + + 
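// shuffleScoresFirstTile/shuffleScoresNotFirstTile move scores one lane up the group per
// diagonal: thread 0 injects the border value for the new column while every other thread
// inherits its left neighbour's previous value via shfl_up. Stripped of the DP state the
// dataflow is a plain lane shift register; the step counters injected by thread 0 below are
// stand-ins for the real border scores, and the 8-thread tile and names are assumptions:

#include <cstdio>
#include <cuda_runtime.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void laneShiftRegisterDemo(){
    auto group = cg::tiled_partition<8>(cg::this_thread_block());
    int held = (group.thread_rank() == 0) ? 0 : -1;       // only thread 0 starts with data
    for(int step = 1; step <= 8; step++){
        const int fromLeft = group.shfl_up(held, 1);      // neighbour's value from last step
        if(group.thread_rank() == 0){
            held = step;                                  // thread 0 injects a fresh value
        }else{
            held = fromLeft;                              // everyone else inherits it
        }
    }
    printf("thread %d ends with %d\n", group.thread_rank(), held);  // prints 8 - rank
}

int main(){
    laneShiftRegisterDemo<<<1, 8>>>();
    cudaDeviceSynchronize();
    return 0;
}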
if(group.thread_rank() == 0){ currentLetter = current4Letters.w; } + shuffle4Letters(); + } + } + + //process remaining diagonals. process in chunks of 4 diagonals. + //for those diagonals we need to store the last column of the tile to temp memory + //last column is stored in "rightBorder" + + //r starts with r=max(4, groupsize) + for(; r < numRows - 3; r += 4){ + + relax(r, tileNr); //w + shuffleTileLastColumn(); //must be called before setTileLastColumn + setTileLastColumn(); //must be called before shuffleScores + shuffleScoresFirstTile(); + shuffleCurrentLetter(); + + + if(group.thread_rank() == 0){ currentLetter = current4Letters.x; } + relax(r+1, tileNr); //x + shuffleTileLastColumn(); //must be called before setTileLastColumn + setTileLastColumn(); //must be called before shuffleScores + shuffleScoresFirstTile(); + shuffleCurrentLetter(); + + + if(group.thread_rank() == 0){ currentLetter = current4Letters.y; } + relax(r+2, tileNr); //y + shuffleTileLastColumn(); //must be called before setTileLastColumn + setTileLastColumn(); //must be called before shuffleScores + shuffleScoresFirstTile(); + shuffleCurrentLetter(); + + + if(group.thread_rank() == 0){ currentLetter = current4Letters.z; } + relax(r+3, tileNr); //z + shuffleTileLastColumn(); //must be called before setTileLastColumn + setTileLastColumn(); //must be called before shuffleScores + shuffleScoresFirstTile(); + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.w; } + + if((r + 4) % (4*group.size()) == 0){ + //used up all query letters stored across the group. reload + loadNext4Letters(); + }else{ + //get next 4 letters from neighbor + shuffle4Letters(); + } + + if((r + 4) % (group.size()) == 0){ + #ifdef PRINT_WRITE + printf("tid %d, write %f %f to %d\n", group.thread_rank(), tileLastColumnM_E.x, tileLastColumnM_E.y, tempWriteOffset); + #endif + groupTempStorage[tempWriteOffset] = tileLastColumnM_E; + tempWriteOffset += group.size(); + } + } + + //can have at most 3 remaining rows + if(r < numRows){ + relax(r, tileNr); //w + shuffleTileLastColumn(); //must be called before setTileLastColumn + setTileLastColumn(); //must be called before shuffleScores + shuffleScoresFirstTile(); + shuffleCurrentLetter(); + + } + if(r+1 < numRows){ + if(group.thread_rank() == 0){ currentLetter = current4Letters.x; } + relax(r+1, tileNr); //x + shuffleTileLastColumn(); //must be called before setTileLastColumn + setTileLastColumn(); //must be called before shuffleScores + shuffleScoresFirstTile(); + shuffleCurrentLetter(); + } + if(r+2 < numRows){ + if(group.thread_rank() == 0){ currentLetter = current4Letters.y; } + relax(r+2, tileNr); //y + shuffleTileLastColumn(); //must be called before setTileLastColumn + setTileLastColumn(); //must be called before shuffleScores + } + + const int totalChunksOfFour = subjectLength / 4; + const int unsavedChunksOfFour = totalChunksOfFour % (group.size() / 4); + const int numThreadsWithValidTileLastColumn = unsavedChunksOfFour * 4 + subjectLength % 4; + if(numThreadsWithValidTileLastColumn > 0){ + const int firstValidThread = group.size() - numThreadsWithValidTileLastColumn; + if(group.thread_rank() >= firstValidThread){ + #ifdef PRINT_WRITE + printf("last write. 
tid %d, write %f %f to %d\n", group.thread_rank(), tileLastColumnM_E.x, tileLastColumnM_E.y, tempWriteOffset - firstValidThread); + #endif + groupTempStorage[tempWriteOffset - firstValidThread] = tileLastColumnM_E; + } + } + } + } + + + + for(int tileNr = 1; tileNr < numTiles; tileNr++){ + load_PSSM(tileNr); + + /* + ----------------------- + Process tile tileNr + ----------------------- + */ + + if(alignmentId < numAlignments){ + + subjectLoadOffset = group.thread_rank(); + loadNext4Letters(); + currentLetter = badLetter; + + tempWriteOffset = group.thread_rank(); + + #ifdef PRINT_LOAD + printf("tid %d, load %f %f from %d\n", group.thread_rank(), groupTempStorage[group.thread_rank()].x, groupTempStorage[group.thread_rank()].y, group.thread_rank()); + #endif + leftBorderM_E = groupTempStorage[group.thread_rank()]; + tempLoadOffset = group.size() + group.thread_rank(); + + + initScoresNotFirstTile(tileNr); + + const int numRows = (subjectLength + 1) + (groupsize-1); + int r = 1; + + //process first groupsize - 1 diagonals which contain out-of-bound threads + { + if(group.thread_rank() == 0){ currentLetter = current4Letters.x; } + relaxFirstDiagonal(r, tileNr); //x + shuffleLeftBorder(); //must be called before shuffleScores + shuffleScoresNotFirstTile(); + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.y; } + relax(r+1, tileNr); //y + shuffleLeftBorder(); //must be called before shuffleScores + shuffleScoresNotFirstTile(); + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.z; } + relax(r+2, tileNr); //z + shuffleLeftBorder(); //must be called before shuffleScores + shuffleScoresNotFirstTile(); + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.w; } + shuffle4Letters(); + + r = 4; + for(; r < groupsize - 1; r += 4){ + relax(r, tileNr); //w + shuffleLeftBorder(); //must be called before shuffleScores + shuffleScoresNotFirstTile(); + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.x; } + relax(r+1, tileNr); //x + shuffleLeftBorder(); //must be called before shuffleScores + shuffleScoresNotFirstTile(); + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.y; } + relax(r+2, tileNr); //y + shuffleLeftBorder(); //must be called before shuffleScores + shuffleScoresNotFirstTile(); + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.z; } + relax(r+3, tileNr); //z + shuffleLeftBorder(); //must be called before shuffleScores + shuffleScoresNotFirstTile(); + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.w; } + shuffle4Letters(); + } + } + + //process remaining diagonals. process in chunks of 4 diagonals. 
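// Between query tiles, each group streams its last DP column (an M/E pair per subject row)
// into a private slice of a global scratch buffer and reads that slice back as the left
// border of the next tile. A host-side sketch of the slicing arithmetic; the capacity
// formula is an assumption consistent with clearOutOfTileTempStorage above, and the real
// buffer is allocated elsewhere in the library:

#include <cstddef>
#include <cuda_runtime.h>

inline size_t tempBytesPerGroupSketch(int maxSubjectLength, int groupsize){
    // one float2 (M, E) per subject position, plus groupsize-1 padding entries that the
    // kernel clears so out-of-bounds lanes read neutral values
    return sizeof(float2) * (static_cast<size_t>(maxSubjectLength) + groupsize - 1);
}

int main(){
    const int numGroupsInGrid = 1024, groupsize = 16, maxSubjectLength = 4096;
    const size_t perGroup = tempBytesPerGroupSketch(maxSubjectLength, groupsize);
    char* d_temp = nullptr;
    cudaMalloc(&d_temp, perGroup * numGroupsInGrid);
    // a group with index groupIdInGrid would then slice its region exactly like the kernel:
    //   float2* groupTempStorage = (float2*)(d_temp + perGroup * groupIdInGrid);
    cudaFree(d_temp);
    return 0;
}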
+ //for those diagonals we need to store the last column of the tile to temp memory + //last column is stored in "rightBorder" + + //r starts with r=max(4, groupsize) + for(; r < numRows - 3; r += 4){ + + relax(r, tileNr); //w + shuffleTileLastColumn(); //must be called before setTileLastColumn + setTileLastColumn(); //must be called before shuffleScores + if(r % group.size() == 0 && r < subjectLength){ + #ifdef PRINT_LOAD + printf("tid %d, load %f %f from %d\n", group.thread_rank(), groupTempStorage[tempLoadOffset].x, groupTempStorage[tempLoadOffset].y, tempLoadOffset); + #endif + leftBorderM_E = groupTempStorage[tempLoadOffset]; + tempLoadOffset += group.size(); + }else{ + shuffleLeftBorder(); //must be called before shuffleScores + } + shuffleScoresNotFirstTile(); + shuffleCurrentLetter(); + + + + + if(group.thread_rank() == 0){ currentLetter = current4Letters.x; } + relax(r+1, tileNr); //x + shuffleTileLastColumn(); //must be called before setTileLastColumn + setTileLastColumn(); //must be called before shuffleScores + shuffleLeftBorder(); //must be called before shuffleScores + shuffleScoresNotFirstTile(); + shuffleCurrentLetter(); + + + if(group.thread_rank() == 0){ currentLetter = current4Letters.y; } + relax(r+2, tileNr); //y + shuffleTileLastColumn(); //must be called before setTileLastColumn + setTileLastColumn(); //must be called before shuffleScores + shuffleLeftBorder(); //must be called before shuffleScores + shuffleScoresNotFirstTile(); + shuffleCurrentLetter(); + + + if(group.thread_rank() == 0){ currentLetter = current4Letters.z; } + relax(r+3, tileNr); //z + shuffleTileLastColumn(); //must be called before setTileLastColumn + setTileLastColumn(); //must be called before shuffleScores + shuffleLeftBorder(); //must be called before shuffleScores + shuffleScoresNotFirstTile(); + shuffleCurrentLetter(); + + if(group.thread_rank() == 0){ currentLetter = current4Letters.w; } + + if((r + 4) % (4*group.size()) == 0){ + //used up all query letters stored across the group. reload + loadNext4Letters(); + }else{ + //get next 4 letters from neighbor + shuffle4Letters(); + } + + if((r + 4) % (group.size()) == 0){ + #ifdef PRINT_WRITE + printf("tid %d, write %f %f to %d\n", group.thread_rank(), tileLastColumnM_E.x, tileLastColumnM_E.y, tempWriteOffset); + #endif + groupTempStorage[tempWriteOffset] = tileLastColumnM_E; + tempWriteOffset += group.size(); + } + } + + //can have at most 3 remaining rows + if(r < numRows){ + relax(r, tileNr); //w + shuffleTileLastColumn(); //must be called before setTileLastColumn + setTileLastColumn(); //must be called before shuffleScores + if(r % group.size() == 0 && r < subjectLength){ + #ifdef PRINT_LOAD + printf("last load. 
tid %d, load %f %f from %d\n", group.thread_rank(), groupTempStorage[tempLoadOffset].x, groupTempStorage[tempLoadOffset].y, tempLoadOffset); + #endif + leftBorderM_E = groupTempStorage[tempLoadOffset]; + tempLoadOffset += group.size(); + }else{ + shuffleLeftBorder(); //must be called before shuffleScores + } + shuffleScoresNotFirstTile(); + shuffleCurrentLetter(); + + + } + if(r+1 < numRows){ + if(group.thread_rank() == 0){ currentLetter = current4Letters.x; } + relax(r+1, tileNr); //x + shuffleTileLastColumn(); //must be called before setTileLastColumn + setTileLastColumn(); //must be called before shuffleScores + shuffleLeftBorder(); //must be called before shuffleScores + shuffleScoresNotFirstTile(); + shuffleCurrentLetter(); + + } + if(r+2 < numRows){ + if(group.thread_rank() == 0){ currentLetter = current4Letters.y; } + relax(r+2, tileNr); //y + shuffleTileLastColumn(); //must be called before setTileLastColumn + setTileLastColumn(); //must be called before shuffleScores + } + + const int totalChunksOfFour = subjectLength / 4; + const int unsavedChunksOfFour = totalChunksOfFour % (group.size() / 4); + const int numThreadsWithValidTileLastColumn = unsavedChunksOfFour * 4 + subjectLength % 4; + if(numThreadsWithValidTileLastColumn > 0){ + const int firstValidThread = group.size() - numThreadsWithValidTileLastColumn; + if(group.thread_rank() >= firstValidThread){ + #ifdef PRINT_WRITE + printf("last write. tid %d, write %f %f\n", group.thread_rank(), tileLastColumnM_E.x, tileLastColumnM_E.y); + #endif + groupTempStorage[tempWriteOffset - firstValidThread] = tileLastColumnM_E; + } + } + } + } + //printState(r+3); + + if constexpr (withEndPosition){ + + if(alignmentId < numAlignments){ + const int3 packed = make_int3(maxObserved, + positionOfMaxObserved_tileNr * groupsize * numItems + group.thread_rank() * numItems + positionOfMaxObserved_itemIndex, + positionOfMaxObserved_y - group.thread_rank() - 1); + const int3 maxPacked = cg::reduce(group, packed, [](int3 l, int3 r){ + if(l.x > r.x){ + return l; + }else{ + return r; + } + }); + + if(group.thread_rank() == 0){ + ScoreWithExtra res(maxPacked.x, AlignmentEndPosition{maxPacked.y, maxPacked.z}); + + devAlignmentScores[alignmentId] = res; + //devAlignmentScores[alignmentId] = maxPacked.x; + //endPositionOutput[alignmentId] = make_int2(maxPacked.y, maxPacked.z); + } + } + }else{ + if(alignmentId < numAlignments){ + maxObserved = cg::reduce(group, maxObserved, cg::greater{}); + + if(group.thread_rank() == 0){ + ScoreWithExtra res(maxObserved, AlignmentEndPosition{0, 0}); + + devAlignmentScores[alignmentId] = res; + //devAlignmentScores[alignmentId] = maxObserved; + } + } + } + } + +} + + + + + + +template< + class ScoreType, + int blocksize, + int groupsize, + int numItems, + bool withEndPosition, + bool subjectIsCaseSensitive, + class ScoreOutputIterator, + class PositionsIterator +> +void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_multitile( + int numThreadBlocks, + const char * const devChars, + ScoreOutputIterator const devAlignmentScores, + const size_t* const devOffsets, + const SequenceLengthT* const devLengths, + PositionsIterator const d_indices, + const int numAlignments, + const SequenceLengthT queryLength, + const PSSM_2D_View& strided_PSSM, + const ScoreType gapopenscore, + const ScoreType gapextendscore, + char* const tempStorage, + const size_t tempBytesPerGroup, + cudaStream_t stream +){ + //constexpr int groupsPerBlock = blocksize / group_size; + //constexpr int alignmentsPerGroup = 1; + //constexpr int 
alignmentsPerBlock = groupsPerBlock * alignmentsPerGroup; + // std::cout << "blocksize " << blocksize << ", group_size " << group_size + // << ", alignmentsPerBlock " << alignmentsPerBlock << ", numAlignments " << numAlignments << "\n"; + + constexpr int numRowsPSSM = 21; + constexpr int numColumnsPSSM = groupsize * numItems; + using SPSSM = SharedPSSM_singletile; + int smem = sizeof(SPSSM); + auto kernel = amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_multitile< + ScoreType, + blocksize, + groupsize, + numItems, + withEndPosition, + subjectIsCaseSensitive, + ScoreOutputIterator, + PositionsIterator>; + + auto setSmemKernelAttribute = [&](){ + static std::map isSet; + if(smem > 48*1024){ + int deviceId; + cudaGetDevice(&deviceId); CUERR; + if(!isSet[deviceId]){ + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); CUERR; + isSet[deviceId] = true; + } + } + }; + setSmemKernelAttribute(); + + dim3 grid = std::min(numAlignments, numThreadBlocks); + + kernel<<>>( + devChars, + devAlignmentScores, + devOffsets, + devLengths, + d_indices, + numAlignments, + queryLength, + strided_PSSM, + gapopenscore, + gapextendscore, + tempStorage, + tempBytesPerGroup + ); CUERR; +} + + + + + + + + +#define ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator decltype(thrust::make_counting_iterator(0)) +#define withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + extern template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_multitile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const float, \ + const float, \ + char* const, \ + const size_t, \ + cudaStream_t \ + ); + + PSSM_SW_ENDPOS_MULTITILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef subjectIsCaseSensitive +#undef withEndPosition +#undef PositionsIterator +#undef ScoreOutputIterator + +#define ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator ReferenceIdT* +#define withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + extern template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_multitile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const float, \ + const float, \ + char* const, \ + const size_t, \ + cudaStream_t \ + ); + + PSSM_SW_ENDPOS_MULTITILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef subjectIsCaseSensitive +#undef withEndPosition +#undef PositionsIterator +#undef ScoreOutputIterator + + + + +#define ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator decltype(thrust::make_counting_iterator(0)) +#define withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + extern template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_multitile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const int, \ + const int, \ + char* const, \ + const size_t, \ + cudaStream_t \ + ); + + PSSM_SW_ENDPOS_MULTITILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef 
subjectIsCaseSensitive +#undef withEndPosition +#undef PositionsIterator +#undef ScoreOutputIterator + +#define ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator ReferenceIdT* +#define withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + extern template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_multitile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const int, \ + const int, \ + char* const, \ + const size_t, \ + cudaStream_t \ + ); + + PSSM_SW_ENDPOS_MULTITILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef subjectIsCaseSensitive +#undef withEndPosition +#undef PositionsIterator +#undef ScoreOutputIterator + + + + + + +template< + class ScoreType, + int blocksize, + bool withEndPosition, + bool subjectIsCaseSensitive, + class ScoreOutputIterator, + class PositionsIterator +> +void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_multitile( + int numThreadBlocks, + int groupsize, + int numItems, + const char * const devChars, + ScoreOutputIterator const devAlignmentScores, + const size_t* const devOffsets, + const SequenceLengthT* const devLengths, + PositionsIterator const d_indices, + const int numAlignments, + const SequenceLengthT queryLength, + const PSSM_2D_View& strided_PSSM, + const ScoreType gapopenscore, + const ScoreType gapextendscore, + char* const tempStorage, + const size_t tempBytesPerGroup, + cudaStream_t stream +){ + #define X(g,r) \ + if(groupsize == g && numItems == r){ \ + call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_multitile( \ + numThreadBlocks, devChars, devAlignmentScores, devOffsets, devLengths, \ + d_indices, numAlignments, queryLength, strided_PSSM, \ + gapopenscore, gapextendscore, tempStorage, tempBytesPerGroup, stream \ + ); \ + } else + + PSSM_SW_ENDPOS_MULTITILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + { throw std::runtime_error("invalid groupsize/numregs config");} + + #undef X +} + + + +} //namespace cudasw4 + + + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/pssmkernels_smithwaterman_instantiation_dpx.cu b/lib/libmarv/src/pssmkernels_smithwaterman_instantiation_dpx.cu new file mode 100644 index 000000000..bf4ef20b5 --- /dev/null +++ b/lib/libmarv/src/pssmkernels_smithwaterman_instantiation_dpx.cu @@ -0,0 +1,136 @@ +//#include "validtileconfigs.hpp" +#include "util.cuh" +#include "config.hpp" +#include "pssmkernels_smithwaterman.cuh" + +#include + +namespace cudasw4{ + + +#define ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator decltype(thrust::make_counting_iterator(0)) +#define withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_singletile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const int, \ + const int, \ + cudaStream_t \ + ); + +PSSM_SW_ENDPOS_SINGLETILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef subjectIsCaseSensitive +#undef withEndPosition +#undef SequenceLengthT +#undef PositionsIterator +#undef ScoreOutputIterator + + +#define ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator ReferenceIdT* +#define 
withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_singletile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const int, \ + const int, \ + cudaStream_t \ + ); + +PSSM_SW_ENDPOS_SINGLETILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef subjectIsCaseSensitive +#undef withEndPosition +#undef SequenceLengthT +#undef PositionsIterator +#undef ScoreOutputIterator + + + +#define ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator decltype(thrust::make_counting_iterator(0)) +#define withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_multitile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const int, \ + const int, \ + char* const, \ + const size_t, \ + cudaStream_t \ + ); + + PSSM_SW_ENDPOS_MULTITILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef subjectIsCaseSensitive +#undef withEndPosition +#undef PositionsIterator +#undef ScoreOutputIterator + + + +#define ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator ReferenceIdT* +#define withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_multitile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const int, \ + const int, \ + char* const, \ + const size_t, \ + cudaStream_t \ + ); + + PSSM_SW_ENDPOS_MULTITILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef subjectIsCaseSensitive +#undef withEndPosition +#undef PositionsIterator +#undef ScoreOutputIterator + + + +} //namespace cudasw4 \ No newline at end of file diff --git a/lib/libmarv/src/pssmkernels_smithwaterman_instantiation_float.cu b/lib/libmarv/src/pssmkernels_smithwaterman_instantiation_float.cu new file mode 100644 index 000000000..9651f74fe --- /dev/null +++ b/lib/libmarv/src/pssmkernels_smithwaterman_instantiation_float.cu @@ -0,0 +1,136 @@ +//#include "validtileconfigs.hpp" +#include "util.cuh" +#include "config.hpp" +#include "pssmkernels_smithwaterman.cuh" + +#include + +namespace cudasw4{ + + +#define ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator decltype(thrust::make_counting_iterator(0)) +#define withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_singletile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const float, \ + const float, \ + cudaStream_t \ + ); + +PSSM_SW_ENDPOS_SINGLETILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef subjectIsCaseSensitive +#undef withEndPosition +#undef SequenceLengthT +#undef PositionsIterator +#undef ScoreOutputIterator + + +#define 
ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator ReferenceIdT* +#define withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_singletile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const float, \ + const float, \ + cudaStream_t \ + ); + +PSSM_SW_ENDPOS_SINGLETILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef subjectIsCaseSensitive +#undef withEndPosition +#undef SequenceLengthT +#undef PositionsIterator +#undef ScoreOutputIterator + + + +#define ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator decltype(thrust::make_counting_iterator(0)) +#define withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_multitile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const float, \ + const float, \ + char* const, \ + const size_t, \ + cudaStream_t \ + ); + + PSSM_SW_ENDPOS_MULTITILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef subjectIsCaseSensitive +#undef withEndPosition +#undef PositionsIterator +#undef ScoreOutputIterator + + + +#define ScoreOutputIterator TopNMaximaArrayWithExtra +#define PositionsIterator ReferenceIdT* +#define withEndPosition true +#define subjectIsCaseSensitive true +#define X(g,r) \ + template void call_amino_gpu_localAlignmentKernel_affinegap_floatOrInt_pssm_multitile( \ + int, \ + const char * const, \ + ScoreOutputIterator const, \ + const size_t* const, \ + const SequenceLengthT* const, \ + PositionsIterator const, \ + const int, \ + const SequenceLengthT, \ + const PSSM_2D_View&, \ + const float, \ + const float, \ + char* const, \ + const size_t, \ + cudaStream_t \ + ); + + PSSM_SW_ENDPOS_MULTITILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + +#undef X +#undef subjectIsCaseSensitive +#undef withEndPosition +#undef PositionsIterator +#undef ScoreOutputIterator + + + +} //namespace cudasw4 \ No newline at end of file diff --git a/lib/libmarv/src/sequence_io.cpp b/lib/libmarv/src/sequence_io.cpp new file mode 100644 index 000000000..fa2c42916 --- /dev/null +++ b/lib/libmarv/src/sequence_io.cpp @@ -0,0 +1,882 @@ +/*************************************************************************//** + * + * @file single and pairwise reading of sequences from FASTA/FASTQ files + * + * (c) 2017-2023 André Müller (mail@andremueller-online.de) + * MIT License + * + *****************************************************************************/ + +#include +#include +#include + +#include "sequence_io.h" + +using std::string; + + +//----------------------------------------------------------------------------- +// BATCHED READING +//----------------------------------------------------------------------------- +sequence_batch +read_all_sequences_from_file (const string& filename, int align) +{ + sequence_batch batch; + + read_all_sequences_from_file(filename, batch, align); + + return batch; +} + + +//------------------------------------------------------------------- +void read_all_sequences_from_file (const string& filename, sequence_batch& out, + int alignment) +{ + auto 
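// The instantiation .cu files above pin every supported (groupsize, numItems) kernel
// configuration down once: headers mark each configuration `extern template`, one
// translation unit provides the explicit instantiations, and a runtime wrapper maps the
// requested configuration onto them. The same pattern in miniature, with illustrative
// names and only two configurations:

#include <cstdio>
#include <stdexcept>

template<int groupsize, int numItems>
void callKernelConfig(int numAlignments){              // stand-in for the real launcher
    std::printf("config <%d,%d> on %d alignments\n", groupsize, numItems, numAlignments);
}

// what a header would declare, so ordinary TUs never instantiate the template themselves
extern template void callKernelConfig<16, 8>(int);
extern template void callKernelConfig<32, 4>(int);

// what a dedicated instantiation file provides, exactly once per configuration
// (the X macros above expand to one such line per valid config)
template void callKernelConfig<16, 8>(int);
template void callKernelConfig<32, 4>(int);

// runtime dispatch, as the non-template launcher overload does with its X macro chain
void callKernel(int groupsize, int numItems, int numAlignments){
    if(groupsize == 16 && numItems == 8)      callKernelConfig<16, 8>(numAlignments);
    else if(groupsize == 32 && numItems == 4) callKernelConfig<32, 4>(numAlignments);
    else throw std::runtime_error("invalid groupsize/numItems config");
}

int main(){
    callKernel(16, 8, 1000);
    callKernel(32, 4, 1000);
    return 0;
}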
reader = make_sequence_reader(filename); + + out.chars.clear(); + out.offsets.clear(); + out.lengths.clear(); + out.headers.clear(); + out.qualities.clear(); + + if (!reader) return; + + sequence_reader::data_type buffer; + while (reader->has_next()) { + reader->next_data(buffer); + if (!buffer.empty()) { + // pad if alignment criterion not met + const auto miss = out.chars.size() % alignment; + if (miss > 0) out.chars.insert(out.chars.end(), alignment-miss, ' '); + out.offsets.push_back(out.chars.size()); + out.lengths.push_back(buffer.size()); + out.chars.insert(out.chars.end(), buffer.begin(), buffer.end()); + } + } + out.offsets.push_back(out.chars.size()); +} + + + + +//------------------------------------------------------------------- +sequence_batch +read_all_sequences_and_headers_from_file (const string& filename, int align) +{ + sequence_batch batch; + + read_all_sequences_and_headers_from_file(filename, batch, align); + + return batch; +} + + +//------------------------------------------------------------------- +void read_all_sequences_and_headers_from_file (const string& filename, + sequence_batch& out, + int alignment) +{ + auto reader = make_sequence_reader(filename); + + out.chars.clear(); + out.offsets.clear(); + out.lengths.clear(); + out.headers.clear(); + out.qualities.clear(); + + if (!reader) return; + + sequence_reader::header_type headerBuf; + sequence_reader::data_type dataBuf; + + while (reader->has_next()) { + reader->next_header_and_data(headerBuf, dataBuf); + if (!dataBuf.empty()) { + // pad if alignment criterion not met + const auto miss = out.chars.size() % alignment; + if (miss > 0) out.chars.insert(out.chars.end(), alignment-miss, ' '); + out.offsets.push_back(out.chars.size()); + out.lengths.push_back(dataBuf.size()); + out.chars.insert(out.chars.end(), dataBuf.begin(), dataBuf.end()); + out.headers.push_back(headerBuf); + } + } + out.offsets.push_back(out.chars.size()); +} + + + + +//------------------------------------------------------------------- +sequence_batch +read_all_sequences_and_meta_info_from_file (const string& filename, int align) +{ + sequence_batch batch; + + read_all_sequences_and_meta_info_from_file(filename, batch, align); + + return batch; +} + + +//------------------------------------------------------------------- +void read_all_sequences_and_meta_info_from_file (const string& filename, + sequence_batch& out, + int alignment) +{ + auto reader = make_sequence_reader(filename); + + out.chars.clear(); + out.offsets.clear(); + out.lengths.clear(); + out.headers.clear(); + out.qualities.clear(); + + if (!reader) return; + + sequence_reader::header_type headerBuf; + sequence_reader::data_type dataBuf; + sequence_reader::qualities_type qualBuf; + + while (reader->has_next()) { + reader->next_header_data_qualities(headerBuf, dataBuf, qualBuf); + if (!dataBuf.empty()) { + // pad if alignment criterion not met + const auto miss = out.chars.size() % alignment; + if (miss > 0) out.chars.insert(out.chars.end(), alignment-miss, ' '); + out.offsets.push_back(out.chars.size()); + out.lengths.push_back(dataBuf.size()); + out.chars.insert(out.chars.end(), dataBuf.begin(), dataBuf.end()); + out.headers.push_back(headerBuf); + out.qualities.push_back(qualBuf); + } + } + out.offsets.push_back(out.chars.size()); +} + + + + +//----------------------------------------------------------------------------- +// SEQUENCE_READER B A S E +//----------------------------------------------------------------------------- +sequence_reader::sequence 
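// read_all_sequences_from_file and its header/metadata variants above pad the flat
// character buffer with blanks so every sequence starts at a multiple of `alignment`,
// and record one offset and length per sequence plus a trailing end offset. The same
// packing loop on in-memory strings, with no file I/O; names are illustrative:

#include <cstdio>
#include <string>
#include <vector>

struct PackedBatch {
    std::vector<char> chars;
    std::vector<size_t> offsets;   // one entry per sequence, plus a final end offset
    std::vector<size_t> lengths;
};

PackedBatch pack(const std::vector<std::string>& seqs, int alignment){
    PackedBatch out;
    for(const auto& s : seqs){
        if(s.empty()) continue;
        const size_t miss = out.chars.size() % alignment;
        if(miss > 0) out.chars.insert(out.chars.end(), alignment - miss, ' ');  // pad with blanks
        out.offsets.push_back(out.chars.size());
        out.lengths.push_back(s.size());
        out.chars.insert(out.chars.end(), s.begin(), s.end());
    }
    out.offsets.push_back(out.chars.size());   // sentinel end offset
    return out;
}

int main(){
    const auto b = pack({"MKT", "ACDEFGHIK", "WY"}, 4);
    for(size_t i = 0; i + 1 < b.offsets.size(); i++)
        std::printf("seq %zu: offset %zu length %zu\n", i, b.offsets[i], b.lengths[i]);
    return 0;
}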
+sequence_reader::next () +{ + sequence seq; + next(seq); + return seq; +} + + + +//------------------------------------------------------------------- +void sequence_reader::next (sequence& seq) +{ + if (!has_next()) return; + + ++index_; + seq.index = index_; + read_next(&seq.header, &seq.data, &seq.qualities); +} + + + +//------------------------------------------------------------------- +sequence_reader::header_type +sequence_reader::next_header () +{ + if (!has_next()) return header_type{}; + + ++index_; + header_type header; + read_next(&header, nullptr, nullptr); + return header; +} + + + +//------------------------------------------------------------------- +sequence_reader::data_type +sequence_reader::next_data () +{ + if (!has_next()) return data_type{}; + + ++index_; + data_type data; + read_next(nullptr, &data, nullptr); + return data; +} + + + +//------------------------------------------------------------------- +sequence_reader::index_type +sequence_reader::next_data (sequence::data_type& data) +{ + if (!has_next()) { + data.clear(); + return index(); + } + + ++index_; + read_next(nullptr, &data, nullptr); + return index_; +} + + + +//------------------------------------------------------------------- +sequence_reader::index_type +sequence_reader::next_header_and_data (sequence::header_type& header, + sequence::data_type& data) +{ + if (!has_next()) { + header.clear(); + data.clear(); + return index(); + } + + ++index_; + read_next(&header, &data, nullptr); + return index_; +} + + + +//------------------------------------------------------------------- +sequence_reader::index_type +sequence_reader::next_data_and_qualities ( + sequence::data_type& data, sequence::qualities_type& qual) +{ + if (!has_next()) { + data.clear(); + qual.clear(); + return index(); + } + + ++index_; + read_next(nullptr, &data, &qual); + return index_; +} + + + +//------------------------------------------------------------------- +sequence_reader::index_type +sequence_reader::next_header_data_qualities ( + sequence::header_type& header, + sequence::data_type& data, + sequence::qualities_type& qual) +{ + if (!has_next()) { + header.clear(); + data.clear(); + qual.clear(); + return index(); + } + + ++index_; + read_next(&header, &data, &qual); + return index_; +} + + + +//------------------------------------------------------------------- +void sequence_reader::skip (index_type skip) +{ + if (skip < 1) return; + + for(; skip > 0 && has_next(); --skip) { + ++index_; + skip_next(); + } +} + + + + + + +//----------------------------------------------------------------------------- +// F A S T A R E A D E R +//----------------------------------------------------------------------------- +fasta_reader::fasta_reader (const string& filename): + sequence_reader{}, + file_{}, + linebuffer_{}, + pos_{0} +{ + if (!filename.empty()) { + file_.open(filename); + + if (!file_.good()) { + invalidate(); + throw file_access_error{"can't open file " + filename}; + } + } + else { + throw file_access_error{"no filename was given"}; + } +} + + + +//------------------------------------------------------------------- +void fasta_reader::read_next (header_type* header, data_type* data, qualities_type*) +{ + if (linebuffer_.empty()) { + getline(file_, linebuffer_); + } + pos_ += linebuffer_.size() + 1; + + if (linebuffer_[0] != '>') { + throw io_format_error{"malformed fasta file - expected header char > not found"}; + invalidate(); + return; + } + + if (header) *header = linebuffer_.substr(1); + + if (data) data->clear(); + + 
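// Typical pull-style use of the reader interface implemented here: obtain a reader via
// make_sequence_reader and drain it with has_next()/next_header_and_data(). This sketch
// assumes the header and data buffer types behave like std::string, as the implementation
// below suggests; error handling is omitted:

#include <iostream>
#include <string>
#include "sequence_io.h"

void printHeadersAndLengths(const std::string& filename){
    auto reader = make_sequence_reader(filename);            // FASTA or FASTQ
    if(!reader) return;
    sequence_reader::header_type header;
    sequence_reader::data_type data;
    while(reader->has_next()){
        reader->next_header_and_data(header, data);
        if(!data.empty())
            std::cout << header << '\t' << data.size() << '\n';
    }
}

int main(int argc, char** argv){
    if(argc > 1) printHeadersAndLengths(argv[1]);
    return 0;
}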
while (file_.good()) { + getline(file_, linebuffer_); + if (linebuffer_[0] == '>') { + break; + } + else { + if (data) data->append(linebuffer_); + pos_ += linebuffer_.size() + 1; + } + } + + if (data && data->empty()) { + throw io_format_error{"malformed fasta file - zero-length sequence" + + (header ? *header : header_type{""})}; + invalidate(); + return; + } + + if (!file_.good()) { + pos_ = -1; + invalidate(); + } +} + + + + +//------------------------------------------------------------------- +void fasta_reader::skip_next () +{ + if (linebuffer_.empty()) { + file_.ignore(1); + pos_ += file_.gcount(); + } else { + pos_ += linebuffer_.size() + 1; + linebuffer_.clear(); + } + file_.ignore(std::numeric_limits::max(), '>'); + pos_ += file_.gcount(); + + if (file_.good()) { + file_.unget(); + pos_ -= 1; + } else { + pos_ = -1; + invalidate(); + } +} + + + +//------------------------------------------------------------------- +void fasta_reader::do_seek (std::streampos pos) +{ + file_.seekg(pos); + pos_ = pos; + linebuffer_.clear(); + + if (!file_.good()) { + pos_ = -1; + invalidate(); + } +} + + + +//------------------------------------------------------------------- +std::streampos fasta_reader::do_tell () +{ + return pos_; +} + + + + + + +//----------------------------------------------------------------------------- +// F A S T Q R E A D E R +//----------------------------------------------------------------------------- +fastq_reader::fastq_reader (const string& filename): + sequence_reader{}, + file_{}, linebuffer_{}, pos_{0} +{ + if (!filename.empty()) { + file_.open(filename); + + if (!file_.good()) { + invalidate(); + throw file_access_error{"can't open file " + filename}; + } + } + else { + throw file_access_error{"no filename was given"}; + } +} + + + +//------------------------------------------------------------------- +void fastq_reader::read_next (header_type* header, data_type* data, + qualities_type* qualities) +{ + // 1st line (data header) + getline(file_, linebuffer_); + + if (linebuffer_.empty()) { + pos_ = -1; + invalidate(); + if (header) header->clear(); + if (data) data->clear(); + if (qualities) qualities->clear(); + return; + } + + pos_ += linebuffer_.size() + 1; + + if (linebuffer_[0] != '@') { + if (linebuffer_[0] != '\r') { + throw io_format_error{"malformed fastq file - sequence header: " + linebuffer_}; + } + invalidate(); + return; + } + + if (header) { + *header = linebuffer_.substr(1); + } + + // 2nd line (sequence data) + if (data) { + getline(file_, *data); + pos_ += data->size() + 1; + } else { + file_.ignore(std::numeric_limits::max(), '\n'); + pos_ += file_.gcount(); + } + + // 3rd (qualities header) + 4th line (qualities) + if (qualities) { + getline(file_, linebuffer_); + pos_ += linebuffer_.size() + 1; + + if (linebuffer_.empty() || linebuffer_[0] != '+') { + if (linebuffer_[0] != '\r') { + throw io_format_error{"malformed fastq file - quality header: " + linebuffer_}; + } + invalidate(); + return; + } + + getline(file_, *qualities); + pos_ += qualities->size() + 1; + } + else { + file_.ignore(std::numeric_limits::max(), '\n'); + pos_ += file_.gcount(); + file_.ignore(std::numeric_limits::max(), '\n'); + pos_ += file_.gcount(); + } + + if (!file_.good()) { + pos_ = -1; + invalidate(); + } +} + + + +//------------------------------------------------------------------- +void fastq_reader::skip_next () +{ + read_next(nullptr, nullptr, nullptr); +} + + + +//------------------------------------------------------------------- +void fastq_reader::do_seek 
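// fastq_reader::read_next above consumes one four-line record per call ('@header',
// sequence, '+', qualities) and additionally tracks the byte position for seek/tell.
// A standalone sketch of just the four-line parse and its sanity checks; this is not
// the library class, and the error handling is simplified:

#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>

struct FastqRecord { std::string header, seq, qual; };

bool readFastqRecord(std::istream& in, FastqRecord& rec){
    std::string line;
    if(!std::getline(in, line) || line.empty()) return false;     // EOF or blank line
    if(line[0] != '@') throw std::runtime_error("malformed fastq header: " + line);
    rec.header = line.substr(1);
    if(!std::getline(in, rec.seq)) return false;                  // 2nd line: sequence
    if(!std::getline(in, line) || line.empty() || line[0] != '+')
        throw std::runtime_error("malformed fastq quality header");
    if(!std::getline(in, rec.qual)) return false;                 // 4th line: qualities
    return true;
}

int main(int argc, char** argv){
    if(argc < 2) return 0;
    std::ifstream in(argv[1]);
    FastqRecord rec;
    while(readFastqRecord(in, rec))
        std::cout << rec.header << '\t' << rec.seq.size() << '\n';
    return 0;
}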
(std::streampos pos) +{ + file_.seekg(pos); + pos_ = pos; + + if (!file_.good()) { + pos_ = -1; + invalidate(); + } +} + + + +//------------------------------------------------------------------- +std::streampos fastq_reader::do_tell () +{ + return pos_; +} + + + + + + +//----------------------------------------------------------------------------- +// P A I R R E A D E R +//----------------------------------------------------------------------------- +sequence_pair_reader::sequence_pair_reader (const std::string& filename1, + const std::string& filename2) +: + reader1_{nullptr}, + reader2_{nullptr}, + singleMode_{true} +{ + if (!filename1.empty()) { + reader1_ = make_sequence_reader(filename1); + + if (!filename2.empty()) { + singleMode_ = false; + if (filename1 != filename2) { + reader2_ = make_sequence_reader(filename2); + } + } + } +} + + + +//------------------------------------------------------------------- +bool sequence_pair_reader::has_next () const noexcept +{ + if (!reader1_) return false; + if (!reader1_->has_next()) return false; + if (!reader2_) return true; + if (!reader2_->has_next()) return false; + return true; +} + + + +//------------------------------------------------------------------- +sequence_pair_reader::sequence_pair +sequence_pair_reader::next () +{ + sequence_pair seq; + next(seq); + return seq; +} + + + +//------------------------------------------------------------------- +void sequence_pair_reader::next (sequence_pair& seq) +{ + if (!has_next()) return; + + // only one sequence per call + if (singleMode_) { + reader1_->next(seq.first); + seq.second.header.clear(); + seq.second.data.clear(); + seq.second.qualities.clear(); + } + // pair = single sequences from 2 separate files (read in lockstep) + else if (reader2_) { + reader1_->next(seq.first); + reader2_->next(seq.second); + } + // pair = 2 consecutive sequences from same file + else { + const auto idx = reader1_->index(); + reader1_->next(seq.first); + //make sure the index is only increased after the 2nd 'next()' + reader1_->index_offset(idx); + reader1_->next(seq.second); + } +} + + + +//------------------------------------------------------------------- +sequence_pair_reader::header_type +sequence_pair_reader::next_header () +{ + if (!has_next()) return header_type{}; + + // only one sequence per call + if (singleMode_) { + return reader1_->next_header(); + } + + // pair = single sequences from 2 separate files (read in lockstep) + if (reader2_) { + reader2_->next_header(); + return reader1_->next_header(); + } + + // pair = 2 consecutive sequences from same file + const auto idx = reader1_->index(); + auto header = reader1_->next_header(); + //make sure the index is only increased after the 2nd 'next()' + reader1_->index_offset(idx); + reader1_->next_header(); + return header; +} + + + +//------------------------------------------------------------------- +sequence_pair_reader::index_type +sequence_pair_reader::next_data (sequence::data_type& data1, + sequence::data_type& data2) +{ + if (!has_next()) return index(); + + // only one sequence per call + if (singleMode_) { + data2.clear(); + return reader1_->next_data(data1); + } + + // pair = single sequences from 2 separate files (read in lockstep) + if (reader2_) { + reader1_->next_data(data1); + return reader2_->next_data(data2); + } + + // pair = 2 consecutive sequences from same file + const auto idx = reader1_->index(); + reader1_->next_data(data1); + //make sure the index is only increased after the 2nd 'next()' + reader1_->index_offset(idx); + 
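+        // note: in consecutive-pair mode both mates come from the same file and
+        // share one pair index; restoring the offset here makes the counter
+        // advance only once per record pair (after the second next_data() call).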
return reader1_->next_data(data2); +} + + + +//------------------------------------------------------------------- +sequence_pair_reader::index_type +sequence_pair_reader::next_header_and_data (sequence::header_type& header1, + sequence::data_type& data1, + sequence::data_type& data2) +{ + if (!has_next()) return index(); + + // only one sequence per call + if (singleMode_) { + data2.clear(); + return reader1_->next_header_and_data(header1, data1); + } + + // pair = single sequences from 2 separate files (read in lockstep) + if (reader2_) { + reader1_->next_header_and_data(header1, data1); + return reader2_->next_data(data2); + } + + // pair = 2 consecutive sequences from same file + const auto idx = reader1_->index(); + reader1_->next_header_and_data(header1, data1); + //make sure the index is only increased after the 2nd 'next()' + reader1_->index_offset(idx); + return reader1_->next_data(data2); +} + + + +//------------------------------------------------------------------- +sequence_pair_reader::index_type +sequence_pair_reader::next_data_and_qualities ( + sequence::data_type& data1, + sequence::data_type& data2, + sequence::qualities_type& qual1, + sequence::qualities_type& qual2) +{ + if (!has_next()) return index(); + + // only one sequence per call + if (singleMode_) { + data2.clear(); + return reader1_->next_data_and_qualities(data1, qual1); + } + + // pair = single sequences from 2 separate files (read in lockstep) + if (reader2_) { + reader1_->next_data_and_qualities(data1, qual1); + return reader2_->next_data_and_qualities(data2, qual2); + } + + // pair = 2 consecutive sequences from same file + const auto idx = reader1_->index(); + reader1_->next_data_and_qualities(data1, qual1); + //make sure the index is only increased after the 2nd 'next()' + reader1_->index_offset(idx); + return reader1_->next_data_and_qualities(data2, qual2); +} + + + +//------------------------------------------------------------------- +sequence_pair_reader::index_type +sequence_pair_reader::next_header_data_qualities ( + sequence::header_type& header1, + sequence::data_type& data1, + sequence::data_type& data2, + sequence::qualities_type& qual1, + sequence::qualities_type& qual2) +{ + if (!has_next()) return index(); + + // only one sequence per call + if (singleMode_) { + data2.clear(); + return reader1_->next_header_data_qualities(header1, data1, qual1); + } + + // pair = single sequences from 2 separate files (read in lockstep) + if (reader2_) { + reader1_->next_header_data_qualities(header1, data1, qual1); + return reader2_->next_data_and_qualities(data2, qual2); + } + + // pair = 2 consecutive sequences from same file + const auto idx = reader1_->index(); + reader1_->next_header_data_qualities(header1, data1, qual1); + //make sure the index is only increased after the 2nd 'next()' + reader1_->index_offset(idx); + return reader1_->next_data_and_qualities(data2, qual2); +} + + + +//------------------------------------------------------------------- +void sequence_pair_reader::skip (index_type skip) +{ + if (skip < 1 || !reader1_) return; + + if (reader2_) { + reader1_->skip(skip); + reader2_->skip(skip); + } + else if (singleMode_) { + reader1_->skip(skip); + } + else { + const auto idx = reader1_->index(); + reader1_->skip(2*skip); + reader1_->index_offset(idx+skip); + } +} + + + +//------------------------------------------------------------------- +sequence_pair_reader::index_type sequence_pair_reader::index () const noexcept +{ + if (!reader1_) return index_type{0}; + return reader1_->index(); 
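+    // (reader1_ drives the pair index in every mode; the index_offset()
+    //  bookkeeping in the next_*() methods above handles the consecutive-pair case)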
+} + + + +//------------------------------------------------------------------- +void sequence_pair_reader::index_offset (index_type index) +{ + if (!reader1_) return; + + reader1_->index_offset(index); + if (reader2_) reader2_->index_offset(index); +} + + + +//------------------------------------------------------------------- +void sequence_pair_reader::seek (const stream_positions& pos) +{ + if (!reader1_) return; + + reader1_->seek(pos.first); + if (reader2_) reader2_->seek(pos.second); +} + + + +//------------------------------------------------------------------- +sequence_pair_reader::stream_positions +sequence_pair_reader::tell () +{ + return stream_positions{ + reader1_ ? reader1_->tell() : std::streampos{}, + reader2_ ? reader2_->tell() : std::streampos{} }; +} + + + + + + +//------------------------------------------------------------------- +std::unique_ptr +make_sequence_reader (const string& filename) +{ + if (filename.empty()) return nullptr; + + auto n = filename.size(); + if (filename.find(".fq") == (n-3) || + filename.find(".fnq") == (n-4) || + filename.find(".fastq") == (n-6) ) + { + return std::make_unique(filename); + } + else if (filename.find(".fa") == (n-3) || + filename.find(".fna") == (n-4) || + filename.find(".fasta") == (n-6) ) + { + return std::make_unique(filename); + } + + //try to determine file type content + std::ifstream is {filename}; + if (is.good()) { + string line; + getline(is,line); + if (!line.empty()) { + if (line[0] == '>') { + return std::make_unique(filename); + } + else if (line[0] == '@') { + return std::make_unique(filename); + } + } + throw file_read_error{"file format not recognized"}; + } + + throw file_access_error{"file not accessible"}; + return nullptr; +} + diff --git a/lib/libmarv/src/sequence_io.h b/lib/libmarv/src/sequence_io.h new file mode 100644 index 000000000..9c81f61a1 --- /dev/null +++ b/lib/libmarv/src/sequence_io.h @@ -0,0 +1,446 @@ +/*************************************************************************//** + * + * @file single and pairwise reading of sequences from FASTA/FASTQ files + * + * (c) 2017-2023 André Müller (mail@andremueller-online.de) + * MIT License + * + *****************************************************************************/ + +#ifndef SEQUENCE_IO_H_ +#define SEQUENCE_IO_H_ + + +#include +#include +#include +#include +#include +#include + + + +/*************************************************************************//** + * + * @member chars concatenation of all sequences + * @member offsets start indices of individual sequences + * + *****************************************************************************/ +struct sequence_batch { + std::vector chars; + std::vector offsets; + std::vector lengths; + std::vector headers; + std::vector qualities; +}; + + + + + + +/*************************************************************************//** + * + * @brief reads ALL sequences from FASTA/FASTQ file, discards headers/qualities; + * allocates new sequence_batch + * + * @param align: alignment of sequence characters in bytes + * + *****************************************************************************/ +sequence_batch +read_all_sequences_from_file (const std::string& filename, + int charAlignment = 4); + + + +/*************************************************************************//** + * + * @brief reads ALL sequences from FASTA/FASTQ file, discards headers/qualities; + * (re-)uses external sequence_batch buffer + * + * @param align: alignment of sequence characters in bytes + * + 
*****************************************************************************/ +void read_all_sequences_from_file (const std::string& filename, + sequence_batch&, + int charAlignment = 4); + + + + + + +/*************************************************************************//** + * + * @brief reads ALL sequences and headers from FASTA/FASTQ file, + * discards FASTQ qualities; + * allocates new sequence_batch + * + * @param align: alignment of sequence characters in bytes + * + *****************************************************************************/ +sequence_batch +read_all_sequences_and_headers_from_file (const std::string& filename, + int charAlignment = 4); + + + +/*************************************************************************//** + * + * @brief reads ALL sequences from FASTA/FASTQ file, discards FASTQ qualities; + * (re-)uses external sequence_batch buffer + * + * @param align: alignment of sequence characters in bytes + * + *****************************************************************************/ +void read_all_sequences_and_headers_from_file (const std::string& filename, + sequence_batch&, + int charAlignment = 4); + + + + + + +/*************************************************************************//** + * + * @brief reads ALL sequences and headers from FASTA/FASTQ file, + * allocates new sequence_batch + * + * @param align: alignment of sequence characters in bytes + * + *****************************************************************************/ +sequence_batch +read_all_sequences_and_meta_info_from_file (const std::string& filename, + int charAlignment = 4); + + + +/*************************************************************************//** + * + * @brief reads ALL sequences from FASTA/FASTQ file + * (re-)uses external sequence_batch buffer + * + * @param align: alignment of sequence characters in bytes + * + *****************************************************************************/ +void read_all_sequences_and_meta_info_from_file (const std::string& filename, + sequence_batch&, + int charAlignment = 4); + + + + +/*************************************************************************//** + * + * exception types + * + *****************************************************************************/ +struct io_error : public std::runtime_error { + explicit io_error (const std::string& what): std::runtime_error(what) {} +}; + + +struct io_format_error : public io_error { + explicit io_format_error (const std::string& what): io_error(what) {} +}; + + +struct file_io_error : public io_error { + explicit + file_io_error (const std::string& what): io_error(what) {} + + explicit + file_io_error(const std::string& what, const std::string& filename): + io_error(what), filename_(filename) + {} + + const char* filename() const noexcept { return filename_.c_str(); } + +private: + std::string filename_; +}; + + +struct file_access_error : public file_io_error { + explicit + file_access_error (const std::string& what): file_io_error(what) {} + + explicit + file_access_error (const std::string& what, const std::string& filename): + file_io_error(what,filename) + {} +}; + +struct file_read_error : public file_io_error { + explicit file_read_error (const std::string& what): file_io_error(what) {} + + explicit + file_read_error (const std::string& what, const std::string& filename): + file_io_error(what,filename) + {} +}; + + + + +/*************************************************************************//** + * + * @brief polymorphic file reader for (pairs of) 
bio-sequences + * NOT concurrency safe + * + *****************************************************************************/ +class sequence_reader +{ +public: + using index_type = std::uint_least64_t; + using header_type = std::string; + using data_type = std::string; + using qualities_type = std::string; + + /** @brief data associated with one sequence */ + struct sequence { + using index_type = sequence_reader::index_type; + using header_type = sequence_reader::header_type; + using data_type = sequence_reader::data_type; + using qualities_type = sequence_reader::qualities_type; + + index_type index; // number of sequence in file (+ offset) + header_type header; // meta information (FASTA >, FASTQ @) + data_type data; // actual sequence data + data_type qualities; // quality scores (FASTQ) + }; + + + sequence_reader(): index_{0}, valid_{true} {} + + sequence_reader (const sequence_reader&) = delete; + sequence_reader& operator = (const sequence_reader&) = delete; + sequence_reader& operator = (sequence_reader&&) = delete; + + virtual ~sequence_reader () = default; + + + /** @brief read & return next sequence */ + sequence next (); + + /** @brief read next header only */ + header_type next_header( ); + + /** @brief read next sequence data only */ + data_type next_data (); + + + /** @brief read next sequence re-using external storage */ + void next (sequence&); + + /** @brief read next header only, re-uses external storage */ + index_type next_header (header_type&); + + /** @brief read next sequence data only, re-uses external storage */ + index_type next_data (data_type&); + + /** @brief read next sequence data & header, re-uses external storage */ + index_type next_header_and_data (header_type&, data_type&); + + /** @brief read next header from 1st sequence and data from both sequences + re-using external storage */ + index_type next_data_and_qualities (sequence::data_type&, + sequence::qualities_type&); + + /** @brief read next sequence data, header & qualities, re-uses external storage */ + index_type next_header_data_qualities (header_type&, data_type&, qualities_type&); + + + /** @brief skip n sequences */ + void skip (index_type n); + + bool has_next () const noexcept { return valid_; } + + index_type index () const noexcept { return index_; } + + void index_offset (index_type index) { index_ = index; } + + void seek (std::streampos pos) { do_seek(pos); } + std::streampos tell () { return do_tell(); } + + +protected: + void invalidate () { valid_ = false; } + + +private: + // derived readers have to implement these + virtual std::streampos do_tell () = 0; + + virtual void do_seek (std::streampos) = 0; + + virtual void read_next (header_type*, data_type*, qualities_type*) = 0; + + virtual void skip_next () = 0; + + index_type index_; + bool valid_; +}; + + + + +/*************************************************************************//** + * + * + * + *****************************************************************************/ +class fasta_reader : + public sequence_reader +{ +public: + explicit + fasta_reader (const std::string& filename); + +private: + std::streampos do_tell () override; + + void do_seek (std::streampos) override; + void read_next (header_type*, data_type*, qualities_type*) override; + void skip_next () override; + +private: + std::ifstream file_; + std::string linebuffer_; + std::streampos pos_; +}; + + + + +/*************************************************************************//** + * + * + * + 
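+ * @brief reader for FASTQ files: each record is expected to span four lines
+ *        ('@' header, sequence data, '+' quality header, quality string),
+ *        which is what read_next() below parses.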
*****************************************************************************/ +class fastq_reader : + public sequence_reader +{ +public: + explicit + fastq_reader (const std::string& filename); + +private: + std::streampos do_tell () override; + + void do_seek (std::streampos) override; + void read_next (header_type*, data_type*, qualities_type*) override; + void skip_next () override; + +private: + std::ifstream file_; + std::string linebuffer_; + std::streampos pos_; +}; + + + + + +/*************************************************************************//** + * + * @brief file reader for (pairs of) bio-sequences + * NOT concurrency safe + * + *****************************************************************************/ +class sequence_pair_reader +{ +public: + using index_type = sequence_reader::index_type; + using data_type = sequence_reader::data_type; + using header_type = sequence_reader::header_type; + using sequence = sequence_reader::sequence; + + using stream_positions = std::pair; + using sequence_pair = std::pair; + + + /** @brief if filename2 empty : single sequence mode + * if filename1 == filename2 : read consecutive pairs in one file + * else : read from 2 files in lockstep + */ + sequence_pair_reader (const std::string& filename1, + const std::string& filename2); + + sequence_pair_reader (const sequence_pair_reader&) = delete; + sequence_pair_reader& operator = (const sequence_pair_reader&) = delete; + sequence_pair_reader& operator = (sequence_pair_reader&&) = delete; + + ~sequence_pair_reader () = default; + + + /** @brief read & return next sequence */ + sequence_pair next (); + + /** @brief read next header only */ + header_type next_header (); + + /** @brief read next sequence re-using external storage */ + void next (sequence_pair&); + + /** @brief read next header only, re-uses external storage */ + index_type next_header (sequence::header_type&); + + /** @brief read next sequence data only, re-uses external storage */ + index_type next_data (sequence::data_type&, sequence::data_type&); + + /** @brief read next header from 1st sequence and data from both sequences + re-using external storage */ + index_type next_header_and_data (sequence::header_type&, + sequence::data_type&, + sequence::data_type&); + + /** @brief read next header from 1st sequence and data from both sequences + re-using external storage */ + index_type next_data_and_qualities (sequence::data_type&, + sequence::data_type&, + sequence::qualities_type&, + sequence::qualities_type&); + + /** @brief read next header from 1st sequence and + data and qualities from both sequences re-using external storage */ + index_type next_header_data_qualities (sequence::header_type&, + sequence::data_type&, + sequence::data_type&, + sequence::qualities_type&, + sequence::qualities_type&); + + + /** @brief skip n sequences */ + void skip (index_type n); + + bool has_next () const noexcept; + + index_type index () const noexcept; + + void index_offset (index_type index); + + void seek (const stream_positions& pos); + stream_positions tell (); + + +private: + std::unique_ptr reader1_; + std::unique_ptr reader2_; + bool singleMode_; +}; + + + +/*************************************************************************//** + * + * @brief guesses and returns a suitable sequence reader + * based on a filename pattern + * + *****************************************************************************/ +std::unique_ptr +make_sequence_reader (const std::string& filename); + + +#endif diff --git 
a/lib/libmarv/src/smithwaterman_kernel_config.cuh b/lib/libmarv/src/smithwaterman_kernel_config.cuh new file mode 100644 index 000000000..06f24fa32 --- /dev/null +++ b/lib/libmarv/src/smithwaterman_kernel_config.cuh @@ -0,0 +1,217 @@ +#ifndef SW_KERNEL_CONFIG_CUH +#define SW_KERNEL_CONFIG_CUH + +#include +#include +#include + +namespace cudasw4{ + + + struct SmithWatermanKernelConfig{ + enum class Approach : int{ + Unused = 999999 + }; + bool dpx; + int tilesize; + int groupsize; + int numRegs; + Approach approach; + + SmithWatermanKernelConfig() = default; + SmithWatermanKernelConfig(int tilesize_, int groupsize_, int numRegs_, int dpx_, Approach approach_) + : dpx(dpx_), tilesize(tilesize_), groupsize(groupsize_), numRegs(numRegs_), approach(approach_) + {} + + SmithWatermanKernelConfig(const SmithWatermanKernelConfig&) = default; + SmithWatermanKernelConfig& operator=(const SmithWatermanKernelConfig&) = default; + }; + + __inline__ + std::string to_string(SmithWatermanKernelConfig::Approach approach){ + switch(approach){ + case SmithWatermanKernelConfig::Approach::Unused: return "Unused"; + } + return "to_string: missing case for SmithWatermanKernelConfig::Approach"; + } + + __inline__ + std::ostream& operator<<(std::ostream& os, const SmithWatermanKernelConfig& data){ + + os << data.tilesize << " " << data.groupsize << " " << data.numRegs + << " " << data.dpx << " " << int(data.approach); + return os; + } + + //T4 + __inline__ + std::vector getOptimalKernelConfigs_SW_sm75(){ + std::vector configs{ + {16,4,4,0,SmithWatermanKernelConfig::Approach::Unused}, + {32,4,8,0,SmithWatermanKernelConfig::Approach::Unused}, + {48,4,12,0,SmithWatermanKernelConfig::Approach::Unused}, + {64,4,16,0,SmithWatermanKernelConfig::Approach::Unused}, + {80,4,20,0,SmithWatermanKernelConfig::Approach::Unused}, + {96,4,24,0,SmithWatermanKernelConfig::Approach::Unused}, + {112,4,28,0,SmithWatermanKernelConfig::Approach::Unused}, + {128,8,16,0,SmithWatermanKernelConfig::Approach::Unused}, + {144,4,36,0,SmithWatermanKernelConfig::Approach::Unused}, + {160,8,20,0,SmithWatermanKernelConfig::Approach::Unused}, + {176,4,44,0,SmithWatermanKernelConfig::Approach::Unused}, + {192,8,24,0,SmithWatermanKernelConfig::Approach::Unused}, + {224,8,28,0,SmithWatermanKernelConfig::Approach::Unused}, + {256,16,16,0,SmithWatermanKernelConfig::Approach::Unused}, + {288,8,36,0,SmithWatermanKernelConfig::Approach::Unused}, + {320,16,20,0,SmithWatermanKernelConfig::Approach::Unused}, + {352,8,44,0,SmithWatermanKernelConfig::Approach::Unused}, + {384,16,24,0,SmithWatermanKernelConfig::Approach::Unused}, + {448,16,28,0,SmithWatermanKernelConfig::Approach::Unused}, + {512,32,16,0,SmithWatermanKernelConfig::Approach::Unused}, + {576,16,36,0,SmithWatermanKernelConfig::Approach::Unused}, + {640,32,20,0,SmithWatermanKernelConfig::Approach::Unused}, + {704,16,44,0,SmithWatermanKernelConfig::Approach::Unused}, + {768,32,24,0,SmithWatermanKernelConfig::Approach::Unused}, + //larger tiles are not supported because shared memory size is too small + }; + + return configs; + } + + //A100 + __inline__ + std::vector getOptimalKernelConfigs_SW_sm80(){ + std::vector configs{ + {16,4,4,0,SmithWatermanKernelConfig::Approach::Unused}, + {32,4,8,0,SmithWatermanKernelConfig::Approach::Unused}, + {48,4,12,0,SmithWatermanKernelConfig::Approach::Unused}, + {64,4,16,0,SmithWatermanKernelConfig::Approach::Unused}, + {80,4,20,0,SmithWatermanKernelConfig::Approach::Unused}, + {96,4,24,0,SmithWatermanKernelConfig::Approach::Unused}, + 
{112,4,28,0,SmithWatermanKernelConfig::Approach::Unused}, + {128,8,16,0,SmithWatermanKernelConfig::Approach::Unused}, + {144,4,36,0,SmithWatermanKernelConfig::Approach::Unused}, + {160,8,20,0,SmithWatermanKernelConfig::Approach::Unused}, + {176,4,44,0,SmithWatermanKernelConfig::Approach::Unused}, + {192,8,24,0,SmithWatermanKernelConfig::Approach::Unused}, + {224,8,28,0,SmithWatermanKernelConfig::Approach::Unused}, + {256,8,32,0,SmithWatermanKernelConfig::Approach::Unused}, + {288,8,36,0,SmithWatermanKernelConfig::Approach::Unused}, + {320,16,20,0,SmithWatermanKernelConfig::Approach::Unused}, + {352,8,44,0,SmithWatermanKernelConfig::Approach::Unused}, + {384,16,24,0,SmithWatermanKernelConfig::Approach::Unused}, + {448,16,28,0,SmithWatermanKernelConfig::Approach::Unused}, + {512,16,32,0,SmithWatermanKernelConfig::Approach::Unused}, + {576,16,36,0,SmithWatermanKernelConfig::Approach::Unused}, + {640,32,20,0,SmithWatermanKernelConfig::Approach::Unused}, + {704,16,44,0,SmithWatermanKernelConfig::Approach::Unused}, + {768,32,24,0,SmithWatermanKernelConfig::Approach::Unused}, + {896,32,28,0,SmithWatermanKernelConfig::Approach::Unused}, + {1024,32,32,0,SmithWatermanKernelConfig::Approach::Unused}, + }; + + return configs; + } + + //L40S + __inline__ + std::vector getOptimalKernelConfigs_SW_sm89(){ + std::vector configs{ + {16,4,4,0,SmithWatermanKernelConfig::Approach::Unused}, + {32,4,8,0,SmithWatermanKernelConfig::Approach::Unused}, + {48,4,12,0,SmithWatermanKernelConfig::Approach::Unused}, + {64,4,16,0,SmithWatermanKernelConfig::Approach::Unused}, + {80,4,20,0,SmithWatermanKernelConfig::Approach::Unused}, + {96,4,24,0,SmithWatermanKernelConfig::Approach::Unused}, + {112,4,28,0,SmithWatermanKernelConfig::Approach::Unused}, + {128,4,32,0,SmithWatermanKernelConfig::Approach::Unused}, + {144,4,36,0,SmithWatermanKernelConfig::Approach::Unused}, + {160,4,40,0,SmithWatermanKernelConfig::Approach::Unused}, + {176,4,44,0,SmithWatermanKernelConfig::Approach::Unused}, + {192,8,24,0,SmithWatermanKernelConfig::Approach::Unused}, + {224,8,28,0,SmithWatermanKernelConfig::Approach::Unused}, + {256,16,16,0,SmithWatermanKernelConfig::Approach::Unused}, + {288,8,36,0,SmithWatermanKernelConfig::Approach::Unused}, + {320,16,20,0,SmithWatermanKernelConfig::Approach::Unused}, + {352,8,44,0,SmithWatermanKernelConfig::Approach::Unused}, + {384,16,24,0,SmithWatermanKernelConfig::Approach::Unused}, + {448,16,28,0,SmithWatermanKernelConfig::Approach::Unused}, + {512,32,16,0,SmithWatermanKernelConfig::Approach::Unused}, + {576,16,36,0,SmithWatermanKernelConfig::Approach::Unused}, + {640,32,20,0,SmithWatermanKernelConfig::Approach::Unused}, + {704,16,44,0,SmithWatermanKernelConfig::Approach::Unused}, + {768,32,24,0,SmithWatermanKernelConfig::Approach::Unused}, + {896,32,28,0,SmithWatermanKernelConfig::Approach::Unused}, + {1024,32,32,0,SmithWatermanKernelConfig::Approach::Unused}, + }; + + return configs; + } + + //H100 SXM + __inline__ + std::vector getOptimalKernelConfigs_SW_sm90(){ + std::vector configs{ + {16,4,4,1,SmithWatermanKernelConfig::Approach::Unused}, + {32,4,8,1,SmithWatermanKernelConfig::Approach::Unused}, + {48,4,12,1,SmithWatermanKernelConfig::Approach::Unused}, + {64,4,16,1,SmithWatermanKernelConfig::Approach::Unused}, + {80,4,20,1,SmithWatermanKernelConfig::Approach::Unused}, + {96,4,24,1,SmithWatermanKernelConfig::Approach::Unused}, + {112,4,28,1,SmithWatermanKernelConfig::Approach::Unused}, + {128,4,32,1,SmithWatermanKernelConfig::Approach::Unused}, + 
{144,4,36,1,SmithWatermanKernelConfig::Approach::Unused}, + {160,4,40,1,SmithWatermanKernelConfig::Approach::Unused}, + {176,4,44,1,SmithWatermanKernelConfig::Approach::Unused}, + {192,8,24,1,SmithWatermanKernelConfig::Approach::Unused}, + {224,8,28,1,SmithWatermanKernelConfig::Approach::Unused}, + {256,8,32,1,SmithWatermanKernelConfig::Approach::Unused}, + {288,8,36,1,SmithWatermanKernelConfig::Approach::Unused}, + {320,8,40,1,SmithWatermanKernelConfig::Approach::Unused}, + {352,8,44,1,SmithWatermanKernelConfig::Approach::Unused}, + {384,16,24,1,SmithWatermanKernelConfig::Approach::Unused}, + {448,16,28,1,SmithWatermanKernelConfig::Approach::Unused}, + {512,16,32,1,SmithWatermanKernelConfig::Approach::Unused}, + {576,16,36,1,SmithWatermanKernelConfig::Approach::Unused}, + {640,16,40,1,SmithWatermanKernelConfig::Approach::Unused}, + {704,16,44,1,SmithWatermanKernelConfig::Approach::Unused}, + {768,32,24,1,SmithWatermanKernelConfig::Approach::Unused}, + {896,32,28,1,SmithWatermanKernelConfig::Approach::Unused}, + {1024,32,32,1,SmithWatermanKernelConfig::Approach::Unused}, + }; + + return configs; + } + + __inline__ + std::vector getOptimalKernelConfigs_SW_default(){ + return getOptimalKernelConfigs_SW_sm89(); + } + + __inline__ + std::vector getOptimalKernelConfigs_SW(int deviceId){ + int ccMajor = 0; + int ccMinor = 0; + cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, deviceId); + cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, deviceId); + + std::vector configs; + + if(ccMajor == 7 && ccMinor == 5){ + configs = getOptimalKernelConfigs_SW_sm75(); + }else if(ccMajor == 8 && ccMinor == 0){ + configs = getOptimalKernelConfigs_SW_sm80(); + }else if(ccMajor == 8 && ccMinor == 9){ + configs = getOptimalKernelConfigs_SW_sm89(); + }else if(ccMajor == 9 && ccMinor == 0){ + configs = getOptimalKernelConfigs_SW_sm90(); + }else{ + configs = getOptimalKernelConfigs_SW_default(); + } + + return configs; + } + + +} + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/target_subject_ids.cuh b/lib/libmarv/src/target_subject_ids.cuh new file mode 100644 index 000000000..dc5b78784 --- /dev/null +++ b/lib/libmarv/src/target_subject_ids.cuh @@ -0,0 +1,71 @@ +#ifndef TARGET_SUBJECT_IDS_CUH +#define TARGET_SUBJECT_IDS_CUH + +#include "config.hpp" + +#include +#include +#include +#include + +namespace cudasw4{ + +struct TargetSubjectIds{ + std::vector subjectIds; + + TargetSubjectIds() = default; + TargetSubjectIds(const std::string& filename){ + std::ifstream is(filename); + if(!is){ + throw std::runtime_error("File " + filename + " could not be opened"); + } + std::string line; + while(std::getline(is, line)){ + try{ + std::int64_t val = std::stoull(line); + if(0 <= val && val < std::int64_t(MaxSequencesInDB::value())){ + subjectIds.push_back(val); + }else{ + std::cerr << "Invalid reference id '" << line << "'. Skipping line...\n"; + } + }catch(std::invalid_argument&){ + std::cerr << "Could not convert '" << line << "' to number. 
Skipping line...\n"; + } + } + + std::sort(subjectIds.begin(), subjectIds.end()); + } + + TargetSubjectIds(std::vector ids) : subjectIds(std::move(ids)){ + std::sort(subjectIds.begin(), subjectIds.end()); + } + + void removeOutOfBoundsTargets(size_t databaseSize){ + subjectIds.erase( + std::remove_if(subjectIds.begin(), subjectIds.end(), [&](size_t id){return id >= databaseSize; }), + subjectIds.end() + ); + } + + size_t size() const{ + return subjectIds.size(); + } + + auto begin() const{ + return subjectIds.begin(); + } + + auto end() const{ + return subjectIds.end(); + } +}; + +} + + + + + + + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/tileconfigsearch.cu b/lib/libmarv/src/tileconfigsearch.cu new file mode 100644 index 000000000..dbedafef9 --- /dev/null +++ b/lib/libmarv/src/tileconfigsearch.cu @@ -0,0 +1,445 @@ +#include +#include +#include + +#include "options.hpp" +#include "cudasw4.cuh" + +#include "hpc_helpers/all_helpers.cuh" +#include "hpc_helpers/peer_access.cuh" + + + + +struct BenchmarkData{ + bool dpx; + int tilesize; + int groupsize; + int numRegs; + float gcups; + GaplessKernelConfig::Approach kernelApproach; +}; + +struct BenchmarkDataSW{ + bool dpx; + int tilesize; + int groupsize; + int numRegs; + float gcups; + SmithWatermanKernelConfig::Approach kernelApproach; +}; + +void writeBenchmarkDataHeader(std::ostream& os){ + os << "tilesize groupsize numRegs dpx kernelApproach gcups" << "\n"; +} + +std::ostream& operator<<(std::ostream& os, const BenchmarkData& data){ + + os << data.tilesize << " " << data.groupsize << " " << data.numRegs + << " " << data.dpx << " " << int(data.kernelApproach) << " " << data.gcups; + return os; +} + +std::ostream& operator<<(std::ostream& os, const BenchmarkDataSW& data){ + + os << data.tilesize << " " << data.groupsize << " " << data.numRegs + << " " << data.dpx << " " << int(data.kernelApproach) << " " << data.gcups; + return os; +} + + +void gapless_search(){ + std::cout << "gapless_search\n"; + + ProgramOptions options; + + const int numTopOutputs = 0; + const auto blosumType = cudasw4::BlosumType::BLOSUM62_20; + const bool verbose = false; + + std::vector deviceIds; + { + int num = 0; + cudaGetDeviceCount(&num); CUERR + for(int i = 0; i < num; i++){ + deviceIds.push_back(i); + } + if(deviceIds.size() > 0){ + if(verbose){ + std::cout << "Will use GPU"; + for(auto x : deviceIds){ + std::cout << " " << x; + } + std::cout << "\n"; + } + }else{ + throw std::runtime_error("No GPU found"); + } + } + + helpers::PeerAccess peerAccess(deviceIds, false); + + using MemoryConfig = cudasw4::MemoryConfig; + using ScanResult = cudasw4::ScanResult; + + MemoryConfig memoryConfig; + memoryConfig.maxBatchBytes = options.maxBatchBytes; + memoryConfig.maxBatchSequences = options.maxBatchSequences; + memoryConfig.maxTempBytes = options.maxTempBytes; + memoryConfig.maxGpuMem = options.maxGpuMem; + + const char* letters = "ARNDCQEGHILKMFPSTWYV"; + + std::mt19937 gen(42); + std::uniform_int_distribution<> dist(0,19); + + + + + + std::vector allBenchmarkData; + + writeBenchmarkDataHeader(std::cout); + + auto execute = [&](GaplessKernelConfig::Approach kernelApproach, bool useDPX){ + KernelConfigFilenames kernelConfigFilenames; + cudasw4::CudaSW4 cudaSW4( + deviceIds, + numTopOutputs, + blosumType, + memoryConfig, + verbose, + kernelConfigFilenames + ); + + cudaSW4.setScanType(ScanType::Gapless); + // cudaSW4.setScanType(ScanType::SW_Endpos); + + // std::ofstream logfile(outputfilename); + + std::vector benchmarkDataVec; + + std::vector> 
validRegConfigs; + #define X(g,r)\ + validRegConfigs.push_back(std::make_tuple(g,r)); + + PSSM_GAPLESS_SINGLETILE_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + + + + for(auto regConfig : validRegConfigs){ + const int groupsize = std::get<0>(regConfig); + const int numRegs = std::get<1>(regConfig); + + + // for(int groupsize : {4,8,16}){ + // // for(int groupsize : {4}){ + // // for(int groupsize : {8,16}){ + // for(int numRegs : {4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64}){ + // // for(int groupsize : {4,8,16}){ + // // for(int numRegs : {32}){ + + + + // // for(int groupsize : {8,16}){ + // // for(int numRegs : {4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64}){ + + + const int l = groupsize * numRegs * 2; + // if(l <= 2048) continue; + + + BenchmarkData benchmarkData; + benchmarkData.dpx = useDPX; + benchmarkData.tilesize = l; + benchmarkData.groupsize = groupsize; + benchmarkData.numRegs = numRegs; + benchmarkData.kernelApproach = kernelApproach; + + std::string sequence(l, ' '); + for(int i = 0; i < l; i++){ + sequence[i] = letters[dist(gen)]; + } + + const int pseudoDBLength = l; + const int pseudoDBSize = 5000000; + const bool pseudoDBSameSequence = false; + + helpers::CpuTimer timer_read_db("Generate DB"); + auto fullDB_tmp = std::make_shared(cudasw4::loadPseudoDB( + pseudoDBSize, + pseudoDBLength, + pseudoDBSameSequence + )); + timer_read_db.stop(); + + cudaSW4.setDatabase(fullDB_tmp); + cudaSW4.prefetchDBToGpus(); + + GaplessKernelConfig config; + config.dpx = useDPX; + config.tilesize = l; + config.groupsize = groupsize; + config.numRegs = numRegs; + config.approach =kernelApproach; + + cudaSW4.setCustomKernelConfig_Gapless(config); + cudasw4::DecodedQueryView queryView(sequence.data(), sequence.size()); + ScanResult scanResult = cudaSW4.scan(queryView, std::nullopt); + + benchmarkData.gcups = scanResult.stats.gcups; + + benchmarkDataVec.push_back(benchmarkData); + + std::cout << benchmarkData << "\n"; + + } + + return benchmarkDataVec; + }; + + for(auto kernelApproach : {GaplessKernelConfig::Approach::hardcodedzero, GaplessKernelConfig::Approach::kernelparamzero}){ + auto resultNoDpx = execute(kernelApproach, false); + allBenchmarkData.insert(allBenchmarkData.end(), resultNoDpx.begin(), resultNoDpx.end()); + + int ccMajor = 0; + cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, 0); + const bool supportsDPX = ccMajor >= 9; + if(supportsDPX){ + auto resultDpx = execute(kernelApproach, true); + allBenchmarkData.insert(allBenchmarkData.end(), resultDpx.begin(), resultDpx.end()); + } + } + + auto bestConfigs = allBenchmarkData; + std::sort(bestConfigs.begin(), bestConfigs.end(), [](const auto& l, const auto& r){ + if(l.tilesize < r.tilesize) return true; + if(l.tilesize > r.tilesize) return false; + return l.gcups > r.gcups; + }); + + std::cout << "sorted\n"; + std::copy(bestConfigs.begin(), bestConfigs.end(), std::ostream_iterator(std::cout, "\n")); + + //only keep best for each tilesize + bestConfigs.erase( + std::unique(bestConfigs.begin(), bestConfigs.end(), [](const auto& l, const auto& r){ + return l.tilesize == r.tilesize; + }), + bestConfigs.end() + ); + + std::cout << "best\n"; + std::copy(bestConfigs.begin(), bestConfigs.end(), std::ostream_iterator(std::cout, "\n")); +} + + + + + + + + + +void sw_search(){ + std::cout << "sw_search\n"; + ProgramOptions options; + + const int numTopOutputs = 0; + const auto blosumType = cudasw4::BlosumType::BLOSUM62_20; + const bool verbose = false; + + std::vector deviceIds; + { + int num = 0; + cudaGetDeviceCount(&num); 
CUERR + for(int i = 0; i < num; i++){ + deviceIds.push_back(i); + } + if(deviceIds.size() > 0){ + if(verbose){ + std::cout << "Will use GPU"; + for(auto x : deviceIds){ + std::cout << " " << x; + } + std::cout << "\n"; + } + }else{ + throw std::runtime_error("No GPU found"); + } + } + + helpers::PeerAccess peerAccess(deviceIds, false); + + using MemoryConfig = cudasw4::MemoryConfig; + using ScanResult = cudasw4::ScanResult; + + MemoryConfig memoryConfig; + memoryConfig.maxBatchBytes = options.maxBatchBytes; + memoryConfig.maxBatchSequences = options.maxBatchSequences; + memoryConfig.maxTempBytes = options.maxTempBytes; + memoryConfig.maxGpuMem = options.maxGpuMem; + + const char* letters = "ARNDCQEGHILKMFPSTWYV"; + + std::mt19937 gen(42); + std::uniform_int_distribution<> dist(0,19); + + + + + + std::vector allBenchmarkData; + + writeBenchmarkDataHeader(std::cout); + + auto execute = [&](bool useDPX){ + + KernelConfigFilenames kernelConfigFilenames; + cudasw4::CudaSW4 cudaSW4( + deviceIds, + numTopOutputs, + blosumType, + memoryConfig, + verbose, + kernelConfigFilenames + ); + + cudaSW4.setScanType(ScanType::SW_Endpos); + + // std::ofstream logfile(outputfilename); + + std::vector benchmarkDataVec; + + std::vector> validRegConfigs; + #define X(g,r)\ + validRegConfigs.push_back(std::make_tuple(g,r)); + + PSSM_SW_ENDPOS_SINGLETILE_FLOAT_OR_INT_FOR_EACH_VALID_CONFIG_DO_X + + #undef X + + + + for(auto regConfig : validRegConfigs){ + const int groupsize = std::get<0>(regConfig); + const int numRegs = std::get<1>(regConfig); + + const int l = groupsize * numRegs; + // if(l <= 2048) continue; + + + BenchmarkDataSW benchmarkData; + benchmarkData.dpx = useDPX; + benchmarkData.tilesize = l; + benchmarkData.groupsize = groupsize; + benchmarkData.numRegs = numRegs; + benchmarkData.kernelApproach = SmithWatermanKernelConfig::Approach::Unused; + + std::string sequence(l, ' '); + for(int i = 0; i < l; i++){ + sequence[i] = letters[dist(gen)]; + } + + const int pseudoDBLength = l; + const int pseudoDBSize = 5000000; + const bool pseudoDBSameSequence = false; + + helpers::CpuTimer timer_read_db("Generate DB"); + auto fullDB_tmp = std::make_shared(cudasw4::loadPseudoDB( + pseudoDBSize, + pseudoDBLength, + pseudoDBSameSequence + )); + timer_read_db.stop(); + + cudaSW4.setDatabase(fullDB_tmp); + cudaSW4.prefetchDBToGpus(); + + + SmithWatermanKernelConfig config; + config.dpx = useDPX; + config.tilesize = l; + config.groupsize = groupsize; + config.numRegs = numRegs; + config.approach = SmithWatermanKernelConfig::Approach::Unused; + + cudaSW4.setCustomKernelConfig_SW(config); + cudasw4::DecodedQueryView queryView(sequence.data(), sequence.size()); + ScanResult scanResult = cudaSW4.scan(queryView, std::nullopt); + + benchmarkData.gcups = scanResult.stats.gcups; + + benchmarkDataVec.push_back(benchmarkData); + + std::cout << benchmarkData << "\n"; + } + + return benchmarkDataVec; + }; + + { + auto resultNoDpx = execute(false); + allBenchmarkData.insert(allBenchmarkData.end(), resultNoDpx.begin(), resultNoDpx.end()); + + int ccMajor = 0; + cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, 0); + const bool supportsDPX = ccMajor >= 9; + if(supportsDPX){ + auto resultDpx = execute(true); + allBenchmarkData.insert(allBenchmarkData.end(), resultDpx.begin(), resultDpx.end()); + } + } + + auto bestConfigs = allBenchmarkData; + std::sort(bestConfigs.begin(), bestConfigs.end(), [](const auto& l, const auto& r){ + if(l.tilesize < r.tilesize) return true; + if(l.tilesize > r.tilesize) return false; + 
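+        // equal tilesize: order by descending GCUPS so the std::unique pass
+        // below keeps only the fastest configuration per tilesize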
return l.gcups > r.gcups; + }); + + std::cout << "sorted\n"; + std::copy(bestConfigs.begin(), bestConfigs.end(), std::ostream_iterator(std::cout, "\n")); + + //only keep best for each tilesize + bestConfigs.erase( + std::unique(bestConfigs.begin(), bestConfigs.end(), [](const auto& l, const auto& r){ + return l.tilesize == r.tilesize; + }), + bestConfigs.end() + ); + + std::cout << "best\n"; + std::copy(bestConfigs.begin(), bestConfigs.end(), std::ostream_iterator(std::cout, "\n")); +} + + + + +int main(int argc, char* argv[]){ + + bool gapless = false; + bool sw = false; + for(int x = 1; x < argc; x++){ + std::string argstring = argv[x]; + if(argstring == "--gapless"){ + gapless = true; + } + if(argstring == "--sw"){ + sw = true; + } + } + + if(gapless){ + gapless_search(); + } + + if(sw){ + sw_search(); + } + + return 0; +} + diff --git a/lib/libmarv/src/types.hpp b/lib/libmarv/src/types.hpp new file mode 100644 index 000000000..c99998691 --- /dev/null +++ b/lib/libmarv/src/types.hpp @@ -0,0 +1,445 @@ +#ifndef TYPES_HPP +#define TYPES_HPP + +#include "hpc_helpers/all_helpers.cuh" + +#include +#include + +namespace cudasw4{ + +enum ScanType{ + Gapless, + SW_Endpos, + GaplessPlusSW_Endpos +}; + + +enum class BlosumType{ + BLOSUM45, + BLOSUM50, + BLOSUM62, + BLOSUM80, + BLOSUM45_20, + BLOSUM50_20, + BLOSUM62_20, + BLOSUM80_20, +}; + +struct BLOSUM45_20{ + static constexpr std::int8_t low = -5; + static constexpr int dim = 20 + 1; + + static constexpr std::array get1D() { + return { + //A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V, other + 5, -2, -1, -2, -1, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -2, -2, 0, low, + -2, 7, 0, -1, -3, 1, 0, -2, 0, -3, -2, 3, -1, -2, -2, -1, -1, -2, -1, -2, low, + -1, 0, 6, 2, -2, 0, 0, 0, 1, -2, -3, 0, -2, -2, -2, 1, 0, -4, -2, -3, low, + -2, -1, 2, 7, -3, 0, 2, -1, 0, -4, -3, 0, -3, -4, -1, 0, -1, -4, -2, -3, low, + -1, -3, -2, -3, 12, -3, -3, -3, -3, -3, -2, -3, -2, -2, -4, -1, -1, -5, -3, -1, low, + -1, 1, 0, 0, -3, 6, 2, -2, 1, -2, -2, 1, 0, -4, -1, 0, -1, -2, -1, -3, low, + -1, 0, 0, 2, -3, 2, 6, -2, 0, -3, -2, 1, -2, -3, 0, 0, -1, -3, -2, -3, low, + 0, -2, 0, -1, -3, -2, -2, 7, -2, -4, -3, -2, -2, -3, -2, 0, -2, -2, -3, -3, low, + -2, 0, 1, 0, -3, 1, 0, -2, 10, -3, -2, -1, 0, -2, -2, -1, -2, -3, 2, -3, low, + -1, -3, -2, -4, -3, -2, -3, -4, -3, 5, 2, -3, 2, 0, -2, -2, -1, -2, 0, 3, low, + -1, -2, -3, -3, -2, -2, -2, -3, -2, 2, 5, -3, 2, 1, -3, -3, -1, -2, 0, 1, low, + -1, 3, 0, 0, -3, 1, 1, -2, -1, -3, -3, 5, -1, -3, -1, -1, -1, -2, -1, -2, low, + -1, -1, -2, -3, -2, 0, -2, -2, 0, 2, 2, -1, 6, 0, -2, -2, -1, -2, 0, 1, low, + -2, -2, -2, -4, -2, -4, -3, -3, -2, 0, 1, -3, 0, 8, -3, -2, -1, 1, 3, 0, low, + -1, -2, -2, -1, -4, -1, 0, -2, -2, -2, -3, -1, -2, -3, 9, -1, -1, -3, -3, -3, low, + 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -3, -1, -2, -2, -1, 4, 2, -4, -2, -1, low, + 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -1, -1, 2, 5, -3, -1, 0, low, + -2, -2, -4, -4, -5, -2, -3, -2, -3, -2, -2, -2, -2, 1, -3, -4, -3, 15, 3, -3, low, + -2, -1, -2, -2, -3, -1, -2, -3, 2, 0, 0, -1, 0, 3, -3, -2, -1, 3, 8, -1, low, + 0, -2, -3, -3, -1, -3, -3, -3, -3, 3, 1, -2, 1, 0, -3, -1, 0, -3, -1, 5, low, + low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, + }; + } + + static constexpr std::array, dim> get2D() { + auto flat = get1D(); + std::array, dim> result{}; + for(int y = 0; y < dim; y++){ + for(int x = 0; x < dim; x++){ + result[y][x] = flat[y * dim + x]; + } + } + return result; + } +}; + +struct 
BLOSUM50_20{ + static constexpr std::int8_t low = -5; + static constexpr int dim = 20+1; + + static constexpr std::array get1D() { + return { + // A R N D C Q E G H I L K M F P S T W Y V, other + 5, -2, -1, -2, -1, -1, -1, 0, -2, -1, -2, -1, -1, -3, -1, 1, 0, -3, -2, 0, low, + -2, 7, -1, -2, -4, 1, 0, -3, 0, -4, -3, 3, -2, -3, -3, -1, -1, -3, -1, -3, low, + -1, -1, 7, 2, -2, 0, 0, 0, 1, -3, -4, 0, -2, -4, -2, 1, 0, -4, -2, -3, low, + -2, -2, 2, 8, -4, 0, 2, -1, -1, -4, -4, -1, -4, -5, -1, 0, -1, -5, -3, -4, low, + -1, -4, -2, -4, 13, -3, -3, -3, -3, -2, -2, -3, -2, -2, -4, -1, -1, -5, -3, -1, low, + -1, 1, 0, 0, -3, 7, 2, -2, 1, -3, -2, 2, 0, -4, -1, 0, -1, -1, -1, -3 , low, + -1, 0, 0, 2, -3, 2, 6, -3, 0, -4, -3, 1, -2, -3, -1, -1, -1, -3, -2, -3 , low, + 0, -3, 0, -1, -3, -2, -3, 8, -2, -4, -4, -2, -3, -4, -2, 0, -2, -3, -3, -4 , low, + -2, 0, 1, -1, -3, 1, 0, -2, 10, -4, -3, 0, -1, -1, -2, -1, -2, -3, 2, -4 , low, + -1, -4, -3, -4, -2, -3, -4, -4, -4, 5, 2, -3, 2, 0, -3, -3, -1, -3, -1, 4 , low, + -2, -3, -4, -4, -2, -2, -3, -4, -3, 2, 5, -3, 3, 1, -4, -3, -1, -2, -1, 1 , low, + -1, 3, 0, -1, -3, 2, 1, -2, 0, -3, -3, 6, -2, -4, -1, 0, -1, -3, -2, -3 , low, + -1, -2, -2, -4, -2, 0, -2, -3, -1, 2, 3, -2, 7, 0, -3, -2, -1, -1, 0, 1 , low, + -3, -3, -4, -5, -2, -4, -3, -4, -1, 0, 1, -4, 0, 8, -4, -3, -2, 1, 4, -1 , low, + -1, -3, -2, -1, -4, -1, -1, -2, -2, -3, -4, -1, -3, -4, 10, -1, -1, -4, -3, -3 , low, + 1, -1, 1, 0, -1, 0, -1, 0, -1, -3, -3, 0, -2, -3, -1, 5, 2, -4, -2, -2 , low, + 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 2, 5, -3, -2, 0 , low, + -3, -3, -4, -5, -5, -1, -3, -3, -3, -3, -2, -3, -1, 1, -4, -4, -3, 15, 2, -3, low, + -2, -1, -2, -3, -3, -1, -2, -3, 2, -1, -1, -2, 0, 4, -3, -2, -2, 2, 8, -1 , low, + 0, -3, -3, -4, -1, -3, -3, -4, -4, 4, 1, -3, 1, -1, -3, -2, 0, -3, -1, 5, low, + low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low + }; + } + + static constexpr std::array, dim> get2D() { + auto flat = get1D(); + std::array, dim> result{}; + for(int y = 0; y < dim; y++){ + for(int x = 0; x < dim; x++){ + result[y][x] = flat[y * dim + x]; + } + } + return result; + } +}; + +struct BLOSUM62_20{ + static constexpr std::int8_t low = -4; + static constexpr int dim = 20+1; + + static constexpr std::array get1D() { + return { + // A R N D C Q E G H I L K M F P S T W Y V, other + 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, low, // A + -1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, low,// R + -2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, low,// N + -2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, low,// D + 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, low,// C + -1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, low,// Q + -1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, low,// E + 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, low,// G + -2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, low,// H + -1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, low,// I + -1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, low,// L + -1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, low,// K + -1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, low,// M + -2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, 
-4, -2, -2, 1, 3, -1, low,// F + -1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, low,// P + 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, low,// S + 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, low,// T + -3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, low,// W + -2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, low,// Y + 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, low,// V + low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low + }; + } + + static constexpr std::array, dim> get2D() { + auto flat = get1D(); + std::array, dim> result{}; + for(int y = 0; y < dim; y++){ + for(int x = 0; x < dim; x++){ + result[y][x] = flat[y * dim + x]; + } + } + return result; + } +}; + + + +struct BLOSUM80_20{ + static constexpr std::int8_t low = -6; + static constexpr int dim = 20 + 1; + + static constexpr std::array get1D() { + return { + //A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V, other + 5, -2, -2, -2, -1, -1, -1, 0, -2, -2, -2, -1, -1, -3, -1, 1, 0, -3, -2, 0, low, + -2, 6, -1, -2, -4, 1, -1, -3, 0, -3, -3, 2, -2, -4, -2, -1, -1, -4, -3, -3, low, + -2, -1, 6, 1, -3, 0, -1, -1, 0, -4, -4, 0, -3, -4, -3, 0, 0, -4, -3, -4, low, + -2, -2, 1, 6, -4, -1, 1, -2, -2, -4, -5, -1, -4, -4, -2, -1, -1, -6, -4, -4, low, + -1, -4, -3, -4, 9, -4, -5, -4, -4, -2, -2, -4, -2, -3, -4, -2, -1, -3, -3, -1, low, + -1, 1, 0, -1, -4, 6, 2, -2, 1, -3, -3, 1, 0, -4, -2, 0, -1, -3, -2, -3, low, + -1, -1, -1, 1, -5, 2, 6, -3, 0, -4, -4, 1, -2, -4, -2, 0, -1, -4, -3, -3, low, + 0, -3, -1, -2, -4, -2, -3, 6, -3, -5, -4, -2, -4, -4, -3, -1, -2, -4, -4, -4, low, + -2, 0, 0, -2, -4, 1, 0, -3, 8, -4, -3, -1, -2, -2, -3, -1, -2, -3, 2, -4, low, + -2, -3, -4, -4, -2, -3, -4, -5, -4, 5, 1, -3, 1, -1, -4, -3, -1, -3, -2, 3, low, + -2, -3, -4, -5, -2, -3, -4, -4, -3, 1, 4, -3, 2, 0, -3, -3, -2, -2, -2, 1, low, + -1, 2, 0, -1, -4, 1, 1, -2, -1, -3, -3, 5, -2, -4, -1, -1, -1, -4, -3, -3, low, + -1, -2, -3, -4, -2, 0, -2, -4, -2, 1, 2, -2, 6, 0, -3, -2, -1, -2, -2, 1, low, + -3, -4, -4, -4, -3, -4, -4, -4, -2, -1, 0, -4, 0, 6, -4, -3, -2, 0, 3, -1, low, + -1, -2, -3, -2, -4, -2, -2, -3, -3, -4, -3, -1, -3, -4, 8, -1, -2, -5, -4, -3, low, + 1, -1, 0, -1, -2, 0, 0, -1, -1, -3, -3, -1, -2, -3, -1, 5, 1, -4, -2, -2, low, + 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -2, -1, -1, -2, -2, 1, 5, -4, -2, 0, low, + -3, -4, -4, -6, -3, -3, -4, -4, -3, -3, -2, -4, -2, 0, -5, -4, -4, 11, 2, -3, low, + -2, -3, -3, -4, -3, -2, -3, -4, 2, -2, -2, -3, -2, 3, -4, -2, -2, 2, 7, -2, low, + 0, -3, -4, -4, -1, -3, -3, -4, -4, 3, 1, -3, 1, -1, -3, -2, 0, -3, -2, 4, low, + low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, low, + }; + } + + static constexpr std::array, dim> get2D() { + auto flat = get1D(); + std::array, dim> result{}; + for(int y = 0; y < dim; y++){ + for(int x = 0; x < dim; x++){ + result[y][x] = flat[y * dim + x]; + } + } + return result; + } +}; + + + +struct BLOSUM45{ + static constexpr std::int8_t low = -5; + static constexpr int dim = 25; + + static constexpr std::array get1D() { + return { + //A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V, B, J, Z, X, * + 5, -2, -1, -2, -1, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -2, -2, 0, -1, -1, -1, -1, -5, + -2, 7, 0, -1, -3, 1, 0, -2, 0, -3, -2, 3, -1, -2, -2, -1, -1, -2, -1, -2, -1, -3, 1, -1, -5, + -1, 0, 
6, 2, -2, 0, 0, 0, 1, -2, -3, 0, -2, -2, -2, 1, 0, -4, -2, -3, 5, -3, 0, -1, -5, + -2, -1, 2, 7, -3, 0, 2, -1, 0, -4, -3, 0, -3, -4, -1, 0, -1, -4, -2, -3, 6, -3, 1, -1, -5, + -1, -3, -2, -3, 12, -3, -3, -3, -3, -3, -2, -3, -2, -2, -4, -1, -1, -5, -3, -1, -2, -2, -3, -1, -5, + -1, 1, 0, 0, -3, 6, 2, -2, 1, -2, -2, 1, 0, -4, -1, 0, -1, -2, -1, -3, 0, -2, 4, -1, -5, + -1, 0, 0, 2, -3, 2, 6, -2, 0, -3, -2, 1, -2, -3, 0, 0, -1, -3, -2, -3, 1, -3, 5, -1, -5, + 0, -2, 0, -1, -3, -2, -2, 7, -2, -4, -3, -2, -2, -3, -2, 0, -2, -2, -3, -3, -1, -4, -2, -1, -5, + -2, 0, 1, 0, -3, 1, 0, -2, 10, -3, -2, -1, 0, -2, -2, -1, -2, -3, 2, -3, 0, -2, 0, -1, -5, + -1, -3, -2, -4, -3, -2, -3, -4, -3, 5, 2, -3, 2, 0, -2, -2, -1, -2, 0, 3, -3, 4, -3, -1, -5, + -1, -2, -3, -3, -2, -2, -2, -3, -2, 2, 5, -3, 2, 1, -3, -3, -1, -2, 0, 1, -3, 4, -2, -1, -5, + -1, 3, 0, 0, -3, 1, 1, -2, -1, -3, -3, 5, -1, -3, -1, -1, -1, -2, -1, -2, 0, -3, 1, -1, -5, + -1, -1, -2, -3, -2, 0, -2, -2, 0, 2, 2, -1, 6, 0, -2, -2, -1, -2, 0, 1, -2, 2, -1, -1, -5, + -2, -2, -2, -4, -2, -4, -3, -3, -2, 0, 1, -3, 0, 8, -3, -2, -1, 1, 3, 0, -3, 1, -3, -1, -5, + -1, -2, -2, -1, -4, -1, 0, -2, -2, -2, -3, -1, -2, -3, 9, -1, -1, -3, -3, -3, -2, -3, -1, -1, -5, + 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -3, -1, -2, -2, -1, 4, 2, -4, -2, -1, 0, -2, 0, -1, -5, + 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -1, -1, 2, 5, -3, -1, 0, 0, -1, -1, -1, -5, + -2, -2, -4, -4, -5, -2, -3, -2, -3, -2, -2, -2, -2, 1, -3, -4, -3, 15, 3, -3, -4, -2, -2, -1, -5, + -2, -1, -2, -2, -3, -1, -2, -3, 2, 0, 0, -1, 0, 3, -3, -2, -1, 3, 8, -1, -2, 0, -2, -1, -5, + 0, -2, -3, -3, -1, -3, -3, -3, -3, 3, 1, -2, 1, 0, -3, -1, 0, -3, -1, 5, -3, 2, -3, -1, -5, + -1, -1, 5, 6, -2, 0, 1, -1, 0, -3, -3, 0, -2, -3, -2, 0, 0, -4, -2, -3, 5, -3, 1, -1, -5, + -1, -3, -3, -3, -2, -2, -3, -4, -2, 4, 4, -3, 2, 1, -3, -2, -1, -2, 0, 2, -3, 4, -2, -1, -5, + -1, 1, 0, 1, -3, 4, 5, -2, 0, -3, -2, 1, -1, -3, -1, 0, -1, -2, -2, -3, 1, -2, 5, -1, -5, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -5, + -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 1, + }; + } + + static constexpr std::array, dim> get2D() { + auto flat = get1D(); + std::array, dim> result{}; + for(int y = 0; y < dim; y++){ + for(int x = 0; x < dim; x++){ + result[y][x] = flat[y * dim + x]; + } + } + return result; + } +}; + +struct BLOSUM50{ + static constexpr std::int8_t low = -5; + static constexpr int dim = 25; + + static constexpr std::array get1D() { + return { + //A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V, B, J, Z, X, * + 5, -2, -1, -2, -1, -1, -1, 0, -2, -1, -2, -1, -1, -3, -1, 1, 0, -3, -2, 0, -2, -2, -1, -1, -5, + -2, 7, -1, -2, -4, 1, 0, -3, 0, -4, -3, 3, -2, -3, -3, -1, -1, -3, -1, -3, -1, -3, 0, -1, -5, + -1, -1, 7, 2, -2, 0, 0, 0, 1, -3, -4, 0, -2, -4, -2, 1, 0, -4, -2, -3, 5, -4, 0, -1, -5, + -2, -2, 2, 8, -4, 0, 2, -1, -1, -4, -4, -1, -4, -5, -1, 0, -1, -5, -3, -4, 6, -4, 1, -1, -5, + -1, -4, -2, -4, 13, -3, -3, -3, -3, -2, -2, -3, -2, -2, -4, -1, -1, -5, -3, -1, -3, -2, -3, -1, -5, + -1, 1, 0, 0, -3, 7, 2, -2, 1, -3, -2, 2, 0, -4, -1, 0, -1, -1, -1, -3, 0, -3, 4, -1, -5, + -1, 0, 0, 2, -3, 2, 6, -3, 0, -4, -3, 1, -2, -3, -1, -1, -1, -3, -2, -3, 1, -3, 5, -1, -5, + 0, -3, 0, -1, -3, -2, -3, 8, -2, -4, -4, -2, -3, -4, -2, 0, -2, -3, -3, -4, -1, -4, -2, -1, -5, + -2, 0, 1, -1, -3, 1, 0, -2, 10, -4, -3, 0, -1, -1, -2, -1, -2, -3, 2, -4, 0, -3, 0, -1, -5, + -1, -4, -3, -4, -2, -3, -4, -4, -4, 5, 2, -3, 2, 
0, -3, -3, -1, -3, -1, 4, -4, 4, -3, -1, -5, + -2, -3, -4, -4, -2, -2, -3, -4, -3, 2, 5, -3, 3, 1, -4, -3, -1, -2, -1, 1, -4, 4, -3, -1, -5, + -1, 3, 0, -1, -3, 2, 1, -2, 0, -3, -3, 6, -2, -4, -1, 0, -1, -3, -2, -3, 0, -3, 1, -1, -5, + -1, -2, -2, -4, -2, 0, -2, -3, -1, 2, 3, -2, 7, 0, -3, -2, -1, -1, 0, 1, -3, 2, -1, -1, -5, + -3, -3, -4, -5, -2, -4, -3, -4, -1, 0, 1, -4, 0, 8, -4, -3, -2, 1, 4, -1, -4, 1, -4, -1, -5, + -1, -3, -2, -1, -4, -1, -1, -2, -2, -3, -4, -1, -3, -4, 10, -1, -1, -4, -3, -3, -2, -3, -1, -1, -5, + 1, -1, 1, 0, -1, 0, -1, 0, -1, -3, -3, 0, -2, -3, -1, 5, 2, -4, -2, -2, 0, -3, 0, -1, -5, + 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 2, 5, -3, -2, 0, 0, -1, -1, -1, -5, + -3, -3, -4, -5, -5, -1, -3, -3, -3, -3, -2, -3, -1, 1, -4, -4, -3, 15, 2, -3, -5, -2, -2, -1, -5, + -2, -1, -2, -3, -3, -1, -2, -3, 2, -1, -1, -2, 0, 4, -3, -2, -2, 2, 8, -1, -3, -1, -2, -1, -5, + 0, -3, -3, -4, -1, -3, -3, -4, -4, 4, 1, -3, 1, -1, -3, -2, 0, -3, -1, 5, -3, 2, -3, -1, -5, + -2, -1, 5, 6, -3, 0, 1, -1, 0, -4, -4, 0, -3, -4, -2, 0, 0, -5, -3, -3, 6, -4, 1, -1, -5, + -2, -3, -4, -4, -2, -3, -3, -4, -3, 4, 4, -3, 2, 1, -3, -3, -1, -2, -1, 2, -4, 4, -3, -1, -5, + -1, 0, 0, 1, -3, 4, 5, -2, 0, -3, -3, 1, -1, -4, -1, 0, -1, -2, -2, -3, 1, -3, 5, -1, -5, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -5, + -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 1, + + }; + } + + static constexpr std::array, dim> get2D() { + auto flat = get1D(); + std::array, dim> result{}; + for(int y = 0; y < dim; y++){ + for(int x = 0; x < dim; x++){ + result[y][x] = flat[y * dim + x]; + } + } + return result; + } +}; + + +struct BLOSUM62{ + static constexpr std::int8_t low = -4; + static constexpr int dim = 25; + + static constexpr std::array get1D() { + return { + // A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V, B, J, Z, X, * + 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, -1, -1, -4, + -1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, -2, 0, -1, -4, + -2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 4, -3, 0, -1, -4, + -2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, -3, 1, -1, -4, + 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -1, -3, -1, -4, + -1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, -2, 4, -1, -4, + -1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, -3, 4, -1, -4, + 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -4, -2, -1, -4, + -2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, -3, 0, -1, -4, + -1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, 3, -3, -1, -4, + -1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, 3, -3, -1, -4, + -1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, -3, 1, -1, -4, + -1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, 2, -1, -1, -4, + -2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, 0, -3, -1, -4, + -1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -3, -1, -1, -4, + 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, -2, 0, -1, -4, + 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 
0, -1, -1, -1, -1, -4, + -3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -2, -2, -1, -4, + -2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -1, -2, -1, -4, + 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, 2, -2, -1, -4, + -2, -1, 4, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, -3, 0, -1, -4, + -1, -2, -3, -3, -1, -2, -3, -4, -3, 3, 3, -3, 2, 0, -3, -2, -1, -2, -1, 2, -3, 3, -3, -1, -4, + -1, 0, 0, 1, -3, 4, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -2, -2, -2, 0, -3, 4, -1, -4, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1, + }; + } + + static constexpr std::array, dim> get2D() { + auto flat = get1D(); + std::array, dim> result{}; + for(int y = 0; y < dim; y++){ + for(int x = 0; x < dim; x++){ + result[y][x] = flat[y * dim + x]; + } + } + return result; + } +}; + + + +struct BLOSUM80{ + static constexpr std::int8_t low = -6; + static constexpr int dim = 25; + + static constexpr std::array get1D() { + return { + // A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V, B, J, Z, X, * + 5, -2, -2, -2, -1, -1, -1, 0, -2, -2, -2, -1, -1, -3, -1, 1, 0, -3, -2, 0, -2, -2, -1, -1, -6, + -2, 6, -1, -2, -4, 1, -1, -3, 0, -3, -3, 2, -2, -4, -2, -1, -1, -4, -3, -3, -1, -3, 0, -1, -6, + -2, -1, 6, 1, -3, 0, -1, -1, 0, -4, -4, 0, -3, -4, -3, 0, 0, -4, -3, -4, 5, -4, 0, -1, -6, + -2, -2, 1, 6, -4, -1, 1, -2, -2, -4, -5, -1, -4, -4, -2, -1, -1, -6, -4, -4, 5, -5, 1, -1, -6, + -1, -4, -3, -4, 9, -4, -5, -4, -4, -2, -2, -4, -2, -3, -4, -2, -1, -3, -3, -1, -4, -2, -4, -1, -6, + -1, 1, 0, -1, -4, 6, 2, -2, 1, -3, -3, 1, 0, -4, -2, 0, -1, -3, -2, -3, 0, -3, 4, -1, -6, + -1, -1, -1, 1, -5, 2, 6, -3, 0, -4, -4, 1, -2, -4, -2, 0, -1, -4, -3, -3, 1, -4, 5, -1, -6, + 0, -3, -1, -2, -4, -2, -3, 6, -3, -5, -4, -2, -4, -4, -3, -1, -2, -4, -4, -4, -1, -5, -3, -1, -6, + -2, 0, 0, -2, -4, 1, 0, -3, 8, -4, -3, -1, -2, -2, -3, -1, -2, -3, 2, -4, -1, -4, 0, -1, -6, + -2, -3, -4, -4, -2, -3, -4, -5, -4, 5, 1, -3, 1, -1, -4, -3, -1, -3, -2, 3, -4, 3, -4, -1, -6, + -2, -3, -4, -5, -2, -3, -4, -4, -3, 1, 4, -3, 2, 0, -3, -3, -2, -2, -2, 1, -4, 3, -3, -1, -6, + -1, 2, 0, -1, -4, 1, 1, -2, -1, -3, -3, 5, -2, -4, -1, -1, -1, -4, -3, -3, -1, -3, 1, -1, -6, + -1, -2, -3, -4, -2, 0, -2, -4, -2, 1, 2, -2, 6, 0, -3, -2, -1, -2, -2, 1, -3, 2, -1, -1, -6, + -3, -4, -4, -4, -3, -4, -4, -4, -2, -1, 0, -4, 0, 6, -4, -3, -2, 0, 3, -1, -4, 0, -4, -1, -6, + -1, -2, -3, -2, -4, -2, -2, -3, -3, -4, -3, -1, -3, -4, 8, -1, -2, -5, -4, -3, -2, -4, -2, -1, -6, + 1, -1, 0, -1, -2, 0, 0, -1, -1, -3, -3, -1, -2, -3, -1, 5, 1, -4, -2, -2, 0, -3, 0, -1, -6, + 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -2, -1, -1, -2, -2, 1, 5, -4, -2, 0, -1, -1, -1, -1, -6, + -3, -4, -4, -6, -3, -3, -4, -4, -3, -3, -2, -4, -2, 0, -5, -4, -4, 11, 2, -3, -5, -3, -3, -1, -6, + -2, -3, -3, -4, -3, -2, -3, -4, 2, -2, -2, -3, -2, 3, -4, -2, -2, 2, 7, -2, -3, -2, -3, -1, -6, + 0, -3, -4, -4, -1, -3, -3, -4, -4, 3, 1, -3, 1, -1, -3, -2, 0, -3, -2, 4, -4, 2, -3, -1, -6, + -2, -1, 5, 5, -4, 0, 1, -1, -1, -4, -4, -1, -3, -4, -2, 0, -1, -5, -3, -4, 5, -4, 0, -1, -6, + -2, -3, -4, -5, -2, -3, -4, -5, -4, 3, 3, -3, 2, 0, -4, -3, -1, -3, -2, 2, -4, 3, -3, -1, -6, + -1, 0, 0, 1, -4, 4, 5, -3, 0, -4, -3, 1, -1, -4, -2, 0, -1, -3, -3, -3, 0, -3, 5, -1, -6, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -6, + -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, 1, + + }; + } + + static constexpr std::array, dim> get2D() { + auto flat = get1D(); + std::array, dim> result{}; + for(int y = 0; y < dim; y++){ + for(int x = 0; x < dim; x++){ + result[y][x] = flat[y * dim + x]; + } + } + return result; + } +}; + + + + +__inline__ +std::string to_string(cudasw4::ScanType type){ + switch(type){ + case cudasw4::ScanType::Gapless: return "Gapless"; break; + case cudasw4::ScanType::SW_Endpos: return "SW with end positions"; break; + case cudasw4::ScanType::GaplessPlusSW_Endpos: return "Gapless + SW with end positions"; break; + default: return "Missing name for ScanType"; + } +} + + +__inline__ +std::string to_string(cudasw4::BlosumType type){ + switch(type){ + case cudasw4::BlosumType::BLOSUM45: return "BLOSUM45"; + case cudasw4::BlosumType::BLOSUM50: return "BLOSUM50"; + case cudasw4::BlosumType::BLOSUM62: return "BLOSUM62"; + case cudasw4::BlosumType::BLOSUM80: return "BLOSUM80"; + case cudasw4::BlosumType::BLOSUM45_20: return "BLOSUM45 (20)"; + case cudasw4::BlosumType::BLOSUM50_20: return "BLOSUM50 (20)"; + case cudasw4::BlosumType::BLOSUM62_20: return "BLOSUM62 (20)"; + case cudasw4::BlosumType::BLOSUM80_20: return "BLOSUM80 (20)"; + default: return "FORGOT TO NAME THIS BLOSUM TYPE"; + } +} + +__inline__ +std::string to_string_nodim(cudasw4::BlosumType type){ + switch(type){ + case cudasw4::BlosumType::BLOSUM45: return "BLOSUM45"; + case cudasw4::BlosumType::BLOSUM50: return "BLOSUM50"; + case cudasw4::BlosumType::BLOSUM62: return "BLOSUM62"; + case cudasw4::BlosumType::BLOSUM80: return "BLOSUM80"; + case cudasw4::BlosumType::BLOSUM45_20: return "BLOSUM45"; + case cudasw4::BlosumType::BLOSUM50_20: return "BLOSUM50"; + case cudasw4::BlosumType::BLOSUM62_20: return "BLOSUM62"; + case cudasw4::BlosumType::BLOSUM80_20: return "BLOSUM80"; + default: return "FORGOT TO NAME THIS BLOSUM TYPE"; + } +} + +} //namespace cudasw4 + + +#endif \ No newline at end of file diff --git a/lib/libmarv/src/util.cuh b/lib/libmarv/src/util.cuh new file mode 100644 index 000000000..8ceddabee --- /dev/null +++ b/lib/libmarv/src/util.cuh @@ -0,0 +1,277 @@ +#ifndef UTIL_CUH +#define UTIL_CUH + +#include "config.hpp" +#include "hpc_helpers/all_helpers.cuh" + +#include +#include +#include + + + +namespace cudasw4{ + + +template +struct SharedPSSM_singletile{ + static_assert(16 % sizeof(T) == 0); + //each row is padded to 16 bytes + static constexpr int numPaddedColumns = SDIV(numColumns, 16/sizeof(T)) * 16/sizeof(T); + alignas(16) T data[numRows][numPaddedColumns]; +}; + +template +struct SmemIndexCalculator{ + static constexpr int factor = factor_; + + __device__ + int getIndex(int ithChunkOfFour){ + constexpr int groupsizeForSmem = factor*groupsize; + const int groupLaneForSmem = threadIdx.x % groupsizeForSmem; + return 4*(groupLaneForSmem+ithChunkOfFour*groupsizeForSmem); + } +}; + + +template struct Vectorized2; +template<> struct Vectorized2{ using type = int2; }; +template<> struct Vectorized2{ using type = float2; }; + +template struct Vectorized4; +template<> struct Vectorized4{ using type = int4; }; +template<> struct Vectorized4{ using type = float4; }; + +template +struct ScoreWithExtra{ + Score score; + Extra extra; + + ScoreWithExtra() = default; + + __host__ __device__ + ScoreWithExtra(Score s, Extra e) : score(s), extra(e){} + + __host__ __device__ + Score getScore() const{ + return score; + } + + __host__ __device__ + Extra getExtra() 
const{ + return extra; + } +}; + + + + + + +template +struct thrust_async_allocator : public thrust::device_malloc_allocator { +public: + using Base = thrust::device_malloc_allocator; + using pointer = typename Base::pointer; + using size_type = typename Base::size_type; + + thrust_async_allocator(cudaStream_t stream_) : stream{stream_} {} + + pointer allocate(size_type num){ + //std::cout << "allocate " << num << "\n"; + T* result = nullptr; + cudaError_t status = cudaMallocAsync(&result, sizeof(T) * num, stream); + if(status != cudaSuccess){ + throw std::runtime_error("thrust_async_allocator error allocate"); + } + return thrust::device_pointer_cast(result); + } + + void deallocate(pointer ptr, size_type /*num*/){ + //std::cout << "deallocate \n"; + cudaError_t status = cudaFreeAsync(thrust::raw_pointer_cast(ptr), stream); + if(status != cudaSuccess){ + throw std::runtime_error("thrust_async_allocator error deallocate"); + } + } + +private: + cudaStream_t stream; +}; + +template +struct thrust_preallocated_single_allocator : public thrust::device_malloc_allocator { +public: + using Base = thrust::device_malloc_allocator; + using pointer = typename Base::pointer; + using size_type = typename Base::size_type; + + thrust_preallocated_single_allocator(void* ptr, size_t size) : preallocated{ptr}, preallocatedSize{size} {} + + pointer allocate(size_type num){ + if(!free){ + throw std::runtime_error("thrust_async_allocator error allocate"); + }else{ + if(sizeof(T) * num <= preallocatedSize){ + T* result = (T*)preallocated; + free = false; + return thrust::device_pointer_cast(result); + }else{ + throw std::runtime_error("thrust_async_allocator error allocate"); + } + } + } + + void deallocate(pointer ptr, size_type /*num*/){ + if(free){ + throw std::runtime_error("thrust_async_allocator error deallocate"); + }else{ + T* result = thrust::raw_pointer_cast(ptr); + if((void*) result != preallocated){ + throw std::runtime_error("thrust_async_allocator error deallocate"); + } + free = true; + } + } + +private: + bool free = true; + void* preallocated; + size_t preallocatedSize; + cudaStream_t stream; +}; + +//Call cudaSetDevice on destruction +struct RevertDeviceId{ + RevertDeviceId(){ + cudaGetDevice(&id); + } + RevertDeviceId(int id_) : id(id_){} + ~RevertDeviceId(){ + cudaSetDevice(id); + } + int id; +}; + + +//template +struct TopNMaximaArray{ + struct Ref{ + size_t index; + size_t indexOffset; + float* d_scores; + ReferenceIdT* d_indices; + size_t size; + + __device__ + Ref& operator=(float newscore){ + d_scores[index] = newscore; + d_indices[index] = indexOffset + index; + return *this; + } + }; + + TopNMaximaArray(float* d_scores_, ReferenceIdT* d_indices_, size_t offset, size_t size_) + : indexOffset(offset), d_scores(d_scores_), d_indices(d_indices_), size(size_){} + + template + __device__ + Ref operator[](Index index) const{ + Ref r; + r.index = index; + r.indexOffset = indexOffset; + r.d_scores = d_scores; + r.d_indices = d_indices; + r.size = size; + return r; + } + + void setAllScoresToZero(cudaStream_t stream){ + thrust::fill( + thrust::cuda::par_nosync.on(stream), + d_scores, + d_scores + size, + 0 + ); + thrust::sequence( + thrust::cuda::par_nosync.on(stream), + d_indices, + d_indices + size, + ReferenceIdT(0) + ); + } + + size_t indexOffset = 0; + float* d_scores; + ReferenceIdT* d_indices; + size_t size; +}; + + +template +struct TopNMaximaArrayWithExtra{ + struct Ref{ + size_t index; + size_t indexOffset; + float* d_scores; + ReferenceIdT* d_indices; + ExtraData* d_extras; + size_t 
size; + + template + __device__ + Ref& operator=(const Payload& payload){ + d_scores[index] = payload.getScore(); + d_indices[index] = indexOffset + index; + d_extras[index] = payload.getExtra(); + return *this; + } + }; + + TopNMaximaArrayWithExtra(float* d_scores_, ReferenceIdT* d_indices_, ExtraData* d_extras_, size_t offset, size_t size_) + : indexOffset(offset), d_scores(d_scores_), d_indices(d_indices_), d_extras(d_extras_), size(size_){} + + template + __device__ + Ref operator[](Index index) const{ + Ref r; + r.index = index; + r.indexOffset = indexOffset; + r.d_scores = d_scores; + r.d_indices = d_indices; + r.d_extras = d_extras; + r.size = size; + return r; + } + + void setAllScoresToZero(cudaStream_t stream){ + thrust::fill( + thrust::cuda::par_nosync.on(stream), + d_scores, + d_scores + size, + 0 + ); + thrust::sequence( + thrust::cuda::par_nosync.on(stream), + d_indices, + d_indices + size, + ReferenceIdT(0) + ); + thrust::fill( + thrust::cuda::par_nosync.on(stream), + d_extras, + d_extras + size, + ExtraData{} + ); + } + + size_t indexOffset = 0; + float* d_scores; + ReferenceIdT* d_indices; + ExtraData* d_extras; + size_t size; +}; + +} + +#endif \ No newline at end of file diff --git a/lib/libmarv/tuningconfigs/A100/gapless.txt b/lib/libmarv/tuningconfigs/A100/gapless.txt new file mode 100644 index 000000000..dc415202e --- /dev/null +++ b/lib/libmarv/tuningconfigs/A100/gapless.txt @@ -0,0 +1,33 @@ +#tilesize groupsize numRegs dpx kernelApproach +32 4 4 0 1 +64 4 8 0 0 +96 4 12 0 1 +128 4 16 0 0 +160 4 20 0 1 +192 4 24 0 1 +224 4 28 0 1 +256 4 32 0 1 +288 4 36 0 1 +320 4 40 0 1 +352 4 44 0 1 +384 4 48 0 1 +416 4 52 0 0 +448 4 56 0 0 +480 4 60 0 0 +512 4 64 0 1 +576 8 36 0 0 +640 8 40 0 0 +704 8 44 0 0 +768 8 48 0 0 +832 8 52 0 0 +896 8 56 0 0 +960 8 60 0 0 +1024 8 64 0 0 +1152 16 36 0 1 +1280 16 40 0 0 +1408 16 44 0 0 +1536 16 48 0 0 +1664 16 52 0 0 +1792 16 56 0 0 +1920 16 60 0 0 +2048 16 64 0 0 diff --git a/lib/libmarv/tuningconfigs/A100/swendpos.txt b/lib/libmarv/tuningconfigs/A100/swendpos.txt new file mode 100644 index 000000000..0bbd031ce --- /dev/null +++ b/lib/libmarv/tuningconfigs/A100/swendpos.txt @@ -0,0 +1,27 @@ +#tilesize groupsize numRegs dpx kernelApproach +16 4 4 0 999999 +32 4 8 0 999999 +48 4 12 0 999999 +64 4 16 0 999999 +80 4 20 0 999999 +96 4 24 0 999999 +112 4 28 0 999999 +128 8 16 0 999999 +144 4 36 0 999999 +160 8 20 0 999999 +176 4 44 0 999999 +192 8 24 0 999999 +224 8 28 0 999999 +256 8 32 0 999999 +288 8 36 0 999999 +320 16 20 0 999999 +352 8 44 0 999999 +384 16 24 0 999999 +448 16 28 0 999999 +512 16 32 0 999999 +576 16 36 0 999999 +640 32 20 0 999999 +704 16 44 0 999999 +768 32 24 0 999999 +896 32 28 0 999999 +1024 32 32 0 999999 diff --git a/lib/libmarv/tuningconfigs/GraceHopper/gapless.txt b/lib/libmarv/tuningconfigs/GraceHopper/gapless.txt new file mode 100644 index 000000000..8ae9b5463 --- /dev/null +++ b/lib/libmarv/tuningconfigs/GraceHopper/gapless.txt @@ -0,0 +1,33 @@ +#tilesize groupsize numRegs dpx kernelApproach +32 4 4 1 1 +64 4 8 1 1 +96 4 12 1 1 +128 4 16 0 0 +160 4 20 1 1 +192 4 24 1 1 +224 4 28 1 1 +256 4 32 1 1 +288 4 36 1 1 +320 4 40 1 1 +352 4 44 1 1 +384 4 48 1 1 +416 4 52 1 1 +448 4 56 1 1 +480 4 60 1 1 +512 8 32 1 1 +576 8 36 1 1 +640 8 40 1 1 +704 8 44 1 1 +768 8 48 1 1 +832 8 52 1 1 +896 8 56 1 1 +960 8 60 1 1 +1024 16 32 1 1 +1152 16 36 1 1 +1280 16 40 1 1 +1408 16 44 1 1 +1536 16 48 1 1 +1664 16 52 1 1 +1792 16 56 1 1 +1920 16 60 1 1 +2048 16 64 0 0 diff --git a/lib/libmarv/tuningconfigs/GraceHopper/swendpos.txt 
b/lib/libmarv/tuningconfigs/GraceHopper/swendpos.txt new file mode 100644 index 000000000..8d10c3266 --- /dev/null +++ b/lib/libmarv/tuningconfigs/GraceHopper/swendpos.txt @@ -0,0 +1,27 @@ +#tilesize groupsize numRegs dpx kernelApproach +16 4 4 1 999999 +32 4 8 1 999999 +48 4 12 1 999999 +64 4 16 1 999999 +80 4 20 1 999999 +96 4 24 1 999999 +112 4 28 1 999999 +128 4 32 1 999999 +144 4 36 1 999999 +160 4 40 1 999999 +176 4 44 1 999999 +192 8 24 1 999999 +224 8 28 1 999999 +256 8 32 1 999999 +288 8 36 1 999999 +320 8 40 1 999999 +352 8 44 1 999999 +384 16 24 1 999999 +448 16 28 1 999999 +512 16 32 1 999999 +576 16 36 1 999999 +640 16 40 1 999999 +704 16 44 1 999999 +768 32 24 1 999999 +896 32 28 1 999999 +1024 32 32 1 999999 diff --git a/lib/libmarv/tuningconfigs/H100/gapless.txt b/lib/libmarv/tuningconfigs/H100/gapless.txt new file mode 100644 index 000000000..b0ed7d42c --- /dev/null +++ b/lib/libmarv/tuningconfigs/H100/gapless.txt @@ -0,0 +1,33 @@ +#tilesize groupsize numRegs dpx kernelApproach +32 4 4 1 1 +64 4 8 1 1 +96 4 12 1 1 +128 4 16 0 0 +160 4 20 1 1 +192 4 24 1 1 +224 4 28 1 1 +256 4 32 1 1 +288 4 36 1 1 +320 4 40 1 1 +352 4 44 1 1 +384 4 48 1 1 +416 4 52 1 1 +448 4 56 1 1 +480 4 60 0 0 +512 8 32 1 1 +576 8 36 1 1 +640 8 40 1 1 +704 8 44 1 1 +768 8 48 1 1 +832 8 52 1 1 +896 8 56 1 1 +960 8 60 1 1 +1024 16 32 1 1 +1152 16 36 1 1 +1280 16 40 1 1 +1408 16 44 1 1 +1536 16 48 1 1 +1664 16 52 1 1 +1792 16 56 1 1 +1920 16 60 1 1 +2048 16 64 0 0 diff --git a/lib/libmarv/tuningconfigs/H100/swendpos.txt b/lib/libmarv/tuningconfigs/H100/swendpos.txt new file mode 100644 index 000000000..8d10c3266 --- /dev/null +++ b/lib/libmarv/tuningconfigs/H100/swendpos.txt @@ -0,0 +1,27 @@ +#tilesize groupsize numRegs dpx kernelApproach +16 4 4 1 999999 +32 4 8 1 999999 +48 4 12 1 999999 +64 4 16 1 999999 +80 4 20 1 999999 +96 4 24 1 999999 +112 4 28 1 999999 +128 4 32 1 999999 +144 4 36 1 999999 +160 4 40 1 999999 +176 4 44 1 999999 +192 8 24 1 999999 +224 8 28 1 999999 +256 8 32 1 999999 +288 8 36 1 999999 +320 8 40 1 999999 +352 8 44 1 999999 +384 16 24 1 999999 +448 16 28 1 999999 +512 16 32 1 999999 +576 16 36 1 999999 +640 16 40 1 999999 +704 16 44 1 999999 +768 32 24 1 999999 +896 32 28 1 999999 +1024 32 32 1 999999 diff --git a/lib/libmarv/tuningconfigs/L4/gapless.txt b/lib/libmarv/tuningconfigs/L4/gapless.txt new file mode 100644 index 000000000..a75431dd7 --- /dev/null +++ b/lib/libmarv/tuningconfigs/L4/gapless.txt @@ -0,0 +1,33 @@ +#tilesize groupsize numRegs dpx kernelApproach +32 4 4 0 1 +64 4 8 0 0 +96 4 12 0 0 +128 4 16 0 0 +160 4 20 0 0 +192 4 24 0 0 +224 4 28 0 0 +256 4 32 0 0 +288 4 36 0 0 +320 4 40 0 0 +352 4 44 0 0 +384 4 48 0 0 +416 4 52 0 0 +448 4 56 0 0 +480 4 60 0 0 +512 4 64 0 0 +576 8 36 0 0 +640 8 40 0 0 +704 8 44 0 0 +768 8 48 0 0 +832 8 52 0 0 +896 8 56 0 1 +960 8 60 0 0 +1024 8 64 0 0 +1152 16 36 0 0 +1280 16 40 0 0 +1408 16 44 0 0 +1536 16 48 0 0 +1664 16 52 0 0 +1792 16 56 0 0 +1920 16 60 0 0 +2048 16 64 0 0 diff --git a/lib/libmarv/tuningconfigs/L4/swendpos.txt b/lib/libmarv/tuningconfigs/L4/swendpos.txt new file mode 100644 index 000000000..2552322a8 --- /dev/null +++ b/lib/libmarv/tuningconfigs/L4/swendpos.txt @@ -0,0 +1,27 @@ +#tilesize groupsize numRegs dpx kernelApproach +16 4 4 0 999999 +32 4 8 0 999999 +48 4 12 0 999999 +64 4 16 0 999999 +80 4 20 0 999999 +96 4 24 0 999999 +112 4 28 0 999999 +128 8 16 0 999999 +144 4 36 0 999999 +160 8 20 0 999999 +176 4 44 0 999999 +192 8 24 0 999999 +224 8 28 0 999999 +256 16 16 0 999999 +288 8 36 0 999999 +320 16 20 0 999999 +352 
8 44 0 999999 +384 16 24 0 999999 +448 16 28 0 999999 +512 32 16 0 999999 +576 16 36 0 999999 +640 32 20 0 999999 +704 16 44 0 999999 +768 32 24 0 999999 +896 32 28 0 999999 +1024 32 32 0 999999 diff --git a/lib/libmarv/tuningconfigs/L40S/gapless.txt b/lib/libmarv/tuningconfigs/L40S/gapless.txt new file mode 100644 index 000000000..3829b92e5 --- /dev/null +++ b/lib/libmarv/tuningconfigs/L40S/gapless.txt @@ -0,0 +1,33 @@ +#tilesize groupsize numRegs dpx kernelApproach +32 4 4 0 1 +64 4 8 0 1 +96 4 12 0 1 +128 4 16 0 1 +160 4 20 0 0 +192 4 24 0 1 +224 4 28 0 0 +256 4 32 0 0 +288 4 36 0 0 +320 4 40 0 0 +352 4 44 0 0 +384 4 48 0 1 +416 4 52 0 0 +448 4 56 0 1 +480 4 60 0 1 +512 4 64 0 0 +576 8 36 0 1 +640 8 40 0 1 +704 8 44 0 1 +768 8 48 0 1 +832 8 52 0 1 +896 8 56 0 1 +960 8 60 0 0 +1024 8 64 0 0 +1152 16 36 0 0 +1280 16 40 0 0 +1408 16 44 0 1 +1536 16 48 0 1 +1664 16 52 0 0 +1792 16 56 0 0 +1920 16 60 0 1 +2048 16 64 0 0 diff --git a/lib/libmarv/tuningconfigs/L40S/swendpos.txt b/lib/libmarv/tuningconfigs/L40S/swendpos.txt new file mode 100644 index 000000000..87020a7e6 --- /dev/null +++ b/lib/libmarv/tuningconfigs/L40S/swendpos.txt @@ -0,0 +1,27 @@ +#tilesize groupsize numRegs dpx kernelApproach +16 4 4 0 999999 +32 4 8 0 999999 +48 4 12 0 999999 +64 4 16 0 999999 +80 4 20 0 999999 +96 4 24 0 999999 +112 4 28 0 999999 +128 4 32 0 999999 +144 4 36 0 999999 +160 4 40 0 999999 +176 4 44 0 999999 +192 8 24 0 999999 +224 8 28 0 999999 +256 16 16 0 999999 +288 8 36 0 999999 +320 16 20 0 999999 +352 8 44 0 999999 +384 16 24 0 999999 +448 16 28 0 999999 +512 32 16 0 999999 +576 16 36 0 999999 +640 32 20 0 999999 +704 16 44 0 999999 +768 32 24 0 999999 +896 32 28 0 999999 +1024 32 32 0 999999 diff --git a/lib/libmarv/tuningconfigs/RTX4090/gapless.txt b/lib/libmarv/tuningconfigs/RTX4090/gapless.txt new file mode 100644 index 000000000..8f03ad8ed --- /dev/null +++ b/lib/libmarv/tuningconfigs/RTX4090/gapless.txt @@ -0,0 +1,33 @@ +#tilesize groupsize numRegs dpx kernelApproach +32 4 4 0 1 +64 4 8 0 0 +96 4 12 0 1 +128 4 16 0 1 +160 4 20 0 0 +192 4 24 0 1 +224 4 28 0 1 +256 4 32 0 1 +288 4 36 0 0 +320 4 40 0 0 +352 4 44 0 1 +384 4 48 0 1 +416 4 52 0 0 +448 4 56 0 0 +480 4 60 0 0 +512 4 64 0 1 +576 8 36 0 1 +640 8 40 0 1 +704 8 44 0 0 +768 8 48 0 0 +832 8 52 0 1 +896 8 56 0 1 +960 8 60 0 1 +1024 8 64 0 1 +1152 16 36 0 1 +1280 16 40 0 0 +1408 16 44 0 1 +1536 16 48 0 1 +1664 16 52 0 1 +1792 16 56 0 1 +1920 16 60 0 1 +2048 16 64 0 0 diff --git a/lib/libmarv/tuningconfigs/RTX4090/swendpos.txt b/lib/libmarv/tuningconfigs/RTX4090/swendpos.txt new file mode 100644 index 000000000..a08d734be --- /dev/null +++ b/lib/libmarv/tuningconfigs/RTX4090/swendpos.txt @@ -0,0 +1,28 @@ +#tilesize groupsize numRegs dpx kernelApproach +16 4 4 0 999999 +32 4 8 0 999999 +48 4 12 0 999999 +64 4 16 0 999999 +80 4 20 0 999999 +96 4 24 0 999999 +112 4 28 0 999999 +128 8 16 0 999999 +144 4 36 0 999999 +160 4 40 0 999999 +176 4 44 0 999999 +192 8 24 0 999999 +224 8 28 0 999999 +256 8 32 0 999999 +288 8 36 0 999999 +320 8 40 0 999999 +352 8 44 0 999999 +384 16 24 0 999999 +448 16 28 0 999999 +512 16 32 0 999999 +576 16 36 0 999999 +640 32 20 0 999999 +704 16 44 0 999999 +768 32 24 0 999999 +896 32 28 0 999999 +1024 32 32 0 999999 + diff --git a/lib/libmarv/tuningconfigs/T4/gapless.txt b/lib/libmarv/tuningconfigs/T4/gapless.txt new file mode 100644 index 000000000..a4fc8f0ae --- /dev/null +++ b/lib/libmarv/tuningconfigs/T4/gapless.txt @@ -0,0 +1,29 @@ +#tilesize groupsize numRegs dpx kernelApproach +32 4 4 0 0 +64 4 8 0 0 +96 4 12 0 0 
+128 4 16 0 0 +160 4 20 0 0 +192 8 12 0 0 +224 4 28 0 0 +256 8 16 0 0 +288 4 36 0 0 +320 8 20 0 0 +352 4 44 0 0 +384 16 12 0 0 +416 4 52 0 0 +448 8 28 0 0 +480 4 60 0 0 +512 16 16 0 0 +576 8 36 0 0 +640 16 20 0 0 +704 8 44 0 0 +768 16 24 0 0 +832 8 52 0 0 +896 16 28 0 0 +960 8 60 0 0 +1024 16 32 0 0 +1152 16 36 0 0 +1280 16 40 0 0 +1408 16 44 0 0 +1536 16 48 0 0 diff --git a/lib/libmarv/tuningconfigs/T4/swendpos.txt b/lib/libmarv/tuningconfigs/T4/swendpos.txt new file mode 100644 index 000000000..fbddf5ab9 --- /dev/null +++ b/lib/libmarv/tuningconfigs/T4/swendpos.txt @@ -0,0 +1,25 @@ +#tilesize groupsize numRegs dpx kernelApproach +16 4 4 0 999999 +32 4 8 0 999999 +48 4 12 0 999999 +64 4 16 0 999999 +80 4 20 0 999999 +96 4 24 0 999999 +112 4 28 0 999999 +128 8 16 0 999999 +144 4 36 0 999999 +160 8 20 0 999999 +176 4 44 0 999999 +192 8 24 0 999999 +224 8 28 0 999999 +256 16 16 0 999999 +288 8 36 0 999999 +320 16 20 0 999999 +352 8 44 0 999999 +384 16 24 0 999999 +448 16 28 0 999999 +512 32 16 0 999999 +576 16 36 0 999999 +640 32 20 0 999999 +704 16 44 0 999999 +768 32 24 0 999999 diff --git a/lib/omptl/Example.cpp b/lib/omptl/Example.cpp deleted file mode 100644 index 97d261e1c..000000000 --- a/lib/omptl/Example.cpp +++ /dev/null @@ -1,43 +0,0 @@ -#include - -#include -#include - -#include -#include -#include -#include - -const unsigned N = 100 * (1 << 23); - -template -struct Sqrt -{ - T operator()(const T &x) const { return std::sqrt(x); } -}; - -int main (int argc, char * const argv[]) -{ - // Number of threads is derived from environment - // variable "OMP_NUM_THREADS" - std::cout << "Threads: " << omp_get_max_threads() << std::endl; - std::cout << "Num: " << N << std::endl; - - std::vector v1(N); - - omptl::generate(v1.begin(), v1.end(), std::rand); - omptl::sort(v1.begin(), v1.end()); - omptl::random_shuffle(v1.begin(), v1.end()); - - std::vector v2(N); - omptl::transform(v1.begin(), v1.end(), v2.begin(), Sqrt()); - std::cout << "Nr 3's: " << omptl::count(v2.begin(), v2.end(), 3) - << std::endl; - std::cout << "Sum: " - << omptl::accumulate(v2.begin(), v2.end(), 0) << std::endl; - - std::cout << *v1.begin() << std::endl; - - return 0; -} - diff --git a/lib/omptl/License.txt b/lib/omptl/License.txt deleted file mode 100644 index 8add30ad5..000000000 --- a/lib/omptl/License.txt +++ /dev/null @@ -1,504 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. 
- - When we speak of free software, we are referring to freedom of use, -not price. Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. - - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. - - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. - - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. 
- - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. 
You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. - - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) 
Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. - - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. 
If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. (It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. 
You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. 
If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. 
It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -Also add information on how to contact you by electronic and paper mail. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James Random Hacker. - - , 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! - - diff --git a/lib/omptl/algorithm b/lib/omptl/algorithm deleted file mode 100644 index 4750868df..000000000 --- a/lib/omptl/algorithm +++ /dev/null @@ -1,561 +0,0 @@ -// Copyright (C) 2006 Fokko Beekhof -// Email contact: Fokko.Beekhof@unige.ch - -// The OMPTL library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. - -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. 
- -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - - -#ifndef OMPTL_ALGORITHM -#define OMPTL_ALGORITHM 1 - -#include -#include - -namespace omptl -{ - -template -ForwardIterator adjacent_find(ForwardIterator first, ForwardIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator adjacent_find(ForwardIterator first, ForwardIterator last, - BinaryPredicate binary_pred, - const unsigned P = _Pfunc::Pfunc()); - -template -bool binary_search(ForwardIterator first, ForwardIterator last, const T& value, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -bool binary_search(ForwardIterator first, ForwardIterator last, const T& value, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator copy(InputIterator first, InputIterator last, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -template -BidirectionalIterator2 copy_backward(BidirectionalIterator1 first, - BidirectionalIterator1 last, - BidirectionalIterator2 result, - const unsigned P = _Pfunc::Pfunc()); - -/* - * Note: implementation assumes that - * ::std::iterator_traits::difference_type(0) will - * return a difference_type representing zero. - */ -template -typename ::std::iterator_traits::difference_type -count(InputIterator first, InputIterator last, const EqualityComparable& value, - const unsigned P = _Pfunc::Pfunc()); - -template -void count(InputIterator first, InputIterator last, - const EqualityComparable& value, - Size& n, const unsigned P = _Pfunc::Pfunc()); - -template -typename InputIterator::difference_type -count_if(InputIterator first, InputIterator last, Predicate pred, - const unsigned P = _Pfunc::Pfunc()); - -template -void count_if(InputIterator first, InputIterator last, - Predicate pred, Size& n, const unsigned P = _Pfunc::Pfunc()); - -template -bool equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, BinaryPredicate binary_pred, - const unsigned P = _Pfunc::Pfunc()); - -template -bool equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, const unsigned P = _Pfunc::Pfunc()); - -template -::std::pair -equal_range(ForwardIterator first, ForwardIterator last, const T& value, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -::std::pair -equal_range(ForwardIterator first, ForwardIterator last, const T& value, - const unsigned P = _Pfunc::Pfunc()); - -template -void fill(ForwardIterator first, ForwardIterator last, const T& value, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator fill_n(OutputIterator first, Size n, const T& value, - const unsigned P = _Pfunc::Pfunc()); - -/* - * find suffers from a loss of efficiency when executed in parallel! - */ -template -InputIterator find(InputIterator first, InputIterator last, - const EqualityComparable& value, - const unsigned P = _Pfunc::Pfunc()); - -/* - * find_if suffers from a loss of efficiency when executed in parallel! 
- */ -template -InputIterator find_if(InputIterator first, InputIterator last, - Predicate pred, const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator1 -find_end(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, ForwardIterator2 last2, - BinaryPredicate comp, const unsigned P = _Pfunc::Pfunc() ); - -template -ForwardIterator1 find_end(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, ForwardIterator2 last2, - const unsigned P = _Pfunc::Pfunc() ); - -/* - * find_first_of suffers from a loss of efficiency when executed in parallel! - */ -template -InputIterator find_first_of(InputIterator first1, InputIterator last1, - ForwardIterator first2, ForwardIterator last2, - BinaryPredicate comp, - const unsigned P = _Pfunc::Pfunc()); - -template -InputIterator find_first_of(InputIterator first1, InputIterator last1, - ForwardIterator first2, ForwardIterator last2, - const unsigned P = _Pfunc::Pfunc()); - -// Calls to UnaryFunction f must be completely independent -template -UnaryFunction for_each(InputIterator first, InputIterator last, UnaryFunction f, - const unsigned P = _Pfunc::Pfunc()); - -// Not parallellized, Generator is explicitly allowed and expected to return -// different results on subsequent calls. Order is therefor respected unless -// the programmer explicitly accepts responsibility and calls par_generate. -template -void generate(ForwardIterator first, ForwardIterator last, Generator gen); - -template -void par_generate(ForwardIterator first, ForwardIterator last, Generator gen, - const unsigned P = _Pfunc::Pfunc()); - -template -void push_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -void push_heap(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -inline void pop_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -inline void pop_heap(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -void make_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -void make_heap(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -void sort_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -void sort_heap(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -// Warning: includes loses some efficiency in parallel. 
-template -bool includes(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - StrictWeakOrdering comp, const unsigned P = _Pfunc::Pfunc()); - -template -bool includes(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - const unsigned P = _Pfunc::Pfunc()); - -template -bool lexicographical_compare(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - BinaryPredicate comp, - const unsigned P = _Pfunc::Pfunc()); - -template -bool lexicographical_compare(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator lower_bound(ForwardIterator first, ForwardIterator last, - const T& value, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator -lower_bound(ForwardIterator first, ForwardIterator last, - const T& value, const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator merge(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator merge(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator min_element(ForwardIterator first, ForwardIterator last, - BinaryPredicate comp, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator min_element(ForwardIterator first, ForwardIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator max_element(ForwardIterator first, ForwardIterator last, - BinaryPredicate comp, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator max_element(ForwardIterator first, ForwardIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -::std::pair -mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, - BinaryPredicate binary_pred, const unsigned P = _Pfunc::Pfunc()); - -template -::std::pair -mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, - const unsigned P = _Pfunc::Pfunc()); - -template -void nth_element(RandomAccessIterator first, RandomAccessIterator nth, - RandomAccessIterator last, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -void nth_element(RandomAccessIterator first, RandomAccessIterator nth, - RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -void partial_sort(RandomAccessIterator first, - RandomAccessIterator middle, - RandomAccessIterator last, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -void partial_sort(RandomAccessIterator first, - RandomAccessIterator middle, - RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -RandomAccessIterator -partial_sort_copy(InputIterator first, InputIterator last, - RandomAccessIterator result_first, - RandomAccessIterator result_last, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -RandomAccessIterator -partial_sort_copy(InputIterator first, InputIterator last, - RandomAccessIterator result_first, - RandomAccessIterator result_last, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator partition(ForwardIterator first, ForwardIterator last, - Predicate pred, - const unsigned P = _Pfunc::Pfunc()); - -template -bool 
next_permutation(BidirectionalIterator first, BidirectionalIterator last, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -bool next_permutation(BidirectionalIterator first, BidirectionalIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -bool prev_permutation(BidirectionalIterator first, BidirectionalIterator last, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -bool prev_permutation(BidirectionalIterator first, BidirectionalIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator -stable_partition(ForwardIterator first, ForwardIterator last, - Predicate pred, const unsigned P = _Pfunc::Pfunc()); - -template -void random_shuffle(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -void random_shuffle(RandomAccessIterator first, RandomAccessIterator last, - RandomNumberGenerator &rgen, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator remove(ForwardIterator first, ForwardIterator last, - const T& value, const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator remove_if(ForwardIterator first, ForwardIterator last, - Predicate pred, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator remove_copy(InputIterator first, InputIterator last, - OutputIterator result, const T& value, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator remove_copy_if(InputIterator first, InputIterator last, - OutputIterator result, Predicate pred, - const unsigned P = _Pfunc::Pfunc()); - -template -void replace(ForwardIterator first, ForwardIterator last, const T& old_value, - const T& new_value, const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator replace_copy(InputIterator first, InputIterator last, - OutputIterator result, const T& old_value, - const T& new_value, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator replace_copy_if(InputIterator first, InputIterator last, - OutputIterator result, Predicate pred, - const T& new_value, - const unsigned P = _Pfunc::Pfunc()); - -template -void replace_if(ForwardIterator first, ForwardIterator last, Predicate pred, - const T& new_value, - const unsigned P = _Pfunc::Pfunc()); - -template -void reverse(BidirectionalIterator first, BidirectionalIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator reverse_copy(BidirectionalIterator first, - BidirectionalIterator last, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator rotate( ForwardIterator first, ForwardIterator middle, - ForwardIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator rotate_copy(ForwardIterator first, ForwardIterator middle, - ForwardIterator last, OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -// search suffers from a loss of efficiency when executed in parallel! 
-template -ForwardIterator1 search(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, ForwardIterator2 last2, - BinaryPredicate binary_pred, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator1 search(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, ForwardIterator2 last2, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator search_n(ForwardIterator first, ForwardIterator last, - Integer count, const T& value, - BinaryPredicate binary_pred, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator search_n(ForwardIterator first, ForwardIterator last, - Integer count, const T& value, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator set_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator set_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator set_intersection(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator set_intersection(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - - -template -OutputIterator -set_symmetric_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator -set_symmetric_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator set_union(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator set_union(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - - -template -void stable_sort(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -void stable_sort(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned P = _Pfunc::Pfunc()); - -template -void sort(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -void sort(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator2 swap_ranges(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator transform(InputIterator first, InputIterator last, - OutputIterator result, UnaryFunction op, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator transform(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputIterator result, - BinaryFunction binary_op, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator unique(ForwardIterator first, 
ForwardIterator last, - BinaryPredicate binary_pred, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator unique(ForwardIterator first, ForwardIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator unique_copy(InputIterator first, InputIterator last, - OutputIterator result, BinaryPredicate binary_pred, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator unique_copy(InputIterator first, InputIterator last, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator upper_bound(ForwardIterator first, ForwardIterator last, - const T& value, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator upper_bound(ForwardIterator first, ForwardIterator last, - const T& value, - const unsigned P = _Pfunc::Pfunc()); - -} // namespace omptl - -#ifdef _OPENMP - #include -#else - #include -#endif - -#endif /* OMPTL_ALGORITHM */ diff --git a/lib/omptl/numeric b/lib/omptl/numeric deleted file mode 100644 index c64a9b0d7..000000000 --- a/lib/omptl/numeric +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (C) 2006 Fokko Beekhof -// Email contact: Fokko.Beekhof@unige.ch - -// The OMPTL library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. - -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. - -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -#ifndef OMPTL_NUMERIC -#define OMPTL_NUMERIC 1 - -#include -#include - -namespace omptl -{ - -template -T accumulate(InputIterator first, InputIterator last, T init, - const unsigned P = _Pfunc::Pfunc()); - -template -T accumulate(InputIterator first, InputIterator last, T init, - BinaryFunction binary_op, - const unsigned P = _Pfunc::Pfunc()); - -/* - * Not (yet) paralellized due to data dependance. - */ -template -OutputIterator -adjacent_difference(InputIterator first, InputIterator last, - OutputIterator result, BinaryFunction binary_op, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator adjacent_difference(InputIterator first, InputIterator last, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -template -T inner_product(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, T init, - BinaryFunction1 binary_op1, BinaryFunction2 binary_op2, - const unsigned P = _Pfunc::Pfunc()); - -template -T inner_product(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, T init, - const unsigned P = _Pfunc::Pfunc()); - -// Not paralellized due to dependencies and complications with OutputIterators. 
-template <class InputIterator, class OutputIterator, class BinaryOperation>
-OutputIterator partial_sum(InputIterator first, InputIterator last,
-                           OutputIterator result, BinaryOperation binary_op,
-                           const unsigned P = _Pfunc::Pfunc());
-
-template <class InputIterator, class OutputIterator>
-OutputIterator partial_sum(InputIterator first, InputIterator last,
-                           OutputIterator result, const unsigned P = _Pfunc::Pfunc());
-
-} // namespace omptl
-
-#ifdef _OPENMP
-  #include
-#else
-  #include
-#endif
-
-#include
-
-#endif /* OMPTL_NUMERIC */
diff --git a/lib/omptl/omptl b/lib/omptl/omptl
deleted file mode 100644
index b1dddf859..000000000
--- a/lib/omptl/omptl
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (C) 2006 Fokko Beekhof
-// Email contact: Fokko.Beekhof@unige.ch
-
-// The OMPTL library is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 2.1 of the License, or (at your option) any later version.
-
-// This library is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-// Lesser General Public License for more details.
-
-// You should have received a copy of the GNU Lesser General Public
-// License along with this library; if not, write to the Free Software
-// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-
-#ifndef OMPTL
-#define OMPTL 1
-
-#ifndef _OPENMP
-  #define par_generate generate
-#else
-/*
-#ifdef OMPTL_NO_DEBUG
-  #define OMPTL_ASSERT(X)
-#else
-*/  #include <cassert>
-/*  #define OMPTL_ASSERT(X) assert(X)
-#endif
-*/
-// For debugging
-#ifndef _OMPTL_DEBUG_NO_OMP
-  #include <omp.h>
-#else
-  #define omp_get_max_threads() (2)
-#endif
-
-#endif /* ifndef _OPENMP */
-
-struct _Pfunc
-{
-  static unsigned Pfunc()
-  {
-    #ifdef _OPENMP
-    assert(omp_get_max_threads() > 0);
-    return omp_get_max_threads();
-    #else
-    return 0;
-    #endif
-  }
-};
-
-#endif /* OMPTL */
diff --git a/lib/omptl/omptl_algorithm b/lib/omptl/omptl_algorithm
deleted file mode 100644
index 7058d34ee..000000000
--- a/lib/omptl/omptl_algorithm
+++ /dev/null
@@ -1,567 +0,0 @@
-// Copyright (C) 2006 Fokko Beekhof
-// Email contact: Fokko.Beekhof@unige.ch
-
-// The OMPTL library is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 2.1 of the License, or (at your option) any later version.
-
-// This library is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-// Lesser General Public License for more details.
- -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - - -#ifndef OMPTL_ALGORITHM -#define OMPTL_ALGORITHM 1 - -#include -#include - -namespace omptl -{ - -template -ForwardIterator adjacent_find(ForwardIterator first, ForwardIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator adjacent_find(ForwardIterator first, ForwardIterator last, - BinaryPredicate binary_pred, - const unsigned P = _Pfunc::Pfunc()); - -template -bool binary_search(ForwardIterator first, ForwardIterator last, const T& value, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -bool binary_search(ForwardIterator first, ForwardIterator last, const T& value, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator copy(InputIterator first, InputIterator last, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -template -BidirectionalIterator2 copy_backward(BidirectionalIterator1 first, - BidirectionalIterator1 last, - BidirectionalIterator2 result, - const unsigned P = _Pfunc::Pfunc()); - -/* - * Note: implementation assumes that - * ::std::iterator_traits::difference_type(0) will - * return a difference_type representing zero. - */ -template -typename ::std::iterator_traits::difference_type -count(InputIterator first, InputIterator last, const EqualityComparable& value, - const unsigned P = _Pfunc::Pfunc()); - -template -void count(InputIterator first, InputIterator last, - const EqualityComparable& value, - Size& n, const unsigned P = _Pfunc::Pfunc()); - -template -typename InputIterator::difference_type -count_if(InputIterator first, InputIterator last, Predicate pred, - const unsigned P = _Pfunc::Pfunc()); - -template -void count_if(InputIterator first, InputIterator last, - Predicate pred, Size& n, const unsigned P = _Pfunc::Pfunc()); - -template -bool equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, BinaryPredicate binary_pred, - const unsigned P = _Pfunc::Pfunc()); - -template -bool equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, const unsigned P = _Pfunc::Pfunc()); - -template -::std::pair -equal_range(ForwardIterator first, ForwardIterator last, const T& value, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -::std::pair -equal_range(ForwardIterator first, ForwardIterator last, const T& value, - const unsigned P = _Pfunc::Pfunc()); - -template -void fill(ForwardIterator first, ForwardIterator last, const T& value, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator fill_n(OutputIterator first, Size n, const T& value, - const unsigned P = _Pfunc::Pfunc()); - -/* - * find suffers from a loss of efficiency when executed in parallel! - */ -template -InputIterator find(InputIterator first, InputIterator last, - const EqualityComparable& value, - const unsigned P = _Pfunc::Pfunc()); - -/* - * find_if suffers from a loss of efficiency when executed in parallel! 
- */ -template -InputIterator find_if(InputIterator first, InputIterator last, - Predicate pred, const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator1 -find_end(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, ForwardIterator2 last2, - BinaryPredicate comp, const unsigned P = _Pfunc::Pfunc() ); - -template -ForwardIterator1 find_end(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, ForwardIterator2 last2, - const unsigned P = _Pfunc::Pfunc() ); - -/* - * find_first_of suffers from a loss of efficiency when executed in parallel! - */ -template -InputIterator find_first_of(InputIterator first1, InputIterator last1, - ForwardIterator first2, ForwardIterator last2, - BinaryPredicate comp, - const unsigned P = _Pfunc::Pfunc()); - -template -InputIterator find_first_of(InputIterator first1, InputIterator last1, - ForwardIterator first2, ForwardIterator last2, - const unsigned P = _Pfunc::Pfunc()); - -// Calls to UnaryFunction f must be completely independent -template -UnaryFunction for_each(InputIterator first, InputIterator last, UnaryFunction f, - const unsigned P = _Pfunc::Pfunc()); - -// Not parallellized, Generator is explicitly allowed and expected to return -// different results on subsequent calls. Order is therefor respected unless -// the programmer explicitly accepts responsibility and calls par_generate. -template -void generate(ForwardIterator first, ForwardIterator last, Generator gen); - -template -void par_generate(ForwardIterator first, ForwardIterator last, Generator gen, - const unsigned P = _Pfunc::Pfunc()); - -template -void push_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -void push_heap(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -inline void pop_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -inline void pop_heap(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -void make_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -void make_heap(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -void sort_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -void sort_heap(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -// Warning: includes loses some efficiency in parallel. 
-template -bool includes(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - StrictWeakOrdering comp, const unsigned P = _Pfunc::Pfunc()); - -template -bool includes(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - const unsigned P = _Pfunc::Pfunc()); - -template -bool lexicographical_compare(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - BinaryPredicate comp, - const unsigned P = _Pfunc::Pfunc()); - -template -bool lexicographical_compare(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator lower_bound(ForwardIterator first, ForwardIterator last, - const T& value, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator -lower_bound(ForwardIterator first, ForwardIterator last, - const T& value, const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator merge(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator merge(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator min_element(ForwardIterator first, ForwardIterator last, - BinaryPredicate comp, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator min_element(ForwardIterator first, ForwardIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator max_element(ForwardIterator first, ForwardIterator last, - BinaryPredicate comp, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator max_element(ForwardIterator first, ForwardIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -::std::pair -mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, - BinaryPredicate binary_pred, const unsigned P = _Pfunc::Pfunc()); - -template -::std::pair -mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, - const unsigned P = _Pfunc::Pfunc()); - -template -void nth_element(RandomAccessIterator first, RandomAccessIterator nth, - RandomAccessIterator last, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -void nth_element(RandomAccessIterator first, RandomAccessIterator nth, - RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -void partial_sort(RandomAccessIterator first, - RandomAccessIterator middle, - RandomAccessIterator last, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -void partial_sort(RandomAccessIterator first, - RandomAccessIterator middle, - RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -RandomAccessIterator -partial_sort_copy(InputIterator first, InputIterator last, - RandomAccessIterator result_first, - RandomAccessIterator result_last, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -RandomAccessIterator -partial_sort_copy(InputIterator first, InputIterator last, - RandomAccessIterator result_first, - RandomAccessIterator result_last, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator partition(ForwardIterator first, ForwardIterator last, - Predicate pred, - const unsigned P = _Pfunc::Pfunc()); - -template -bool 
next_permutation(BidirectionalIterator first, BidirectionalIterator last, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -bool next_permutation(BidirectionalIterator first, BidirectionalIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -bool prev_permutation(BidirectionalIterator first, BidirectionalIterator last, - StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -bool prev_permutation(BidirectionalIterator first, BidirectionalIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator -stable_partition(ForwardIterator first, ForwardIterator last, - Predicate pred, const unsigned P = _Pfunc::Pfunc()); - -template -void random_shuffle(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -void random_shuffle(RandomAccessIterator first, RandomAccessIterator last, - RandomNumberGenerator &rgen, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator remove(ForwardIterator first, ForwardIterator last, - const T& value, const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator remove_if(ForwardIterator first, ForwardIterator last, - Predicate pred, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator remove_copy(InputIterator first, InputIterator last, - OutputIterator result, const T& value, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator remove_copy_if(InputIterator first, InputIterator last, - OutputIterator result, Predicate pred, - const unsigned P = _Pfunc::Pfunc()); - -template -void replace(ForwardIterator first, ForwardIterator last, const T& old_value, - const T& new_value, const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator replace_copy(InputIterator first, InputIterator last, - OutputIterator result, const T& old_value, - const T& new_value, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator replace_copy_if(InputIterator first, InputIterator last, - OutputIterator result, Predicate pred, - const T& new_value, - const unsigned P = _Pfunc::Pfunc()); - -template -void replace_if(ForwardIterator first, ForwardIterator last, Predicate pred, - const T& new_value, - const unsigned P = _Pfunc::Pfunc()); - -template -void reverse(BidirectionalIterator first, BidirectionalIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator reverse_copy(BidirectionalIterator first, - BidirectionalIterator last, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator rotate( ForwardIterator first, ForwardIterator middle, - ForwardIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator rotate_copy(ForwardIterator first, ForwardIterator middle, - ForwardIterator last, OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -// search suffers from a loss of efficiency when executed in parallel! 
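The efficiency loss comes from giving up the serial early exit: a partitioned search must extend each chunk by (pattern length - 1) elements so matches straddling a chunk boundary are not missed, and every chunk is scanned even when an earlier one already holds a match. The following is only a minimal sketch of that trade-off, assuming random-access iterators and OpenMP; it is not the deleted omptl implementation (which is not shown in this hunk), and parallel_search is a hypothetical name.

#include <algorithm>
#include <iterator>
#include <omp.h>

template <class Iter>
Iter parallel_search(Iter first, Iter last, Iter pat_first, Iter pat_last)
{
    const long n = std::distance(first, last);
    const long m = std::distance(pat_first, pat_last);
    if (m == 0 || n < m)
        return m == 0 ? first : last;

    const int P = std::max(1, omp_get_max_threads());
    const long chunk = (n + P - 1) / P;
    Iter best = last;

    // Each chunk is extended by m - 1 elements so matches straddling a
    // boundary are not missed; every chunk is scanned even if an earlier
    // one already matched, which is where the efficiency is lost.
    #pragma omp parallel for
    for (int t = 0; t < P; ++t)
    {
        const long begin = t * chunk;
        if (begin >= n) continue;
        const long end = std::min(n, begin + chunk + m - 1);
        Iter hit = std::search(first + begin, first + end, pat_first, pat_last);
        if (hit != first + end)
        {
            #pragma omp critical
            if (best == last || hit < best)
                best = hit;
        }
    }
    return best;
}

The leftmost hit wins, so the result agrees with std::search; the cost is that late chunks do work the serial early-exit version would have skipped.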
-template -ForwardIterator1 search(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, ForwardIterator2 last2, - BinaryPredicate binary_pred, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator1 search(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, ForwardIterator2 last2, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator search_n(ForwardIterator first, ForwardIterator last, - Integer count, const T& value, - BinaryPredicate binary_pred, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator search_n(ForwardIterator first, ForwardIterator last, - Integer count, const T& value, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator set_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator set_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator set_intersection(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator set_intersection(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - - -template -OutputIterator -set_symmetric_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator -set_symmetric_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator set_union(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator set_union(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - - -template -void stable_sort(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -void stable_sort(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned P = _Pfunc::Pfunc()); - -template -void sort(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -void sort(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator2 swap_ranges(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator transform(InputIterator first, InputIterator last, - OutputIterator result, UnaryFunction op, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator transform(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputIterator result, - BinaryFunction binary_op, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator unique(ForwardIterator first, 
ForwardIterator last, - BinaryPredicate binary_pred, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator unique(ForwardIterator first, ForwardIterator last, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator unique_copy(InputIterator first, InputIterator last, - OutputIterator result, BinaryPredicate binary_pred, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator unique_copy(InputIterator first, InputIterator last, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator upper_bound(ForwardIterator first, ForwardIterator last, - const T& value, StrictWeakOrdering comp, - const unsigned P = _Pfunc::Pfunc()); - -template -ForwardIterator upper_bound(ForwardIterator first, ForwardIterator last, - const T& value, - const unsigned P = _Pfunc::Pfunc()); - -} // namespace omptl - -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-parameter" -#endif -#ifdef _OPENMP - #include -#else - #include -#endif -#if defined(__clang__) -#pragma clang diagnostic pop -#endif -#endif /* OMPTL_ALGORITHM */ diff --git a/lib/omptl/omptl_algorithm_par.h b/lib/omptl/omptl_algorithm_par.h deleted file mode 100644 index e57c3ef23..000000000 --- a/lib/omptl/omptl_algorithm_par.h +++ /dev/null @@ -1,2247 +0,0 @@ -// Copyright (C) 2006-2011 Fokko Beekhof -// Email contact: Fokko.Beekhof@unige.ch - -// The OMPTL library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. - -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. - -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - - -#include -#include -#include -#include - -#include -#include - -#include - -namespace omptl -{ - -/* - * Not (yet) paralellized due to data dependance. - */ -template -ForwardIterator adjacent_find(ForwardIterator first, ForwardIterator last, - const unsigned P) -{ - return std::adjacent_find(first, last); -} - -/* - * Not (yet) paralellized due to data dependance. 
- */ -template -ForwardIterator adjacent_find(ForwardIterator first, ForwardIterator last, - BinaryPredicate binary_pred, const unsigned P) -{ - return std::adjacent_find(first, last, binary_pred); -} - -template -bool binary_search(ForwardIterator first, ForwardIterator last, const T& value, - StrictWeakOrdering comp, const unsigned P) -{ - if (detail::_linear_serial_is_faster(first, last, P)) - return std::binary_search(first, last, value, comp); - - std::vector< std::pair > partitions(P); - ::omptl::detail::_partition_range(first, last, partitions, P); - - bool result = 0; - #pragma omp parallel for reduction(|:result) - for (int t = 0; t < int(P); ++t) - result |= std::binary_search(partitions[t].first, - partitions[t].second, value, comp); - - return result; -} - -template -bool binary_search(ForwardIterator first, ForwardIterator last, const T& value, - const unsigned P) -{ - typedef typename std::iterator_traits::value_type VT; - return ::omptl::binary_search(first, last, value, std::less()); -} - -namespace detail -{ - - -template -struct Copy_ -{ - template - static IteratorOut _copy(IteratorIn first, IteratorIn last, - IteratorOut result, const unsigned P) - { - if (detail::_linear_serial_is_faster(first, last, P)) - return std::copy(first, last, result); - - std::vector< std::pair > source_partitions(P); - ::omptl::detail::_partition_range(first, last, source_partitions, P); - - std::vector dest_partitions(P); - ::omptl::detail::_copy_partitions(source_partitions, result, dest_partitions, P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - { - IteratorOut tmp; - *( (t == int(P-1)) ? &result : &tmp ) - = std::copy( source_partitions[t].first, - source_partitions[t].second, - dest_partitions[t]); - } - - return result; - } -}; - -template -struct Copy_< std::input_iterator_tag, IteratorOutTag > -{ - template - static OutputIterator _copy(InputIterator first, InputIterator last, - OutputIterator result, const unsigned P) - { - return std::copy(first, last, result); - } -}; - -template -struct Copy_ -{ - template - static OutputIterator _copy(InputIterator first, InputIterator last, - OutputIterator result, const unsigned P) - { - return std::copy(first, last, result); - } -}; - -} // end namespace detail - -template -OutputIterator copy(InputIterator first, InputIterator last, - OutputIterator result, const unsigned P) -{ - return detail::Copy_< - typename std::iterator_traits::iterator_category, - typename std::iterator_traits::iterator_category> - ::_copy(first, last, result, P); -} - -template -BidirectionalIterator2 copy_backward(BidirectionalIterator1 first, - BidirectionalIterator1 last, - BidirectionalIterator2 result, - const unsigned P) -{ - if (detail::_linear_serial_is_faster(first, last, P)) - return std::copy_backward(first, last, result); - - std::vector< std::pair > source_partitions(P); - ::omptl::detail::_partition_range(first, last, source_partitions, P); - - std::vector dest_partitions(P); - ::omptl::detail::_copy_partitions(source_partitions, result, dest_partitions, P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - { - BidirectionalIterator2 tmp; - *( (t == int(P-1)) ? 
&result : &tmp ) = - std::copy_backward( source_partitions[t].first, - source_partitions[t].second, - dest_partitions[t] ); - } - - return result; -} - -namespace detail -{ - -template -struct Count_ -{ - template - static typename std::iterator_traits::difference_type - count(Iterator first, Iterator last, const EqualityComparable& value, - const unsigned P) - { - if (detail::_linear_serial_is_faster(first, last, P)) - return std::count(first, last, value); - - std::vector< std::pair > partitions(P); - ::omptl::detail::_partition_range(first, last, partitions, P); - - typename std::iterator_traits::difference_type result = 0; - #pragma omp parallel for reduction(+:result) - for (int t = 0; t < int(P); ++t) - result += std::count( partitions[t].first, partitions[t].second, value ); - - return result; - } -}; - -template <> -struct Count_< std::input_iterator_tag > -{ - template - static typename std::iterator_traits::difference_type - count(Iterator first, Iterator last, const EqualityComparable& value, - const unsigned P) - { - return std::count(first, last, value); - } -}; - -} // end namespace detail - -template -typename std::iterator_traits::difference_type -count(InputIterator first, InputIterator last, - const EqualityComparable& value, const unsigned P) -{ - return detail::Count_::iterator_category>:: - count(first, last, value, P); -} - -template -void count(InputIterator first, InputIterator last, - const EqualityComparable& value, Size& n, const unsigned P) -{ - n = ::omptl::count(first, last, value, P); -} - -namespace detail -{ - -template -struct Count_if_ -{ - template - static typename std::iterator_traits::difference_type - count_if(Iterator first, Iterator last, Predicate pred, - const unsigned P) - { - if (detail::_linear_serial_is_faster(first, last, P)) - return std::count_if(first, last, pred); - - std::vector< std::pair > partitions(P); - detail::_partition_range(first, last, partitions, P); - - typename std::iterator_traits::difference_type - result = 0; - - #pragma omp parallel for reduction(+:result) - for (int t = 0; t < int(P); ++t) - result += std::count_if(partitions[t].first, - partitions[t].second, pred); - - return result; - } -}; - -template <> -struct Count_if_< std::input_iterator_tag > -{ - template - typename std::iterator_traits::difference_type - static count_if(InputIterator first, InputIterator last, - Predicate pred, const unsigned P) - { - return std::count_if(first, last, pred); - } -}; - - -} // end namespace detail - -template -typename std::iterator_traits::difference_type -count_if(InputIterator first, InputIterator last, - Predicate pred, const unsigned P) -{ - return detail::Count_if_::iterator_category>:: - count_if(first, last, pred, P); -} - -template -void count_if(InputIterator first, InputIterator last, - Predicate pred, Size& n, const unsigned P) -{ - n = ::omptl::count_if(first, last, pred, P); -} - -namespace detail -{ - -template -struct Equal_ -{ - template - static bool _equal(Iterator1 first1, Iterator1 last1, - Iterator2 first2, BinaryPredicate binary_pred, - const unsigned P) - { - if (detail::_linear_serial_is_faster(first1, last1, P)) - return std::equal(first1, last1, first2, binary_pred); - - std::vector< std::pair > source_partitions(P); - ::omptl::detail::_partition_range(first1, last1, source_partitions, P); - - std::vector dest_partitions(P); - ::omptl::detail::_copy_partitions(source_partitions, first2, dest_partitions, P); - - bool result = true; - #pragma omp parallel for reduction(&:result) - for (int t = 0; t < 
int(P); ++t) - result &= std::equal( source_partitions[t].first, - source_partitions[t].second, - dest_partitions[t], binary_pred); - - return result; - } -}; - -template -struct Equal_ -{ - template - static bool _equal(InputIterator1 first1, InputIterator1 last1, - Iterator2 first2, BinaryPredicate binary_pred, - const unsigned P) - { - return std::equal(first1, last1, first2, binary_pred); - } -}; - -template -struct Equal_ -{ - template - static bool _equal(Iterator1 first1, Iterator1 last1, - InputIterator2 first2, BinaryPredicate binary_pred, - const unsigned P) - { - return std::equal(first1, last1, first2, binary_pred); - } -}; - -template<> -struct Equal_ -{ - template - static bool _equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, BinaryPredicate binary_pred, - const unsigned P) - { - return std::equal(first1, last1, first2, binary_pred); - } -}; - -} // end namespace detail - -template -bool equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, BinaryPredicate binary_pred, const unsigned P) -{ -// return std::equal(first1, last1, first2, binary_pred); - - return ::omptl::detail::Equal_< - typename std::iterator_traits::iterator_category, - typename std::iterator_traits::iterator_category> - ::_equal(first1, last1, first2, binary_pred, P); -} - -template -bool equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, const unsigned P) -{ - typedef typename std::iterator_traits::value_type VT; - - return ::omptl::equal(first1, last1, first2, std::equal_to()); -} - -//TODO -template -std::pair -equal_range(ForwardIterator first, ForwardIterator last, const T& value, - StrictWeakOrdering comp, const unsigned P) -{ - return std::equal_range(first, last, value, comp); -} - -template -std::pair -equal_range(ForwardIterator first, ForwardIterator last, const T& value, - const unsigned P) -{ - typedef typename std::iterator_traits::value_type VT; - - return ::omptl::equal_range(first, last, value, std::less(), P); -} - -template -void fill(ForwardIterator first, ForwardIterator last, - const T& value, const unsigned P) -{ - assert(P > 0u); - if (detail::_linear_serial_is_faster(first, last, P)) - { - std::fill(first, last, value); - return; - } - assert(std::distance(first, last) >= 0); - assert(2*(int)P <= std::distance(first, last)); - - std::vector< std::pair > partitions(P); - ::omptl::detail::_partition_range(first, last, partitions, P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - std::fill(partitions[t].first, partitions[t].second, value); -} - -namespace detail -{ - -template -struct Fill_n_ -{ - template - static Iterator fill_n(Iterator first, Size n, const T& value, - const unsigned P) - { - assert(P > 0u); - Iterator last = first; - std::advance(last, n); - if (detail::_linear_serial_is_faster(first, last, P)) - return std::fill_n(first, n, value); - - const Size range = (n / P) + ( (n % P) ? 1 : 0 ); - std::vector ranges(P); - std::fill_n(ranges.begin(), P - 1, range); - ranges[P - 1] = n - (P - 1) * range; - - std::vector partitions(P); - partitions[0] = first; - for (unsigned i = 1; i < P; ++i) - { - partitions[i] = partitions[i - 1]; - std::advance(partitions[i], range); - } - - Iterator result; - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - { - Iterator tmp; - *( (t == int(P-1)) ? 
&result : &tmp ) - = std::fill_n(partitions[t], ranges[t], value); - } - - return result; - } -}; - -template <> -struct Fill_n_< std::output_iterator_tag > -{ - template - static OutputIterator fill_n(OutputIterator first, Size n, - const T& value, const unsigned P) - { - return std::fill_n(first, n, value); - } -}; - -} // end namespace detail - -template -OutputIterator fill_n(OutputIterator first, Size n, - const T& value, const unsigned P) -{ - return ::omptl::detail::Fill_n_::iterator_category>:: - fill_n(first, n, value, P); -} - -namespace detail { -template -struct Find_ -{ - template - static Iterator find(Iterator first, Iterator last, - const EqualityComparable& value, - const unsigned P) - { - if (detail::_linear_serial_is_faster(first, last, P)) - return std::find(first, last, value); - - std::vector< std::pair > partitions(P); - ::omptl::detail::_partition_range(first, last, partitions, P); - - std::vector results(P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - { - results[t] = std::find(partitions[t].first, partitions[t].second, value); - if (results[t] == partitions[t].second) - results[t] = last; - } - - typename std::vector::iterator result = - std::find_if(results.begin(),results.end(), - std::bind2nd(std::not_equal_to(), last) ); - - if ( result != results.end() ) - return *result; - - return last; - } -}; - -template <> -struct Find_< std::input_iterator_tag > -{ - template - static InputIterator find(InputIterator first, InputIterator last, - const EqualityComparable& value, const unsigned P) - { - return std::find(first, last, value); - } -}; - -} // end namespace detail - -template -InputIterator find(InputIterator first, InputIterator last, - const EqualityComparable& value, const unsigned P) -{ - return ::omptl::detail::Find_< typename std::iterator_traits::iterator_category >:: - find(first, last, value, P); -} - -namespace detail -{ - -template -struct Find_if_ -{ - template - static Iterator find_if(Iterator first, Iterator last, Predicate pred, - const unsigned P, IteratorTag) - { - if (detail::_linear_serial_is_faster(first, last, P)) - return std::find_if(first, last, pred); - - std::vector< std::pair > partitions(P); - ::omptl::detail::_partition_range(first, last, partitions, P); - - std::vector results(P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - { - results[t] = std::find_if(partitions[t].first, partitions[t].second, pred); - - if (results[t] == partitions[t].second) - results[t] = last; - } - - const typename std::vector::iterator result - = std::find_if(results.begin(), results.end(), - std::bind2nd(std::not_equal_to(), last) ); - - if ( result != results.end() ) - return *result; - - return last; - } -}; - -template <> -struct Find_if_< std::input_iterator_tag > -{ - template - static InputIterator _find_if(InputIterator first, InputIterator last, - Predicate pred, const unsigned P) - { - return std::find_if(first, last, pred); - } -}; - -} // end namespace detail - -template -InputIterator find_if(InputIterator first, InputIterator last, - Predicate pred, const unsigned P) -{ - return ::omptl::detail::Find_if_::iterator_category>:: - find_if(first, last, pred, P); -} - -// TODO -template -ForwardIterator1 find_end(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, ForwardIterator2 last2, - BinaryPredicate comp, const unsigned P) -{ - return std::find_end(first1, last1, first2, last2, comp); -} - -template -ForwardIterator1 find_end(ForwardIterator1 first1, ForwardIterator1 
last1, - ForwardIterator2 first2, ForwardIterator2 last2, - const unsigned P) -{ -// typedef typename std::iterator_traits::value_type VT; -// return ::omptl::find_end(first1, last1, first2, last2, std::less()); - return std::find_end(first1, last1, first2, last2); -} - -namespace detail -{ - -// find_first_of suffers from a loss of efficiency, and potentially a loss of -// performance when executed in parallel! -template -struct Find_first_of_ -{ - template - static Iterator - find_first_of(Iterator first1, Iterator last1, - ForwardIterator first2, ForwardIterator last2, - BinaryPredicate comp, const unsigned P) - { - if (detail::_linear_serial_is_faster(first1, last1, P)) - return std::find_first_of(first1, last1, - first2, last2, comp); - - std::vector< std::pair > partitions(P); - ::omptl::detail::_partition_range(first1, last1, partitions, P); - - std::vector results(P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - { - results[t] = std::find_first_of(partitions[t].first, - partitions[t].second, - first2, last2, comp); - if (results[t] == partitions[t].second) - results[t] = last1; - } - - const typename std::vector::iterator - result = std::find_if(results.begin(), results.end(), - std::bind2nd(std::not_equal_to(), last1)); - - if ( result != results.end() ) - return *result; - - return last1; - } -}; - -template <> -struct Find_first_of_< std::input_iterator_tag > -{ - template - static InputIterator - find_first_of( InputIterator first1, InputIterator last1, - ForwardIterator first2, ForwardIterator last2, - BinaryPredicate comp, const unsigned P) - { - return std::find_first_of(first1, last1, first2, last2, comp); - } -}; - -} // end namespace detail - -template -InputIterator find_first_of(InputIterator first1, InputIterator last1, - ForwardIterator first2, ForwardIterator last2, - BinaryPredicate comp, const unsigned P) -{ - return ::omptl::detail::Find_first_of_::iterator_category>:: - find_first_of(first1, last1, first2, last2, comp, P); -} - -template -InputIterator find_first_of(InputIterator first1, InputIterator last1, - ForwardIterator first2, ForwardIterator last2, - const unsigned P) -{ - typedef typename std::iterator_traits::value_type VT; - return ::omptl::find_first_of(first1, last1, first2, last2, std::equal_to()); -} - -namespace detail -{ - -template -struct For_each_ -{ - template - static UnaryFunction for_each(Iterator first, Iterator last, - UnaryFunction f, const unsigned P) - { - if (detail::_linear_serial_is_faster(first, last, P)) - return std::for_each(first, last, f); - - std::vector< std::pair > partitions(P); - ::omptl::detail::_partition_range(first, last, partitions, P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - std::for_each(partitions[t].first, partitions[t].second, f); - - return f; - } -}; - -template <> -struct For_each_< std::input_iterator_tag > -{ - template - static UnaryFunction for_each(InputIterator first, InputIterator last, UnaryFunction f, const unsigned P) - { - return std::for_each(first, last, f); - } -}; - -} // end namespace detail - -template -UnaryFunction for_each(InputIterator first, InputIterator last, UnaryFunction f, const unsigned P) -{ - return ::omptl::detail::For_each_::iterator_category>:: - for_each(first, last, f, P); -} - -template -void generate(ForwardIterator first, ForwardIterator last, Generator gen) -{ - std::generate(first, last, gen); -} - -template -void par_generate(ForwardIterator first, ForwardIterator last, - Generator gen, const unsigned P) -{ - if 
(detail::_linear_serial_is_faster(first, last, P)) - { - std::generate(first, last, gen); - return; - } - - std::vector< std::pair > partitions(P); - ::omptl::detail::_partition_range(first, last, partitions, P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - std::generate(partitions[t].first, partitions[t].second, gen); -} - -template -void push_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned P) -{ - return std::push_heap(first, last, comp); -} - -template -void push_heap(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P) -{ -// std::less::value_type>(), - return std::push_heap(first, last); -} - -template -inline void pop_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned P) -{ - return std::pop_heap(first, last, comp); -} - -template -inline void pop_heap(RandomAccessIterator first, RandomAccessIterator last, const unsigned P) -{ -// std::less::value_type> - return std::pop_heap(first, last); -} - -template -void make_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned P) -{ - return std::make_heap(first, last, comp); -} - -template -void make_heap(RandomAccessIterator first, RandomAccessIterator last, const unsigned P) -{ -// std::less::value_type>(), - return std::make_heap(first, last); -} - -template -void sort_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned P) -{ - return std::sort_heap(first, last, comp); -} - -template -void sort_heap(RandomAccessIterator first, RandomAccessIterator last, const unsigned P) -{ -// std::less::value_type> - return std::sort_heap(first, last); -} - -namespace detail -{ - -template -struct Includes_ -{ - template - static bool includes(Iterator1 first1, Iterator1 last1, - Iterator2 first2, Iterator2 last2, - StrictWeakOrdering comp, const unsigned P) - { - if (detail::_linear_serial_is_faster(first2, last2, P)) - return std::includes(first1, last1, first2, last2, comp); - - /* - * Includes is parallelized by splitting the second range - * (needles), rather than the first (the haystack). 
- */ - std::vector< std::pair >partitions(P); - ::omptl::detail::_partition_range(first2, last2, partitions, P); - - bool result = true; - - // Hence, all needles should be found in the haystack - #pragma omp parallel for reduction(&:result) - for (int t = 0; t < int(P); ++t) - result &= std::includes(first1, last1, - partitions[t].first, - partitions[t].second, comp); - - return result; - } -}; - -template -struct Includes_< std::input_iterator_tag, Iterator2Tag > -{ - template - static bool includes(InputIterator1 first1, InputIterator1 last1, - Iterator2 first2, Iterator2 last2, - StrictWeakOrdering comp, const unsigned P) - { - return std::includes(first1, last1, first2, last2, comp); - } -}; - -template -struct Includes_ -{ - template - static bool includes(Iterator1 first1, Iterator1 last1, - InputIterator2 first2, InputIterator2 last2, - StrictWeakOrdering comp, const unsigned P) - { - return std::includes(first1, last1, first2, last2, comp); - } -}; - -template <> -struct Includes_< std::input_iterator_tag, std::input_iterator_tag > -{ - template - static bool includes(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - StrictWeakOrdering comp, const unsigned P) - { - return std::includes(first1, last1, first2, last2, comp); - } -}; - -} // end namespace detail - -template -bool includes(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - StrictWeakOrdering comp, const unsigned P) -{ - typedef typename std::iterator_traits::iterator_category IC1; - typedef typename std::iterator_traits::iterator_category IC2; - - return ::omptl::detail::Includes_::includes(first1, last1, first2, last2, comp, P); -} - -template -bool includes(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - const unsigned P) -{ - typedef typename std::iterator_traits::value_type VT; - return ::omptl::includes(first1, last1, first2, last2, std::less()); -} - -template -bool lexicographical_compare(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - BinaryPredicate comp, const unsigned P) -{ - return std::lexicographical_compare(first1, last1, first2, last2, comp); -} - -template -bool lexicographical_compare(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - const unsigned P) -{ -// std::less::value_type> - return std::lexicographical_compare(first1, last1, first2, last2); -} - -template -ForwardIterator lower_bound(ForwardIterator first, ForwardIterator last, - const T& value, StrictWeakOrdering comp, - const unsigned P) -{ - if (detail::_logn_serial_is_faster(first, last, P)) - return std::lower_bound(first, last, value, comp); - - std::vector< std::pair > partitions(P); - ::omptl::detail::_partition_range(first, last, partitions, P); - - std::vector results(P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - results[t] = std::lower_bound(partitions[t].first, partitions[t].second, value, comp); - - const typename std::vector::iterator result = - std::find_if(results.begin(), results.end(), std::bind2nd(std::not_equal_to(), last) ); - - if (result != results.end()) - return *result; - - return last; -} - -template -ForwardIterator lower_bound(ForwardIterator first, ForwardIterator last, - const T& value, const unsigned P) -{ - return ::omptl::lower_bound(first, last, value, std::less(), P); -} - -// Not parallelized, dependencies between data. 
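(merge, directly below, stays serial for the reason just noted.) The overloads that are parallelized in this file — binary_search, count, count_if, equal, includes — all share one shape: cut the range into P pieces, run the serial std:: algorithm on each piece inside an OpenMP parallel for, and fold the per-piece answers with a reduction (| for binary_search, + for count, & for equal and includes). Below is a standalone sketch of that shape for a simple count, written without omptl's internal _partition_range and _linear_serial_is_faster helpers; parallel_count is a hypothetical name.

#include <algorithm>
#include <cstddef>
#include <vector>
#include <omp.h>

long parallel_count(const std::vector<int>& v, int value)
{
    const int P = std::max(1, omp_get_max_threads());
    const std::ptrdiff_t n = static_cast<std::ptrdiff_t>(v.size());
    const std::ptrdiff_t chunk = (n + P - 1) / P;

    long result = 0;
    // Each thread counts its own slice with the serial algorithm; the
    // per-slice results are folded together by the OpenMP reduction.
    #pragma omp parallel for reduction(+:result)
    for (int t = 0; t < P; ++t)
    {
        const std::ptrdiff_t begin = std::min(n, t * chunk);
        const std::ptrdiff_t end   = std::min(n, begin + chunk);
        result += std::count(v.begin() + begin, v.begin() + end, value);
    }
    return result;
}

The same skeleton yields the other variants by swapping std::count for the corresponding serial algorithm and changing the reduction operator.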
-template -OutputIterator merge(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp, const unsigned P) -{ - return std::merge(first1, last1, first2, last2, result, comp); -} - -template -OutputIterator merge(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, const unsigned P) -{ -// std::less::value_type> - return std::merge(first1, last1, first2, last2, result); -} - -template -ForwardIterator min_element(ForwardIterator first, ForwardIterator last, - BinaryPredicate comp, const unsigned P) -{ - if (detail::_linear_serial_is_faster(first, last, P)) - return std::min_element(first, last, comp); - - std::vector< std::pair > partitions(P); - ::omptl::detail::_partition_range(first, last, partitions, P); - - std::vector results(P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - results[t] = std::min_element(partitions[t].first, partitions[t].second, comp); - - ForwardIterator result = results[0]; - for (unsigned i = 1; i < P; ++i) - if ( (result != last) && (results[i] != last) && comp(*results[i], *result) ) - result = results[i]; - - return result; -} - -template -ForwardIterator min_element(ForwardIterator first, ForwardIterator last, - const unsigned P) -{ - typedef typename std::iterator_traits::value_type value_type; - return ::omptl::min_element(first, last, std::less(), P); -} - -template -ForwardIterator max_element(ForwardIterator first, ForwardIterator last, - BinaryPredicate comp, const unsigned P) -{ - if (detail::_linear_serial_is_faster(first, last, P)) - return std::max_element(first, last, comp); - - std::vector< std::pair > partitions(P); - ::omptl::detail::_partition_range(first, last, partitions, P); - - std::vector results(P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - results[t] = std::max_element(partitions[t].first, partitions[t].second, comp); - - ForwardIterator result = results[0]; - for (unsigned i = 1; i < P; ++i) - { - if ( (result != last) && (results[i] != last) && comp(*result, *results[i]) ) - result = results[i]; - } - - return result; -} - -template -ForwardIterator max_element(ForwardIterator first, ForwardIterator last, - const unsigned P) -{ - typedef typename std::iterator_traits::value_type value_type; - return ::omptl::max_element(first, last, std::less(), P); -} - -namespace detail -{ - -template -struct Mismatch_ -{ - template - static std::pair - mismatch(Iterator1 first1, Iterator1 last1, Iterator2 first2, - BinaryPredicate binary_pred, const unsigned P) - { - if (detail::_linear_serial_is_faster(first1, last1, P)) - return std::mismatch(first1, last1, first2, binary_pred); - - std::vector< std::pair > source_partitions(P); - ::omptl::detail::_partition_range(first1, last1, source_partitions, P); - - std::vector dest_partitions(P); - ::omptl::detail::_copy_partitions(source_partitions, first2, dest_partitions, P); - - std::vector< std::pair > results(P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - results[t] = std::mismatch(source_partitions[t].first, - source_partitions[t].second, - dest_partitions[t], binary_pred); - - // This could have been done more elegantly with select1st - for (unsigned i = 0; i < P - 1; ++i) - if (results[i].first != source_partitions[i].second) - return results[i]; - - return results[P - 1]; - } -}; - - -template -struct Mismatch_ -{ - template - static std::pair - mismatch(InputIterator1 first1, 
InputIterator1 last1, - InputIterator2 first2, BinaryPredicate binary_pred, - const unsigned P) - { - return std::mismatch(first1, last1, first2, binary_pred); - } -}; - -template -struct Mismatch_< std::input_iterator_tag, Iterator2Tag > -{ - template - static std::pair - mismatch(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, BinaryPredicate binary_pred, - const unsigned P) - { - return std::mismatch(first1, last1, first2, binary_pred); - } -}; - -template <> -struct Mismatch_< std::input_iterator_tag, std::input_iterator_tag > -{ - template - static std::pair - mismatch(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, BinaryPredicate binary_pred, - const unsigned P) - { - return std::mismatch(first1, last1, first2, binary_pred); - } -}; - -} // end namespace detail - -template -std::pair -mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, - BinaryPredicate binary_pred, const unsigned P) -{ - return ::omptl::detail::Mismatch_< - typename std::iterator_traits::iterator_category, - typename std::iterator_traits::iterator_category>:: - mismatch(first1, last1, first2, binary_pred, P); -} - -template -std::pair -mismatch(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, const unsigned P) -{ - typedef typename std::iterator_traits::value_type VT; - return ::omptl::mismatch(first1, last1, first2,std::equal_to(),P); -} - -// TODO How can this be parallelized ? -template -void nth_element(RandomAccessIterator first, RandomAccessIterator nth, - RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned P) -{ - std::nth_element(first, nth, last, comp); -} - -template -void nth_element(RandomAccessIterator first, RandomAccessIterator nth, - RandomAccessIterator last, const unsigned P) -{ -// typedef typename -// std::iterator_traits::value_type -// std::less - - std::nth_element(first, nth, last); -} - -namespace detail -{ - -template -Iterator _pivot_range(Iterator first, Iterator last, - const typename std::iterator_traits::value_type pivot, - StrictWeakOrdering comp) -{ - while (first < last) - { - if (comp(*first, pivot)) - ++first; - else - { - while ( (first < --last) && !comp(*last, pivot) ) - /* nop */; - std::iter_swap(first, last); - } - } - - return last; -} - -} // end namespace detail - -template -void partial_sort(RandomAccessIterator first, - RandomAccessIterator middle, - RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned P) -{ - const typename std::iterator_traits::difference_type - N = std::distance(first, last); - assert(N >= 0); - - if (2*P < unsigned(N)) - { - ::omptl::detail::_pivot_range(first, last, *middle, comp); - ::omptl::sort(first, middle, comp, P); - } - else - std::partial_sort(first, last, middle, comp); -} - -template -void partial_sort(RandomAccessIterator first, RandomAccessIterator middle, - RandomAccessIterator last, const unsigned P) -{ - - typedef typename std::iterator_traits::value_type VT; - ::omptl::partial_sort(first, middle, last, std::less(), P); -} - -// Not parallelized due to dependencies. -template -RandomAccessIterator -partial_sort_copy(InputIterator first, InputIterator last, - RandomAccessIterator result_first, - RandomAccessIterator result_last, StrictWeakOrdering comp, - const unsigned P) -{ - return std::partial_sort_copy(first, last, result_first, result_last, comp); -} - -// Not parallelized due to dependencies. 
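// _pivot_range above reorders a range so that everything comparing less than
// the pivot ends up in front of the returned iterator; partial_sort and the
// parallel sort below use it to cut the data into independently sortable
// pieces. The same split can be expressed with std::partition. A small
// standalone sketch plus usage (names are illustrative, not OMPTL's):
#include <algorithm>
#include <functional>
#include <vector>

template <class RanIt, class T, class Cmp>
RanIt pivot_split_sketch(RanIt first, RanIt last, const T& pivot, Cmp comp)
{
    // Same contract as _pivot_range: elements x with comp(x, pivot) true come
    // first, and the split point is returned.
    return std::partition(first, last,
                          [&](const T& x) { return comp(x, pivot); });
}

// Usage sketch: split around 5, then only the front part needs sorting to get
// the smallest elements in order - which is how partial_sort above limits the
// range it hands to omptl::sort.
inline void pivot_split_demo()
{
    std::vector<int> v = {9, 1, 7, 3, 5, 2, 8};
    std::vector<int>::iterator mid =
        pivot_split_sketch(v.begin(), v.end(), 5, std::less<int>());
    std::sort(v.begin(), mid);   // v now starts with 1, 2, 3 in order
}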
-template -RandomAccessIterator -partial_sort_copy(InputIterator first, InputIterator last, - RandomAccessIterator result_first, - RandomAccessIterator result_last, const unsigned P) -{ -// std::less::value_type>(), - - return std::partial_sort_copy(first, last, result_first, result_last); -} - -// Not (yet) parallelized, not straightforward due to possible dependencies -// between subtasks. -template -ForwardIterator partition(ForwardIterator first, ForwardIterator last, - Predicate pred, const unsigned P) -{ - return std::partition(first, last, pred); -} - -// Not (yet) parallelized, not straightforward due to possible dependencies -// between subtasks. -template -ForwardIterator stable_partition(ForwardIterator first, ForwardIterator last, - Predicate pred, const unsigned P) -{ - return std::stable_partition(first, last, pred); -} - -template -bool next_permutation(BidirectionalIterator first, BidirectionalIterator last, - StrictWeakOrdering comp, const unsigned P) -{ - return std::next_permutation(first, last, comp); -} - -template -bool next_permutation(BidirectionalIterator first, BidirectionalIterator last, const unsigned P) -{ -// std::less::value_type> - return std::next_permutation(first, last); -} - -template -bool prev_permutation(BidirectionalIterator first, BidirectionalIterator last, - StrictWeakOrdering comp, const unsigned P) -{ - return std::prev_permutation(first, last, comp); -} - -template -bool prev_permutation(BidirectionalIterator first, BidirectionalIterator last, - const unsigned P) -{ -// std::less::value_type>(), - return std::prev_permutation(first, last); -} - - -template -void random_shuffle(RandomAccessIterator first, RandomAccessIterator last, - const unsigned P) -{ - std::random_shuffle(first, last); -} - -template -void random_shuffle(RandomAccessIterator first, RandomAccessIterator last, - RandomNumberGenerator& rgen, const unsigned P) -{ - std::random_shuffle(first, last, rgen); -} - -// Not (yet) parallelized, not straightforward due to possible dependencies -// between subtasks. -template -ForwardIterator remove( ForwardIterator first, ForwardIterator last, - const T& value, const unsigned P) -{ - return std::remove(first, last, value); -} - -// Not (yet) parallelized, not straightforward due to possible dependencies -// between subtasks. -template -ForwardIterator remove_if(ForwardIterator first, ForwardIterator last, - Predicate pred, const unsigned P) -{ - return std::remove_if(first, last, pred); -} - -// Not parallelized due to possible complications with OutputIterators. -// No par_remove_copy exists due to possible dependencies between subtasks. -template -OutputIterator remove_copy(InputIterator first, InputIterator last, - OutputIterator result, const T& value, - const unsigned P) -{ - return std::remove_copy(first, last, result, value); -} - -// Not parallelized due to possible complications with OutputIterators. -// No par_remove_copy_if exists due to possible dependencies between subtasks. 
-template -OutputIterator remove_copy_if(InputIterator first, InputIterator last, - OutputIterator result, Predicate pred, - const unsigned P) -{ - return std::remove_copy(first, last, result, pred); -} - -template -void replace(ForwardIterator first, ForwardIterator last, const T& old_value, - const T& new_value, const unsigned P) -{ - if (detail::_linear_serial_is_faster(first, last, P)) - { - std::replace(first, last, old_value, new_value); - return; - } - - std::vector< std::pair > partitions(P); - ::omptl::detail::_partition_range(first, last, partitions, P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - std::replace(partitions[t].first, partitions[t].second, old_value, new_value); -} - -namespace detail -{ - -template -struct Replace_copy_if_ -{ - template - static Iterator2 - replace_copy_if(Iterator1 first, Iterator1 last, - Iterator2 result, Predicate pred, - const T& new_value, const unsigned P) - { - if (detail::_linear_serial_is_faster(first, last, P)) - return std::replace_copy_if(first, last, result, pred, new_value); - - std::vector< std::pair > source_partitions(P); - ::omptl::detail::_partition_range(first, last, source_partitions, P); - - std::vector dest_partitions(P); - ::omptl::detail::_copy_partitions(source_partitions, result, dest_partitions, P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - { - Iterator2 tmp; - *( (t == int(P-1)) ? &result : &tmp ) - = std::replace_copy_if(source_partitions[t].first, - source_partitions[t].second, - dest_partitions[t], pred, new_value); - } - - return result; - } - -}; - -template -struct Replace_copy_if_< std::input_iterator_tag, Iterator2Tag> -{ - template - static Iterator2 - replace_copy_if(Iterator1 first, Iterator1 last, - Iterator2 result, Predicate pred, - const T& new_value, const unsigned P) - { - return std::replace_copy_if(first, last, result, pred, new_value); - } -}; - -template -struct Replace_copy_if_< Iterator1Tag, std::output_iterator_tag> -{ - template - static OutputIterator - replace_copy_if(Iterator1 first, Iterator1 last, - OutputIterator result, Predicate pred, - const T& new_value, const unsigned P) - { - return std::replace_copy_if(first, last, result, pred, new_value); - } -}; - -template <> -struct Replace_copy_if_< std::input_iterator_tag, std::output_iterator_tag> -{ - template - static OutputIterator - replace_copy_if(InputIterator first, InputIterator last, - OutputIterator result, Predicate pred, - const T& new_value, const unsigned P) - { - return std::replace_copy_if(first, last, result, pred, new_value); - } -}; - -} // end namespace detail - -template -OutputIterator replace_copy_if(InputIterator first, InputIterator last, - OutputIterator result, Predicate pred, - const T& new_value, const unsigned P) -{ - return ::omptl::detail::Replace_copy_if_< - typename std::iterator_traits< InputIterator>::iterator_category, - typename std::iterator_traits::iterator_category> - ::replace_copy_if(first, last, result, pred, new_value, P); -} - -template -OutputIterator replace_copy(InputIterator first, InputIterator last, - OutputIterator result, const T& old_value, - const T& new_value, const unsigned P) -{ - return ::omptl::replace_copy_if(first, last, result, - std::bind2nd(std::equal_to(), old_value), new_value, P); -} - -template -void replace_if(ForwardIterator first, ForwardIterator last, Predicate pred, - const T& new_value, const unsigned P) -{ - if (detail::_linear_serial_is_faster(first, last, P)) - return std::replace_if(first, last, pred, new_value); - - 
std::vector< std::pair > partitions(P); - ::omptl::detail::_partition_range(first, last, partitions, P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - std::replace_if(partitions[t].first, partitions[t].second, pred, new_value); -} - -// TODO -template -void reverse(BidirectionalIterator first, BidirectionalIterator last, const unsigned P) -{ - std::reverse(first, last); -} - -// TODO -template -OutputIterator reverse_copy(BidirectionalIterator first, - BidirectionalIterator last, - OutputIterator result, const unsigned P) -{ - return std::reverse_copy(first, last, result); -} - -// TODO -template -ForwardIterator rotate( ForwardIterator first, ForwardIterator middle, - ForwardIterator last, const unsigned P) -{ - return std::rotate(first, middle, last); -} - -// TODO -template -OutputIterator rotate_copy(ForwardIterator first, ForwardIterator middle, - ForwardIterator last, OutputIterator result, - const unsigned P) -{ - return std::rotate(first, middle, last, result); -} -/* -This can't be right - partitioning the range might cut valid subsequences -in [first1-last1] -template -ForwardIterator1 search(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, ForwardIterator2 last2, - BinaryPredicate binary_pred, const unsigned P) -{ - if (detail::_linear_serial_is_faster(first1, last1, P)) - return std::search(first1, last1, first2, last2, - binary_pred); - - std::vector< std::pair > - partitions(P); - ::omptl::detail::_partition_range(first1, last1, partitions, P); - - std::vector results(P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - { - results[t] = std::search(partitions[t].first, - partitions[t].second, - first2, last2, binary_pred); - - - } - - const typename std::vector::iterator - result = std::find_if(results.begin(), results.end(), - std::bind2nd(std::not_equal_to(), - last1)); - - if (result != results.end()) - return *result; - - return last1; -} -*/ - -template -ForwardIterator1 search(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, ForwardIterator2 last2, - BinaryPredicate binary_pred, const unsigned P) -{ - return std::search(first1, last1, first2, last2, binary_pred); -} - -template -ForwardIterator1 search(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, ForwardIterator2 last2, - const unsigned P) -{ -// typedef typename -// std::iterator_traits::value_type VT; -// return ::omptl::search(first1, last1, first2, last2, -// std::equal_to(), P); - - return std::search(first1, last1, first2, last2); -} - -// TODO -template -ForwardIterator search_n(ForwardIterator first, ForwardIterator last, - Integer count, const T& value, - BinaryPredicate binary_pred, const unsigned P) -{ - return std::search_n(first, last, count, value, binary_pred); -} - -template -ForwardIterator search_n(ForwardIterator first, ForwardIterator last, - Integer count, const T& value, const unsigned P) -{ -// std::equal_to::value_type> - return std::search_n(first, last, count, value); -} - -template -OutputIterator set_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, - const unsigned P) -{ - return std::set_difference(first1, last1, first2, last2, result, comp); -} - -template -OutputIterator set_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, const unsigned P) -{ - return std::set_difference(first1, last1, 
first2, last2, result); -} - -template -OutputIterator set_intersection(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, - const unsigned P) -{ - return std::set_intersection( first1, last1, first2, last2, result, comp); -} - -template -OutputIterator set_intersection(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, const unsigned P) -{ - return std::set_intersection( first1, last1, first2, last2, result); -} - -template -OutputIterator -set_symmetric_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, - const unsigned P) -{ - return std::set_symmetric_difference( first1, last1, first2, last2, result, comp); -} - -template -OutputIterator -set_symmetric_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, const unsigned P) -{ - return std::set_symmetric_difference( first1, last1, first2, last2, result); -} - -template -OutputIterator set_union(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, - const unsigned P) -{ - return std::set_union(first1, last1, first2, last2, result, comp); -} - -template -OutputIterator set_union(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, const unsigned P) -{ - return std::set_union(first1, last1, first2, last2, result); -} - -template -void sort(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned P) -{ - if ( ::omptl::detail::_nlogn_serial_is_faster(first, last, P) ) - { - std::sort(first, last, comp); - return; - } - - assert(std::distance(first, last) >= 3u*P); - - // Generate pivots - typedef typename std::iterator_traits::value_type value_type; - - std::vector pivots; - ::omptl::detail::_find_pivots(first, last, pivots, comp, P); - - // Sort sufficiently to respect pivot order - typedef std::pair Partition; - std::vector< Partition > borders(1, std::make_pair(first, last)); - - std::vector pivot_used(pivots.size(), false); // can't be bool due to parallel write - - const unsigned max_depth = std::floor(std::log2(P)); - assert(1u << max_depth <= P); - for (unsigned i = 0; i < max_depth; ++i) - { - const int Npartitions = borders.size(); - assert(borders.size() == 1u << i); - assert(borders.size() <= P); -//std::cerr << "depth: " << i << " size: " << Npartitions << " new size: " << (2*Npartitions) << std::endl; - - std::vector< Partition > new_borders(2u*Npartitions); - - #pragma omp parallel for - for (int p = 0; p < Npartitions; ++p) - { - if (2*p+1 >= int(P)) - continue; - - const unsigned pivot_index = (2*p+1) * pivots.size() / (1u< partitions; - std::vector final; - std::vector dummy; - for (unsigned i = 0; i < borders.size(); ++i) - { - partitions.push_back(borders[i]); - dummy.push_back(false); - const unsigned pivot_index = (2*i+1) * pivots.size() / borders.size() / 2; - assert(pivot_index < pivots.size()); - if (pivot_used[pivot_index]) - final.push_back(true); - else - { - // meta-data first part - final.push_back(false); - - // dummy to be overwritten by splitting - partitions.push_back( std::make_pair(last, last) ); // dummy - dummy.push_back(true); - final.push_back(false); - } - } - assert(partitions.size() 
== P); - assert(final.size() == P); - assert(dummy.size() == P); -/* -for (unsigned i = 0; i < pivot_used.size(); ++i) - std::cout << bool(pivot_used[i]) << " "; -std::cout << std::endl; - -std::cout << borders.size() << " " << partitions.size() << " " << P << std::endl; -*/ - // Round one: sort final partitions, split remaining - #pragma omp parallel for - for (int i = 0; i < int(partitions.size()); ++i) - { - //std::cout << i; - if (final[i]) - { - assert(!dummy[i]); - std::sort(partitions[i].first, partitions[i].second, comp); - //std::cout << " sort"<< std::endl; - } - else if (dummy[i]) // will be handled by first part - { - assert(i > 0); - assert(!dummy[i-1]); - //std::cout << " skip"<< std::endl; - continue; - } - else - { - //std::cout << " split"<< std::endl; - assert(dummy[i+1]); - assert(!final[i+1]); - - const unsigned pivot_index = i * (P-1) / (partitions.size()-1); -//std::cerr << "\tp: " << i << " P: " << P << " Npartitions: " << partitions.size() << " pivot_index: " << pivot_index << std::endl; - assert(pivot_index < pivots.size()); - assert(!pivot_used[pivot_index]); - pivot_used[pivot_index] = true; - - const RandomAccessIterator begin = partitions[i].first; - const RandomAccessIterator end = partitions[i].second; - - const RandomAccessIterator middle = - detail::_pivot_range(begin, end, pivots[pivot_index], comp); - partitions[i ] = std::make_pair(begin, middle); - partitions[i+1] = std::make_pair(middle, end); - } - } - for (unsigned i = 0; i < pivot_used.size(); ++i) - assert(pivot_used[i]); - - assert(partitions.size() == P); - assert(std::find(pivot_used.begin(), pivot_used.end(), false) == pivot_used.end()); - - assert(partitions[0].first == first); - for (unsigned i = 0; i < P-1; ++i) - assert(partitions[i].second == partitions[i+1].first); - assert(partitions[P-1].second == last); - - // Sort last unsorted partitions - #pragma omp parallel for - for (int i = 0; i < int(partitions.size()); ++i) - if (!final[i]) - std::sort(partitions[i].first, partitions[i].second, comp); -} - -template -void sort(RandomAccessIterator first, RandomAccessIterator last, const unsigned P) -{ - typedef typename std::iterator_traits::value_type VT; - ::omptl::sort(first, last, std::less(), P); -} - -/* -template -void _par_stable_sort(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned P) -{ - if ( ::omptl::detail::_nlogn_serial_is_faster(first, last, P) ) - { - std::stable_sort(first, last, comp); - return; - } - - // Generate pivots - std::vector::value_type> - pivots; - _find_pivots(first, last, pivots, P); - - // Sort sufficiently to respect pivot order - std::vector< std::pair > - partitions(P); - ::omptl::detail::_partition_range_stable_by_pivots(first, last, pivots, - partitions, comp, P); - - // Sort - #pragma omp parallel for // default(none) shared(partitions) - for (int t = 0; t < int(P); ++t) - std::stable_sort(partitions[t].first, - partitions[t].second, comp); -} - -template -void _stable_sort(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned P) -{ - std::stable_sort(first, last, comp); -} - -template -void _stable_sort(RandomAccessIterator first, RandomAccessIterator last, - std::less::value_type> - comp, const unsigned P) -{ - ::omptl::detail::_par_stable_sort(first, last, comp, P); -} - -// template -// void _stable_sort(RandomAccessIterator first, RandomAccessIterator last, -// std::greater< -// typename std::iterator_traits::value_type> comp, -// const unsigned P) -// { -// 
::omptl::detail::_par_stable_sort(first, last, comp, P); -// } -*/ - -template -void stable_sort(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned P) -{ - std::stable_sort(first, last, comp); -} - -template -void stable_sort(RandomAccessIterator first, RandomAccessIterator last, const unsigned P) -{ - typedef typename std::iterator_traits::value_type VT; - ::omptl::stable_sort(first, last, std::less(), P); -} - -template -ForwardIterator2 swap_ranges(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, const unsigned P) -{ - if (detail::_linear_serial_is_faster(first1, last1, P)) - return std::swap_ranges(first1, last1, first2); - - std::vector< std::pair > source_partitions(P); - ::omptl::detail::_partition_range(first1, last1, source_partitions, P); - - std::vector dest_partitions(P); - ::omptl::detail::_copy_partitions(source_partitions, first2, dest_partitions, P); - - ForwardIterator2 result; - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - { - ForwardIterator2 tmp; - *( (t == int(P-1)) ? &result : &tmp ) - = std::swap_ranges(source_partitions[t].first, - source_partitions[t].second, - dest_partitions[t]); - } - - return result; -} - -namespace detail -{ - -template -struct Transform_ -{ - template - static IteratorOut transform(IteratorIn first, IteratorIn last, - IteratorOut result, UnaryFunction op, const unsigned P) - { - if (detail::_linear_serial_is_faster(first, last, P)) - return std::transform(first, last, result, op); - - std::vector< std::pair > source_partitions(P); - detail::_partition_range(first, last, source_partitions, P); - - std::vector dest_partitions(P); - detail::_copy_partitions(source_partitions, result, dest_partitions, P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - { - IteratorOut tmp; - *( (t == int(P-1)) ? 
&result : &tmp ) - = std::transform(source_partitions[t].first, - source_partitions[t].second, - dest_partitions[t], op); - } - - return result; - } -}; - -template -struct Transform_ -{ - template - static OutputIterator transform(InputIterator first, InputIterator last, - OutputIterator result, UnaryFunction op, - const unsigned P) - { - return std::transform(first, last, result, op); - } -}; - -template -struct Transform_< std::input_iterator_tag, IteratorOutTag > -{ - template - OutputIterator transform(InputIterator first, InputIterator last, - OutputIterator result, UnaryFunction op, - const unsigned P) - { - return std::transform(first, last, result, op); - } -}; - -template <> -struct Transform_< std::input_iterator_tag, std::output_iterator_tag > -{ - template - OutputIterator transform(InputIterator first, InputIterator last, - OutputIterator result, UnaryFunction op, - const unsigned P) - { - return std::transform(first, last, result, op); - } -}; - -} // end namespace detail - -template -OutputIterator transform(InputIterator first, InputIterator last, - OutputIterator result, UnaryFunction op, - const unsigned P) -{ - return ::omptl::detail::Transform_< - typename std::iterator_traits< InputIterator>::iterator_category, - typename std::iterator_traits::iterator_category>:: - transform(first, last, result, op, P); -} - -namespace detail -{ - -template -struct Transform2_ -{ - template - static IteratorOut transform(Iterator1 first1, Iterator1 last1, - Iterator2 first2, IteratorOut result, - BinaryFunction binary_op, const unsigned P) - { - if (detail::_linear_serial_is_faster(first1, last1, P)) - return std::transform(first1, last1, first2, result, binary_op); - - std::vector< std::pair > source_partitions1(P); - ::omptl::detail::_partition_range(first1, last1, source_partitions1, P); - - std::vector source_partitions2(P); - ::omptl::detail::_copy_partitions(source_partitions1, first2, source_partitions2 , P); - - std::vector dest_partitions(P); - ::omptl::detail::_copy_partitions(source_partitions1, result, dest_partitions, P); - - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - { - IteratorOut tmp; - *( (t == int(P-1)) ? 
&result : &tmp ) = - std::transform( source_partitions1[t].first, - source_partitions1[t].second, - source_partitions2[t], - dest_partitions [t], binary_op); - } - - return result; - } -}; - -template -struct Transform2_< std::input_iterator_tag, Iterator2Tag, IteratorOutTag > -{ - template - static OutputIterator - transform(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputIterator result, - BinaryFunction binary_op, const unsigned P) - { - return std::transform(first1, last1, first2, result, binary_op); - } -}; - -template -struct Transform2_< Iterator1Tag, std::input_iterator_tag, IteratorOutTag > -{ - template - static OutputIterator - transform(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputIterator result, - BinaryFunction binary_op, const unsigned P) - { - return std::transform(first1, last1, first2, result, binary_op); - } -}; - -template -struct Transform2_< Iterator1Tag, Iterator2Tag, std::output_iterator_tag> -{ - template - static OutputIterator - transform(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputIterator result, - BinaryFunction binary_op, const unsigned P) - { - return std::transform(first1, last1, first2, result, binary_op); - } -}; - -template -struct Transform2_< std::input_iterator_tag, - std::input_iterator_tag, IteratorOutTag > -{ - template - static OutputIterator - transform(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputIterator result, - BinaryFunction binary_op, const unsigned P) - { - return std::transform(first1, last1, first2, result, binary_op); - } -}; - -template -struct Transform2_< Iterator1Tag, std:: input_iterator_tag, - std::output_iterator_tag > -{ - template - static OutputIterator - transform(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputIterator result, - BinaryFunction binary_op, const unsigned P) - { - return std::transform(first1, last1, first2, result, binary_op); - } -}; - -template -struct Transform2_< std:: input_iterator_tag, Iterator2Tag, - std::output_iterator_tag > -{ - template - static OutputIterator - transform(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputIterator result, - BinaryFunction binary_op, const unsigned P) - { - return std::transform(first1, last1, first2, result, binary_op); - } -}; - -template <> -struct Transform2_< std:: input_iterator_tag, std:: input_iterator_tag, - std::output_iterator_tag > -{ - template - static OutputIterator - transform(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputIterator result, - BinaryFunction binary_op, const unsigned P) - { - return std::transform(first1, last1, first2, result, binary_op); - } -}; - -} // end namespace detail - -template -OutputIterator transform(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputIterator result, - BinaryFunction binary_op, const unsigned P) -{ - return ::omptl::detail::Transform2_< - typename std::iterator_traits::iterator_category, - typename std::iterator_traits::iterator_category, - typename std::iterator_traits::iterator_category>:: - transform(first1, last1, first2, result, binary_op, P); -} - -template -ForwardIterator unique(ForwardIterator first, ForwardIterator last, - BinaryPredicate binary_pred, const unsigned P) -{ - return std::unique(first, last, binary_pred); -} - -template -ForwardIterator unique(ForwardIterator first, ForwardIterator last, const unsigned P) -{ -// std::equal_to::value_type>(), - return 
std::unique(first, last); -} - -template -OutputIterator unique_copy(InputIterator first, InputIterator last, - OutputIterator result, BinaryPredicate binary_pred, - const unsigned P) -{ - return std::unique_copy(first, last, result, binary_pred); -} - -template -OutputIterator unique_copy(InputIterator first, InputIterator last, - OutputIterator result, const unsigned P) -{ -// std::equal_to::value_type>(), - return std::unique_copy(first, last, result); -} - -template -ForwardIterator upper_bound(ForwardIterator first, ForwardIterator last, - const T& value, StrictWeakOrdering comp, const unsigned P) -{ - if (detail::_logn_serial_is_faster(first, last, P)) - return std::upper_bound(first, last, value, comp); - - std::vector< std::pair > partitions(P); - ::omptl::detail::_partition_range(first, last, partitions, P); - - std::vector results(P); - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - results[t] = std::upper_bound(partitions[t].first, - partitions[t].second, value, comp); - - // There has to be a better way... - for (unsigned i = P - 1; i > 0; --i) - if (results[i] != partitions[i].second) - return results[i]; - - return results[0]; -} - -template -ForwardIterator upper_bound(ForwardIterator first, ForwardIterator last, const T& value, const unsigned P) -{ - typedef typename std::iterator_traits::value_type VT; - return ::omptl::upper_bound(first, last, value, std::less(), P); -} - -} /* namespace omptl */ - diff --git a/lib/omptl/omptl_algorithm_ser.h b/lib/omptl/omptl_algorithm_ser.h deleted file mode 100644 index 9a51e1bee..000000000 --- a/lib/omptl/omptl_algorithm_ser.h +++ /dev/null @@ -1,749 +0,0 @@ -// Copyright (C) 2006 Fokko Beekhof -// Email contact: Fokko.Beekhof@unige.ch - -// The OMPTL library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. - -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. 
- -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - - -namespace omptl -{ - -template -ForwardIterator adjacent_find(ForwardIterator first, ForwardIterator last, - const unsigned) -{ - return ::std::adjacent_find(first, last); -} - -template -ForwardIterator adjacent_find(ForwardIterator first, ForwardIterator last, - BinaryPredicate binary_pred, const unsigned) -{ - return ::std::adjacent_find(first, last, binary_pred); -} - -template -bool binary_search(ForwardIterator first, ForwardIterator last, const T& value, - StrictWeakOrdering comp, const unsigned) -{ - return ::std::binary_search(first, last, value, comp); -} - -template -bool binary_search(ForwardIterator first, ForwardIterator last, const T& value, - const unsigned) -{ - return ::std::binary_search(first, last, value); -} - -template -OutputIterator copy(InputIterator first, InputIterator last, - OutputIterator result, const unsigned) -{ - return ::std::copy(first, last, result); -} - -template -BidirectionalIterator2 copy_backward(BidirectionalIterator1 first, - BidirectionalIterator1 last, - BidirectionalIterator2 result, const unsigned) -{ - return ::std::copy_backward(first, last, result); -} - -template -typename ::std::iterator_traits::difference_type -count(InputIterator first, InputIterator last, const EqualityComparable& value, const unsigned) -{ - return ::std::count(first, last, value); -} - -template -void count(InputIterator first, InputIterator last, - const EqualityComparable& value, Size& n, const unsigned) -{ - return ::std::count_if(first, last, value, n); -} - -template -typename InputIterator::difference_type -count_if(InputIterator first, InputIterator last, Predicate pred, - const unsigned) -{ - return ::std::count_if(first, last, pred); -} - -template -void count_if(InputIterator first, InputIterator last, - Predicate pred, Size& n, const unsigned) -{ - return ::std::count_if(first, last, pred, n); -} - -template -bool equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, BinaryPredicate binary_pred, - const unsigned) -{ - return ::std::equal(first1, last1, first2, binary_pred); -} - -template -bool equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, const unsigned) -{ - return ::std::equal(first1, last1, first2); -} - -template -::std::pair -equal_range(ForwardIterator first, ForwardIterator last, const T& value, - StrictWeakOrdering comp, const unsigned) -{ - return ::std::equal_range(first, last, value, comp); -} - -template -::std::pair -equal_range(ForwardIterator first, ForwardIterator last, const T& value, - const unsigned) -{ - return ::std::equal_range(first, last, value); -} - -template -void fill(ForwardIterator first, ForwardIterator last, const T& value, - const unsigned) -{ - ::std::fill(first, last, value); -} - -template -OutputIterator fill_n(OutputIterator first, Size n, const T& value, - const unsigned) -{ - return ::std::fill_n(first, n, value); -} - -template -InputIterator find(InputIterator first, InputIterator last, - const EqualityComparable& value, const unsigned) -{ - return ::std::find(first, last, value); -} - -template -InputIterator find_if(InputIterator first, InputIterator last, - Predicate pred, const unsigned) -{ - return ::std::find_if(first, last, pred); -} - -template -ForwardIterator1 find_end(ForwardIterator1 first1, ForwardIterator1 last1, - 
ForwardIterator2 first2, ForwardIterator2 last2, - BinaryPredicate comp, const unsigned) -{ - return ::std::find_end(first1, last1, first2, last2, comp); -} - -template -ForwardIterator1 find_end(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, ForwardIterator2 last2, const unsigned) -{ - return ::std::find_end(first1, last1, first2, last2); -} - -template -InputIterator find_first_of(InputIterator first1, InputIterator last1, - ForwardIterator first2, ForwardIterator last2, - BinaryPredicate comp, const unsigned) -{ - return ::std::find_first_of(first1, last1, first2, last2, comp); -} - -template -InputIterator find_first_of(InputIterator first1, InputIterator last1, - ForwardIterator first2, ForwardIterator last2, - const unsigned) -{ - return ::std::find_first_of(first1, last1, first2, last2); -} - -template -UnaryFunction for_each(InputIterator first, InputIterator last, UnaryFunction f, - const unsigned) -{ - return ::std::for_each(first, last, f); -} - -template -void generate(ForwardIterator first, ForwardIterator last, Generator gen) -{ - ::std::generate(first, last, gen); -} - -template -void par_generate(ForwardIterator first, ForwardIterator last, Generator gen, - const unsigned) -{ - ::std::generate(first, last, gen); -} - -template -void push_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned) -{ - ::std::push_heap(first, last, comp); -} - -template -void push_heap(RandomAccessIterator first, RandomAccessIterator last, - const unsigned) -{ - ::std::push_heap(first, last); -} - -template -void pop_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned) -{ - ::std::pop_heap(first, last, comp); -} - -template -void pop_heap(RandomAccessIterator first, RandomAccessIterator last, - const unsigned) -{ - ::std::pop_heap(first, last); -} - -template -void make_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned) -{ - ::std::make_heap(first, last, comp); -} - -template -void make_heap(RandomAccessIterator first, RandomAccessIterator last, const unsigned) -{ - ::std::make_heap(first, last); -} - -template -void sort_heap(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned) -{ - ::std::sort_heap(first, last, comp); -} - -template -void sort_heap(RandomAccessIterator first, RandomAccessIterator last, - const unsigned) -{ - ::std::sort_heap(first, last); -} - -template -bool includes(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - StrictWeakOrdering comp, const unsigned) -{ - return ::std::includes(first1, last1, first2, last2, comp); -} - -template -bool includes(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, const unsigned) -{ - return ::std::includes(first1, last1, first2, last2); -} - -template -bool lexicographical_compare(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - BinaryPredicate comp, const unsigned) -{ - return ::std::lexicographical_compare(first1, last1, first2, last2, comp); -} - -template -bool lexicographical_compare(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - const unsigned) -{ - return ::std::lexicographical_compare(first1, last1, first2, last2); -} - -template -ForwardIterator lower_bound(ForwardIterator first, ForwardIterator last, - const T& value, StrictWeakOrdering 
comp, const unsigned) -{ - return ::std::lower_bound(first, last, value, comp); -} - -template -ForwardIterator lower_bound(ForwardIterator first, ForwardIterator last, - const T& value, const unsigned) -{ - return ::std::lower_bound(first, last, value); -} - -template -OutputIterator merge(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, - const unsigned) -{ - return ::std::mismatch(first1, last1, first2, last2, result, comp); -} - -template -OutputIterator merge(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, const unsigned) -{ - return ::std::mismatch(first1, last1, first2, last2, result); -} - -template -ForwardIterator min_element(ForwardIterator first, ForwardIterator last, - BinaryPredicate comp, const unsigned) -{ - return ::std::min_element(first, last, comp); -} - -template -ForwardIterator min_element(ForwardIterator first, ForwardIterator last, - const unsigned) -{ - return ::std::min_element(first, last); -} - -template -ForwardIterator max_element(ForwardIterator first, ForwardIterator last, - BinaryPredicate comp, const unsigned) -{ - return ::std::max_element(first, last, comp); -} - -template -ForwardIterator max_element(ForwardIterator first, ForwardIterator last, - const unsigned) -{ - return ::std::max_element(first, last); -} - -template -::std::pair -mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, - BinaryPredicate binary_pred, const unsigned) -{ - return ::std::mismatch(first1, last1, first2, binary_pred); -} - -template -::std::pair -mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, - const unsigned) -{ - return ::std::mismatch(first1, last1, first2); -} - -template -void nth_element(RandomAccessIterator first, RandomAccessIterator nth, - RandomAccessIterator last, StrictWeakOrdering comp, - const unsigned) -{ - return ::std::nth_element(first, nth, last, comp); -} - -template -void nth_element(RandomAccessIterator first, RandomAccessIterator nth, - RandomAccessIterator last, const unsigned) -{ - return ::std::nth_element(first, nth, last); -} - -template -void partial_sort(RandomAccessIterator first, RandomAccessIterator middle, - RandomAccessIterator last, StrictWeakOrdering comp, const unsigned) -{ - return ::std::partial_sort(first, middle, last, comp); -} - -template -void partial_sort(RandomAccessIterator first, RandomAccessIterator middle, - RandomAccessIterator last, const unsigned) -{ - return ::std::partial_sort(first, middle, last); -} - -template -RandomAccessIterator partial_sort_copy(InputIterator first, InputIterator last, - RandomAccessIterator result_first, RandomAccessIterator result_last, - StrictWeakOrdering comp, const unsigned) -{ - return ::std::partial_sort_copy(first, last, - result_first, result_last, comp); -} - -template -RandomAccessIterator partial_sort_copy(InputIterator first, InputIterator last, - RandomAccessIterator result_first, - RandomAccessIterator result_last, const unsigned) -{ - return ::std::partial_sort_copy(first, last, result_first, result_last); -} - -template -ForwardIterator partition(ForwardIterator first, ForwardIterator last, - Predicate pred, const unsigned) -{ - return ::std::partition(first, last, pred); -} - -template -bool next_permutation(BidirectionalIterator first, BidirectionalIterator last, - StrictWeakOrdering comp, const unsigned) -{ - return ::std::next_permutation(first, last, comp); -} 
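// Every overload in this serial fallback keeps the trailing "const unsigned"
// thread-count argument of the parallel header and simply ignores it, so the
// same caller code compiles with or without OpenMP. A minimal usage sketch;
// the include path and the thread count of 4 are assumptions for
// illustration, not taken from this diff.
#include <vector>
#include "omptl/omptl_algorithm"   // assumed public header name

inline void omptl_sort_usage_sketch()
{
    std::vector<int> v;
    v.push_back(3);
    v.push_back(1);
    v.push_back(2);
    // Dispatches to the parallel implementation when built with OpenMP and to
    // plain std::sort otherwise; 4 is the requested number of partitions.
    omptl::sort(v.begin(), v.end(), 4u);
}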
- -template -bool next_permutation(BidirectionalIterator first, BidirectionalIterator last, const unsigned) -{ - return ::std::next_permutation(first, last); -} - -template -bool prev_permutation(BidirectionalIterator first, BidirectionalIterator last, - StrictWeakOrdering comp, const unsigned) -{ - return ::std::prev_permutation(first, last, comp); -} - -template -bool prev_permutation(BidirectionalIterator first, BidirectionalIterator last, const unsigned) -{ - return ::std::prev_permutation(first, last); -} - -template -ForwardIterator stable_partition(ForwardIterator first, ForwardIterator last, - Predicate pred, const unsigned) -{ - return ::std::stable_partition(first, last, pred); -} - -template -void random_shuffle(RandomAccessIterator first, RandomAccessIterator last, const unsigned) -{ - return ::std::random_shuffle(first, last); -} - -template -void random_shuffle(RandomAccessIterator first, RandomAccessIterator last, - RandomNumberGenerator &rgen, const unsigned) -{ - return ::std::random_shuffle(first, last, rgen); -} - -template -ForwardIterator remove(ForwardIterator first, ForwardIterator last, - const T& value, const unsigned) -{ - return ::std::remove(first, last, value); -} - -template -ForwardIterator remove_if(ForwardIterator first, ForwardIterator last, - Predicate pred, const unsigned) -{ - return ::std::remove_if(first, last, pred); -} - -template -OutputIterator remove_copy(InputIterator first, InputIterator last, - OutputIterator result, const T& value, const unsigned) -{ - return ::std::remove_copy(first, last, result, value); -} - -template -OutputIterator remove_copy_if(InputIterator first, InputIterator last, - OutputIterator result, Predicate pred, const unsigned) -{ - return ::std::remove_copy_if(first, last, result, pred); -} - -template -void replace(ForwardIterator first, ForwardIterator last, const T& old_value, - const T& new_value, const unsigned) -{ - return ::std::replace(first, last, old_value, new_value); -} - -template -OutputIterator replace_copy(InputIterator first, InputIterator last, - OutputIterator result, const T& old_value, - const T& new_value, const unsigned) -{ - return ::std::replace_copy(first, last, result, old_value, new_value); -} - -template -OutputIterator replace_copy_if(InputIterator first, InputIterator last, - OutputIterator result, Predicate pred, - const T& new_value, const unsigned) -{ - return ::std::replace_copy_if(first, last, result, pred, new_value); -} - -template -void replace_if(ForwardIterator first, ForwardIterator last, Predicate pred, - const T& new_value, const unsigned) -{ - return ::std::replace_if(first, last, pred, new_value); -} - -template -void reverse(BidirectionalIterator first, BidirectionalIterator last, const unsigned) -{ - return ::std::reverse(first, last); -} - -template -OutputIterator reverse_copy(BidirectionalIterator first, - BidirectionalIterator last, - OutputIterator result, const unsigned) -{ - return ::std::reverse_copy(first, last, result); -} - -template -ForwardIterator rotate( ForwardIterator first, ForwardIterator middle, - ForwardIterator last, const unsigned) -{ - return ::std::rotate(first, middle, last); -} - -template -OutputIterator rotate_copy(ForwardIterator first, ForwardIterator middle, - ForwardIterator last, OutputIterator result, const unsigned) -{ - return ::std::rotate_copy(first, middle, last, result); -} - -template -ForwardIterator1 search(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, ForwardIterator2 last2, - BinaryPredicate 
binary_pred, const unsigned) -{ - return ::std::search(first1, last1, first2, last2, binary_pred); -} - -template -ForwardIterator1 search(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, ForwardIterator2 last2, - const unsigned) -{ - return ::std::search(first1, last1, first2, last2); -} - -template -ForwardIterator search_n(ForwardIterator first, ForwardIterator last, - Integer count, const T& value, - BinaryPredicate binary_pred, const unsigned) -{ - return ::std::search_n(first, last, count, value, binary_pred); -} - -template -ForwardIterator search_n(ForwardIterator first, ForwardIterator last, - Integer count, const T& value, const unsigned) -{ - return ::std::search_n(first, last, count, value); -} - -template -OutputIterator set_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, const unsigned) -{ - return ::std::set_difference(first1, last1, first2, last2, result,comp); -} - -template -OutputIterator set_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, const unsigned) -{ - return ::std::set_difference(first1, last1, first2, last2, result); -} - -template -OutputIterator set_intersection(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, const unsigned) -{ - return ::std::set_intersection(first1, last1, - first2, last2, result, comp); -} - -template -OutputIterator set_intersection(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, const unsigned) -{ - return ::std::set_intersection(first1, last1, first2, last2, result); -} - -template -OutputIterator -set_symmetric_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, const unsigned) -{ - return ::std::set_symmetric_difference(first1, last1, - first2, last2, result, comp); -} - -template -OutputIterator -set_symmetric_difference(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, const unsigned) -{ - return ::std::set_symmetric_difference(first1, last1, - first2, last2, result); -} - -template -OutputIterator set_union(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, StrictWeakOrdering comp, const unsigned) -{ - return ::std::set_union(first1, last1, first2, last2, result, comp); -} - -template -OutputIterator set_union(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, const unsigned) -{ - return ::std::set_union(first1, last1, first2, last2, result); -} - -template -void stable_sort(RandomAccessIterator first, RandomAccessIterator last, const unsigned) -{ - return ::std::stable_sort(first, last); -} - -template -void stable_sort(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned) -{ - return ::std::stable_sort(first, last, comp); -} - -template -void sort(RandomAccessIterator first, RandomAccessIterator last, const unsigned) -{ - return ::std::sort(first, last); -} - -template -void sort(RandomAccessIterator first, RandomAccessIterator last, - StrictWeakOrdering comp, const unsigned) -{ - return ::std::sort(first, last, comp); -} - 
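// These one-line forwarders exist so the public OMPTL headers can select an
// implementation at compile time: with OpenMP enabled they include the
// parallel variant, otherwise this serial one, and the two expose identical
// signatures. The omptl_numeric header further down in this diff uses the
// same switch; the exact file names below are an assumption for illustration,
// not a quote of the deleted header.
#ifdef _OPENMP
    #include "omptl/omptl_algorithm_par.h"
#else
    #include "omptl/omptl_algorithm_ser.h"
#endif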
-template -ForwardIterator2 swap_ranges(ForwardIterator1 first1, ForwardIterator1 last1, - ForwardIterator2 first2, const unsigned) -{ - return ::std::swap_ranges(first1, last1, first2); -} - -template -OutputIterator transform(InputIterator first, InputIterator last, - OutputIterator result, UnaryFunction op, const unsigned) -{ - return ::std::transform(first, last, result, op); -} - -template -OutputIterator transform(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputIterator result, - BinaryFunction binary_op, const unsigned) -{ - return ::std::transform(first1, last1, first2, result, binary_op); -} - -template -ForwardIterator unique(ForwardIterator first, ForwardIterator last, - BinaryPredicate binary_pred, const unsigned) -{ - return ::std::unique(first, last, binary_pred); -} - -template -ForwardIterator unique(ForwardIterator first, ForwardIterator last, const unsigned) -{ - return ::std::unique(first, last); -} - -template -OutputIterator unique_copy(InputIterator first, InputIterator last, - OutputIterator result, BinaryPredicate binary_pred, const unsigned) -{ - return ::std::unique_copy(first, last, result, binary_pred); -} - -template -OutputIterator unique_copy(InputIterator first, InputIterator last, - OutputIterator result, const unsigned) -{ - return ::std::unique_copy(first, last, result); -} - -template -ForwardIterator upper_bound(ForwardIterator first, ForwardIterator last, - const T& value, StrictWeakOrdering comp, const unsigned) -{ - return ::std::upper_bound(first, last, value, comp); -} - -template -ForwardIterator upper_bound(ForwardIterator first, ForwardIterator last, - const T& value, const unsigned) -{ - return ::std::upper_bound(first, last, value); -} - -} // namespace omptl diff --git a/lib/omptl/omptl_numeric b/lib/omptl/omptl_numeric deleted file mode 100644 index c64a9b0d7..000000000 --- a/lib/omptl/omptl_numeric +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (C) 2006 Fokko Beekhof -// Email contact: Fokko.Beekhof@unige.ch - -// The OMPTL library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. - -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. - -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -#ifndef OMPTL_NUMERIC -#define OMPTL_NUMERIC 1 - -#include -#include - -namespace omptl -{ - -template -T accumulate(InputIterator first, InputIterator last, T init, - const unsigned P = _Pfunc::Pfunc()); - -template -T accumulate(InputIterator first, InputIterator last, T init, - BinaryFunction binary_op, - const unsigned P = _Pfunc::Pfunc()); - -/* - * Not (yet) paralellized due to data dependance. 
- */ -template -OutputIterator -adjacent_difference(InputIterator first, InputIterator last, - OutputIterator result, BinaryFunction binary_op, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator adjacent_difference(InputIterator first, InputIterator last, - OutputIterator result, - const unsigned P = _Pfunc::Pfunc()); - -template -T inner_product(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, T init, - BinaryFunction1 binary_op1, BinaryFunction2 binary_op2, - const unsigned P = _Pfunc::Pfunc()); - -template -T inner_product(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, T init, - const unsigned P = _Pfunc::Pfunc()); - -// Not paralellized due to dependencies and complications with OutputIterators. -template -OutputIterator partial_sum(InputIterator first, InputIterator last, - OutputIterator result, BinaryOperation binary_op, - const unsigned P = _Pfunc::Pfunc()); - -template -OutputIterator partial_sum(InputIterator first, InputIterator last, - OutputIterator result, const unsigned P = _Pfunc::Pfunc()); - -} // namespace omptl - -#ifdef _OPENMP - #include -#else - #include -#endif - -#include - -#endif /* OMPTL_NUMERIC */ diff --git a/lib/omptl/omptl_numeric_extensions.h b/lib/omptl/omptl_numeric_extensions.h deleted file mode 100644 index 2da12a34d..000000000 --- a/lib/omptl/omptl_numeric_extensions.h +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright (C) 2007-2011 Fokko Beekhof -// Email contact: Fokko.Beekhof@unige.ch - -// The OMPTL library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. - -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. 
- -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -#include - -namespace omptl -{ - -// Extentions - -template -T transform_accumulate(Iterator first, Iterator last, T init, - UnaryFunction unary_op, BinaryFunction binary_op, - const unsigned P = _Pfunc::Pfunc()); - -template -T transform_accumulate(Iterator first, Iterator last, T init, - UnaryFunction unary_op, const unsigned P = _Pfunc::Pfunc()); - -// "Manhattan" distance -template -typename ::std::iterator_traits::value_type -L1(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, const unsigned P = _Pfunc::Pfunc()); - -// "Euclidean" distance -template -typename ::std::iterator_traits::value_type -L2(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, const unsigned P = _Pfunc::Pfunc()); - -// "Euclidean" length -template -typename ::std::iterator_traits::value_type -L2(InputIterator first, InputIterator last, const unsigned P = _Pfunc::Pfunc()); - -} // namespace - -#ifdef _OPENMP - #include -#else - #include -#endif - -namespace omptl -{ - -// "Manhattan" distance -template -typename ::std::iterator_traits::value_type -L1(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, const unsigned P) -{ - typedef typename ::std::iterator_traits::value_type VT; - return ::omptl::inner_product(first1, last1, first2, VT(0), - std::plus(), std::minus(), P); -} - -template -struct _MinusSq -{ - T operator()(const T &lhs, const T &rhs) const - { - const T d = lhs - rhs; - return d*d; - } -}; - -// "Euclidean" distance -template -typename ::std::iterator_traits::value_type -L2(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, const unsigned P) -{ - typedef typename ::std::iterator_traits::value_type VT; - return ::std::sqrt(::omptl::inner_product(first1, last1, first2, VT(0), - std::plus(), _MinusSq(), P)); -} - -template -struct _Sq -{ - T operator()(const T &d) const - { - return d*d; - } -}; - -// "Euclidean" length -template -typename ::std::iterator_traits::value_type -L2(InputIterator first, InputIterator last, const unsigned P) -{ - typedef typename ::std::iterator_traits::value_type VT; - return ::std::sqrt(::omptl::transform_accumulate(first, last, VT(0), - _Sq(), std::plus(), P)); -} - -} /* namespace _OMPTL_EXTENTION_NAMESPACE */ diff --git a/lib/omptl/omptl_numeric_extensions_par.h b/lib/omptl/omptl_numeric_extensions_par.h deleted file mode 100644 index 3fa131303..000000000 --- a/lib/omptl/omptl_numeric_extensions_par.h +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright (C) 2007 Fokko Beekhof -// Email contact: Fokko.Beekhof@unige.ch - -// The OMPTL library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. - -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. 
- -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -namespace omptl -{ - -namespace detail -{ - -// transform_accumulate -template -T _ser_transform_accumulate(Iterator first, Iterator last, T init, - UnaryFunction unary_op, BinaryFunction binary_op) -{ - // serial version - while (first != last) - { - init = binary_op(unary_op(*first), init); - ++first; - } - - return init; -} - -template -T _par_transform_accumulate(Iterator first, Iterator last, - const T init, const T par_init, - UnaryFunction unary_op, BinaryFunction binary_op, - const unsigned P) -{ - assert(P > 0u); - if (_linear_serial_is_faster(first, last, P)) - return _ser_transform_accumulate(first, last, init, - unary_op, binary_op); - assert(2*(int)P <= std::distance(first, last)); - - ::std::vector< ::std::pair > partitions(P); - _partition_range(first, last, partitions, P); - - ::std::vector results(P); - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - results[t] = _ser_transform_accumulate(partitions[t].first, - partitions[t].second, par_init, - unary_op, binary_op); - - return ::std::accumulate(results.begin(), results.end(), - init, binary_op); -} - -template -T _transform_accumulate(Iterator first, Iterator last, - const T init, UnaryFunction unary_op, - ::std::plus binary_op, - const unsigned P) -{ - return ::omptl::detail::_par_transform_accumulate(first, last, init, - typename UnaryFunction::result_type(0), - unary_op, binary_op, P); -} - -template -T _transform_accumulate(Iterator first, Iterator last, const T init, - UnaryFunction unary_op, - ::std::multipliesbinary_op, - const unsigned P) -{ - return ::omptl::detail::_par_transform_accumulate(first, last, init, - typename UnaryFunction::result_type(1), - unary_op, binary_op, P); -} - -template -T _transform_accumulate(Iterator first, Iterator last, const T init, - UnaryFunction unary_op, BinaryFunction binary_op, - const unsigned P) -{ - return ::omptl::detail::_ser_transform_accumulate(first, last, init, - unary_op,binary_op); -} - -template -struct _TransformAccumulate -{ - template - static typename BinaryFunction::result_type - transform_accumulate(Iterator first, Iterator last, const T init, - UnaryFunction unary_op, BinaryFunction binary_op, - const unsigned P) - { - return ::omptl::detail::_transform_accumulate(first, last, init, - unary_op, binary_op, P); - } -}; - -template <> -struct _TransformAccumulate< ::std::input_iterator_tag > -{ - template - static typename BinaryFunction::result_type - transform_accumulate(Iterator first, Iterator last, const T init, - UnaryFunction unary_op, BinaryFunction binary_op, - const unsigned P) - { - return ::omptl::detail::_ser_transform_accumulate(first, last, init, - unary_op, binary_op); - } -}; - -} // end namespace detail - -template -T transform_accumulate(Iterator first, Iterator last, const T init, - UnaryFunction unary_op, BinaryFunction binary_op, - const unsigned P) -{ - return ::omptl::detail::_TransformAccumulate - ::iterator_category> - ::transform_accumulate(first, last, init, - unary_op, binary_op, P); -} - -template -T transform_accumulate(Iterator first, Iterator last, - const T init, UnaryFunction unary_op, - const unsigned P) -{ - typedef typename UnaryFunction::result_type RT; - return ::omptl::transform_accumulate(first, last, init, unary_op, - ::std::plus(), P); -} - -} /* namespace omptl */ diff --git 
a/lib/omptl/omptl_numeric_extensions_ser.h b/lib/omptl/omptl_numeric_extensions_ser.h deleted file mode 100644 index 09dc33b3f..000000000 --- a/lib/omptl/omptl_numeric_extensions_ser.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (C) 2007 Fokko Beekhof -// Email contact: Fokko.Beekhof@unige.ch - -// The OMPTL library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. - -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. - -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -namespace omptl -{ - -// transform_accumulate -template -T transform_accumulate(Iterator first, Iterator last, T init, - UnaryFunction unary_op, BinaryFunction binary_op, - const unsigned P) -{ - // serial version - while (first != last) - { - init = binary_op(unary_op(*first), init); - ++first; - } - - return init; -} - -template -T transform_accumulate(Iterator first, Iterator last, - T init, UnaryFunction unary_op, - const unsigned P) -{ - return omptl::transform_accumulate(first, last, init, unary_op, - std::plus()); -} - -} /* namespace std */ diff --git a/lib/omptl/omptl_numeric_par.h b/lib/omptl/omptl_numeric_par.h deleted file mode 100644 index 1a82085c5..000000000 --- a/lib/omptl/omptl_numeric_par.h +++ /dev/null @@ -1,277 +0,0 @@ -// Copyright (C) 2006 Fokko Beekhof -// Email contact: Fokko.Beekhof@unige.ch - -// The OMPTL library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. - -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. 
- -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -#ifndef OMPTL_NUMERIC_H -#define OMPTL_NUMERIC_H 1 - -#include -#include -#include -#include - -#include -#include - -namespace omptl -{ - -namespace detail -{ - -template -struct _Accumulate -{ - template - static T accumulate(InputIterator first, InputIterator last, T init, - T par_init, BinaryFunction binary_op, const unsigned P) - { - assert(P > 0u); - if (detail::_linear_serial_is_faster(first, last, P)) - return ::std::accumulate(first, last, init, binary_op); - assert(2*(int)P <= std::distance(first, last)); - - ::std::vector< ::std::pair > partitions(P); - ::omptl::detail::_partition_range(first, last, partitions, P); - - ::std::vector results(P); - #pragma omp parallel for - for (int t = 0; t < int(P); ++t) - results[t] = ::std::accumulate( partitions[t].first, - partitions[t].second, - par_init, binary_op); - - return ::std::accumulate(results.begin(), results.end(), - init, binary_op); - } -}; - -template <> -struct _Accumulate< ::std::input_iterator_tag > -{ - template - static T accumulate(InputIterator first, InputIterator last, T init, - T par_init, BinaryFunction binary_op, const unsigned P) - { - return ::std::accumulate(first, last, init, binary_op); - } -}; - -template -struct _AccumulateOp -{ - static T accumulate(InputIterator first, InputIterator last, T init, - BinaryFunction binary_op, const unsigned P) - { - assert(P > 0u); - return ::std::accumulate(first, last, init, binary_op); - } -}; - -template -struct _AccumulateOp > -{ - typedef ::std::plus BinaryFunction; - - static T accumulate(InputIterator first, InputIterator last, T init, - BinaryFunction binary_op, const unsigned P) - { - assert(P > 0u); - return ::omptl::detail::_Accumulate< typename - ::std::iterator_traits::iterator_category>:: - accumulate(first, last, init, T(0), binary_op, P); - } -}; - -template -struct _AccumulateOp > -{ - typedef ::std::multiplies BinaryFunction; - - static T accumulate(InputIterator first, InputIterator last, T init, - BinaryFunction binary_op, const unsigned P) - { - assert(P > 0u); - return ::omptl::detail::_Accumulate::iterator_category>:: - accumulate(first, last, init, T(1), binary_op, P); - } -}; - -} // end namespace detail - -template -T accumulate(InputIterator first, InputIterator last, T init, - BinaryFunction binary_op, const unsigned P) -{ - assert(P > 0u); - return ::omptl::detail::_AccumulateOp:: - accumulate(first, last, init, binary_op, P); -} - -template -T accumulate(InputIterator first, InputIterator last, T init, const unsigned P) -{ - assert(P > 0u); - - return ::omptl::accumulate(first, last, init, std::plus(), P); -} - -template -OutputIterator adjacent_difference(InputIterator first, InputIterator last, - OutputIterator result, - BinaryFunction binary_op, const unsigned P) -{ - return ::std::adjacent_difference(first, last, result, binary_op); -} - -template -OutputIterator adjacent_difference(InputIterator first, InputIterator last, - OutputIterator result, const unsigned P) -{ - return ::std::adjacent_difference(first, last, result); -} - -namespace detail -{ - -template -struct _InnerProduct -{ - template - static T inner_product( InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, T init, - BinaryFunction1 binary_op1, - BinaryFunction2 binary_op2, const unsigned P) - { - return ::std::inner_product(first1, 
last1, first2, init, binary_op1, binary_op2); - } - - template - static T inner_product(Iterator1 first1, Iterator1 last1, - Iterator2 first2, T init, - ::std::plus binary_op1, - BinaryFunction2 binary_op2, const unsigned P) - { - assert(P > 0u); - if (detail::_linear_serial_is_faster(first1, last1, P)) - return ::std::inner_product(first1, last1, first2, init, binary_op1, binary_op2); - - assert(2*(int)P <= std::distance(first1, last1)); - - ::std::vector< ::std::pair > partitions1(P); - ::omptl::detail::_partition_range(first1, last1, partitions1, P); - ::std::vector partitions2(P); - ::omptl::detail::_copy_partitions(partitions1, first2, partitions2, P); - - #pragma omp parallel for reduction(+:init) - for (int t = 0; t < int(P); ++t) - init += ::std::inner_product(partitions1[t].first, - partitions1[t].second, - partitions2[t], T(0.0), - binary_op1, binary_op2); - - return init; - } -}; - -template -struct _InnerProduct< ::std::input_iterator_tag, Iterator2Tag> -{ - template - static T inner_product(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, T init, - BinaryFunction1 binary_op1, BinaryFunction2 binary_op2, - const unsigned P) - { - return ::std::inner_product(first1, last1, first2, init, binary_op1, binary_op2); - } -}; - -template -struct _InnerProduct -{ - template - static T inner_product(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, T init, - BinaryFunction1 binary_op1, BinaryFunction2 binary_op2, - const unsigned P) - { - return ::std::inner_product(first1, last1, first2, init, binary_op1, binary_op2); - } -}; - -template <> -struct _InnerProduct< ::std::input_iterator_tag, ::std::input_iterator_tag > -{ - template - static T inner_product(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, T init, - BinaryFunction1 binary_op1, BinaryFunction2 binary_op2, - const unsigned P) - { - return ::std::inner_product(first1, last1, first2, init, binary_op1, binary_op2); - } -}; - -} // end namespace detail - -template -T inner_product(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, T init, - BinaryFunction1 binary_op1, BinaryFunction2 binary_op2, - const unsigned P) -{ - return detail::_InnerProduct< - typename ::std::iterator_traits::iterator_category, - typename ::std::iterator_traits::iterator_category> - ::inner_product(first1, last1, first2, init, - binary_op1, binary_op2, P); -} - -template -T inner_product(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, T init, const unsigned P) -{ - return ::omptl::inner_product(first1, last1, first2, init, - ::std::plus(), ::std::multiplies(), P); -} - -template -OutputIterator partial_sum(InputIterator first, InputIterator last, - OutputIterator result, BinaryOperation binary_op, - const unsigned P) -{ - return ::std::partial_sum(first, last, result, binary_op); -} - -template -OutputIterator partial_sum(InputIterator first, InputIterator last, - OutputIterator result, const unsigned P) -{ -// ::std::plus::value_type>(), - return ::std::partial_sum(first, last, result); -} - -} // namespace omptl - -#endif /* OMPTL_NUMERIC_H */ diff --git a/lib/omptl/omptl_numeric_ser.h b/lib/omptl/omptl_numeric_ser.h deleted file mode 100644 index 912f93ec6..000000000 --- a/lib/omptl/omptl_numeric_ser.h +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (C) 2006 Fokko Beekhof -// Email contact: Fokko.Beekhof@unige.ch - -// The OMPTL library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// 
License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. - -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. - -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -namespace omptl -{ - -template -T accumulate(InputIterator first, InputIterator last, T init, - const unsigned P) -{ - return ::std::accumulate(first, last, init); -} - -template -T accumulate(InputIterator first, InputIterator last, T init, - BinaryFunction binary_op, const unsigned P) -{ - return ::std::accumulate(first, last, init, binary_op); -} - -/* - * Not (yet) paralellized due to data dependance. - */ -template -OutputIterator adjacent_difference(InputIterator first, InputIterator last, - OutputIterator result, BinaryFunction binary_op, - const unsigned P) -{ - return ::std::adjacent_difference(first, last, result, binary_op); -} - -template -OutputIterator adjacent_difference(InputIterator first, InputIterator last, - OutputIterator result, const unsigned P) -{ - return ::std::adjacent_difference(first, last, result); -} - -template -T inner_product(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, T init, - BinaryFunction1 binary_op1, BinaryFunction2 binary_op2, - const unsigned P) -{ - return ::std::inner_product(first1, last1, first2, init, - binary_op1, binary_op2); -} - -template -T inner_product(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, T init, const unsigned P) -{ - return ::std::inner_product(first1, last1, first2, init); -} - -// Not paralellized due to dependencies and complications with OutputIterators. -template -OutputIterator partial_sum(InputIterator first, InputIterator last, - OutputIterator result, BinaryOperation binary_op, - const unsigned P) -{ - return ::std::partial_sum(first, last, result, binary_op); -} - -template -OutputIterator partial_sum(InputIterator first, InputIterator last, - OutputIterator result, const unsigned P) -{ - return ::std::partial_sum(first, last, result); -} - -} // namespace omptl diff --git a/lib/omptl/omptl_tools.h b/lib/omptl/omptl_tools.h deleted file mode 100644 index 72ce323c9..000000000 --- a/lib/omptl/omptl_tools.h +++ /dev/null @@ -1,266 +0,0 @@ -// Copyright (C) 2006-2011 Fokko Beekhof -// Email contact: Fokko.Beekhof@unige.ch - -// The OMPTL library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. - -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. 
- -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -#ifndef OMPTL_TOOLS_H -#define OMPTL_TOOLS_H 1 - -#include -#include -#include -#include -#include -#include - -//#include - -namespace omptl -{ - -namespace detail -{ - - -// Log of the number of operations that is expected to run faster in a single -// thread. -const unsigned C = 12; - -template -T log2N_(T n) -{ - assert(n > 0); - const std::size_t N = CHAR_BIT*sizeof(T); - - T result = 0; - for (std::size_t i = 1; i < N; ++i) - { - const std::size_t M = N-i; - if ( n >= (std::size_t(1) << M) ) - { - n >>= M; - result |= M; - } - } - - return result; -} - -template -bool _linear_serial_is_faster(Iterator first, Iterator last, - const unsigned P) -{ - assert(P > 0u); - assert(std::distance(first, last) >= 0); - const std::size_t N = std::distance(first, last); - - return (N < 4u*P) || (log2N_(N) < C); -} - -template -bool _logn_serial_is_faster(Iterator first, Iterator last, - const unsigned P) -{ - assert(P > 0u); - assert(std::distance(first, last) >= 0); - const std::size_t N = std::distance(first, last); - - return (N < 4u*P) || (log2N_(N) < (1 << C)); -} - -template -bool _nlogn_serial_is_faster(Iterator first, Iterator last, - const unsigned P) -{ - assert(P > 0u); - assert(std::distance(first, last) >= 0); - const std::size_t N = std::distance(first, last); - - return (N < 4u*P) || (N*log2N_(N) < (1 << C)); -} - -template -void _copy_partitions(const std::vector< std::pair > - &source_partitions, Iterator2 first, - std::vector &dest_partitions, const unsigned P) -{ - assert(source_partitions.size() == P); - assert(dest_partitions.size() == P); - for (unsigned i = 0; i < P; ++i) - { - dest_partitions[i] = first; - - // The last "advance" is very important, it may create space - // if it is an InsertIterator or something like that. - std::advance(first, std::distance( - source_partitions[i].first, - source_partitions[i].second) ); - } -} - -// Divide a given range into P partitions -template -void _partition_range(const Iterator first, const Iterator last, - std::vector< std::pair > &partitions, - const unsigned P) -{ - assert(partitions.size() == P); - - typedef std::pair Partition; - - const std::size_t N = std::distance(first, last); - - // All but last partition have same range - Iterator currentLast = first; - for (unsigned i = 0; i < P - 1; ++i) - { - const Iterator prev = currentLast; - currentLast = first; - std::advance(currentLast, (i+1)*N/P); - partitions[i] = Partition(prev, currentLast); - } - assert(std::distance(currentLast, last) >= 0); - - // Last range may be shorter - partitions[P - 1] = Partition(currentLast, last); -} - -// Given a range, re-arrange the items such that all elements smaller than -// the pivot precede all other values. Returns an Iterator to the first -// element not smaller than the pivot. 
-template -Iterator _stable_pivot_range(Iterator first, Iterator last, - const typename std::iterator_traits::value_type pivot, - StrictWeakOrdering comp = std::less< - typename std::iterator_traits::value_type>()) -{ - Iterator pivotIt = last; - while (first < last) - { - if (comp(*first, pivot)) - ++first; - else - { - Iterator high = first; - while ( (++high < last) && !comp(*high, pivot) ) - /* nop */; - if (high < last) - std::iter_swap(first, last); - first = pivotIt = ++high; - } - } - - return pivotIt; -} - -template -void _partition_range_stable_by_pivots(Iterator first, Iterator last, - const std::vector::value_type> &pivots, - std::vector< std::pair > &partitions, - std::less::value_type> comp, - const unsigned P) -{ - assert(partitions.size() == P); - assert(pivots.size() == P); - typedef std::pair Partition; - - Iterator start = first; - for (unsigned i = 0; i < P - 1; ++i) - { - Iterator low = start; - - while (low < last) - { - // Find a value not lower than the pivot. - while( (*low < pivots[i]) && (low < last) ) - std::advance(low, 1); - - // Entire range scanned ? - if (low == last) break; - - // Find a value lower than the pivot, starting from - // low, working our way up. - Iterator high = low; - std::advance(high, 1); - while( !(*high < pivots[i]) && (high < last) ) - std::advance(high, 1); - - // Entire range scanned ? - if (high == last) break; - - // Swap values - assert( !(*low -void _find_pivots(RandomAccessIterator first, RandomAccessIterator last, - std::vector::value_type> &pivots, - StrictWeakOrdering comp, const unsigned P) -{ - const std::size_t N = std::distance(first, last); - - assert(N > P); - - pivots.clear(); - pivots.reserve(P - 1); - - typedef typename - std::iterator_traits::value_type value_type; - - /* - * The sample ratio of 3 is used to sample more data. This way, the pivots can be - * chosen more wisely, which is our only guarantee we can generate partitions - * of equal size. - */ - const std::size_t NSAMPLES = std::min( 3u*std::size_t(P), N); - std::vector samples; - samples.reserve(NSAMPLES); - - for (std::size_t i = 0; i < NSAMPLES; ++i) - { - const std::size_t index = i * (N-1) / (NSAMPLES - 1); - assert(index < N); - samples.push_back(*(first + index)); -// std::cout << "index: " << index << " sample: " << samples[i] << std::endl; - } - assert(samples.size() == NSAMPLES); - - // Sort samples to create relative ordering in data - std::sort(samples.begin(), samples.end(), comp ); - - // Take pivots from sampled data - for (std::size_t i = 0; i < P-1; ++i) - { - pivots.push_back(samples[std::min(1+3*i, N-1)]); -/*std::cout << "pivot: " << i << " idx: " << (i * samples.size() / P) - << " " << pivots[i-1] << std::endl;*/ - } - assert(pivots.size() == P - 1); -} - -} // namespace detail - -} // namespace omptl - -#endif /* OMPTL_TOOLS_H */ diff --git a/lib/tantan/CMakeLists.txt b/lib/tantan/CMakeLists.txt new file mode 100644 index 000000000..27b1bb1d3 --- /dev/null +++ b/lib/tantan/CMakeLists.txt @@ -0,0 +1,3 @@ +add_library(tantan tantan.cpp tantan.h mcf_simd.h) +set_target_properties(tantan PROPERTIES COMPILE_FLAGS "${MMSEQS_CXX_FLAGS}" LINK_FLAGS "${MMSEQS_CXX_FLAGS}") + diff --git a/lib/tantan/mcf_simd.h b/lib/tantan/mcf_simd.h new file mode 100644 index 000000000..c5dd4687a --- /dev/null +++ b/lib/tantan/mcf_simd.h @@ -0,0 +1,538 @@ +// Author: Martin C. 
Frith 2019 +// SPDX-License-Identifier: MPL-2.0 + +#ifndef MCF_SIMD_HH +#define MCF_SIMD_HH + +#if defined __SSE4_1__ +#include +#elif defined __ARM_NEON +#include +#endif + +#include // size_t + +namespace mcf { + +#if defined __AVX2__ + +typedef __m256i SimdInt; +typedef __m256i SimdUint1; +typedef __m256d SimdDbl; + +const int simdBytes = 32; + +static inline SimdInt simdZero() { + return _mm256_setzero_si256(); +} + +static inline SimdInt simdZero1() { + return _mm256_setzero_si256(); +} + +static inline SimdDbl simdZeroDbl() { + return _mm256_setzero_pd(); +} + +static inline SimdInt simdOnes1() { + return _mm256_set1_epi32(-1); +} + +static inline SimdInt simdLoad(const void *p) { + return _mm256_loadu_si256((const SimdInt *)p); +} + +static inline SimdInt simdLoad1(const void *p) { + return _mm256_loadu_si256((const SimdInt *)p); +} + +static inline SimdDbl simdLoadDbl(const double *p) { + return _mm256_loadu_pd(p); +} + +static inline void simdStore(void *p, SimdInt x) { + _mm256_storeu_si256((SimdInt *)p, x); +} + +static inline void simdStore1(void *p, SimdInt x) { + _mm256_storeu_si256((SimdInt *)p, x); +} + +static inline void simdStoreDbl(double *p, SimdDbl x) { + _mm256_storeu_pd(p, x); +} + +static inline SimdInt simdOr1(SimdInt x, SimdInt y) { + return _mm256_or_si256(x, y); +} + +static inline SimdInt simdBlend(SimdInt x, SimdInt y, SimdInt mask) { + return _mm256_blendv_epi8(x, y, mask); +} + +const int simdLen = 8; +const int simdDblLen = 4; + +static inline SimdInt simdSet(int i7, int i6, int i5, int i4, + int i3, int i2, int i1, int i0) { + return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0); +} + +static inline SimdInt simdSet1(char jF, char jE, char jD, char jC, + char jB, char jA, char j9, char j8, + char j7, char j6, char j5, char j4, + char j3, char j2, char j1, char j0, + char iF, char iE, char iD, char iC, + char iB, char iA, char i9, char i8, + char i7, char i6, char i5, char i4, + char i3, char i2, char i1, char i0) { + return _mm256_set_epi8(jF, jE, jD, jC, jB, jA, j9, j8, + j7, j6, j5, j4, j3, j2, j1, j0, + iF, iE, iD, iC, iB, iA, i9, i8, + i7, i6, i5, i4, i3, i2, i1, i0); +} + +static inline SimdDbl simdSetDbl(double i3, double i2, double i1, double i0) { + return _mm256_set_pd(i3, i2, i1, i0); +} + +static inline SimdInt simdFill(int x) { + return _mm256_set1_epi32(x); +} + +static inline SimdInt simdFill1(char x) { + return _mm256_set1_epi8(x); +} + +static inline SimdDbl simdFillDbl(double x) { + return _mm256_set1_pd(x); +} + +static inline SimdInt simdGt(SimdInt x, SimdInt y) { + return _mm256_cmpgt_epi32(x, y); +} + +static inline SimdInt simdGe1(SimdInt x, SimdInt y) { + return _mm256_cmpeq_epi8(_mm256_min_epu8(x, y), y); +} + +static inline SimdInt simdAdd(SimdInt x, SimdInt y) { + return _mm256_add_epi32(x, y); +} + +static inline SimdInt simdAdd1(SimdInt x, SimdInt y) { + return _mm256_add_epi8(x, y); +} + +static inline SimdInt simdAdds1(SimdInt x, SimdInt y) { + return _mm256_adds_epu8(x, y); +} + +static inline SimdDbl simdAddDbl(SimdDbl x, SimdDbl y) { + return _mm256_add_pd(x, y); +} + +static inline SimdInt simdSub(SimdInt x, SimdInt y) { + return _mm256_sub_epi32(x, y); +} + +static inline SimdInt simdSub1(SimdInt x, SimdInt y) { + return _mm256_sub_epi8(x, y); +} + +static inline SimdDbl simdMulDbl(SimdDbl x, SimdDbl y) { + return _mm256_mul_pd(x, y); +} + +static inline SimdInt simdQuadruple1(SimdInt x) { + return _mm256_slli_epi32(x, 2); +} + +static inline SimdInt simdMax(SimdInt x, SimdInt y) { + return _mm256_max_epi32(x, y); +} + 
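// --- Illustrative sketch (editorial, not part of this header or patch) -----
// The wrappers above, together with their SSE4.1/NEON/scalar counterparts
// further down, let one loop body compile against whichever instruction set
// is selected at build time. A minimal usage sketch of that pattern follows;
// the function name addInts is an assumption for illustration only and does
// not exist in mcf_simd.h or MMseqs2.

#include "mcf_simd.h"
#include <stddef.h>

static void addInts(const int *a, const int *b, int *out, size_t n) {
  size_t i = 0;
  for (; i + mcf::simdLen <= n; i += mcf::simdLen) {
    mcf::SimdInt x = mcf::simdLoad(a + i);            // unaligned vector load
    mcf::SimdInt y = mcf::simdLoad(b + i);
    mcf::simdStore(out + i, mcf::simdAdd(x, y));      // vector add, then store
  }
  for (; i < n; ++i)                                  // scalar tail
    out[i] = a[i] + b[i];
}
// ----------------------------------------------------------------------------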
+static inline SimdInt simdMin1(SimdInt x, SimdInt y) { + return _mm256_min_epu8(x, y); +} + +static inline int simdHorizontalMax(SimdInt x) { + __m128i z = _mm256_castsi256_si128(x); + z = _mm_max_epi32(z, _mm256_extracti128_si256(x, 1)); + z = _mm_max_epi32(z, _mm_shuffle_epi32(z, 0x4E)); + z = _mm_max_epi32(z, _mm_shuffle_epi32(z, 0xB1)); + return _mm_cvtsi128_si32(z); +} + +static inline int simdHorizontalMin1(SimdInt x) { + __m128i z = _mm256_castsi256_si128(x); + z = _mm_min_epu8(z, _mm256_extracti128_si256(x, 1)); + z = _mm_min_epu8(z, _mm_srli_epi16(z, 8)); + z = _mm_minpos_epu16(z); + return _mm_extract_epi16(z, 0); +} + +static inline double simdHorizontalAddDbl(SimdDbl x) { + __m128d z = _mm256_castpd256_pd128(x); + z = _mm_add_pd(z, _mm256_extractf128_pd(x, 1)); + return _mm_cvtsd_f64(_mm_hadd_pd(z, z)); +} + +static inline SimdInt simdChoose1(SimdInt items, SimdInt choices) { + return _mm256_shuffle_epi8(items, choices); +} + +#elif defined __SSE4_1__ + +typedef __m128i SimdInt; +typedef __m128i SimdUint1; +typedef __m128d SimdDbl; + +const int simdBytes = 16; + +static inline SimdInt simdZero() { + return _mm_setzero_si128(); +} + +static inline SimdInt simdZero1() { + return _mm_setzero_si128(); +} + +static inline SimdDbl simdZeroDbl() { + return _mm_setzero_pd(); +} + +static inline SimdInt simdOnes1() { + return _mm_set1_epi32(-1); +} + +static inline SimdInt simdLoad(const void *p) { + return _mm_loadu_si128((const SimdInt *)p); +} + +static inline SimdInt simdLoad1(const void *p) { + return _mm_loadu_si128((const SimdInt *)p); +} + +static inline SimdDbl simdLoadDbl(const double *p) { + return _mm_loadu_pd(p); +} + +static inline void simdStore(void *p, SimdInt x) { + _mm_storeu_si128((SimdInt *)p, x); +} + +static inline void simdStore1(void *p, SimdInt x) { + _mm_storeu_si128((SimdInt *)p, x); +} + +static inline void simdStoreDbl(double *p, SimdDbl x) { + _mm_storeu_pd(p, x); +} + +static inline SimdInt simdOr1(SimdInt x, SimdInt y) { + return _mm_or_si128(x, y); +} + +static inline SimdInt simdBlend(SimdInt x, SimdInt y, SimdInt mask) { + return _mm_blendv_epi8(x, y, mask); // SSE4.1 +} + +const int simdLen = 4; +const int simdDblLen = 2; + +static inline SimdInt simdSet(int i3, int i2, int i1, int i0) { + return _mm_set_epi32(i3, i2, i1, i0); +} + +static inline SimdInt simdSet1(char iF, char iE, char iD, char iC, + char iB, char iA, char i9, char i8, + char i7, char i6, char i5, char i4, + char i3, char i2, char i1, char i0) { + return _mm_set_epi8(iF, iE, iD, iC, iB, iA, i9, i8, + i7, i6, i5, i4, i3, i2, i1, i0); +} + +static inline SimdDbl simdSetDbl(double i1, double i0) { + return _mm_set_pd(i1, i0); +} + +static inline SimdInt simdFill(int x) { + return _mm_set1_epi32(x); +} + +static inline SimdInt simdFill1(char x) { + return _mm_set1_epi8(x); +} + +static inline SimdDbl simdFillDbl(double x) { + return _mm_set1_pd(x); +} + +static inline SimdInt simdGt(SimdInt x, SimdInt y) { + return _mm_cmpgt_epi32(x, y); +} + +static inline SimdInt simdGe1(SimdInt x, SimdInt y) { + return _mm_cmpeq_epi8(_mm_min_epu8(x, y), y); +} + +static inline SimdInt simdAdd(SimdInt x, SimdInt y) { + return _mm_add_epi32(x, y); +} + +static inline SimdInt simdAdd1(SimdInt x, SimdInt y) { + return _mm_add_epi8(x, y); +} + +static inline SimdInt simdAdds1(SimdInt x, SimdInt y) { + return _mm_adds_epu8(x, y); +} + +static inline SimdDbl simdAddDbl(SimdDbl x, SimdDbl y) { + return _mm_add_pd(x, y); +} + +static inline SimdInt simdSub(SimdInt x, SimdInt y) { + return _mm_sub_epi32(x, y); 
+} + +static inline SimdInt simdSub1(SimdInt x, SimdInt y) { + return _mm_sub_epi8(x, y); +} + +static inline SimdDbl simdMulDbl(SimdDbl x, SimdDbl y) { + return _mm_mul_pd(x, y); +} + +static inline SimdInt simdQuadruple1(SimdInt x) { + return _mm_slli_epi32(x, 2); +} + +static inline SimdInt simdMax(SimdInt x, SimdInt y) { + return _mm_max_epi32(x, y); // SSE4.1 +} + +static inline SimdInt simdMin1(SimdInt x, SimdInt y) { + return _mm_min_epu8(x, y); +} + +static inline int simdHorizontalMax(SimdInt x) { + x = simdMax(x, _mm_shuffle_epi32(x, 0x4E)); + x = simdMax(x, _mm_shuffle_epi32(x, 0xB1)); + return _mm_cvtsi128_si32(x); +} + +static inline int simdHorizontalMin1(SimdInt x) { + x = _mm_min_epu8(x, _mm_srli_epi16(x, 8)); + x = _mm_minpos_epu16(x); // SSE4.1 + return _mm_extract_epi16(x, 0); +} + +static inline double simdHorizontalAddDbl(SimdDbl x) { + return _mm_cvtsd_f64(_mm_hadd_pd(x, x)); +} + +static inline SimdInt simdChoose1(SimdInt items, SimdInt choices) { + return _mm_shuffle_epi8(items, choices); // SSSE3 +} + +#elif defined __ARM_NEON + +typedef int32x4_t SimdInt; +typedef uint32x4_t SimdUint; +typedef uint8x16_t SimdUint1; +typedef float64x2_t SimdDbl; + +const int simdBytes = 16; + +static inline SimdInt simdZero() { + return vdupq_n_s32(0); +} + +static inline SimdUint1 simdZero1() { + return vdupq_n_u8(0); +} + +static inline SimdDbl simdZeroDbl() { + return vdupq_n_f64(0); +} + +static inline SimdUint1 simdOnes1() { + return vdupq_n_u8(-1); +} + +static inline SimdInt simdLoad(const int *p) { + return vld1q_s32(p); +} + +static inline SimdUint1 simdLoad1(const unsigned char *p) { + return vld1q_u8(p); +} + +static inline SimdDbl simdLoadDbl(const double *p) { + return vld1q_f64(p); +} + +static inline void simdStore(int *p, SimdInt x) { + vst1q_s32(p, x); +} + +static inline void simdStore1(unsigned char *p, SimdUint1 x) { + vst1q_u8(p, x); +} + +static inline void simdStoreDbl(double *p, SimdDbl x) { + vst1q_f64(p, x); +} + +static inline SimdUint1 simdOr1(SimdUint1 x, SimdUint1 y) { + return vorrq_u8(x, y); +} + +static inline SimdInt simdBlend(SimdInt x, SimdInt y, SimdUint mask) { + return vbslq_s32(mask, y, x); +} + +const int simdLen = 4; +const int simdDblLen = 2; + +static inline SimdInt simdSet(unsigned i3, unsigned i2, + unsigned i1, unsigned i0) { + size_t lo = i1; + size_t hi = i3; + return + vcombine_s32(vcreate_s32((lo << 32) | i0), vcreate_s32((hi << 32) | i2)); +} + +static inline SimdUint1 simdSet1(unsigned char iF, unsigned char iE, + unsigned char iD, unsigned char iC, + unsigned char iB, unsigned char iA, + unsigned char i9, unsigned char i8, + unsigned char i7, unsigned char i6, + unsigned char i5, unsigned char i4, + unsigned char i3, unsigned char i2, + unsigned char i1, unsigned char i0) { + size_t lo = + (size_t)i0 | (size_t)i1 << 8 | (size_t)i2 << 16 | (size_t)i3 << 24 | + (size_t)i4 << 32 | (size_t)i5 << 40 | (size_t)i6 << 48 | (size_t)i7 << 56; + + size_t hi = + (size_t)i8 | (size_t)i9 << 8 | (size_t)iA << 16 | (size_t)iB << 24 | + (size_t)iC << 32 | (size_t)iD << 40 | (size_t)iE << 48 | (size_t)iF << 56; + + return vcombine_u8(vcreate_u8(lo), vcreate_u8(hi)); +} + +static inline SimdDbl simdSetDbl(double i1, double i0) { + return vcombine_f64(vdup_n_f64(i0), vdup_n_f64(i1)); +} + +static inline SimdInt simdFill(int x) { + return vdupq_n_s32(x); +} + +static inline SimdUint1 simdFill1(unsigned char x) { + return vdupq_n_u8(x); +} + +static inline SimdDbl simdFillDbl(double x) { + return vdupq_n_f64(x); +} + +static inline SimdUint 
simdGt(SimdInt x, SimdInt y) {
+  return vcgtq_s32(x, y);
+}
+
+static inline SimdUint1 simdGe1(SimdUint1 x, SimdUint1 y) {
+  return vcgeq_u8(x, y);
+}
+
+static inline SimdInt simdAdd(SimdInt x, SimdInt y) {
+  return vaddq_s32(x, y);
+}
+
+static inline SimdUint1 simdAdd1(SimdUint1 x, SimdUint1 y) {
+  return vaddq_u8(x, y);
+}
+
+static inline SimdUint1 simdAdds1(SimdUint1 x, SimdUint1 y) {
+  return vqaddq_u8(x, y);
+}
+
+static inline SimdDbl simdAddDbl(SimdDbl x, SimdDbl y) {
+  return vaddq_f64(x, y);
+}
+
+static inline SimdInt simdSub(SimdInt x, SimdInt y) {
+  return vsubq_s32(x, y);
+}
+
+static inline SimdUint1 simdSub1(SimdUint1 x, SimdUint1 y) {
+  return vsubq_u8(x, y);
+}
+
+static inline SimdDbl simdMulDbl(SimdDbl x, SimdDbl y) {
+  return vmulq_f64(x, y);
+}
+
+static inline SimdUint1 simdQuadruple1(SimdUint1 x) {
+  return vshlq_n_u8(x, 2);
+}
+
+static inline SimdInt simdMax(SimdInt x, SimdInt y) {
+  return vmaxq_s32(x, y);
+}
+
+static inline SimdUint1 simdMin1(SimdUint1 x, SimdUint1 y) {
+  return vminq_u8(x, y);
+}
+
+static inline int simdHorizontalMax(SimdInt x) {
+  return vmaxvq_s32(x);
+}
+
+static inline int simdHorizontalMin1(SimdUint1 x) {
+  return vminvq_u8(x);
+}
+
+static inline double simdHorizontalAddDbl(SimdDbl x) {
+  return vaddvq_f64(x);
+}
+
+static inline SimdUint1 simdChoose1(SimdUint1 items, SimdUint1 choices) {
+  return vqtbl1q_u8(items, choices);
+}
+
+#else
+
+typedef int SimdInt;
+typedef double SimdDbl;
+const int simdBytes = 1;
+const int simdLen = 1;
+const int simdDblLen = 1;
+static inline int simdZero() { return 0; }
+static inline double simdZeroDbl() { return 0; }
+static inline int simdSet(int x) { return x; }
+static inline double simdSetDbl(double x) { return x; }
+static inline int simdFill(int x) { return x; }
+static inline int simdLoad(const int *p) { return *p; }
+static inline double simdLoadDbl(const double *p) { return *p; }
+static inline void simdStore(int *p, int x) { *p = x; }
+static inline void simdStoreDbl(double *p, double x) { *p = x; }
+static inline double simdFillDbl(double x) { return x; }
+static inline int simdGt(int x, int y) { return x > y; }
+static inline int simdAdd(int x, int y) { return x + y; }
+static inline double simdAddDbl(double x, double y) { return x + y; }
+static inline int simdSub(int x, int y) { return x - y; }
+static inline double simdMulDbl(double x, double y) { return x * y; }
+static inline int simdMax(int x, int y) { return x > y ? x : y; }
+static inline int simdBlend(int x, int y, int mask) { return mask ? y : x; }
+static inline int simdHorizontalMax(int a) { return a; }
+static inline double simdHorizontalAddDbl(double x) { return x; }
+
+#endif
+
+}
+
+#endif
diff --git a/lib/tantan/tantan.cpp b/lib/tantan/tantan.cpp
new file mode 100644
index 000000000..0863b75a4
--- /dev/null
+++ b/lib/tantan/tantan.cpp
@@ -0,0 +1,553 @@
+// Author: Martin C. Frith 2010
+// SPDX-License-Identifier: MPL-2.0
+
+#include "tantan.h"
+#include "mcf_simd.h"
+
+#include <algorithm>  // fill, max
+#include <cassert>
+#include <cmath>  // pow, abs
+#include <iostream>  // cerr
+#include <numeric>  // accumulate
+#include <vector>
+
+#define BEG(v) ((v).empty() ? 0 : &(v).front())
+#define END(v) ((v).empty() ?
0 : &(v).back() + 1) + +namespace tantan { + +using namespace mcf; + +void multiplyAll(std::vector &v, double factor) { + for (std::vector::iterator i = v.begin(); i < v.end(); ++i) + *i *= factor; +} + +double firstRepeatOffsetProb(double probMult, int maxRepeatOffset) { + if (probMult < 1 || probMult > 1) { + return (1 - probMult) / (1 - std::pow(probMult, maxRepeatOffset)); + } + return 1.0 / maxRepeatOffset; +} + +void checkForwardAndBackwardTotals(double fTot, double bTot) { + double x = std::abs(fTot); + double y = std::abs(bTot); + + // ??? Is 1e6 suitable here ??? + if (std::abs(fTot - bTot) > std::max(x, y) / 1e6) + std::cerr << "tantan: warning: possible numeric inaccuracy\n" + << "tantan: forward algorithm total: " << fTot << "\n" + << "tantan: backward algorithm total: " << bTot << "\n"; +} + +struct Tantan { + enum { scaleStepSize = 16 }; + + const uchar *seqBeg; // start of the sequence + const uchar *seqEnd; // end of the sequence + const uchar *seqPtr; // current position in the sequence + + int maxRepeatOffset; + + const const_double_ptr *likelihoodRatioMatrix; + + double b2b; // transition probability from background to background + double f2b; // transition probability from foreground to background + double g2g; // transition probability from gap/indel to gap/indel + //double f2g; // transition probability from foreground to gap/indel + //double g2f; // transition probability from gap/indel to foreground + double oneGapProb; // f2g * g2f + double endGapProb; // f2g * 1 + double f2f0; // foreground to foreground, if there are 0 indel transitions + double f2f1; // foreground to foreground, if there is 1 indel transition + double f2f2; // foreground to foreground, if there are 2 indel transitions + double b2fDecay; + double b2fGrowth; + double b2fFirst; // background state to first foreground state + double b2fLast; // background state to last foreground state + + double backgroundProb; + std::vector b2fProbs; // background state to each foreground state + std::vector foregroundProbs; + std::vector insertionProbs; + + std::vector scaleFactors; + + Tantan(const uchar *seqBeg, + const uchar *seqEnd, + int maxRepeatOffset, + const const_double_ptr *likelihoodRatioMatrix, + double repeatProb, + double repeatEndProb, + double repeatOffsetProbDecay, + double firstGapProb, + double otherGapProb) { + assert(maxRepeatOffset > 0); + assert(repeatProb >= 0 && repeatProb < 1); + // (if repeatProb==1, then any sequence is impossible) + assert(repeatEndProb >= 0 && repeatEndProb <= 1); + assert(repeatOffsetProbDecay > 0 && repeatOffsetProbDecay <= 1); + assert(otherGapProb >= 0 && otherGapProb <= 1); + assert(firstGapProb >= 0); + assert(repeatEndProb + firstGapProb * 2 <= 1); + + this->seqBeg = seqBeg; + this->seqEnd = seqEnd; + this->seqPtr = seqBeg; + this->maxRepeatOffset = maxRepeatOffset; + this->likelihoodRatioMatrix = likelihoodRatioMatrix; + + b2b = 1 - repeatProb; + f2b = repeatEndProb; + g2g = otherGapProb; + //f2g = firstGapProb; + //g2f = 1 - otherGapProb; + oneGapProb = firstGapProb * (1 - otherGapProb); + endGapProb = firstGapProb * (maxRepeatOffset > 1); + f2f0 = 1 - repeatEndProb; + f2f1 = 1 - repeatEndProb - firstGapProb; + f2f2 = 1 - repeatEndProb - firstGapProb * 2; + + b2fDecay = repeatOffsetProbDecay; + b2fGrowth = 1 / repeatOffsetProbDecay; + + b2fFirst = repeatProb * firstRepeatOffsetProb(b2fDecay, maxRepeatOffset); + b2fLast = repeatProb * firstRepeatOffsetProb(b2fGrowth, maxRepeatOffset); + + b2fProbs.resize(maxRepeatOffset); + 
foregroundProbs.resize(maxRepeatOffset); + insertionProbs.resize(maxRepeatOffset - 1); + + double p = b2fFirst; + for (int i = 0; i < maxRepeatOffset; ++i) { + b2fProbs[i] = p; + p *= b2fDecay; + } + + scaleFactors.resize((seqEnd - seqBeg) / scaleStepSize); + } + + void initializeForwardAlgorithm() { + backgroundProb = 1.0; + std::fill(foregroundProbs.begin(), foregroundProbs.end(), 0.0); + std::fill(insertionProbs.begin(), insertionProbs.end(), 0.0); + } + + double forwardTotal() { + double fromForeground = std::accumulate(foregroundProbs.begin(), + foregroundProbs.end(), 0.0); + double total = backgroundProb * b2b + fromForeground * f2b; + assert(total > 0); + return total; + } + + void initializeBackwardAlgorithm() { + backgroundProb = b2b; + std::fill(foregroundProbs.begin(), foregroundProbs.end(), f2b); + std::fill(insertionProbs.begin(), insertionProbs.end(), 0.0); + } + + double backwardTotal() { + assert(backgroundProb > 0); + return backgroundProb; + } + + void calcForwardTransitionProbsWithGaps() { + double fromBackground = backgroundProb * b2fLast; + double *foregroundPtr = &foregroundProbs.back(); + double f = *foregroundPtr; + double fromForeground = f; + + double *insertionPtr = &insertionProbs.back(); + double i = *insertionPtr; + *foregroundPtr = fromBackground + f * f2f1 + i * endGapProb; + double d = f; + --foregroundPtr; + fromBackground *= b2fGrowth; + + while (foregroundPtr > &foregroundProbs.front()) { + f = *foregroundPtr; + fromForeground += f; + i = *(insertionPtr - 1); + *foregroundPtr = fromBackground + f * f2f2 + (i + d) * oneGapProb; + *insertionPtr = f + i * g2g; + d = f + d * g2g; + --foregroundPtr; + --insertionPtr; + fromBackground *= b2fGrowth; + } + + f = *foregroundPtr; + fromForeground += f; + *foregroundPtr = fromBackground + f * f2f1 + d * endGapProb; + *insertionPtr = f; + + backgroundProb = backgroundProb * b2b + fromForeground * f2b; + } + + void calcBackwardTransitionProbsWithGaps() { + double toBackground = f2b * backgroundProb; + double *foregroundPtr = &foregroundProbs.front(); + double f = *foregroundPtr; + double toForeground = f; + + double *insertionPtr = &insertionProbs.front(); + double i = *insertionPtr; + *foregroundPtr = toBackground + f2f1 * f + i; + double d = endGapProb * f; + ++foregroundPtr; + toForeground *= b2fGrowth; + + while (foregroundPtr < &foregroundProbs.back()) { + f = *foregroundPtr; + toForeground += f; + i = *(insertionPtr + 1); + *foregroundPtr = toBackground + f2f2 * f + (i + d); + double oneGapProb_f = oneGapProb * f; + *insertionPtr = oneGapProb_f + g2g * i; + d = oneGapProb_f + g2g * d; + ++foregroundPtr; + ++insertionPtr; + toForeground *= b2fGrowth; + } + + f = *foregroundPtr; + toForeground += f; + *foregroundPtr = toBackground + f2f1 * f + d; + *insertionPtr = endGapProb * f; + + backgroundProb = b2b * backgroundProb + b2fLast * toForeground; + } + + void calcForwardTransitionProbs() { + if (endGapProb > 0) return calcForwardTransitionProbsWithGaps(); + + double b = backgroundProb; + double fromForeground = 0; + double *foregroundBeg = BEG(foregroundProbs); + + for (int i = 0; i < maxRepeatOffset; ++i) { + double f = foregroundBeg[i]; + fromForeground += f; + foregroundBeg[i] = b * b2fProbs[i] + f * f2f0; + } + + backgroundProb = b * b2b + fromForeground * f2b; + } + + void calcBackwardTransitionProbs() { + if (endGapProb > 0) return calcBackwardTransitionProbsWithGaps(); + + double toBackground = f2b * backgroundProb; + double toForeground = 0; + double *foregroundBeg = BEG(foregroundProbs); + + for (int i = 
0; i < maxRepeatOffset; ++i) { + double f = foregroundBeg[i]; + toForeground += b2fProbs[i] * f; + foregroundBeg[i] = toBackground + f2f0 * f; + } + + backgroundProb = b2b * backgroundProb + toForeground; + } + + void addEndCounts(double forwardProb, + double totalProb, + double *transitionCounts) { + double toEnd = forwardProb * b2b / totalProb; + transitionCounts[0] += toEnd; + } + + void addTransitionCounts(double forwardProb, + double totalProb, + double *transitionCounts) { + double toBg = forwardProb * b2b / totalProb; + double toFg = forwardProb * b2fFirst / totalProb; + + transitionCounts[0] += backgroundProb * toBg; + + for (double *i = BEG(foregroundProbs); i < END(foregroundProbs); ++i) { + ++transitionCounts; + *transitionCounts += *i * toFg; + toFg *= b2fDecay; + } + } + + bool isNearSeqBeg() { + return seqPtr - seqBeg < maxRepeatOffset; + } + + int maxOffsetInTheSequence() { + return isNearSeqBeg() ? (seqPtr - seqBeg) : maxRepeatOffset; + } + + const uchar *seqFurthestBack() { + return isNearSeqBeg() ? seqBeg : seqPtr - maxRepeatOffset; + } + + void calcEmissionProbs() { + const double *lrRow = likelihoodRatioMatrix[*seqPtr]; + const uchar *seqStop = seqFurthestBack(); + double *foregroundPtr = BEG(foregroundProbs); + const uchar *offsetPtr = seqPtr; + + while (offsetPtr > seqStop) { + --offsetPtr; + *foregroundPtr *= lrRow[*offsetPtr]; + ++foregroundPtr; + } + + while (foregroundPtr < END(foregroundProbs)) { + *foregroundPtr *= 0; + ++foregroundPtr; + } + } + + void calcForwardTransitionAndEmissionProbs() { + if (endGapProb > 0) { + calcForwardTransitionProbsWithGaps(); + calcEmissionProbs(); + return; + } + + double b = backgroundProb; + const double *b2f = BEG(b2fProbs); + double *fp = BEG(foregroundProbs); + const double *lrRow = likelihoodRatioMatrix[*seqPtr]; + int maxOffset = maxOffsetInTheSequence(); + const uchar *sp = seqPtr; + + SimdDbl bV = simdFillDbl(b); + SimdDbl tV = simdFillDbl(f2f0); + SimdDbl sV = simdZeroDbl(); + + int i = 0; + for (; i <= maxOffset - simdDblLen; i += simdDblLen) { + SimdDbl rV = simdSetDbl( +#if defined __SSE4_1__ || defined __ARM_NEON +#ifdef __AVX2__ + lrRow[sp[-i-4]], + lrRow[sp[-i-3]], +#endif + lrRow[sp[-i-2]], +#endif + lrRow[sp[-i-1]]); + SimdDbl fV = simdLoadDbl(fp+i); + sV = simdAddDbl(sV, fV); + SimdDbl xV = simdMulDbl(bV, simdLoadDbl(b2f+i)); + simdStoreDbl(fp+i, simdMulDbl(simdAddDbl(xV, simdMulDbl(fV, tV)), rV)); + } + double fromForeground = simdHorizontalAddDbl(sV); + for (; i < maxOffset; ++i) { + double f = fp[i]; + fromForeground += f; + fp[i] = (b * b2f[i] + f * f2f0) * lrRow[sp[-i-1]]; + } + + backgroundProb = b * b2b + fromForeground * f2b; + } + + void calcEmissionAndBackwardTransitionProbs() { + if (endGapProb > 0) { + calcEmissionProbs(); + calcBackwardTransitionProbsWithGaps(); + return; + } + + double toBackground = f2b * backgroundProb; + const double *b2f = BEG(b2fProbs); + double *fp = BEG(foregroundProbs); + const double *lrRow = likelihoodRatioMatrix[*seqPtr]; + int maxOffset = maxOffsetInTheSequence(); + const uchar *sp = seqPtr; + + SimdDbl bV = simdFillDbl(toBackground); + SimdDbl tV = simdFillDbl(f2f0); + SimdDbl sV = simdZeroDbl(); + + int i = 0; + for (; i <= maxOffset - simdDblLen; i += simdDblLen) { + SimdDbl rV = simdSetDbl( +#if defined __SSE4_1__ || defined __ARM_NEON +#ifdef __AVX2__ + lrRow[sp[-i-4]], + lrRow[sp[-i-3]], +#endif + lrRow[sp[-i-2]], +#endif + lrRow[sp[-i-1]]); + SimdDbl fV = simdMulDbl(simdLoadDbl(fp+i), rV); + sV = simdAddDbl(sV, simdMulDbl(simdLoadDbl(b2f+i), fV)); + 
simdStoreDbl(fp+i, simdAddDbl(bV, simdMulDbl(tV, fV))); + } + double toForeground = simdHorizontalAddDbl(sV); + for (; i < maxOffset; ++i) { + double f = fp[i] * lrRow[sp[-i-1]]; + toForeground += b2f[i] * f; + fp[i] = toBackground + f2f0 * f; + } + + backgroundProb = b2b * backgroundProb + toForeground; + } + + void rescale(double scale) { + backgroundProb *= scale; + multiplyAll(foregroundProbs, scale); + multiplyAll(insertionProbs, scale); + } + + void rescaleForward() { + if ((seqPtr - seqBeg) % scaleStepSize == scaleStepSize - 1) { + assert(backgroundProb > 0); + double scale = 1 / backgroundProb; + scaleFactors[(seqPtr - seqBeg) / scaleStepSize] = scale; + rescale(scale); + } + } + + void rescaleBackward() { + if ((seqPtr - seqBeg) % scaleStepSize == scaleStepSize - 1) { + double scale = scaleFactors[(seqPtr - seqBeg) / scaleStepSize]; + rescale(scale); + } + } + + void calcRepeatProbs(float *letterProbs) { + initializeForwardAlgorithm(); + + while (seqPtr < seqEnd) { + calcForwardTransitionAndEmissionProbs(); + rescaleForward(); + *letterProbs = static_cast(backgroundProb); + ++letterProbs; + ++seqPtr; + } + + double z = forwardTotal(); + + initializeBackwardAlgorithm(); + + while (seqPtr > seqBeg) { + --seqPtr; + --letterProbs; + double nonRepeatProb = *letterProbs * backgroundProb / z; + // Convert nonRepeatProb to a float, so that it is more likely + // to be exactly 1 when it should be, e.g. for the 1st letter of + // a sequence: + *letterProbs = 1 - static_cast(nonRepeatProb); + rescaleBackward(); + calcEmissionAndBackwardTransitionProbs(); + } + + double z2 = backwardTotal(); + checkForwardAndBackwardTotals(z, z2); + } + + void countTransitions(double *transitionCounts) { + std::vector p(seqEnd - seqBeg); + float *letterProbs = BEG(p); + + initializeForwardAlgorithm(); + + while (seqPtr < seqEnd) { + *letterProbs = static_cast(backgroundProb); + calcForwardTransitionProbs(); + calcEmissionProbs(); + rescaleForward(); + ++letterProbs; + ++seqPtr; + } + + double z = forwardTotal(); + + addEndCounts(backgroundProb, z, transitionCounts); + + initializeBackwardAlgorithm(); + + while (seqPtr > seqBeg) { + --seqPtr; + --letterProbs; + rescaleBackward(); + calcEmissionProbs(); + addTransitionCounts(*letterProbs, z, transitionCounts); + calcBackwardTransitionProbs(); + } + + double z2 = backwardTotal(); + checkForwardAndBackwardTotals(z, z2); + } +}; + +int maskSequences(uchar *seqBeg, + uchar *seqEnd, + int maxRepeatOffset, + const const_double_ptr *likelihoodRatioMatrix, + double repeatProb, + double repeatEndProb, + double repeatOffsetProbDecay, + double firstGapProb, + double otherGapProb, + double minMaskProb, + const uchar *maskTable) { + std::vector p(seqEnd - seqBeg); + float *probabilities = BEG(p); + + getProbabilities(seqBeg, seqEnd, maxRepeatOffset, + likelihoodRatioMatrix, repeatProb, repeatEndProb, + repeatOffsetProbDecay, firstGapProb, otherGapProb, + probabilities); + + return maskProbableLetters(seqBeg, seqEnd, probabilities, minMaskProb, maskTable); +} + +void getProbabilities(const uchar *seqBeg, + const uchar *seqEnd, + int maxRepeatOffset, + const const_double_ptr *likelihoodRatioMatrix, + double repeatProb, + double repeatEndProb, + double repeatOffsetProbDecay, + double firstGapProb, + double otherGapProb, + float *probabilities) { + Tantan tantan(seqBeg, seqEnd, maxRepeatOffset, likelihoodRatioMatrix, + repeatProb, repeatEndProb, repeatOffsetProbDecay, + firstGapProb, otherGapProb); + tantan.calcRepeatProbs(probabilities); +} + +int maskProbableLetters(uchar 
*seqBeg, + uchar *seqEnd, + const float *probabilities, + double minMaskProb, + const uchar *maskTable) { + int masked = 0; + while (seqBeg < seqEnd) { + if (*probabilities >= minMaskProb) { + *seqBeg = maskTable[*seqBeg]; + masked++; + } + ++probabilities; + ++seqBeg; + } + return masked; +} + +void countTransitions(const uchar *seqBeg, + const uchar *seqEnd, + int maxRepeatOffset, + const const_double_ptr *likelihoodRatioMatrix, + double repeatProb, + double repeatEndProb, + double repeatOffsetProbDecay, + double firstGapProb, + double otherGapProb, + double *transitionCounts) { + Tantan tantan(seqBeg, seqEnd, maxRepeatOffset, likelihoodRatioMatrix, + repeatProb, repeatEndProb, repeatOffsetProbDecay, + firstGapProb, otherGapProb); + tantan.countTransitions(transitionCounts); +} + +} diff --git a/src/commons/tantan.h b/lib/tantan/tantan.h similarity index 68% rename from src/commons/tantan.h rename to lib/tantan/tantan.h index 88af9d7ef..d4c02d276 100644 --- a/src/commons/tantan.h +++ b/lib/tantan/tantan.h @@ -1,16 +1,6 @@ -// Copyright 2010 Martin C. Frith -// tantan is distributed under the GNU General Public License, either -// version 3 of the License, or (at your option) any later version. For -// details, see COPYING.txt. -// -// If you use tantan in your research, please cite: -// "A new repeat-masking method enables specific detection of homologous -// sequences", MC Frith, Nucleic Acids Research 2011 39(4):e23. -// -// tantan's website is: http://www.cbrc.jp/tantan/ -// -// If you have any questions, comments, or problems concerning tantan, -// please email: tantan (ATmark) cbrc (dot) jp. +// Author: Martin C. Frith 2010 +// SPDX-License-Identifier: MPL-2.0 + // These are routines for masking simple regions (low-complexity and // short-period tandem repeats) in biological sequences. To // understand them in detail, see the published article (in @@ -64,10 +54,11 @@ namespace tantan { +typedef unsigned char uchar; typedef const double *const_double_ptr; -int maskSequences(char *seqBeg, - char *seqEnd, +int maskSequences(uchar *seqBeg, + uchar *seqEnd, int maxRepeatOffset, const const_double_ptr *likelihoodRatioMatrix, double repeatProb, @@ -76,14 +67,14 @@ int maskSequences(char *seqBeg, double firstGapProb, double otherGapProb, double minMaskProb, - const char *maskTable); + const uchar *maskTable); // The following routine gets the posterior probability that each // letter is repetitive. It stores the results in "probabilities", // which must point to enough pre-allocated space to fit the results. -void getProbabilities(const char *seqBeg, - const char *seqEnd, +void getProbabilities(const uchar *seqBeg, + const uchar *seqEnd, int maxRepeatOffset, const const_double_ptr *likelihoodRatioMatrix, double repeatProb, @@ -96,11 +87,34 @@ void getProbabilities(const char *seqBeg, // The following routine masks each letter whose corresponding entry // in "probabilities" is >= minMaskProb. -int maskProbableLetters(char *seqBeg, - char *seqEnd, +int maskProbableLetters(uchar *seqBeg, + uchar *seqEnd, const float *probabilities, double minMaskProb, - const char *maskTable); + const uchar *maskTable); + +// The following routine counts the expected number of transitions +// from the background (non-repeat) state to other states. It adds +// the results to "transitionCounts", which must point to +// pre-initialized space for (maxRepeatOffset+1) items. The +// background->background transition count is stored in +// transitionCounts[0]. 
The background->(period-i repeat) transition +// count is stored in transitionCounts[i]. + +// (In this routine, the HMM begin and end states are counted as +// background states. Thus, begin->X is added to background->X, and +// X->end is added to X->background.) + +void countTransitions(const uchar *seqBeg, + const uchar *seqEnd, + int maxRepeatOffset, + const const_double_ptr *likelihoodRatioMatrix, + double repeatProb, + double repeatEndProb, + double repeatOffsetProbDecay, + double firstGapProb, + double otherGapProb, + double *transitionCounts); } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a84726da5..89bc7f31b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,6 +5,7 @@ set(HAVE_GPROF 0 CACHE BOOL "Have GPROF Profiler") set(ENABLE_WERROR 0 CACHE BOOL "Enable Warnings as Errors") #set(DISABLE_LTO 0 CACHE BOOL "Disable link-time optimization in non-debug builds") set(REQUIRE_OPENMP 1 CACHE BOOL "Require availability of OpenMP") +set(ATOMIC_LIB_OVERRIDE "" CACHE PATH "Override path to libatomic") include(AppendTargetProperty) @@ -21,7 +22,6 @@ add_subdirectory(workflow) add_library(mmseqs-framework $ $ - $ ${alignment_header_files} ${alignment_source_files} ${clustering_header_files} @@ -118,19 +118,27 @@ if (HAVE_POSIX_MADVISE) endif () if (NOT DISABLE_IPS4O) - find_package(Atomic) - if (ATOMIC_FOUND) - target_link_libraries(mmseqs-framework ${ATOMIC_LIBRARIES}) + if (ATOMIC_LIB_OVERRIDE) + add_library(LibAtomic STATIC IMPORTED) + set_target_properties(LibAtomic PROPERTIES IMPORTED_LOCATION ${ATOMIC_LIB_OVERRIDE}) + target_link_libraries(mmseqs-framework LibAtomic) target_compile_definitions(mmseqs-framework PUBLIC -DENABLE_IPS4O=1) - message("-- IPS4O sorting works") - else () - message("-- OMPTL sorting fallback") + message("-- IPS4O sorting forced with ${ATOMIC_LIB_OVERRIDE}") + else() + find_package(Atomic) + if (ATOMIC_FOUND) + target_link_libraries(mmseqs-framework ${ATOMIC_LIBRARIES}) + target_compile_definitions(mmseqs-framework PUBLIC -DENABLE_IPS4O=1) + message("-- IPS4O sorting works") + else () + message("-- OMPTL sorting fallback") + endif () endif () else () message("-- OMPTL sorting fallback") endif () -target_link_libraries(mmseqs-framework tinyexpr ${ZSTD_LIBRARIES} microtar) +target_link_libraries(mmseqs-framework tinyexpr ${ZSTD_LIBRARIES} microtar tantan) # if (CYGWIN) # target_link_libraries(mmseqs-framework nedmalloc) # endif () @@ -216,12 +224,17 @@ find_package(OpenMP QUIET) if (OPENMP_FOUND) message("-- Found OpenMP") target_compile_definitions(mmseqs-framework PUBLIC -DOPENMP=1) - # For GCC we dont want to do this since it breaks macOS static builds - # It will link libgomp.a internally (through -fopenmp I guess) - # and also link libgomp.dylib thus breaking static builds - if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - target_link_libraries(mmseqs-framework ${OpenMP_CXX_LIBRARIES}) - endif() + # hack: remove pthread from openmp deps, it gets linked correctly by cuda later + if (FORCE_STATIC_DEPS AND ENABLE_CUDA) + set(FILTERED_LIBRARIES "") + foreach (LIB ${OpenMP_CXX_LIBRARIES}) + if (NOT LIB MATCHES "pthread") + list(APPEND FILTERED_LIBRARIES ${LIB}) + endif () + endforeach () + set(OpenMP_CXX_LIBRARIES ${FILTERED_LIBRARIES}) + endif () + target_link_libraries(mmseqs-framework ${OpenMP_CXX_LIBRARIES}) target_include_directories(mmseqs-framework PUBLIC ${OpenMP_CXX_INCLUDE_DIRS}) append_target_property(mmseqs-framework COMPILE_FLAGS ${OpenMP_CXX_FLAGS}) append_target_property(mmseqs-framework LINK_FLAGS ${OpenMP_CXX_FLAGS}) 
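The tantan interface added above in lib/tantan/tantan.h operates on numerically encoded letters that index both likelihoodRatioMatrix and maskTable. A minimal, hedged sketch of the call shape follows; the 4-letter encoding, mask table and probability values are illustrative assumptions, not values taken from MMseqs2.

// Illustrative only: how lib/tantan/tantan.h (shown earlier) is called.
#include "tantan.h"
#include <vector>

int main() {
  // toy likelihood ratios: reward identical letters, penalize mismatches
  double rows[4][4];
  tantan::const_double_ptr matrix[4];
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j)
      rows[i][j] = (i == j) ? 3.0 : 0.5;
    matrix[i] = rows[i];
  }

  // arbitrary masking convention for this sketch: letter x becomes x + 4
  tantan::uchar maskTable[4] = {4, 5, 6, 7};

  // 0=A,1=C,2=G,3=T; the middle stretch is a short tandem repeat
  std::vector<tantan::uchar> seq = {0,1,2,3, 0,1,0,1,0,1,0,1,0,1, 3,2,1,0};

  int masked = tantan::maskSequences(seq.data(), seq.data() + seq.size(),
                                     /*maxRepeatOffset=*/10, matrix,
                                     /*repeatProb=*/0.005,
                                     /*repeatEndProb=*/0.05,
                                     /*repeatOffsetProbDecay=*/0.9,
                                     /*firstGapProb=*/0.02,
                                     /*otherGapProb=*/0.02,
                                     /*minMaskProb=*/0.5, maskTable);
  (void)masked;  // number of positions rewritten through maskTable
  return 0;
}

In MMseqs2 itself the likelihood-ratio matrix and mask table are derived from the loaded substitution matrix; the sketch only shows how the relocated API is invoked.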
@@ -240,6 +253,26 @@ if (HAVE_GPROF) endif () endif () +if (ENABLE_CUDA) + find_package(CUDAToolkit REQUIRED) + target_compile_definitions(mmseqs-framework PUBLIC -DHAVE_CUDA=1) + target_link_libraries(mmseqs-framework marv) + if (FORCE_STATIC_DEPS) + # link to rt explicitly so it doesn't get statically compiled and adds GLIBC_PRIVATE symbols + target_link_libraries(mmseqs-framework rt) + # remove librt.a which introduces GLIBC_PRIVATE symbols + get_property(linked_libraries TARGET CUDA::cudart_static_deps PROPERTY INTERFACE_LINK_LIBRARIES) + set(FILTERED_LIBRARIES "") + foreach (LIB ${linked_libraries}) + if (NOT LIB MATCHES ".*librt\\.a$") + list(APPEND FILTERED_LIBRARIES ${LIB}) + endif () + endforeach () + set_target_properties(CUDA::cudart_static_deps PROPERTIES INTERFACE_LINK_LIBRARIES "${FILTERED_LIBRARIES}") + endif () + target_link_libraries(mmseqs-framework CUDA::cudart_static) +endif () + if (NOT FRAMEWORK_ONLY) include(MMseqsSetupDerivedTarget) add_subdirectory(version) diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h index 3c4abb4e0..bc742741d 100644 --- a/src/CommandDeclarations.h +++ b/src/CommandDeclarations.h @@ -28,6 +28,7 @@ extern int createindex(int argc, const char **argv, const Command& command); extern int createlinindex(int argc, const char **argv, const Command& command); extern int createseqfiledb(int argc, const char **argv, const Command& command); extern int createsubdb(int argc, const char **argv, const Command& command); +extern int gpuserver(int argc, const char **argv, const Command& command); extern int view(int argc, const char **argv, const Command& command); extern int rmdb(int argc, const char **argv, const Command& command); extern int mvdb(int argc, const char **argv, const Command& command); diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 8325f0d4a..94a6e6b23 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -22,7 +22,7 @@ std::vector baseCommands = { "mmseqs easy-search examples/QUERY.fasta examples/DB.fasta result.m8 tmp --start-sens 2 -s 7 --sens-steps 3\n", "Milot Mirdita & Martin Steinegger ", " ... | | ", - CITATION_SERVER | CITATION_MMSEQS2,{{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin }, + CITATION_SERVER | CITATION_MMSEQS2 | CITATION_GPU ,{{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin }, {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile }, {"alignmentFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }, {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, @@ -130,12 +130,12 @@ std::vector baseCommands = { " ... 
| ", CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric }, {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}}, - {"makepaddedseqdb", makepaddedseqdb, &par.onlyverbosity, COMMAND_HIDDEN, + {"makepaddedseqdb", makepaddedseqdb, &par.makepaddedseqdb, COMMAND_HIDDEN, "Generate a padded sequence DB", "Generate a padded sequence DB", - "Martin Steinegger ", + "Milot Mirdita & Martin Steinegger ", " ", - CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb }, + CITATION_GPU, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb }, {"sequenceIndexDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}}, {"appenddbtoindex", appenddbtoindex, &par.appenddbtoindex, COMMAND_HIDDEN, NULL, @@ -219,7 +219,7 @@ std::vector baseCommands = { "mmseqs search queryDB targetDB resultDB --start-sens 2 -s 7 --sens-steps 3\n", "Martin Steinegger ", " ", - CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, + CITATION_MMSEQS2 | CITATION_GPU, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, @@ -585,17 +585,17 @@ std::vector baseCommands = { {"ungappedprefilter", ungappedprefilter, &par.ungappedprefilter, COMMAND_PREFILTER, "Optimal diagonal score search", NULL, - "Martin Steinegger ", + "Milot Mirdita & Martin Steinegger ", " ", - CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, + CITATION_MMSEQS2 | CITATION_GPU, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::prefilterDb }}}, {"gappedprefilter", gappedprefilter, &par.gappedprefilter, COMMAND_PREFILTER, "Optimal Smith-Waterman-based prefiltering (slow)", NULL, - "Martin Steinegger ", + "Milot Mirdita & Martin Steinegger ", " ", - CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, + CITATION_MMSEQS2 | CITATION_GPU, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::prefilterDb }}}, {"kmermatcher", kmermatcher, &par.kmermatcher, COMMAND_PREFILTER, @@ -751,14 +751,18 @@ std::vector baseCommands = { " ", CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL }, {"outDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, - {"touchdb", touchdb, &par.onlythreads, COMMAND_STORAGE, + {"touchdb", touchdb, &par.touchdb, COMMAND_STORAGE, "Preload DB into memory (page cache)", NULL, "Martin Steinegger ", "", CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb }}}, - - + {"gpuserver", gpuserver, &par.gpuserver, COMMAND_STORAGE, + "Start a GPU server", + 
NULL, + "Milot Mirdita & Martin Steinegger ", + "", + CITATION_GPU, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb }}}, {"createsubdb", createsubdb, &par.createsubdb, COMMAND_SET, "Create a subset of a DB from list of DB keys", "# Create a new sequenceDB from sequenceDB entries with keys 1, 2 and 3\n" @@ -1150,7 +1154,7 @@ std::vector baseCommands = { " ", CITATION_MMSEQS2,{{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}}, - {"tsv2exprofiledb", tsv2exprofiledb, &par.verbandcompression, COMMAND_PROFILE_PROFILE, + {"tsv2exprofiledb", tsv2exprofiledb, &par.tsv2exprofiledb, COMMAND_PROFILE_PROFILE, "Create a expandable profile db from TSV files", NULL, "Milot Mirdita ", diff --git a/src/alignment/CMakeLists.txt b/src/alignment/CMakeLists.txt index 0921d6a5e..9a23ba991 100644 --- a/src/alignment/CMakeLists.txt +++ b/src/alignment/CMakeLists.txt @@ -6,7 +6,6 @@ set(alignment_header_files alignment/MsaFilter.h alignment/MultipleAlignment.h alignment/PSSMCalculator.h - alignment/PSSMMasker.h alignment/StripedSmithWaterman.h alignment/BandedNucleotideAligner.h alignment/DistanceCalculator.h diff --git a/src/alignment/CompressedA3M.cpp b/src/alignment/CompressedA3M.cpp index 9cb14f246..4ff77567a 100644 --- a/src/alignment/CompressedA3M.cpp +++ b/src/alignment/CompressedA3M.cpp @@ -1,7 +1,7 @@ /* * extractA3M adapted from HH-suite 3.0: a3m_compress.h * Original Author: meiermark - * Licensed under GPLv3 + * Relicensed with permission from jsoeding to MIT */ #include "CompressedA3M.h" diff --git a/src/alignment/Matcher.cpp b/src/alignment/Matcher.cpp index 3b83bf51d..b4fd5cdf9 100644 --- a/src/alignment/Matcher.cpp +++ b/src/alignment/Matcher.cpp @@ -4,6 +4,7 @@ #include "Util.h" #include "Parameters.h" #include "StripedSmithWaterman.h" +#include Matcher::Matcher(int querySeqType, int targetSeqType, int maxSeqLen, BaseMatrix *m, EvalueComputation * evaluer, @@ -214,8 +215,10 @@ Matcher::result_t Matcher::parseAlignmentRecord(const char *data, bool readCompr unsigned int targetId = Util::fast_atoi(key); int score = Util::fast_atoi(entry[1]); - double seqId = strtod(entry[2],NULL); - double eval = strtod(entry[3],NULL); + double seqId; + fast_float::from_chars(entry[2], entry[3] - 1, seqId); + double eval; + fast_float::from_chars(entry[3], entry[4] - 1, eval); int qStart = Util::fast_atoi(entry[4]); int qEnd = Util::fast_atoi(entry[5]); diff --git a/src/alignment/PSSMCalculator.cpp b/src/alignment/PSSMCalculator.cpp index 79bafaf8f..4faa1c891 100644 --- a/src/alignment/PSSMCalculator.cpp +++ b/src/alignment/PSSMCalculator.cpp @@ -239,6 +239,23 @@ void PSSMCalculator::printPSSM(size_t queryLength){ } } +void PSSMCalculator::profileToString(std::string& result, size_t queryLength){ + result.append(5, ' '); + for (size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) { + result.append(SSTR(subMat->num2aa[aa])); + result.append(6, ' '); + } + result.append(1, '\n'); + for (size_t i = 0; i < queryLength; i++) { + for (size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) { + result.append(SSTR(profile[i * Sequence::PROFILE_AA_SIZE + aa], 4)); + result.append(1, ' '); + } + result.append(1, '\n'); + } + result.append(1, '\n'); +} + void PSSMCalculator::computeLogPSSM(BaseMatrix *subMat, char *pssm, const float *profile, float bitFactor, size_t queryLength, float scoreBias) { for(size_t pos = 0; pos < queryLength; pos++) { for(size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) { diff --git a/src/alignment/PSSMCalculator.h b/src/alignment/PSSMCalculator.h index 
146475b2d..e07d7d9b2 100644 --- a/src/alignment/PSSMCalculator.h +++ b/src/alignment/PSSMCalculator.h @@ -54,6 +54,7 @@ class PSSMCalculator { void printProfile(size_t queryLength); void printPSSM(size_t queryLength); + void profileToString(std::string& result, size_t queryLength); // prepare pseudocounts static void preparePseudoCounts(float *frequency, float *frequency_with_pseudocounts, size_t entrySize, size_t queryLength, const float **R); diff --git a/src/alignment/PSSMMasker.h b/src/alignment/PSSMMasker.h deleted file mode 100644 index e00007c2c..000000000 --- a/src/alignment/PSSMMasker.h +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef PSSMMASKER_H -#define PSSMMASKER_H - -#include -#include "tantan.h" -#include "BaseMatrix.h" -#include "PSSMCalculator.h" - -class PSSMMasker { -public: - PSSMMasker(size_t maxSeqLen, ProbabilityMatrix& probMatrix, BaseMatrix& subMat) : maxSeqLen(maxSeqLen), probMatrix(probMatrix), xAmioAcid(subMat.aa2num[static_cast('X')]) { - charSequence = (char*)malloc(sizeof(char) * maxSeqLen); - } - - ~PSSMMasker() { - free(charSequence); - } - - void mask(Sequence& centerSequence, float maskProb, PSSMCalculator::Profile& pssmRes) { - if ((size_t)centerSequence.L > maxSeqLen) { - maxSeqLen = sizeof(char) * centerSequence.L * 1.5; - charSequence = (char*)realloc(charSequence, maxSeqLen); - } - memcpy(charSequence, centerSequence.numSequence, sizeof(unsigned char) * centerSequence.L); - tantan::maskSequences(charSequence, charSequence + centerSequence.L, - 50 /*options.maxCycleLength*/, - probMatrix.probMatrixPointers, - 0.005 /*options.repeatProb*/, - 0.05 /*options.repeatEndProb*/, - 0.9 /*options.repeatOffsetProbDecay*/, - 0, 0, - maskProb /*options.minMaskProb*/, - probMatrix.hardMaskTable); - - for (int pos = 0; pos < centerSequence.L; pos++) { - if (charSequence[pos] == xAmioAcid) { - for (size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) { -// pssmRes.prob[pos * Sequence::PROFILE_AA_SIZE + aa] = subMat.pBack[aa] * 0.5; - pssmRes.pssm[pos * Sequence::PROFILE_AA_SIZE + aa] = -1; - } -// pssmRes.consensus[pos] = 'X'; - } - } - } -private: - char *charSequence; - size_t maxSeqLen; - ProbabilityMatrix& probMatrix; - const int xAmioAcid; -}; - -#endif diff --git a/src/commons/BaseMatrix.h b/src/commons/BaseMatrix.h index 7d86bdf22..4e9783799 100644 --- a/src/commons/BaseMatrix.h +++ b/src/commons/BaseMatrix.h @@ -102,7 +102,7 @@ class ProbabilityMatrix { delete[] probMatrixPointers; } - char hardMaskTable[256]; + unsigned char hardMaskTable[256]; const double **probMatrixPointers; private: diff --git a/src/commons/CMakeLists.txt b/src/commons/CMakeLists.txt index dc7ab3e74..2a1e42d6e 100644 --- a/src/commons/CMakeLists.txt +++ b/src/commons/CMakeLists.txt @@ -16,11 +16,13 @@ set(commons_header_files commons/Domain.h commons/ExpressionParser.h commons/FileUtil.h + commons/GpuUtil.h commons/HeaderSummarizer.h commons/IndexReader.h commons/itoa.h commons/KSeqBufferReader.h commons/KSeqWrapper.h + commons/Masker.h commons/MathUtil.h commons/MemoryMapped.h commons/MemoryTracker.h @@ -38,7 +40,6 @@ set(commons_header_files commons/StringBlock.h commons/SubstitutionMatrix.h commons/SubstitutionMatrixProfileStates.h - commons/tantan.h commons/TranslateNucl.h commons/Timer.h commons/UniprotKB.h @@ -60,6 +61,7 @@ set(commons_source_files commons/FileUtil.cpp commons/HeaderSummarizer.cpp commons/KSeqWrapper.cpp + commons/Masker.cpp commons/MemoryMapped.cpp commons/MemoryTracker.cpp commons/MMseqsMPI.cpp @@ -72,7 +74,6 @@ set(commons_source_files commons/Sequence.cpp 
commons/SequenceWeights.cpp commons/SubstitutionMatrix.cpp - commons/tantan.cpp commons/UniprotKB.cpp commons/Util.cpp PARENT_SCOPE diff --git a/src/commons/Command.h b/src/commons/Command.h index 308309e5a..ce34c91c6 100644 --- a/src/commons/Command.h +++ b/src/commons/Command.h @@ -10,10 +10,11 @@ const unsigned int CITATION_LINCLUST = 1U << 3; const unsigned int CITATION_PLASS = 1U << 4; const unsigned int CITATION_SERVER = 1U << 5; const unsigned int CITATION_TAXONOMY = 1U << 6; +const unsigned int CITATION_GPU = 1U << 7; // Make sure this is always the last bit // citations from inheriting modules will start from here -const unsigned int CITATION_END = CITATION_TAXONOMY << 1; +const unsigned int CITATION_END = CITATION_GPU << 1; struct MMseqsParameter; diff --git a/src/commons/DBReader.cpp b/src/commons/DBReader.cpp index f52d2369c..e40dff596 100644 --- a/src/commons/DBReader.cpp +++ b/src/commons/DBReader.cpp @@ -185,13 +185,15 @@ template bool DBReader::open(int accessType){ } compression = isCompressed(dbtype); - if(compression == COMPRESSED){ + padded = (getExtendedDbtype(dbtype) & Parameters::DBTYPE_EXTENDED_GPU); + + if(compression == COMPRESSED || padded){ compressedBufferSizes = new size_t[threads]; compressedBuffers = new char*[threads]; dstream = new ZSTD_DStream*[threads]; for(int i = 0; i < threads; i++){ // allocated buffer - compressedBufferSizes[i] = std::max(maxSeqLen+1, 1024u); + compressedBufferSizes[i] = std::max(maxSeqLen+2, 1024u); compressedBuffers[i] = (char*) malloc(compressedBufferSizes[i]); incrementMemory(compressedBufferSizes[i]); if(compressedBuffers[i]==NULL){ @@ -530,6 +532,31 @@ template size_t DBReader::bsearch(const Index * index, size_t N, return std::upper_bound(index, index + N, val, Index::compareByIdOnly) - index; } + +template char* DBReader::getUnpadded(size_t id, int thrIdx) { + char *data = getDataUncompressed(id); + size_t seqLen = getSeqLen(id); + + static const char CODE_TO_CHAR[21] = { + 'A', /* 0 */ 'C', /* 1 */ 'D', /* 2 */ + 'E', /* 3 */ 'F', /* 4 */ 'G', /* 5 */ + 'H', /* 6 */ 'I', /* 7 */ 'K', /* 8 */ + 'L', /* 9 */ 'M', /* 10 */ 'N', /* 11 */ + 'P', /* 12 */ 'Q', /* 13 */ 'R', /* 14 */ + 'S', /* 15 */ 'T', /* 16 */ 'V', /* 17 */ + 'W', /* 18 */ 'Y', /* 19 */ 'X' /* 20 */ + }; + + for(size_t i = 0; i < seqLen; i++){ + unsigned char code = static_cast(data[i]); + unsigned char baseCode = (code >= 32) ? code - 32 : code; + compressedBuffers[thrIdx][i] = CODE_TO_CHAR[baseCode]; + } + compressedBuffers[thrIdx][seqLen + 0] = '\n'; + compressedBuffers[thrIdx][seqLen + 1] = '\0'; + return compressedBuffers[thrIdx]; +} + template char* DBReader::getDataCompressed(size_t id, int thrIdx) { char *data = getDataUncompressed(id); @@ -573,7 +600,9 @@ template size_t DBReader::getAminoAcidDBSize() { template char* DBReader::getData(size_t id, int thrIdx){ if(compression == COMPRESSED){ return getDataCompressed(id, thrIdx); - }else{ + }else if (padded) { + return getUnpadded(id, thrIdx); + } else { return getDataUncompressed(id); } } @@ -628,7 +657,9 @@ template char* DBReader::getDataByDBKey(T dbKey, int thrIdx) { size_t id = getId(dbKey); if(compression == COMPRESSED ){ return (id != UINT_MAX) ? getDataCompressed(id, thrIdx) : NULL; - }else{ + } if(padded) { + return (id != UINT_MAX) ? getUnpadded(id, thrIdx) : NULL; + } else{ return (id != UINT_MAX) ? getDataByOffset(index[id].offset) : NULL; } } @@ -1016,6 +1047,7 @@ int DBReader::isCompressed(int dbtype) { return (dbtype & (1 << 31)) ? 
COMPRESSED : UNCOMPRESSED; } + template void DBReader::setSequentialAdvice() { #ifdef HAVE_POSIX_MADVISE diff --git a/src/commons/DBReader.h b/src/commons/DBReader.h index 57589b1f6..64f274b46 100644 --- a/src/commons/DBReader.h +++ b/src/commons/DBReader.h @@ -174,6 +174,8 @@ class DBReader : public MemoryTracker { char* getDataCompressed(size_t id, int thrIdx); + char* getUnpadded(size_t id, int thrIdx); + char* getDataUncompressed(size_t id); void touchData(size_t id); @@ -479,6 +481,7 @@ class DBReader : public MemoryTracker { // stores the dbtype (if dbtype file exists) int dbtype; int compression; + int padded; char ** compressedBuffers; size_t * compressedBufferSizes; ZSTD_DStream ** dstream; diff --git a/src/commons/Debug.h b/src/commons/Debug.h index 8b7c79b1c..9c0ad6469 100644 --- a/src/commons/Debug.h +++ b/src/commons/Debug.h @@ -178,8 +178,13 @@ class Debug prevPrintedId = 0; } - void updateProgress(){ - size_t id = __sync_fetch_and_add(¤tPos, 1); + void updateProgress(size_t pos = SIZE_MAX){ + size_t id; + if (pos == SIZE_MAX) { + id = __sync_fetch_and_add(¤tPos, 1); + } else { + id = pos; + } // if no active terminal exists write dots if(interactive == false){ if(totalEntries==SIZE_MAX) { diff --git a/src/commons/FastSort.h b/src/commons/FastSort.h index b54a3f8ba..994566135 100644 --- a/src/commons/FastSort.h +++ b/src/commons/FastSort.h @@ -1,4 +1,5 @@ #include + #ifdef ENABLE_IPS4O # include "simde/hedley.h" # if defined(HEDLEY_GCC_VERSION) && HEDLEY_GCC_VERSION_CHECK(0,0,0) && !HEDLEY_GCC_VERSION_CHECK(5,1,0) && defined(__cplusplus) @@ -14,14 +15,6 @@ # endif # define SORT_SERIAL std::sort #else -# ifdef OPENMP -# include -# define SORT_PARALLEL omptl::sort -# else -# define SORT_PARALLEL std::sort -# endif +# define SORT_PARALLEL std::sort # define SORT_SERIAL std::sort #endif - - - diff --git a/src/commons/GpuUtil.h b/src/commons/GpuUtil.h new file mode 100644 index 000000000..8f42ad765 --- /dev/null +++ b/src/commons/GpuUtil.h @@ -0,0 +1,148 @@ +#ifndef GPUUTIL_H +#define GPUUTIL_H + +#include "Debug.h" +#include "FileUtil.h" +#include "PrefilteringIndexReader.h" +#include "marv.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct GPUSharedMemory { + unsigned int maxSeqLen; // Maximum length of the sequence + unsigned int maxResListLen; // Maximum length of the results list + std::atomic serverReady{0}; // Server status indicator + std::atomic clientReady{0}; // Client readiness indicator + unsigned int queryOffset; // Offset to the query data + unsigned int queryLen; // Length of the query sequence + unsigned int resultsOffset; // Offset to the results data + unsigned int resultLen; // Length of the result list + unsigned int profileOffset; // Offset to the profile data + + // Get pointers to the query, results, and profile data sections + int8_t* getQueryPtr() { return reinterpret_cast(this) + queryOffset; } + Marv::Result* getResultsPtr() { return reinterpret_cast(reinterpret_cast(this) + resultsOffset); } + int8_t* getProfilePtr() { return reinterpret_cast(this) + profileOffset; } + + // Calculate the total size needed for the shared memory + static size_t calculateSize(unsigned int maxSeqLen, unsigned int maxResListLen) { + return sizeof(GPUSharedMemory) + + sizeof(char) * maxSeqLen + // Size for query data + sizeof(Marv::Result) * maxResListLen + // Size for results data + sizeof(int8_t) * 20 * maxSeqLen; // Size for profile data + } + + static std::string getShmHash(const std::string& db) { + std::string dbpath 
= FileUtil::getRealPathFromSymLink(PrefilteringIndexReader::dbPathWithoutIndex(db)); + char* visibleDevices = getenv("CUDA_VISIBLE_DEVICES"); + if (visibleDevices) { + dbpath.append(visibleDevices); + } + size_t hash = Util::hash(dbpath.c_str(), dbpath.length()); + return SSTR(hash); + } + + // Allocate and initialize shared memory + static GPUSharedMemory* alloc(const std::string& name, unsigned int maxSeqLen, unsigned int maxResListLen) { + size_t shm_size = calculateSize(maxSeqLen, maxResListLen); + int fd = shm_open(name.c_str(), O_CREAT | O_RDWR, 0666); + if (fd == -1) { + Debug(Debug::ERROR) << "Failed to open shared memory\n"; + EXIT(EXIT_FAILURE); + } + if (ftruncate(fd, shm_size) == -1) { + close(fd); + Debug(Debug::ERROR) << "Failed to size shared memory\n"; + EXIT(EXIT_FAILURE); + } + void* ptr = mmap(0, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); // Close the file descriptor early as it's no longer needed after mmap + if (ptr == MAP_FAILED) { + Debug(Debug::ERROR) << "Failed to map shared memory\n"; + EXIT(EXIT_FAILURE); + } + + GPUSharedMemory* layout = new (ptr) GPUSharedMemory; + layout->maxSeqLen = maxSeqLen; + layout->maxResListLen = maxResListLen; + layout->queryOffset = sizeof(GPUSharedMemory); + layout->resultsOffset = layout->queryOffset + sizeof(char) * maxSeqLen; + layout->profileOffset = layout->resultsOffset + sizeof(Marv::Result) * maxResListLen; + return layout; + } + + // Deallocate shared memory + static void dealloc(GPUSharedMemory* layout, const std::string& name) { + if (layout) { + size_t shm_size = calculateSize(layout->maxSeqLen, layout->maxResListLen); + if (munmap(layout, shm_size) == -1) { + Debug(Debug::ERROR) << "Error unmapping shared memory\n"; + } + if (shm_unlink(name.c_str()) == -1) { + Debug(Debug::ERROR) << "Error unlinking shared memory\n"; + } + } + } + + static void unmap(GPUSharedMemory* layout) { + if (layout) { + size_t shm_size = calculateSize(layout->maxSeqLen, layout->maxResListLen); + if (munmap(layout, shm_size) == -1) { + Debug(Debug::ERROR) << "Error unmapping shared memory\n"; + } + } + } + + // Function to open and map existing shared memory and automatically determine sizes + static GPUSharedMemory* openSharedMemory(const std::string& name) { + int fd = shm_open(name.c_str(), O_RDWR, 0666); + if (fd == -1) { + Debug(Debug::ERROR) << "Failed to open shared memory\n"; + EXIT(EXIT_FAILURE); + } + + // Map enough memory to access the first part of the structure + void* ptr = mmap(0, sizeof(GPUSharedMemory), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (ptr == MAP_FAILED) { + close(fd); + Debug(Debug::ERROR) << "Failed to map shared memory\n"; + EXIT(EXIT_FAILURE); + } + + // Now read maxSeqLen and maxResListLen from the mapped memory + unsigned int maxSeqLen = *(reinterpret_cast(ptr)); + unsigned int maxResListLen = *(reinterpret_cast(ptr) + 1); + + // Correctly calculate the total size of the shared memory using read values + size_t shm_size = GPUSharedMemory::calculateSize(maxSeqLen, maxResListLen); + + // Re-map with the full size now that we know it + munmap(ptr, sizeof(GPUSharedMemory)); // Unmap the initial small mapping + ptr = mmap(0, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); // Close the file descriptor as it's no longer needed after mmap + if (ptr == MAP_FAILED) { + Debug(Debug::ERROR) << "Failed to remap shared memory\n"; + EXIT(EXIT_FAILURE); + } + return reinterpret_cast(ptr); + } + + bool trySetServerReady(unsigned int pid) { + unsigned int expected = 0; + return 
serverReady.compare_exchange_strong(expected, pid, std::memory_order_release, std::memory_order_relaxed); + } + + void resetServerAndClientReady() { + serverReady.store(0, std::memory_order_release); + clientReady.store(0, std::memory_order_release); + } +}; + +#endif diff --git a/src/commons/GzReader.h b/src/commons/GzReader.h new file mode 100644 index 000000000..85aa55a81 --- /dev/null +++ b/src/commons/GzReader.h @@ -0,0 +1,93 @@ +#include "Debug.h" + +#include +#include +#include + +#ifdef HAVE_ZLIB +#include +#endif + +class GzReader { +public: + enum Mode { + FILE_MODE, +#ifdef HAVE_ZLIB + GZ_MODE +#endif + }; + + GzReader(const std::string &filename) { + if (filename.size() >= 3 && filename.substr(filename.size() - 3) == ".gz") { +#ifdef HAVE_ZLIB + mode = GZ_MODE; + gzHandle = gzopen(filename.c_str(), "r"); + openFailed = !gzHandle; + return; +#else + Debug(Debug::ERROR) << "MMseqs2 was not compiled with zlib support. Cannot read compressed input\n"; + EXIT(EXIT_FAILURE); +#endif + } + mode = FILE_MODE; + file = fopen(filename.c_str(), "r"); + openFailed = !file; + } + + ~GzReader() { + if (mode == FILE_MODE && file) fclose(file); +#ifdef HAVE_ZLIB + else if (mode == GZ_MODE && gzHandle) gzclose(gzHandle); +#endif + } + + bool fail() const { + return openFailed; + } + + bool getline(std::string &line) { + line.clear(); + if (openFailed) return false; + + char buffer[4096]; + bool complete = false; + while (!complete) { + if (mode == FILE_MODE) { + if (fgets(buffer, sizeof(buffer), file) != NULL) { + if (char *newline = strchr(buffer, '\n')) { + line.append(buffer, newline - buffer); + complete = true; + } else { + line.append(buffer); + } + } else { + return !line.empty(); + } + } + #ifdef HAVE_ZLIB + else if (mode == GZ_MODE) { + if (gzgets(gzHandle, buffer, sizeof(buffer)) != NULL) { + if (char *newline = strchr(buffer, '\n')) { + line.append(buffer, newline - buffer); + complete = true; + } else { + line.append(buffer); + } + } else { + return !line.empty(); + } + } + #endif + } + + return true; + } + +private: + Mode mode; + bool openFailed = false; + FILE *file = NULL; +#ifdef HAVE_ZLIB + gzFile gzHandle = NULL; +#endif +}; diff --git a/src/commons/Masker.cpp b/src/commons/Masker.cpp new file mode 100644 index 000000000..66736250e --- /dev/null +++ b/src/commons/Masker.cpp @@ -0,0 +1,136 @@ +#include "Masker.h" +#include // for std::toupper + +Masker::Masker(BaseMatrix &s) : subMat(s), probMatrix(s) +{ + maxSeqLen = 1; + charSequence = (unsigned char *)malloc(maxSeqLen * sizeof(char)); + maskLetterNum = subMat.aa2num[(int)'X']; +} + +Masker::~Masker() { + free(charSequence); +} + +int Masker::maskSequence(Sequence & seq, bool maskTantan, double maskProb, + bool maskLowerCaseLetter, int maskNrepeats) { + + int maskedResidues = 0; + + if(maskTantan){ + // 1. Apply tantan masking without influencing by repeat mask + maskedResidues += tantan::maskSequences(seq.numSequence, + seq.numSequence + seq.L, + 50 /*maxCycleLength*/, + probMatrix.probMatrixPointers, + 0.005 /*repeatProb*/, + 0.05 /*repeatEndProb*/, + 0.9 /*repeatOffsetProbDecay*/, + 0, 0, + maskProb /*minMaskProb*/, + probMatrix.hardMaskTable); + } + if( maskNrepeats > 0){ + // 2. Generate the mask for repeats + maskedResidues += maskRepeats(seq.numSequence, seq.L, maskNrepeats, maskLetterNum); + } + // 3. 
Handle lowercase masking + if(maskLowerCaseLetter){ + if ((Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS) || + Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES))) { + const char *charSeq = seq.getSeqData(); + for (int i = 0; i < seq.L; i++) { + if (std::islower((unsigned char)charSeq[i])) { + seq.numSequence[i] = maskLetterNum; // Apply masking + maskedResidues++; + } + } + } + } + // 4. Finalize masking + if(maskTantan || maskNrepeats || maskLowerCaseLetter){ + finalizeMasking(seq.numSequence, seq.L); + } + return maskedResidues; +} + +void Masker::maskPssm(Sequence& centerSequence, float maskProb, PSSMCalculator::Profile& pssmRes) { + if ((size_t)centerSequence.L > maxSeqLen) { + maxSeqLen = sizeof(char) * centerSequence.L * 1.5; + charSequence = (unsigned char*)realloc(charSequence, maxSeqLen); + } + memcpy(charSequence, centerSequence.numSequence, sizeof(unsigned char) * centerSequence.L); + tantan::maskSequences(charSequence, charSequence + centerSequence.L, + 50 /*options.maxCycleLength*/, + probMatrix.probMatrixPointers, + 0.005 /*options.repeatProb*/, + 0.05 /*options.repeatEndProb*/, + 0.9 /*options.repeatOffsetProbDecay*/, + 0, 0, + maskProb /*options.minMaskProb*/, + probMatrix.hardMaskTable); + + for (int pos = 0; pos < centerSequence.L; pos++) { + if (charSequence[pos] == maskLetterNum) { + for (size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) { + pssmRes.pssm[pos * Sequence::PROFILE_AA_SIZE + aa] = -1; + } + } + } +} + + +int Masker::maskRepeats(unsigned char * numSequence, const unsigned int seqLen, int maskNrepeating, char maskChar) { + + unsigned int repeatCount = 0; + int startOfRepeat = -1; + int previousChar = -1; // start below any valid residue code so a leading run is detected correctly + int maskedResidues = 0; // Counter for masked residues + + for (unsigned int pos = 0; pos < seqLen; ++pos) { + char currentChar = numSequence[pos]; + + if (currentChar == previousChar) { + repeatCount++; + } else { + if (repeatCount > (unsigned int)maskNrepeating) { + for (unsigned int i = startOfRepeat; i < pos; ++i) { + numSequence[i] = maskChar; + maskedResidues++; + } + } + repeatCount = 1; + startOfRepeat = pos; + previousChar = currentChar; + } + } + + // Handle the last run + if (repeatCount > (unsigned int)maskNrepeating) { + for (unsigned int i = startOfRepeat; i < seqLen; ++i) { + numSequence[i] = maskChar; + maskedResidues++; + } + } + + return maskedResidues; +} + +void Masker::finalizeMasking(unsigned char * numSequence, const unsigned int seqLen) { + unsigned char maskChar = probMatrix.hardMaskTable[0]; + + for (unsigned int i = 0; i < seqLen; i++) { + unsigned char code = numSequence[i]; + numSequence[i] = (code == maskChar || code == maskLetterNum) ? maskLetterNum : numSequence[i]; + } +} + +void Masker::applySoftmasking(unsigned char *charSequence, const unsigned char * num_sequence, unsigned int seqLen) { + for (unsigned int pos = 0; pos < seqLen; pos++) { + // If masked, lowercase (soft) or uppercase (hard) could be applied here if needed. + // For simplicity, we treat maskChar as masked and others as uppercase: + charSequence[pos] = (num_sequence[pos] == maskLetterNum) + ? 
(char)std::tolower(charSequence[pos]) + : (char)std::toupper(charSequence[pos]); + } +} diff --git a/src/commons/Masker.h b/src/commons/Masker.h new file mode 100644 index 000000000..9a9013beb --- /dev/null +++ b/src/commons/Masker.h @@ -0,0 +1,37 @@ +#ifndef MMSEQS_MASKER_H +#define MMSEQS_MASKER_H + +#include "Parameters.h" +#include "Sequence.h" +#include "SubstitutionMatrix.h" +#include "tantan.h" +#include "PSSMCalculator.h" +#include + +class Masker { +public: + Masker(BaseMatrix &subMat); + + ~Masker(); + + int maskSequence(Sequence & seq, bool maskTantan, double maskProb, + bool maskLowerCaseLetter, int maskNrepeating); + + void maskPssm(Sequence& centerSequence, float maskProb, PSSMCalculator::Profile& pssmRes); + + void applySoftmasking(unsigned char *charSequence, const unsigned char * numSequence, unsigned int seqLen); + + char maskLetterNum; + +private: + int maskRepeats(unsigned char *numSequence, const unsigned int seqLen, int maskNrepeating, char maskChar); + + void finalizeMasking(unsigned char * numSequence, const unsigned int seqLen); + + BaseMatrix &subMat; + ProbabilityMatrix probMatrix; + + unsigned char * charSequence; + size_t maxSeqLen; +}; +#endif diff --git a/src/commons/MathUtil.h b/src/commons/MathUtil.h index 6808cf857..dc03bccc8 100644 --- a/src/commons/MathUtil.h +++ b/src/commons/MathUtil.h @@ -256,6 +256,13 @@ class MathUtil { return sum; } + static float squareDist(const float xx, const float xy, const float xz, + const float yx, const float yy, const float yz){ + float d1 = xx - yx; + float d2 = xy - yy; + float d3 = xz - yz; + return (d1 * d1 + d2 * d2 + d3 * d3); + } }; diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 7e62c7c15..4dcd485c5 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -44,9 +44,10 @@ Parameters::Parameters(): PARAM_MAX_SEQ_LEN(PARAM_MAX_SEQ_LEN_ID, "--max-seq-len", "Max sequence length", "Maximum sequence length", typeid(size_t), (void *) &maxSeqLen, "^[0-9]{1}[0-9]*", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT), PARAM_DIAGONAL_SCORING(PARAM_DIAGONAL_SCORING_ID, "--diag-score", "Diagonal scoring", "Use ungapped diagonal scoring during prefilter", typeid(bool), (void *) &diagonalScoring, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), PARAM_EXACT_KMER_MATCHING(PARAM_EXACT_KMER_MATCHING_ID, "--exact-kmer-matching", "Exact k-mer matching", "Extract only exact k-mers for matching (range 0-1)", typeid(int), (void *) &exactKmerMatching, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), - PARAM_MASK_RESIDUES(PARAM_MASK_RESIDUES_ID, "--mask", "Mask residues", "Mask sequences in k-mer stage: 0: w/o low complexity masking, 1: with low complexity masking", typeid(int), (void *) &maskMode, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), + PARAM_MASK_RESIDUES(PARAM_MASK_RESIDUES_ID, "--mask", "Mask residues", "Mask sequences in prefilter stage with tantan: 0: w/o low complexity masking, 1: with low complexity masking", typeid(int), (void *) &maskMode, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), PARAM_MASK_PROBABILTY(PARAM_MASK_PROBABILTY_ID, "--mask-prob", "Mask residues probability", "Mask sequences is probablity is above threshold", typeid(float), (void *) &maskProb, "^0(\\.[0-9]+)?|^1(\\.0+)?$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), PARAM_MASK_LOWER_CASE(PARAM_MASK_LOWER_CASE_ID, "--mask-lower-case", 
"Mask lower case residues", "Lowercase letters will be excluded from k-mer search 0: include region, 1: exclude region", typeid(int), (void *) &maskLowerCaseMode, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), + PARAM_MASK_N_REPEAT(PARAM_MASK_N_REPEAT_ID, "--mask-n-repeat", "Mask lower letter repeating N times", "Repeat letters that occure > threshold in a rwo", typeid(int), (void *) &maskNrepeats, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), PARAM_MIN_DIAG_SCORE(PARAM_MIN_DIAG_SCORE_ID, "--min-ungapped-score", "Minimum diagonal score", "Accept only matches with ungapped alignment score above threshold", typeid(int), (void *) &minDiagScoreThr, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), PARAM_K_SCORE(PARAM_K_SCORE_ID, "--k-score", "k-score", "k-mer threshold for generating similar k-mer lists", typeid(MultiParam>), (void *) &kmerScore, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), PARAM_MAX_SEQS(PARAM_MAX_SEQS_ID, "--max-seqs", "Max results per query", "Maximum results per query sequence allowed to pass the prefilter (affects sensitivity)", typeid(size_t), (void *) &maxResListLen, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER), @@ -100,6 +101,10 @@ Parameters::Parameters(): PARAM_SIMILARITYSCORE(PARAM_SIMILARITYSCORE_ID, "--similarity-type", "Similarity type", "Type of score used for clustering. 1: alignment score 2: sequence identity", typeid(int), (void *) &similarityScoreType, "^[1-2]{1}$", MMseqsParameter::COMMAND_CLUST | MMseqsParameter::COMMAND_EXPERT), // logging PARAM_V(PARAM_V_ID, "-v", "Verbosity", "Verbosity level: 0: quiet, 1: +errors, 2: +warnings, 3: +info", typeid(int), (void *) &verbosity, "^[0-3]{1}$", MMseqsParameter::COMMAND_COMMON), + // gpu + PARAM_GPU(PARAM_GPU_ID, "--gpu", "Use GPU", "Use GPU (CUDA) if possible", typeid(int), (void *) &gpu, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON), + PARAM_GPU_SERVER(PARAM_GPU_SERVER_ID, "--gpu-server", "Use GPU server", "Use GPU server", typeid(int), (void *) &gpuServer, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON), + PARAM_GPU_SERVER_WAIT_TIMEOUT(PARAM_GPU_SERVER_WAIT_TIMEOUT_ID, "--gpu-server-wait-timeout", "Wait for GPU server", "Wait for GPU server for 0: don't wait -1: no wait limit: >0 this many seconds", typeid(int), (void *) &gpuServerWaitTimeout, "^-?[0-9]+", MMseqsParameter::COMMAND_COMMON), // convertalignments PARAM_FORMAT_MODE(PARAM_FORMAT_MODE_ID, "--format-mode", "Alignment format", "Output format:\n0: BLAST-TAB\n1: SAM\n2: BLAST-TAB + query/db length\n3: Pretty HTML\n4: BLAST-TAB + column headers\nBLAST-TAB (0) and BLAST-TAB + column headers (4) support custom output formats (--format-output)", typeid(int), (void *) &formatAlignmentMode, "^[0-4]{1}$"), PARAM_FORMAT_OUTPUT(PARAM_FORMAT_OUTPUT_ID, "--format-output", "Format alignment output", "Choose comma separated list of output columns from: query,target,evalue,gapopen,pident,fident,nident,qstart,qend,qlen\ntstart,tend,tlen,alnlen,raw,bits,cigar,qseq,tseq,qheader,theader,qaln,taln,qframe,tframe,mismatch,qcov,tcov\nqset,qsetid,tset,tsetid,taxid,taxname,taxlineage,qorfstart,qorfend,torfstart,torfend,ppos", typeid(std::string), (void *) &outfmt, ""), @@ -134,6 +139,7 @@ Parameters::Parameters(): PARAM_PC_MODE(PARAM_PC_MODE_ID, "--pseudo-cnt-mode", "Pseudo count mode", "use 0: substitution-matrix or 1: context-specific pseudocounts", typeid(int), (void *) &pcmode, "^[0-1]{1}$", 
MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), PARAM_PCA(PARAM_PCA_ID, "--pca", "Pseudo count a", "Pseudo count admixture strength", typeid(MultiParam), (void *) &pca, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), PARAM_PCB(PARAM_PCB_ID, "--pcb", "Pseudo count b", "Pseudo counts: Neff at half of maximum admixture (range 0.0-inf)", typeid(MultiParam), (void *) &pcb, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), + PARAM_PROFILE_OUTPUT_MODE(PARAM_PROFILE_OUTPUT_MODE_ID, "--profile-output-mode", "Profile output mode", "Profile output mode: 0: binary log-odds 1: human-readable frequencies", typeid(int), (void *) &profileOutputMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), // sequence2profile PARAM_NEFF(PARAM_NEFF_ID, "--neff", "Neff", "Neff included into context state profile (1.0,20.0)", typeid(float), (void *) &neff, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE), PARAM_TAU(PARAM_TAU_ID, "--tau", "Tau", "Tau: context state pseudo count mixture (0.0,1.0)", typeid(float), (void *) &tau, "[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE), @@ -164,7 +170,7 @@ Parameters::Parameters(): PARAM_NUM_ITERATIONS(PARAM_NUM_ITERATIONS_ID, "--num-iterations", "Search iterations", "Number of iterative profile search iterations", typeid(int), (void *) &numIterations, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PROFILE), PARAM_START_SENS(PARAM_START_SENS_ID, "--start-sens", "Start sensitivity", "Start sensitivity", typeid(float), (void *) &startSens, "^[0-9]*(\\.[0-9]+)?$"), PARAM_SENS_STEPS(PARAM_SENS_STEPS_ID, "--sens-steps", "Search steps", "Number of search steps performed from --start-sens to -s", typeid(int), (void *) &sensSteps, "^[1-9]{1}$"), - PARAM_PREF_MODE(PARAM_PREF_MODE_ID,"--prefilter-mode", "Prefilter mode", "prefilter mode: 0: kmer/ungapped 1: ungapped, 2: nofilter",typeid(int), (void *) &prefMode, "^[0-2]{1}$"), + PARAM_PREF_MODE(PARAM_PREF_MODE_ID,"--prefilter-mode", "Prefilter mode", "prefilter mode: 0: kmer/ungapped 1: ungapped, 2: nofilter, 3: ungapped&gapped",typeid(int), (void *) &prefMode, "^[0-3]{1}$"), PARAM_EXHAUSTIVE_SEARCH(PARAM_EXHAUSTIVE_SEARCH_ID, "--exhaustive-search", "Exhaustive search mode", "For bigger profile DB, run iteratively the search by greedily swapping the search results", typeid(bool), (void *) &exhaustiveSearch, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), PARAM_EXHAUSTIVE_SEARCH_FILTER(PARAM_EXHAUSTIVE_SEARCH_FILTER_ID, "--exhaustive-search-filter", "Filter results during exhaustive search", "Filter result during search: 0: do not filter, 1: filter", typeid(int), (void *) &exhaustiveFilterMsa, "^[0-1]{1}$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), @@ -173,6 +179,7 @@ Parameters::Parameters(): PARAM_ORF_FILTER_S(PARAM_ORF_FILTER_S_ID, "--orf-filter-s", "ORF filter sensitivity", "Sensitivity used for query ORF prefiltering", typeid(float), (void *) &orfFilterSens, "^[0-9]*(\\.[0-9]+)?$"), PARAM_ORF_FILTER_E(PARAM_ORF_FILTER_E_ID, "--orf-filter-e", "ORF filter e-value", "E-value threshold used for query ORF prefiltering", typeid(double), (void *) &orfFilterEval, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$"), PARAM_LCA_SEARCH(PARAM_LCA_SEARCH_ID, "--lca-search", "LCA search mode", "Efficient search for LCA candidates", typeid(bool), (void *) &lcaSearch, "", MMseqsParameter::COMMAND_PROFILE | 
MMseqsParameter::COMMAND_EXPERT), + PARAM_TRANSLATION_MODE(PARAM_TRANSLATION_MODE_ID, "--translation-mode", "Translation mode", "Translation AA seq from nucleotide by 0: ORFs, 1: full reading frames", typeid(int), (void *) &translationMode, "^[0-1]{1}$"), // easysearch PARAM_GREEDY_BEST_HITS(PARAM_GREEDY_BEST_HITS_ID, "--greedy-best-hits", "Greedy best hits", "Choose the best hits greedily to cover the query", typeid(bool), (void *) &greedyBestHits, ""), // extractorfs @@ -230,7 +237,7 @@ Parameters::Parameters(): PARAM_EXTRACT_LINES(PARAM_EXTRACT_LINES_ID, "--extract-lines", "Extract N lines", "Extract n lines of each entry", typeid(int), (void *) &extractLines, "^[1-9]{1}[0-9]*$"), PARAM_COMP_OPERATOR(PARAM_COMP_OPERATOR_ID, "--comparison-operator", "Numerical comparison operator", "Filter by comparing each entry row numerically by using the le) less-than-equal, ge) greater-than-equal or e) equal operator", typeid(std::string), (void *) &compOperator, ""), PARAM_COMP_VALUE(PARAM_COMP_VALUE_ID, "--comparison-value", "Numerical comparison value", "Filter by comparing each entry to this value", typeid(double), (void *) &compValue, "^.*$"), - PARAM_SORT_ENTRIES(PARAM_SORT_ENTRIES_ID, "--sort-entries", "Sort entries", "Sort column set by --filter-column, by 0: no sorting, 1: increasing, 2: decreasing, 3: random shuffle", typeid(int), (void *) &sortEntries, "^[1-9]{1}[0-9]*$"), + PARAM_SORT_ENTRIES(PARAM_SORT_ENTRIES_ID, "--sort-entries", "Sort entries", "Sort column set by --filter-column, by 0: no sorting, 1: increasing, 2: decreasing, 3: random shuffle, 4: priority", typeid(int), (void *) &sortEntries, "^[0-4]{1}$"), PARAM_BEATS_FIRST(PARAM_BEATS_FIRST_ID, "--beats-first", "Beats first", "Filter by comparing each entry to the first entry", typeid(bool), (void *) &beatsFirst, ""), PARAM_JOIN_DB(PARAM_JOIN_DB_ID, "--join-db", "join to DB", "Join another database entry with respect to the database identifier in the chosen column", typeid(std::string), (void *) &joinDB, ""), // besthitperset @@ -423,6 +430,7 @@ Parameters::Parameters(): prefilter.push_back(&PARAM_MASK_RESIDUES); prefilter.push_back(&PARAM_MASK_PROBABILTY); prefilter.push_back(&PARAM_MASK_LOWER_CASE); + prefilter.push_back(&PARAM_MASK_N_REPEAT); prefilter.push_back(&PARAM_MIN_DIAG_SCORE); prefilter.push_back(&PARAM_TAXON_LIST); prefilter.push_back(&PARAM_INCLUDE_IDENTITY); @@ -447,6 +455,10 @@ Parameters::Parameters(): ungappedprefilter.push_back(&PARAM_MAX_SEQS); ungappedprefilter.push_back(&PARAM_TAXON_LIST); ungappedprefilter.push_back(&PARAM_PRELOAD_MODE); + ungappedprefilter.push_back(&PARAM_GPU); + ungappedprefilter.push_back(&PARAM_GPU_SERVER); + ungappedprefilter.push_back(&PARAM_GPU_SERVER_WAIT_TIMEOUT); + ungappedprefilter.push_back(&PARAM_PREF_MODE); ungappedprefilter.push_back(&PARAM_THREADS); ungappedprefilter.push_back(&PARAM_COMPRESSED); ungappedprefilter.push_back(&PARAM_V); @@ -564,6 +576,7 @@ Parameters::Parameters(): result2profile.push_back(&PARAM_THREADS); result2profile.push_back(&PARAM_COMPRESSED); result2profile.push_back(&PARAM_V); + result2profile.push_back(&PARAM_PROFILE_OUTPUT_MODE); // createtsv createtsv.push_back(&PARAM_FIRST_SEQ_REP_SEQ); @@ -728,7 +741,9 @@ Parameters::Parameters(): // extract frames extractframes.push_back(&PARAM_ORF_FORWARD_FRAMES); - extractframes.push_back(&PARAM_ORF_REVERSE_FRAMES); + extractframes.push_back(&PARAM_ORF_REVERSE_FRAMES); + extractframes.push_back(&PARAM_TRANSLATION_TABLE); + extractframes.push_back(&PARAM_TRANSLATE); 
extractframes.push_back(&PARAM_CREATE_LOOKUP); extractframes.push_back(&PARAM_THREADS); extractframes.push_back(&PARAM_COMPRESSED); @@ -755,7 +770,10 @@ Parameters::Parameters(): splitsequence.push_back(&PARAM_V); // mask sequence + masksequence.push_back(&PARAM_MASK_RESIDUES); masksequence.push_back(&PARAM_MASK_PROBABILTY); + masksequence.push_back(&PARAM_MASK_LOWER_CASE); + masksequence.push_back(&PARAM_MASK_N_REPEAT); masksequence.push_back(&PARAM_THREADS); masksequence.push_back(&PARAM_COMPRESSED); masksequence.push_back(&PARAM_V); @@ -778,6 +796,7 @@ Parameters::Parameters(): indexdb.push_back(&PARAM_MASK_RESIDUES); indexdb.push_back(&PARAM_MASK_PROBABILTY); indexdb.push_back(&PARAM_MASK_LOWER_CASE); + indexdb.push_back(&PARAM_MASK_N_REPEAT); indexdb.push_back(&PARAM_SPACED_KMER_MODE); indexdb.push_back(&PARAM_SPACED_KMER_PATTERN); indexdb.push_back(&PARAM_S); @@ -805,6 +824,7 @@ Parameters::Parameters(): kmerindexdb.push_back(&PARAM_MASK_RESIDUES); kmerindexdb.push_back(&PARAM_MASK_PROBABILTY); kmerindexdb.push_back(&PARAM_MASK_LOWER_CASE); + kmerindexdb.push_back(&PARAM_MASK_N_REPEAT); kmerindexdb.push_back(&PARAM_CHECK_COMPATIBLE); kmerindexdb.push_back(&PARAM_SEARCH_TYPE); kmerindexdb.push_back(&PARAM_SPACED_KMER_MODE); @@ -821,6 +841,17 @@ Parameters::Parameters(): createdb.push_back(&PARAM_COMPRESSED); createdb.push_back(&PARAM_V); + // makepaddedseqdb + makepaddedseqdb.push_back(&PARAM_SUB_MAT); + makepaddedseqdb.push_back(&PARAM_SCORE_BIAS); + makepaddedseqdb.push_back(&PARAM_MASK_RESIDUES); + makepaddedseqdb.push_back(&PARAM_MASK_PROBABILTY); + makepaddedseqdb.push_back(&PARAM_MASK_LOWER_CASE); + makepaddedseqdb.push_back(&PARAM_MASK_N_REPEAT); + makepaddedseqdb.push_back(&PARAM_WRITE_LOOKUP); + makepaddedseqdb.push_back(&PARAM_THREADS); + makepaddedseqdb.push_back(&PARAM_V); + // convert2fasta convert2fasta.push_back(&PARAM_USE_HEADER_FILE); convert2fasta.push_back(&PARAM_V); @@ -867,6 +898,7 @@ Parameters::Parameters(): filterDb.push_back(&PARAM_FILTER_FILE); filterDb.push_back(&PARAM_BEATS_FIRST); filterDb.push_back(&PARAM_MAPPING_FILE); + filterDb.push_back(&PARAM_WEIGHT_FILE); filterDb.push_back(&PARAM_TRIM_TO_ONE_COL); filterDb.push_back(&PARAM_EXTRACT_LINES); filterDb.push_back(&PARAM_COMP_OPERATOR); @@ -972,6 +1004,7 @@ Parameters::Parameters(): kmermatcher.push_back(&PARAM_MASK_RESIDUES); kmermatcher.push_back(&PARAM_MASK_PROBABILTY); kmermatcher.push_back(&PARAM_MASK_LOWER_CASE); + kmermatcher.push_back(&PARAM_MASK_N_REPEAT); kmermatcher.push_back(&PARAM_COV_MODE); kmermatcher.push_back(&PARAM_K); kmermatcher.push_back(&PARAM_C); @@ -994,6 +1027,7 @@ Parameters::Parameters(): kmersearch.push_back(&PARAM_MASK_RESIDUES); kmersearch.push_back(&PARAM_MASK_PROBABILTY); kmersearch.push_back(&PARAM_MASK_LOWER_CASE); + kmersearch.push_back(&PARAM_MASK_N_REPEAT); kmersearch.push_back(&PARAM_COV_MODE); kmersearch.push_back(&PARAM_C); kmersearch.push_back(&PARAM_MAX_SEQ_LEN); @@ -1250,10 +1284,11 @@ Parameters::Parameters(): // WORKFLOWS searchworkflow = combineList(align, prefilter); + searchworkflow = combineList(searchworkflow, ungappedprefilter); searchworkflow = combineList(searchworkflow, rescorediagonal); searchworkflow = combineList(searchworkflow, result2profile); searchworkflow = combineList(searchworkflow, extractorfs); - searchworkflow = combineList(searchworkflow, translatenucs); + searchworkflow = combineList(searchworkflow, extractframes); searchworkflow = combineList(searchworkflow, splitsequence); searchworkflow = combineList(searchworkflow, offsetalignment); 
// needed for slice search, however all its parameters are already present in searchworkflow @@ -1261,7 +1296,6 @@ Parameters::Parameters(): searchworkflow.push_back(&PARAM_NUM_ITERATIONS); searchworkflow.push_back(&PARAM_START_SENS); searchworkflow.push_back(&PARAM_SENS_STEPS); - searchworkflow.push_back(&PARAM_PREF_MODE); searchworkflow.push_back(&PARAM_EXHAUSTIVE_SEARCH); searchworkflow.push_back(&PARAM_EXHAUSTIVE_SEARCH_FILTER); searchworkflow.push_back(&PARAM_STRAND); @@ -1270,11 +1304,11 @@ Parameters::Parameters(): searchworkflow.push_back(&PARAM_RUNNER); searchworkflow.push_back(&PARAM_REUSELATEST); searchworkflow.push_back(&PARAM_REMOVE_TMP_FILES); + searchworkflow.push_back(&PARAM_TRANSLATION_MODE); linsearchworkflow = combineList(align, kmersearch); linsearchworkflow = combineList(linsearchworkflow, swapresult); linsearchworkflow = combineList(linsearchworkflow, extractorfs); - linsearchworkflow = combineList(linsearchworkflow, translatenucs); linsearchworkflow = combineList(linsearchworkflow, offsetalignment); linsearchworkflow.push_back(&PARAM_RUNNER); linsearchworkflow.push_back(&PARAM_REUSELATEST); @@ -1288,18 +1322,19 @@ Parameters::Parameters(): easysearchworkflow = combineList(searchworkflow, convertalignments); easysearchworkflow = combineList(easysearchworkflow, summarizeresult); easysearchworkflow = combineList(easysearchworkflow, createdb); + easysearchworkflow = combineList(easysearchworkflow, makepaddedseqdb); easysearchworkflow.push_back(&PARAM_GREEDY_BEST_HITS); // createindex workflow createindex = combineList(indexdb, extractorfs); - createindex = combineList(createindex, translatenucs); + createindex = combineList(createindex, extractframes); createindex = combineList(createindex, splitsequence); + createindex.push_back(&PARAM_TRANSLATION_MODE); createindex.push_back(&PARAM_STRAND); createindex.push_back(&PARAM_REMOVE_TMP_FILES); // createindex workflow createlinindex = combineList(kmerindexdb, extractorfs); - createlinindex = combineList(createlinindex, translatenucs); createlinindex.push_back(&PARAM_REMOVE_TMP_FILES); // linclust workflow @@ -1324,6 +1359,9 @@ Parameters::Parameters(): clusterworkflow.push_back(&PARAM_REUSELATEST); clusterworkflow.push_back(&PARAM_RUNNER); clusterworkflow = combineList(clusterworkflow, linclustworkflow); + clusterworkflow = removeParameter(clusterworkflow, PARAM_GPU); + clusterworkflow = removeParameter(clusterworkflow, PARAM_GPU_SERVER); + clusterworkflow = removeParameter(clusterworkflow, PARAM_GPU_SERVER_WAIT_TIMEOUT); // easyclusterworkflow easyclusterworkflow = combineList(clusterworkflow, createdb); @@ -1365,21 +1403,29 @@ Parameters::Parameters(): clusterUpdate.push_back(&PARAM_REUSELATEST); clusterUpdate.push_back(&PARAM_USESEQID); clusterUpdate.push_back(&PARAM_RECOVER_DELETED); + clusterUpdate = removeParameter(clusterUpdate, PARAM_GPU); + clusterUpdate = removeParameter(clusterUpdate, PARAM_GPU_SERVER); + clusterUpdate = removeParameter(clusterUpdate, PARAM_GPU_SERVER_WAIT_TIMEOUT); mapworkflow = combineList(prefilter, rescorediagonal); mapworkflow = combineList(mapworkflow, extractorfs); - mapworkflow = combineList(mapworkflow, translatenucs); mapworkflow.push_back(&PARAM_START_SENS); mapworkflow.push_back(&PARAM_SENS_STEPS); mapworkflow.push_back(&PARAM_RUNNER); mapworkflow.push_back(&PARAM_REUSELATEST); mapworkflow.push_back(&PARAM_REMOVE_TMP_FILES); + mapworkflow = removeParameter(mapworkflow, PARAM_GPU); + mapworkflow = removeParameter(mapworkflow, PARAM_GPU_SERVER); + mapworkflow = 
removeParameter(mapworkflow, PARAM_GPU_SERVER_WAIT_TIMEOUT); enrichworkflow = combineList(searchworkflow, prefilter); enrichworkflow = combineList(enrichworkflow, subtractdbs); enrichworkflow = combineList(enrichworkflow, align); enrichworkflow = combineList(enrichworkflow, expandaln); enrichworkflow = combineList(enrichworkflow, result2profile); + enrichworkflow = removeParameter(enrichworkflow, PARAM_GPU); + enrichworkflow = removeParameter(enrichworkflow, PARAM_GPU_SERVER); + enrichworkflow = removeParameter(enrichworkflow, PARAM_GPU_SERVER_WAIT_TIMEOUT); databases.push_back(&PARAM_HELP); databases.push_back(&PARAM_HELP_LONG); @@ -1408,6 +1454,22 @@ Parameters::Parameters(): appenddbtoindex.push_back(&PARAM_ID_LIST); appenddbtoindex.push_back(&PARAM_V); + // touchdb + touchdb.push_back(&PARAM_THREADS); + touchdb.push_back(&PARAM_V); + + // gpu server + gpuserver.push_back(&PARAM_GPU); + gpuserver.push_back(&PARAM_MAX_SEQS); + gpuserver.push_back(&PARAM_PRELOAD_MODE); + gpuserver.push_back(&PARAM_PREF_MODE); + + // tsv2exprofiledb + tsv2exprofiledb.push_back(&PARAM_GPU); + tsv2exprofiledb.push_back(&PARAM_THREADS); + tsv2exprofiledb.push_back(&PARAM_COMPRESSED); + tsv2exprofiledb.push_back(&PARAM_V); + //checkSaneEnvironment(); setDefaults(); } @@ -1568,7 +1630,7 @@ void Parameters::printUsageMessage(const Command& command, const unsigned int ou if (command.citations > 0) { ss << "\nreferences:\n"; for (unsigned int pos = 0; pos != sizeof(command.citations) * CHAR_BIT; ++pos) { - unsigned int citation = 1 << pos; + unsigned int citation = 1U << pos; if (command.citations & citation && citations.find(citation) != citations.end()) { ss << " - " << citations.at(citation) << "\n"; } @@ -2279,6 +2341,7 @@ void Parameters::setDefaults() { orfFilterSens = 2.0; orfFilterEval = 100; lcaSearch = false; + translationMode = PARAM_TRANSLATION_MODE_ORF; greedyBestHits = false; @@ -2305,6 +2368,7 @@ void Parameters::setDefaults() { maskMode = 1; maskProb = 0.9; maskLowerCaseMode = 0; + maskNrepeats = 0; minDiagScoreThr = 15; spacedKmer = true; includeIdentity = false; @@ -2394,6 +2458,7 @@ void Parameters::setDefaults() { pcmode = PCMODE_SUBSTITUTION_SCORE; pca = MultiParam(PseudoCounts(1.1, 1.4)); pcb = MultiParam(PseudoCounts(4.1, 5.8)); + profileOutputMode = 0; // sequence2profile neff = 1.0; @@ -2402,6 +2467,22 @@ void Parameters::setDefaults() { // logging verbosity = Debug::INFO; + // gpu + gpu = 0; +#ifdef HAVE_CUDA + char* gpuEnv = getenv("MMSEQS_FORCE_GPU"); + if (gpuEnv != NULL) { + gpu = 1; + } +#endif + gpuServer = 0; + gpuServerWaitTimeout = 10 * 60; +#ifdef HAVE_CUDA + char* gpuServerEnv = getenv("MMSEQS_FORCE_GPUSERVER"); + if (gpuServerEnv != NULL) { + gpuServer = 1; + } +#endif //extractorfs orfMinLength = 30; orfMaxLength = 32734; @@ -2601,8 +2682,9 @@ void Parameters::setDefaults() { { CITATION_UNICLUST, "Mirdita M, von den Driesch L, Galiez C, Martin M, Soding J, Steinegger M: Uniclust databases of clustered and deeply annotated protein sequences and alignments. Nucleic Acids Research 45(D1), D170-D176 (2017)" }, { CITATION_LINCLUST, "Steinegger M, Soding J: Clustering huge protein sequence sets in linear time. Nature Communications, 9(1), 2542 (2018)" }, { CITATION_PLASS, "Steinegger M, Mirdita M, Soding J: Protein-level assembly increases protein sequence recovery from metagenomic samples manyfold. Nature Methods, 16(7), 603-606 (2019)" }, - { CITATION_SERVER, "Mirdita M, Steinegger M, Soding J: MMseqs2 desktop and local web server app for fast, interactive sequence searches. 
Bioinformatics, 35(16), 2856–2858 (2019)" }, + { CITATION_SERVER, "Mirdita M, Steinegger M, Soding J: MMseqs2 desktop and local web server app for fast, interactive sequence searches. Bioinformatics, 35(16), 2856-2858 (2019)" }, { CITATION_TAXONOMY, "Mirdita M, Steinegger M, Breitwieser F, Soding J, Levy Karin E: Fast and sensitive taxonomic assignment to metagenomic contigs. Bioinformatics, btab184 (2021)" }, + { CITATION_GPU, "Kallenborn F, Chacon A, Hundt C, Sirelkhatim H, Didi K, Dallago C, Mirdita M, Schmidt B, Steinegger M: GPU-accelerated homology search with MMseqs2. bioRxiv, 2024.11.13.623350 (2024)" }, }; } diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index d69a57d0e..bfea67afe 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -90,6 +90,7 @@ class Parameters { static const unsigned int DBTYPE_EXTENDED_COMPRESSED = 1; static const unsigned int DBTYPE_EXTENDED_INDEX_NEED_SRC = 2; static const unsigned int DBTYPE_EXTENDED_CONTEXT_PSEUDO_COUNTS = 4; + static const unsigned int DBTYPE_EXTENDED_GPU = 8; // don't forget to add new database types to DBReader::getDbTypeName and Parameters::PARAM_OUTPUT_DBTYPE @@ -312,6 +313,7 @@ class Parameters { static const int PREF_MODE_KMER = 0; static const int PREF_MODE_UNGAPPED = 1; static const int PREF_MODE_EXHAUSTIVE = 2; + static const int PREF_MODE_UNGAPPED_AND_GAPPED = 3; // unpackdb static const int UNPACK_NAME_KEY = 0; @@ -321,6 +323,10 @@ class Parameters { static const int PARAM_RESULT_DIRECTION_QUERY = 0; static const int PARAM_RESULT_DIRECTION_TARGET = 1; + // translation mode + static const int PARAM_TRANSLATION_MODE_ORF = 0; + static const int PARAM_TRANSLATION_MODE_FRAME = 1; + // path to databases std::string db1; std::string db1Index; @@ -386,6 +392,9 @@ class Parameters { size_t maxSeqLen; // sequence length size_t maxResListLen; // Maximal result list length per query int verbosity; // log level + int gpu; // use GPU + int gpuServer; // use the gpu server + int gpuServerWaitTimeout; // wait for this many seconds until GPU server is ready int threads; // Amounts of threads int compressed; // compressed writer bool removeTmpFiles; // Do not delete temp files @@ -405,6 +414,7 @@ class Parameters { int maskMode; // mask low complex areas float maskProb; // mask probability int maskLowerCaseMode; // mask lowercase letters in prefilter and kmermatchers + int maskNrepeats; // mask letters that occur at least N times in a row int minDiagScoreThr; // min diagonal score int spacedKmer; // Spaced Kmers @@ -467,6 +477,7 @@ class Parameters { float orfFilterSens; double orfFilterEval; bool lcaSearch; + int translationMode; // easysearch bool greedyBestHits; @@ -527,6 +538,7 @@ class Parameters { int pcmode; MultiParam pca; MultiParam pcb; + int profileOutputMode; // sequence2profile float neff; @@ -747,6 +759,7 @@ class Parameters { PARAMETER(PARAM_MASK_RESIDUES) PARAMETER(PARAM_MASK_PROBABILTY) PARAMETER(PARAM_MASK_LOWER_CASE) + PARAMETER(PARAM_MASK_N_REPEAT) PARAMETER(PARAM_MIN_DIAG_SCORE) PARAMETER(PARAM_K_SCORE) @@ -808,7 +821,10 @@ class Parameters { // logging PARAMETER(PARAM_V) std::vector clust; - + // gpu + PARAMETER(PARAM_GPU) + PARAMETER(PARAM_GPU_SERVER) + PARAMETER(PARAM_GPU_SERVER_WAIT_TIMEOUT) // format alignment PARAMETER(PARAM_FORMAT_MODE) PARAMETER(PARAM_FORMAT_OUTPUT) @@ -847,6 +863,7 @@ class Parameters { PARAMETER(PARAM_PC_MODE) PARAMETER(PARAM_PCA) PARAMETER(PARAM_PCB) + PARAMETER(PARAM_PROFILE_OUTPUT_MODE) // sequence2profile PARAMETER(PARAM_NEFF) @@ -890,6 +907,7 @@ class 
Parameters { PARAMETER(PARAM_ORF_FILTER_S) PARAMETER(PARAM_ORF_FILTER_E) PARAMETER(PARAM_LCA_SEARCH) + PARAMETER(PARAM_TRANSLATION_MODE) // easysearch PARAMETER(PARAM_GREEDY_BEST_HITS) @@ -1110,6 +1128,7 @@ class Parameters { std::vector createlinindex; std::vector convertalignments; std::vector createdb; + std::vector makepaddedseqdb; std::vector convert2fasta; std::vector result2flat; std::vector result2repseq; @@ -1182,6 +1201,9 @@ class Parameters { std::vector tar2db; std::vector unpackdbs; std::vector appenddbtoindex; + std::vector touchdb; + std::vector gpuserver; + std::vector tsv2exprofiledb; std::vector combineList(const std::vector &par1, const std::vector &par2); diff --git a/src/commons/Sequence.cpp b/src/commons/Sequence.cpp index 6461efcda..f8b7ed505 100644 --- a/src/commons/Sequence.cpp +++ b/src/commons/Sequence.cpp @@ -7,7 +7,6 @@ #include "MathUtil.h" #include "SubstitutionMatrixProfileStates.h" #include "PSSMCalculator.h" - #include // short_max #include @@ -227,7 +226,11 @@ void Sequence::mapSequence(size_t id, unsigned int dbKey, std::pair(realloc(numSequence, this->L+1)); maxLen = this->L; } - memcpy(this->numSequence, data.first, this->L); + // map softmasked sequences to regular sequences + // softmasked character start at 32 + for(int i = 0; i < this->L; i++){ + this->numSequence[i] = ( data.first[i] >= 32) ? data.first[i] - 32 : data.first[i]; + } } else { Debug(Debug::ERROR) << "Invalid sequence type!\n"; EXIT(EXIT_FAILURE); diff --git a/src/commons/SubstitutionMatrix.cpp b/src/commons/SubstitutionMatrix.cpp index f15bc9981..3130a86c6 100644 --- a/src/commons/SubstitutionMatrix.cpp +++ b/src/commons/SubstitutionMatrix.cpp @@ -1,8 +1,7 @@ #include "SubstitutionMatrix.h" #include "Util.h" #include "Debug.h" -#include "lambda_calculator.h" - +// #include "lambda_calculator.h" #include #include @@ -58,34 +57,40 @@ SubstitutionMatrix::SubstitutionMatrix(const char *filename, float bitFactor, fl } -bool SubstitutionMatrix::estimateLambdaAndBackground(const double **scoreMatrix, - int alphabetSize, double *pBack, double &lambda) { +bool SubstitutionMatrix::estimateLambdaAndBackground( + const double** MAYBE_UNUSED(scoreMatrix), + int MAYBE_UNUSED(alphabetSize), + double* MAYBE_UNUSED(pBack), + double& MAYBE_UNUSED(lambda) + ) { + Debug(Debug::ERROR) << "Custom Substitution Matrix not supported in the release. Please use previous release\n"; + return false; // We need to pass the parameters as 1-based pointers, hence the +1s and -1s. 
- std::vector cells(alphabetSize * (alphabetSize + 1)); - std::vector pointers(alphabetSize + 1); - - for (int i = 0; i < alphabetSize; ++i) { - pointers[i + 1] = &cells[i * alphabetSize]; - for (int j = 0; j < alphabetSize; ++j) { - cells[i * alphabetSize + j + 1] = scoreMatrix[i][j]; - } - } - - std::vector letterProbs1(alphabetSize, 0); - std::vector letterProbs2(alphabetSize, 0); - - lambda = calculate_lambda(&pointers[0], alphabetSize, - &letterProbs1[0] - 1, - &letterProbs2[0] - 1); - - for (int i = 0; i < alphabetSize; i++) { - pBack[i] = letterProbs1[i]; - } - - if (lambda < 0) - return false; //bad - else - return true; //good + // std::vector cells(alphabetSize * (alphabetSize + 1)); + // std::vector pointers(alphabetSize + 1); + + // for (int i = 0; i < alphabetSize; ++i) { + // pointers[i + 1] = &cells[i * alphabetSize]; + // for (int j = 0; j < alphabetSize; ++j) { + // cells[i * alphabetSize + j + 1] = scoreMatrix[i][j]; + // } + // } + + // std::vector letterProbs1(alphabetSize, 0); + // std::vector letterProbs2(alphabetSize, 0); + + // lambda = calculate_lambda(&pointers[0], alphabetSize, + // &letterProbs1[0] - 1, + // &letterProbs2[0] - 1); + + // for (int i = 0; i < alphabetSize; i++) { + // pBack[i] = letterProbs1[i]; + // } + + // if (lambda < 0) + // return false; //bad + // else + // return true; //good } diff --git a/src/commons/Util.cpp b/src/commons/Util.cpp index 6252262f1..ca622ae22 100644 --- a/src/commons/Util.cpp +++ b/src/commons/Util.cpp @@ -21,6 +21,9 @@ #include #include // std::ifstream +#define FMT_HEADER_ONLY 1 +#include + #ifdef OPENMP #include #endif @@ -653,14 +656,20 @@ std::string SSTR(unsigned long long x) { template<> std::string SSTR(double x) { - char buffer[32]; - int n = snprintf(buffer, sizeof(buffer), "%.3E", x); - return std::string(buffer, n); + return fmt::format("{:.3E}", x); +} + +template<> +std::string SSTR(double x, int precision) { + return fmt::format("{:.{}E}", x, precision); } template<> std::string SSTR(float x) { - char buffer[32]; - int n = snprintf(buffer, sizeof(buffer), "%.3f", x); - return std::string(buffer, n); + return fmt::format("{:.3f}", x); } + +template<> +std::string SSTR(float x, int precision) { + return fmt::format("{:.{}f}", x, precision); +} \ No newline at end of file diff --git a/src/commons/Util.h b/src/commons/Util.h index 0dd67adf8..0d02e3222 100644 --- a/src/commons/Util.h +++ b/src/commons/Util.h @@ -28,6 +28,12 @@ std::string SSTR(T) { return ""; } +template +std::string SSTR(T, int) { + static_assert(assert_false::value , "Not implemented for requested type"); + return ""; +} + template<> std::string SSTR(const char*); template<> std::string SSTR(char*); template<> std::string SSTR(bool); @@ -45,6 +51,8 @@ template<> std::string SSTR(long long); template<> std::string SSTR(unsigned long long); template<> std::string SSTR(double); template<> std::string SSTR(float); +template<> std::string SSTR(double, int precision); +template<> std::string SSTR(float, int precision); #define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) diff --git a/src/commons/tantan.cpp b/src/commons/tantan.cpp deleted file mode 100644 index e870fa58b..000000000 --- a/src/commons/tantan.cpp +++ /dev/null @@ -1,506 +0,0 @@ -// Copyright 2010 Martin C. Frith - -#include "tantan.h" - -#include // fill, max -#include -#include // pow, abs -#include // cerr -#include // accumulate -#include - -#define BEG(v) ((v).empty() ? 0 : &(v).front()) -#define END(v) ((v).empty() ? 
0 : &(v).back() + 1) - -namespace tantan { - - void multiplyAll(std::vector &v, double factor) { - for (std::vector::iterator i = v.begin(); i < v.end(); ++i) - *i *= factor; - } - - double firstRepeatOffsetProb(double probMult, int maxRepeatOffset) { - if (probMult < 1 || probMult > 1) { - return (1 - probMult) / (1 - std::pow(probMult, maxRepeatOffset)); - } - return 1.0 / maxRepeatOffset; - } - - void checkForwardAndBackwardTotals(double fTot, double bTot) { - double x = std::abs(fTot); - double y = std::abs(bTot); - - // ??? Is 1e6 suitable here ??? - if (std::abs(fTot - bTot) > std::max(x, y) / 1e6) - std::cerr << "tantan: warning: possible numeric inaccuracy\n" - << "tantan: forward algorithm total: " << fTot << "\n" - << "tantan: backward algorithm total: " << bTot << "\n"; - } - - struct Tantan { - enum { scaleStepSize = 16 }; - - const char *seqBeg; // start of the sequence - const char *seqEnd; // end of the sequence - const char *seqPtr; // current position in the sequence - - int maxRepeatOffset; - - const const_double_ptr *likelihoodRatioMatrix; - - double b2b; // transition probability from background to background - double f2b; // transition probability from foreground to background - double g2g; // transition probability from gap/indel to gap/indel - //double f2g; // transition probability from foreground to gap/indel - //double g2f; // transition probability from gap/indel to foreground - double oneGapProb; // f2g * g2f - double endGapProb; // f2g * 1 - double f2f0; // foreground to foreground, if there are 0 indel transitions - double f2f1; // foreground to foreground, if there is 1 indel transition - double f2f2; // foreground to foreground, if there are 2 indel transitions - double b2fDecay; - double b2fGrowth; - double b2fFirst; // background state to first foreground state - double b2fLast; // background state to last foreground state - - double backgroundProb; - std::vector b2fProbs; // background state to each foreground state - std::vector foregroundProbs; - std::vector insertionProbs; - - std::vector scaleFactors; - - Tantan(const char *seqBeg, - const char *seqEnd, - int maxRepeatOffset, - const const_double_ptr *likelihoodRatioMatrix, - double repeatProb, - double repeatEndProb, - double repeatOffsetProbDecay, - double firstGapProb, - double otherGapProb) { - assert(maxRepeatOffset > 0); - assert(repeatProb >= 0 && repeatProb < 1); - // (if repeatProb==1, then any sequence is impossible) - assert(repeatEndProb >= 0 && repeatEndProb <= 1); - assert(repeatOffsetProbDecay > 0 && repeatOffsetProbDecay <= 1); - assert(otherGapProb >= 0 && otherGapProb <= 1); - assert(firstGapProb >= 0); - assert(repeatEndProb + firstGapProb * 2 <= 1); - - this->seqBeg = seqBeg; - this->seqEnd = seqEnd; - this->seqPtr = seqBeg; - this->maxRepeatOffset = maxRepeatOffset; - this->likelihoodRatioMatrix = likelihoodRatioMatrix; - - b2b = 1 - repeatProb; - f2b = repeatEndProb; - g2g = otherGapProb; - //f2g = firstGapProb; - //g2f = 1 - otherGapProb; - oneGapProb = firstGapProb * (1 - otherGapProb); - endGapProb = firstGapProb * (maxRepeatOffset > 1); - f2f0 = 1 - repeatEndProb; - f2f1 = 1 - repeatEndProb - firstGapProb; - f2f2 = 1 - repeatEndProb - firstGapProb * 2; - - b2fDecay = repeatOffsetProbDecay; - b2fGrowth = 1 / repeatOffsetProbDecay; - - b2fFirst = repeatProb * firstRepeatOffsetProb(b2fDecay, maxRepeatOffset); - b2fLast = repeatProb * firstRepeatOffsetProb(b2fGrowth, maxRepeatOffset); - - b2fProbs.resize(maxRepeatOffset); - foregroundProbs.resize(maxRepeatOffset); - 
insertionProbs.resize(maxRepeatOffset - 1); - - double p = b2fFirst; - for (int i = 0; i < maxRepeatOffset; ++i) { - b2fProbs[i] = p; - p *= b2fDecay; - } - - scaleFactors.resize((seqEnd - seqBeg) / scaleStepSize); - } - - void initializeForwardAlgorithm() { - backgroundProb = 1.0; - std::fill(foregroundProbs.begin(), foregroundProbs.end(), 0.0); - std::fill(insertionProbs.begin(), insertionProbs.end(), 0.0); - } - - double forwardTotal() { - double fromForeground = std::accumulate(foregroundProbs.begin(), - foregroundProbs.end(), 0.0); - double total = backgroundProb * b2b + fromForeground * f2b; - assert(total > 0); - return total; - } - - void initializeBackwardAlgorithm() { - backgroundProb = b2b; - std::fill(foregroundProbs.begin(), foregroundProbs.end(), f2b); - std::fill(insertionProbs.begin(), insertionProbs.end(), 0.0); - } - - double backwardTotal() { - assert(backgroundProb > 0); - return backgroundProb; - } - - void calcForwardTransitionProbsWithGaps() { - double fromBackground = backgroundProb * b2fLast; - double *foregroundPtr = &foregroundProbs.back(); - double f = *foregroundPtr; - double fromForeground = f; - - double *insertionPtr = &insertionProbs.back(); - double i = *insertionPtr; - *foregroundPtr = fromBackground + f * f2f1 + i * endGapProb; - double d = f; - --foregroundPtr; - fromBackground *= b2fGrowth; - - while (foregroundPtr > &foregroundProbs.front()) { - f = *foregroundPtr; - fromForeground += f; - i = *(insertionPtr - 1); - *foregroundPtr = fromBackground + f * f2f2 + (i + d) * oneGapProb; - *insertionPtr = f + i * g2g; - d = f + d * g2g; - --foregroundPtr; - --insertionPtr; - fromBackground *= b2fGrowth; - } - - f = *foregroundPtr; - fromForeground += f; - *foregroundPtr = fromBackground + f * f2f1 + d * endGapProb; - *insertionPtr = f; - - backgroundProb = backgroundProb * b2b + fromForeground * f2b; - } - - void calcBackwardTransitionProbsWithGaps() { - double toBackground = f2b * backgroundProb; - double *foregroundPtr = &foregroundProbs.front(); - double f = *foregroundPtr; - double toForeground = f; - - double *insertionPtr = &insertionProbs.front(); - double i = *insertionPtr; - *foregroundPtr = toBackground + f2f1 * f + i; - double d = endGapProb * f; - ++foregroundPtr; - toForeground *= b2fGrowth; - - while (foregroundPtr < &foregroundProbs.back()) { - f = *foregroundPtr; - toForeground += f; - i = *(insertionPtr + 1); - *foregroundPtr = toBackground + f2f2 * f + (i + d); - double oneGapProb_f = oneGapProb * f; - *insertionPtr = oneGapProb_f + g2g * i; - d = oneGapProb_f + g2g * d; - ++foregroundPtr; - ++insertionPtr; - toForeground *= b2fGrowth; - } - - f = *foregroundPtr; - toForeground += f; - *foregroundPtr = toBackground + f2f1 * f + d; - *insertionPtr = endGapProb * f; - - backgroundProb = b2b * backgroundProb + b2fLast * toForeground; - } - - void calcForwardTransitionProbs() { - if (endGapProb > 0) return calcForwardTransitionProbsWithGaps(); - - double b = backgroundProb; - double fromForeground = 0; - double *foregroundBeg = BEG(foregroundProbs); - - for (int i = 0; i < maxRepeatOffset; ++i) { - double f = foregroundBeg[i]; - fromForeground += f; - foregroundBeg[i] = b * b2fProbs[i] + f * f2f0; - } - - backgroundProb = b * b2b + fromForeground * f2b; - } - - void calcBackwardTransitionProbs() { - if (endGapProb > 0) return calcBackwardTransitionProbsWithGaps(); - - double toBackground = f2b * backgroundProb; - double toForeground = 0; - double *foregroundBeg = BEG(foregroundProbs); - - for (int i = 0; i < maxRepeatOffset; ++i) { - double f 
= foregroundBeg[i]; - toForeground += b2fProbs[i] * f; - foregroundBeg[i] = toBackground + f2f0 * f; - } - - backgroundProb = b2b * backgroundProb + toForeground; - } - - void addEndCounts(double forwardProb, - double totalProb, - double *transitionCounts) { - double toEnd = forwardProb * b2b / totalProb; - transitionCounts[0] += toEnd; - } - - void addTransitionCounts(double forwardProb, - double totalProb, - double *transitionCounts) { - double toBg = forwardProb * b2b / totalProb; - double toFg = forwardProb * b2fFirst / totalProb; - - transitionCounts[0] += backgroundProb * toBg; - - for (double *i = BEG(foregroundProbs); i < END(foregroundProbs); ++i) { - ++transitionCounts; - *transitionCounts += *i * toFg; - toFg *= b2fDecay; - } - } - - bool isNearSeqBeg() { - return seqPtr - seqBeg < maxRepeatOffset; - } - - int maxOffsetInTheSequence() { - return isNearSeqBeg() ? (seqPtr - seqBeg) : maxRepeatOffset; - } - - const char *seqFurthestBack() { - return isNearSeqBeg() ? seqBeg : seqPtr - maxRepeatOffset; - } - - void calcEmissionProbs() { - const double *lrRow = likelihoodRatioMatrix[(int)*seqPtr]; - const char *seqStop = seqFurthestBack(); - double *foregroundPtr = BEG(foregroundProbs); - const char *offsetPtr = seqPtr; - - while (offsetPtr > seqStop) { - --offsetPtr; - *foregroundPtr *= lrRow[(int)*offsetPtr]; - ++foregroundPtr; - } - - while (foregroundPtr < END(foregroundProbs)) { - *foregroundPtr *= 0; - ++foregroundPtr; - } - } - - void calcForwardTransitionAndEmissionProbs() { - if (endGapProb > 0) { - calcForwardTransitionProbsWithGaps(); - calcEmissionProbs(); - return; - } - - double b = backgroundProb; - double fromForeground = 0; - double *foregroundBeg = BEG(foregroundProbs); - const double *lrRow = likelihoodRatioMatrix[(int)*seqPtr]; - int maxOffset = maxOffsetInTheSequence(); - - for (int i = 0; i < maxOffset; ++i) { - double f = foregroundBeg[i]; - fromForeground += f; - foregroundBeg[i] = (b * b2fProbs[i] + f * f2f0) * lrRow[(int)seqPtr[-i-1]]; - } - - backgroundProb = b * b2b + fromForeground * f2b; - } - - void calcEmissionAndBackwardTransitionProbs() { - if (endGapProb > 0) { - calcEmissionProbs(); - calcBackwardTransitionProbsWithGaps(); - return; - } - - double toBackground = f2b * backgroundProb; - double toForeground = 0; - double *foregroundBeg = BEG(foregroundProbs); - const double *lrRow = likelihoodRatioMatrix[(int)*seqPtr]; - int maxOffset = maxOffsetInTheSequence(); - - for (int i = 0; i < maxOffset; ++i) { - double f = foregroundBeg[i] * lrRow[(int)seqPtr[-i-1]]; - toForeground += b2fProbs[i] * f; - foregroundBeg[i] = toBackground + f2f0 * f; - } - - backgroundProb = b2b * backgroundProb + toForeground; - } - - void rescale(double scale) { - backgroundProb *= scale; - multiplyAll(foregroundProbs, scale); - multiplyAll(insertionProbs, scale); - } - - void rescaleForward() { - if ((seqPtr - seqBeg) % scaleStepSize == scaleStepSize - 1) { - assert(backgroundProb > 0); - double scale = 1 / backgroundProb; - scaleFactors[(seqPtr - seqBeg) / scaleStepSize] = scale; - rescale(scale); - } - } - - void rescaleBackward() { - if ((seqPtr - seqBeg) % scaleStepSize == scaleStepSize - 1) { - double scale = scaleFactors[(seqPtr - seqBeg) / scaleStepSize]; - rescale(scale); - } - } - - void calcRepeatProbs(float *letterProbs) { - initializeForwardAlgorithm(); - - while (seqPtr < seqEnd) { - calcForwardTransitionAndEmissionProbs(); - rescaleForward(); - *letterProbs = static_cast(backgroundProb); - ++letterProbs; - ++seqPtr; - } - - double z = forwardTotal(); - - 
initializeBackwardAlgorithm(); - - while (seqPtr > seqBeg) { - --seqPtr; - --letterProbs; - double nonRepeatProb = *letterProbs * backgroundProb / z; - // Convert nonRepeatProb to a float, so that it is more likely - // to be exactly 1 when it should be, e.g. for the 1st letter of - // a sequence: - *letterProbs = 1 - static_cast(nonRepeatProb); - rescaleBackward(); - calcEmissionAndBackwardTransitionProbs(); - } - - double z2 = backwardTotal(); - checkForwardAndBackwardTotals(z, z2); - } - - void countTransitions(double *transitionCounts) { - std::vector p(seqEnd - seqBeg); - float *letterProbs = BEG(p); - - initializeForwardAlgorithm(); - - while (seqPtr < seqEnd) { - *letterProbs = static_cast(backgroundProb); - calcForwardTransitionProbs(); - calcEmissionProbs(); - rescaleForward(); - ++letterProbs; - ++seqPtr; - } - - double z = forwardTotal(); - - addEndCounts(backgroundProb, z, transitionCounts); - - initializeBackwardAlgorithm(); - - while (seqPtr > seqBeg) { - --seqPtr; - --letterProbs; - rescaleBackward(); - calcEmissionProbs(); - addTransitionCounts(*letterProbs, z, transitionCounts); - calcBackwardTransitionProbs(); - } - - double z2 = backwardTotal(); - checkForwardAndBackwardTotals(z, z2); - } - }; - - int maskSequences(char *seqBeg, - char *seqEnd, - int maxRepeatOffset, - const const_double_ptr *likelihoodRatioMatrix, - double repeatProb, - double repeatEndProb, - double repeatOffsetProbDecay, - double firstGapProb, - double otherGapProb, - double minMaskProb, - const char *maskTable) { - std::vector p(seqEnd - seqBeg); - float *probabilities = BEG(p); - - getProbabilities(seqBeg, seqEnd, maxRepeatOffset, - likelihoodRatioMatrix, repeatProb, repeatEndProb, - repeatOffsetProbDecay, firstGapProb, otherGapProb, - probabilities); - - return maskProbableLetters(seqBeg, seqEnd, probabilities, minMaskProb, maskTable); - } - - void getProbabilities(const char *seqBeg, - const char *seqEnd, - int maxRepeatOffset, - const const_double_ptr *likelihoodRatioMatrix, - double repeatProb, - double repeatEndProb, - double repeatOffsetProbDecay, - double firstGapProb, - double otherGapProb, - float *probabilities) { - Tantan tantan(seqBeg, seqEnd, maxRepeatOffset, likelihoodRatioMatrix, - repeatProb, repeatEndProb, repeatOffsetProbDecay, - firstGapProb, otherGapProb); - tantan.calcRepeatProbs(probabilities); - } - - int maskProbableLetters(char *seqBeg, - char *seqEnd, - const float *probabilities, - double minMaskProb, - const char *maskTable) { - int masked = 0; - while (seqBeg < seqEnd) { - if (*probabilities >= minMaskProb){ - *seqBeg = maskTable[(int)*seqBeg]; - masked++; - } - ++probabilities; - ++seqBeg; - } - return masked; - } - - void countTransitions(const char *seqBeg, - const char *seqEnd, - int maxRepeatOffset, - const const_double_ptr *likelihoodRatioMatrix, - double repeatProb, - double repeatEndProb, - double repeatOffsetProbDecay, - double firstGapProb, - double otherGapProb, - double *transitionCounts) { - Tantan tantan(seqBeg, seqEnd, maxRepeatOffset, likelihoodRatioMatrix, - repeatProb, repeatEndProb, repeatOffsetProbDecay, - firstGapProb, otherGapProb); - tantan.countTransitions(transitionCounts); - } - -} diff --git a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp index 1d0b93547..3967dedce 100644 --- a/src/linclust/kmermatcher.cpp +++ b/src/linclust/kmermatcher.cpp @@ -16,6 +16,7 @@ #include "FileUtil.h" #include "FastSort.h" #include "SequenceWeights.h" +#include "Masker.h" #include #include @@ -54,15 +55,14 @@ KmerPosition *initKmerPositionMemory(size_t 
size) { return hashSeqPair; } -template -std::pair fillKmerPositionArray(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution){ +template +std::pair fillKmerPositionArray(KmerPosition * kmerArray, size_t kmerArraySize, DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, + size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution){ size_t offset = 0; int querySeqType = seqDbr.getDbtype(); size_t longestKmer = par.kmerSize; - ProbabilityMatrix *probMatrix = NULL; - if (par.maskMode == 1) { - probMatrix = new ProbabilityMatrix(*subMat); - } + ScoreMatrix two; ScoreMatrix three; @@ -82,6 +82,10 @@ std::pair fillKmerPositionArray(KmerPosition fillKmerPositionArray(KmerPositionaa2num[static_cast('X')], probMatrix); - + if(masker != NULL){ + masker->maskSequence(seq, par.maskMode, par.maskProb, par.maskLowerCaseMode, par.maskNrepeats); + } size_t seqKmerCount = 0; unsigned int seqId = seq.getDbKey(); while (seq.hasNextKmer()) { @@ -362,6 +366,9 @@ std::pair fillKmerPositionArray(KmerPosition fillKmerPositionArray(KmerPositionaddCategory(MMseqsParameter::COMMAND_EXPERT); } - for (size_t i = 0; i < par.translatenucs.size(); i++){ - par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); - } for (size_t i = 0; i < par.splitsequence.size(); i++) { par.splitsequence[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } diff --git a/src/prefiltering/CacheFriendlyOperations.cpp b/src/prefiltering/CacheFriendlyOperations.cpp index 61abc40ff..5d2316d49 100644 --- a/src/prefiltering/CacheFriendlyOperations.cpp +++ b/src/prefiltering/CacheFriendlyOperations.cpp @@ -40,7 +40,7 @@ size_t CacheFriendlyOperations::findDuplicates(IndexEntryLocal **input, do { setupBinPointer(); CounterResult *lastPosition = (binDataFrame + BINCOUNT * binSize) - 1; - for (unsigned int i = indexFrom; i < indexTo; ++i) { + for (unsigned int i = indexFrom; i <= indexTo; ++i) { const size_t N = input[i + 1] - input[i]; hashIndexEntry(i, input[i], N, lastPosition); } diff --git a/src/prefiltering/IndexBuilder.cpp b/src/prefiltering/IndexBuilder.cpp index 120a173bb..57d81ec15 100644 --- a/src/prefiltering/IndexBuilder.cpp +++ b/src/prefiltering/IndexBuilder.cpp @@ -1,6 +1,7 @@ #include "IndexBuilder.h" #include "tantan.h" #include "ExtendedSubstitutionMatrix.h" +#include "Masker.h" #ifdef OPENMP #include @@ -51,11 +52,10 @@ class DbInfo { }; -void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedLookup, - SequenceLookup **unmaskedLookup,BaseMatrix &subMat, +void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup ** externalLookup, BaseMatrix &subMat, ScoreMatrix & three, ScoreMatrix & two, Sequence *seq, DBReader *dbr, size_t dbFrom, size_t dbTo, int kmerThr, - bool mask, bool maskLowerCaseMode, float maskProb, int targetSearchMode) { + bool mask, bool maskLowerCaseMode, float maskProb, int maskNrepeats, int targetSearchMode) { Debug(Debug::INFO) << "Index table: counting k-mers\n"; const bool isProfile = Parameters::isEqualDbtype(seq->getSeqType(), Parameters::DBTYPE_HMM_PROFILE); @@ -64,32 +64,14 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL size_t dbSize = dbTo - dbFrom; DbInfo* info = new DbInfo(dbFrom, dbTo, seq->getEffectiveKmerSize(), *dbr); - SequenceLookup *sequenceLookup; - if (unmaskedLookup != NULL && maskedLookup == NULL) { - *unmaskedLookup = new 
SequenceLookup(dbSize, info->aaDbSize); - sequenceLookup = *unmaskedLookup; - } else if (unmaskedLookup == NULL && maskedLookup != NULL) { - *maskedLookup = new SequenceLookup(dbSize, info->aaDbSize); - sequenceLookup = *maskedLookup; - } else if (unmaskedLookup != NULL && maskedLookup != NULL) { - *unmaskedLookup = new SequenceLookup(dbSize, info->aaDbSize); - *maskedLookup = new SequenceLookup(dbSize, info->aaDbSize); - sequenceLookup = *maskedLookup; - } else{ - Debug(Debug::ERROR) << "This should not happen\n"; - EXIT(EXIT_FAILURE); - } + *externalLookup = new SequenceLookup(dbSize, info->aaDbSize); + SequenceLookup *sequenceLookup = *externalLookup; - // need to prune low scoring k-mers through masking - ProbabilityMatrix *probMatrix = NULL; - if (maskedLookup != NULL) { - probMatrix = new ProbabilityMatrix(subMat); - } // identical scores for memory reduction code char *idScoreLookup = getScoreLookup(subMat); Debug::Progress progress(dbTo-dbFrom); - + bool needMasking = (mask == 1 || maskNrepeats > 0 || maskLowerCaseMode == 1); size_t maskedResidues = 0; size_t totalKmerCount = 0; #pragma omp parallel @@ -98,12 +80,19 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL #ifdef OPENMP thread_idx = static_cast(omp_get_thread_num()); #endif + // need to prune low scoring k-mers through masking + Masker *masker = NULL; + if (needMasking) { + masker = new Masker(subMat); + } - Indexer idxer(static_cast(indexTable->getAlphabetSize()), seq->getKmerSize()); + unsigned int alphabetSize = (indexTable != NULL) ? static_cast(indexTable->getAlphabetSize()) + : static_cast(subMat.alphabetSize); + Indexer idxer(alphabetSize, seq->getKmerSize()); Sequence s(seq->getMaxLen(), seq->getSeqType(), &subMat, seq->getKmerSize(), seq->isSpaced(), false, true, seq->getUserSpacedKmerPattern()); KmerGenerator *generator = NULL; - if (isTargetSimiliarKmerSearch) { + if (isTargetSimiliarKmerSearch && indexTable != NULL) { generator = new KmerGenerator(seq->getKmerSize(), indexTable->getAlphabetSize(), kmerThr); if(isProfile){ generator->setDivideStrategy(s.profile_matrix); @@ -130,47 +119,21 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL // count similar or exact k-mers based on sequence type if (isTargetSimiliarKmerSearch) { // Find out if we should also mask profiles - totalKmerCount += indexTable->addSimilarKmerCount(&s, generator); - unsigned char * seq = (isProfile) ? s.numConsensusSequence : s.numSequence; - if (unmaskedLookup != NULL) { - (*unmaskedLookup)->addSequence(seq, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]); - } else if (maskedLookup != NULL) { - (*maskedLookup)->addSequence(seq, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]); + if(indexTable != NULL){ + totalKmerCount += indexTable->addSimilarKmerCount(&s, generator); } + unsigned char * seq = (isProfile) ? 
s.numConsensusSequence : s.numSequence; + + sequenceLookup->addSequence(seq, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]); + } else { // Do not mask if column state sequences are used - if (unmaskedLookup != NULL) { - (*unmaskedLookup)->addSequence(s.numSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]); - } - if (mask == true) { - // s.print(); - maskedResidues += tantan::maskSequences((char*)s.numSequence, - (char*)(s.numSequence + s.L), - 50 /*options.maxCycleLength*/, - probMatrix->probMatrixPointers, - 0.005 /*options.repeatProb*/, - 0.05 /*options.repeatEndProb*/, - 0.9 /*options.repeatOffsetProbDecay*/, - 0, 0, - maskProb /*options.minMaskProb*/, - probMatrix->hardMaskTable); - } + maskedResidues += masker->maskSequence(s, mask, maskProb, maskLowerCaseMode, maskNrepeats); + sequenceLookup->addSequence(s.numSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]); - if(maskLowerCaseMode == true && (Parameters::isEqualDbtype(s.getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS) || - Parameters::isEqualDbtype(s.getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES))) { - const char * charSeq = s.getSeqData(); - unsigned char maskLetter = subMat.aa2num[static_cast('X')]; - for (int i = 0; i < s.L; i++) { - bool isLowerCase = (islower(charSeq[i])); - maskedResidues += isLowerCase; - s.numSequence[i] = isLowerCase ? maskLetter : s.numSequence[i]; - } - } - if(maskedLookup != NULL){ - (*maskedLookup)->addSequence(s.numSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]); + if(indexTable != NULL){ + totalKmerCount += indexTable->addKmerCount(&s, &idxer, buffer, kmerThr, idScoreLookup); } - - totalKmerCount += indexTable->addKmerCount(&s, &idxer, buffer, kmerThr, idScoreLookup); } } @@ -179,21 +142,21 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL if (generator != NULL) { delete generator; } + if(masker != NULL) { + delete masker; + } } - if(probMatrix != NULL) { - delete probMatrix; - } + Debug(Debug::INFO) << "Index table: Masked residues: " << maskedResidues << "\n"; - if(totalKmerCount == 0) { - Debug(Debug::ERROR) << "No k-mer could be extracted for the database " << dbr->getDataFileName() << ".\n" + if(indexTable != NULL && totalKmerCount == 0) { + Debug(Debug::WARNING) << "No k-mer could be extracted for the database " << dbr->getDataFileName() << ".\n" << "Maybe the sequences length is less than 14 residues.\n"; if (maskedResidues == true){ - Debug(Debug::ERROR) << " or contains only low complexity regions."; - Debug(Debug::ERROR) << "Use --mask 0 to deactivate the low complexity filter.\n"; + Debug(Debug::WARNING) << " or contains only low complexity regions."; + Debug(Debug::WARNING) << "Use --mask 0 to deactivate the low complexity filter.\n"; } - EXIT(EXIT_FAILURE); } dbr->remapData(); @@ -211,58 +174,66 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL // } // Debug(Debug::INFO) << "Index table: Remove "<< lowSelectiveResidues <<" none selective residues\n"; // Debug(Debug::INFO) << "Index table: init... 
from "<< dbFrom << " to "<< dbTo << "\n"; - - indexTable->initMemory(info->tableSize); - indexTable->init(); + if(indexTable != NULL){ + indexTable->initMemory(info->tableSize); + indexTable->init(); + } delete info; - Debug::Progress progress2(dbTo-dbFrom); - - Debug(Debug::INFO) << "Index table: fill\n"; - #pragma omp parallel - { - unsigned int thread_idx = 0; + if(indexTable != NULL) { + Debug::Progress progress2(dbTo - dbFrom); + Debug(Debug::INFO) << "Index table: fill\n"; +#pragma omp parallel + { + unsigned int thread_idx = 0; #ifdef OPENMP - thread_idx = static_cast(omp_get_thread_num()); + thread_idx = static_cast(omp_get_thread_num()); #endif - Sequence s(seq->getMaxLen(), seq->getSeqType(), &subMat, seq->getKmerSize(), seq->isSpaced(), false, true, seq->getUserSpacedKmerPattern()); - Indexer idxer(static_cast(indexTable->getAlphabetSize()), seq->getKmerSize()); - IndexEntryLocalTmp *buffer = static_cast(malloc( seq->getMaxLen() * sizeof(IndexEntryLocalTmp))); - size_t bufferSize = seq->getMaxLen(); - KmerGenerator *generator = NULL; - if (isTargetSimiliarKmerSearch) { - generator = new KmerGenerator(seq->getKmerSize(), indexTable->getAlphabetSize(), kmerThr); - if(isProfile){ - generator->setDivideStrategy(s.profile_matrix); - }else{ - generator->setDivideStrategy(&three, &two); + Sequence s(seq->getMaxLen(), seq->getSeqType(), &subMat, seq->getKmerSize(), seq->isSpaced(), false, true, + seq->getUserSpacedKmerPattern()); + unsigned int alphabetSize = (indexTable != NULL) ? static_cast(indexTable->getAlphabetSize()) + : static_cast(subMat.alphabetSize); + Indexer idxer(alphabetSize, seq->getKmerSize()); + IndexEntryLocalTmp *buffer = static_cast(malloc( + seq->getMaxLen() * sizeof(IndexEntryLocalTmp))); + size_t bufferSize = seq->getMaxLen(); + KmerGenerator *generator = NULL; + if (isTargetSimiliarKmerSearch) { + generator = new KmerGenerator(seq->getKmerSize(), indexTable->getAlphabetSize(), kmerThr); + if (isProfile) { + generator->setDivideStrategy(s.profile_matrix); + } else { + generator->setDivideStrategy(&three, &two); + } } - } - #pragma omp for schedule(dynamic, 100) - for (size_t id = dbFrom; id < dbTo; id++) { - s.resetCurrPos(); - progress2.updateProgress(); +#pragma omp for schedule(dynamic, 100) + for (size_t id = dbFrom; id < dbTo; id++) { + s.resetCurrPos(); + progress2.updateProgress(); + + unsigned int qKey = dbr->getDbKey(id); + if (isTargetSimiliarKmerSearch) { + s.mapSequence(id - dbFrom, qKey, dbr->getData(id, thread_idx), dbr->getSeqLen(id)); + indexTable->addSimilarSequence(&s, generator, &buffer, bufferSize, &idxer); + } else { + s.mapSequence(id - dbFrom, qKey, sequenceLookup->getSequence(id - dbFrom)); + indexTable->addSequence(&s, &idxer, &buffer, bufferSize, kmerThr, idScoreLookup); + } + } - unsigned int qKey = dbr->getDbKey(id); - if (isTargetSimiliarKmerSearch) { - s.mapSequence(id - dbFrom, qKey, dbr->getData(id, thread_idx), dbr->getSeqLen(id)); - indexTable->addSimilarSequence(&s, generator, &buffer, bufferSize, &idxer); - } else { - s.mapSequence(id - dbFrom, qKey, sequenceLookup->getSequence(id - dbFrom)); - indexTable->addSequence(&s, &idxer, &buffer, bufferSize, kmerThr, idScoreLookup); + if (generator != NULL) { + delete generator; } - } - if (generator != NULL) { - delete generator; + free(buffer); } - - free(buffer); } if(idScoreLookup!=NULL){ delete[] idScoreLookup; } - indexTable->revertPointer(); - indexTable->sortDBSeqLists(); + if(indexTable != NULL){ + indexTable->revertPointer(); + indexTable->sortDBSeqLists(); + } } diff --git 
a/src/prefiltering/IndexBuilder.h b/src/prefiltering/IndexBuilder.h index 3a61df189..b384c1e60 100644 --- a/src/prefiltering/IndexBuilder.h +++ b/src/prefiltering/IndexBuilder.h @@ -6,11 +6,10 @@ class IndexBuilder { public: - static void fillDatabase(IndexTable *indexTable, SequenceLookup **maskedLookup, SequenceLookup **unmaskedLookup, - BaseMatrix &subMat, + static void fillDatabase(IndexTable *indexTable, SequenceLookup **externalLookup, BaseMatrix &subMat, ScoreMatrix & three, ScoreMatrix & two, Sequence *seq, DBReader *dbr, size_t dbFrom, size_t dbTo, int kmerThr, - bool mask, bool maskLowerCaseMode, float maskProb, int targetSearchMode); + bool mask, bool maskLowerCaseMode, float maskProb, int maskNrepeats, int targetSearchMode); }; #endif diff --git a/src/prefiltering/Prefiltering.cpp b/src/prefiltering/Prefiltering.cpp index 14e8992ab..166d39c19 100644 --- a/src/prefiltering/Prefiltering.cpp +++ b/src/prefiltering/Prefiltering.cpp @@ -37,6 +37,7 @@ Prefiltering::Prefiltering(const std::string &queryDB, maskMode(par.maskMode), maskLowerCaseMode(par.maskLowerCaseMode), maskProb(par.maskProb), + maskNrepeats(par.maskNrepeats), splitMode(par.splitMode), scoringMatrixFile(par.scoringMatrixFile), seedScoringMatrixFile(par.seedScoringMatrixFile), @@ -530,14 +531,13 @@ void Prefiltering::getIndexTable(int split, size_t dbFrom, size_t dbSize) { Parameters::isEqualDbtype(targetSeqType,Parameters::DBTYPE_AMINO_ACIDS)) ? alphabetSize -1 : alphabetSize; indexTable = new IndexTable(adjustAlphabetSize, kmerSize, false); - SequenceLookup **unmaskedLookup = maskMode == 0 && maskLowerCaseMode == 0 ? &sequenceLookup : NULL; - SequenceLookup **maskedLookup = maskMode == 1 || maskLowerCaseMode == 1 ? &sequenceLookup : NULL; Debug(Debug::INFO) << "Index table k-mer threshold: " << localKmerThr << " at k-mer size " << kmerSize << " \n"; - IndexBuilder::fillDatabase(indexTable, maskedLookup, unmaskedLookup, *kmerSubMat, + IndexBuilder::fillDatabase(indexTable, &sequenceLookup, *kmerSubMat, _3merSubMatrix, _2merSubMatrix, &tseq, tdbr, dbFrom, dbFrom + dbSize, - localKmerThr, maskMode, maskLowerCaseMode, maskProb, targetSearchMode); + localKmerThr, maskMode, maskLowerCaseMode, + maskProb, maskNrepeats, targetSearchMode); // sequenceLookup has to be temporarily present to speed up masking // afterwards its not needed anymore without diagonal scoring @@ -762,7 +762,6 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu size_t doubleMatches = 0; size_t querySeqLenSum = 0; size_t resSize = 0; - size_t realResSize = 0; size_t diagonalOverflow = 0; size_t totalQueryDBSize = querySize; @@ -882,7 +881,6 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu querySeqLenSum += seq.L; diagonalOverflow += matcher.getStatistics()->diagonalOverflow; resSize += resultSize; - realResSize += std::min(resultSize, maxResListLen); reslens[thread_idx]->emplace_back(resultSize); } } // step end diff --git a/src/prefiltering/Prefiltering.h b/src/prefiltering/Prefiltering.h index 1c10ef8d6..81994843b 100644 --- a/src/prefiltering/Prefiltering.h +++ b/src/prefiltering/Prefiltering.h @@ -90,6 +90,7 @@ class Prefiltering { int maskMode; int maskLowerCaseMode; float maskProb; + int maskNrepeats; int splitMode; int kmerThr; MultiParam> scoringMatrixFile; diff --git a/src/prefiltering/PrefilteringIndexReader.cpp b/src/prefiltering/PrefilteringIndexReader.cpp index e5df17dcc..603448bc9 100644 --- a/src/prefiltering/PrefilteringIndexReader.cpp +++ 
b/src/prefiltering/PrefilteringIndexReader.cpp @@ -56,8 +56,12 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB, BaseMatrix *subMat, int maxSeqLen, bool hasSpacedKmer, const std::string &spacedKmerPattern, bool compBiasCorrection, int alphabetSize, int kmerSize, int maskMode, - int maskLowerCase, float maskProb, int kmerThr, int targetSearchMode, int splits, + int maskLowerCase, float maskProb, int maskNrepeats, int kmerThr, int targetSearchMode, int splits, int indexSubset) { + const bool noKmerIndex = (indexSubset & Parameters::INDEX_SUBSET_NO_PREFILTER) != 0; + if (noKmerIndex) { + splits = 1; + } const int SPLIT_META = splits > 1 ? 0 : 0; const int SPLIT_SEQS = splits > 1 ? 1 : 0; @@ -190,14 +194,9 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB, (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES) || Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_AMINO_ACIDS)) ? alphabetSize -1: alphabetSize; - const bool noPrefilter = (indexSubset & Parameters::INDEX_SUBSET_NO_PREFILTER) != 0; - if (noPrefilter) { - splits = 0; - } - ScoreMatrix s3; ScoreMatrix s2; - if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && noPrefilter == false) { + if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && noKmerIndex == false) { int alphabetSize = subMat->alphabetSize; subMat->alphabetSize = subMat->alphabetSize-1; s3 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 3); @@ -225,35 +224,53 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB, continue; } - IndexTable indexTable(adjustAlphabetSize, kmerSize, false); + IndexTable * indexTable; + if(noKmerIndex){ + indexTable = NULL; + } else { + indexTable = new IndexTable(adjustAlphabetSize, kmerSize, false); + } SequenceLookup *sequenceLookup = NULL; - IndexBuilder::fillDatabase(&indexTable, - (maskMode == 1 || maskLowerCase == 1) ? &sequenceLookup : NULL, - (maskMode == 0 && maskLowerCase == 0) ? &sequenceLookup : NULL, + IndexBuilder::fillDatabase(indexTable, &sequenceLookup, *subMat, s3, s2, &seq, dbr1, dbFrom, dbFrom + dbSize, kmerThr, - maskMode, maskLowerCase, maskProb, targetSearchMode); - indexTable.printStatistics(subMat->num2aa); + maskMode, maskLowerCase, maskProb, maskNrepeats, targetSearchMode); if (sequenceLookup == NULL) { Debug(Debug::ERROR) << "Invalid mask mode. 
No sequence lookup created!\n"; EXIT(EXIT_FAILURE); } - - // save the entries unsigned int keyOffset = 1000 * s; - Debug(Debug::INFO) << "Write ENTRIES (" << (keyOffset + ENTRIES) << ")\n"; - char *entries = (char *) indexTable.getEntries(); - size_t entriesSize = indexTable.getTableEntriesNum() * indexTable.getSizeOfEntry(); - writer.writeData(entries, entriesSize, (keyOffset + ENTRIES), SPLIT_INDX + s); - writer.alignToPageSize(SPLIT_INDX + s); + if(noKmerIndex == false){ + indexTable->printStatistics(subMat->num2aa); + // save the entries + Debug(Debug::INFO) << "Write ENTRIES (" << (keyOffset + ENTRIES) << ")\n"; + char *entries = (char *) indexTable->getEntries(); + size_t entriesSize = indexTable->getTableEntriesNum() * indexTable->getSizeOfEntry(); + writer.writeData(entries, entriesSize, (keyOffset + ENTRIES), SPLIT_INDX + s); + writer.alignToPageSize(SPLIT_INDX + s); + + // save the size + Debug(Debug::INFO) << "Write ENTRIESOFFSETS (" << (keyOffset + ENTRIESOFFSETS) << ")\n"; + char *offsets = (char *) indexTable->getOffsets(); + size_t offsetsSize = (indexTable->getTableSize() + 1) * sizeof(size_t); + writer.writeData(offsets, offsetsSize, (keyOffset + ENTRIESOFFSETS), SPLIT_INDX + s); + writer.alignToPageSize(SPLIT_INDX + s); + indexTable->deleteEntries(); + + // ENTRIESNUM + Debug(Debug::INFO) << "Write ENTRIESNUM (" << (keyOffset + ENTRIESNUM) << ")\n"; + uint64_t entriesNum = indexTable->getTableEntriesNum(); + char *entriesNumPtr = (char *) &entriesNum; + writer.writeData(entriesNumPtr, 1 * sizeof(uint64_t), (keyOffset + ENTRIESNUM), SPLIT_INDX + s); + writer.alignToPageSize(SPLIT_INDX + s); - // save the size - Debug(Debug::INFO) << "Write ENTRIESOFFSETS (" << (keyOffset + ENTRIESOFFSETS) << ")\n"; - char *offsets = (char*)indexTable.getOffsets(); - size_t offsetsSize = (indexTable.getTableSize() + 1) * sizeof(size_t); - writer.writeData(offsets, offsetsSize, (keyOffset + ENTRIESOFFSETS), SPLIT_INDX + s); + } + // SEQCOUNT + Debug(Debug::INFO) << "Write SEQCOUNT (" << (keyOffset + SEQCOUNT) << ")\n"; + size_t tablesize = sequenceLookup->getSequenceCount(); + char *tablesizePtr = (char *) &tablesize; + writer.writeData(tablesizePtr, 1 * sizeof(size_t), (keyOffset + SEQCOUNT), SPLIT_INDX + s); writer.alignToPageSize(SPLIT_INDX + s); - indexTable.deleteEntries(); Debug(Debug::INFO) << "Write SEQINDEXDATASIZE (" << (keyOffset + SEQINDEXDATASIZE) << ")\n"; int64_t seqindexDataSize = sequenceLookup->getDataSize(); @@ -271,20 +288,9 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB, writer.writeData(sequenceLookup->getData(), (sequenceLookup->getDataSize() + 1) * sizeof(char), (keyOffset + SEQINDEXDATA), SPLIT_INDX + s); writer.alignToPageSize(SPLIT_INDX + s); delete sequenceLookup; - - // ENTRIESNUM - Debug(Debug::INFO) << "Write ENTRIESNUM (" << (keyOffset + ENTRIESNUM) << ")\n"; - uint64_t entriesNum = indexTable.getTableEntriesNum(); - char *entriesNumPtr = (char *) &entriesNum; - writer.writeData(entriesNumPtr, 1 * sizeof(uint64_t), (keyOffset + ENTRIESNUM), SPLIT_INDX + s); - writer.alignToPageSize(SPLIT_INDX + s); - - // SEQCOUNT - Debug(Debug::INFO) << "Write SEQCOUNT (" << (keyOffset + SEQCOUNT) << ")\n"; - size_t tablesize = indexTable.getSize(); - char *tablesizePtr = (char *) &tablesize; - writer.writeData(tablesizePtr, 1 * sizeof(size_t), (keyOffset + SEQCOUNT), SPLIT_INDX + s); - writer.alignToPageSize(SPLIT_INDX + s); + if(indexTable != NULL){ + delete indexTable; + } } if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) 
== false && indexSubset != Parameters::INDEX_SUBSET_NO_PREFILTER) { @@ -587,7 +593,7 @@ std::string PrefilteringIndexReader::searchForIndex(const std::string &pathToDB) return ""; } -std::string PrefilteringIndexReader::dbPathWithoutIndex(std::string & dbname) { +std::string PrefilteringIndexReader::dbPathWithoutIndex(const std::string& dbname) { std::string rawname = dbname; // check for .idx size_t idxlastpos = dbname.rfind(".idx"); diff --git a/src/prefiltering/PrefilteringIndexReader.h b/src/prefiltering/PrefilteringIndexReader.h index c5256cc20..b2880b84e 100644 --- a/src/prefiltering/PrefilteringIndexReader.h +++ b/src/prefiltering/PrefilteringIndexReader.h @@ -60,7 +60,7 @@ class PrefilteringIndexReader { DBReader *alndbr, BaseMatrix *seedSubMat, int maxSeqLen, bool spacedKmer, const std::string &spacedKmerPattern, bool compBiasCorrection, int alphabetSize, int kmerSize, int maskMode, - int maskLowerCase, float maskProb, int kmerThr, int targetSearchMode, int splits, int indexSubset = 0); + int maskLowerCase, float maskProb, int maskNrepeats, int kmerThr, int targetSearchMode, int splits, int indexSubset = 0); static DBReader *openNewHeaderReader(DBReader*dbr, unsigned int dataIdx, unsigned int indexIdx, int threads, bool touchIndex, bool touchData); @@ -86,7 +86,7 @@ class PrefilteringIndexReader { static std::string searchForIndex(const std::string &pathToDB); - static std::string dbPathWithoutIndex(std::string &dbname); + static std::string dbPathWithoutIndex(const std::string &dbname); private: static void printMeta(int *meta); diff --git a/src/prefiltering/QueryMatcher.cpp b/src/prefiltering/QueryMatcher.cpp index 4666bcfe8..994464a39 100644 --- a/src/prefiltering/QueryMatcher.cpp +++ b/src/prefiltering/QueryMatcher.cpp @@ -323,11 +323,14 @@ size_t QueryMatcher::match(Sequence *seq, float *compositionBias) { outer: indexPointer[indexTo + 1] = databaseHits + numMatches; // fill the output - size_t hitCount = findDuplicates(indexPointer, foundDiagonals + overflowHitCount, + size_t hitCount = 0; + if(numMatches > 0){ + hitCount = findDuplicates(indexPointer, foundDiagonals + overflowHitCount, foundDiagonalsSize - overflowHitCount, indexStart, indexTo, (diagonalScoring == false)); - if (overflowHitCount != 0) { - // overflow occurred - hitCount = mergeElements(foundDiagonals, overflowHitCount + hitCount); + if (overflowHitCount != 0) { + // overflow occurred + hitCount = mergeElements(foundDiagonals, overflowHitCount + hitCount); + } } stats->doubleMatches = 0; if (diagonalScoring == false) { diff --git a/src/prefiltering/UngappedAlignment.cpp b/src/prefiltering/UngappedAlignment.cpp index 54cef6f50..1aec6b267 100644 --- a/src/prefiltering/UngappedAlignment.cpp +++ b/src/prefiltering/UngappedAlignment.cpp @@ -249,7 +249,7 @@ void UngappedAlignment::scoreDiagonalAndUpdateHits(const char * queryProfile, // hack to avoid too long sequences // this sequences will be processed by computeLongScore later seqs[seqIdx].seq = (unsigned char *) tmp.first; - seqs[seqIdx].seqLen = 1; + seqs[seqIdx].seqLen = 0; seqs[seqIdx].id = seqIdx; }else{ seqs[seqIdx].seq = (unsigned char *) tmp.first; @@ -276,7 +276,7 @@ void UngappedAlignment::scoreDiagonalAndUpdateHits(const char * queryProfile, unsigned int minSeqLen = std::min(targetMaxLen - minDistToDiagonal, queryLen); for(size_t i = 0; i < DIAGONALBINSIZE; i++) { tmpSeqs[i] = seqs[i].seq + minDistToDiagonal; - seqLength[i] = std::min(seqs[i].seqLen - minDistToDiagonal, minSeqLen); + seqLength[i] = (seqs[i].seqLen > minDistToDiagonal) ? 
std::min(seqs[i].seqLen - minDistToDiagonal, minSeqLen) : 0; } unrolledDiagonalScoring(queryProfile, seqLength, tmpSeqs, score_arr); @@ -286,7 +286,7 @@ void UngappedAlignment::scoreDiagonalAndUpdateHits(const char * queryProfile, for(size_t hitIdx = 0; hitIdx < hitSize; hitIdx++){ hits[seqs[hitIdx].id]->count = static_cast(std::min(static_cast(255), score_arr[hitIdx])); - if(seqs[hitIdx].seqLen == 1){ + if(seqs[hitIdx].seqLen == 0){ std::pair dbSeq = sequenceLookup->getSequence(hits[hitIdx]->id); if(dbSeq.second >= 32768){ int max = computeLongScore(queryProfile, queryLen, dbSeq, diagonal); diff --git a/src/prefiltering/ungappedprefilter.cpp b/src/prefiltering/ungappedprefilter.cpp index 84bc0a96b..29347c38c 100644 --- a/src/prefiltering/ungappedprefilter.cpp +++ b/src/prefiltering/ungappedprefilter.cpp @@ -10,81 +10,287 @@ #include "DBReader.h" #include "DBWriter.h" #include "QueryMatcher.h" -#include "QueryMatcher.h" #include "NucleotideMatrix.h" #include "FastSort.h" #include "SubstitutionMatrixProfileStates.h" #include "IndexReader.h" #include "QueryMatcherTaxonomyHook.h" + +#include +#include +#include +#include + #ifdef OPENMP #include #endif +// #define HAVE_CUDA 1 +#ifdef HAVE_CUDA +#include "GpuUtil.h" +#include "Alignment.h" -int prefilterInternal(int argc, const char **argv, const Command &command, int mode) { - Parameters &par = Parameters::getInstance(); - par.parseParameters(argc, argv, command, true, 0, 0); - DBWriter resultWriter(par.db3.c_str(), par.db3Index.c_str(), 1, par.compressed, Parameters::DBTYPE_PREFILTER_RES); - resultWriter.open(); - bool sameDB = (par.db2.compare(par.db1) == 0); - bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); - IndexReader tDbrIdx(par.db2, par.threads, IndexReader::SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0 ); - IndexReader * qDbrIdx = NULL; - DBReader * qdbr = NULL; - DBReader * tdbr = tDbrIdx.sequenceReader; - const int targetSeqType = tdbr->getDbtype(); - int querySeqType; - if (sameDB == true) { - qDbrIdx = &tDbrIdx; - qdbr = tdbr; - querySeqType = targetSeqType; - } else { - // open the sequence, prefiltering and output databases - qDbrIdx = new IndexReader(par.db1, par.threads, IndexReader::SEQUENCES, (touch) ? 
IndexReader::PRELOAD_INDEX : 0); - qdbr = qDbrIdx->sequenceReader; - querySeqType = qdbr->getDbtype(); +#endif + +#ifdef HAVE_CUDA +void runFilterOnGpu(Parameters & par, BaseMatrix * subMat, + DBReader * qdbr, DBReader * tdbr, + bool sameDB, DBWriter & resultWriter, EvalueComputation * evaluer, + QueryMatcherTaxonomyHook *taxonomyHook){ + Debug::Progress progress(qdbr->getSize()); + const int querySeqType = qdbr->getDbtype(); + Sequence qSeq(par.maxSeqLen, querySeqType, subMat, 0, false, par.compBiasCorrection); + + std::vector results; + results.reserve(par.maxResListLen); + std::vector shortResults; + std::vector resultsAln; + + size_t profileBufferLength = par.maxSeqLen; + int8_t* profile = NULL; + if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_HMM_PROFILE) == false) { + profile = (int8_t*)malloc(subMat->alphabetSize * profileBufferLength * sizeof(int8_t)); } - SequenceLookup * sequenceLookup = NULL; - if(Parameters::isEqualDbtype(tDbrIdx.getDbtype(), Parameters::DBTYPE_INDEX_DB)){ - PrefilteringIndexData data = PrefilteringIndexReader::getMetadata(tDbrIdx.index); - if(data.splits == 1){ - sequenceLookup = PrefilteringIndexReader::getSequenceLookup(0, tDbrIdx.index, par.preloadMode); - } + std::string resultBuffer; + resultBuffer.reserve(262144); + char buffer[1024+32768]; + + size_t compBufferSize = (par.maxSeqLen + 1) * sizeof(float); + float *compositionBias = NULL; + if (par.compBiasCorrection == true) { + compositionBias = (float*)malloc(compBufferSize); + memset(compositionBias, 0, compBufferSize); } - BaseMatrix *subMat; - EvalueComputation * evaluer; - int8_t * tinySubMat; - if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) { - subMat = new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0); - evaluer = new EvalueComputation(tdbr->getAminoAcidDBSize(), subMat); - tinySubMat = new int8_t[subMat->alphabetSize*subMat->alphabetSize]; - for (int i = 0; i < subMat->alphabetSize; i++) { - for (int j = 0; j < subMat->alphabetSize; j++) { - tinySubMat[i*subMat->alphabetSize + j] = subMat->subMatrix[i][j]; + + std::string hash = ""; + if (par.gpuServer != 0) { + hash = GPUSharedMemory::getShmHash(par.db2); + std::string path = "/dev/shm/" + hash; + int waitTimeout = par.gpuServerWaitTimeout; + std::chrono::steady_clock::time_point startTime = std::chrono::steady_clock::now(); + bool statusPrinted = false; + while (true) { + size_t shmSize = FileUtil::getFileSize(path); + // server is ready once the shm file exists and is not 0 byte large + if (shmSize != (size_t)-1 && shmSize > 0) { + break; + } + + if (waitTimeout == 0) { + Debug(Debug::ERROR) + << "gpuserver for database " << par.db2 << " not found.\n" + << "Please start gpuserver with the same CUDA_VISIBLE_DEVICES\n"; + EXIT(EXIT_FAILURE); } + + if (waitTimeout > 0) { + if (statusPrinted == false) { + Debug(Debug::INFO) << "Waiting for `gpuserver`"; + statusPrinted = true; + } else { + Debug(Debug::INFO) << "."; + } + std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now(); + auto elapsed = std::chrono::duration_cast(now - startTime).count(); + if (elapsed >= waitTimeout) { + Debug(Debug::ERROR) + << "gpuserver for database " << par.db2 << " not found after " << elapsed << "seconds.\n" + << "Please start gpuserver with the same CUDA_VISIBLE_DEVICES\n"; + EXIT(EXIT_FAILURE); + } + } + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } + if (waitTimeout > 0 && statusPrinted) { + Debug(Debug::INFO) << "\n"; + } + } + + size_t* 
offsetData = NULL; + int32_t* lengthData = NULL; + std::vector offsets; + std::vector lengths; + GPUSharedMemory* layout = NULL; + pid_t pid = 0; // current process ID, only for server + if (hash.empty()) { + offsets.reserve(tdbr->getSize() + 1); + lengths.reserve(tdbr->getSize()); + for (size_t id = 0; id < tdbr->getSize(); id++) { + offsets.emplace_back(tdbr->getIndex()[id].offset); + lengths.emplace_back(tdbr->getIndex()[id].length - 2); } + offsets.emplace_back(offsets.back() + lengths.back()); + offsetData = offsets.data(); + lengthData = lengths.data(); } else { - // keep score bias at 0.0 (improved ROC) - subMat = new SubstitutionMatrix(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, 0.0); - evaluer = new EvalueComputation(tdbr->getAminoAcidDBSize(), subMat); - tinySubMat = new int8_t[subMat->alphabetSize*subMat->alphabetSize]; - for (int i = 0; i < subMat->alphabetSize; i++) { - for (int j = 0; j < subMat->alphabetSize; j++) { - tinySubMat[i*subMat->alphabetSize + j] = subMat->subMatrix[i][j]; + pid = getpid(); + layout = GPUSharedMemory::openSharedMemory(hash); + } + + const bool serverMode = par.gpuServer; + Marv* marv = NULL; + if (serverMode == 0) { + if (offsetData == NULL || lengthData == NULL) { + Debug(Debug::ERROR) << "Invalid GPU database\n"; + EXIT(EXIT_FAILURE); + } + int32_t maxTargetLength = lengths.back(); + Marv::AlignmentType type = (par.prefMode == Parameters::PREF_MODE_UNGAPPED_AND_GAPPED) ? + Marv::AlignmentType::GAPLESS_SMITH_WATERMAN : Marv::AlignmentType::GAPLESS; + marv = new Marv(tdbr->getSize(), subMat->alphabetSize, maxTargetLength, + par.maxResListLen, type); + void* h = marv->loadDb( + tdbr->getDataForFile(0), offsetData, lengthData, tdbr->getDataSizeForFile(0) + ); + marv->setDb(h); + } else if (layout == NULL) { + Debug(Debug::ERROR) << "No GPU server shared memory connection\n"; + EXIT(EXIT_FAILURE); + } + + // marv.prefetch(); + for (size_t id = 0; id < qdbr->getSize(); id++) { + size_t queryKey = qdbr->getDbKey(id); + unsigned int querySeqLen = qdbr->getSeqLen(id); + char *querySeqData = qdbr->getData(id, 0); + qSeq.mapSequence(id, queryKey, querySeqData, querySeqLen); + if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_HMM_PROFILE)) { + profile = qSeq.profile_for_alignment; + } else { + if ((size_t)qSeq.L >= profileBufferLength) { + profileBufferLength = (size_t)qSeq.L * 1.5; + profile = (int8_t*)realloc(profile, subMat->alphabetSize * profileBufferLength * sizeof(int8_t)); + } + if (compositionBias != NULL) { + if ((size_t)qSeq.L >= compBufferSize) { + compBufferSize = (size_t)qSeq.L * 1.5 * sizeof(float); + compositionBias = (float*)realloc(compositionBias, compBufferSize); + // memset(compositionBias, 0, compBufferSize); + } + SubstitutionMatrix::calcLocalAaBiasCorrection(subMat, qSeq.numSequence, qSeq.L, compositionBias, par.compBiasCorrectionScale); + } + for (size_t j = 0; j < (size_t)subMat->alphabetSize; ++j) { + for (size_t i = 0; i < (size_t)qSeq.L; ++i) { + short bias = 0; + if (compositionBias != NULL) { + bias = static_cast((compositionBias[i] < 0.0) ? 
(compositionBias[i] - 0.5) : (compositionBias[i] + 0.5)); + } + profile[j * qSeq.L + i] = subMat->subMatrix[j][qSeq.numSequence[i]] + bias; + } } } - } + Marv::Stats stats; + if (serverMode == 0) { + stats = marv->scan(reinterpret_cast(qSeq.numSequence), qSeq.L, profile, results.data()); + } else { + while(layout->trySetServerReady(pid)==false) { + std::this_thread::yield(); + } + memcpy(layout->getQueryPtr(), qSeq.numSequence, qSeq.L); + memcpy(layout->getProfilePtr(), profile, subMat->alphabetSize * qSeq.L); + layout->queryLen = qSeq.L; + layout->clientReady.store(1, std::memory_order_release); + while(layout->serverReady.load(std::memory_order_acquire) != UINT_MAX) { + std::this_thread::yield(); + } + memcpy(results.data(), layout->getResultsPtr(), layout->resultLen * sizeof(Marv::Result)); + stats.results = layout->resultLen; + layout->resetServerAndClientReady(); + } + for(size_t i = 0; i < stats.results; i++){ + unsigned int targetKey = tdbr->getDbKey(results[i].id); + int score = results[i].score; + if(taxonomyHook != NULL){ + TaxID currTax = taxonomyHook->taxonomyMapping->lookup(targetKey); + if (taxonomyHook->expression[0]->isAncestor(currTax) == false) { + continue; + } + } + // check if evalThr != inf + // double evalue = 0.0; + // if (par.evalThr < std::numeric_limits::max()) { + // evalue = evaluer->computeEvalue(score, qSeq.L); + // } + // bool hasEvalue = (evalue <= par.evalThr); + bool hasDiagScore = (score > par.minDiagScoreThr); - QueryMatcherTaxonomyHook * taxonomyHook = NULL; - if (par.PARAM_TAXON_LIST.wasSet) { - taxonomyHook = new QueryMatcherTaxonomyHook(par.db2, tdbr, par.taxonList, par.threads); + const bool isIdentity = (queryKey == targetKey && (par.includeIdentity || sameDB))? true : false; + // --filter-hits + if (isIdentity || hasDiagScore) { + if(par.prefMode == Parameters::PREF_MODE_UNGAPPED_AND_GAPPED){ + Matcher::result_t res; + res.dbKey = targetKey; + res.eval = evaluer->computeEvalue(score, qSeq.L); + res.dbEndPos = results[i].dbEndPos; + res.dbLen = tdbr->getSeqLen(results[i].id); + res.qEndPos = results[i].qEndPos; + res.qLen = qSeq.L; + unsigned int qAlnLen = std::max(static_cast(res.qEndPos), static_cast(1)); + unsigned int dbAlnLen = std::max(static_cast(res.dbEndPos), static_cast(1)); + //seqId = (alignment.score1 / static_cast(std::max(dbAlnLen, qAlnLen))) * 0.1656 + 0.1141; + res.seqId = Matcher::estimateSeqIdByScorePerCol(score, qAlnLen, dbAlnLen); + res.qcov = SmithWaterman::computeCov(0, res.qEndPos, res.qLen ); + res.dbcov = SmithWaterman::computeCov(0, res.dbEndPos, res.dbLen ); + res.score = evaluer->computeBitScore(score); + if(Alignment::checkCriteria(res, isIdentity, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)){ + resultsAln.emplace_back(res); + } + } else { + hit_t hit; + hit.seqId = targetKey; + hit.prefScore = score; + hit.diagonal = 0; + shortResults.emplace_back(hit); + } + } + } + if(par.prefMode == Parameters::PREF_MODE_UNGAPPED_AND_GAPPED) { + SORT_PARALLEL(resultsAln.begin(), resultsAln.end(), Matcher::compareHits); + size_t maxSeqs = std::min(par.maxResListLen, resultsAln.size()); + for (size_t i = 0; i < maxSeqs; ++i) { + size_t len = Matcher::resultToBuffer(buffer, resultsAln[i], false); + resultBuffer.append(buffer, len); + } + }else{ + SORT_PARALLEL(shortResults.begin(), shortResults.end(), hit_t::compareHitsByScoreAndId); + size_t maxSeqs = std::min(par.maxResListLen, shortResults.size()); + for (size_t i = 0; i < maxSeqs; ++i) { + size_t len = QueryMatcher::prefilterHitToBuffer(buffer, 
shortResults[i]); + resultBuffer.append(buffer, len); + } + } + + resultWriter.writeData(resultBuffer.c_str(), resultBuffer.length(), queryKey, 0); + resultBuffer.clear(); + shortResults.clear(); + resultsAln.clear(); + progress.updateProgress(); + } + if (marv != NULL) { + delete marv; + } else { + GPUSharedMemory::unmap(layout); } - Debug::Progress progress(qdbr->getSize()); + if (compositionBias != NULL) { + free(compositionBias); + } + if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_HMM_PROFILE) == false) { + free(profile); + } +} +#endif + +void runFilterOnCpu(Parameters & par, BaseMatrix * subMat, int8_t * tinySubMat, + DBReader * qdbr, DBReader * tdbr, + SequenceLookup * sequenceLookup, bool sameDB, DBWriter & resultWriter, EvalueComputation * evaluer, + QueryMatcherTaxonomyHook *taxonomyHook, int alignmentMode){ std::vector shortResults; shortResults.reserve(tdbr->getSize()/2); - + Debug::Progress progress(qdbr->getSize()); + const int targetSeqType = tdbr->getDbtype(); + const int querySeqType = qdbr->getDbtype(); #ifdef OPENMP omp_set_nested(1); #endif @@ -131,6 +337,11 @@ int prefilterInternal(int argc, const char **argv, const Command &command, int m char * targetSeq = tdbr->getData(tId, thread_idx); unsigned int targetSeqLen = tdbr->getSeqLen(tId); tSeq.mapSequence(tId, targetKey, targetSeq, targetSeqLen); + // mask numSequence + unsigned char xChar = subMat->aa2num[static_cast('X')]; + for (int i = 0; i < tSeq.L; i++) { + tSeq.numSequence[i] = ((targetSeq[i] >= 32 && targetSeq[i] <= 52) || targetSeq[i] >= 97) ? xChar : tSeq.numSequence[i]; + } }else{ tSeq.mapSequence(tId, targetKey, sequenceLookup->getSequence(tId)); } @@ -140,45 +351,45 @@ int prefilterInternal(int argc, const char **argv, const Command &command, int m continue; } + bool hasEvalue = true; int score; - if (mode == 0) { + if (alignmentMode == 0) { score = aligner.ungapped_alignment(tSeq.numSequence, tSeq.L); } else { std::string backtrace; s_align res; if (isIdentity) { res = aligner.scoreIdentical( - tSeq.numSequence, tSeq.L, evaluer, Matcher::SCORE_ONLY, backtrace + tSeq.numSequence, tSeq.L, evaluer, Matcher::SCORE_ONLY, backtrace ); } else { res = aligner.ssw_align( - tSeq.numSequence, - tSeq.numConsensusSequence, - tSeq.getAlignmentProfile(), - tSeq.L, - backtrace, - par.gapOpen.values.aminoacid(), - par.gapExtend.values.aminoacid(), - Matcher::SCORE_ONLY, - par.evalThr, - evaluer, - par.covMode, - par.covThr, - par.correlationScoreWeight, - qSeq.L / 2, - tId + tSeq.numSequence, + tSeq.numConsensusSequence, + tSeq.getAlignmentProfile(), + tSeq.L, + backtrace, + par.gapOpen.values.aminoacid(), + par.gapExtend.values.aminoacid(), + Matcher::SCORE_ONLY, + par.evalThr, + evaluer, + par.covMode, + par.covThr, + par.correlationScoreWeight, + qSeq.L / 2, + tId ); } score = res.score1; + // check if evalThr != inf + double evalue = 0.0; + if (par.evalThr < std::numeric_limits::max()) { + evalue = evaluer->computeEvalue(score, qSeq.L); + } + hasEvalue = (evalue <= par.evalThr); } bool hasDiagScore = (score > par.minDiagScoreThr); - double evalue = 0.0; - // check if evalThr != inf - if (par.evalThr < std::numeric_limits::max()) { - evalue = evaluer->computeEvalue(score, qSeq.L); - } - bool hasEvalue = (evalue <= par.evalThr); - // --filter-hits if (isIdentity || (hasDiagScore && hasEvalue)) { hit_t hit; hit.seqId = targetKey; @@ -210,12 +421,103 @@ int prefilterInternal(int argc, const char **argv, const Command &command, int m #pragma omp barrier } } +} + +int prefilterInternal(int argc, const char 
**argv, const Command &command, int mode) { + Parameters &par = Parameters::getInstance(); + par.parseParameters(argc, argv, command, true, 0, 0); + int outputDbtype = (par.prefMode == Parameters::PREF_MODE_UNGAPPED_AND_GAPPED) + ? Parameters::DBTYPE_ALIGNMENT_RES : Parameters::DBTYPE_PREFILTER_RES; + DBWriter resultWriter(par.db3.c_str(), par.db3Index.c_str(), 1, par.compressed, outputDbtype); + resultWriter.open(); + bool sameDB = (par.db2.compare(par.db1) == 0); + bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); + IndexReader tDbrIdx(par.db2, par.threads, IndexReader::SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0 ); + IndexReader * qDbrIdx = NULL; + DBReader * qdbr = NULL; + DBReader * tdbr = tDbrIdx.sequenceReader; + if (par.gpu == true) { + const bool isGpuDb = DBReader::getExtendedDbtype(tdbr->getDbtype()) & Parameters::DBTYPE_EXTENDED_GPU; + if (isGpuDb == false) { + Debug(Debug::ERROR) << "Database " << FileUtil::baseName(par.db2) << " is not a valid GPU database\n" + << "Please call: makepaddedseqdb " << FileUtil::baseName(par.db2) << " " << FileUtil::baseName(par.db2) << "_pad\n"; + EXIT(EXIT_FAILURE); + } + } + + const int targetSeqType = tdbr->getDbtype(); + int querySeqType; + if (sameDB == true) { + qDbrIdx = &tDbrIdx; + qdbr = tdbr; + querySeqType = targetSeqType; + } else { + // open the sequence, prefiltering and output databases + qDbrIdx = new IndexReader(par.db1, par.threads, IndexReader::SEQUENCES, (touch) ? IndexReader::PRELOAD_INDEX : 0); + qdbr = qDbrIdx->sequenceReader; + querySeqType = qdbr->getDbtype(); + } + + SequenceLookup * sequenceLookup = NULL; + if(Parameters::isEqualDbtype(tDbrIdx.getDbtype(), Parameters::DBTYPE_INDEX_DB)){ + PrefilteringIndexData data = PrefilteringIndexReader::getMetadata(tDbrIdx.index); + if(data.splits == 1){ + sequenceLookup = PrefilteringIndexReader::getSequenceLookup(0, tDbrIdx.index, par.preloadMode); + } + } + BaseMatrix *subMat; + EvalueComputation * evaluer; + int8_t * tinySubMat; + if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) { + subMat = new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0); + evaluer = new EvalueComputation(tdbr->getAminoAcidDBSize(), subMat, par.gapOpen.values.nucleotide(), par.gapExtend.values.nucleotide()); + tinySubMat = new int8_t[subMat->alphabetSize*subMat->alphabetSize]; + for (int i = 0; i < subMat->alphabetSize; i++) { + for (int j = 0; j < subMat->alphabetSize; j++) { + tinySubMat[i*subMat->alphabetSize + j] = subMat->subMatrix[i][j]; + } + } + } else { + // keep score bias at 0.0 (improved ROC) + subMat = new SubstitutionMatrix(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, 0.0); + evaluer = new EvalueComputation(tdbr->getAminoAcidDBSize(), subMat, par.gapOpen.values.aminoacid(), par.gapExtend.values.aminoacid()); + tinySubMat = new int8_t[subMat->alphabetSize*subMat->alphabetSize]; + for (int i = 0; i < subMat->alphabetSize; i++) { + for (int j = 0; j < subMat->alphabetSize; j++) { + tinySubMat[i*subMat->alphabetSize + j] = subMat->subMatrix[i][j]; + } + } + } + + + QueryMatcherTaxonomyHook * taxonomyHook = NULL; + if(par.PARAM_TAXON_LIST.wasSet){ + taxonomyHook = new QueryMatcherTaxonomyHook(par.db2, tdbr, par.taxonList, par.threads); + } + if(par.gpu){ +#ifdef HAVE_CUDA + runFilterOnGpu(par, subMat, qdbr, tdbr, sameDB, + resultWriter, evaluer, taxonomyHook); +#else + Debug(Debug::ERROR) << "MMseqs2 was compiled without CUDA support\n"; + EXIT(EXIT_FAILURE); +#endif + }else{ + 
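Before dispatching to the GPU or CPU path, prefilterInternal builds the scoring objects once: a nucleotide or amino-acid substitution matrix depending on the query type, an EvalueComputation sized to the target database, and a row-major int8_t copy of the scores (tinySubMat) for the SIMD aligner. A minimal sketch of that flattening step (not part of the patch; flatten is an illustrative name and the element type of the source matrix is assumed to fit into int8_t):

    #include <cstdint>
    #include <vector>

    // Row-major copy of an alphabetSize x alphabetSize score matrix into int8_t,
    // matching the layout expected at index i * alphabetSize + j.
    std::vector<int8_t> flatten(const int *const *scores, int alphabetSize) {
        std::vector<int8_t> tiny(static_cast<size_t>(alphabetSize) * alphabetSize);
        for (int i = 0; i < alphabetSize; i++) {
            for (int j = 0; j < alphabetSize; j++) {
                tiny[static_cast<size_t>(i) * alphabetSize + j] = static_cast<int8_t>(scores[i][j]);
            }
        }
        return tiny;
    }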
runFilterOnCpu(par, subMat, tinySubMat, qdbr, tdbr, sequenceLookup, sameDB, + resultWriter, evaluer, taxonomyHook, mode); + } + + resultWriter.close(); if(taxonomyHook != NULL){ delete taxonomyHook; } + if (sequenceLookup != NULL) { + delete sequenceLookup; + } + if(sameDB == false){ delete qDbrIdx; } @@ -224,7 +526,6 @@ int prefilterInternal(int argc, const char **argv, const Command &command, int m delete subMat; delete evaluer; - resultWriter.close(); return 0; } diff --git a/src/taxonomy/NcbiTaxonomy.cpp b/src/taxonomy/NcbiTaxonomy.cpp index a035d1727..8198c0fba 100644 --- a/src/taxonomy/NcbiTaxonomy.cpp +++ b/src/taxonomy/NcbiTaxonomy.cpp @@ -1,779 +1,829 @@ -// Ported from blast2lca -// https://github.com/emepyc/Blast2lca -// Originally licensed under GPLv2 or later - -#include "NcbiTaxonomy.h" -#include "FileUtil.h" -#include "MathUtil.h" -#include "Debug.h" -#include "Util.h" -#include "sys/mman.h" - -#include -#include -#include - -const int NcbiTaxonomy::SERIALIZATION_VERSION = 2; - -int **makeMatrix(size_t maxNodes) { - size_t dimension = maxNodes * 2; - int **M = new int*[dimension]; - int k = (int)(MathUtil::flog2(dimension)) + 1; - M[0] = new int[dimension * k](); - for(size_t i = 1; i < dimension; i++) { - M[i] = M[i-1] + k; - } - - return M; -} - -void deleteMatrix(int** M) { - delete[] M[0]; - delete[] M; -} - -NcbiTaxonomy::NcbiTaxonomy(const std::string &namesFile, const std::string &nodesFile, const std::string &mergedFile) : externalData(false) { - block = new StringBlock(); - std::vector tmpNodes; - loadNodes(tmpNodes, nodesFile); - loadMerged(mergedFile); - loadNames(tmpNodes, namesFile); - - maxNodes = tmpNodes.size(); - taxonNodes = new TaxonNode[maxNodes]; - std::copy(tmpNodes.begin(), tmpNodes.end(), taxonNodes); - - std::vector tmpE; - tmpE.reserve(maxNodes * 2); - - std::vector tmpL; - tmpL.reserve(maxNodes * 2); - - H = new int[maxNodes]; - std::fill(H, H + maxNodes, 0); - - std::vector> children(tmpNodes.size()); - for (std::vector::const_iterator it = tmpNodes.begin(); it != tmpNodes.end(); ++it) { - if (it->parentTaxId != it->taxId) { - children[nodeId(it->parentTaxId)].push_back(it->taxId); - } - } - - elh(children, 1, 0, tmpE, tmpL); - tmpE.resize(maxNodes * 2, 0); - tmpL.resize(maxNodes * 2, 0); - - E = new int[maxNodes * 2]; - std::copy(tmpE.begin(), tmpE.end(), E); - L = new int[maxNodes * 2]; - std::copy(tmpL.begin(), tmpL.end(), L); - - M = makeMatrix(maxNodes); - InitRangeMinimumQuery(); - - mmapData = NULL; - mmapSize = 0; -} - -NcbiTaxonomy::~NcbiTaxonomy() { - if (externalData) { - delete[] M; - } else { - delete[] taxonNodes; - delete[] H; - delete[] D; - delete[] E; - delete[] L; - deleteMatrix(M); - } - delete block; - if (mmapData != NULL) { - munmap(mmapData, mmapSize); - } -} - -std::vector splitByDelimiter(const std::string &s, const std::string &delimiter, int maxCol) { - std::vector result; - size_t prev = 0, pos = 0; - int i = 0; - do { - pos = s.find(delimiter, prev); - if (pos == std::string::npos) pos = s.length(); - result.emplace_back(s.substr(prev, pos - prev)); - prev = pos + delimiter.length(); - i++; - } while (pos < s.length() && prev < s.length() && i < maxCol); - - return result; -} - -size_t NcbiTaxonomy::loadNodes(std::vector &tmpNodes, const std::string &nodesFile) { - Debug(Debug::INFO) << "Loading nodes file ..."; - std::ifstream ss(nodesFile); - if (ss.fail()) { - Debug(Debug::ERROR) << "File " << nodesFile << " not found!\n"; - EXIT(EXIT_FAILURE); - } - - std::map Dm; // temporary map TaxID -> internal ID; - maxTaxID 
= 0; - int currentId = 0; - std::string line; - while (std::getline(ss, line)) { - std::vector result = splitByDelimiter(line, "\t|\t", 3); - TaxID taxId = (TaxID) strtol(result[0].c_str(), NULL, 10); - TaxID parentTaxId = (TaxID) strtol(result[1].c_str(), NULL, 10); - if (taxId > maxTaxID) { - maxTaxID = taxId; - } - size_t rankIdx = block->append(result[2].c_str(), result[2].size()); - tmpNodes.emplace_back(currentId, taxId, parentTaxId, rankIdx, (size_t)-1); - Dm.emplace(taxId, currentId); - ++currentId; - } - - D = new int[maxTaxID + 1]; - std::fill_n(D, maxTaxID + 1, -1); - for (std::map::iterator it = Dm.begin(); it != Dm.end(); ++it) { - assert(it->first <= maxTaxID); - D[it->first] = it->second; - } - - // Loop over taxonNodes and check all parents exist - for (std::vector::iterator it = tmpNodes.begin(); it != tmpNodes.end(); ++it) { - if (!nodeExists(it->parentTaxId)) { - Debug(Debug::ERROR) << "Inconsistent nodes.dmp taxonomy file! Cannot find parent taxon with ID " << it->parentTaxId << "!\n"; - EXIT(EXIT_FAILURE); - } - } - - Debug(Debug::INFO) << " Done, got " << tmpNodes.size() << " nodes\n"; - return tmpNodes.size(); -} - -std::pair parseName(const std::string &line) { - std::vector result = splitByDelimiter(line, "\t|\t", 2); - if (result.size() != 2) { - Debug(Debug::ERROR) << "Invalid name entry!\n"; - EXIT(EXIT_FAILURE); - } - return std::make_pair((int)strtol(result[0].c_str(), NULL, 10), result[1]); -} - -void NcbiTaxonomy::loadNames(std::vector &tmpNodes, const std::string &namesFile) { - Debug(Debug::INFO) << "Loading names file ..."; - std::ifstream ss(namesFile); - if (ss.fail()) { - Debug(Debug::ERROR) << "File " << namesFile << " not found!\n"; - EXIT(EXIT_FAILURE); - } - - std::string line; - while (std::getline(ss, line)) { - if (line.find("scientific name") == std::string::npos) { - continue; - } - - std::pair entry = parseName(line); - if (!nodeExists(entry.first)) { - Debug(Debug::ERROR) << "loadNames: Taxon " << entry.first << " not present in nodes file!\n"; - EXIT(EXIT_FAILURE); - } - tmpNodes[nodeId(entry.first)].nameIdx = block->append(entry.second.c_str(), entry.second.size()); - } - Debug(Debug::INFO) << " Done\n"; -} - -// Euler traversal of tree -void NcbiTaxonomy::elh(std::vector> const & children, TaxID taxId, int level, std::vector &tmpE, std::vector &tmpL) { - assert (taxId > 0); - int id = nodeId(taxId); - - if (H[id] == 0) { - H[id] = tmpE.size(); - } - - tmpE.emplace_back(id); - tmpL.emplace_back(level); - - for (std::vector::const_iterator child_it = children[id].begin(); child_it != children[id].end(); ++child_it) { - elh(children, *child_it, level + 1, tmpE, tmpL); - } - tmpE.emplace_back(nodeId(taxonNodes[id].parentTaxId)); - tmpL.emplace_back(level - 1); -} - -void NcbiTaxonomy::InitRangeMinimumQuery() { - Debug(Debug::INFO) << "Init RMQ ..."; - - for (unsigned int i = 0; i < (maxNodes * 2); ++i) { - M[i][0] = i; - } - - for (unsigned int j = 1; (1ul << j) <= (maxNodes * 2); ++j) { - for (unsigned int i = 0; (i + (1ul << j) - 1) < (maxNodes * 2); ++i) { - int A = M[i][j - 1]; - int B = M[i + (1ul << (j - 1))][j - 1]; - if (L[A] < L[B]) { - M[i][j] = A; - } else { - M[i][j] = B; - } - } - } - Debug(Debug::INFO) << "Done\n"; -} - -int NcbiTaxonomy::RangeMinimumQuery(int i, int j) const { - assert(j >= i); - int k = (int)MathUtil::flog2(j - i + 1); - int A = M[i][k]; - int B = M[j - MathUtil::ipow(2, k) + 1][k]; - if (L[A] <= L[B]) { - return A; - } - return B; -} - -int NcbiTaxonomy::lcaHelper(int i, int j) const { - if (i == 0 || j == 0) 
{ - return 0; - } - assert(i > 0); - assert(j > 0); - if (i == j) { - return i; - } - int v1 = H[i]; - int v2 = H[j]; - if (v1 > v2) { - int tmp = v1; - v1 = v2; - v2 = tmp; - } - int rmq = RangeMinimumQuery(v1, v2); - assert(E[rmq] >= 0); - return E[rmq]; -} - -bool NcbiTaxonomy::IsAncestor(TaxID ancestor, TaxID child) { - if (ancestor == child) { - return true; - } - - if (ancestor == 0 || child == 0) { - return false; - } - - if (!nodeExists(child)) { - return false; - } - - if (!nodeExists(ancestor)) { - return false; - } - - return lcaHelper(nodeId(child), nodeId(ancestor)) == nodeId(ancestor); -} - - -TaxID NcbiTaxonomy::LCA(TaxID taxonA, TaxID taxonB) const { - if (!nodeExists(taxonA)) { - return taxonB; - } else if (!nodeExists(taxonB)) { - return taxonA; - } - return taxonNodes[lcaHelper(nodeId(taxonA), nodeId(taxonB))].taxId; -} - - -TaxonNode const * NcbiTaxonomy::LCA(const std::vector& taxa) const { - std::vector::const_iterator it = taxa.begin(); - while (it != taxa.end() && !nodeExists(*it)) { - Debug(Debug::WARNING) << "No node for taxID " << *it << ", ignoring it.\n"; - ++it; - } - if (it == taxa.end()) { return NULL; } - int red = nodeId(*it++); - for (; it != taxa.end(); ++it) { - if (nodeExists(*it)) { - red = lcaHelper(red, nodeId(*it)); - } else { - Debug(Debug::WARNING) << "No node for taxID " << *it << ", ignoring it.\n"; - } - } - - assert(red >= 0 && static_cast(red) < maxNodes); - - return &(taxonNodes[red]); -} - - -// AtRanks returns a slice of slices having the taxons at the specified taxonomic levels -std::vector NcbiTaxonomy::AtRanks(TaxonNode const *node, const std::vector &levels) const { - std::vector result; - std::map allRanks = AllRanks(node); - // map does not include "no rank" nor "no_rank" - const char* rank = getString(node->rankIdx); - int baseRankIndex = findRankIndex(rank); - std::string baseRank = "uc_"; - baseRank.append(getString(node->nameIdx)); - for (std::vector::const_iterator it = levels.begin(); it != levels.end(); ++it) { - std::map::iterator jt = allRanks.find(*it); - if (jt != allRanks.end()) { - result.emplace_back(jt->second); - continue; - } - - // If not ... 
2 possible causes: i) too low level ("uc_") - if (NcbiRanks.at(*it) < baseRankIndex) { - result.emplace_back(baseRank); - continue; - } - - // ii) No taxon for the LCA at the required level -- give the first known upstream - result.emplace_back("unknown"); - } - return result; -} - -std::vector NcbiTaxonomy::parseRanks(const std::string& ranks) { - std::vector temp = Util::split(ranks, ","); - for (size_t i = 0; i < temp.size(); ++i) { - if (findRankIndex(temp[i]) == -1) { - Debug(Debug::ERROR) << "Invalid taxonomic rank " << temp[i] << "given\n"; - EXIT(EXIT_FAILURE); - } - } - return temp; -} - -int NcbiTaxonomy::findRankIndex(const std::string& rank) { - std::map::const_iterator it; - if ((it = NcbiRanks.find(rank)) != NcbiRanks.end()) { - return it->second; - } - return -1; -} - -char NcbiTaxonomy::findShortRank(const std::string& rank) { - std::map::const_iterator it; - if ((it = NcbiShortRanks.find(rank)) != NcbiShortRanks.end()) { - return it->second; - } - return '-'; -} - -std::string NcbiTaxonomy::taxLineage(TaxonNode const *node, bool infoAsName) { - std::vector taxLineageVec; - std::string taxLineage; - taxLineage.reserve(4096); - do { - taxLineageVec.push_back(node); - node = taxonNode(node->parentTaxId); - } while (node->parentTaxId != node->taxId); - - for (int i = taxLineageVec.size() - 1; i >= 0; --i) { - if (infoAsName) { - taxLineage += findShortRank(getString(taxLineageVec[i]->rankIdx)); - taxLineage += '_'; - taxLineage += getString(taxLineageVec[i]->nameIdx); - } else { - taxLineage += SSTR(taxLineageVec[i]->taxId); - } - - if (i > 0) { - taxLineage += ";"; - } - } - return taxLineage; -} - -int NcbiTaxonomy::nodeId(TaxID taxonId) const { - if (taxonId < 0 || !nodeExists(taxonId)) { - Debug(Debug::ERROR) << "Invalid node " << taxonId << "!\n"; - EXIT(EXIT_FAILURE); - } - return D[taxonId]; -} - -bool NcbiTaxonomy::nodeExists(TaxID taxonId) const { - return taxonId <= maxTaxID && D[taxonId] != -1; -} - -TaxonNode const * NcbiTaxonomy::taxonNode(TaxID taxonId, bool fail) const { - if (taxonId == 0 || (!fail && !nodeExists(taxonId))) { - return NULL; - } - return &(taxonNodes[nodeId(taxonId)]); -} - -std::map NcbiTaxonomy::AllRanks(TaxonNode const *node) const { - std::map result; - while (true) { - std::string rank = getString(node->rankIdx); - std::string name = getString(node->nameIdx); - if (node->taxId == 1) { - result.emplace(rank, name); - return result; - } - - if ((rank != "no_rank") && (rank != "no rank")) { - result.emplace(rank, name); - } - - node = taxonNode(node->parentTaxId); - } -} - -size_t NcbiTaxonomy::loadMerged(const std::string &mergedFile) { - Debug(Debug::INFO) << "Loading merged file ..."; - std::ifstream ss(mergedFile); - if (ss.fail()) { - Debug(Debug::ERROR) << "File " << mergedFile << " not found!\n"; - EXIT(EXIT_FAILURE); - } - - std::string line; - size_t count = 0; - while (std::getline(ss, line)) { - std::vector result = splitByDelimiter(line, "\t|\t", 2); - if (result.size() != 2) { - Debug(Debug::ERROR) << "Invalid name entry!\n"; - EXIT(EXIT_FAILURE); - } - - unsigned int oldId = (unsigned int)strtoul(result[0].c_str(), NULL, 10); - unsigned int mergedId = (unsigned int)strtoul(result[1].c_str(), NULL, 10); - if (!nodeExists(oldId) && nodeExists(mergedId)) { - D[oldId] = D[mergedId]; - ++count; - } - } - Debug(Debug::INFO) << " Done, added " << count << " merged nodes.\n"; - return count; -} - -std::unordered_map NcbiTaxonomy::getCladeCounts(std::unordered_map& taxonCounts) const { - Debug(Debug::INFO) << "Calculating clade counts ... 
"; - std::unordered_map cladeCounts; - - for (std::unordered_map::const_iterator it = taxonCounts.begin(); it != taxonCounts.end(); ++it) { - cladeCounts[it->first].taxCount = it->second; - cladeCounts[it->first].cladeCount += it->second; - if (nodeExists(it->first)) { - TaxonNode const* taxon = taxonNode(it->first); - while (taxon->parentTaxId != taxon->taxId && nodeExists(taxon->parentTaxId)) { - taxon = taxonNode(taxon->parentTaxId); - cladeCounts[taxon->taxId].cladeCount += it->second; - } - } - } - - for (size_t i = 0; i < maxNodes; ++i) { - TaxonNode& tn = taxonNodes[i]; - if (tn.parentTaxId != tn.taxId && cladeCounts.count(tn.taxId)) { - std::unordered_map::iterator itp = cladeCounts.find(tn.parentTaxId); - itp->second.children.push_back(tn.taxId); - } - } - - Debug(Debug::INFO) << " Done\n"; - return cladeCounts; -} - -NcbiTaxonomy * NcbiTaxonomy::openTaxonomy(const std::string &database){ - std::string binFile = database + "_taxonomy"; - if (FileUtil::fileExists(binFile.c_str())) { - FILE* handle = fopen(binFile.c_str(), "r"); - struct stat sb; - if (fstat(fileno(handle), &sb) < 0) { - Debug(Debug::ERROR) << "Failed to fstat file " << binFile << "\n"; - EXIT(EXIT_FAILURE); - } - char* data = (char*)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fileno(handle), 0); - if (data == MAP_FAILED){ - Debug(Debug::ERROR) << "Failed to mmap file " << binFile << " with error " << errno << "\n"; - EXIT(EXIT_FAILURE); - } - fclose(handle); - NcbiTaxonomy* t = NcbiTaxonomy::unserialize(data); - if (t != NULL) { - t->mmapData = data; - t->mmapSize = sb.st_size; - return t; - } else { - Debug(Debug::WARNING) << "Outdated taxonomy information, please recreate with createtaxdb.\n"; - } - } - Debug(Debug::INFO) << "Loading NCBI taxonomy\n"; - std::string nodesFile = database + "_nodes.dmp"; - std::string namesFile = database + "_names.dmp"; - std::string mergedFile = database + "_merged.dmp"; - if (FileUtil::fileExists(nodesFile.c_str()) - && FileUtil::fileExists(namesFile.c_str()) - && FileUtil::fileExists(mergedFile.c_str())) { - } else if (FileUtil::fileExists("nodes.dmp") - && FileUtil::fileExists("names.dmp") - && FileUtil::fileExists("merged.dmp")) { - nodesFile = "nodes.dmp"; - namesFile = "names.dmp"; - mergedFile = "merged.dmp"; - } else { - Debug(Debug::ERROR) << "names.dmp, nodes.dmp, merged.dmp from NCBI taxdump could not be found!\n"; - EXIT(EXIT_FAILURE); - } - return new NcbiTaxonomy(namesFile, nodesFile, mergedFile); -} - -const TaxID ROOT_TAXID = 1; -const int ROOT_RANK = INT_MAX; - -struct TaxNode { - TaxNode(const double weight, const bool isCandidate, const TaxID childTaxon) - : weight(weight), isCandidate(isCandidate), childTaxon(childTaxon) {} - - void update(const double weightToAdd, const TaxID & childTaxonInput) { - if (childTaxon != childTaxonInput) { - isCandidate = true; - childTaxon = childTaxonInput; - } - weight += weightToAdd; - } - - double weight; - bool isCandidate; - TaxID childTaxon; -}; - -const char* NcbiTaxonomy::getString(size_t blockIdx) const { - return block->getString(blockIdx); -} - -WeightedTaxHit::WeightedTaxHit(const TaxID taxon, const float evalue, const int weightVoteMode) : taxon(taxon) { - switch (weightVoteMode) { - case Parameters::AGG_TAX_UNIFORM: - weight = 1.0; - break; - case Parameters::AGG_TAX_MINUS_LOG_EVAL: - weight = evalue; - if (evalue != FLT_MAX) { - if (evalue > 0) { - weight = -log(evalue); - } else { - weight = MAX_TAX_WEIGHT; - } - } - break; - case Parameters::AGG_TAX_SCORE: - weight = evalue; - break; - default: - 
Debug(Debug::ERROR) << "Invalid weight vote mode\n"; - EXIT(EXIT_FAILURE); - } -} - -WeightedTaxResult NcbiTaxonomy::weightedMajorityLCA(const std::vector &setTaxa, const float majorityCutoff) { - // count num occurences of each ancestor, possibly weighted - std::map ancTaxIdsCounts; - - // initialize counters and weights - size_t assignedSeqs = 0; - size_t unassignedSeqs = 0; - double totalAssignedSeqsWeights = 0.0; - - for (size_t i = 0; i < setTaxa.size(); ++i) { - TaxID currTaxId = setTaxa[i].taxon; - double currWeight = setTaxa[i].weight; - // ignore unassigned sequences - if (currTaxId == 0) { - unassignedSeqs++; - continue; - } - TaxonNode const *node = taxonNode(currTaxId, false); - if (node == NULL) { - Debug(Debug::ERROR) << "taxonid: " << currTaxId << " does not match a legal taxonomy node.\n"; - EXIT(EXIT_FAILURE); - } - totalAssignedSeqsWeights += currWeight; - assignedSeqs++; - - // each start of a path due to an orf is a candidate - std::map::iterator it; - if ((it = ancTaxIdsCounts.find(currTaxId)) != ancTaxIdsCounts.end()) { - it->second.update(currWeight, 0); - } else { - TaxNode current(currWeight, true, 0); - ancTaxIdsCounts.emplace(currTaxId, current); - } - - // iterate all ancestors up to root (including). add currWeight and candidate status to each - TaxID currParentTaxId = node->parentTaxId; - while (currParentTaxId != currTaxId) { - if ((it = ancTaxIdsCounts.find(currParentTaxId)) != ancTaxIdsCounts.end()) { - it->second.update(currWeight, currTaxId); - } else { - TaxNode parent(currWeight, false, currTaxId); - ancTaxIdsCounts.emplace(currParentTaxId, parent); - } - // move up - currTaxId = currParentTaxId; - node = taxonNode(currParentTaxId, false); - currParentTaxId = node->parentTaxId; - } - } - - TaxID selctedTaxon = 0; - if (totalAssignedSeqsWeights == 0) { - return WeightedTaxResult(selctedTaxon, assignedSeqs, unassignedSeqs, 0, 0.0); - } - - // select the lowest ancestor that meets the cutoff - int minRank = INT_MAX; - double selectedPercent = 0; - for (std::map::iterator it = ancTaxIdsCounts.begin(); it != ancTaxIdsCounts.end(); it++) { - // consider only candidates - if (it->second.isCandidate == false) { - continue; - } - - double currPercent = it->second.weight / totalAssignedSeqsWeights; - if (currPercent >= majorityCutoff) { - // iterate all ancestors to find lineage min rank (the candidate is a descendant of a node with this rank) - TaxID currTaxId = it->first; - TaxonNode const *node = taxonNode(currTaxId, false); - int currMinRank = ROOT_RANK; - TaxID currParentTaxId = node->parentTaxId; - while (currParentTaxId != currTaxId) { - int currRankInd = NcbiTaxonomy::findRankIndex(getString(node->rankIdx)); - if ((currRankInd > 0) && (currRankInd < currMinRank)) { - currMinRank = currRankInd; - // the rank can only go up on the way to the root, so we can break - break; - } - // move up: - currTaxId = currParentTaxId; - node = taxonNode(currParentTaxId, false); - currParentTaxId = node->parentTaxId; - } - - if ((currMinRank < minRank) || ((currMinRank == minRank) && (currPercent > selectedPercent))) { - selctedTaxon = it->first; - minRank = currMinRank; - selectedPercent = currPercent; - } - } - } - - // count the number of seqs who have selectedTaxon in their ancestors (agree with selection): - if (selctedTaxon == ROOT_TAXID) { - // all agree with "root" - return WeightedTaxResult(selctedTaxon, assignedSeqs, unassignedSeqs, assignedSeqs, selectedPercent); - } - if (selctedTaxon == 0) { - // nothing informative - return WeightedTaxResult(selctedTaxon, 
assignedSeqs, unassignedSeqs, 0, selectedPercent); - } - size_t seqsAgreeWithSelectedTaxon = 0; - // otherwise, iterate over all seqs - for (size_t i = 0; i < setTaxa.size(); ++i) { - TaxID currTaxId = setTaxa[i].taxon; - // ignore unassigned sequences - if (currTaxId == 0) { - continue; - } - TaxonNode const *node = taxonNode(currTaxId, false); - - // iterate all ancestors up to the root - TaxID currParentTaxId = node->parentTaxId; - while (currParentTaxId != currTaxId) { - if (currTaxId == selctedTaxon) { - seqsAgreeWithSelectedTaxon++; - break; - } - currTaxId = currParentTaxId; - node = taxonNode(currParentTaxId, false); - currParentTaxId = node->parentTaxId; - } - } - - return WeightedTaxResult(selctedTaxon, assignedSeqs, unassignedSeqs, seqsAgreeWithSelectedTaxon, selectedPercent); -} - -std::pair NcbiTaxonomy::serialize(const NcbiTaxonomy& t) { - t.block->compact(); - size_t matrixDim = (t.maxNodes * 2); - size_t matrixK = (int)(MathUtil::flog2(matrixDim)) + 1; - size_t matrixSize = matrixDim * matrixK * sizeof(int); - size_t blockSize = StringBlock::memorySize(*t.block); - size_t memSize = sizeof(int) // SERIALIZATION_VERSION - + sizeof(size_t) // maxNodes - + sizeof(int) // maxTaxID - + t.maxNodes * sizeof(TaxonNode) // taxonNodes - + (t.maxTaxID + 1) * sizeof(int) // D - + 2 * (t.maxNodes * 2) * sizeof(int) // E,L - + t.maxNodes * sizeof(int) // H - + matrixSize // M - + blockSize; // block - - char* mem = (char*) malloc(memSize); - char* p = mem; - memcpy(p, &t.SERIALIZATION_VERSION, sizeof(int)); - p += sizeof(int); - memcpy(p, &t.maxNodes, sizeof(size_t)); - p += sizeof(size_t); - memcpy(p, &t.maxTaxID, sizeof(int)); - p += sizeof(int); - memcpy(p, t.taxonNodes, t.maxNodes * sizeof(TaxonNode)); - p += t.maxNodes * sizeof(TaxonNode); - memcpy(p, t.D, (t.maxTaxID + 1) * sizeof(int)); - p += (t.maxTaxID + 1) * sizeof(int); - memcpy(p, t.E, (t.maxNodes * 2) * sizeof(int)); - p += (t.maxNodes * 2) * sizeof(int); - memcpy(p, t.L, (t.maxNodes * 2) * sizeof(int)); - p += (t.maxNodes * 2) * sizeof(int); - memcpy(p, t.H, t.maxNodes * sizeof(int)); - p += t.maxNodes * sizeof(int); - memcpy(p, t.M[0], matrixSize); - p += matrixSize; - char* blockData = StringBlock::serialize(*t.block); - memcpy(p, blockData, blockSize); - p += blockSize; - free(blockData); - return std::make_pair(mem, memSize); -} - -NcbiTaxonomy* NcbiTaxonomy::unserialize(char* mem) { - const char* p = mem; - int version = *((int*)p); - p += sizeof(int); - if (version != NcbiTaxonomy::SERIALIZATION_VERSION) { - return NULL; - } - size_t maxNodes = *((size_t*)p); - p += sizeof(size_t); - int maxTaxID = *((int*)p); - p += sizeof(int); - TaxonNode* taxonNodes = (TaxonNode*)p; - p += maxNodes * sizeof(TaxonNode); - int* D = (int*)p; - p += (maxTaxID + 1) * sizeof(int); - int* E = (int*)p; - p += (maxNodes * 2) * sizeof(int); - int* L = (int*)p; - p += (maxNodes * 2) * sizeof(int); - int* H = (int*)p; - p += maxNodes * sizeof(int); - size_t matrixDim = (maxNodes * 2); - size_t matrixK = (int)(MathUtil::flog2(matrixDim)) + 1; - size_t matrixSize = matrixDim * matrixK * sizeof(int); - int** M = new int*[matrixDim]; - M[0] = (int*)p; - for(size_t i = 1; i < matrixDim; i++) { - M[i] = M[i-1] + matrixK; - } - p += matrixSize; - StringBlock* block = StringBlock::unserialize(p); - return new NcbiTaxonomy(taxonNodes, maxNodes, maxTaxID, D, E, L, H, M, block); -} +#include "NcbiTaxonomy.h" +#include "FileUtil.h" +#include "MathUtil.h" +#include "Debug.h" +#include "Util.h" +#include "sys/mman.h" + +#include +#include +#include + 
+const int NcbiTaxonomy::SERIALIZATION_VERSION = 2; + +int **makeMatrix(size_t maxNodes) { + size_t dimension = maxNodes * 2; + int **M = new int*[dimension]; + int k = (int)(MathUtil::flog2(dimension)) + 1; + M[0] = new int[dimension * k](); + for(size_t i = 1; i < dimension; i++) { + M[i] = M[i-1] + k; + } + + return M; +} + +void deleteMatrix(int** M) { + delete[] M[0]; + delete[] M; +} + +NcbiTaxonomy::NcbiTaxonomy(const std::string &namesFile, const std::string &nodesFile, const std::string &mergedFile) : externalData(false) { + block = new StringBlock(); + std::vector tmpNodes; + loadNodes(tmpNodes, nodesFile); + loadMerged(mergedFile); + loadNames(tmpNodes, namesFile); + + maxNodes = tmpNodes.size(); + taxonNodes = new TaxonNode[maxNodes]; + std::copy(tmpNodes.begin(), tmpNodes.end(), taxonNodes); + + std::vector tmpE; + tmpE.reserve(maxNodes * 2); + + std::vector tmpL; + tmpL.reserve(maxNodes * 2); + + H = new int[maxNodes]; + std::fill(H, H + maxNodes, 0); + + std::vector> children(tmpNodes.size()); + for (std::vector::const_iterator it = tmpNodes.begin(); it != tmpNodes.end(); ++it) { + if (it->parentTaxId != it->taxId) { + children[nodeId(it->parentTaxId)].push_back(it->taxId); + } + } + + elh(children, 1, 0, tmpE, tmpL); + tmpE.resize(maxNodes * 2, 0); + tmpL.resize(maxNodes * 2, 0); + + E = new int[maxNodes * 2]; + std::copy(tmpE.begin(), tmpE.end(), E); + L = new int[maxNodes * 2]; + std::copy(tmpL.begin(), tmpL.end(), L); + + M = makeMatrix(maxNodes); + computeSparseTable(); + + mmapData = NULL; + mmapSize = 0; +} + +NcbiTaxonomy::~NcbiTaxonomy() { + if (externalData) { + delete[] M; + } else { + delete[] taxonNodes; + delete[] H; + delete[] D; + delete[] E; + delete[] L; + deleteMatrix(M); + } + delete block; + if (mmapData != NULL) { + munmap(mmapData, mmapSize); + } +} + +std::vector splitByDelimiter(const std::string &s, const std::string &delimiter, int maxCol) { + std::vector result; + size_t prev = 0, pos = 0; + int i = 0; + do { + pos = s.find(delimiter, prev); + if (pos == std::string::npos) pos = s.length(); + result.emplace_back(s.substr(prev, pos - prev)); + prev = pos + delimiter.length(); + i++; + } while (pos < s.length() && prev < s.length() && i < maxCol); + + return result; +} + +size_t NcbiTaxonomy::loadNodes(std::vector &tmpNodes, const std::string &nodesFile) { + Debug(Debug::INFO) << "Loading nodes file ..."; + std::ifstream ss(nodesFile); + if (ss.fail()) { + Debug(Debug::ERROR) << "File " << nodesFile << " not found!\n"; + EXIT(EXIT_FAILURE); + } + + std::map Dm; // temporary map TaxID -> internal ID; + maxTaxID = 0; + int currentId = 0; + std::string line; + while (std::getline(ss, line)) { + std::vector result = splitByDelimiter(line, "\t|\t", 3); + TaxID taxId = (TaxID) strtol(result[0].c_str(), NULL, 10); + TaxID parentTaxId = (TaxID) strtol(result[1].c_str(), NULL, 10); + if (taxId > maxTaxID) { + maxTaxID = taxId; + } + size_t rankIdx = block->append(result[2].c_str(), result[2].size()); + tmpNodes.emplace_back(currentId, taxId, parentTaxId, rankIdx, (size_t)-1); + Dm.emplace(taxId, currentId); + ++currentId; + } + + D = new int[maxTaxID + 1]; + std::fill_n(D, maxTaxID + 1, -1); + for (std::map::iterator it = Dm.begin(); it != Dm.end(); ++it) { + assert(it->first <= maxTaxID); + D[it->first] = it->second; + } + + // Loop over taxonNodes and check all parents exist + for (std::vector::iterator it = tmpNodes.begin(); it != tmpNodes.end(); ++it) { + if (!nodeExists(it->parentTaxId)) { + Debug(Debug::ERROR) << "Inconsistent nodes.dmp taxonomy file! 
Cannot find parent taxon with ID " << it->parentTaxId << "!\n"; + EXIT(EXIT_FAILURE); + } + } + + Debug(Debug::INFO) << " Done, got " << tmpNodes.size() << " nodes\n"; + return tmpNodes.size(); +} + +std::pair parseName(const std::string &line) { + std::vector result = splitByDelimiter(line, "\t|\t", 2); + if (result.size() != 2) { + Debug(Debug::ERROR) << "Invalid name entry!\n"; + EXIT(EXIT_FAILURE); + } + return std::make_pair((int)strtol(result[0].c_str(), NULL, 10), result[1]); +} + +void NcbiTaxonomy::loadNames(std::vector &tmpNodes, const std::string &namesFile) { + Debug(Debug::INFO) << "Loading names file ..."; + std::ifstream ss(namesFile); + if (ss.fail()) { + Debug(Debug::ERROR) << "File " << namesFile << " not found!\n"; + EXIT(EXIT_FAILURE); + } + + std::string line; + while (std::getline(ss, line)) { + if (line.find("scientific name") == std::string::npos) { + continue; + } + + std::pair entry = parseName(line); + if (!nodeExists(entry.first)) { + Debug(Debug::ERROR) << "loadNames: Taxon " << entry.first << " not present in nodes file!\n"; + EXIT(EXIT_FAILURE); + } + tmpNodes[nodeId(entry.first)].nameIdx = block->append(entry.second.c_str(), entry.second.size()); + } + Debug(Debug::INFO) << " Done\n"; +} + +// Euler traversal of tree +void NcbiTaxonomy::elh(std::vector> const & children, TaxID taxId, int level, std::vector &tmpE, std::vector &tmpL) { + assert (taxId > 0); + int id = nodeId(taxId); + + if (H[id] == 0) { + H[id] = tmpE.size(); + } + + tmpE.emplace_back(id); + tmpL.emplace_back(level); + + for (std::vector::const_iterator child_it = children[id].begin(); child_it != children[id].end(); ++child_it) { + elh(children, *child_it, level + 1, tmpE, tmpL); + } + tmpE.emplace_back(nodeId(taxonNodes[id].parentTaxId)); + tmpL.emplace_back(level - 1); +} + +void NcbiTaxonomy::computeSparseTable() { + Debug(Debug::INFO) << "Init computeSparseTable ..."; + // sparse table M has N rows and log(N) columns. + // M[i][j] refers to the subarray L[i..2^j] + // M[i][j] holds the index of the minimal value in the subarray + size_t N = maxNodes * 2; // TO DO - I think this can actually be changed to maxNodes!!! 
+ // Debug(Debug::INFO) << "N: " << N << "\n"; + + // size_t helperCount = 0; + + // initialize all rows for column 0 + for (size_t row_ind = 0; row_ind < N; row_ind++) { + M[row_ind][0] = row_ind; + // helperCount++; + } + + // fill in column after column + size_t col_ind = 1; + size_t exp_prev_col_ind = 1ul; // 2 ^ 0 + size_t exp_col_ind = (1ul << col_ind); + + while (exp_col_ind <= N) { + size_t row_ind = 0; + while (row_ind + exp_col_ind - 1 < N) { + int min_ind_first_half = M[row_ind][col_ind - 1]; + int min_ind_second_half = M[row_ind + exp_prev_col_ind][col_ind - 1]; + if (L[min_ind_first_half] < L[min_ind_second_half]) { + M[row_ind][col_ind] = min_ind_first_half; + // helperCount++; + } else { + M[row_ind][col_ind] = min_ind_second_half; + // helperCount++; + } + // increase row_ind + row_ind = row_ind + 1; + } + // increase col_ind + col_ind = col_ind + 1; + exp_prev_col_ind = exp_col_ind; + exp_col_ind = (1ul << col_ind); + } + // Debug(Debug::INFO) << "updated cells of M: " << helperCount << "\n"; + // Debug(Debug::INFO) << "last used exponent: " << exp_prev_col_ind << "\n"; + // Debug(Debug::INFO) << "last unused exponent: " << exp_col_ind << "\n"; + // Debug(Debug::INFO) << "col_ind: " << col_ind << "\n"; + Debug(Debug::INFO) << "Done\n"; +} + +int NcbiTaxonomy::RangeMinimumQuery(int i, int j) const { + assert(j >= i); + int k = (int)MathUtil::flog2(j - i + 1); + int A = M[i][k]; + int B = M[j - MathUtil::ipow(2, k) + 1][k]; + if (L[A] <= L[B]) { + return A; + } + return B; +} + +int NcbiTaxonomy::lcaHelper(int i, int j) const { + if (i == 0 || j == 0) { + return 0; + } + assert(i > 0); + assert(j > 0); + if (i == j) { + return i; + } + int v1 = H[i]; + int v2 = H[j]; + if (v1 > v2) { + int tmp = v1; + v1 = v2; + v2 = tmp; + } + int rmq = RangeMinimumQuery(v1, v2); + assert(E[rmq] >= 0); + return E[rmq]; +} + +bool NcbiTaxonomy::IsAncestor(TaxID ancestor, TaxID child) { + if (ancestor == child) { + return true; + } + + if (ancestor == 0 || child == 0) { + return false; + } + + if (!nodeExists(child)) { + return false; + } + + if (!nodeExists(ancestor)) { + return false; + } + + return lcaHelper(nodeId(child), nodeId(ancestor)) == nodeId(ancestor); +} + + +TaxID NcbiTaxonomy::LCA(TaxID taxonA, TaxID taxonB) const { + if (!nodeExists(taxonA)) { + return taxonB; + } else if (!nodeExists(taxonB)) { + return taxonA; + } + return taxonNodes[lcaHelper(nodeId(taxonA), nodeId(taxonB))].taxId; +} + + +TaxonNode const * NcbiTaxonomy::LCA(const std::vector& taxa) const { + std::vector::const_iterator it = taxa.begin(); + while (it != taxa.end() && !nodeExists(*it)) { + Debug(Debug::WARNING) << "No node for taxID " << *it << ", ignoring it.\n"; + ++it; + } + if (it == taxa.end()) { return NULL; } + int red = nodeId(*it++); + for (; it != taxa.end(); ++it) { + if (nodeExists(*it)) { + red = lcaHelper(red, nodeId(*it)); + } else { + Debug(Debug::WARNING) << "No node for taxID " << *it << ", ignoring it.\n"; + } + } + + assert(red >= 0 && static_cast(red) < maxNodes); + + return &(taxonNodes[red]); +} + + +// AtRanks returns a slice of slices having the taxons at the specified taxonomic levels +std::vector NcbiTaxonomy::AtRanks(TaxonNode const *node, const std::vector &levels) const { + std::vector result; + std::map allRanks = AllRanks(node); + // map does not include "no rank" nor "no_rank" + const char* rank = getString(node->rankIdx); + int baseRankIndex = findRankIndex(rank); + std::string baseRank = "uc_"; + baseRank.append(getString(node->nameIdx)); + for (std::vector::const_iterator 
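computeSparseTable replaces InitRangeMinimumQuery but builds the same structure: over the Euler tour of the taxonomy (E holds node ids in visit order, L their depths, H the first visit of each node), M[i][j] stores the index of the minimum depth in L[i .. i + 2^j - 1], so RangeMinimumQuery can answer any interval in O(1) by combining two overlapping power-of-two blocks, and the LCA of nodes a and b is E[RMQ(H[a], H[b])]. A small self-contained sketch of the same table and query (not part of the patch), assuming a plain vector of depths:

    #include <cstddef>
    #include <utility>
    #include <vector>

    static size_t ilog2(size_t x) {            // floor(log2(x)) for x >= 1
        size_t r = 0;
        while (x >>= 1) { ++r; }
        return r;
    }

    struct SparseTable {
        std::vector<int> L;                    // depths along the Euler tour
        std::vector<std::vector<int>> M;       // M[i][j]: argmin of L[i .. i + 2^j - 1]

        explicit SparseTable(std::vector<int> depths) : L(std::move(depths)) {
            const size_t n = L.size();
            if (n == 0) { return; }
            const size_t k = ilog2(n) + 1;
            M.assign(n, std::vector<int>(k, 0));
            for (size_t i = 0; i < n; ++i) { M[i][0] = (int)i; }
            for (size_t j = 1; (1ull << j) <= n; ++j) {
                for (size_t i = 0; i + (1ull << j) - 1 < n; ++i) {
                    const int a = M[i][j - 1];
                    const int b = M[i + (1ull << (j - 1))][j - 1];
                    M[i][j] = (L[a] < L[b]) ? a : b;
                }
            }
        }

        // Index of the minimum depth in L[i..j] (inclusive, 0 <= i <= j < L.size()).
        int query(int i, int j) const {
            const int k = (int)ilog2((size_t)(j - i + 1));
            const int a = M[i][k];
            const int b = M[j - (1 << k) + 1][k];
            return (L[a] <= L[b]) ? a : b;
        }
    };

The table costs O(N log N) ints once at load time and makes every LCA lookup constant time, which is why it is precomputed (and serialized) rather than walking parent pointers per query.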
it = levels.begin(); it != levels.end(); ++it) { + std::map::iterator jt = allRanks.find(*it); + if (jt != allRanks.end()) { + result.emplace_back(jt->second); + continue; + } + + // If not ... 2 possible causes: i) too low level ("uc_") + if (NcbiRanks.at(*it) < baseRankIndex) { + result.emplace_back(baseRank); + continue; + } + + // ii) No taxon for the LCA at the required level -- give the first known upstream + result.emplace_back("unknown"); + } + return result; +} + +std::vector NcbiTaxonomy::parseRanks(const std::string& ranks) { + std::vector temp = Util::split(ranks, ","); + for (size_t i = 0; i < temp.size(); ++i) { + if (findRankIndex(temp[i]) == -1) { + Debug(Debug::ERROR) << "Invalid taxonomic rank " << temp[i] << "given\n"; + EXIT(EXIT_FAILURE); + } + } + return temp; +} + +int NcbiTaxonomy::findRankIndex(const std::string& rank) { + std::map::const_iterator it; + if ((it = NcbiRanks.find(rank)) != NcbiRanks.end()) { + return it->second; + } + return -1; +} + +char NcbiTaxonomy::findShortRank(const std::string& rank) { + std::map::const_iterator it; + if ((it = NcbiShortRanks.find(rank)) != NcbiShortRanks.end()) { + return it->second; + } + return '-'; +} + +std::string NcbiTaxonomy::taxLineage(TaxonNode const *node, bool infoAsName) { + std::vector taxLineageVec; + std::string taxLineage; + taxLineage.reserve(4096); + do { + taxLineageVec.push_back(node); + node = taxonNode(node->parentTaxId); + } while (node->parentTaxId != node->taxId); + + for (int i = taxLineageVec.size() - 1; i >= 0; --i) { + if (infoAsName) { + taxLineage += findShortRank(getString(taxLineageVec[i]->rankIdx)); + taxLineage += '_'; + taxLineage += getString(taxLineageVec[i]->nameIdx); + } else { + taxLineage += SSTR(taxLineageVec[i]->taxId); + } + + if (i > 0) { + taxLineage += ";"; + } + } + return taxLineage; +} + +int NcbiTaxonomy::nodeId(TaxID taxonId) const { + if (taxonId < 0 || !nodeExists(taxonId)) { + Debug(Debug::ERROR) << "Invalid node " << taxonId << "!\n"; + EXIT(EXIT_FAILURE); + } + return D[taxonId]; +} + +bool NcbiTaxonomy::nodeExists(TaxID taxonId) const { + return taxonId <= maxTaxID && D[taxonId] != -1; +} + +TaxonNode const * NcbiTaxonomy::taxonNode(TaxID taxonId, bool fail) const { + if (taxonId == 0 || (!fail && !nodeExists(taxonId))) { + return NULL; + } + return &(taxonNodes[nodeId(taxonId)]); +} + +std::map NcbiTaxonomy::AllRanks(TaxonNode const *node) const { + std::map result; + while (true) { + std::string rank = getString(node->rankIdx); + std::string name = getString(node->nameIdx); + if (node->taxId == 1) { + result.emplace(rank, name); + return result; + } + + if ((rank != "no_rank") && (rank != "no rank")) { + result.emplace(rank, name); + } + + node = taxonNode(node->parentTaxId); + } +} + +size_t NcbiTaxonomy::loadMerged(const std::string &mergedFile) { + Debug(Debug::INFO) << "Loading merged file ..."; + std::ifstream ss(mergedFile); + if (ss.fail()) { + Debug(Debug::ERROR) << "File " << mergedFile << " not found!\n"; + EXIT(EXIT_FAILURE); + } + + std::unordered_map mergedMap; + TaxID localMaxTaxID = maxTaxID; + std::string line; + while (std::getline(ss, line)) { + std::vector result = splitByDelimiter(line, "\t|\t", 2); + if (result.size() != 2) { + Debug(Debug::ERROR) << "Invalid name entry!\n"; + EXIT(EXIT_FAILURE); + } + + TaxID oldId = (TaxID) strtoul(result[0].c_str(), NULL, 10); + TaxID mergedId = (TaxID) strtoul(result[1].c_str(), NULL, 10); + + // Only update if the oldId doesn't exist yet AND the mergedId does exist + if (!nodeExists(oldId) && 
nodeExists(mergedId)) { + if (oldId > localMaxTaxID) { + localMaxTaxID = oldId; + } + if (mergedId > localMaxTaxID) { + localMaxTaxID = mergedId; + } + mergedMap[oldId] = mergedId; + } + } + + // realloc D if we find a higher maxTaxID + if (localMaxTaxID > maxTaxID) { + int* newD = new int[localMaxTaxID + 1]; + std::copy(D, D + maxTaxID + 1, newD); + std::fill(newD + maxTaxID + 1, newD + (localMaxTaxID + 1), -1); + delete[] D; + D = newD; + maxTaxID = localMaxTaxID; + } + + size_t count = 0; + for (std::unordered_map::iterator it = mergedMap.begin(); it != mergedMap.end(); ++it) { + D[it->first] = D[it->second]; + ++count; + } + Debug(Debug::INFO) << " Done, added " << count << " merged nodes.\n"; + return count; +} + +std::unordered_map NcbiTaxonomy::getCladeCounts(std::unordered_map& taxonCounts) const { + Debug(Debug::INFO) << "Calculating clade counts ... "; + std::unordered_map cladeCounts; + + for (std::unordered_map::const_iterator it = taxonCounts.begin(); it != taxonCounts.end(); ++it) { + cladeCounts[it->first].taxCount = it->second; + cladeCounts[it->first].cladeCount += it->second; + if (nodeExists(it->first)) { + TaxonNode const* taxon = taxonNode(it->first); + while (taxon->parentTaxId != taxon->taxId && nodeExists(taxon->parentTaxId)) { + taxon = taxonNode(taxon->parentTaxId); + cladeCounts[taxon->taxId].cladeCount += it->second; + } + } + } + + for (size_t i = 0; i < maxNodes; ++i) { + TaxonNode& tn = taxonNodes[i]; + if (tn.parentTaxId != tn.taxId && cladeCounts.count(tn.taxId)) { + std::unordered_map::iterator itp = cladeCounts.find(tn.parentTaxId); + itp->second.children.push_back(tn.taxId); + } + } + + Debug(Debug::INFO) << " Done\n"; + return cladeCounts; +} + +NcbiTaxonomy * NcbiTaxonomy::openTaxonomy(const std::string &database){ + std::string binFile = database + "_taxonomy"; + if (FileUtil::fileExists(binFile.c_str())) { + FILE* handle = fopen(binFile.c_str(), "r"); + struct stat sb; + if (fstat(fileno(handle), &sb) < 0) { + Debug(Debug::ERROR) << "Failed to fstat file " << binFile << "\n"; + EXIT(EXIT_FAILURE); + } + char* data = (char*)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fileno(handle), 0); + if (data == MAP_FAILED){ + Debug(Debug::ERROR) << "Failed to mmap file " << binFile << " with error " << errno << "\n"; + EXIT(EXIT_FAILURE); + } + fclose(handle); + NcbiTaxonomy* t = NcbiTaxonomy::unserialize(data); + if (t != NULL) { + t->mmapData = data; + t->mmapSize = sb.st_size; + return t; + } else { + Debug(Debug::WARNING) << "Outdated taxonomy information, please recreate with createtaxdb.\n"; + } + } + Debug(Debug::INFO) << "Loading NCBI taxonomy\n"; + std::string nodesFile = database + "_nodes.dmp"; + std::string namesFile = database + "_names.dmp"; + std::string mergedFile = database + "_merged.dmp"; + if (FileUtil::fileExists(nodesFile.c_str()) + && FileUtil::fileExists(namesFile.c_str()) + && FileUtil::fileExists(mergedFile.c_str())) { + } else if (FileUtil::fileExists("nodes.dmp") + && FileUtil::fileExists("names.dmp") + && FileUtil::fileExists("merged.dmp")) { + nodesFile = "nodes.dmp"; + namesFile = "names.dmp"; + mergedFile = "merged.dmp"; + } else { + Debug(Debug::ERROR) << "names.dmp, nodes.dmp, merged.dmp from NCBI taxdump could not be found!\n"; + EXIT(EXIT_FAILURE); + } + return new NcbiTaxonomy(namesFile, nodesFile, mergedFile); +} + +const TaxID ROOT_TAXID = 1; +const int ROOT_RANK = INT_MAX; + +struct TaxNode { + TaxNode(const double weight, const bool isCandidate, const TaxID childTaxon) + : weight(weight), isCandidate(isCandidate), 
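The reworked loadMerged first collects all old-to-merged pairs and only then patches D: if merged.dmp refers to taxon IDs above the maximum seen in nodes.dmp, D is grown so those IDs can be mapped instead of being dropped. A minimal sketch of the grow-and-remap pattern (not part of the patch; growMapping is an illustrative name, unknown slots are filled with -1 as in the code above):

    #include <algorithm>

    // Grow the TaxID -> internal node-ID array, keeping existing entries and
    // marking the newly added range as unknown (-1).
    static int *growMapping(int *D, int oldMaxTaxID, int newMaxTaxID) {
        if (newMaxTaxID <= oldMaxTaxID) {
            return D;
        }
        int *newD = new int[newMaxTaxID + 1];
        std::copy(D, D + oldMaxTaxID + 1, newD);
        std::fill(newD + oldMaxTaxID + 1, newD + newMaxTaxID + 1, -1);
        delete[] D;
        return newD;
    }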
childTaxon(childTaxon) {} + + void update(const double weightToAdd, const TaxID & childTaxonInput) { + if (childTaxon != childTaxonInput) { + isCandidate = true; + childTaxon = childTaxonInput; + } + weight += weightToAdd; + } + + double weight; + bool isCandidate; + TaxID childTaxon; +}; + +const char* NcbiTaxonomy::getString(size_t blockIdx) const { + return block->getString(blockIdx); +} + +WeightedTaxHit::WeightedTaxHit(const TaxID taxon, const float evalue, const int weightVoteMode) : taxon(taxon) { + switch (weightVoteMode) { + case Parameters::AGG_TAX_UNIFORM: + weight = 1.0; + break; + case Parameters::AGG_TAX_MINUS_LOG_EVAL: + weight = evalue; + if (evalue != FLT_MAX) { + if (evalue > 0) { + weight = -log(evalue); + } else { + weight = MAX_TAX_WEIGHT; + } + } + break; + case Parameters::AGG_TAX_SCORE: + weight = evalue; + break; + default: + Debug(Debug::ERROR) << "Invalid weight vote mode\n"; + EXIT(EXIT_FAILURE); + } +} + +WeightedTaxResult NcbiTaxonomy::weightedMajorityLCA(const std::vector &setTaxa, const float majorityCutoff) { + // count num occurences of each ancestor, possibly weighted + std::map ancTaxIdsCounts; + + // initialize counters and weights + size_t assignedSeqs = 0; + size_t unassignedSeqs = 0; + double totalAssignedSeqsWeights = 0.0; + + for (size_t i = 0; i < setTaxa.size(); ++i) { + TaxID currTaxId = setTaxa[i].taxon; + double currWeight = setTaxa[i].weight; + // ignore unassigned sequences + if (currTaxId == 0) { + unassignedSeqs++; + continue; + } + TaxonNode const *node = taxonNode(currTaxId, false); + if (node == NULL) { + unassignedSeqs++; + continue; + } + totalAssignedSeqsWeights += currWeight; + assignedSeqs++; + + // each start of a path due to an orf is a candidate + std::map::iterator it; + if ((it = ancTaxIdsCounts.find(currTaxId)) != ancTaxIdsCounts.end()) { + it->second.update(currWeight, 0); + } else { + TaxNode current(currWeight, true, 0); + ancTaxIdsCounts.emplace(currTaxId, current); + } + + // iterate all ancestors up to root (including). 
add currWeight and candidate status to each + TaxID currParentTaxId = node->parentTaxId; + while (currParentTaxId != currTaxId) { + if ((it = ancTaxIdsCounts.find(currParentTaxId)) != ancTaxIdsCounts.end()) { + it->second.update(currWeight, currTaxId); + } else { + TaxNode parent(currWeight, false, currTaxId); + ancTaxIdsCounts.emplace(currParentTaxId, parent); + } + // move up + currTaxId = currParentTaxId; + node = taxonNode(currParentTaxId, false); + currParentTaxId = node->parentTaxId; + } + } + + TaxID selctedTaxon = 0; + if (totalAssignedSeqsWeights == 0) { + return WeightedTaxResult(selctedTaxon, assignedSeqs, unassignedSeqs, 0, 0.0); + } + + // select the lowest ancestor that meets the cutoff + int minRank = INT_MAX; + double selectedPercent = 0; + for (std::map::iterator it = ancTaxIdsCounts.begin(); it != ancTaxIdsCounts.end(); it++) { + // consider only candidates + if (it->second.isCandidate == false) { + continue; + } + + double currPercent = it->second.weight / totalAssignedSeqsWeights; + if (currPercent >= majorityCutoff) { + // iterate all ancestors to find lineage min rank (the candidate is a descendant of a node with this rank) + TaxID currTaxId = it->first; + TaxonNode const *node = taxonNode(currTaxId, false); + int currMinRank = ROOT_RANK; + TaxID currParentTaxId = node->parentTaxId; + while (currParentTaxId != currTaxId) { + int currRankInd = NcbiTaxonomy::findRankIndex(getString(node->rankIdx)); + if ((currRankInd > 0) && (currRankInd < currMinRank)) { + currMinRank = currRankInd; + // the rank can only go up on the way to the root, so we can break + break; + } + // move up: + currTaxId = currParentTaxId; + node = taxonNode(currParentTaxId, false); + currParentTaxId = node->parentTaxId; + } + + if ((currMinRank < minRank) || ((currMinRank == minRank) && (currPercent > selectedPercent))) { + selctedTaxon = it->first; + minRank = currMinRank; + selectedPercent = currPercent; + } + } + } + + // count the number of seqs who have selectedTaxon in their ancestors (agree with selection): + if (selctedTaxon == ROOT_TAXID) { + // all agree with "root" + return WeightedTaxResult(selctedTaxon, assignedSeqs, unassignedSeqs, assignedSeqs, selectedPercent); + } + if (selctedTaxon == 0) { + // nothing informative + return WeightedTaxResult(selctedTaxon, assignedSeqs, unassignedSeqs, 0, selectedPercent); + } + size_t seqsAgreeWithSelectedTaxon = 0; + // otherwise, iterate over all seqs + for (size_t i = 0; i < setTaxa.size(); ++i) { + TaxID currTaxId = setTaxa[i].taxon; + // ignore unassigned sequences + if (currTaxId == 0) { + continue; + } + TaxonNode const *node = taxonNode(currTaxId, false); + if (node == NULL) { + continue; + } + + // iterate all ancestors up to the root + TaxID currParentTaxId = node->parentTaxId; + while (currParentTaxId != currTaxId) { + if (currTaxId == selctedTaxon) { + seqsAgreeWithSelectedTaxon++; + break; + } + currTaxId = currParentTaxId; + node = taxonNode(currParentTaxId, false); + currParentTaxId = node->parentTaxId; + } + } + + return WeightedTaxResult(selctedTaxon, assignedSeqs, unassignedSeqs, seqsAgreeWithSelectedTaxon, selectedPercent); +} + +std::pair NcbiTaxonomy::serialize(const NcbiTaxonomy& t) { + t.block->compact(); + size_t matrixDim = (t.maxNodes * 2); + size_t matrixK = (int)(MathUtil::flog2(matrixDim)) + 1; + size_t matrixSize = matrixDim * matrixK * sizeof(int); + size_t blockSize = StringBlock::memorySize(*t.block); + size_t memSize = sizeof(int) // SERIALIZATION_VERSION + + sizeof(size_t) // maxNodes + + sizeof(int) // maxTaxID + + 
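weightedMajorityLCA adds each hit's weight to its own taxon and to every ancestor up to the root, then selects, among the candidate nodes whose accumulated share of the total weight reaches majorityCutoff, the one whose lineage carries the most specific rank; the new code also tolerates hits whose taxon ID has no node by counting them as unassigned instead of aborting. With the AGG_TAX_MINUS_LOG_EVAL mode the per-hit weight is derived from the e-value as in the WeightedTaxHit constructor above; a minimal sketch of that weighting (not part of the patch; minusLogEvalueWeight is an illustrative name):

    #include <cfloat>
    #include <cmath>

    // Smaller e-values vote more strongly; an e-value of 0 is capped, and FLT_MAX
    // is kept as a sentinel for a missing e-value, mirroring the constructor above.
    static double minusLogEvalueWeight(float evalue, double maxWeight /* MAX_TAX_WEIGHT */) {
        if (evalue == FLT_MAX) {
            return evalue;
        }
        return (evalue > 0.0f) ? -std::log((double)evalue) : maxWeight;
    }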
t.maxNodes * sizeof(TaxonNode) // taxonNodes + + (t.maxTaxID + 1) * sizeof(int) // D + + 2 * (t.maxNodes * 2) * sizeof(int) // E,L + + t.maxNodes * sizeof(int) // H + + matrixSize // M + + blockSize; // block + + char* mem = (char*) malloc(memSize); + char* p = mem; + memcpy(p, &t.SERIALIZATION_VERSION, sizeof(int)); + p += sizeof(int); + memcpy(p, &t.maxNodes, sizeof(size_t)); + p += sizeof(size_t); + memcpy(p, &t.maxTaxID, sizeof(int)); + p += sizeof(int); + memcpy(p, t.taxonNodes, t.maxNodes * sizeof(TaxonNode)); + p += t.maxNodes * sizeof(TaxonNode); + memcpy(p, t.D, (t.maxTaxID + 1) * sizeof(int)); + p += (t.maxTaxID + 1) * sizeof(int); + memcpy(p, t.E, (t.maxNodes * 2) * sizeof(int)); + p += (t.maxNodes * 2) * sizeof(int); + memcpy(p, t.L, (t.maxNodes * 2) * sizeof(int)); + p += (t.maxNodes * 2) * sizeof(int); + memcpy(p, t.H, t.maxNodes * sizeof(int)); + p += t.maxNodes * sizeof(int); + memcpy(p, t.M[0], matrixSize); + p += matrixSize; + char* blockData = StringBlock::serialize(*t.block); + memcpy(p, blockData, blockSize); + p += blockSize; + free(blockData); + return std::make_pair(mem, memSize); +} + +NcbiTaxonomy* NcbiTaxonomy::unserialize(char* mem) { + const char* p = mem; + int version = *((int*)p); + p += sizeof(int); + if (version != NcbiTaxonomy::SERIALIZATION_VERSION) { + return NULL; + } + size_t maxNodes = *((size_t*)p); + p += sizeof(size_t); + int maxTaxID = *((int*)p); + p += sizeof(int); + TaxonNode* taxonNodes = (TaxonNode*)p; + p += maxNodes * sizeof(TaxonNode); + int* D = (int*)p; + p += (maxTaxID + 1) * sizeof(int); + int* E = (int*)p; + p += (maxNodes * 2) * sizeof(int); + int* L = (int*)p; + p += (maxNodes * 2) * sizeof(int); + int* H = (int*)p; + p += maxNodes * sizeof(int); + size_t matrixDim = (maxNodes * 2); + size_t matrixK = (int)(MathUtil::flog2(matrixDim)) + 1; + size_t matrixSize = matrixDim * matrixK * sizeof(int); + int** M = new int*[matrixDim]; + M[0] = (int*)p; + for(size_t i = 1; i < matrixDim; i++) { + M[i] = M[i-1] + matrixK; + } + p += matrixSize; + StringBlock* block = StringBlock::unserialize(p); + return new NcbiTaxonomy(taxonNodes, maxNodes, maxTaxID, D, E, L, H, M, block); +} diff --git a/src/taxonomy/NcbiTaxonomy.h b/src/taxonomy/NcbiTaxonomy.h index 46a3fd0c2..0103822d1 100644 --- a/src/taxonomy/NcbiTaxonomy.h +++ b/src/taxonomy/NcbiTaxonomy.h @@ -1,154 +1,149 @@ -// Ported from blast2lca -// Copyright: 2010 Miguel Pignatelli -// License: GPLv2 or later -// https://github.com/emepyc/Blast2lca - -#ifndef MMSEQS_NCBITAXONOMY_H -#define MMSEQS_NCBITAXONOMY_H - -#include "StringBlock.h" - -#include -#include -#include -#include - -typedef int TaxID; - -struct TaxonNode { -public: - int id; - TaxID taxId; - TaxID parentTaxId; - size_t rankIdx; - size_t nameIdx; - - TaxonNode() {}; - - TaxonNode(int id, TaxID taxId, TaxID parentTaxId, size_t rankIdx, size_t nameIdx) - : id(id), taxId(taxId), parentTaxId(parentTaxId), rankIdx(rankIdx), nameIdx(nameIdx) {}; -}; - -const double MAX_TAX_WEIGHT = 1000; -struct WeightedTaxHit { - WeightedTaxHit(const TaxID taxon, const float evalue, const int weightVoteMode); - - TaxID taxon; - double weight; -}; - -struct WeightedTaxResult { - WeightedTaxResult(TaxID taxon, size_t assignedSeqs, size_t unassignedSeqs, size_t seqsAgreeWithSelectedTaxon, double selectedPercent) - : taxon(taxon), assignedSeqs(assignedSeqs), unassignedSeqs(unassignedSeqs), seqsAgreeWithSelectedTaxon(seqsAgreeWithSelectedTaxon), selectedPercent(selectedPercent) {}; - - TaxID taxon; - size_t assignedSeqs; - size_t unassignedSeqs; - 
size_t seqsAgreeWithSelectedTaxon; - double selectedPercent; -}; - -struct TaxonCounts { - unsigned int taxCount; // number of reads/sequences matching to taxa - unsigned int cladeCount; // number of reads/sequences matching to taxa or its children - std::vector children; // list of children -}; - -static const std::map NcbiRanks = {{ "forma", 1 }, - { "varietas", 2 }, - { "subspecies", 3 }, - { "species", 4 }, - { "species subgroup", 5 }, - { "species group", 6 }, - { "subgenus", 7 }, - { "genus", 8 }, - { "subtribe", 9 }, - { "tribe", 10 }, - { "subfamily", 11 }, - { "family", 12 }, - { "superfamily", 13 }, - { "parvorder", 14 }, - { "infraorder", 15 }, - { "suborder", 16 }, - { "order", 17 }, - { "superorder", 18 }, - { "infraclass", 19 }, - { "subclass", 20 }, - { "class", 21 }, - { "superclass", 22 }, - { "subphylum", 23 }, - { "phylum", 24 }, - { "superphylum", 25 }, - { "subkingdom", 26 }, - { "kingdom", 27 }, - { "superkingdom", 28 }}; - -static const std::map NcbiShortRanks = {{ "species", 's' }, - { "genus", 'g' }, - { "family", 'f' }, - { "order", 'o' }, - { "class", 'c' }, - { "phylum", 'p' }, - { "kingdom", 'k' }, - { "superkingdom", 'd' }}; - -class NcbiTaxonomy { -public: - static NcbiTaxonomy* openTaxonomy(const std::string &database); - NcbiTaxonomy(const std::string &namesFile, const std::string &nodesFile, const std::string &mergedFile); - ~NcbiTaxonomy(); - - TaxonNode const * LCA(const std::vector& taxa) const; - TaxID LCA(TaxID taxonA, TaxID taxonB) const; - std::vector AtRanks(TaxonNode const * node, const std::vector &levels) const; - std::map AllRanks(TaxonNode const *node) const; - std::string taxLineage(TaxonNode const *node, bool infoAsName = true); - - static std::vector parseRanks(const std::string& ranks); - static int findRankIndex(const std::string& rank); - static char findShortRank(const std::string& rank); - - bool IsAncestor(TaxID ancestor, TaxID child); - TaxonNode const* taxonNode(TaxID taxonId, bool fail = true) const; - bool nodeExists(TaxID taxId) const; - - std::unordered_map getCladeCounts(std::unordered_map& taxonCounts) const; - - WeightedTaxResult weightedMajorityLCA(const std::vector &setTaxa, const float majorityCutoff); - - const char* getString(size_t blockIdx) const; - - static std::pair serialize(const NcbiTaxonomy& taxonomy); - static NcbiTaxonomy* unserialize(char* data); - - TaxonNode* taxonNodes; - size_t maxNodes; -private: - size_t loadNodes(std::vector &tmpNodes, const std::string &nodesFile); - size_t loadMerged(const std::string &mergedFile); - void loadNames(std::vector &tmpNodes, const std::string &namesFile); - void elh(std::vector> const & children, int node, int level, std::vector &tmpE, std::vector &tmpL); - void InitRangeMinimumQuery(); - int nodeId(TaxID taxId) const; - - int RangeMinimumQuery(int i, int j) const; - int lcaHelper(int i, int j) const; - - NcbiTaxonomy(TaxonNode* taxonNodes, size_t maxNodes, int maxTaxID, int *D, int *E, int *L, int *H, int **M, StringBlock *block) - : taxonNodes(taxonNodes), maxNodes(maxNodes), maxTaxID(maxTaxID), D(D), E(E), L(L), H(H), M(M), block(block), externalData(true), mmapData(NULL), mmapSize(0) {}; - int maxTaxID; - int *D; // maps from taxID to node ID in taxonNodes - int *E; // for Euler tour sequence (size 2N-1) - int *L; // Level of nodes in tour sequence (size 2N-1) - int *H; - int **M; - StringBlock* block; - - bool externalData; - char* mmapData; - size_t mmapSize; - - static const int SERIALIZATION_VERSION; -}; - -#endif +#ifndef MMSEQS_NCBITAXONOMY_H +#define 
MMSEQS_NCBITAXONOMY_H + +#include "StringBlock.h" + +#include +#include +#include +#include + +typedef int TaxID; + +struct TaxonNode { +public: + int id; + TaxID taxId; + TaxID parentTaxId; + size_t rankIdx; + size_t nameIdx; + + TaxonNode() {}; + + TaxonNode(int id, TaxID taxId, TaxID parentTaxId, size_t rankIdx, size_t nameIdx) + : id(id), taxId(taxId), parentTaxId(parentTaxId), rankIdx(rankIdx), nameIdx(nameIdx) {}; +}; + +const double MAX_TAX_WEIGHT = 1000; +struct WeightedTaxHit { + WeightedTaxHit(const TaxID taxon, const float evalue, const int weightVoteMode); + + TaxID taxon; + double weight; +}; + +struct WeightedTaxResult { + WeightedTaxResult(TaxID taxon, size_t assignedSeqs, size_t unassignedSeqs, size_t seqsAgreeWithSelectedTaxon, double selectedPercent) + : taxon(taxon), assignedSeqs(assignedSeqs), unassignedSeqs(unassignedSeqs), seqsAgreeWithSelectedTaxon(seqsAgreeWithSelectedTaxon), selectedPercent(selectedPercent) {}; + + TaxID taxon; + size_t assignedSeqs; + size_t unassignedSeqs; + size_t seqsAgreeWithSelectedTaxon; + double selectedPercent; +}; + +struct TaxonCounts { + unsigned int taxCount; // number of reads/sequences matching to taxa + unsigned int cladeCount; // number of reads/sequences matching to taxa or its children + std::vector children; // list of children +}; + +static const std::map NcbiRanks = {{ "forma", 1 }, + { "varietas", 2 }, + { "subspecies", 3 }, + { "species", 4 }, + { "species subgroup", 5 }, + { "species group", 6 }, + { "subgenus", 7 }, + { "genus", 8 }, + { "subtribe", 9 }, + { "tribe", 10 }, + { "subfamily", 11 }, + { "family", 12 }, + { "superfamily", 13 }, + { "parvorder", 14 }, + { "infraorder", 15 }, + { "suborder", 16 }, + { "order", 17 }, + { "superorder", 18 }, + { "infraclass", 19 }, + { "subclass", 20 }, + { "class", 21 }, + { "superclass", 22 }, + { "subphylum", 23 }, + { "phylum", 24 }, + { "superphylum", 25 }, + { "subkingdom", 26 }, + { "kingdom", 27 }, + { "superkingdom", 28 }}; + +static const std::map NcbiShortRanks = {{ "species", 's' }, + { "genus", 'g' }, + { "family", 'f' }, + { "order", 'o' }, + { "class", 'c' }, + { "phylum", 'p' }, + { "kingdom", 'k' }, + { "superkingdom", 'd' }}; + +class NcbiTaxonomy { +public: + static NcbiTaxonomy* openTaxonomy(const std::string &database); + NcbiTaxonomy(const std::string &namesFile, const std::string &nodesFile, const std::string &mergedFile); + ~NcbiTaxonomy(); + + TaxonNode const * LCA(const std::vector& taxa) const; + TaxID LCA(TaxID taxonA, TaxID taxonB) const; + std::vector AtRanks(TaxonNode const * node, const std::vector &levels) const; + std::map AllRanks(TaxonNode const *node) const; + std::string taxLineage(TaxonNode const *node, bool infoAsName = true); + + static std::vector parseRanks(const std::string& ranks); + static int findRankIndex(const std::string& rank); + static char findShortRank(const std::string& rank); + + bool IsAncestor(TaxID ancestor, TaxID child); + TaxonNode const* taxonNode(TaxID taxonId, bool fail = true) const; + bool nodeExists(TaxID taxId) const; + + std::unordered_map getCladeCounts(std::unordered_map& taxonCounts) const; + + WeightedTaxResult weightedMajorityLCA(const std::vector &setTaxa, const float majorityCutoff); + + const char* getString(size_t blockIdx) const; + + static std::pair serialize(const NcbiTaxonomy& taxonomy); + static NcbiTaxonomy* unserialize(char* data); + + TaxonNode* taxonNodes; + size_t maxNodes; +private: + size_t loadNodes(std::vector &tmpNodes, const std::string &nodesFile); + size_t loadMerged(const std::string 
&mergedFile); + void loadNames(std::vector &tmpNodes, const std::string &namesFile); + void elh(std::vector> const & children, int node, int level, std::vector &tmpE, std::vector &tmpL); + void computeSparseTable(); + int nodeId(TaxID taxId) const; + + int RangeMinimumQuery(int i, int j) const; + int lcaHelper(int i, int j) const; + + NcbiTaxonomy(TaxonNode* taxonNodes, size_t maxNodes, int maxTaxID, int *D, int *E, int *L, int *H, int **M, StringBlock *block) + : taxonNodes(taxonNodes), maxNodes(maxNodes), maxTaxID(maxTaxID), D(D), E(E), L(L), H(H), M(M), block(block), externalData(true), mmapData(NULL), mmapSize(0) {}; + int maxTaxID; + int *D; // maps from taxID to node ID in taxonNodes + int *E; // for Euler tour sequence (size 2N-1) + int *L; // Level of nodes in tour sequence (size 2N-1) + int *H; + int **M; + StringBlock* block; + + bool externalData; + char* mmapData; + size_t mmapSize; + + static const int SERIALIZATION_VERSION; +}; + +#endif diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 518e89b62..faca7f86a 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -33,6 +33,7 @@ set(TESTS TestUtil.cpp TestKsw2.cpp TestBestAlphabet.cpp + TestUngappedCpuPerf.cpp ) diff --git a/src/test/TestTanTan.cpp b/src/test/TestTanTan.cpp index 056706989..2a66f8c9c 100644 --- a/src/test/TestTanTan.cpp +++ b/src/test/TestTanTan.cpp @@ -26,7 +26,7 @@ int main (int, const char**) { Sequence refSeq(10000, 0, &subMat, kmer_size, false, true); refSeq.mapSequence(0, 0, ref, strlen(ref)); - char hardMaskTable[256]; + unsigned char hardMaskTable[256]; std::fill_n(hardMaskTable, 256, subMat.aa2num[(int) 'X']); double probMatrix[21][21]; @@ -40,12 +40,11 @@ int main (int, const char**) { } //std::cout << std::endl; } - char refInt[100000]; - + unsigned char refInt[100000]; for(size_t i = 0; i < 100000; i++){ for(int i = 0; i < refSeq.L; i++){ - refInt[i] = (char) refSeq.numSequence[i]; + refInt[i] = (unsigned char) refSeq.numSequence[i]; } tantan::maskSequences(refInt, refInt+len, 50 /*options.maxCycleLength*/, probMatrixPointers, diff --git a/src/test/TestUngappedCpuPerf.cpp b/src/test/TestUngappedCpuPerf.cpp new file mode 100644 index 000000000..accfc340b --- /dev/null +++ b/src/test/TestUngappedCpuPerf.cpp @@ -0,0 +1,108 @@ +#include "Util.h" +#include "Parameters.h" +#include "Sequence.h" +#include "SubstitutionMatrix.h" +#include "StripedSmithWaterman.h" + +#include + +#ifdef OPENMP +#include +#endif + +const char* binary_name = "test_ungappedcpuperf"; +DEFAULT_PARAMETER_SINGLETON_INIT + +#define AA_ALPHABET "ACDEFGHIKLMNPQRSTVWY" +#define ALPHABET_SIZE 20 + +void generateSequence(char *sequence, int length, unsigned int *seedp) { + for (int i = 0; i < length; i++) { + sequence[i] = AA_ALPHABET[rand_r(seedp) % ALPHABET_SIZE]; + } + sequence[length] = '\0'; +} + +void generateNumSequence(unsigned char *sequence, int length, unsigned int *seedp) { + for (int i = 0; i < length; i++) { + sequence[i] = rand_r(seedp) % ALPHABET_SIZE; + } +} + +int main (int, const char**) { + Parameters& par = Parameters::getInstance(); + par.initMatrices(); + + SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, 0.0); + int8_t* tinySubMat = new int8_t[subMat.alphabetSize * subMat.alphabetSize]; + for (int i = 0; i < subMat.alphabetSize; i++) { + for (int j = 0; j < subMat.alphabetSize; j++) { + tinySubMat[i*subMat.alphabetSize + j] = subMat.subMatrix[i][j]; + } + } + + size_t targets = 5000000; + + std::vector benchSizes = { + 32, 64, 96, 128, 160, 192, 224, 
256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1152, 1280, 1408, 1536, 1664, 1792, 1920, 2048 + }; + + for (auto seqLen : benchSizes) { + char *seq = (char *)malloc((seqLen + 1) * sizeof(char)); + unsigned int qseed = 42; + generateSequence(seq, seqLen, &qseed); + + size_t score = 0; + double avgTime = 0; + for (size_t rep = 0; rep < 5; rep++){ + double repTime = 0; + size_t sanityCheck = 0; +#pragma omp parallel reduction(+:score,sanityCheck) reduction(max:repTime) +{ + unsigned int thread_idx = 0; + #ifdef OPENMP + thread_idx = (unsigned int) omp_get_thread_num(); + #endif + size_t ignore, total; + Util::decomposeDomain(targets, thread_idx, par.threads, &ignore, &total); + sanityCheck += total; + + SmithWaterman aligner(seqLen, subMat.alphabetSize, false, 1.0, Parameters::DBTYPE_AMINO_ACIDS); + Sequence qSeq(seqLen, Parameters::DBTYPE_AMINO_ACIDS, &subMat, 0, false, false); + qSeq.mapSequence(0, 0, seq, seqLen); + aligner.ssw_init(&qSeq, tinySubMat, &subMat); + + unsigned int tseed = 42 + thread_idx; + unsigned char** targetSeqs = new unsigned char*[total]; + for (size_t i = 0; i < total; i++) { + targetSeqs[i] = (unsigned char *)malloc(seqLen * sizeof(unsigned char)); + generateNumSequence(targetSeqs[i], seqLen, &tseed); + } + + struct timeval start; + struct timeval end; + gettimeofday(&start, NULL); + for (size_t i = 0; i < total; i++) { + score = aligner.ungapped_alignment(targetSeqs[i], seqLen); + } + gettimeofday(&end, NULL); + double diff = (end.tv_sec - start.tv_sec) + 1e-6 * (end.tv_usec - start.tv_usec); + repTime = diff; + + for (size_t i = 0; i < total; i++) { + free(targetSeqs[i]); + } + delete[] targetSeqs; +} + Debug(Debug::INFO) << "total: " << sanityCheck << "\n"; + avgTime += repTime; + } + avgTime /= 5.0f; + double cells = seqLen * seqLen * targets; + Debug(Debug::INFO) << score << "\t" << seqLen << "\t" << (cells / (avgTime * 1000000000.0f)) << "\n"; + free(seq); + } + + delete[] tinySubMat; +} + diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index c43c3356d..b740aa7d7 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -30,6 +30,7 @@ set(util_source_files util/filtera3m.cpp util/filterdb.cpp util/gff2db.cpp + util/gpuserver.cpp util/renamedbkeys.cpp util/makepaddedseqdb.cpp util/masksequence.cpp diff --git a/src/util/apply.cpp b/src/util/apply.cpp index 4e0f5f71a..e66e08352 100644 --- a/src/util/apply.cpp +++ b/src/util/apply.cpp @@ -23,99 +23,84 @@ int apply(int, const char **, const Command&) { #include #endif - -int pipe2_wrap(int fd[2], int flag) { - int ret = pipe(fd); - if (ret) { - return ret; - } - if (flag & O_CLOEXEC) { - if ( fcntl(fd[0], F_SETFD, FD_CLOEXEC) == -1 - || fcntl(fd[1], F_SETFD, FD_CLOEXEC) == -1) - { - return -1; - } +pid_t create_pipe( + const char* prog_path, + char** prog_argv, + char** local_environ, + int fd[2]) { + if (prog_path == NULL) { + return -1; } - if (flag & O_NONBLOCK) { - if ( fcntl(fd[0], F_SETFL, O_NONBLOCK) == -1 - || fcntl(fd[1], F_SETFL, O_NONBLOCK) == -1) - { - return -1; - } + if (prog_argv == NULL) { + return -1; } - return 0; -} -// Analogous to gnulib implementation -// https://www.gnu.org/software/gnulib/ -// Licensed under GPLv3 -pid_t create_pipe(const char *prog_path, char **prog_argv, char **environ, int fd[2]) { - int ifd[2]; - int ofd[2]; - - int err; - if ((err = pipe2_wrap(ifd, O_CLOEXEC)) != 0) { - perror("pipe ifd"); - errno = err; + int pipe1Ids[2]; + int res = pipe(pipe1Ids); + if (res != 0) { + perror("pipe failed"); return -1; } 
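// [Editor's note] The TestUngappedCpuPerf.cpp benchmark above reports throughput as DP-cell
// updates per second: cells = queryLen * targetLen * numTargets, divided by the wall-clock
// time of the scan and by 1e9 (GCUPS). A minimal standalone sketch of that calculation,
// assuming the same definitions; the helper name and the example numbers are illustrative
// only and are not part of this patch.
#include <cstdio>

static double gcups(double queryLen, double targetLen, double numTargets, double seconds) {
    const double cells = queryLen * targetLen * numTargets; // DP cells touched by the scan
    return cells / (seconds * 1e9);                         // giga cell updates per second
}

int main() {
    // e.g. a 512-residue query vs. 5,000,000 targets of length 512, finished in 120 s
    std::printf("%.2f GCUPS\n", gcups(512, 512, 5000000, 120.0));
    return 0;
}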
- if ((err = pipe2_wrap(ofd, O_CLOEXEC)) != 0) { - perror("pipe ofd"); - errno = err; + + if (fcntl(pipe1Ids[0], F_SETFD, FD_CLOEXEC) == -1 || fcntl(pipe1Ids[1], F_SETFD, FD_CLOEXEC) == -1) { + perror("fcntl failed"); + close(pipe1Ids[0]); + close(pipe1Ids[1]); return -1; } + int pipe2Ids[2]; + res = pipe(pipe2Ids); + if (res != 0) { + perror("pipe failed"); + close(pipe1Ids[0]); + close(pipe1Ids[1]); + return -1; + } - int actions_allocated = 0; - int attrs_allocated = 0; + if (fcntl(pipe2Ids[0], F_SETFD, FD_CLOEXEC) == -1 || fcntl(pipe2Ids[1], F_SETFD, FD_CLOEXEC) == -1) { + perror("fcntl failed"); + close(pipe1Ids[0]); + close(pipe1Ids[1]); + close(pipe2Ids[0]); + close(pipe2Ids[1]); + return -1; + } posix_spawn_file_actions_t actions; - posix_spawnattr_t attrs; - pid_t child; - - if ((err = posix_spawn_file_actions_init(&actions)) != 0 - || (actions_allocated = 1, - (err = posix_spawn_file_actions_adddup2 (&actions, ofd[0], STDIN_FILENO)) != 0 - || (err = posix_spawn_file_actions_adddup2 (&actions, ifd[1], STDOUT_FILENO)) != 0 - #ifdef POSIX_SPAWN_USEVFORK - || ((err = posix_spawnattr_init(&attrs)) != 0 - || (attrs_allocated = 1, - (err = posix_spawnattr_setflags(&attrs, POSIX_SPAWN_USEVFORK)) != 0)) - #endif - || (err = posix_spawnp(&child, prog_path, &actions, attrs_allocated ? &attrs : NULL, prog_argv, environ)) != 0)) - { - perror("fail"); - errno = err; - - if (actions_allocated) { - posix_spawn_file_actions_destroy(&actions); - } - if (attrs_allocated) { - posix_spawnattr_destroy(&attrs); - } - - close(ifd[0]); - close(ifd[1]); - close(ofd[0]); - close(ofd[1]); - + posix_spawn_file_actions_init(&actions); + if (posix_spawn_file_actions_adddup2(&actions, pipe2Ids[0], STDIN_FILENO) != 0 + || posix_spawn_file_actions_adddup2(&actions, pipe1Ids[1], STDOUT_FILENO) != 0) { + perror("posix_spawn_file_actions failed"); + posix_spawn_file_actions_destroy(&actions); + close(pipe1Ids[0]); + close(pipe1Ids[1]); + close(pipe2Ids[0]); + close(pipe2Ids[1]); + return -1; + } + + int pid; + res = posix_spawnp(&pid, prog_path, &actions, NULL, prog_argv, local_environ); + if (res != 0) { + perror("posix_spawn failed"); + close(pipe1Ids[0]); + close(pipe1Ids[1]); + close(pipe2Ids[0]); + close(pipe2Ids[1]); return -1; } - posix_spawn_file_actions_destroy(&actions); - if (attrs_allocated) { - posix_spawnattr_destroy(&attrs); - } - if ((err = close(ofd[0])) == -1 || (err = close(ifd[1])) == -1) { + if (close(pipe2Ids[0]) != 0 || close(pipe1Ids[1]) != 0) { perror("close"); - errno = err; return -1; } - fd[0] = ifd[0]; - fd[1] = ofd[1]; - return child; + fd[0] = pipe1Ids[0]; + fd[1] = pipe2Ids[1]; + + return pid; } int apply_by_entry(char* data, size_t size, unsigned int key, DBWriter& writer, @@ -132,10 +117,8 @@ int apply_by_entry(char* data, size_t size, unsigned int key, DBWriter& writer, return -1; } - // Analogous to gnulib implementation size_t written = 0; int error = 0; - char buffer[PIPE_BUF]; writer.writeStart(proc_idx); struct pollfd plist[2]; diff --git a/src/util/convertalignments.cpp b/src/util/convertalignments.cpp index 79fb63b56..d0e8d6c15 100644 --- a/src/util/convertalignments.cpp +++ b/src/util/convertalignments.cpp @@ -337,9 +337,6 @@ int convertalignments(int argc, const char **argv, const Command &command) { std::string queryProfData; queryProfData.reserve(1024); - std::string queryBuffer; - queryBuffer.reserve(1024); - std::string queryHeaderBuffer; queryHeaderBuffer.reserve(1024); @@ -366,10 +363,6 @@ int convertalignments(int argc, const char **argv, const Command &command) { 
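// [Editor's note] A hedged usage sketch for the rewritten create_pipe() in apply.cpp above:
// fd[1] is the write end wired to the child's stdin, fd[0] the read end wired to the child's
// stdout, and the returned pid is reaped with waitpid(). The child program ("cat"), the helper
// name run_cat_once and the buffer size are illustrative only; the real apply_by_entry()
// multiplexes both ends with poll() so large entries cannot deadlock.
#include <cstdio>
#include <cstring>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

extern char **environ;
// declaration of the helper defined in apply.cpp (see the diff above)
pid_t create_pipe(const char *prog_path, char **prog_argv, char **local_environ, int fd[2]);

int run_cat_once(const char *input) {
    int fd[2];
    char *argv[] = { (char *)"cat", NULL };
    pid_t child = create_pipe("cat", argv, environ, fd);
    if (child == -1) {
        return -1;
    }
    (void)write(fd[1], input, strlen(input));         // goes to the child's stdin (small input)
    close(fd[1]);                                     // EOF lets the child terminate
    char buf[4096];
    ssize_t n;
    while ((n = read(fd[0], buf, sizeof(buf))) > 0) { // child's stdout comes back here
        fwrite(buf, 1, (size_t)n, stdout);
    }
    close(fd[0]);
    int status = 0;
    waitpid(child, &status, 0);                       // reap the spawned process
    return status;
}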
size_t qId = qDbr.sequenceReader->getId(queryKey); querySeqData = qDbr.sequenceReader->getData(qId, thread_idx); querySeqLen = qDbr.sequenceReader->getSeqLen(qId); - if(sameDB && qDbr.sequenceReader->isCompressed()){ - queryBuffer.assign(querySeqData, querySeqLen); - querySeqData = (char*) queryBuffer.c_str(); - } if (queryProfile) { size_t queryEntryLen = qDbr.sequenceReader->getEntryLen(qId); Sequence::extractProfileConsensus(querySeqData, queryEntryLen, *subMat, queryProfData); diff --git a/src/util/convertkb.cpp b/src/util/convertkb.cpp index 22a8dbadb..8221e28ac 100644 --- a/src/util/convertkb.cpp +++ b/src/util/convertkb.cpp @@ -4,10 +4,7 @@ #include "FileUtil.h" #include "Debug.h" #include "UniprotKB.h" - -#ifdef HAVE_ZLIB -#include "gzstream.h" -#endif +#include "GzReader.h" #include #include @@ -102,19 +99,8 @@ int convertkb(int argc, const char **argv, const Command &command) { Debug::Progress progress; for (std::vector::const_iterator it = par.filenames.begin(); it != par.filenames.end(); ++it) { - std::istream *kbIn; - if (Util::endsWith(".gz", *it)) { -#ifdef HAVE_ZLIB - kbIn = new igzstream((*it).c_str()); -#else - Debug(Debug::ERROR) << "MMseqs2 was not compiled with zlib support. Can not read compressed input\n"; - EXIT(EXIT_FAILURE); -#endif - } else { - kbIn = new std::ifstream(*it); - } - - if (kbIn->fail()) { + GzReader kbIn(*it); + if (kbIn.fail()) { Debug(Debug::ERROR) << "File " << (*it) << " not found\n"; EXIT(EXIT_FAILURE); } @@ -122,7 +108,7 @@ int convertkb(int argc, const char **argv, const Command &command) { Debug(Debug::INFO) << "Extracting data from " << (*it) << "\n"; std::string line; unsigned int i = 0; - while (std::getline(*kbIn, line)) { + while (kbIn.getline(line)) { if (line.length() < 2) { Debug(Debug::WARNING) << "Invalid entry\n"; continue; @@ -156,7 +142,6 @@ int convertkb(int argc, const char **argv, const Command &command) { i++; } } - delete kbIn; } for (std::vector::const_iterator it = enabledColumns.begin(); it != enabledColumns.end(); ++it) { diff --git a/src/util/convertmsa.cpp b/src/util/convertmsa.cpp index 765664c78..9e8723ae3 100644 --- a/src/util/convertmsa.cpp +++ b/src/util/convertmsa.cpp @@ -2,33 +2,18 @@ #include "DBWriter.h" #include "Debug.h" #include "Util.h" +#include "GzReader.h" -#include #include #include -#ifdef HAVE_ZLIB -#include "gzstream.h" -#endif int convertmsa(int argc, const char **argv, const Command &command) { Parameters &par = Parameters::getInstance(); par.parseParameters(argc, argv, command, true, 0, 0); - std::istream *in; - if (Util::endsWith(".gz", par.db1)) { -#ifdef HAVE_ZLIB - in = new igzstream(par.db1.c_str()); -#else - Debug(Debug::ERROR) << "MMseqs2 was not compiled with zlib support. 
Can not read compressed input!\n"; - return EXIT_FAILURE; -#endif - } else { - in = new std::ifstream(par.db1); - } - - - if (in->fail()) { + GzReader in(par.db1); + if (in.fail()) { Debug(Debug::ERROR) << "File " << par.db1 << " not found!\n"; return EXIT_FAILURE; } @@ -47,7 +32,7 @@ int convertmsa(int argc, const char **argv, const Command &command) { result.reserve(10 * 1024 * 1024); Debug::Progress progress; - while (std::getline(*in, line)) { + while (in.getline(line)) { size_t lineLength = line.length(); if (lineLength < 1) { continue; @@ -135,6 +120,5 @@ int convertmsa(int argc, const char **argv, const Command &command) { } writer.close(); - delete in; return EXIT_SUCCESS; } diff --git a/src/util/createclusterdb.cpp b/src/util/createclusterdb.cpp index 3a8bdb60f..72dd5bf20 100644 --- a/src/util/createclusterdb.cpp +++ b/src/util/createclusterdb.cpp @@ -43,7 +43,7 @@ int createclusearchdb(int argc, const char **argv, const Command& command) { #ifdef OPENMP thread_idx = static_cast(omp_get_thread_num()); #endif - #pragma omp for schedule(dynamic, 1) + #pragma omp for schedule(static) for (size_t id = 0; id < clusterReader.getSize(); id++) { progress.updateProgress(); char *data = clusterReader.getData(id, thread_idx); diff --git a/src/util/expandaln.cpp b/src/util/expandaln.cpp index cbb767ccd..3c0c75968 100644 --- a/src/util/expandaln.cpp +++ b/src/util/expandaln.cpp @@ -12,10 +12,10 @@ #include "MultipleAlignment.h" #include "MsaFilter.h" #include "PSSMCalculator.h" -#include "PSSMMasker.h" #include "FastSort.h" #include "IntervalArray.h" #include "IndexReader.h" +#include "Masker.h" #include #include @@ -181,7 +181,7 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl MultipleAlignment *aligner = NULL; MsaFilter *filter = NULL; PSSMCalculator *calculator = NULL; - PSSMMasker *masker = NULL; + Masker *masker = NULL; std::vector> seqSet; std::vector> subSeqSet; std::string result; @@ -203,7 +203,7 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl , par.gapPseudoCount #endif ); - masker = new PSSMMasker(par.maxSeqLen, *probMatrix, subMat); + masker = new Masker(subMat); result.reserve(par.maxSeqLen * Sequence::PROFILE_READIN_SIZE); seqSet.reserve(300); } @@ -276,7 +276,8 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl EXIT(EXIT_FAILURE); } if (k == 0) { - unsigned int bSeqKey = resultAb.dbKey; + // unsigned int bSeqKey = resultAb.dbKey; + unsigned int bSeqKey = resultBc.dbKey; size_t bSeqId = cReader->getId(bSeqKey); bSeq->mapSequence(bSeqId, bSeqKey, cReader->getData(bSeqId, thread_idx), cReader->getSeqLen(bSeqId)); } else { @@ -395,7 +396,7 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl : res.setSize; PSSMCalculator::Profile pssmRes = calculator->computePSSMFromMSA(filteredSetSize, aSeq.L, (const char **) res.msaSequence, par.wg, 0.0); if (par.maskProfile == true) { - masker->mask(aSeq, par.maskProb, pssmRes); + masker->maskPssm(aSeq, par.maskProb, pssmRes); } pssmRes.toBuffer(aSeq, subMat, result); writer.writeData(result.c_str(), result.length(), queryKey, thread_idx); diff --git a/src/util/extractframes.cpp b/src/util/extractframes.cpp index 348d40bf7..43120f06f 100644 --- a/src/util/extractframes.cpp +++ b/src/util/extractframes.cpp @@ -4,8 +4,7 @@ #include "DBWriter.h" #include "Matcher.h" #include "Util.h" -#include "itoa.h" - +#include "TranslateNucl.h" #include "Orf.h" #include @@ -16,6 +15,41 @@ #include #endif +void 
handleSingleFrame(TranslateNucl& translateNucl, DBWriter& sequenceWriter, DBWriter& headerWriter, unsigned int key, char* headerBuffer, const char* data, size_t seqLen, int frame, bool reverse, bool translate, char*& aaBuffer, size_t& aaBufferSize, int thread_idx) { + data = data + frame; + seqLen = seqLen - frame; + if (translate == true) { + if (seqLen < 3) { + return; + } + size_t codonLength = (seqLen / 3) * 3; + if ((codonLength + 1) > aaBufferSize) { + aaBufferSize = codonLength * 1.5 + 1; + aaBuffer = (char*)realloc(aaBuffer, aaBufferSize * sizeof(char)); + } + translateNucl.translate(aaBuffer, data, codonLength); + aaBuffer[codonLength / 3] = '\n'; + sequenceWriter.writeData(aaBuffer, (codonLength / 3) + 1, key, thread_idx); + size_t bufferLen; + if (reverse) { + bufferLen = Orf::writeOrfHeader(headerBuffer, key, frame + codonLength, static_cast(frame), 0, 0); + } else { + bufferLen = Orf::writeOrfHeader(headerBuffer, key, static_cast(frame), frame + codonLength, 0, 0); + } + headerWriter.writeData(headerBuffer, bufferLen, key, thread_idx); + } else { + // +1: add newline, but remove it from the end pos + sequenceWriter.writeData(data, seqLen + 1, key, thread_idx); + size_t bufferLen; + if (reverse) { + bufferLen = Orf::writeOrfHeader(headerBuffer, key, seqLen - 1, static_cast(frame), 0, 0); + } else { + bufferLen = Orf::writeOrfHeader(headerBuffer, key, static_cast(frame), seqLen - 1, 0, 0); + } + headerWriter.writeData(headerBuffer, bufferLen, key, thread_idx); + } +} + int extractframes(int argc, const char **argv, const Command& command) { Parameters& par = Parameters::getInstance(); par.parseParameters(argc, argv, command, true, 0, 0); @@ -23,7 +57,11 @@ int extractframes(int argc, const char **argv, const Command& command) { DBReader reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); reader.open(DBReader::NOSORT); - DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, reader.getDbtype()); + int outputDbtype = reader.getDbtype(); + if (par.translate) { + outputDbtype = Parameters::DBTYPE_AMINO_ACIDS; + } + DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, outputDbtype); sequenceWriter.open(); DBWriter headerWriter(par.hdr2.c_str(), par.hdr2Index.c_str(), par.threads, false, Parameters::DBTYPE_GENERIC_DB); @@ -31,8 +69,9 @@ int extractframes(int argc, const char **argv, const Command& command) { unsigned int forwardFrames = Orf::getFrames(par.forwardFrames); unsigned int reverseFrames = Orf::getFrames(par.reverseFrames); - Debug::Progress progress(reader.getSize()); + Debug::Progress progress(reader.getSize()); + TranslateNucl translateNucl(static_cast(par.translationTable)); #pragma omp parallel { int thread_idx = 0; @@ -46,70 +85,65 @@ int extractframes(int argc, const char **argv, const Command& command) { queryFrom = 0; } + size_t aaBufferSize = par.maxSeqLen + 3 + 1; + char* aa = NULL; + if (par.translate == true) { + aa = (char*)malloc(aaBufferSize * sizeof(char)); + } + char buffer[1024]; + std::string reverseComplementStr; reverseComplementStr.reserve(32000); + for (unsigned int i = queryFrom; i < (queryFrom + querySize); ++i){ progress.updateProgress(); unsigned int key = reader.getDbKey(i); const char* data = reader.getData(i, thread_idx); - size_t dataLength = reader.getEntryLen(i); - - size_t bufferLen; - switch (forwardFrames){ - case Orf::FRAME_1: - // -1 to ignore the null byte copy the new line - sequenceWriter.writeData(data, 
dataLength - 1, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(0), dataLength - 3, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; - case Orf::FRAME_2: - sequenceWriter.writeData(data + 1, dataLength - 2, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(1), dataLength - 4, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; - case Orf::FRAME_3: - sequenceWriter.writeData(data + 2, dataLength - 3, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, static_cast(2), dataLength - 5, 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; + size_t seqLen = reader.getSeqLen(i); + + if (forwardFrames & Orf::FRAME_1) { + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 0, false, par.translate, aa, aaBufferSize, thread_idx); + } + if (forwardFrames & Orf::FRAME_2) { + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 1, false, par.translate, aa, aaBufferSize, thread_idx); + } + if (forwardFrames & Orf::FRAME_3) { + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 2, false, par.translate, aa, aaBufferSize, thread_idx); } - if(reverseFrames != 0){ - size_t sequenceLength = dataLength -2; + if (reverseFrames != 0) { // bool hasWrongChar = false; - for(size_t pos = 0; pos < sequenceLength; ++pos) { - char reverseComplement = Orf::complement(data[sequenceLength - pos - 1]); + for (size_t pos = 0; pos < seqLen; ++pos) { + char reverseComplement = Orf::complement(data[seqLen - pos - 1]); reverseComplement = (reverseComplement == '.') ? 'N' : reverseComplement; reverseComplementStr.push_back(reverseComplement); // hasWrongChar |= (reverseComplement == '.'); } -// if(hasWrongChar == true){ -// continue; -// } + // if (hasWrongChar == true) { + // continue; + // } reverseComplementStr.push_back('\n'); + data = reverseComplementStr.c_str(); } - switch (reverseFrames){ - case Orf::FRAME_1: - sequenceWriter.writeData(reverseComplementStr.c_str(), reverseComplementStr.size(), key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 2, static_cast(0), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; - case Orf::FRAME_2: - sequenceWriter.writeData(reverseComplementStr.c_str()+1, reverseComplementStr.size()-1, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 3, static_cast(1), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; - case Orf::FRAME_3: - sequenceWriter.writeData(reverseComplementStr.c_str()+2, reverseComplementStr.size()-2, key, thread_idx); - bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 4, static_cast(2), 0, 0); - headerWriter.writeData(buffer, bufferLen, key, thread_idx); - break; + if (reverseFrames & Orf::FRAME_1) { + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 0, true, par.translate, aa, aaBufferSize, thread_idx); + } + + if (reverseFrames & Orf::FRAME_2) { + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 1, true, par.translate, aa, aaBufferSize, thread_idx); + } + + if (reverseFrames & Orf::FRAME_3) { + handleSingleFrame(translateNucl, sequenceWriter, headerWriter, key, buffer, data, seqLen, 2, true, par.translate, aa, aaBufferSize, thread_idx); } reverseComplementStr.clear(); 
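// [Editor's note] Length bookkeeping used by handleSingleFrame() in extractframes.cpp above:
// a frame shift of 0/1/2 drops that many leading bases and translation keeps only complete
// codons, so the emitted protein has (seqLen - frame) / 3 residues. A minimal sketch with
// illustrative values; the helper name is not part of this patch.
#include <cassert>
#include <cstddef>

static size_t translatedLength(size_t seqLen, size_t frame) {
    if (seqLen < frame + 3) {
        return 0;                              // mirrors the "fewer than 3 bases" early return
    }
    size_t effective = seqLen - frame;         // bases left after the frame shift
    size_t codonLength = (effective / 3) * 3;  // trim the incomplete trailing codon
    return codonLength / 3;                    // one residue per codon
}

int main() {
    assert(translatedLength(10, 0) == 3); // 10 bases, frame 0 -> 3 codons, 1 base dropped
    assert(translatedLength(10, 1) == 3); // 9 usable bases  -> 3 codons
    assert(translatedLength(10, 2) == 2); // 8 usable bases  -> 2 codons
    return 0;
}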
} + if (aa != NULL) { + free(aa); + } } headerWriter.close(true); sequenceWriter.close(true); diff --git a/src/util/filterdb.cpp b/src/util/filterdb.cpp index 3ab883d6f..b645d2dba 100644 --- a/src/util/filterdb.cpp +++ b/src/util/filterdb.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -51,6 +52,8 @@ ComparisonOperator mapOperator(const std::string& op) { #define INCREASING 1 #define DECREASING 2 #define SHUFFLE 3 +#define PRIORITY 4 + struct compareString { bool operator() (const std::string& lhs, const std::string& rhs) const{ @@ -84,6 +87,8 @@ struct compareFirstEntryDecreasing { int filterdb(int argc, const char **argv, const Command &command) { Parameters &par = Parameters::getInstance(); + par.PARAM_WEIGHT_FILE.replaceCategory(MMseqsParameter::COMMAND_MISC); + par.parseParameters(argc, argv, command, true, 0, 0); const size_t column = static_cast(par.filterColumn); @@ -108,7 +113,7 @@ int filterdb(int argc, const char **argv, const Command &command) { // JOIN_DB DBReader* helper = NULL; - + std::unordered_map weights; // REGEX_FILTERING regex_t regex; std::random_device rng; @@ -117,6 +122,32 @@ int filterdb(int argc, const char **argv, const Command &command) { if (par.sortEntries != 0) { mode = SORT_ENTRIES; Debug(Debug::INFO) << "Filtering by sorting entries\n"; + if (par.sortEntries == PRIORITY) { + if (par.weightFile.empty()) { + Debug(Debug::ERROR) << "Weights file (--weights) must be specified for priority sorting.\n"; + EXIT(EXIT_FAILURE); + } + Debug(Debug::INFO) << "Sorting entries by priority\n"; + // Read the weights + std::ifstream weightsFile(par.weightFile); + if (!weightsFile) { + Debug(Debug::ERROR) << "Cannot open weights file " << par.weightFile << "\n"; + EXIT(EXIT_FAILURE); + } + + std::string line; + while (std::getline(weightsFile, line)) { + std::istringstream iss(line); + unsigned int key; + float weight; + if (!(iss >> key >> weight)) { + Debug(Debug::WARNING) << "Invalid line in weights file: " << line << "\n"; + continue; + } + weights[key] = weight; + } + weightsFile.close(); + } } else if (par.filteringFile.empty() == false) { mode = FILE_FILTERING; Debug(Debug::INFO) << "Filtering using file(s)\n"; @@ -453,8 +484,19 @@ int filterdb(int argc, const char **argv, const Command &command) { memcpy(lineBuffer, newLineBuffer, newLineBufferIndex + 1); } } else if (mode == SORT_ENTRIES) { - toSort.emplace_back(std::strtod(columnValue, NULL), lineBuffer); - // do not put anything in the output buffer + if (par.sortEntries == PRIORITY) { + unsigned int key = static_cast(strtoul(columnPointer[column - 1], NULL, 10)); + float weight = 0.0f; + auto it = weights.find(key); + if (it != weights.end()) { + weight = it->second; + } + toSort.emplace_back(weight, std::string(lineBuffer)); + } else { + // Existing code + toSort.emplace_back(std::strtod(columnValue, NULL), lineBuffer); + } + // Do not put anything in the output buffer nomatch = 1; } else { // Unknown filtering mode, keep all entries @@ -482,7 +524,7 @@ int filterdb(int argc, const char **argv, const Command &command) { if (mode == SORT_ENTRIES) { if (par.sortEntries == INCREASING) { std::stable_sort(toSort.begin(), toSort.end(), compareFirstEntry()); - } else if (par.sortEntries == DECREASING) { + } else if (par.sortEntries == DECREASING || par.sortEntries == PRIORITY) { std::stable_sort(toSort.begin(), toSort.end(), compareFirstEntryDecreasing()); } else if (par.sortEntries == SHUFFLE) { std::shuffle(toSort.begin(), toSort.end(), urng); diff --git a/src/util/gpuserver.cpp 
b/src/util/gpuserver.cpp new file mode 100644 index 000000000..cb475fd81 --- /dev/null +++ b/src/util/gpuserver.cpp @@ -0,0 +1,101 @@ +#include "Parameters.h" +#include "Util.h" +#include "PrefilteringIndexReader.h" +#include "MemoryMapped.h" +#include "IndexReader.h" +#include "SubstitutionMatrix.h" +#include "NucleotideMatrix.h" + +#ifdef HAVE_CUDA +#include "GpuUtil.h" +#include "marv.h" +#endif + +#include +#include +#include +#include + +volatile sig_atomic_t keepRunning = 1; +void intHandler(int) { + keepRunning = 0; +} + +int gpuserver(int argc, const char **argv, const Command& command) { + Parameters& par = Parameters::getInstance(); + par.parseParameters(argc, argv, command, true, 0, 0); +#ifdef HAVE_CUDA + bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); + IndexReader dbrIdx(par.db1, par.threads, IndexReader::SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0 ); + DBReader* dbr = dbrIdx.sequenceReader; + + const bool isGpuDb = DBReader::getExtendedDbtype(dbr->getDbtype()) & Parameters::DBTYPE_EXTENDED_GPU; + if (isGpuDb == false) { + Debug(Debug::ERROR) << "Database " << FileUtil::baseName(par.db1) << " is not a valid GPU database\n" + << "Please call: makepaddedseqdb " << FileUtil::baseName(par.db2) << " " << FileUtil::baseName(par.db2) << "_pad\n"; + EXIT(EXIT_FAILURE); + } + + std::vector offsets; + offsets.reserve(dbr->getSize() + 1); + + std::vector lengths; + lengths.reserve(dbr->getSize()); + for(size_t id = 0; id < dbr->getSize(); id++){ + offsets.emplace_back(dbr->getIndex()[id].offset); + lengths.emplace_back(dbr->getIndex()[id].length - 2); + } + offsets.emplace_back(offsets.back() + lengths.back()); + int32_t maxTargetLength = lengths.back(); + + BaseMatrix *subMat; + if (Parameters::isEqualDbtype(dbrIdx.sequenceReader->getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) { + subMat = new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0); + } else { + subMat = new SubstitutionMatrix(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, 0.0); + } + + Marv::AlignmentType type = (par.prefMode == Parameters::PREF_MODE_UNGAPPED_AND_GAPPED) ? 
+ Marv::AlignmentType::GAPLESS_SMITH_WATERMAN : Marv::AlignmentType::GAPLESS; + Marv marv(dbr->getSize(), subMat->alphabetSize, maxTargetLength, par.maxResListLen, type); + void* h1 = marv.loadDb( + dbr->getDataForFile(0), offsets.data(), lengths.data(), dbr->getDataSizeForFile(0) + ); + marv.setDb(h1); + marv.prefetch(); + + struct sigaction act; + memset(&act, 0, sizeof(act)); + act.sa_handler = intHandler; + + // Set up the handler for SIGINT and SIGTERM + sigaction(SIGINT, &act, NULL); + sigaction(SIGTERM, &act, NULL); + + std::string shmFile = GPUSharedMemory::getShmHash(par.db1); + GPUSharedMemory* layout = GPUSharedMemory::alloc(shmFile, par.maxSeqLen, par.maxResListLen); + Debug(Debug::WARNING) << shmFile << "\n"; + while (keepRunning) { + while (layout->serverReady.load(std::memory_order_acquire) == 0 || layout->clientReady.load(std::memory_order_acquire) == 0) { + std::this_thread::yield(); + if (keepRunning == false) { + break; + } + } + if (keepRunning == false) { + break; + } + Marv::Stats stats = marv.scan(reinterpret_cast(layout->getQueryPtr()), layout->queryLen, layout->getProfilePtr(), layout->getResultsPtr()); + layout->resultLen = stats.results; + layout->serverReady.store(UINT_MAX, std::memory_order_release); + while (layout->clientReady.load(std::memory_order_acquire) != 0) { + std::this_thread::yield(); // Wait for client to finish + } + layout->resetServerAndClientReady(); + } + + GPUSharedMemory::dealloc(layout, shmFile); +#endif + return EXIT_SUCCESS; + +} diff --git a/src/util/indexdb.cpp b/src/util/indexdb.cpp index c9c6470a7..5236e6fd9 100644 --- a/src/util/indexdb.cpp +++ b/src/util/indexdb.cpp @@ -177,7 +177,7 @@ int indexdb(int argc, const char **argv, const Command &command) { PrefilteringIndexReader::createIndexFile(indexDB, &dbr, dbr2, hdbr1, hdbr2, alndbr, seedSubMat, par.maxSeqLen, par.spacedKmer, par.spacedKmerPattern, par.compBiasCorrection, seedSubMat->alphabetSize, par.kmerSize, par.maskMode, par.maskLowerCaseMode, - par.maskProb, kmerScore, par.targetSearchMode, par.split, par.indexSubset); + par.maskProb, par.maskNrepeats,kmerScore, par.targetSearchMode, par.split, par.indexSubset); if (alndbr != NULL) { alndbr->close(); diff --git a/src/util/makepaddedseqdb.cpp b/src/util/makepaddedseqdb.cpp index 6239fe226..2fd4641d1 100644 --- a/src/util/makepaddedseqdb.cpp +++ b/src/util/makepaddedseqdb.cpp @@ -3,52 +3,141 @@ #include "DBWriter.h" #include "Debug.h" #include "Util.h" +#include "SubstitutionMatrix.h" +#include "tantan.h" +#include "Masker.h" -int makepaddedseqdb(int argc, const char **argv, const Command &command) { - const char AA_TO_20[256] = { - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 0, 20, 4, 3, 6, 13, 7, 8, 9, 20, 11, 10, 12, 2, 20, - 14, 5, 1, 15, 16, 20, 19, 17, 20, 18, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 
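// [Editor's note] gpuserver.cpp above spins on two shared-memory flags: it waits until both
// serverReady and clientReady are non-zero, runs Marv::scan, publishes resultLen, sets
// serverReady to UINT_MAX, and then waits for clientReady to drop back to 0 before resetting.
// The fragment below sketches one plausible client-side sequence consistent with that loop,
// assuming the GPUSharedMemory accessors from GpuUtil.h in this patch; which side initially
// raises serverReady is not shown here, so treat this strictly as an illustration.
#include "GpuUtil.h"
#include <atomic>
#include <climits>
#include <cstring>
#include <thread>

void submitQuerySketch(GPUSharedMemory *layout, const char *encodedQuery, unsigned int queryLen) {
    std::memcpy(layout->getQueryPtr(), encodedQuery, queryLen);  // hand the encoded query over
    layout->queryLen = queryLen;
    layout->clientReady.store(1, std::memory_order_release);     // request work
    layout->serverReady.store(1, std::memory_order_release);     // unblock the server loop above
    while (layout->serverReady.load(std::memory_order_acquire) != UINT_MAX) {
        std::this_thread::yield();                               // server still scanning
    }
    // ... consume layout->resultLen entries from layout->getResultsPtr() here ...
    layout->clientReady.store(0, std::memory_order_release);     // lets the server reset both flags
}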
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 - }; +#ifdef OPENMP +#include +#endif +int makepaddedseqdb(int argc, const char **argv, const Command &command) { Parameters &par = Parameters::getInstance(); par.parseParameters(argc, argv, command, true, 0, 0); - DBReader dbr(par.db1.c_str(), par.db1Index.c_str(), 1, - DBReader::USE_INDEX | DBReader::USE_DATA); + + const int mode = DBReader::USE_INDEX | DBReader::USE_DATA; + DBReader dbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, mode); dbr.open(DBReader::SORT_BY_LENGTH); - DBWriter writer(par.db2.c_str(), par.db2Index.c_str(), 1, false, dbr.getDbtype()); - writer.open(); + + DBReader dbhr(par.hdr1.c_str(), par.hdr1Index.c_str(), par.threads, mode); + dbhr.open(DBReader::NOSORT); + + SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias); + + int dbType = DBReader::setExtendedDbtype(dbr.getDbtype(), Parameters::DBTYPE_EXTENDED_GPU); + DBWriter dbsw(par.db2.c_str(), par.db2Index.c_str(), par.threads, false, dbType); + dbsw.open(); + DBWriter dbhw(par.hdr2.c_str(), par.hdr2Index.c_str(), par.threads, false, Parameters::DBTYPE_GENERIC_DB); + dbhw.open(); + + // need to prune low scoring k-mers through masking + + Debug::Progress progress(dbr.getSize()); +#pragma omp parallel +{ + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = static_cast(omp_get_thread_num()); +#endif + Masker masker(subMat); std::string result; + result.reserve(par.maxSeqLen); + const int ALIGN = 4; - for (long id = dbr.getSize() - 1; id >= 0; id--) { + Sequence seq(dbr.getMaxSeqLen(), dbr.getDbtype(), &subMat, 0, false, false); + + size_t firstIt = SIZE_MAX; + unsigned int seqKey = 0; + + size_t charSeqBufferSize = par.maxSeqLen + 1; + unsigned char *charSequence = NULL; + if (par.maskMode) { + charSequence = (unsigned char*)malloc(charSeqBufferSize * sizeof(char)); + } + +#pragma omp for schedule(static) + for (size_t i = 0; i < dbr.getSize(); i++) { + progress.updateProgress(); + + if (firstIt == SIZE_MAX) { + firstIt = i; + } + + size_t id = dbr.getSize() - 1 - i; unsigned int key = dbr.getDbKey(id); - char *data = dbr.getData(id, 0); + char *data = dbr.getData(id, thread_idx); size_t seqLen = dbr.getSeqLen(id); - const size_t sequencepadding = (seqLen % ALIGN == 0) ? 0 : ALIGN - seqLen % ALIGN; - for (size_t i = 0; i < seqLen; i++) { - result.append(1, AA_TO_20[(unsigned char)data[i]]); + seq.mapSequence(id, key, data, seqLen); + + if (charSequence != NULL) { + if ((size_t)seq.L >= charSeqBufferSize) { + charSeqBufferSize = seq.L * 1.5; + charSequence = (unsigned char*)realloc(charSequence, charSeqBufferSize * sizeof(char)); + } + memcpy(charSequence, seq.numSequence, seq.L); + masker.maskSequence(seq, par.maskMode, par.maskProb, par.maskLowerCaseMode, par.maskNrepeats); + for (int i = 0; i < seq.L; i++) { + result.append(1, (seq.numSequence[i] == masker.maskLetterNum) ? charSequence[i] + 32 : charSequence[i]); + } + } else { + for (int i = 0; i < seq.L; i++) { + char aa = data[i]; + result.append(1, (islower(aa)) ? seq.numSequence[i] + 32 : seq.numSequence[i]); + } } + const size_t sequencepadding = (seq.L % ALIGN == 0) ? 
0 : ALIGN - seq.L % ALIGN;
         result.append(sequencepadding, static_cast<char>(20));
-        writer.writeData(result.c_str(), result.size(), key, 0, false, false);
-        writer.writeIndexEntry(key, writer.getStart(0), seqLen, 0);
+        dbsw.writeData(result.c_str(), result.size(), key, thread_idx, false, false);
+
+        // + 2 is needed for newline and null character
+        size_t start = dbsw.getStart(thread_idx);
+        if (start % 4 != 0) {
+            Debug(Debug::ERROR) << "Misaligned entry\n";
+            EXIT(EXIT_FAILURE);
+        }
+        dbsw.writeIndexEntry(firstIt + seqKey, start, seq.L + 2, thread_idx);
+
+        unsigned int headerId = dbhr.getId(key);
+        dbhw.writeData(dbhr.getData(headerId, thread_idx), dbhr.getEntryLen(headerId), firstIt + seqKey, thread_idx, false);
+
+        seqKey++;
         result.clear();
     }
-    writer.close(true, false);
-    DBReader<unsigned int>::softlinkDb(par.db1, par.db2, DBFiles::SEQUENCE_ANCILLARY);
-
+    if (charSequence != NULL) {
+        free(charSequence);
+    }
+}
+    dbsw.close(true, false);
+    dbhw.close(true, false);
+    if (par.writeLookup == true) {
+        DBReader<unsigned int> readerHeader(par.hdr2.c_str(), par.hdr2Index.c_str(), 1, DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
+        readerHeader.open(DBReader<unsigned int>::NOSORT);
+        // create lookup file
+        std::string lookupFile = par.db2 + ".lookup";
+        FILE* file = FileUtil::openAndDelete(lookupFile.c_str(), "w");
+        std::string buffer;
+        buffer.reserve(2048);
+        DBReader<unsigned int>::LookupEntry entry;
+        size_t totalSize = dbr.getSize();
+        for (unsigned int id = 0; id < readerHeader.getSize(); id++) {
+            char *header = readerHeader.getData(id, 0);
+            entry.id = id;
+            entry.entryName = Util::parseFastaHeader(header);
+            entry.fileNumber = dbr.getDbKey(totalSize - 1 - id);
+            readerHeader.lookupEntryToBuffer(buffer, entry);
+            int written = fwrite(buffer.c_str(), sizeof(char), buffer.size(), file);
+            if (written != (int)buffer.size()) {
+                Debug(Debug::ERROR) << "Cannot write to lookup file " << lookupFile << "\n";
+                EXIT(EXIT_FAILURE);
+            }
+            buffer.clear();
+        }
+        if (fclose(file) != 0) {
+            Debug(Debug::ERROR) << "Cannot close file " << lookupFile << "\n";
+            EXIT(EXIT_FAILURE);
+        }
+        readerHeader.close();
+    }
     dbr.close();
     return EXIT_SUCCESS;
 }
\ No newline at end of file
diff --git a/src/util/masksequence.cpp b/src/util/masksequence.cpp
index 82a468c76..e41f215ad 100644
--- a/src/util/masksequence.cpp
+++ b/src/util/masksequence.cpp
@@ -6,6 +6,7 @@
 #include "Debug.h"
 #include "Util.h"
 #include "FileUtil.h"
+#include "Masker.h"
 
 #ifdef OPENMP
 #include <omp.h>
@@ -26,13 +27,8 @@ int masksequence(int argc, const char **argv, const Command& command) {
         // keep score bias at 0.0 (improved ROC)
         subMat = new SubstitutionMatrix(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, 0.0);
     }
-    size_t maxSeqLen = 0;
-    for (size_t i = 0; i < reader.getSize(); i++) {
-        maxSeqLen = std::max(reader.getSeqLen(i), maxSeqLen);
-    }
 
     // need to prune low scoring k-mers through masking
-    ProbabilityMatrix probMatrix(*subMat);
 
     DBWriter writer(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, reader.getDbtype());
     writer.open();
@@ -43,32 +39,17 @@ int masksequence(int argc, const char **argv, const Command& command) {
 #ifdef OPENMP
         thread_idx = (unsigned int) omp_get_thread_num();
 #endif
-        char *charSequence = new char[maxSeqLen + 1];
-
+        Masker masker(*subMat);
+        unsigned char *charSequence = new unsigned char[reader.getMaxSeqLen() + 1];
+        Sequence seq(reader.getMaxSeqLen(), reader.getDbtype(), subMat, 0, false, false);
 #pragma omp for schedule(dynamic, 1)
         for (size_t id = 0; id < reader.getSize(); ++id) {
-            char *seqData = reader.getData(id, thread_idx);
-            unsigned int
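// [Editor's note] The padding logic in makepaddedseqdb.cpp above rounds every sequence record
// up to a multiple of ALIGN (4) and fills the gap with the out-of-alphabet letter (numeric
// value 20, the unknown/'X' residue in the default alphabet), so the GPU kernels can read
// fixed-width blocks. A minimal sketch of that arithmetic with illustrative values; the
// helper name is not part of this patch.
#include <cassert>
#include <cstddef>

static size_t paddedLength(size_t L, size_t ALIGN = 4) {
    size_t pad = (L % ALIGN == 0) ? 0 : ALIGN - L % ALIGN; // same formula as in the diff
    return L + pad;                                        // == ((L + ALIGN - 1) / ALIGN) * ALIGN
}

int main() {
    assert(paddedLength(8) == 8);   // already aligned, no padding residues
    assert(paddedLength(9) == 12);  // three padding residues appended
    assert(paddedLength(11) == 12); // one padding residue appended
    return 0;
}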
seqLen = 0; - while (seqData[seqLen] != '\0') { - charSequence[seqLen] = (char) subMat->aa2num[static_cast(seqData[seqLen])]; - seqLen++; - } - tantan::maskSequences(charSequence, - charSequence + seqLen, - 50 /*options.maxCycleLength*/, - probMatrix.probMatrixPointers, - 0.005 /*options.repeatProb*/, - 0.05 /*options.repeatEndProb*/, - 0.9 /*options.repeatOffsetProbDecay*/, - 0, 0, - par.maskProb /*options.minMaskProb*/, - probMatrix.hardMaskTable); - - for (unsigned int pos = 0; pos < seqLen; pos++) { - char aa = seqData[pos]; - charSequence[pos] = (charSequence[pos] == probMatrix.hardMaskTable[0]) ? tolower(aa) : toupper(aa); - } - writer.writeData(charSequence, seqLen, reader.getDbKey(id), thread_idx); + seq.mapSequence(id, reader.getDbKey(id), reader.getData(id, thread_idx), reader.getSeqLen(id)); + masker.maskSequence(seq, par.maskMode, par.maskProb, par.maskLowerCaseMode, par.maskNrepeats); + memcpy(charSequence, seq.getSeqData(), seq.L * sizeof(char)); + masker.applySoftmasking(charSequence, seq.numSequence, seq.L); + charSequence[seq.L] = '\n'; + writer.writeData((const char *)charSequence, seq.L + 1, seq.getDbKey(), thread_idx); } delete[] charSequence; } diff --git a/src/util/nrtotaxmapping.cpp b/src/util/nrtotaxmapping.cpp index 0dee3d3fb..4033da702 100644 --- a/src/util/nrtotaxmapping.cpp +++ b/src/util/nrtotaxmapping.cpp @@ -6,11 +6,7 @@ #include "NcbiTaxonomy.h" #include "FastSort.h" #include "MemoryMapped.h" - -#ifdef HAVE_ZLIB -#include "gzstream.h" -#endif -#include +#include "GzReader.h" #ifdef OPENMP #include @@ -65,26 +61,15 @@ int nrtotaxmapping(int argc, const char **argv, const Command& command) { std::vector> accessionMapping; for (size_t i = 0; i < par.filenames.size(); i++) { - std::istream *kbIn; - if (Util::endsWith(".gz", par.filenames[i])) { -#ifdef HAVE_ZLIB - kbIn = new igzstream(par.filenames[i].c_str()); -#else - Debug(Debug::ERROR) << "MMseqs2 was not compiled with zlib support. 
Cannot read compressed input\n"; - EXIT(EXIT_FAILURE); -#endif - } else { - kbIn = new std::ifstream(par.filenames[i]); - } - - if (kbIn->fail()) { + GzReader kbIn(par.filenames[i]); + if (kbIn.fail()) { Debug(Debug::ERROR) << "File " << par.filenames[i] << " not found\n"; EXIT(EXIT_FAILURE); } std::string line; const char *entry[255]; - while (std::getline(*kbIn, line)) { + while (kbIn.getline(line)) { progress.updateProgress(); const size_t columns = Util::getWordsOfLine(line.c_str(), entry, 255); if (columns < 4) { diff --git a/src/util/result2profile.cpp b/src/util/result2profile.cpp index 34759b171..bb3d24bae 100644 --- a/src/util/result2profile.cpp +++ b/src/util/result2profile.cpp @@ -1,7 +1,6 @@ #include "MsaFilter.h" #include "Parameters.h" #include "PSSMCalculator.h" -#include "PSSMMasker.h" #include "DBReader.h" #include "DBWriter.h" #include "Debug.h" @@ -9,6 +8,7 @@ #include "FileUtil.h" #include "tantan.h" #include "IndexReader.h" +#include "Masker.h" #ifdef OPENMP #include @@ -104,7 +104,11 @@ int result2profile(int argc, const char **argv, const Command &command, bool ret } int type = Parameters::DBTYPE_HMM_PROFILE; - if (returnAlnRes) { + const int writePlain = par.profileOutputMode == 1; + if (par.profileOutputMode == 1) { + type = Parameters::DBTYPE_OMIT_FILE; + par.compressed = false; + } else if (returnAlnRes) { type = Parameters::DBTYPE_ALIGNMENT_RES; if (needSrcIndex) { type = DBReader::setExtendedDbtype(type, Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC); @@ -120,7 +124,6 @@ int result2profile(int argc, const char **argv, const Command &command, bool ret // adjust score of each match state by -0.2 to trim alignment SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0f, -0.2f); - ProbabilityMatrix probMatrix(subMat); EvalueComputation evalueComputation(tDbr->getAminoAcidDBSize(), &subMat, par.gapOpen.values.aminoacid(), par.gapExtend.values.aminoacid()); if (qDbr->getDbtype() == -1 || targetSeqType == -1) { @@ -147,7 +150,7 @@ int result2profile(int argc, const char **argv, const Command &command, bool ret Matcher matcher(qDbr->getDbtype(), tDbr->getDbtype(), maxSequenceLength, &subMat, &evalueComputation, par.compBiasCorrection, par.compBiasCorrectionScale, par.gapOpen.values.aminoacid(), par.gapExtend.values.aminoacid(), 0.0, par.zdrop); - PSSMMasker masker(maxSequenceLength, probMatrix, subMat); + Masker masker(subMat); MultipleAlignment aligner(maxSequenceLength, &subMat); PSSMCalculator calculator( &subMat, maxSequenceLength, maxSetSize, par.pcmode, par.pca, par.pcb @@ -261,18 +264,25 @@ int result2profile(int argc, const char **argv, const Command &command, bool ret alnResults, #endif par.wg, 0.0); - if (par.compBiasCorrection == true){ - SubstitutionMatrix::calcGlobalAaBiasCorrection(&subMat, pssmRes.pssm, pNullBuffer, - Sequence::PROFILE_AA_SIZE, - res.centerLength); - } - - if (par.maskProfile == true) { - masker.mask(centerSequence, par.maskProb, pssmRes); - } - pssmRes.toBuffer(centerSequence, subMat, result); + if (writePlain) { + result.clear(); + result.append("Query profile of sequence "); + result.append(SSTR(queryKey)); + result.push_back('\n'); + calculator.profileToString(result, res.centerLength); + } else { + if (par.compBiasCorrection == true){ + SubstitutionMatrix::calcGlobalAaBiasCorrection(&subMat, pssmRes.pssm, pNullBuffer, + Sequence::PROFILE_AA_SIZE, + res.centerLength); + } + if (par.maskProfile == true) { + masker.maskPssm(centerSequence, par.maskProb, pssmRes); + } + pssmRes.toBuffer(centerSequence, subMat, 
result); + } } - resultWriter.writeData(result.c_str(), result.length(), queryKey, thread_idx); + resultWriter.writeData(result.c_str(), result.length(), queryKey, thread_idx, writePlain == false); result.clear(); alnResults.clear(); @@ -281,7 +291,10 @@ int result2profile(int argc, const char **argv, const Command &command, bool ret } delete[] pNullBuffer; } - resultWriter.close(returnAlnRes == false); + resultWriter.close(returnAlnRes == false || writePlain == true); + if (writePlain) { + FileUtil::remove(par.db4Index.c_str()); + } resultReader.close(); if (!sameDatabase) { diff --git a/src/util/sequence2profile.cpp b/src/util/sequence2profile.cpp index 43a73b2b8..0f34e40dc 100644 --- a/src/util/sequence2profile.cpp +++ b/src/util/sequence2profile.cpp @@ -7,9 +7,9 @@ #include "DBReader.h" #include "Parameters.h" #include "DBWriter.h" +#include "Masker.h" #include -#include #ifdef OPENMP @@ -37,8 +37,7 @@ int sequence2profile(int argc, const char **argv, const Command& command) { { Sequence seq(par.maxSeqLen, sequenceDb.getDbtype(), &subMat, 0, false, false); CSProfile ps(par.maxSeqLen); - ProbabilityMatrix probMatrix(subMat); - PSSMMasker masker(sequenceDb.getMaxSeqLen(), probMatrix, subMat); + Masker masker(subMat); char * pssm = (char * )mem_align(16, Sequence::PROFILE_AA_SIZE * sequenceDb.getMaxSeqLen() * sizeof(char)); float * Neff_M = new float[sequenceDb.getMaxSeqLen()]; std::fill(Neff_M, Neff_M + sequenceDb.getMaxSeqLen(), 1.0f); @@ -65,7 +64,7 @@ int sequence2profile(int argc, const char **argv, const Command& command) { PSSMCalculator::Profile pssmRes(pssm, profile, Neff_M, seq.numSequence); #endif if (par.maskProfile == true) { - masker.mask(seq, par.maskProb, pssmRes); + masker.maskPssm(seq, par.maskProb, pssmRes); } pssmRes.toBuffer(seq, subMat, result); diff --git a/src/util/tsv2exprofiledb.cpp b/src/util/tsv2exprofiledb.cpp index f1c217399..0be49502a 100644 --- a/src/util/tsv2exprofiledb.cpp +++ b/src/util/tsv2exprofiledb.cpp @@ -1,6 +1,7 @@ #include "Parameters.h" #include "FileUtil.h" #include "CommandCaller.h" +#include "Debug.h" #include @@ -18,8 +19,15 @@ int tsv2exprofiledb(int argc, const char **argv, const Command &command) { std::string program = par.db2 + ".sh"; FileUtil::writeFile(program, tsv2exprofiledb_sh, tsv2exprofiledb_sh_len); + if (par.gpu) { + Debug(Debug::INFO) << "Disabling compression for GPU-databases\n"; + par.compressed = false; + } + CommandCaller cmd; cmd.addVariable("COMPRESSED", par.compressed ? "TRUE" : NULL); + cmd.addVariable("GPU", par.gpu ? 
"TRUE" : NULL); + cmd.addVariable("THREADS", par.createParameterString(par.onlythreads).c_str()); cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str()); cmd.execProgram(FileUtil::getRealPathFromSymLink(program).c_str(), par.filenames); diff --git a/src/workflow/ClusterUpdate.cpp b/src/workflow/ClusterUpdate.cpp index 1e84b49fe..25df389f9 100644 --- a/src/workflow/ClusterUpdate.cpp +++ b/src/workflow/ClusterUpdate.cpp @@ -35,9 +35,6 @@ int clusterupdate(int argc, const char **argv, const Command& command) { for (size_t i = 0; i < par.extractorfs.size(); i++) { par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } - for (size_t i = 0; i < par.translatenucs.size(); i++) { - par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); - } for (size_t i = 0; i < par.splitsequence.size(); i++) { par.splitsequence[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } diff --git a/src/workflow/CreateIndex.cpp b/src/workflow/CreateIndex.cpp index fc913f09b..086acc18e 100644 --- a/src/workflow/CreateIndex.cpp +++ b/src/workflow/CreateIndex.cpp @@ -42,8 +42,12 @@ int createindex(Parameters &par, const Command &command, const std::string &inde cmd.addVariable("INDEXER", indexerModule.c_str()); cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); par.translate = 1; - cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str()); - cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str()); + cmd.addVariable("ORF_SKIP", par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME ? "TRUE" : NULL); + if (par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME) { + cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str()); + } else { + cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str()); + } cmd.addVariable("SPLIT_SEQ_PAR", par.createParameterString(par.splitsequence).c_str()); if(indexerModule == "kmerindexdb"){ cmd.addVariable("INDEX_PAR", par.createParameterString(par.kmerindexdb).c_str()); @@ -78,9 +82,6 @@ int createlinindex(int argc, const char **argv, const Command& command) { for (size_t i = 0; i < par.extractorfs.size(); i++) { par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } - for (size_t i = 0; i < par.translatenucs.size(); i++) { - par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); - } par.PARAM_COMPRESSED.addCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); @@ -127,9 +128,6 @@ int createindex(int argc, const char **argv, const Command& command) { for (size_t i = 0; i < par.splitsequence.size(); i++) { par.splitsequence[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } - for (size_t i = 0; i < par.translatenucs.size(); i++) { - par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); - } par.PARAM_COMPRESSED.addCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); diff --git a/src/workflow/EasyRbh.cpp b/src/workflow/EasyRbh.cpp index c78da2530..c02a39efb 100644 --- a/src/workflow/EasyRbh.cpp +++ b/src/workflow/EasyRbh.cpp @@ -23,9 +23,6 @@ int easyrbh(int argc, const char **argv, const Command &command) { for (size_t i = 0; i < par.extractorfs.size(); i++){ par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } - for 
(size_t i = 0; i < par.translatenucs.size(); i++){ - par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); - } for (size_t i = 0; i < par.result2profile.size(); i++){ par.result2profile[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } @@ -75,6 +72,7 @@ int easyrbh(int argc, const char **argv, const Command &command) { CommandCaller cmd; cmd.addVariable("TMP_PATH", tmpDir.c_str()); cmd.addVariable("RESULTS", par.filenames.back().c_str()); + cmd.addVariable("MAKEPADDEDSEQDB_PAR", par.createParameterString(par.makepaddedseqdb).c_str()); par.filenames.pop_back(); std::string target = par.filenames.back().c_str(); cmd.addVariable("TARGET", target.c_str()); diff --git a/src/workflow/EasySearch.cpp b/src/workflow/EasySearch.cpp index 4a90bbd5c..d3f74b864 100644 --- a/src/workflow/EasySearch.cpp +++ b/src/workflow/EasySearch.cpp @@ -42,9 +42,6 @@ int doeasysearch(int argc, const char **argv, const Command &command, bool linse for (size_t i = 0; i < par.extractorfs.size(); i++){ par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } - for (size_t i = 0; i < par.translatenucs.size(); i++){ - par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); - } for (size_t i = 0; i < par.splitsequence.size(); i++) { par.splitsequence[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } @@ -131,9 +128,15 @@ int doeasysearch(int argc, const char **argv, const Command &command, bool linse cmd.addVariable("RUNNER", par.runner.c_str()); cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str()); + bool origShuffle = par.shuffleDatabase; + // don't need to shuffle query, only relevant for prefilter target database order + par.shuffleDatabase = false; cmd.addVariable("CREATEDB_QUERY_PAR", par.createParameterString(par.createdb).c_str()); + par.shuffleDatabase = origShuffle; par.createdbMode = Parameters::SEQUENCE_SPLIT_MODE_HARD; cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.createdb).c_str()); + cmd.addVariable("GPU", par.gpu ? 
"TRUE" : NULL); + cmd.addVariable("MAKEPADDEDSEQDB_PAR", par.createParameterString(par.makepaddedseqdb).c_str()); cmd.addVariable("CONVERT_PAR", par.createParameterString(par.convertalignments).c_str()); cmd.addVariable("SUMMARIZE_PAR", par.createParameterString(par.summarizeresult).c_str()); diff --git a/src/workflow/Linsearch.cpp b/src/workflow/Linsearch.cpp index 5e0fdd991..74b9f2968 100644 --- a/src/workflow/Linsearch.cpp +++ b/src/workflow/Linsearch.cpp @@ -40,9 +40,6 @@ int linsearch(int argc, const char **argv, const Command &command) { for (size_t i = 0; i < par.extractorfs.size(); i++) { par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } - for (size_t i = 0; i < par.translatenucs.size(); i++) { - par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); - } par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); @@ -143,7 +140,6 @@ int linsearch(int argc, const char **argv, const Command &command) { par.translate = 1; cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str()); cmd.addVariable("OFFSETALIGNMENT_PAR", par.createParameterString(par.offsetalignment).c_str()); - cmd.addVariable("TRANSLATE_PAR", par.createParameterString(par.translatenucs).c_str()); cmd.addVariable("SEARCH", program.c_str()); program = std::string(tmpDir + "/translated_search.sh"); FileUtil::writeFile(program, Linsearch::translated_search_sh, Linsearch::translated_search_sh_len); diff --git a/src/workflow/Map.cpp b/src/workflow/Map.cpp index e37a3b166..b459ee1b7 100644 --- a/src/workflow/Map.cpp +++ b/src/workflow/Map.cpp @@ -32,9 +32,6 @@ int map(int argc, const char **argv, const Command &command) { for (size_t i = 0; i < par.extractorfs.size(); i++){ par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } - for (size_t i = 0; i < par.translatenucs.size(); i++){ - par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); - } par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT); par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT); diff --git a/src/workflow/Rbh.cpp b/src/workflow/Rbh.cpp index 28bdd3021..7e5e1ab18 100644 --- a/src/workflow/Rbh.cpp +++ b/src/workflow/Rbh.cpp @@ -27,9 +27,6 @@ int rbh(int argc, const char **argv, const Command &command) { for (size_t i = 0; i < par.extractorfs.size(); i++){ par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } - for (size_t i = 0; i < par.translatenucs.size(); i++){ - par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); - } for (size_t i = 0; i < par.splitsequence.size(); i++) { par.splitsequence[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } diff --git a/src/workflow/Search.cpp b/src/workflow/Search.cpp index 62ec132e2..c70c5a82b 100644 --- a/src/workflow/Search.cpp +++ b/src/workflow/Search.cpp @@ -208,8 +208,8 @@ int search(int argc, const char **argv, const Command& command) { for (size_t i = 0; i < par.extractorfs.size(); i++) { par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } - for (size_t i = 0; i < par.translatenucs.size(); i++) { - par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); + for (size_t i = 0; i < par.extractframes.size(); i++) { + par.extractframes[i]->addCategory(MMseqsParameter::COMMAND_EXPERT); } for (size_t i = 0; i < par.splitsequence.size(); i++) { 
         par.splitsequence[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
@@ -319,6 +319,21 @@ int search(int argc, const char **argv, const Command& command) {
         cmd.addVariable("ALIGN_MODULE", "align");
     }
+    // GPU can only use the ungapped prefilter
+    if (par.gpu == 1 && par.PARAM_PREF_MODE.wasSet == false) {
+        if (par.numIterations > 1
+            || par.alignmentMode != Parameters::ALIGNMENT_MODE_SCORE_ONLY
+            || par.altAlignment > 0
+            || par.scoreBias != 0.0
+            || par.realign == true
+            || par.addBacktrace == true
+        ) {
+            par.prefMode = Parameters::PREF_MODE_UNGAPPED;
+        } else {
+            par.prefMode = Parameters::PREF_MODE_UNGAPPED_AND_GAPPED;
+        }
+    }
+
     switch(par.prefMode){
         case Parameters::PREF_MODE_KMER:
             cmd.addVariable("PREFMODE", "KMER");
@@ -326,6 +341,9 @@ int search(int argc, const char **argv, const Command& command) {
         case Parameters::PREF_MODE_UNGAPPED:
             cmd.addVariable("PREFMODE", "UNGAPPED");
             break;
+        case Parameters::PREF_MODE_UNGAPPED_AND_GAPPED:
+            cmd.addVariable("PREFMODE", "UNGAPPED_AND_GAPPED");
+            break;
         case Parameters::PREF_MODE_EXHAUSTIVE:
             cmd.addVariable("PREFMODE", "EXHAUSTIVE");
             break;
@@ -337,6 +355,10 @@ int search(int argc, const char **argv, const Command& command) {
     // cmd.addVariable("ALIGNMENT_DB_EXT", Parameters::isEqualDbtype(targetDbType, Parameters::DBTYPE_PROFILE_STATE_SEQ) ? ".255" : "");
     par.filenames[1] = targetDB;
     if (par.exhaustiveSearch == true) {
+        if (par.gpu != 0) {
+            Debug(Debug::ERROR) << "No GPU support in exhaustive search\n";
+            EXIT(EXIT_FAILURE);
+        }
         // By default (0), diskSpaceLimit (in bytes) will be set in the workflow to use as much as possible
         cmd.addVariable("AVAIL_DISK", SSTR(static_cast(par.diskSpaceLimit)).c_str());
@@ -351,7 +373,7 @@ int search(int argc, const char **argv, const Command& command) {
         par.maxResListLen = std::max((size_t)300, queryDbSize);
         if(par.prefMode == Parameters::PREF_MODE_KMER){
             cmd.addVariable("PREFILTER_PAR", par.createParameterString(par.prefilter).c_str());
-        } else if (par.prefMode == Parameters::PREF_MODE_UNGAPPED) {
+        } else if (par.prefMode == Parameters::PREF_MODE_UNGAPPED || par.prefMode == Parameters::PREF_MODE_UNGAPPED_AND_GAPPED) {
             cmd.addVariable("UNGAPPEDPREFILTER_PAR", par.createParameterString(par.ungappedprefilter).c_str());
         }
         par.maxResListLen = maxResListLen;
@@ -377,6 +399,10 @@ int search(int argc, const char **argv, const Command& command) {
         FileUtil::writeFile(program, searchslicedtargetprofile_sh, searchslicedtargetprofile_sh_len);
     } else if (((searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_PROFILE) && (searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_AMINOACID)) && par.PARAM_NUM_ITERATIONS.wasSet){
+        if (par.gpu != 0) {
+            Debug(Debug::ERROR) << "No GPU support in profile-profile search\n";
+            EXIT(EXIT_FAILURE);
+        }
         par.exhaustiveSearch = true;
         par.addBacktrace = true;
         int originalNumIterations = par.numIterations;
@@ -407,7 +433,8 @@ int search(int argc, const char **argv, const Command& command) {
             if (par.prefMode == Parameters::PREF_MODE_KMER) {
                 cmd.addVariable(std::string("PREFILTER_PAR_" + SSTR(i)).c_str(),
                                 par.createParameterString(par.prefilter).c_str());
-            } else if (par.prefMode == Parameters::PREF_MODE_UNGAPPED) {
+            } else if (par.prefMode == Parameters::PREF_MODE_UNGAPPED ||
+                       par.prefMode == Parameters::PREF_MODE_UNGAPPED_AND_GAPPED) {
                 cmd.addVariable(std::string("UNGAPPEDPREFILTER_PAR_" + SSTR(i)).c_str(),
                                 par.createParameterString(par.ungappedprefilter).c_str());
             }
@@ -424,6 +451,10 @@ int search(int argc, const char **argv, const Command& command) {
         FileUtil::writeFile(tmpDir + "/iterativepp.sh", iterativepp_sh, iterativepp_sh_len);
         program = std::string(tmpDir + "/iterativepp.sh");
     } else if (searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_PROFILE) {
+        if (par.gpu != 0) {
+            Debug(Debug::ERROR) << "No GPU support in target-side k-mer search\n";
+            EXIT(EXIT_FAILURE);
+        }
         cmd.addVariable("PREFILTER_PAR", par.createParameterString(par.prefilter).c_str());
         // we need to align all hits in case of target Profile hits
         size_t maxResListLen = par.maxResListLen;
@@ -487,8 +518,12 @@ int search(int argc, const char **argv, const Command& command) {
         program = std::string(tmpDir + "/blastpgp.sh");
     } else {
         if (par.sensSteps > 1) {
+            if (par.gpu != 0) {
+                Debug(Debug::ERROR) << "No GPU support in increasing sensitivity search\n";
+                EXIT(EXIT_FAILURE);
+            }
             if (par.startSens > par.sensitivity) {
-                Debug(Debug::ERROR) << "--start-sens should not be greater -s.\n";
+                Debug(Debug::ERROR) << "--start-sens can not be greater than -s\n";
                 EXIT(EXIT_FAILURE);
             }
             cmd.addVariable("SENSE_0", SSTR(par.startSens).c_str());
@@ -518,7 +553,8 @@ int search(int argc, const char **argv, const Command& command) {
             }
             if (par.prefMode == Parameters::PREF_MODE_KMER) {
                 cmd.addVariable("PREFILTER_PAR", par.createParameterString(prefilterWithoutS).c_str());
-            } else if (par.prefMode == Parameters::PREF_MODE_UNGAPPED) {
+            } else if (par.prefMode == Parameters::PREF_MODE_UNGAPPED ||
+                       par.prefMode == Parameters::PREF_MODE_UNGAPPED_AND_GAPPED) {
                 cmd.addVariable("UNGAPPEDPREFILTER_PAR", par.createParameterString(par.ungappedprefilter).c_str());
             }
             if (isUngappedMode) {
@@ -534,19 +570,28 @@ int search(int argc, const char **argv, const Command& command) {
     if (searchMode & (Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED|Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED)) {
         cmd.addVariable("NO_TARGET_INDEX", (indexStr == "") ? "TRUE" : NULL);
-        FileUtil::writeFile(tmpDir + "/translated_search.sh", translated_search_sh, translated_search_sh_len);
         cmd.addVariable("QUERY_NUCL", (searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED) ? "TRUE" : NULL);
-        cmd.addVariable("TARGET_NUCL", (searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED) ? "TRUE" : NULL);
+        cmd.addVariable("TARGET_NUCL", (searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED) ? "TRUE" : NULL);
         cmd.addVariable("THREAD_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str());
         par.subDbMode = 1;
         cmd.addVariable("CREATESUBDB_PAR", par.createParameterString(par.createsubdb).c_str());
         par.translate = 1;
-        cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
         cmd.addVariable("OFFSETALIGNMENT_PAR", par.createParameterString(par.offsetalignment).c_str());
+        cmd.addVariable("ORF_SKIP", par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME ? "TRUE" : NULL);
+        if (par.translationMode == Parameters::PARAM_TRANSLATION_MODE_FRAME) {
+            cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str());
+        } else {
+            cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
+        }
         cmd.addVariable("SEARCH", program.c_str());
         program = std::string(tmpDir + "/translated_search.sh");
+        FileUtil::writeFile(program.c_str(), translated_search_sh, translated_search_sh_len);
     }else if(searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_NUCLEOTIDE && searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_NUCLEOTIDE){
+        if (par.gpu != 0) {
+            Debug(Debug::ERROR) << "No GPU support in nucleotide search\n";
+            EXIT(EXIT_FAILURE);
+        }
         FileUtil::writeFile(tmpDir + "/blastn.sh", blastn_sh, blastn_sh_len);
         // 0: reverse, 1: forward, 2: both
         switch (par.strand){
diff --git a/util/build_osx.sh b/util/build_osx.sh
index 084131ac9..110dcafc8 100755
--- a/util/build_osx.sh
+++ b/util/build_osx.sh
@@ -23,50 +23,94 @@ if [ ! -d "$REPO" ]; then
     exit 1
 fi
 
-export MACOSX_DEPLOYMENT_TARGET=10.12
+ALLOWED_DL_LIBS="lib(System\.B|z|bz2|c\+\+|objc)\."
+
+export MACOSX_DEPLOYMENT_TARGET=10.15
 
 mkdir -p "$BUILD/build_libomp" && cd "$BUILD/build_libomp"
-wget -qO- http://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz | tar xvf -
-cd openmp-11.0.0.src
-wget https://raw.githubusercontent.com/Homebrew/formula-patches/7e2ee1d7/libomp/arm.patch
-patch -p1 < arm.patch
+OMPVERSION=14.0.6
+wget -qO- https://github.com/llvm/llvm-project/releases/download/llvmorg-${OMPVERSION}/openmp-${OMPVERSION}.src.tar.xz | tar xvf -
+cd openmp-${OMPVERSION}.src
 
-mkdir -p "$BUILD/build_libomp/openmp-11.0.0.src/build-amd64" && cd "$BUILD/build_libomp/openmp-11.0.0.src/build-amd64"
-cmake -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_INSTALL_ALIASES=OFF -DLIBOMP_ARCH=x86_64 -DCMAKE_CXX_FLAGS="-arch x86_64" ..
+mkdir -p "$BUILD/build_libomp/openmp-${OMPVERSION}.src/build-amd64" && cd "$BUILD/build_libomp/openmp-${OMPVERSION}.src/build-amd64"
+cmake \
+    -DLIBOMP_ENABLE_SHARED=OFF \
+    -DLIBOMP_INSTALL_ALIASES=OFF \
+    -DLIBOMP_ARCH=x86_64 \
+    -DCMAKE_OSX_ARCHITECTURES=x86_64 \
+    -DCMAKE_C_FLAGS="-arch x86_64" \
+    -DCMAKE_CXX_FLAGS="-arch x86_64" \
+    -DLIBOMP_ASMFLAGS="-arch x86_64" \
+    ..
 make -j${CPUS}
-export LIBOMP_AMD64="$BUILD/build_libomp/openmp-11.0.0.src/build-amd64/runtime/src"
+export LIBOMP_AMD64="$BUILD/build_libomp/openmp-${OMPVERSION}.src/build-amd64/runtime/src"
 
-mkdir -p "$BUILD/build_sse41" && cd "$BUILD/build_sse41"
-cmake -DCMAKE_BUILD_TYPE=Release -DHAVE_TESTS=0 -DHAVE_MPI=0 -DHAVE_SSE4_1=1 -DCMAKE_C_FLAGS="-arch x86_64" -DCMAKE_CXX_FLAGS="-arch x86_64" -DBUILD_SHARED_LIBS=OFF -DCMAKE_FIND_LIBRARY_SUFFIXES=".a" -DOpenMP_C_FLAGS="-Xpreprocessor -fopenmp -I${LIBOMP_AMD64}" -DOpenMP_C_LIB_NAMES=omp -DOpenMP_CXX_FLAGS="-Xpreprocessor -fopenmp -I${LIBOMP_AMD64}" -DOpenMP_CXX_LIB_NAMES=omp -DOpenMP_omp_LIBRARY=${LIBOMP_AMD64}/libomp.a "$REPO"
+mkdir -p "$BUILD/build_avx2" && cd "$BUILD/build_avx2"
+cmake \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DHAVE_TESTS=0 -DHAVE_MPI=0 -DHAVE_AVX2=1 \
+    -DCMAKE_OSX_ARCHITECTURES=x86_64 \
+    -DCMAKE_C_FLAGS="-arch x86_64" -DCMAKE_CXX_FLAGS="-arch x86_64" -DCMAKE_ASM_FLAGS="-arch arm64" \
+    -DBUILD_SHARED_LIBS=OFF -DCMAKE_FIND_LIBRARY_SUFFIXES=".a" \
+    -DOpenMP_C_FLAGS="-Xpreprocessor -fopenmp -I${LIBOMP_AMD64}" -DOpenMP_C_LIB_NAMES=omp -DOpenMP_CXX_FLAGS="-Xpreprocessor -fopenmp -I${LIBOMP_AMD64}" -DOpenMP_CXX_LIB_NAMES=omp -DOpenMP_omp_LIBRARY=${LIBOMP_AMD64}/libomp.a \
+    "$REPO"
 make -j${CPUS}
 
-if [ "$(echo $(otool -L "src/${BINARY_NAME}" | wc -l))" != 5 ]; then
+otool -L "src/${BINARY_NAME}"
+if [ "$(otool -L "src/${BINARY_NAME}" | tail -n +2 | grep -v -E "${ALLOWED_DL_LIBS}" )" != "" ]; then
     echo "Too many linked libraries found in ${BINARY_NAME} binary. Build is not static!"
     exit 1
 fi
 
-mkdir -p "$BUILD/build_avx2" && cd "$BUILD/build_avx2"
-cmake -DCMAKE_BUILD_TYPE=Release -DHAVE_TESTS=0 -DHAVE_MPI=0 -DHAVE_AVX2=1 -DCMAKE_C_FLAGS="-arch x86_64h" -DCMAKE_CXX_FLAGS="-arch x86_64h" -DBUILD_SHARED_LIBS=OFF -DCMAKE_FIND_LIBRARY_SUFFIXES=".a" -DOpenMP_C_FLAGS="-Xpreprocessor -fopenmp -I${LIBOMP_AMD64}" -DOpenMP_C_LIB_NAMES=omp -DOpenMP_CXX_FLAGS="-Xpreprocessor -fopenmp -I${LIBOMP_AMD64}" -DOpenMP_CXX_LIB_NAMES=omp -DOpenMP_omp_LIBRARY=${LIBOMP_AMD64}/libomp.a "$REPO"
-make -j${CPUS}
-
-if [ "$(echo $(otool -L "src/${BINARY_NAME}" | wc -l))" != 5 ]; then
-    echo "Too many linked libraries found in ${BINARY_NAME} binary. Build is not static!"
+if ! vtool -show "src/${BINARY_NAME}" | tee | grep minos | \
+    awk -v version="${MACOSX_DEPLOYMENT_TARGET}" '$2 > version { exit 1 }'
+then
+    echo "macOS deployment target was not set correctly"
     exit 1
 fi
 
 export MACOSX_DEPLOYMENT_TARGET=11.0
 
-mkdir -p "$BUILD/build_libomp/openmp-11.0.0.src/build-arm64" && cd "$BUILD/build_libomp/openmp-11.0.0.src/build-arm64"
-cmake -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_INSTALL_ALIASES=OFF -DLIBOMP_ARCH=aarch64 -DCMAKE_CXX_FLAGS="-arch arm64" -DLIBOMP_ASMFLAGS="-arch arm64" ..
+mkdir -p "$BUILD/build_libomp/openmp-${OMPVERSION}.src/build-arm64" && cd "$BUILD/build_libomp/openmp-${OMPVERSION}.src/build-arm64"
+cmake \
+    -DLIBOMP_ENABLE_SHARED=OFF \
+    -DLIBOMP_INSTALL_ALIASES=OFF \
+    -DLIBOMP_ARCH=aarch64 \
+    -DCMAKE_OSX_ARCHITECTURES=arm64 \
+    -DCMAKE_C_FLAGS="-arch arm64" \
+    -DCMAKE_CXX_FLAGS="-arch arm64" \
+    -DLIBOMP_ASMFLAGS="-arch arm64" \
+    ..
 make -j${CPUS}
-export LIBOMP_AARCH64="$BUILD/build_libomp/openmp-11.0.0.src/build-arm64/runtime/src"
+export LIBOMP_AARCH64="$BUILD/build_libomp/openmp-${OMPVERSION}.src/build-arm64/runtime/src"
 
 mkdir -p "$BUILD/build_arm64" && cd "$BUILD/build_arm64"
-cmake -DCMAKE_BUILD_TYPE=Release -DHAVE_TESTS=0 -DHAVE_MPI=0 -DHAVE_ARM8=1 -DCMAKE_C_FLAGS="-arch arm64" -DCMAKE_CXX_FLAGS="-arch arm64" -DBUILD_SHARED_LIBS=OFF -DCMAKE_FIND_LIBRARY_SUFFIXES=".a" -DOpenMP_C_FLAGS="-Xpreprocessor -fopenmp -I${LIBOMP_AARCH64}" -DOpenMP_C_LIB_NAMES=omp -DOpenMP_CXX_FLAGS="-Xpreprocessor -fopenmp -I${LIBOMP_AARCH64}" -DOpenMP_CXX_LIB_NAMES=omp -DOpenMP_omp_LIBRARY=${LIBOMP_AARCH64}/libomp.a "$REPO"
+cmake \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DHAVE_TESTS=0 -DHAVE_MPI=0 -DHAVE_ARM8=1 \
+    -DCMAKE_OSX_ARCHITECTURES=arm64 \
+    -DCMAKE_C_FLAGS="-arch arm64" -DCMAKE_CXX_FLAGS="-arch arm64" -DCMAKE_ASM_FLAGS="-arch arm64" \
+    -DBUILD_SHARED_LIBS=OFF -DCMAKE_FIND_LIBRARY_SUFFIXES=".a" \
+    -DOpenMP_C_FLAGS="-Xpreprocessor -fopenmp -I${LIBOMP_AARCH64}" -DOpenMP_C_LIB_NAMES=omp -DOpenMP_CXX_FLAGS="-Xpreprocessor -fopenmp -I${LIBOMP_AARCH64}" -DOpenMP_CXX_LIB_NAMES=omp -DOpenMP_omp_LIBRARY=${LIBOMP_AARCH64}/libomp.a \
+    "$REPO"
 make -j${CPUS}
+
+otool -L "src/${BINARY_NAME}"
+if [ "$(otool -L "src/${BINARY_NAME}" | tail -n +2 | grep -v -E "${ALLOWED_DL_LIBS}" )" != "" ]; then
-if [ "$(echo $(otool -L "src/${BINARY_NAME}" | wc -l))" != 5 ]; then
    echo "Too many linked libraries found in ${BINARY_NAME} binary. Build is not static!"
    exit 1
 fi
 
-lipo -create -arch x86_64 "$BUILD/build_sse41/src/${BINARY_NAME}" -arch x86_64h "$BUILD/build_avx2/src/${BINARY_NAME}" -arch arm64 "$BUILD/build_arm64/src/${BINARY_NAME}" -output "$BUILD/${BINARY_NAME}"
+if ! vtool -show "src/${BINARY_NAME}" | tee | grep minos | \
+    awk -v version="${MACOSX_DEPLOYMENT_TARGET}" '$2 > version { exit 1 }'
+then
+    echo "macOS deployment target was not set correctly"
+    exit 1
+fi
+
+lipo \
+    -create \
+    -arch x86_64 "$BUILD/build_avx2/src/${BINARY_NAME}" \
+    -arch arm64 "$BUILD/build_arm64/src/${BINARY_NAME}" \
+    -output "$BUILD/${BINARY_NAME}"
+
diff --git a/util/regression b/util/regression
index 5e3bc17e1..346eb3b95 160000
--- a/util/regression
+++ b/util/regression
@@ -1 +1 @@
-Subproject commit 5e3bc17e17fb7d34e459bc8ad2bedbf7ded2a038
+Subproject commit 346eb3b95853b55cbdbc949267d9a5d815910133
diff --git a/util/update_libmarv.sh b/util/update_libmarv.sh
new file mode 100755
index 000000000..6ba1f8cc4
--- /dev/null
+++ b/util/update_libmarv.sh
@@ -0,0 +1,2 @@
+#!/bin/sh -e
+git subtree pull --prefix lib/libmarv git@github.com:steineggerlab/libmarv.git main --squash
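Note: util/update_libmarv.sh is a thin wrapper around git subtree pull. A minimal sketch of the corresponding one-time seeding command is shown below; this is an illustrative assumption only (the lib/libmarv subtree already exists in the repository), using the same prefix, remote, and branch as the script above.

    # illustrative only: initial import of the subtree before later pulls
    git subtree add --prefix lib/libmarv git@github.com:steineggerlab/libmarv.git main --squash

After this one-time add, routine updates only need the git subtree pull call recorded in util/update_libmarv.sh.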