Speed up kputll.

jkbonfield · whitwham · commit 19a27e959e31 · 2024-07-16T15:19:13.000+01:00
The kputuw function is considerably faster as it encodes 2 digits at a
time and also utilises __builtin_clz.  This changes kputll to use the
same 2 digits at a time trick.  I have a __builtin_clzll variant too,
but with longer numbers it's not the main bottleneck and we fall back
to kputuw for small numbers.  This avoids complicating the code with
builtin checks and alternate versions.

An alternative, purely for sam_format1_append would be something like:

    static inline int kputll_fast(long long c, kstring_t *s) {
        return c &lt;= INT_MAX &amp;&amp; c &gt;= INT_MIN ? kputw(c, s) : kputll(c, s);
    }
    #define kputll kputll_fast

This works as BAM/CRAM only support 32-bit numbers for POS, PNEXT and
TLEN anyway, so ll vs w is an irrelevant distinction.  However I chose
to modify the header file so it fixes other callers.

Overall compressed BAM to uncompressed SAM conversion is about 5%
quicker (tested on 10 million short-read seqs; it'll be minimal on
long seqs).  This includes decode time and other functions too.  The
sam_format1_append only component of that is about 15-25% quicker
depending on compiler and version.
diff --git a/htslib/kstring.h b/htslib/kstring.h
@@ -375,17 +375,63 @@ static inline int kputw(int c, kstring_t *s)
 
 static inline int kputll(long long c, kstring_t *s)
 {
-	char buf[32];
-	int i, l = 0;
-	unsigned long long x = c;
-	if (c < 0) x = -x;
-	do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
-	if (c < 0) buf[l++] = '-';
-	if (ks_resize(s, s->l + l + 2) < 0)
-		return EOF;
-	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
-	s->s[s->l] = 0;
-	return 0;
+    // Worst case expansion.  One check reduces function size
+    // and aids inlining chance.  Memory overhead is minimal.
+    if (ks_resize(s, s->l + 23) < 0)
+	return EOF;
+
+    unsigned long long x = c;
+    if (c < 0) {
+	x = -x;
+        s->s[s->l++] = '-';
+    }
+
+    if (x <= UINT32_MAX)
+	return kputuw(x, s);
+
+    static const char kputull_dig2r[] =
+        "00010203040506070809"
+        "10111213141516171819"
+        "20212223242526272829"
+        "30313233343536373839"
+        "40414243444546474849"
+        "50515253545556575859"
+        "60616263646566676869"
+        "70717273747576777879"
+        "80818283848586878889"
+        "90919293949596979899";
+    unsigned int l, j;
+    char *cp;
+
+    // Find out how long the number is (could consider clzll)
+    uint64_t m = 1;
+    l = 0;
+    if (sizeof(long long)==sizeof(uint64_t) && x >= 10000000000000000000ULL) {
+	// avoids overflow below
+	l = 20;
+    } else {
+	do {
+	    l++;
+	    m *= 10;
+	} while (x >= m);
+    }
+
+    // Add digits two at a time
+    j = l;
+    cp = s->s + s->l;
+    while (x >= 10) {
+        const char *d = &kputull_dig2r[2*(x%100)];
+        x /= 100;
+        memcpy(&cp[j-=2], d, 2);
+    }
+
+    // Last one (if necessary).  We know that x < 10 by now.
+    if (j == 1)
+        cp[0] = x + '0';
+
+    s->l += l;
+    s->s[s->l] = 0;
+    return 0;
 }
 
 static inline int kputl(long c, kstring_t *s) {
diff --git a/test/test_kstring.c b/test/test_kstring.c
@@ -261,6 +261,84 @@ static int test_kputw(int64_t start, int64_t end) {
     return 0;
 }
 
+static int test_kputll_from_to(kstring_t *str, long long s, long long e) {
+    long long i = s;
+
+    for (;;) {
+        str->l = 0;
+        memset(str->s, 0xff, str->m);
+        if (kputll(i, str) < 0 || !str->s) {
+            perror("kputll");
+            return -1;
+        }
+        if (str->l >= str->m || str->s[str->l] != '\0') {
+            fprintf(stderr, "No NUL termination on string from kputll\n");
+            return -1;
+        }
+        if (i != strtoll(str->s, NULL, 10)) {
+            fprintf(stderr,
+                    "kputll wrote the wrong value, expected %lld, got %s\n",
+                    i, str->s);
+            return -1;
+        }
+        if (i >= e) break;
+        i++;
+    }
+    return 0;
+}
+
+static int test_kputll(long long start, long long end) {
+    kstring_t str = { 0, 0, NULL };
+    unsigned long long val;
+
+    str.s = malloc(2);
+    if (!str.s) {
+        perror("malloc");
+        return -1;
+    }
+    str.m = 2;
+
+    for (val = 1; val < INT64_MAX-5; val *= 10) {
+        if (test_kputll_from_to(&str, val >= 5 ? val - 5 : val, val) < 0) {
+            free(ks_release(&str));
+            return -1;
+        }
+    }
+
+    for (val = 1; val < INT64_MAX-5; val *= 10) {
+        long long valm = -val;
+        if (test_kputll_from_to(&str, valm >= 5 ? valm - 5 : valm, valm) < 0) {
+            free(ks_release(&str));
+            return -1;
+        }
+    }
+
+    if (test_kputll_from_to(&str, INT64_MAX - 5, INT64_MAX) < 0) {
+        free(ks_release(&str));
+        return -1;
+    }
+
+    if (test_kputll_from_to(&str, INT64_MIN, INT64_MIN + 5) < 0) {
+        free(ks_release(&str));
+        return -1;
+    }
+
+    str.m = 1; // Force a resize
+    int64_t start2 = (int64_t)start; // no larger on our platforms
+    int64_t end2   = (int64_t)end;
+    clamp(&start2, INT64_MIN, INT64_MAX);
+    clamp(&end2,   INT64_MIN, INT64_MAX);
+
+    if (test_kputll_from_to(&str, start, end) < 0) {
+        free(ks_release(&str));
+        return -1;
+    }
+
+    free(ks_release(&str));
+
+    return 0;
+}
+
 // callback used by test_kgetline
 static char *mock_fgets(char *str, int num, void *p) {
     int *mock_state = (int*)p;
@@ -413,6 +491,9 @@ int main(int argc, char **argv) {
     if (!test || strcmp(test, "kputw") == 0)
         if (test_kputw(start, end) != 0) res = EXIT_FAILURE;
 
+    if (!test || strcmp(test, "kputll") == 0)
+        if (test_kputll(start, end) != 0) res = EXIT_FAILURE;
+
     if (!test || strcmp(test, "kgetline") == 0)
         if (test_kgetline() != 0) res = EXIT_FAILURE;