Skip to content

Commit 9c091f9

Browse files
committed
Use FNV1a for string hashing
The existing X31 hash propagates bits fairly slowly, resulting in a poor distribution of keys if most of the differences between strings are at the end. Fix by using FNV1a instead, which takes a similar time to calculate but distributes keys much more effectively. Also adds a kh_stats() function to khash, which produces a histogram of probe chain lengths, and a khash test framework. The test program can also be used to benchmark insertion and lookup times.
1 parent 1187fa8 commit 9c091f9

File tree

4 files changed

+592
-4
lines changed

4 files changed

+592
-4
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ shlib-exports-*.txt
6767
/test/test_index
6868
/test/test_introspection
6969
/test/test_kfunc
70+
/test/test_khash
7071
/test/test_kstring
7172
/test/test_mod
7273
/test/test_nibbles

Makefile

+6
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ BUILT_TEST_PROGRAMS = \
8585
test/test_expr \
8686
test/test_faidx \
8787
test/test_kfunc \
88+
test/test_khash \
8889
test/test_kstring \
8990
test/test_mod \
9091
test/test_nibbles \
@@ -605,6 +606,7 @@ check test: all $(HTSCODECS_TEST_TARGETS)
605606
test/hts_endian
606607
test/test_expr
607608
test/test_kfunc
609+
test/test_khash
608610
test/test_kstring
609611
test/test_nibbles -v
610612
test/test_str2int
@@ -669,6 +671,9 @@ test/test_faidx: test/test_faidx.o libhts.a
669671
test/test_kfunc: test/test_kfunc.o libhts.a
670672
$(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a -lz $(LIBS) -lpthread
671673

674+
test/test_khash: test/test_khash.o libhts.a
675+
$(CC) $(LDFLAGS) -o $@ test/test_khash.o libhts.a -lz $(LIBS) -lpthread
676+
672677
test/test_kstring: test/test_kstring.o libhts.a
673678
$(CC) $(LDFLAGS) -o $@ test/test_kstring.o libhts.a -lz $(LIBS) -lpthread
674679

@@ -778,6 +783,7 @@ test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_fa
778783
test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(hfile_internal_h)
779784
test/test_expr.o: test/test_expr.c config.h $(htslib_hts_expr_h)
780785
test/test_kfunc.o: test/test_kfunc.c config.h $(htslib_kfunc_h)
786+
test/test_khash.o: test/test_khash.c config.h $(htslib_khash_h) $(htslib_kroundup_h)
781787
test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h)
782788
test/test_mod.o: test/test_mod.c config.h $(htslib_sam_h)
783789
test/test_nibbles.o: test/test_nibbles.c config.h $(htslib_sam_h) $(sam_internal_h)

htslib/khash.h

+83-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* The MIT License
22
33
Copyright (c) 2008, 2009, 2011 by Attractive Chaos <[email protected]>
4-
Copyright (C) 2014-2015, 2018 Genome Research Ltd.
4+
Copyright (C) 2014-2015, 2018, 2024 Genome Research Ltd.
55
66
Permission is hereby granted, free of charge, to any person obtaining
77
a copy of this software and associated documentation files (the
@@ -356,7 +356,39 @@ static const double __ac_HASH_UPPER = 0.77;
356356
__ac_set_isdel_true(h->flags, x); \
357357
--h->size; \
358358
} \
359-
}
359+
} \
360+
SCOPE int kh_stats_##name(kh_##name##_t *h, khint_t *empty, \
361+
khint_t *deleted, khint_t *hist_size, \
362+
khint_t **hist_out) \
363+
{ \
364+
khint_t i, *hist = NULL, dist_max = 0, k, dist, step; \
365+
khint_t mask = h->n_buckets - 1; \
366+
*empty = *deleted = *hist_size = 0; \
367+
hist = (khint_t *) calloc(1, sizeof(*hist)); \
368+
if (!hist) { return -1; } \
369+
for (i = kh_begin(h); i < kh_end(h); ++i) { \
370+
if (__ac_isempty(h->flags, i)) { (*empty)++; continue; } \
371+
if (__ac_isdel(h->flags, i)) { (*deleted)++; continue; } \
372+
k = __hash_func(h->keys[i]) & (h->n_buckets - 1); \
373+
dist = 0; \
374+
step = 0; \
375+
while (k != i) { \
376+
dist++; \
377+
k = (k + (++step)) & mask; \
378+
} \
379+
if (dist_max <= dist) { \
380+
khint_t *new_hist = (khint_t *) realloc(hist, sizeof(*new_hist) * (dist + 1)); \
381+
if (!new_hist) { free(hist); return -1; } \
382+
for (k = dist_max + 1; k <= dist; k++) new_hist[k] = 0; \
383+
hist = new_hist; \
384+
dist_max = dist; \
385+
} \
386+
hist[dist]++; \
387+
} \
388+
*hist_out = hist; \
389+
*hist_size = dist_max + 1; \
390+
return 0; \
391+
}
360392

361393
#define KHASH_DECLARE(name, khkey_t, khval_t) \
362394
__KHASH_TYPE(name, khkey_t, khval_t) \
@@ -391,6 +423,7 @@ static const double __ac_HASH_UPPER = 0.77;
391423
@abstract 64-bit integer comparison function
392424
*/
393425
#define kh_int64_hash_equal(a, b) ((a) == (b))
426+
394427
/*! @function
395428
@abstract const char* hash function
396429
@param s Pointer to a null terminated string
@@ -402,12 +435,28 @@ static kh_inline khint_t __ac_X31_hash_string(const char *s)
402435
if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
403436
return h;
404437
}
438+
439+
/*! @function
440+
@abstract const char* FNV1a hash function
441+
@param s Pointer to a null terminated string
442+
@return The hash value
443+
*/
444+
static kh_inline khint_t __ac_FNV1a_hash_string(const char *s)
445+
{
446+
const khint_t offset_basis = 2166136261;
447+
const khint_t FNV_prime = 16777619;
448+
khint_t h = offset_basis;
449+
for (; *s; ++s) h = (h ^ (uint8_t) *s) * FNV_prime;
450+
return h;
451+
}
452+
405453
/*! @function
406454
@abstract Another interface to const char* hash function
407455
@param key Pointer to a nul terminated string [const char*]
408456
@return The hash value [khint_t]
409457
*/
410-
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
458+
#define kh_str_hash_func(key) __ac_FNV1a_hash_string(key)
459+
411460
/*! @function
412461
@abstract Const char* comparison function
413462
*/
@@ -426,12 +475,29 @@ static kh_inline khint_t __ac_X31_hash_kstring(const kstring_t ks)
426475
h = (h << 5) - h + (khint_t)ks.s[i];
427476
return h;
428477
}
478+
479+
/*! @function
480+
@abstract Kstring hash function
481+
@param s Pointer to a kstring
482+
@return The hash value
483+
*/
484+
static kh_inline khint_t __ac_FNV1a_hash_kstring(const kstring_t ks)
485+
{
486+
const khint_t offset_basis = 2166136261;
487+
const khint_t FNV_prime = 16777619;
488+
khint_t h = offset_basis;
489+
size_t i;
490+
for (i = 0; i < ks.l; i++)
491+
h = (h ^ (uint8_t) ks.s[i]) * FNV_prime;
492+
return h;
493+
}
494+
429495
/*! @function
430496
@abstract Interface to kstring hash function.
431497
@param key The kstring to hash [kstring_t]; permits hashing on non-nul terminated strings.
432498
@return The hash value [khint_t]
433499
*/
434-
#define kh_kstr_hash_func(key) __ac_X31_hash_kstring(key)
500+
#define kh_kstr_hash_func(key) __ac_FNV1a_hash_kstring(key)
435501
/*! @function
436502
@abstract kstring comparison function
437503
*/
@@ -604,6 +670,19 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key)
604670
code; \
605671
} }
606672

673+
/*! @function
674+
@abstract Gather hash table statistics
675+
@param name Name of the hash table [symbol]
676+
@param h Pointer to the hash table [khash_t(name)*]
677+
@param empty[out] Number of empty hash bins
678+
@param deleted[out] Number of hash bins with the deleted flag
679+
@param hist_size[out] Size of @p hist array
680+
@param hist[out]      Probe count histogram (heap-allocated; release with free())
681+
@return 0 on success; -1 on failure
682+
*/
683+
#define kh_stats(name, h, empty, deleted, hist_size, hist) \
684+
kh_stats_##name(h, empty, deleted, hist_size, hist)
685+
607686
/* More convenient interfaces */
608687

609688
/*! @function

0 commit comments

Comments
 (0)