Skip to content

Commit

Permalink
feat: fuzzy - caverphone coding
Browse files Browse the repository at this point in the history
  • Loading branch information
nalgeon committed Nov 20, 2021
1 parent 76244e6 commit 76b1609
Show file tree
Hide file tree
Showing 5 changed files with 325 additions and 0 deletions.
4 changes: 4 additions & 0 deletions docs/fuzzy.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,15 @@ Only ASCII strings are supported.

Compute phonetic string code:

- `caverphone(x)` - Caverphone code,
- `phonetic_hash(x)` - Spellcheck phonetic code,
- `soundex(x)` - Soundex code,
- `rsoundex(x)` - Refined Soundex code.

```
sqlite> select caverphone('awesome');
AWSM111111
sqlite> select phonetic_hash('awesome');
ABACAMA
Expand Down
294 changes: 294 additions & 0 deletions src/fuzzy/caverphone.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,294 @@
#include <assert.h>
#include <stdlib.h>
#include <string.h>

// remove_non_letters returns a newly allocated copy of `src`
// that keeps only the lowercase ASCII letters a-z, in their
// original order. The caller owns the result and must free() it.
static char* remove_non_letters(const char* src) {
    // the result can never be longer than the source
    char* res = malloc((strlen(src) + 1) * sizeof(char));
    char* out = res;
    for (const char* in = src; *in != '\0'; in++) {
        if (*in >= 'a' && *in <= 'z') {
            *out++ = *in;
        }
    }
    *out = '\0';
    return res;
}

// replace_start returns a newly allocated copy of `src` in which
// a leading `old` prefix (if present) is swapped for `new`.
// `new` must not be longer than `old`, so the result always fits
// into a buffer sized after `src`. The caller frees the result.
static char* replace_start(const char* src, const char* old, const char* new) {
    const size_t src_len = strlen(src);
    const size_t old_len = strlen(old);
    const size_t new_len = strlen(new);
    assert(new_len <= old_len);

    char* res = malloc((src_len + 1) * sizeof(char));

    // no prefix match: plain copy, terminator included
    if (strncmp(src, old, old_len) != 0) {
        memcpy(res, src, src_len + 1);
        return res;
    }

    // prefix matched, so `src` is at least `old_len` characters long
    const size_t tail_len = src_len - old_len;
    memcpy(res, new, new_len);
    memcpy(res + new_len, src + old_len, tail_len);
    res[new_len + tail_len] = '\0';
    return res;
}

// replace_end returns a newly allocated copy of `src` in which
// a trailing `old` suffix (if present) is swapped for `new`.
// `new` must not be longer than `old`, so the result always fits
// into a buffer sized after `src`. The caller frees the result.
//
// A `src` shorter than `old` cannot end with `old` and is returned
// as a plain copy. The length check has to happen before any
// `src_len - old_len` arithmetic: size_t subtraction would wrap
// around and turn into an out-of-bounds read and write.
static char* replace_end(const char* src, const char* old, const char* new) {
    size_t src_len = strlen(src);
    size_t old_len = strlen(old);
    size_t new_len = strlen(new);
    assert(new_len <= old_len);

    char* res = malloc((src_len + 1) * sizeof(char));

    // no suffix match (or source too short): plain copy, terminator included
    if (src_len < old_len || memcmp(src + (src_len - old_len), old, old_len) != 0) {
        memcpy(res, src, src_len + 1);
        return res;
    }

    // keep everything before the suffix, then append the replacement
    size_t head_len = src_len - old_len;
    memcpy(res, src, head_len);
    memcpy(res + head_len, new, new_len);
    res[head_len + new_len] = '\0';
    return res;
}

// replace returns a newly allocated copy of `src` in which every
// occurrence of `old` is swapped for `new`, scanning left to right
// without re-examining already replaced text.
// `new` must not be longer than `old`, so the result always fits
// into a buffer sized after `src`. The caller frees the result.
static char* replace(const char* src, const char* old, const char* new) {
    const size_t src_len = strlen(src);
    const size_t old_len = strlen(old);
    const size_t new_len = strlen(new);
    assert(new_len <= old_len);

    char* res = malloc((src_len + 1) * sizeof(char));
    char* out = res;
    const char* in = src;
    const char* const end = src + src_len;
    while (in < end) {
        if (strncmp(in, old, old_len) != 0) {
            // not a match here: carry the character over as is
            *out++ = *in++;
            continue;
        }
        memcpy(out, new, new_len);
        out += new_len;
        in += old_len;
    }
    *out = '\0';
    return res;
}

// replace_seq returns a newly allocated copy of `src` in which every
// run of one or more consecutive `old` characters is collapsed into
// a single `new` substring. The caller frees the result.
//
// `new` must be at most one character long: a run can be as short
// as one character, and the result buffer is sized after `src`.
static char* replace_seq(const char* src, const char old, const char* new) {
    size_t src_len = strlen(src);
    size_t new_len = strlen(new);
    assert(new_len <= 1);

    char* res = malloc((src_len + 1) * sizeof(char));
    char* res_it = res;
    for (size_t idx = 0; idx < src_len;) {
        if (src[idx] != old) {
            *res_it++ = src[idx++];
            continue;
        }
        // skip the whole run, then emit the replacement once
        while (idx < src_len && src[idx] == old) {
            idx++;
        }
        memcpy(res_it, new, new_len);
        res_it += new_len;
    }
    *res_it = '\0';
    return res;
}

// pad returns a newly allocated string of exactly 10 characters:
// the first 10 characters of `src`, right-filled with '1'
// when `src` is shorter. The caller frees the result.
static char* pad(const char* src) {
    enum { CODE_LEN = 10 };

    char* res = malloc((CODE_LEN + 1) * sizeof(char));
    size_t idx = 0;
    // copy at most CODE_LEN characters of the source
    for (; idx < CODE_LEN && src[idx] != '\0'; idx++) {
        res[idx] = src[idx];
    }
    // fill whatever is left with 1s
    for (; idx < CODE_LEN; idx++) {
        res[idx] = '1';
    }
    res[CODE_LEN] = '\0';
    return res;
}

// step releases the previous intermediate string `src` and hands
// back `res`, so that transformations can be chained as
// `s = step(transform(s), s)` without leaking the old value.
static char* step(char* res, char* src) {
    char* next = res;
    free(src);
    return next;
}

// caverphone implements the Caverphone phonetic hashing algorithm
// as described in https://caversham.otago.ac.nz/files/working/ctp150804.pdf
//
// `src` must be a non-NULL ASCII string. Uppercase ASCII letters are
// folded to lowercase before coding; every other non-letter character
// is dropped. Returns a newly allocated string that the caller must
// free(): an empty string for empty input, otherwise a code of exactly
// 10 characters, right-padded with '1'.
//
// Each transformation allocates a fresh string; `step` frees the
// previous one. The order of the replacements matters, since later
// rules match the digits and uppercase markers produced by earlier ones.
char* caverphone(const char* src) {
    assert(src != NULL);

    // Check the input before measuring it; the empty string
    // maps to an empty code (no padding).
    if (src == NULL || *src == '\0') {
        char* empty = malloc(sizeof(char));
        empty[0] = '\0';
        return empty;
    }

    // Convert to lowercase. The copy includes the terminator.
    size_t src_len = strlen(src);
    char* res = malloc((src_len + 1) * sizeof(char));
    for (size_t idx = 0; idx <= src_len; idx++) {
        char ch = src[idx];
        res[idx] = (ch >= 'A' && ch <= 'Z') ? (char)(ch - 'A' + 'a') : ch;
    }

    // Remove anything not in the standard alphabet
    res = step(remove_non_letters(res), res);

    // Remove final e
    res = step(replace_end(res, "e", ""), res);

    // If the name starts with *gh make it *2f
    res = step(replace_start(res, "cough", "cou2f"), res);
    res = step(replace_start(res, "rough", "rou2f"), res);
    res = step(replace_start(res, "tough", "tou2f"), res);
    res = step(replace_start(res, "enough", "enou2f"), res);
    res = step(replace_start(res, "trough", "trou2f"), res);

    // If the name starts with gn make it 2n
    res = step(replace_start(res, "gn", "2n"), res);
    // If the name ends with mb make it m2
    res = step(replace_end(res, "mb", "m2"), res);
    // replace cq with 2q
    res = step(replace(res, "cq", "2q"), res);

    // replace c[iey] with s[iey]
    res = step(replace(res, "ci", "si"), res);
    res = step(replace(res, "ce", "se"), res);
    res = step(replace(res, "cy", "sy"), res);

    // replace tch with 2ch
    res = step(replace(res, "tch", "2ch"), res);

    // replace [cqx] with k
    res = step(replace(res, "c", "k"), res);
    res = step(replace(res, "q", "k"), res);
    res = step(replace(res, "x", "k"), res);

    // replace v with f
    res = step(replace(res, "v", "f"), res);
    // replace dg with 2g
    res = step(replace(res, "dg", "2g"), res);

    // replace ti[oa] with si[oa]
    res = step(replace(res, "tio", "sio"), res);
    res = step(replace(res, "tia", "sia"), res);

    // replace d with t
    res = step(replace(res, "d", "t"), res);
    // replace ph with fh
    res = step(replace(res, "ph", "fh"), res);
    // replace b with p
    res = step(replace(res, "b", "p"), res);
    // replace sh with s2
    res = step(replace(res, "sh", "s2"), res);
    // replace z with s
    res = step(replace(res, "z", "s"), res);

    // replace an initial vowel [aeiou] with an A
    res = step(replace_start(res, "a", "A"), res);
    res = step(replace_start(res, "e", "A"), res);
    res = step(replace_start(res, "i", "A"), res);
    res = step(replace_start(res, "o", "A"), res);
    res = step(replace_start(res, "u", "A"), res);

    // replace all other vowels with a 3
    res = step(replace(res, "a", "3"), res);
    res = step(replace(res, "e", "3"), res);
    res = step(replace(res, "i", "3"), res);
    res = step(replace(res, "o", "3"), res);
    res = step(replace(res, "u", "3"), res);

    // replace j with y
    res = step(replace(res, "j", "y"), res);

    // replace an initial y3 with Y3
    res = step(replace_start(res, "y3", "Y3"), res);
    // replace an initial y with A
    res = step(replace_start(res, "y", "A"), res);
    // replace y with 3
    res = step(replace(res, "y", "3"), res);

    // replace 3gh3 with 3kh3
    res = step(replace(res, "3gh3", "3kh3"), res);
    // replace gh with 22
    res = step(replace(res, "gh", "22"), res);
    // replace g with k
    res = step(replace(res, "g", "k"), res);

    // replace sequence of the letter [stpkfmn] with an uppercased letter
    res = step(replace_seq(res, 's', "S"), res);
    res = step(replace_seq(res, 't', "T"), res);
    res = step(replace_seq(res, 'p', "P"), res);
    res = step(replace_seq(res, 'k', "K"), res);
    res = step(replace_seq(res, 'f', "F"), res);
    res = step(replace_seq(res, 'm', "M"), res);
    res = step(replace_seq(res, 'n', "N"), res);

    // replace w3 with W3
    res = step(replace(res, "w3", "W3"), res);
    // replace wh3 with Wh3
    res = step(replace(res, "wh3", "Wh3"), res);
    // replace the final w with 3
    res = step(replace_end(res, "w", "3"), res);
    // replace w with 2
    res = step(replace(res, "w", "2"), res);

    // replace an initial h with an A
    res = step(replace_start(res, "h", "A"), res);
    // replace all other occurrences of h with a 2
    res = step(replace(res, "h", "2"), res);

    // replace r3 with R3
    res = step(replace(res, "r3", "R3"), res);
    // replace the final r with 3
    res = step(replace_end(res, "r", "3"), res);
    // replace r with 2
    res = step(replace(res, "r", "2"), res);

    // replace l3 with L3
    res = step(replace(res, "l3", "L3"), res);
    // replace the final l with 3
    res = step(replace_end(res, "l", "3"), res);
    // replace l with 2
    res = step(replace(res, "l", "2"), res);

    // remove all 2s
    res = step(replace(res, "2", ""), res);
    // replace the final 3 with A
    res = step(replace_end(res, "3", "A"), res);
    // remove all 3s
    res = step(replace(res, "3", ""), res);

    // put ten 1s on the end
    // take the first ten characters as the code
    res = step(pad(res), res);

    return res;
}
1 change: 1 addition & 0 deletions src/fuzzy/fuzzy.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ unsigned optimal_string_alignment(const char*, const char*);
int edit_distance(const char*, const char*, int*);

// phonetics
char* caverphone(const char*);
char* soundex(const char*);
char* refined_soundex(const char*);
unsigned char* phonetic_hash(const unsigned char*, int);
Expand Down
19 changes: 19 additions & 0 deletions src/sqlite3-fuzzy.c
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,23 @@ static void sqlite3_script_code(sqlite3_context* context, int argc, sqlite3_valu
sqlite3_result_int(context, res);
}

// Below are custom functions

// sqlite3_caverphone is the SQL function `caverphone(x)`.
// It reads the single argument as text and returns its Caverphone
// code. When the argument has no text value it returns without
// setting a result; a non-ASCII argument raises an SQL error.
// The code string allocated by caverphone() is handed to SQLite
// together with `free` as its destructor.
static void sqlite3_caverphone(sqlite3_context* context, int argc, sqlite3_value** argv) {
    assert(argc == 1);
    const unsigned char* text = sqlite3_value_text(argv[0]);
    if (!text) {
        return;
    }
    if (is_ascii(text)) {
        char* code = caverphone((const char*)text);
        sqlite3_result_text(context, code, -1, free);
        return;
    }
    sqlite3_result_error(context, "argument should be ASCII string", -1);
}

/*
* Registers the extension.
*/
Expand All @@ -259,5 +276,7 @@ __declspec(dllexport)
sqlite3_create_function(db, "phonetic_hash", 1, flags, 0, sqlite3_phonetic_hash, 0, 0);
sqlite3_create_function(db, "script_code", 1, flags, 0, sqlite3_script_code, 0, 0);
sqlite3_create_function(db, "translit", 1, flags, 0, sqlite3_transliterate, 0, 0);
// custom
sqlite3_create_function(db, "caverphone", 1, flags, 0, sqlite3_caverphone, 0, 0);
return SQLITE_OK;
}
7 changes: 7 additions & 0 deletions test/fuzzy.sql
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,10 @@ select '122', rsoundex('') = '';
select '123', rsoundex('phonetics') = 'P1080603';
select '124', rsoundex('is') = 'I03';
select '125', rsoundex('awesome') = 'A03080';

-- Caverphone phonetic code
select '131', caverphone(null) is null;
select '132', caverphone('') = '';
select '133', caverphone('phonetics') = 'FNTKS11111';
select '134', caverphone('is') = 'AS11111111';
select '135', caverphone('awesome') = 'AWSM111111';

0 comments on commit 76b1609

Please sign in to comment.