Skip to content

Commit bfec486

Browse files
committed
Replace trie with general string-based-key hashmap
Previously, the trie implementation was not consistent, mainly because it used an index to point the referenced func_t into FUNCS. Additionally, a trie's advantage is that it enables prefix lookup, but shecc has never used it that way. Furthermore, it takes 512 bytes per trie node, while this implementation takes 24 + W bytes (W stands for the key length including the terminating null character) per hashmap bucket node, which significantly reduces memory usage. This also allows for future refactoring of additional structures using a hashmap implementation. Notice that the FNV-1a hashing function currently uses a signed integer to hash keys, which would lead to undefined behavior on overflow; instead of adding unsigned integer support to resolve this, we add the "-fwrapv" compiler flag to instruct gcc to wrap overflowed results according to 2's complement representation. Meanwhile, in shecc, signed arithmetic is guaranteed to always wrap around according to 2's complement representation.
1 parent 09bb918 commit bfec486

File tree

3 files changed

+212
-82
lines changed

3 files changed

+212
-82
lines changed

Makefile

+1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
CC ?= gcc
22
CFLAGS := -O -g \
33
-std=c99 -pedantic \
4+
-fwrapv \
45
-Wall -Wextra \
56
-Wno-unused-but-set-variable \
67
-Wno-variadic-macros \

src/defs.h

+11-4
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
#define MAX_LOCALS 1500
2121
#define MAX_FIELDS 64
2222
#define MAX_FUNCS 512
23-
#define MAX_FUNC_TRIES 2160
2423
#define MAX_TYPES 64
2524
#define MAX_IR_INSTR 50000
2625
#define MAX_BB_PRED 128
@@ -305,10 +304,18 @@ typedef struct {
305304
int value;
306305
} constant_t;
307306

307+
/* string-based hash map definitions */
308+
309+
typedef struct hashmap_node {
310+
char *key;
311+
void *val;
312+
struct hashmap_node *next;
313+
} hashmap_node_t;
314+
308315
typedef struct {
309-
int index;
310-
int next[128];
311-
} trie_t;
316+
int size;
317+
hashmap_node_t **buckets;
318+
} hashmap_t;
312319

313320
struct phi_operand {
314321
var_t *var;

src/globals.c

+200-78
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,13 @@ block_list_t BLOCKS;
1515
macro_t *MACROS;
1616
int macros_idx = 0;
1717

18-
/* the first element is reserved for global scope */
19-
func_t *FUNCS;
20-
int funcs_idx = 1;
21-
22-
/* FUNC_TRIES is used to improve the performance of the find_func function.
23-
* Instead of searching through all functions and comparing their names, we can
24-
* utilize the trie data structure to search for existing functions efficiently.
25-
* The index starts from 1 because the first trie node represents an empty input
26-
* string, and it is not possible to record a function with an empty name.
18+
/* FUNCS_MAP is used to integerate function storing and boost lookup
19+
* performance, currently it uses FNV-1a hash function to hash function
20+
* name. The bucket size defaults to MAX_FUNCS. Ideally, it should be a small
21+
* number, but due to lack of rehashing implementation, to prevent collision,
22+
* we have to initially create large amount of buckets.
2723
*/
28-
trie_t *FUNC_TRIES;
29-
int func_tries_idx = 1;
24+
hashmap_t *FUNCS_MAP;
3025

3126
type_t *TYPES;
3227
int types_idx = 0;
@@ -75,72 +70,195 @@ char *elf_strtab;
7570
char *elf_section;
7671

7772
/**
78-
* insert_trie() - Inserts a new element into the trie structure.
79-
* @trie: A pointer to the trie where the name will be inserted.
80-
* @name: The name to be inserted into the trie.
81-
* @funcs_index: The index of the pointer to the func_t. The index is recorded
82-
* in a 1-indexed format. Because the first element of 'FUNCS' has been
83-
* reserved, there is no need to shift it.
84-
* Return: The index of the pointer to the func_t.
73+
* hashmap_hash_index() - hashses a string with FNV-1a hash function
74+
* and converts into usable hashmap index. The range of returned
75+
* hashmap index is ranged from "(0 ~ 2,147,483,647) mod size" due to
76+
* lack of unsigned integer implementation.
77+
* @size: The size of map. Must not be negative or 0.
78+
* @key: The key string. May be NULL.
8579
*
86-
* If the function has been inserted, the return value is the index of the
87-
* function in FUNCS. Otherwise, the return value is the value of the parameter
88-
* @funcs_index.
80+
* Return: The usable hashmap index.
8981
*/
90-
int insert_trie(trie_t *trie, char *name, int funcs_index)
91-
{
92-
char first_char;
93-
int fc;
94-
95-
while (1) {
96-
first_char = *name;
97-
fc = first_char;
98-
if (!fc) {
99-
if (!trie->index)
100-
trie->index = funcs_index;
101-
return trie->index;
102-
}
103-
if (!trie->next[fc]) {
104-
/* FIXME: The func_tries_idx variable may exceed the maximum number,
105-
* which can lead to a segmentation fault. This issue is affected by
106-
* the number of functions and the length of their names. The proper
107-
* way to handle this is to dynamically allocate a new element.
108-
*/
109-
trie->next[fc] = func_tries_idx++;
110-
for (int i = 0; i < 128; i++)
111-
FUNC_TRIES[trie->next[fc]].next[i] = 0;
112-
FUNC_TRIES[trie->next[fc]].index = 0;
113-
}
114-
trie = &FUNC_TRIES[trie->next[fc]];
115-
name++;
82+
int hashmap_hash_index(int size, char *key)
83+
{
84+
int hash = 0x811c9dc5, mask;
85+
86+
for (; *key; key++) {
87+
hash ^= *key;
88+
hash *= 0x01000193;
11689
}
90+
91+
mask = hash >> 31;
92+
return ((hash ^ mask) - mask) & (size - 1);
93+
}
94+
95+
int round_up_pow2(int v)
96+
{
97+
v--;
98+
v |= v >> 1;
99+
v |= v >> 2;
100+
v |= v >> 4;
101+
v |= v >> 8;
102+
v |= v >> 16;
103+
v++;
104+
return v;
117105
}
118106

119107
/**
120-
* find_trie() - search the index of the function name in the trie
121-
* @trie: A pointer to the trie where the name will be searched.
122-
* @name: The name to be searched.
108+
* hashmap_create() - creates a hashmap on heap. Notice that
109+
* provided size will always be rounded up to nearest power of 2.
110+
* @size: The initial bucket size of hashmap. Must not be 0 or
111+
* negative.
123112
*
124-
* Return: The index of the pointer to the func_t.
113+
* Return: The pointer of created hashmap.
114+
*/
115+
hashmap_t *hashmap_create(int size)
116+
{
117+
hashmap_t *map = malloc(sizeof(hashmap_t));
118+
119+
if (!map) {
120+
printf("Failed to allocate hashmap_t with size %d\n", size);
121+
return NULL;
122+
}
123+
124+
map->size = round_up_pow2(size);
125+
map->buckets = malloc(map->size * sizeof(hashmap_node_t *));
126+
127+
if (!map->buckets) {
128+
printf("Failed to allocate buckets in hashmap_t\n");
129+
free(map);
130+
return NULL;
131+
}
132+
133+
for (int i = 0; i < map->size; i++)
134+
map->buckets[i] = 0;
135+
136+
return map;
137+
}
138+
139+
/**
140+
* hashmap_node_new() - creates a hashmap node on heap.
141+
* @key: The key of node. Must not be NULL.
142+
* @val: The value of node. Could be NULL.
125143
*
126-
* 0 - the name not found.
127-
* otherwise - the index of the founded index in the trie array.
144+
* Return: The pointer of created node.
128145
*/
129-
int find_trie(trie_t *trie, char *name)
146+
hashmap_node_t *hashmap_node_new(char *key, void *val)
130147
{
131-
char first_char;
132-
int fc;
148+
if (!key)
149+
return NULL;
150+
151+
int len = strlen(key);
152+
hashmap_node_t *node = malloc(sizeof(hashmap_node_t));
133153

134-
while (1) {
135-
first_char = *name;
136-
fc = first_char;
137-
if (!fc)
138-
return trie->index;
139-
if (!trie->next[fc])
140-
return 0;
141-
trie = &FUNC_TRIES[trie->next[fc]];
142-
name++;
154+
155+
if (!node) {
156+
printf("Failed to allocate hashmap_node_t\n");
157+
return NULL;
143158
}
159+
160+
node->key = calloc(len + 1, sizeof(char));
161+
162+
if (!node->key) {
163+
printf("Failed to allocate hashmap_node_t key with size %d\n");
164+
free(node);
165+
return NULL;
166+
}
167+
168+
strcpy(node->key, key);
169+
node->val = val;
170+
node->next = NULL;
171+
return node;
172+
}
173+
174+
/**
175+
* hashmap_put() - puts a key-value pair into given hashmap.
176+
* If key already contains a value, then replace it with new
177+
* value, the old value will be freed.
178+
* @map: The hashmap to be put into. Must not be NULL.
179+
* @key: The key string. May be NULL.
180+
* @val: The value pointer. May be NULL. This value's lifetime
181+
* is held by hashmap.
182+
*/
183+
void hashmap_put(hashmap_t *map, char *key, void *val)
184+
{
185+
if (!map)
186+
return;
187+
188+
int index = hashmap_hash_index(map->size, key);
189+
hashmap_node_t *cur = map->buckets[index];
190+
191+
if (!cur) {
192+
map->buckets[index] = hashmap_node_new(key, val);
193+
} else {
194+
while (cur->next)
195+
cur = cur->next;
196+
cur->next = hashmap_node_new(key, val);
197+
}
198+
199+
/* TODO: Rehash if size exceeds size * load factor */
200+
}
201+
202+
/**
203+
* hashmap_get() - gets value from hashmap from given key.
204+
* @map: The hashmap to be looked up. Must no be NULL.
205+
* @key: The key string. May be NULL.
206+
*
207+
* Return: The look up result, if the key-value pair entry
208+
* exists, then returns its value's address, NULL otherwise.
209+
*/
210+
void *hashmap_get(hashmap_t *map, char *key)
211+
{
212+
if (!map)
213+
return NULL;
214+
215+
int index = hashmap_hash_index(map->size, key);
216+
217+
for (hashmap_node_t *cur = map->buckets[index]; cur; cur = cur->next)
218+
if (!strcmp(cur->key, key))
219+
return cur->val;
220+
221+
return NULL;
222+
}
223+
224+
/**
225+
* hashmap_contains() - checks if the key-value pair entry exists
226+
* from given key.
227+
* @map: The hashmap to be looked up. Must no be NULL.
228+
* @key: The key string. May be NULL.
229+
*
230+
* Return: The look up result, if the key-value pair entry
231+
* exists, then returns true, false otherwise.
232+
*/
233+
bool hashmap_contains(hashmap_t *map, char *key)
234+
{
235+
return hashmap_get(map, key);
236+
}
237+
238+
/**
239+
* hashmap_free() - frees the hashmap, this also frees key-value pair
240+
* entry's value.
241+
* @map: The hashmap to be looked up. Must no be NULL.
242+
*/
243+
void hashmap_free(hashmap_t *map)
244+
{
245+
if (!map)
246+
return;
247+
248+
for (int i = 0; i < map->size; i++) {
249+
for (hashmap_node_t *cur = map->buckets[i], *next; cur; cur = next) {
250+
next = cur->next;
251+
free(cur->key);
252+
free(cur->val);
253+
/* FIXME: Remove this if-clause will cause double free error */
254+
if (cur != map->buckets[0])
255+
free(cur);
256+
cur = next;
257+
}
258+
}
259+
260+
free(map->buckets);
261+
free(map);
144262
}
145263

146264
/* options */
@@ -321,12 +439,20 @@ int find_macro_param_src_idx(char *name, block_t *parent)
321439
func_t *add_func(char *name)
322440
{
323441
func_t *fn;
324-
int index = insert_trie(FUNC_TRIES, name, funcs_idx);
325-
if (index == funcs_idx) {
326-
fn = &FUNCS[funcs_idx++];
442+
if (hashmap_contains(FUNCS_MAP, name)) {
443+
fn = hashmap_get(FUNCS_MAP, name);
444+
} else {
445+
fn = malloc(sizeof(func_t));
446+
447+
if (!fn) {
448+
printf("Failed to allocate func_t\n");
449+
return NULL;
450+
}
451+
452+
hashmap_put(FUNCS_MAP, name, fn);
327453
strcpy(fn->return_def.var_name, name);
328454
}
329-
fn = &FUNCS[index];
455+
330456
fn->stack_size = 4; /* starting point of stack */
331457
return fn;
332458
}
@@ -361,10 +487,7 @@ constant_t *find_constant(char alias[])
361487

362488
func_t *find_func(char func_name[])
363489
{
364-
int index = find_trie(FUNC_TRIES, func_name);
365-
if (index)
366-
return &FUNCS[index];
367-
return NULL;
490+
return hashmap_get(FUNCS_MAP, func_name);
368491
}
369492

370493
var_t *find_member(char token[], type_t *type)
@@ -600,8 +723,7 @@ void global_init()
600723
BLOCKS.head = NULL;
601724
BLOCKS.tail = NULL;
602725
MACROS = malloc(MAX_ALIASES * sizeof(macro_t));
603-
FUNCS = malloc(MAX_FUNCS * sizeof(func_t));
604-
FUNC_TRIES = malloc(MAX_FUNC_TRIES * sizeof(trie_t));
726+
FUNCS_MAP = hashmap_create(MAX_FUNCS);
605727
TYPES = malloc(MAX_TYPES * sizeof(type_t));
606728
GLOBAL_IR = malloc(MAX_GLOBAL_IR * sizeof(ph1_ir_t));
607729
PH1_IR = malloc(MAX_IR_INSTR * sizeof(ph1_ir_t));
@@ -619,7 +741,8 @@ void global_init()
619741
elf_section = malloc(MAX_SECTION);
620742

621743
/* set starting point of global stack manually */
622-
FUNCS[0].stack_size = 4;
744+
func_t *global_func = add_func("");
745+
global_func->stack_size = 4;
623746
}
624747

625748
void global_release()
@@ -630,8 +753,7 @@ void global_release()
630753
BLOCKS.head = next;
631754
}
632755
free(MACROS);
633-
free(FUNCS);
634-
free(FUNC_TRIES);
756+
hashmap_free(FUNCS_MAP);
635757
free(TYPES);
636758
free(GLOBAL_IR);
637759
free(PH1_IR);

0 commit comments

Comments
 (0)