-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfalse-sharing-with-report.c
268 lines (232 loc) · 7.57 KB
/
false-sharing-with-report.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
/*
* This is an example program to show false sharing between
* numa nodes.
*
* It can be compiled two ways:
* gcc -g false-sharing-with-report.c -pthread -lnuma -o false_sharing
* gcc -g false-sharing-with-report.c -pthread -lnuma -DNO_FALSE_SHARING -o no_false_sharing
*
* The -DNO_FALSE_SHARING macro reduces the false sharing by expanding the shared data
* structure into two different cachelines, (and it runs faster).
*
* The usage is:
* ./false_sharing <number of threads per node>
* ./no_false_sharing <number of threads per node>
*
* The program will make half the threads writer threads and half reader
* threads. It will pin those threads in round-robin format to the
* different numa nodes in the system.
*
* For example, on a system with 4 numa nodes:
* ./false_sharing 2
* 12165 mticks, reader_thd (thread 6), on node 2 (cpu 144).
* 12403 mticks, reader_thd (thread 5), on node 1 (cpu 31).
* 12514 mticks, reader_thd (thread 4), on node 0 (cpu 96).
* 12703 mticks, reader_thd (thread 7), on node 3 (cpu 170).
* 12982 mticks, lock_th (thread 0), on node 0 (cpu 1).
* 13018 mticks, lock_th (thread 1), on node 1 (cpu 24).
* 13049 mticks, lock_th (thread 3), on node 3 (cpu 169).
* 13050 mticks, lock_th (thread 2), on node 2 (cpu 49).
*
* # ./no_false_sharing 2
* 1918 mticks, reader_thd (thread 4), on node 0 (cpu 96).
* 2432 mticks, reader_thd (thread 7), on node 3 (cpu 170).
* 2468 mticks, reader_thd (thread 6), on node 2 (cpu 146).
* 3903 mticks, reader_thd (thread 5), on node 1 (cpu 40).
* 7560 mticks, lock_th (thread 0), on node 0 (cpu 1).
* 7574 mticks, lock_th (thread 2), on node 2 (cpu 145).
* 7602 mticks, lock_th (thread 3), on node 3 (cpu 169).
* 7625 mticks, lock_th (thread 1), on node 1 (cpu 24).
*
*/
#define _MULTI_THREADED
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <sched.h>
#include <pthread.h>
#include <numa.h>
#include <sys/types.h>
/*
* A thread on each numa node seems to provoke cache misses
*/
#define LOOP_CNT (5 * 1024 * 1024)
#if defined(__x86_64__) || defined(__i386__)
/*
 * Read the x86 time-stamp counter.  RDTSC returns the low 32 bits of
 * the counter in EAX and the high 32 bits in EDX; stitch them back
 * together into one 64-bit value.
 * (Fixed: declared (void) -- the original K&R-style empty parameter
 * list left the parameters unspecified.)
 */
static __inline__ uint64_t rdtsc(void) {
    unsigned hi, lo;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ((uint64_t)lo) | (((uint64_t)hi) << 32);
}
#elif defined(__aarch64__)
/*
 * Read the ARMv8 virtual counter (CNTVCT_EL0) as a TSC substitute.
 *
 * According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive the
 * system counter is at least 56 bits wide; from Armv8.6 onward it must
 * be 64 bits wide.  A counter narrower than 64 bits is advertised to
 * userspace via the 'cap_user_time_short' perf capability flag.
 */
static __inline__ uint64_t rdtsc(void)
{
    uint64_t val;
    /* __asm__/__volatile__ (not plain 'asm') so this also compiles
     * under strict -std=c11, where the 'asm' keyword is unavailable. */
    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r" (val));
    return val;
}
#endif
/*
 * Create a struct where reader fields share a cacheline with the hot lock field.
 * Compiling with -DNO_FALSE_SHARING inserts padding to avoid that sharing.
 *
 * Layout (assumes sizeof(long) == 8 and a 64-byte cacheline -- TODO
 * confirm for the target platform):
 *   lock0 @ 0, lock1 @ 8, reserved1 @ 16, pad @ 24.
 * Default build:      pad[1] puts reader1..reader4 at offsets 32..56,
 *                     i.e. in the SAME 64-byte line as the lock fields.
 * -DNO_FALSE_SHARING: pad[5] pushes reader1 to offset 64, the start of
 *                     the NEXT line, so readers no longer collide with
 *                     the lock writers.
 */
typedef struct _buf {
long lock0;      // hot: test-and-set + increment by lock threads (via buf1)
long lock1;      // hot: plain store by lock threads (via buf2)
long reserved1;
#if defined(NO_FALSE_SHARING)
long pad[5]; // to keep the 'lock*' fields on their own cacheline.
#else
long pad[1]; // to provoke false sharing.
#endif
long reader1;    // read-only fields polled by the reader threads
long reader2;
long reader3;
long reader4;
} buf __attribute__((aligned (64)));
buf buf1;                        // written (lock0) by lock threads, read by readers
buf buf2;                        // written (lock1) by lock threads, read by readers
volatile int wait_to_begin = 1;  // start gate: cleared by main() once all threads exist
struct thread_data *thread;      // per-thread bookkeeping array, malloc'd in main()
int max_node_num;                // numa_max_node(), clamped to >= 1 in main()
int num_threads;                 // total threads: (node count) * (threads per node)
char * lock_thd_name = "lock_th";
char * reader_thd_name = "reader_thd";
/*
 * Abort the program when a pthread call fails.
 * 'string' names the failing call; 'val' is its return code (0 == success).
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * and is safe inside an un-braced if/else (the original bare { } block
 * plus the caller's trailing ';' would break such usage).
 */
#define checkResults(string, val) do { \
if (val) { \
printf("Failed with %d at %s", val, string); \
exit(1); \
} \
} while (0)
/* Per-thread bookkeeping; one entry per spawned thread. */
struct thread_data {
pthread_t tid;   // id from pthread_create(), used by main() for joining
long tix;        // thread index [0, num_threads); first half lock, second half reader
long node;       // numa node this thread is pinned to (assigned i % node_cnt)
char *name;      // "lock_th" or "reader_thd"; also selects the role in read_write_func()
};
/*
 * Bind the calling thread to the numa node recorded in its thread_data,
 * and label the thread with its role name (visible in top/perf).
 * 'parm' is a struct thread_data *.
 * (Fixed: removed the unused locals 'volatile uint64_t rc, j;'.)
 */
void setAffinity(void *parm) {
    struct thread_data *td = (struct thread_data *)parm;
    int node = td->node;
    char *func_name = td->name;

    /* NOTE(review): numa_run_on_node() can fail (returns -1); the demo
     * deliberately ignores that and simply runs unpinned in that case. */
    numa_run_on_node(node);
    pthread_setname_np(pthread_self(), func_name);
}
/*
 * Thread function to simulate the false sharing.
 * "Lock" threads hammer the lock fields with test-and-set and plain
 * stores, while reader threads repeatedly load the reader fields that
 * (in the false-sharing build) live in the same cacheline.
 * Prints this thread's elapsed mega-ticks and final cpu/node placement.
 */
extern void *read_write_func(void *parm) {
    int tix = ((struct thread_data *)parm)->tix;
    char *thd_name = ((struct thread_data *)parm)->name;
    uint64_t start, stop, j;

    // Pin this thread to its numa node and set its name.
    setAffinity(parm);

    // Busy-wait until main() releases all threads at once.
    while (wait_to_begin)
        ;

    start = rdtsc();
    for (j = 0; j < LOOP_CNT; j++) {
        // The role is distinguished by the first character of the
        // thread name: 'l' (lock_th) vs 'r' (reader_thd).
        if (*thd_name == *lock_thd_name) {
            // Writers: keep the lock cacheline bouncing between nodes.
            __sync_lock_test_and_set(&buf1.lock0, 1);
            buf1.lock0 += 1;
            buf2.lock1 = 1;
        } else {
            // Readers: each node's readers poll one reader field.  The
            // loads go through a volatile pointer so the compiler cannot
            // hoist them out of the loop.  The cast now matches the
            // fields' declared type 'long' (the original cast through
            // volatile uint64_t * violated strict aliasing; same size
            // on LP64, so behavior is unchanged).
            volatile long var;
            switch (tix % max_node_num) {
            case 0:
                var = *(volatile long *)&buf1.reader1;
                var = *(volatile long *)&buf2.reader1;
                break;
            case 1:
                var = *(volatile long *)&buf1.reader2;
                var = *(volatile long *)&buf2.reader2;
                break;
            case 2:
                var = *(volatile long *)&buf1.reader3;
                var = *(volatile long *)&buf2.reader3;
                break;
            case 3:
                var = *(volatile long *)&buf1.reader4;
                var = *(volatile long *)&buf2.reader4;
                break;
            default:
                // More than 4 nodes: nothing assigned to this thread.
                break;
            }
        }
    } // End of LOOP_CNT loop

    // Report per-thread stats.
    stop = rdtsc();
    int cpu = sched_getcpu();
    int node = numa_node_of_cpu(cpu);
    // Cast: (stop-start)/1000000 is uint64_t, but %ld expects long.
    // The original passed the uint64_t directly -- undefined behavior
    // for a mismatched printf type (harmless on LP64, fixed anyway).
    printf("%ld mticks, %s (thread %d), on node %d (cpu %d).\n",
           (long)((stop - start) / 1000000), thd_name, tix, node, cpu);
    return NULL;
}
/*
 * Spawn <n> threads per numa node -- the first half as lock (writer)
 * threads, the second half as reader threads -- assigned round-robin
 * across nodes, then release them simultaneously and join them all.
 */
int main(int argc, char *argv[])
{
    int i, rc = 0;

    if (argc != 2) { /* argc should be 2 for correct execution */
        printf("usage: %s <n>\n", argv[0]);
        printf("where \"n\" is the number of threads per node\n");
        exit(1);
    }
    if (numa_available() < 0) {
        printf("NUMA not available\n");
        exit(1);
    }

    int thread_cnt = atoi(argv[1]);
    if (thread_cnt < 1) {
        // Reject 0, negatives, and non-numeric input (atoi returns 0);
        // the original silently spawned no usable threads.
        printf("usage: %s <n>\n", argv[0]);
        printf("where \"n\" is a positive number of threads per node\n");
        exit(1);
    }

    max_node_num = numa_max_node();
    // Clamp so 'tix % max_node_num' in read_write_func can never divide
    // by zero on a single-node system.
    if (max_node_num == 0)
        max_node_num = 1;
    int node_cnt = max_node_num + 1;

    // Use "thread_cnt" threads per node.
    num_threads = node_cnt * thread_cnt;
    thread = malloc(sizeof(struct thread_data) * num_threads);
    if (thread == NULL) {
        // The original dereferenced an unchecked malloc().
        printf("out of memory\n");
        exit(1);
    }

    // Create the first half of threads as lock threads.  Each is
    // assigned a successive round-robin node; the pinning itself
    // happens inside the thread after it starts.
    for (i = 0; i < num_threads / 2; i++) {
        thread[i].tix = i;
        thread[i].node = i % node_cnt;
        thread[i].name = lock_thd_name;
        rc = pthread_create(&thread[i].tid, NULL, read_write_func, &thread[i]);
        checkResults("pthread_create()\n", rc);
        usleep(500);
    }

    // Create the second half as reader threads, same node assignment.
    for (i = num_threads / 2; i < num_threads; i++) {
        thread[i].tix = i;
        thread[i].node = i % node_cnt;
        thread[i].name = reader_thd_name;
        rc = pthread_create(&thread[i].tid, NULL, read_write_func, &thread[i]);
        checkResults("pthread_create()\n", rc);
        usleep(500);
    }

    // Release all threads at (roughly) the same moment.
    usleep(500);
    wait_to_begin = 0;

    for (i = 0; i < num_threads; i++) {
        rc = pthread_join(thread[i].tid, NULL);
        checkResults("pthread_join()\n", rc);
    }
    free(thread);
    return 0;
}