-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcache_latency_checker.cpp
206 lines (180 loc) · 6.31 KB
/
cache_latency_checker.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
// The following code is adapted from: https://stackoverflow.com/a/21463541/1691873
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
/*
 * Executes the x86 CPUID instruction for the given leaf (EAX) and sub-leaf (ECX)
 * and stores the resulting EAX, EBX, ECX, EDX in regs[0..3].
 */
static void cpuid_query(uint32_t leaf, uint32_t subleaf, uint32_t regs[4]) {
  uint32_t eax = leaf, ebx, ecx = subleaf, edx;
  // __asm__ (rather than asm) also compiles in strict ISO C modes; volatile keeps
  // the compiler from dropping or merging the instruction.
  __asm__ volatile("cpuid" : "+a"(eax), "=b"(ebx), "+c"(ecx), "=d"(edx));
  regs[0] = eax;
  regs[1] = ebx;
  regs[2] = ecx;
  regs[3] = edx;
}

/*
 * Enumerates the caches reported by CPUID leaf 4 (Intel "Deterministic Cache
 * Parameters"), prints a description of each one to stdout and records the
 * total size in bytes of every data or unified cache.
 *
 * data_caches: output array; at most 32 entries are written, so the caller must
 *              provide room for 32 elements.
 * Returns the number of entries written to data_caches. Returns 0 without
 * printing anything when the CPU does not implement leaf 4.
 */
int i386_cpuid_caches(size_t* data_caches) {
  static const char* const type_names[] = {NULL, "Data Cache", "Instruction Cache",
                                           "Unified Cache"};
  const uint32_t cache_params_leaf = 4;
  uint32_t regs[4];
  int num_data_caches = 0;

  // Leaf 0 reports the highest supported basic leaf in EAX. Querying an
  // unsupported leaf returns data for a different leaf, which would be decoded
  // as bogus cache descriptors below.
  cpuid_query(0, 0, regs);
  if (regs[0] < cache_params_leaf) {
    return 0;
  }

  for (int i = 0; i < 32; i++) {
    cpuid_query(cache_params_leaf, (uint32_t)i, regs);
    const uint32_t eax = regs[0];
    const uint32_t ebx = regs[1];
    const uint32_t ecx = regs[2];

    // EAX[4:0]: cache type; 0 marks the end of the valid cache identifiers.
    const unsigned int cache_type = eax & 0x1F;
    if (cache_type == 0) {
      break;
    }
    const char* cache_type_string =
        cache_type < sizeof(type_names) / sizeof(*type_names) ? type_names[cache_type]
                                                              : "Unknown Type Cache";

    // EAX[7:5]: level, EAX[8]: self initializing (needs no SW initialization),
    // EAX[9]: fully associative.
    const int cache_level = (int)((eax >> 5) & 0x7);
    const int cache_is_self_initializing = (int)((eax >> 8) & 0x1);
    const int cache_is_fully_associative = (int)((eax >> 9) & 0x1);

    // ECX holds the number of sets minus one. EBX packs three "minus one"
    // fields: line size in bits 11:0, partitions in 21:12, ways in 31:22.
    const unsigned int cache_sets = ecx + 1;
    const unsigned int cache_coherency_line_size = (ebx & 0xFFF) + 1;
    const unsigned int cache_physical_line_partitions = ((ebx >> 12) & 0x3FF) + 1;
    const unsigned int cache_ways_of_associativity = ((ebx >> 22) & 0x3FF) + 1;

    // Total cache size is the product of the four fields. Widen to size_t before
    // multiplying so the product is not truncated to 32 bits first.
    const size_t cache_total_size = (size_t)cache_ways_of_associativity *
                                    cache_physical_line_partitions * cache_coherency_line_size *
                                    cache_sets;

    // Only caches that can hold data (type 1 = data, type 3 = unified) are reported.
    if (cache_type == 1 || cache_type == 3) {
      data_caches[num_data_caches++] = cache_total_size;
    }

    printf(
        "Cache ID %d:\n"
        "- Level: %d\n"
        "- Type: %s\n"
        "- Sets: %u\n"
        "- System Coherency Line Size: %u bytes\n"
        "- Physical Line partitions: %u\n"
        "- Ways of associativity: %u\n"
        "- Total Size: %zu bytes (%zu kb)\n"
        "- Is fully associative: %s\n"
        "- Is Self Initializing: %s\n"
        "\n",
        i, cache_level, cache_type_string, cache_sets, cache_coherency_line_size,
        cache_physical_line_partitions, cache_ways_of_associativity, cache_total_size,
        cache_total_size >> 10, cache_is_fully_associative ? "true" : "false",
        cache_is_self_initializing ? "true" : "false");
  }
  return num_data_caches;
}
/*
 * Times `attempts` single-byte loads at pseudo-random offsets (from rand())
 * inside a freshly mapped anonymous buffer of `lower_cache_size` bytes and adds
 * each measured duration, in TSC ticks, to a histogram.
 *
 * attempts:         number of timed loads to perform.
 * lower_cache_size: size in bytes of the buffer to map and probe.
 * latencies:        histogram with `max_latency` buckets; bucket k counts the
 *                   loads that took k ticks. The buckets are incremented, not
 *                   reset, so the caller decides whether to clear them first.
 * max_latency:      number of buckets; every measurement >= max_latency is
 *                   counted in the last bucket.
 *
 * Returns 0 on success, or -1 if samples were requested but there is no
 * histogram to store them in. Aborts the process if the buffer cannot be mapped.
 */
int test_cache(size_t attempts, size_t lower_cache_size, size_t* latencies, size_t max_latency) {
  // Without at least one bucket the "last bucket" write would be out of bounds.
  if (attempts > 0 && (latencies == NULL || max_latency == 0)) {
    return -1;
  }

  char* random_data = (char*)mmap(NULL, lower_cache_size, PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANON, -1, 0);
  if (random_data == MAP_FAILED) {
    perror("mmap");
    abort();
  }

  // Write one byte per page so every page is backed by real memory before the
  // timing starts; otherwise the first access to each page would pay for a page
  // fault. The page size is looked up once instead of on every iteration.
  long page_size = sysconf(_SC_PAGESIZE);
  if (page_size <= 0) {
    page_size = 4096;  // sysconf failed; fall back to a common page size
  }
  for (size_t off = 0; off < lower_cache_size; off += (size_t)page_size) {
    random_data[off] = 1;
  }

  size_t random_offset = 0;
  while (attempts--) {
    random_offset = (random_offset + (size_t)rand()) % lower_cache_size;

    // Use the processor's time stamp counter for the measurement.
    uint32_t delta_lo, delta_hi, start_hi, start_lo;
    // - volatile: the statement must run once per iteration and must not be
    //   moved or merged by the optimizer.
    // - "=&" (early clobber): every output register is written before the memory
    //   input %4 is read, so none of them may be shared with its address.
    __asm__ volatile(
        "mfence\n\t"         // memory fence
        "rdtsc\n\t"          // start time stamp in edx:eax
        "mov %%edx, %2\n\t"  // save high half
        "mov %%eax, %3\n\t"  // save low half
        "mfence\n\t"         // memory fence
        "mov %4, %%al\n\t"   // the load being timed
        "mfence\n\t"
        "rdtsc\n\t"          // end time stamp in edx:eax
        "sub %3, %%eax\n\t"  // subtract the low halves first ...
        "sbb %2, %%edx"      // ... then the high halves with the borrow
        : "=&a"(delta_lo), "=&d"(delta_hi), "=&r"(start_hi), "=&r"(start_lo)
        : "m"(random_data[random_offset])
        : "cc", "memory");

    // Treat the difference as one unsigned 64-bit value: anything that does not
    // fit the histogram (including a counter that appears to run backwards)
    // lands in the last bucket.
    const uint64_t cycles_used = ((uint64_t)delta_hi << 32) | delta_lo;
    if (cycles_used < max_latency) {
      latencies[cycles_used]++;
    } else {
      latencies[max_latency - 1]++;
    }
  }

  if (munmap(random_data, lower_cache_size) != 0) {
    perror("munmap");
  }
  return 0;
}
/*
 * Returns the index of the first histogram bucket at which the running total
 * reaches 75% of all samples (the 75th-percentile bucket). Returns 0 for an
 * empty histogram.
 */
static size_t histogram_percentile75(const size_t* histogram, size_t buckets) {
  size_t total = 0;
  for (size_t b = 0; b < buckets; b++) {
    total += histogram[b];
  }
  size_t running = 0;
  for (size_t b = 0; b < buckets; b++) {
    running += histogram[b];
    if (running >= total * .75) {
      return b;
    }
  }
  return 0;  // only reached when buckets == 0
}

/*
 * Prints the caches found via CPUID, measures how many TSC ticks the timing
 * sequence itself takes, then times random loads for each data/unified cache
 * and reports the 75th-percentile load time minus that overhead on stderr.
 */
int main() {
  size_t cache_sizes[32];
  const int num_data_caches = i386_cpuid_caches(cache_sizes);

  size_t latencies[0x400];
  const size_t num_buckets = sizeof(latencies) / sizeof(*latencies);
  const size_t attempts = 1000000;

  // Measure how much overhead the cycle counting itself has: the same
  // instruction sequence as in test_cache, but without the load in the middle.
  memset(latencies, 0, sizeof(latencies));
  for (size_t i = 0; i < attempts; i++) {
    uint32_t delta_lo, delta_hi, start_hi, start_lo;
    // volatile is required here: the statement has no inputs, so the optimizer
    // would otherwise be free to execute it once and reuse the result.
    __asm__ volatile(
        "mfence\n\t"         // memory fence
        "rdtsc\n\t"          // start time stamp in edx:eax
        "mov %%edx, %2\n\t"  // save high half
        "mov %%eax, %3\n\t"  // save low half
        "mfence\n\t"         // memory fence
        "mfence\n\t"
        "rdtsc\n\t"          // end time stamp in edx:eax
        "sub %3, %%eax\n\t"  // subtract the low halves first ...
        "sbb %2, %%edx"      // ... then the high halves with the borrow
        : "=&a"(delta_lo), "=&d"(delta_hi), "=&r"(start_hi), "=&r"(start_lo)
        :
        : "cc", "memory");
    const uint64_t cycles_used = ((uint64_t)delta_hi << 32) | delta_lo;
    if (cycles_used < num_buckets) {
      latencies[cycles_used]++;
    } else {
      latencies[num_buckets - 1]++;
    }
  }
  const size_t empty_cycles = histogram_percentile75(latencies, num_buckets);
  fprintf(stderr, "Empty counting takes %zu cycles\n", empty_cycles);

  // NOTE(review): the number printed as "Cache ID" below is the index into
  // cache_sizes (data and unified caches only), which differs from the ID that
  // i386_cpuid_caches prints whenever an instruction cache precedes it.
  for (int c = 0; c < num_data_caches; c++) {
    // Start from an empty histogram; otherwise the calibration samples and the
    // samples of the previously tested caches would skew this percentile.
    memset(latencies, 0, sizeof(latencies));
    // The probed buffer is four times the cache size, presumably so that most
    // random loads miss this cache.
    test_cache(attempts, cache_sizes[c] * 4, latencies, num_buckets);
    const size_t measured = histogram_percentile75(latencies, num_buckets);
    // Clamp at zero: an unsigned subtraction would wrap to a huge value if the
    // measurement came out below the calibrated overhead.
    const size_t latency = measured > empty_cycles ? measured - empty_cycles : 0;
    fprintf(stderr, "Cache ID %d has latency %zu cycles\n", c, latency);
  }
  return 0;
}