@@ -26,7 +26,7 @@ typedef struct {
2626 void * output ;
2727 size_t total_elements ;
2828 size_t chunk_elements ;
29-
29+
3030 // Shared work queue
3131 pthread_mutex_t work_mutex ;
3232 pthread_cond_t work_available ;
@@ -38,21 +38,21 @@ typedef struct {
3838} thread_pool_t ;
3939
4040static void * worker_thread (void * arg ) {
41- thread_pool_t * pool = (thread_pool_t * )arg ;
42-
41+ thread_pool_t * pool = (thread_pool_t * ) arg ;
42+
4343 while (1 ) {
4444 pthread_mutex_lock (& pool -> work_mutex );
45-
45+
4646 // Wait for work or exit signal
4747 while (!pool -> work_ready && !pool -> should_exit ) {
4848 pthread_cond_wait (& pool -> work_available , & pool -> work_mutex );
4949 }
50-
50+
5151 if (pool -> should_exit ) {
5252 pthread_mutex_unlock (& pool -> work_mutex );
5353 break ;
5454 }
55-
55+
5656 // Process chunks until all work is done
5757 while (pool -> next_chunk_idx < pool -> total_elements ) {
5858 // Get next chunk
@@ -61,20 +61,20 @@ static void *worker_thread(void *arg) {
6161 if (my_chunk_idx + chunk_size > pool -> total_elements ) {
6262 chunk_size = pool -> total_elements - my_chunk_idx ;
6363 }
64-
64+
6565 pool -> next_chunk_idx += chunk_size ;
6666 pthread_mutex_unlock (& pool -> work_mutex );
67-
67+
6868 // Do the work (outside mutex)
6969 const void * adjusted_inputs [10 ];
7070 for (int i = 0 ; i < pool -> num_inputs ; i ++ ) {
71- adjusted_inputs [i ] = (const double * )pool -> inputs [i ] + my_chunk_idx ;
71+ adjusted_inputs [i ] = (const double * ) pool -> inputs [i ] + my_chunk_idx ;
7272 }
73- double * output = (double * )pool -> output + my_chunk_idx ;
74-
73+ double * output = (double * ) pool -> output + my_chunk_idx ;
74+
7575 me_eval (pool -> expr , adjusted_inputs , pool -> num_inputs ,
7676 output , chunk_size );
77-
77+
7878 // Update completion status
7979 pthread_mutex_lock (& pool -> work_mutex );
8080 pool -> completed_elements += chunk_size ;
@@ -83,17 +83,17 @@ static void *worker_thread(void *arg) {
8383 pthread_cond_signal (& pool -> all_done );
8484 }
8585 }
86-
86+
8787 pthread_mutex_unlock (& pool -> work_mutex );
8888 }
89-
89+
9090 return NULL ;
9191}
9292
93- static thread_pool_t * create_thread_pool (int num_threads , pthread_t * * threads_out ) {
93+ static thread_pool_t * create_thread_pool (int num_threads , pthread_t * * threads_out ) {
9494 thread_pool_t * pool = malloc (sizeof (thread_pool_t ));
9595 if (!pool ) return NULL ;
96-
96+
9797 pool -> expr = NULL ;
9898 pool -> inputs = NULL ;
9999 pool -> num_inputs = 0 ;
@@ -104,21 +104,21 @@ static thread_pool_t* create_thread_pool(int num_threads, pthread_t **threads_ou
104104 pool -> completed_elements = 0 ;
105105 pool -> work_ready = false;
106106 pool -> should_exit = false;
107-
107+
108108 pthread_mutex_init (& pool -> work_mutex , NULL );
109109 pthread_cond_init (& pool -> work_available , NULL );
110110 pthread_cond_init (& pool -> all_done , NULL );
111-
111+
112112 pthread_t * threads = malloc (num_threads * sizeof (pthread_t ));
113113 if (!threads ) {
114114 free (pool );
115115 return NULL ;
116116 }
117-
117+
118118 for (int i = 0 ; i < num_threads ; i ++ ) {
119119 pthread_create (& threads [i ], NULL , worker_thread , pool );
120120 }
121-
121+
122122 * threads_out = threads ;
123123 return pool ;
124124}
@@ -128,22 +128,22 @@ static void destroy_thread_pool(thread_pool_t *pool, pthread_t *threads, int num
128128 pool -> should_exit = true;
129129 pthread_cond_broadcast (& pool -> work_available );
130130 pthread_mutex_unlock (& pool -> work_mutex );
131-
131+
132132 for (int i = 0 ; i < num_threads ; i ++ ) {
133133 pthread_join (threads [i ], NULL );
134134 }
135-
135+
136136 pthread_mutex_destroy (& pool -> work_mutex );
137137 pthread_cond_destroy (& pool -> work_available );
138138 pthread_cond_destroy (& pool -> all_done );
139-
139+
140140 free (threads );
141141 free (pool );
142142}
143143
144144static double benchmark_chunksize (thread_pool_t * pool , size_t chunk_bytes ,
145- double * a , double * b , double * c , double * result ,
146- size_t total_elements ) {
145+ double * a , double * b , double * c , double * result ,
146+ size_t total_elements ) {
147147 const size_t chunk_elements = chunk_bytes / sizeof (double );
148148 if (chunk_elements == 0 ) return 0.0 ;
149149
@@ -167,13 +167,13 @@ static double benchmark_chunksize(thread_pool_t *pool, size_t chunk_bytes,
167167 pool -> completed_elements = 0 ;
168168 pool -> work_ready = true;
169169 pthread_mutex_unlock (& pool -> work_mutex );
170-
170+
171171 struct timespec start , end ;
172172 clock_gettime (CLOCK_MONOTONIC , & start );
173173
174174 // Signal threads that work is available
175175 pthread_cond_broadcast (& pool -> work_available );
176-
176+
177177 // Wait for all work to be completed
178178 pthread_mutex_lock (& pool -> work_mutex );
179179 while (pool -> completed_elements < pool -> total_elements ) {
@@ -184,7 +184,7 @@ static double benchmark_chunksize(thread_pool_t *pool, size_t chunk_bytes,
184184 clock_gettime (CLOCK_MONOTONIC , & end );
185185
186186 double elapsed = (end .tv_sec - start .tv_sec ) + (end .tv_nsec - start .tv_nsec ) / 1e9 ;
187- double throughput = (total_elements / elapsed ) / 1e6 ; // Melems/sec
187+ double throughput = (total_elements / elapsed ) / 1e6 ; // Melems/sec
188188
189189 me_free (expr );
190190
@@ -215,23 +215,29 @@ int main() {
215215
216216 if (!a || !b || !c || !result ) {
217217 fprintf (stderr , "Failed to allocate arrays\n" );
218- free (a ); free (b ); free (c ); free (result );
218+ free (a );
219+ free (b );
220+ free (c );
221+ free (result );
219222 return 1 ;
220223 }
221224
222225 // Initialize data once
223226 for (size_t i = 0 ; i < total_elements ; i ++ ) {
224- a [i ] = (double )(i % 1000 ) / 100.0 ;
225- b [i ] = (double )((i + 333 ) % 1000 ) / 100.0 ;
226- c [i ] = (double )((i % 100 ) - 50 );
227+ a [i ] = (double ) (i % 1000 ) / 100.0 ;
228+ b [i ] = (double ) ((i + 333 ) % 1000 ) / 100.0 ;
229+ c [i ] = (double ) ((i % 100 ) - 50 );
227230 }
228231
229232 // Create thread pool once
230233 pthread_t * threads ;
231234 thread_pool_t * pool = create_thread_pool (NUM_THREADS , & threads );
232235 if (!pool ) {
233236 fprintf (stderr , "Failed to create thread pool\n" );
234- free (a ); free (b ); free (c ); free (result );
237+ free (a );
238+ free (b );
239+ free (c );
240+ free (result );
235241 return 1 ;
236242 }
237243
@@ -258,8 +264,8 @@ int main() {
258264 continue ;
259265 }
260266
261- double bandwidth = (throughput * 4 * sizeof (double )) / 1000.0 ; // MB/s to GB/s, 3 inputs + 1 output
262- double gflops = throughput * 2.0 / 1000.0 ; // 2 FLOP per element (1 add, 1 mul)
267+ double bandwidth = (throughput * 4 * sizeof (double )) / 1000.0 ; // MB/s to GB/s, 3 inputs + 1 output
268+ double gflops = throughput * 2.0 / 1000.0 ; // 2 FLOP per element (1 add, 1 mul)
263269
264270 printf ("%10zu %21.2f %16.2f %8.2f\n" ,
265271 chunk_kb , throughput , bandwidth , gflops );
0 commit comments