@@ -1261,7 +1261,8 @@ void dpnp_rng_vonmises_large_kappa_c(void* result, const _DataType mu, const _Da
1261
1261
_DataType s_minus_one, hpt, r_over_two_kappa_minus_one, rho_minus_one;
1262
1262
_DataType* Uvec = nullptr ;
1263
1263
_DataType* Vvec = nullptr ;
1264
- size_t * n = nullptr ;
1264
+ bool * result_ready = nullptr ;
1265
+ bool * result_mask = nullptr ;
1265
1266
const _DataType d_zero = 0.0 , d_one = 1.0 ;
1266
1267
1267
1268
assert (kappa > 1.0 );
@@ -1277,50 +1278,59 @@ void dpnp_rng_vonmises_large_kappa_c(void* result, const _DataType mu, const _Da
1277
1278
1278
1279
Uvec = reinterpret_cast <_DataType*>(dpnp_memory_alloc_c (size * sizeof (_DataType)));
1279
1280
Vvec = reinterpret_cast <_DataType*>(dpnp_memory_alloc_c (size * sizeof (_DataType)));
1280
- n = reinterpret_cast <size_t *>(dpnp_memory_alloc_c (sizeof (size_t )));
1281
- for (*n = 0 ; *n < size;)
1281
+
1282
+ result_ready = reinterpret_cast <bool *>(dpnp_memory_alloc_c (1 * sizeof (bool )));
1283
+ result_ready[0 ] = false ;
1284
+ result_mask = reinterpret_cast <bool *>(dpnp_memory_alloc_c (size * sizeof (bool )));
1285
+ dpnp_full_c<bool >(result_ready, result_mask, size);
1286
+
1287
+ while (!result_ready[0 ])
1282
1288
{
1283
- size_t diff_size = size - *n;
1284
1289
mkl_rng::uniform<_DataType> uniform_distribution_u (d_zero, 0.5 * M_PI);
1285
- auto uniform_distr_u_event = mkl_rng::generate (uniform_distribution_u, DPNP_RNG_ENGINE, diff_size , Uvec);
1290
+ auto uniform_distr_u_event = mkl_rng::generate (uniform_distribution_u, DPNP_RNG_ENGINE, size , Uvec);
1286
1291
mkl_rng::uniform<_DataType> uniform_distribution_v (d_zero, d_one);
1287
- auto uniform_distr_v_event = mkl_rng::generate (uniform_distribution_v, DPNP_RNG_ENGINE, diff_size , Vvec);
1292
+ auto uniform_distr_v_event = mkl_rng::generate (uniform_distribution_v, DPNP_RNG_ENGINE, size , Vvec);
1288
1293
1289
- cl::sycl::range<1 > diff_gws (diff_size );
1294
+ cl::sycl::range<1 > gws (size );
1290
1295
auto paral_kernel_some = [&](cl::sycl::handler& cgh) {
1291
1296
cgh.depends_on ({uniform_distr_u_event, uniform_distr_v_event});
1292
- cgh.parallel_for (diff_gws , [=](cl::sycl::id<1 > global_id) {
1297
+ cgh.parallel_for (gws , [=](cl::sycl::id<1 > global_id) {
1293
1298
size_t i = global_id[0 ];
1299
+ if (!result_mask[i]) {
1300
+ _DataType sn, cn, sn2, cn2;
1301
+ _DataType neg_W_minus_one, V, Y;
1294
1302
1295
- _DataType sn, cn, sn2, cn2;
1296
- _DataType neg_W_minus_one, V, Y;
1297
-
1298
- sn = cl::sycl::sin (Uvec[i]);
1299
- cn = cl::sycl::cos (Uvec[i]);
1300
- V = Vvec[i];
1301
- sn2 = sn * sn;
1302
- cn2 = cn * cn;
1303
+ sn = cl::sycl::sin (Uvec[i]);
1304
+ cn = cl::sycl::cos (Uvec[i]);
1305
+ V = Vvec[i];
1306
+ sn2 = sn * sn;
1307
+ cn2 = cn * cn;
1303
1308
1304
- neg_W_minus_one = s_minus_one * sn2 / (0.5 * s_minus_one + cn2);
1305
- Y = kappa * (s_minus_one + neg_W_minus_one);
1309
+ neg_W_minus_one = s_minus_one * sn2 / (0.5 * s_minus_one + cn2);
1310
+ Y = kappa * (s_minus_one + neg_W_minus_one);
1306
1311
1307
- if ((Y * (2 - Y) >= V) || (cl::sycl::log (Y / V) + 1 >= Y))
1308
- {
1309
- Y = neg_W_minus_one * (2 - neg_W_minus_one);
1310
- if (Y < 0 )
1311
- Y = 0.0 ;
1312
- else if (Y > 1.0 )
1313
- Y = 1.0 ;
1314
- *n = *n + 1 ;
1315
- result1[*n] = cl::sycl::asin (cl::sycl::sqrt (Y));
1312
+ if ((Y * (2 - Y) >= V) || (cl::sycl::log (Y / V) + 1 >= Y))
1313
+ {
1314
+ Y = neg_W_minus_one * (2 - neg_W_minus_one);
1315
+ if (Y < 0 )
1316
+ Y = 0.0 ;
1317
+ else if (Y > 1.0 )
1318
+ Y = 1.0 ;
1319
+
1320
+ result1[i] = cl::sycl::asin (cl::sycl::sqrt (Y));
1321
+ result_mask[i] = true ;
1322
+ }
1316
1323
}
1317
1324
});
1318
1325
};
1319
1326
auto some_event = DPNP_QUEUE.submit (paral_kernel_some);
1320
1327
some_event.wait ();
1328
+
1329
+ dpnp_all_c<bool , bool >(result_mask, result_ready, size);
1321
1330
}
1322
1331
dpnp_memory_free_c (Uvec);
1323
- dpnp_memory_free_c (n);
1332
+ dpnp_memory_free_c (result_ready);
1333
+ dpnp_memory_free_c (result_mask);
1324
1334
1325
1335
mkl_rng::uniform<_DataType> uniform_distribution (d_zero, d_one);
1326
1336
auto uniform_distr_event = mkl_rng::generate (uniform_distribution, DPNP_RNG_ENGINE, size, Vvec);
@@ -1359,7 +1369,8 @@ void dpnp_rng_vonmises_small_kappa_c(void* result, const _DataType mu, const _Da
1359
1369
_DataType rho_over_kappa, rho, r, s_kappa;
1360
1370
_DataType* Uvec = nullptr ;
1361
1371
_DataType* Vvec = nullptr ;
1362
- size_t * n = nullptr ;
1372
+ bool * result_ready = nullptr ;
1373
+ bool * result_mask = nullptr ;
1363
1374
1364
1375
const _DataType d_zero = 0.0 , d_one = 1.0 ;
1365
1376
@@ -1374,39 +1385,47 @@ void dpnp_rng_vonmises_small_kappa_c(void* result, const _DataType mu, const _Da
1374
1385
1375
1386
Uvec = reinterpret_cast <_DataType*>(dpnp_memory_alloc_c (size * sizeof (_DataType)));
1376
1387
Vvec = reinterpret_cast <_DataType*>(dpnp_memory_alloc_c (size * sizeof (_DataType)));
1377
- n = reinterpret_cast <size_t *>(dpnp_memory_alloc_c (sizeof (size_t )));
1378
1388
1379
- for (*n = 0 ; *n < size;)
1389
+ result_ready = reinterpret_cast <bool *>(dpnp_memory_alloc_c (1 * sizeof (bool )));
1390
+ result_ready[0 ] = false ;
1391
+ result_mask = reinterpret_cast <bool *>(dpnp_memory_alloc_c (size * sizeof (bool )));
1392
+ dpnp_full_c<bool >(result_ready, result_mask, size);
1393
+
1394
+ while (!result_ready[0 ])
1380
1395
{
1381
- size_t diff_size = size - *n;
1382
1396
mkl_rng::uniform<_DataType> uniform_distribution_u (d_zero, M_PI);
1383
- auto uniform_distr_u_event = mkl_rng::generate (uniform_distribution_u, DPNP_RNG_ENGINE, diff_size , Uvec);
1397
+ auto uniform_distr_u_event = mkl_rng::generate (uniform_distribution_u, DPNP_RNG_ENGINE, size , Uvec);
1384
1398
mkl_rng::uniform<_DataType> uniform_distribution_v (d_zero, d_one);
1385
- auto uniform_distr_v_event = mkl_rng::generate (uniform_distribution_v, DPNP_RNG_ENGINE, diff_size , Vvec);
1399
+ auto uniform_distr_v_event = mkl_rng::generate (uniform_distribution_v, DPNP_RNG_ENGINE, size , Vvec);
1386
1400
1387
- cl::sycl::range<1 > diff_gws ((diff_size ));
1401
+ cl::sycl::range<1 > gws ((size ));
1388
1402
1389
1403
auto paral_kernel_some = [&](cl::sycl::handler& cgh) {
1390
1404
cgh.depends_on ({uniform_distr_u_event, uniform_distr_v_event});
1391
- cgh.parallel_for (diff_gws , [=](cl::sycl::id<1 > global_id) {
1405
+ cgh.parallel_for (gws , [=](cl::sycl::id<1 > global_id) {
1392
1406
size_t i = global_id[0 ];
1393
- _DataType Z, W, Y, V;
1394
- Z = cl::sycl::cos (Uvec[i]);
1395
- V = Vvec[i];
1396
- W = (kappa + s_kappa * Z) / (s_kappa + kappa * Z);
1397
- Y = s_kappa - kappa * W;
1398
- if ((Y * (2 - Y) >= V) || (cl::sycl::log (Y / V) + 1 >= Y))
1399
- {
1400
- *n = *n + 1 ;
1401
- result1[*n] = cl::sycl::acos (W);
1407
+ if (!result_mask[i]) {
1408
+ _DataType Z, W, Y, V;
1409
+ Z = cl::sycl::cos (Uvec[i]);
1410
+ V = Vvec[i];
1411
+ W = (kappa + s_kappa * Z) / (s_kappa + kappa * Z);
1412
+ Y = s_kappa - kappa * W;
1413
+ if ((Y * (2 - Y) >= V) || (cl::sycl::log (Y / V) + 1 >= Y))
1414
+ {
1415
+ result1[i] = cl::sycl::acos (W);
1416
+ result_mask[i] = true ;
1417
+ }
1402
1418
}
1403
1419
});
1404
1420
};
1405
1421
auto some_event = DPNP_QUEUE.submit (paral_kernel_some);
1406
1422
some_event.wait ();
1423
+
1424
+ dpnp_all_c<bool , bool >(result_mask, result_ready, size);
1407
1425
}
1408
1426
dpnp_memory_free_c (Uvec);
1409
- dpnp_memory_free_c (n);
1427
+ dpnp_memory_free_c (result_ready);
1428
+ dpnp_memory_free_c (result_mask);
1410
1429
1411
1430
mkl_rng::uniform<_DataType> uniform_distribution (d_zero, d_one);
1412
1431
auto uniform_distr_event = mkl_rng::generate (uniform_distribution, DPNP_RNG_ENGINE, size, Vvec);
0 commit comments