@@ -244,22 +244,15 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2):
244
244
t = i * dt
245
245
246
246
if t >= next_t_export - 1e-8 :
247
- if device :
248
- # FIXME gpu.memcpy to host requires identity layout
249
- # FIXME reduction on gpu
250
- # e_host = e.to_device()
251
- # u_host = u.to_device()
252
- # h_host = h.to_device()
253
- # _elev_max = np.max(e_host, all_axes)
254
- # _u_max = np.max(u_host, all_axes)
255
- # _total_v = np.sum(e_host + h, all_axes)
256
- _elev_max = 0
257
- _u_max = 0
258
- _total_v = 0
259
- else :
260
- _elev_max = np .max (e , all_axes )
261
- _u_max = np .max (u , all_axes )
262
- _total_v = np .sum (e + h , all_axes )
247
+ sync ()
248
+ H_tmp = e + h
249
+ sync ()
250
+ _elev_max = np .max (e , all_axes ).to_device ()
251
+ # NOTE max(u) segfaults, shape (n+1, n) too large for tiling
252
+ _u_max = np .max (u [1 :, :], all_axes ).to_device ()
253
+ _total_v = np .sum (H_tmp , all_axes ).to_device ()
254
+ # NOTE this segfaults
255
+ # _total_v = np.sum(e + h, all_axes).to_device() # segfaults
263
256
264
257
elev_max = float (_elev_max )
265
258
u_max = float (_u_max )
@@ -294,16 +287,11 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2):
294
287
duration = time_mod .perf_counter () - tic
295
288
info (f"Duration: { duration :.2f} s" )
296
289
297
- if device :
298
- # FIXME gpu.memcpy to host requires identity layout
299
- # FIXME reduction on gpu
300
- # err2_host = err2.to_device()
301
- # err_L2 = math.sqrt(float(np.sum(err2_host, all_axes)))
302
- err_L2 = 0
303
- else :
304
- e_exact = exact_elev (t , x_t_2d , y_t_2d , lx , ly )
305
- err2 = (e_exact - e ) * (e_exact - e ) * dx * dy / lx / ly
306
- err_L2 = math .sqrt (float (np .sum (err2 , all_axes )))
290
+ e_exact = exact_elev (t , x_t_2d , y_t_2d , lx , ly ).to_device (device )
291
+ err2 = (e_exact - e ) * (e_exact - e ) * dx * dy / lx / ly
292
+ err2sum = np .sum (err2 , all_axes ).to_device ()
293
+ sync ()
294
+ err_L2 = math .sqrt (float (err2sum ))
307
295
info (f"L2 error: { err_L2 :7.5e} " )
308
296
309
297
if nx == 128 and ny == 128 and not benchmark_mode and not device :
0 commit comments