diff --git a/asQ/diag_preconditioner.py b/asQ/diag_preconditioner.py index f7db374f..fba096ed 100644 --- a/asQ/diag_preconditioner.py +++ b/asQ/diag_preconditioner.py @@ -116,10 +116,6 @@ def initialize(self, pc): # sanity check assert (self.blockV.dim()*paradiag.nlocal_timesteps == W_all.dim()) - # Input/Output wrapper Functions for all-at-once residual being acted on - self.xf = fd.Function(W_all) # input - self.yf = fd.Function(W_all) # output - # Gamma coefficients exponents = np.arange(self.ntimesteps)/self.ntimesteps self.Gam = paradiag.alpha**exponents @@ -165,20 +161,43 @@ def initialize(self, pc): self.xfr = fd.Function(W_all) # setting up the FFT stuff + self.smaller_transpose = True # construct simply dist array and 1d fftn: subcomm = Subcomm(self.ensemble.ensemble_comm, [0, 1]) + # dimensions of space-time data in this ensemble_comm nlocal = self.blockV.node_set.size NN = np.array([self.ntimesteps, nlocal], dtype=int) + # transfer pencil is aligned along axis 1 self.p0 = Pencil(subcomm, NN, axis=1) - # a0 is the local part of our fft working array - # has shape of (partition/P, nlocal) - self.a0 = np.zeros(self.p0.subshape, complex) self.p1 = self.p0.pencil(0) - # a0 is the local part of our other fft working array - self.a1 = np.zeros(self.p1.subshape, complex) - self.transfer = self.p0.transfer(self.p1, complex) + + # data types + rtype = self.xfr.dat[0].data.dtype + ctype = complex + + # set up real valued transfer + self.rtransfer = self.p0.transfer(self.p1, rtype) + + # a0 is the local part of the original data (i.e. distributed in time) + # has shape of (nlocal_timesteps, nlocal) + self.ra0 = np.zeros(self.p0.subshape, rtype) + + # a1 is the local part of the transposed data (i.e. distributed in space) + # has shape of (ntimesteps, ...) + self.ra1 = np.zeros(self.p1.subshape, rtype) + + # set up complex valued transfer + self.ctransfer = self.p0.transfer(self.p1, ctype) + + # a0 is the local part of the original data (i.e. distributed in time) + # has shape of (nlocal_timesteps, nlocal) + self.ca0 = np.zeros(self.p0.subshape, ctype) + + # a1 is the local part of the transposed data (i.e. distributed in space) + # has shape of (ntimesteps, ...) + self.ca1 = np.zeros(self.p1.subshape, ctype) # setting up the Riesz map default_riesz_method = { @@ -351,98 +370,117 @@ def update(self, pc): return @profiler() - def apply(self, pc, x, y): + def _transfer_forward1(self): + if self.smaller_transpose: + self.rtransfer.forward(self.ra0, self.ra1) + self.ca1[:] = self.ra1[:] + else: + self.ca0[:] = self.ra0[:] + self.ctransfer.forward(self.ca0, self.ca1) - # copy petsc vec into Function - # hopefully this works - with self.xf.dat.vec_wo as v: - x.copy(v) + @profiler() + def _fft(self): + self.ca1[:] = fft(self.ca1, axis=0) + + @profiler() + def _transfer_backward2(self): + self.ctransfer.backward(self.ca1, self.ca0) + + @profiler() + def _transfer_forward3(self): + self.ctransfer.forward(self.ca0, self.ca1) + + @profiler() + def _ifft(self): + self.ca1[:] = ifft(self.ca1, axis=0) + + @profiler() + def _transfer_backward4(self): + if self.smaller_transpose: + self.ra1[:] = self.ca1.real[:] + self.rtransfer.backward(self.ra1, self.ra0) + else: + self.ctransfer.backward(self.ca1, self.ca0) + self.ra0[:] = self.ca0.real[:] + + @profiler() + def apply(self, pc, x, y): # get array of basis coefficients - with self.xf.dat.vec_ro as v: - parray = v.array_r.reshape((self.aaos.nlocal_timesteps, - self.blockV.node_set.size)) # This produces an array whose rows are time slices # and columns are finite element basis coefficients + self.ra0[:] = x.array_r.reshape((self.aaos.nlocal_timesteps, + self.blockV.node_set.size)) ###################### # Diagonalise - scale, transfer, FFT, transfer, Copy - # Scale - # is there a better way to do this with broadcasting? - parray = (1.0+0.j)*(self.Gam_slice*parray.T).T*np.sqrt(self.ntimesteps) - # transfer forward - self.a0[:] = parray[:] - with PETSc.Log.Event("asQ.diag_preconditioner.DiagFFTPC.apply.transfer"): - self.transfer.forward(self.a0, self.a1) - # FFT - with PETSc.Log.Event("asQ.diag_preconditioner.DiagFFTPC.apply.fft"): - self.a1[:] = fft(self.a1, axis=0) + # scale + # is there a better way to do this with broadcasting? + self.ra0[:] = (self.Gam_slice*self.ra0.T).T*np.sqrt(self.ntimesteps) - # transfer backward - with PETSc.Log.Event("asQ.diag_preconditioner.DiagFFTPC.apply.transfer"): - self.transfer.backward(self.a1, self.a0) + self.ensemble.global_comm.Barrier() + self._transfer_forward1() + self.ensemble.global_comm.Barrier() + self._fft() + self.ensemble.global_comm.Barrier() + self._transfer_backward2() + self.ensemble.global_comm.Barrier() # Copy into xfi, xfr - parray[:] = self.a0[:] with self.xfr.dat.vec_wo as v: - v.array[:] = parray.real.reshape(-1) + v.array[:] = self.ca0.real.reshape(-1) with self.xfi.dat.vec_wo as v: - v.array[:] = parray.imag.reshape(-1) + v.array[:] = self.ca0.imag.reshape(-1) ##################### # Do the block solves - with PETSc.Log.Event("asQ.diag_preconditioner.DiagFFTPC.apply.block_solves"): - for i in range(self.aaos.nlocal_timesteps): - # copy the data into solver input - self.xtemp.assign(0.) + for i in range(self.aaos.nlocal_timesteps): + # copy the data into solver input + self.xtemp.assign(0.) - cpx.set_real(self.xtemp, self.aaos.get_field_components(i, f_alls=self.xfr.subfunctions)) - cpx.set_imag(self.xtemp, self.aaos.get_field_components(i, f_alls=self.xfi.subfunctions)) + cpx.set_real(self.xtemp, self.aaos.get_field_components(i, f_alls=self.xfr.subfunctions)) + cpx.set_imag(self.xtemp, self.aaos.get_field_components(i, f_alls=self.xfi.subfunctions)) - # Do a project for Riesz map, to be superceded - # when we get Cofunction + # Do a project for Riesz map, to be superceded + # when we get Cofunction + with PETSc.Log.Event("asQ.DiagFFTPC.apply.riesz_map"): self.Proj.solve(self.Jprob_in, self.xtemp) - # solve the block system + # solve the block system + with PETSc.Log.Event("asQ.DiagFFTPC.apply.block_solves"): self.Jprob_out.assign(0.) self.Jsolvers[i].solve() - # copy the data from solver output - cpx.get_real(self.Jprob_out, self.aaos.get_field_components(i, f_alls=self.xfr.subfunctions)) - cpx.get_imag(self.Jprob_out, self.aaos.get_field_components(i, f_alls=self.xfi.subfunctions)) + # copy the data from solver output + cpx.get_real(self.Jprob_out, self.aaos.get_field_components(i, f_alls=self.xfr.subfunctions)) + cpx.get_imag(self.Jprob_out, self.aaos.get_field_components(i, f_alls=self.xfi.subfunctions)) ###################### # Undiagonalise - Copy, transfer, IFFT, transfer, scale, copy # get array of basis coefficients - with self.xfi.dat.vec_ro as v: - parray = 1j*v.array_r.reshape((self.aaos.nlocal_timesteps, - self.blockV.node_set.size)) with self.xfr.dat.vec_ro as v: - parray += v.array_r.reshape((self.aaos.nlocal_timesteps, - self.blockV.node_set.size)) - # transfer forward - self.a0[:] = parray[:] - with PETSc.Log.Event("asQ.diag_preconditioner.DiagFFTPC.apply.transfer"): - self.transfer.forward(self.a0, self.a1) - - # IFFT - with PETSc.Log.Event("asQ.diag_preconditioner.DiagFFTPC.apply.fft"): - self.a1[:] = ifft(self.a1, axis=0) + self.ca0.real[:] = v.array_r.reshape((self.aaos.nlocal_timesteps, + self.blockV.node_set.size)) + with self.xfi.dat.vec_ro as v: + self.ca0.imag[:] = v.array_r.reshape((self.aaos.nlocal_timesteps, + self.blockV.node_set.size)) - # transfer backward - with PETSc.Log.Event("asQ.diag_preconditioner.DiagFFTPC.apply.transfer"): - self.transfer.backward(self.a1, self.a0) - parray[:] = self.a0[:] + self.ensemble.global_comm.Barrier() + self._transfer_forward3() + self.ensemble.global_comm.Barrier() + self._ifft() + self.ensemble.global_comm.Barrier() + self._transfer_backward4() + self.ensemble.global_comm.Barrier() # scale - parray = ((1.0/self.Gam_slice)*parray.T).T - # Copy into xfi, xfr - with self.yf.dat.vec_wo as v: - v.array[:] = parray.reshape(-1).real - with self.yf.dat.vec_ro as v: - v.copy(y) + self.ra0[:] = ((1.0/self.Gam_slice)*self.ra0[:].T).T + + # Copy into output + y.array[:] = self.ra0.reshape(-1) + ################ self._record_diagnostics() diff --git a/tests/test_paradiag.py b/tests/test_paradiag.py index 0c95ff1b..fbacbda1 100644 --- a/tests/test_paradiag.py +++ b/tests/test_paradiag.py @@ -17,7 +17,7 @@ def test_galewsky_timeseries(): from utils.serial import ComparisonMiniapp from copy import deepcopy - ref_level = 2 + ref_level = 3 nwindows = 1 nslices = 2 slice_length = 2 @@ -120,8 +120,6 @@ def form_mass(u, h, v, q): # nonlinear solver options snes_sparameters = { - 'monitor': None, - 'converged_reason': None, 'atol': 1e-0, 'rtol': 1e-10, 'stol': 1e-12, @@ -134,17 +132,24 @@ def form_mass(u, h, v, q): 'snes': snes_sparameters } serial_sparameters.update(deepcopy(block_sparameters)) - serial_sparameters['ksp']['monitor'] = None - serial_sparameters['ksp']['converged_reason'] = None + serial_sparameters['ksp']['atol'] = snes_sparameters['atol'] + # if ensemble.ensemble_comm.rank == 0: + # serial_sparameters['snes']['monitor'] = None + # serial_sparameters['snes']['converged_reason'] = None + # serial_sparameters['ksp']['monitor'] = None + # serial_sparameters['ksp']['converged_reason'] = None # solver parameters for parallel method parallel_sparameters = { 'snes': snes_sparameters, + 'snes_monitor': None, + 'snes_converged_reason': None, 'mat_type': 'matfree', 'ksp_type': 'fgmres', 'ksp': { 'monitor': None, 'converged_reason': None, + 'atol': snes_sparameters['atol'] }, 'pc_type': 'python', 'pc_python_type': 'asQ.DiagFFTPC',