diff --git a/mpi4py_fft/__init__.py b/mpi4py_fft/__init__.py index 16ae4c8..75442a7 100644 --- a/mpi4py_fft/__init__.py +++ b/mpi4py_fft/__init__.py @@ -24,3 +24,8 @@ from . import fftw from .fftw import fftlib from .io import HDF5File, NCFile, generate_xdmf + +try: + from .distarrayCuPy import DistArrayCuPy +except ModuleNotFoundError: + pass diff --git a/mpi4py_fft/distarray.py b/mpi4py_fft/distarray.py index 3a69e5c..ff92b5c 100644 --- a/mpi4py_fft/distarray.py +++ b/mpi4py_fft/distarray.py @@ -7,34 +7,13 @@ comm = MPI.COMM_WORLD -class DistArray(np.ndarray): - """Distributed Numpy array - - This Numpy array is part of a larger global array. Information about the - distribution is contained in the attributes. - - Parameters - ---------- - global_shape : sequence of ints - Shape of non-distributed global array - subcomm : None, :class:`.Subcomm` object or sequence of ints, optional - Describes how to distribute the array - val : Number or None, optional - Initialize array with this number if buffer is not given - dtype : np.dtype, optional - Type of array - buffer : Numpy array, optional - Array of correct shape. The buffer owns the memory that is used for - this array. - alignment : None or int, optional - Make sure array is aligned in this direction. Note that alignment does - not take rank into consideration. - rank : int, optional - Rank of tensor (number of free indices, a scalar is zero, vector one, - matrix two) +class DistArrayBase: + """ + Abstract class for distributed arrays in numpy or cupy implementation - For more information, see `numpy.ndarray `_ + Attributes: + - xp: Numerical library, a.k.a. numpy or cupy as class attribute Note ---- @@ -53,58 +32,9 @@ class DistArray(np.ndarray): Also note that the ``alignment`` keyword does not take rank into consideration. Setting alignment=2 for the array above means that the last axis will be aligned, also when rank>0. - """ - def __new__(cls, global_shape, subcomm=None, val=None, dtype=float, - buffer=None, strides=None, alignment=None, rank=0): - if len(global_shape[rank:]) < 2: # 1D case - obj = np.ndarray.__new__(cls, global_shape, dtype=dtype, buffer=buffer, strides=strides) - if buffer is None and isinstance(val, Number): - obj.fill(val) - obj._rank = rank - obj._p0 = None - return obj - - if isinstance(subcomm, Subcomm): - pass - else: - if isinstance(subcomm, (tuple, list)): - assert len(subcomm) == len(global_shape[rank:]) - # Do nothing if already containing communicators. A tuple of subcommunicators is not necessarily a Subcomm - if not np.all([isinstance(s, MPI.Comm) for s in subcomm]): - subcomm = Subcomm(comm, subcomm) - else: - assert subcomm is None - subcomm = [0] * len(global_shape[rank:]) - if alignment is not None: - subcomm[alignment] = 1 - else: - subcomm[-1] = 1 - alignment = len(subcomm)-1 - subcomm = Subcomm(comm, subcomm) - sizes = [s.Get_size() for s in subcomm] - if alignment is not None: - assert isinstance(alignment, (int, np.integer)) - assert sizes[alignment] == 1 - else: - # Decide that alignment is the last axis with size 1 - alignment = np.flatnonzero(np.array(sizes) == 1)[-1] - p0 = Pencil(subcomm, global_shape[rank:], axis=alignment) - subshape = p0.subshape - if rank > 0: - subshape = global_shape[:rank] + subshape - obj = np.ndarray.__new__(cls, subshape, dtype=dtype, buffer=buffer) - if buffer is None and isinstance(val, Number): - obj.fill(val) - obj._p0 = p0 - obj._rank = rank - return obj - def __array_finalize__(self, obj): - if obj is None: - return - self._p0 = getattr(obj, '_p0', None) - self._rank = getattr(obj, '_rank', None) + xp = None @property def alignment(self): @@ -152,32 +82,72 @@ def dimensions(self): """Return dimensions of array not including rank""" return len(self._p0.shape) + @staticmethod + def get_subcomm(subcomm, global_shape, rank, alignment): + if isinstance(subcomm, Subcomm): + pass + else: + if isinstance(subcomm, (tuple, list)): + assert len(subcomm) == len(global_shape[rank:]) + # Do nothing if already containing communicators. A tuple of subcommunicators is not necessarily a Subcomm + if not np.all([isinstance(s, MPI.Comm) for s in subcomm]): + subcomm = Subcomm(comm, subcomm) + else: + assert subcomm is None + subcomm = [0] * len(global_shape[rank:]) + if alignment is not None: + subcomm[alignment] = 1 + else: + subcomm[-1] = 1 + alignment = len(subcomm) - 1 + subcomm = Subcomm(comm, subcomm) + return subcomm + + @classmethod + def setup_pencil(cls, subcomm, rank, global_shape, alignment): + sizes = [s.Get_size() for s in subcomm] + if alignment is not None: + assert isinstance(alignment, (int, cls.xp.integer)) + assert sizes[alignment] == 1 + else: + # Decide that alignment is the last axis with size 1 + alignment = np.flatnonzero(np.array(sizes) == 1)[-1] + + p0 = Pencil(subcomm, global_shape[rank:], axis=alignment) + + subshape = p0.subshape + if rank > 0: + subshape = global_shape[:rank] + subshape + + return p0, subshape + + def __array_finalize__(self, obj): + if obj is None: + return + self._p0 = getattr(obj, "_p0", None) + self._rank = getattr(obj, "_rank", None) + def __getitem__(self, i): # Return DistArray if the result is a component of a tensor # Otherwise return ndarray view if self.ndim == 1: - return np.ndarray.__getitem__(self, i) + return self.xp.ndarray.__getitem__(self, i) if isinstance(i, (Integral, slice)) and self.rank > 0: - v0 = np.ndarray.__getitem__(self, i) + v0 = self.xp.ndarray.__getitem__(self, i) v0._rank = self.rank - (self.ndim - v0.ndim) return v0 if isinstance(i, (Integral, slice)) and self.rank == 0: - return np.ndarray.__getitem__(self.v, i) + return self.xp.ndarray.__getitem__(self.v, i) assert isinstance(i, tuple) if len(i) <= self.rank: - v0 = np.ndarray.__getitem__(self, i) + v0 = self.xp.ndarray.__getitem__(self, i) v0._rank = self.rank - (self.ndim - v0.ndim) return v0 - return np.ndarray.__getitem__(self.v, i) - - @property - def v(self): - """ Return local ``self`` array as an ``ndarray`` object""" - return self.__array__() + return self.xp.ndarray.__getitem__(self.v, i) def get(self, gslice): """Return global slice of ``self`` @@ -215,6 +185,7 @@ def get(self, gslice): # global MPI. We create a global file with MPI, but then open it without # MPI and only on rank 0. import h5py + # TODO: can we use h5py to communicate the data without copying to cpu first when using cupy? f = h5py.File('tmp.h5', 'w', driver="mpio", comm=comm) s = self.local_slice() sp = np.nonzero([isinstance(x, slice) for x in gslice])[0] @@ -230,7 +201,8 @@ def get(self, gslice): else: on_this_proc = False if on_this_proc: - f["data"][sf] = self[tuple(gslice)] + data = self.asnumpy() + f["data"][sf] = data[tuple(gslice)] f.close() c = None if comm.Get_rank() == 0: @@ -277,24 +249,6 @@ def local_slice(self): self._p0.subshape)] return tuple([slice(0, s) for s in self.shape[:self.rank]] + v) - def get_pencil_and_transfer(self, axis): - """Return pencil and transfer objects for alignment along ``axis`` - - Parameters - ---------- - axis : int - The new axis to align data with - - Returns - ------- - 2-tuple - 2-tuple where first item is a :class:`.Pencil` aligned in ``axis``. - Second item is a :class:`.Transfer` object for executing the - redistribution of data - """ - p1 = self._p0.pencil(axis) - return p1, self._p0.transfer(p1, self.dtype) - def redistribute(self, axis=None, out=None): """Global redistribution of local ``self`` array @@ -327,7 +281,7 @@ def redistribute(self, axis=None, out=None): return self if out is not None: - assert isinstance(out, DistArray) + assert issubclass(type(out), DistArrayBase) assert self.global_shape == out.global_shape axis = out.alignment if self.commsizes == out.commsizes: @@ -343,11 +297,11 @@ def redistribute(self, axis=None, out=None): p1, transfer = self.get_pencil_and_transfer(axis) if out is None: - out = DistArray(self.global_shape, - subcomm=p1.subcomm, - dtype=self.dtype, - alignment=axis, - rank=self.rank) + out = type(self)(self.global_shape, + subcomm=p1.subcomm, + dtype=self.dtype, + alignment=axis, + rank=self.rank) if self.rank == 0: transfer.forward(self, out) @@ -362,6 +316,24 @@ def redistribute(self, axis=None, out=None): transfer.destroy() return out + def get_pencil_and_transfer(self, axis): + """Return pencil and transfer objects for alignment along ``axis`` + + Parameters + ---------- + axis : int + The new axis to align data with + + Returns + ------- + 2-tuple + 2-tuple where first item is a :class:`.Pencil` aligned in ``axis``. + Second item is a :class:`.Transfer` object for executing the + redistribution of data + """ + p1 = self._p0.pencil(axis) + return p1, self._p0.transfer(p1, self.dtype) + def write(self, filename, name='darray', step=0, global_slice=None, domain=None, as_scalar=False): """Write snapshot ``step`` of ``self`` to file ``filename`` @@ -439,6 +411,67 @@ def read(self, filename, name='darray', step=0): f.read(self, name, step=step) +class DistArray(DistArrayBase, np.ndarray): + """Distributed Numpy array + + This Numpy array is part of a larger global array. Information about the + distribution is contained in the attributes. + + Parameters + ---------- + global_shape : sequence of ints + Shape of non-distributed global array + subcomm : None, :class:`.Subcomm` object or sequence of ints, optional + Describes how to distribute the array + val : Number or None, optional + Initialize array with this number if buffer is not given + dtype : np.dtype, optional + Type of array + buffer : Numpy array, optional + Array of correct shape. The buffer owns the memory that is used for + this array. + alignment : None or int, optional + Make sure array is aligned in this direction. Note that alignment does + not take rank into consideration. + rank : int, optional + Rank of tensor (number of free indices, a scalar is zero, vector one, + matrix two) + + + For more information, see `numpy.ndarray `_ + + """ + + xp = np + + def __new__(cls, global_shape, subcomm=None, val=None, dtype=float, + buffer=None, strides=None, alignment=None, rank=0): + if len(global_shape[rank:]) < 2: # 1D case + obj = cls.xp.ndarray.__new__(cls, global_shape, dtype=dtype, buffer=buffer, strides=strides) + if buffer is None and isinstance(val, Number): + obj.fill(val) + obj._rank = rank + obj._p0 = None + return obj + + subcomm = cls.get_subcomm(subcomm, global_shape, rank, alignment) + p0, subshape = cls.setup_pencil(subcomm, rank, global_shape, alignment) + + obj = cls.xp.ndarray.__new__(cls, subshape, dtype=dtype, buffer=buffer) + if buffer is None and isinstance(val, Number): + obj.fill(val) + obj._p0 = p0 + obj._rank = rank + return obj + + @property + def v(self): + """Return local ``self`` array as an ``ndarray`` object""" + return self.__array__() + + def asnumpy(self): + return self + def newDistArray(pfft, forward_output=True, val=0, rank=0, view=False): """Return a new :class:`.DistArray` object for provided :class:`.PFFT` object @@ -480,7 +513,13 @@ def newDistArray(pfft, forward_output=True, val=0, rank=0, view=False): else: dtype = pfft.forward.input_array.dtype global_shape = (len(global_shape),)*rank + global_shape - z = DistArray(global_shape, subcomm=p0.subcomm, val=val, dtype=dtype, + + if pfft.xfftn[0].backend in ["cupy", "cupyx-scipy"]: + from mpi4py_fft.distarrayCuPy import DistArrayCuPy as darraycls + else: + darraycls = DistArray + + z = darraycls(global_shape, subcomm=p0.subcomm, val=val, dtype=dtype, alignment=p0.axis, rank=rank) return z.v if view else z diff --git a/mpi4py_fft/distarrayCuPy.py b/mpi4py_fft/distarrayCuPy.py new file mode 100644 index 0000000..2c243bb --- /dev/null +++ b/mpi4py_fft/distarrayCuPy.py @@ -0,0 +1,86 @@ +from mpi4py_fft.distarray import DistArrayBase +import cupy as cp +from mpi4py import MPI +from numbers import Number + +comm = MPI.COMM_WORLD + + +class DistArrayCuPy(DistArrayBase, cp.ndarray): + """Distributed CuPy array + + This CuPy array is part of a larger global array. Information about the + distribution is contained in the attributes. + + Parameters + ---------- + global_shape : sequence of ints + Shape of non-distributed global array + subcomm : None, :class:`.Subcomm` object or sequence of ints, optional + Describes how to distribute the array + val : Number or None, optional + Initialize array with this number if buffer is not given + dtype : cp.dtype, optional + Type of array + memptr : cupy.cuda.MemoryPointer, optional + Pointer to the array content head. + alignment : None or int, optional + Make sure array is aligned in this direction. Note that alignment does + not take rank into consideration. + rank : int, optional + Rank of tensor (number of free indices, a scalar is zero, vector one, + matrix two) + + + For more information, see `cupy.ndarray `_ + + """ + + xp = cp + + def __new__( + cls, + global_shape, + subcomm=None, + val=None, + dtype=float, + memptr=None, + strides=None, + alignment=None, + rank=0, + ): + if len(global_shape[rank:]) < 2: # 1D case + obj = cls.xp.ndarray.__new__( + cls, global_shape, dtype=dtype, memptr=memptr, strides=strides + ) + if memptr is None and isinstance(val, Number): + obj.fill(val) + obj._rank = rank + obj._p0 = None + return obj + + subcomm = cls.get_subcomm(subcomm, global_shape, rank, alignment) + p0, subshape = cls.setup_pencil(subcomm, rank, global_shape, alignment) + + obj = cls.xp.ndarray.__new__(cls, subshape, dtype=dtype, memptr=memptr, strides=strides) + if memptr is None and isinstance(val, Number): + obj.fill(val) + obj._p0 = p0 + obj._rank = rank + return obj + + def get(self, *args, **kwargs): + """Untangle inheritance conflicts""" + if any(isinstance(me, tuple) for me in args): + return DistArrayBase.get(self, *args, **kwargs) + else: + return cp.ndarray.get(self, *args, **kwargs) + + def asnumpy(self): + """Copy the array to CPU""" + return self.get() + + @property + def v(self): + """Return local ``self`` array as an ``ndarray`` object""" + return cp.ndarray.__getitem__(self, slice(None, None, None)) diff --git a/mpi4py_fft/libfft.py b/mpi4py_fft/libfft.py index 580bb9e..aca5265 100644 --- a/mpi4py_fft/libfft.py +++ b/mpi4py_fft/libfft.py @@ -78,6 +78,30 @@ def _Xfftn_plan_fftw(shape, axes, dtype, transforms, options): xfftn_bck = plan_bck(V, s=s, axes=axes, threads=threads, flags=flags, output_array=U) return (xfftn_fwd, xfftn_bck) +def _Xfftn_plan_cupy(shape, axes, dtype, transforms, options): + import cupy as cp + cp.fft.config.enable_nd_planning = True + + transforms = {} if transforms is None else transforms + if tuple(axes) in transforms: + plan_fwd, plan_bck = transforms[tuple(axes)] + else: + if cp.issubdtype(dtype, cp.floating): + plan_fwd = cp.fft.rfftn + plan_bck = cp.fft.irfftn + else: + plan_fwd = cp.fft.fftn + plan_bck = cp.fft.ifftn + + s = tuple(np.take(shape, axes)) + U = cp.empty(shape=shape, dtype=dtype) + V = cp.empty_like(plan_fwd(U, s=s, axes=axes)) + + return ( + _Yfftn_wrap(plan_fwd, U, V, 1, {'s': s, 'axes': axes, 'norm': 'backward',}, xp=cp), + _Yfftn_wrap(plan_bck, V, U, 1, {'s': s, 'axes': axes, 'norm': 'forward',}, xp=cp), + ) + def _Xfftn_plan_numpy(shape, axes, dtype, transforms, options): transforms = {} if transforms is None else transforms @@ -125,6 +149,25 @@ def _Xfftn_plan_mkl(shape, axes, dtype, transforms, options): #pragma: no cover return (_Yfftn_wrap(plan_fwd, U, V, 1, {'s': s, 'axes': axes}), _Yfftn_wrap(plan_bck, V, U, M, {'s': s, 'axes': axes})) +def _Xfftn_plan_cupyx_scipy(shape, axes, dtype, transforms, options): + import cupy as cp + import cupyx.scipy.fft as cufft + + transforms = {} if transforms is None else transforms + if tuple(axes) in transforms: + plan_fwd, plan_bck = transforms[tuple(axes)] + else: + plan_fwd = cufft.fftn + plan_bck = cufft.ifftn + + s = tuple(np.take(shape, axes)) + U = cp.empty(shape=shape, dtype=dtype) + V = plan_fwd(U, s=s, axes=axes) + V = cp.array(V) + M = np.prod(s) + return (_Yfftn_wrap(plan_fwd, U, V, 1, {'s': s, 'axes': axes, 'overwrite_x': True}, xp=cp), + _Yfftn_wrap(plan_bck, V, U, M, {'s': s, 'axes': axes, 'overwrite_x': True}, xp=cp)) + def _Xfftn_plan_scipy(shape, axes, dtype, transforms, options): transforms = {} if transforms is None else transforms @@ -143,19 +186,23 @@ def _Xfftn_plan_scipy(shape, axes, dtype, transforms, options): return (_Yfftn_wrap(plan_fwd, U, V, 1, {'shape': s, 'axes': axes}), _Yfftn_wrap(plan_bck, V, U, M, {'shape': s, 'axes': axes})) +def _copyto(dst, src, xp): + xp.copyto(dst, src, casting='unsafe') + class _Yfftn_wrap(object): #Wraps numpy/scipy/mkl transforms to FFTW style # pylint: disable=too-few-public-methods - __slots__ = ('_xfftn', '_M', '_opt', '__doc__', '_input_array', '_output_array') + __slots__ = ('_xfftn', '_M', '_opt', '__doc__', '_input_array', '_output_array', 'xp') - def __init__(self, xfftn_obj, input_array, output_array, M, opt): + def __init__(self, xfftn_obj, input_array, output_array, M, opt, xp=np): object.__setattr__(self, '_xfftn', xfftn_obj) object.__setattr__(self, '_opt', opt) object.__setattr__(self, '_M', M) object.__setattr__(self, '_input_array', input_array) object.__setattr__(self, '_output_array', output_array) object.__setattr__(self, '__doc__', xfftn_obj.__doc__) + object.__setattr__(self, 'xp', xp) @property def input_array(self): @@ -177,9 +224,12 @@ def opt(self): def M(self): return object.__getattribute__(self, '_M') + def copyto(self, dst, src): + _copyto(dst, src, self.xp) + def __call__(self, *args, **kwargs): self.opt.update(kwargs) - self.output_array[...] = self.xfftn(self.input_array, **self.opt) + self.copyto(self._output_array, self.xfftn(self.input_array, **self.opt)) if abs(self.M-1) > 1e-8: self._output_array *= self.M return self.output_array @@ -188,13 +238,14 @@ class _Xfftn_wrap(object): #Common interface for all serial transforms # pylint: disable=too-few-public-methods - __slots__ = ('_xfftn', '__doc__', '_input_array', '_output_array') + __slots__ = ('_xfftn', '__doc__', '_input_array', '_output_array', 'xp') - def __init__(self, xfftn_obj, input_array, output_array): + def __init__(self, xfftn_obj, input_array, output_array, xp=np): object.__setattr__(self, '_xfftn', xfftn_obj) object.__setattr__(self, '_input_array', input_array) object.__setattr__(self, '_output_array', output_array) object.__setattr__(self, '__doc__', xfftn_obj.__doc__) + object.__setattr__(self, 'xp', xp) @property def input_array(self): @@ -208,12 +259,15 @@ def output_array(self): def xfftn(self): return object.__getattribute__(self, '_xfftn') + def copyto(self, dst, src): + _copyto(dst, src, self.xp) + def __call__(self, input_array=None, output_array=None, **options): if input_array is not None: - self.input_array[...] = input_array + self.copyto(self.input_array, input_array) self.xfftn(**options) if output_array is not None: - output_array[...] = self.output_array + self.copyto(output_array, self.output_array) return output_array else: return self.output_array @@ -310,7 +364,6 @@ def _padding_backward(self, trunc_array, padded_array): su[axis] = -(N//2) padded_array[tuple(su)] *= 0.5 - class FFT(FFTBase): """Class for serial FFT transforms @@ -373,6 +426,7 @@ class FFT(FFTBase): output_array : array """ + def __init__(self, shape, axes=None, dtype=float, padding=False, backend='fftw', transforms=None, **kw): FFTBase.__init__(self, shape, axes, dtype, padding) @@ -380,8 +434,10 @@ def __init__(self, shape, axes=None, dtype=float, padding=False, 'pyfftw': _Xfftn_plan_pyfftw, 'fftw': _Xfftn_plan_fftw, 'numpy': _Xfftn_plan_numpy, + 'cupy': _Xfftn_plan_cupy, 'mkl_fft': _Xfftn_plan_mkl, 'scipy': _Xfftn_plan_scipy, + 'cupyx-scipy': _Xfftn_plan_cupyx_scipy, }[backend] self.backend = backend self.fwd, self.bck = plan(self.shape, self.axes, self.dtype, transforms, kw) @@ -391,19 +447,27 @@ def __init__(self, shape, axes=None, dtype=float, padding=False, self.M = 1./np.prod(np.take(self.shape, self.axes)) else: self.M = self.fwd.get_normalization() - if backend == 'scipy': + if backend in ['scipy', 'cupyx-scipy']: self.real_transform = False # No rfftn/irfftn methods + self.padding_factor = 1.0 if padding is not False: self.padding_factor = padding[self.axes[-1]] if np.ndim(padding) else padding + xp = np + if 'cupy' in self.backend: + import cupy as cp + xp = cp + if abs(self.padding_factor-1.0) > 1e-8: assert len(self.axes) == 1 trunc_array = self._get_truncarray(shape, V.dtype) - self.forward = _Xfftn_wrap(self._forward, U, trunc_array) - self.backward = _Xfftn_wrap(self._backward, trunc_array, U) + if 'cupy' in self.backend: # TODO: Skip detour via CPU + trunc_array = cp.array(trunc_array) + self.forward = _Xfftn_wrap(self._forward, U, trunc_array, xp=xp) + self.backward = _Xfftn_wrap(self._backward, trunc_array, U, xp=xp) else: - self.forward = _Xfftn_wrap(self._forward, U, V) - self.backward = _Xfftn_wrap(self._backward, V, U) + self.forward = _Xfftn_wrap(self._forward, U, V, xp=xp) + self.backward = _Xfftn_wrap(self._backward, V, U, xp=xp) def _forward(self, **kw): normalize = kw.pop('normalize', True) diff --git a/mpi4py_fft/mpifft.py b/mpi4py_fft/mpifft.py index 4abe5d8..5b9a04b 100644 --- a/mpi4py_fft/mpifft.py +++ b/mpi4py_fft/mpifft.py @@ -63,17 +63,18 @@ def __call__(self, input_array=None, output_array=None, **kw): """ if input_array is not None: - self.input_array[...] = input_array + self._xfftn[0].copyto(self.input_array, input_array) for i in range(len(self._transfer)): self._xfftn[i](**kw) arrayA = self._xfftn[i].output_array arrayB = self._xfftn[i+1].input_array self._transfer[i](arrayA, arrayB) + self._xfftn[-1](**kw) if output_array is not None: - output_array[...] = self.output_array + self._xfftn[-1].copyto(output_array, self.output_array) return output_array else: return self.output_array @@ -128,6 +129,11 @@ class PFFT(object): fftn/ifftn for complex input arrays. Real-to-real transforms can be configured using this dictionary and real-to-real transforms from the :mod:`.fftw.xfftn` module. See Examples. + comm_backend : str, optional + Choose backend for communication. When using GPU based serial backends, + the "NCCL" backend or a "customMPI" backend can be be used in `Alltoallw` + operations to speedup GPU to GPU transfer. Keep in mind that this is used + alongside MPI and assumes one GPU per MPI rank is used. Other Parameters ---------------- @@ -201,7 +207,7 @@ class PFFT(object): """ def __init__(self, comm, shape=None, axes=None, dtype=float, grid=None, padding=False, collapse=False, backend='fftw', - transforms=None, darray=None, **kw): + transforms=None, darray=None, comm_backend='MPI', **kw): # pylint: disable=too-many-locals # pylint: disable=too-many-branches # pylint: disable=too-many-statements @@ -311,6 +317,7 @@ def __init__(self, comm, shape=None, axes=None, dtype=float, grid=None, self.pencil = [None, None] axes = self.axes[-1] + Pencil.backend = comm_backend pencil = Pencil(self.subcomm, shape, axes[-1]) xfftn = FFT(pencil.subshape, axes, dtype, padding, backend=backend, transforms=transforms, **kw) diff --git a/mpi4py_fft/pencil.py b/mpi4py_fft/pencil.py index 6671d8c..637646d 100644 --- a/mpi4py_fft/pencil.py +++ b/mpi4py_fft/pencil.py @@ -29,6 +29,17 @@ def _subarraytypes(comm, shape, axis, subshape, dtype): return tuple(datatypes) +def get_slice(subtype): + """ + Extract from the subtype object generated for MPI what shape the buffer + should have and what part of the array we want to send / receive. + """ + decoded = subtype.decode() + subsizes = decoded[2]['subsizes'] + starts = decoded[2]['starts'] + return tuple(slice(starts[i], starts[i] + subsizes[i]) for i in range(len(starts))) + + class Subcomm(tuple): r"""Class returning a tuple of subcommunicators of any dimensionality @@ -155,6 +166,7 @@ def __init__(self, comm, shape, dtype, subshapeA, axisA, subshapeB, axisB): + self.comm = comm self.shape = tuple(shape) self.dtype = dtype = np.dtype(dtype) @@ -179,8 +191,15 @@ def forward(self, arrayA, arrayB): assert self.subshapeB == arrayB.shape assert self.dtype == arrayA.dtype assert self.dtype == arrayB.dtype - self.comm.Alltoallw([arrayA, self._counts_displs, self._subtypesA], - [arrayB, self._counts_displs, self._subtypesB]) + self.Alltoallw(arrayA, self._subtypesA, arrayB, self._subtypesB) + + def Alltoallw(self, arrayA, subtypesA, arrayB, subtypesB): + """ + Redistribute arrayA to arrayB + """ + self.comm.Alltoallw([arrayA, self._counts_displs, subtypesA], + [arrayB, self._counts_displs, subtypesB]) + def backward(self, arrayB, arrayA): """Global redistribution from arrayB to arrayA @@ -197,8 +216,7 @@ def backward(self, arrayB, arrayA): assert self.subshapeB == arrayB.shape assert self.dtype == arrayA.dtype assert self.dtype == arrayB.dtype - self.comm.Alltoallw([arrayB, self._counts_displs, self._subtypesB], - [arrayA, self._counts_displs, self._subtypesA]) + self.Alltoallw(arrayB, self._subtypesB, arrayA, self._subtypesA) def destroy(self): for datatype in self._subtypesA: @@ -209,6 +227,196 @@ def destroy(self): datatype.Free() +class CustomMPITransfer(Transfer): + + def Alltoallw(self, arrayA, subtypesA, arrayB, subtypesB): + """ + Redistribute arrayA to arrayB. + """ + if type(arrayA) == np.ndarray: + xp = np + def synchronize_stream(): + pass + else: + import cupy as xp + synchronize_stream = xp.cuda.get_current_stream().synchronize + + rank, size, comm = self.comm.rank, self.comm.size, self.comm + + for i in range(1, size + 1): + send_to = (rank + i) % size + recv_from = (rank -i + size) % size + + sliceA = get_slice(subtypesA[send_to]) + sliceB = get_slice(subtypesB[recv_from]) + + if send_to == rank: + arrayB[sliceB][:] = arrayA[sliceA][:] + else: + recvbuf = xp.ascontiguousarray(arrayB[sliceB]) + sendbuf = xp.ascontiguousarray(arrayA[sliceA]) + + synchronize_stream() + comm.Sendrecv(sendbuf, send_to, recvbuf=recvbuf, source=recv_from) + arrayB[sliceB][:] = recvbuf[:] + + +class NCCLTransfer(Transfer): + """ + Transfer class which uses NCCL for `Alltoallw` operations. The NCCL + communicator will share rank and size attributes with the MPI communicator. + In particular, this assumes one GPU per MPI rank. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.forward_graph = None + self.backward_graph = None + + from cupy.cuda import nccl + self.comm_nccl = nccl.NcclCommunicator(self.comm.size, self.comm.bcast(nccl.get_unique_id(), root=0), self.comm.rank) + + # determine how to send the data. If we have complex numbers, we need to send two real numbers. + if self.dtype in [np.dtype('float32'), np.dtype('complex64')]: + self.NCCL_dtype = nccl.NCCL_FLOAT32 + elif self.dtype in [np.dtype('float64'), np.dtype('complex128')]: + self.NCCL_dtype = nccl.NCCL_FLOAT64 + elif self.dtype in [np.dtype('int32')]: + self.NCCL_dtype = nccl.NCCL_INT32 + elif self.dtype in [np.dtype('int64')]: + self.NCCL_dtype = nccl.NCCL_INT64 + else: + raise NotImplementedError(f'Don\'t know what NCCL dtype to use to send data of dtype {self.dtype}!') + self.count_modifier = 2 if 'complex' in str(self.dtype) else 1 + + self.arrayA = None + self.arrayB = None + + def _copy_to_input_arrays(self, arrayA, arrayB): + """ + This is needed to make the CUDA graph work with arbitrary input data. + + Parameters + ---------- + arrayA : array + Data to be copied to the graph input + arrayB : array + Data to be copied to the graph output + """ + if self.arrayA is None: + self.arrayA = arrayA + elif self.arrayA.data.ptr == arrayA.data.ptr: + pass + else: + self.arrayA[...] = arrayA + + if self.arrayB is None: + self.arrayB = arrayB + elif self.arrayB.data.ptr == arrayB.data.ptr: + pass + else: + self.arrayB[...] = arrayB + + def _retrieve_from_input_arrays(self, arrayA, arrayB): + """ + This is needed to make the CUDA graph work with arbitrary input data. + + Parameters + ---------- + arrayA : array + Data to be copied from the graph input + arrayB : array + Data to be copied from the graph output + """ + if self.arrayA.data.ptr == arrayA.data.ptr: + pass + else: + arrayA[...] = self.arrayA + + if self.arrayB.data.ptr == arrayB.data.ptr: + pass + else: + arrayB[...] = self.arrayB + + def backward(self, arrayB, arrayA): + """Global redistribution from arrayB to arrayA + + Parameters + ---------- + arrayB : array + Array of shape subshapeB, containing data to be redistributed + arrayA : array + Array of shape subshapeA, for receiving data + + """ + self._copy_to_input_arrays(arrayA, arrayB) + self.backward_graph = self.Alltoallw(self.arrayB, self._subtypesB, self.arrayA, self._subtypesA, graph=self.backward_graph) + self._retrieve_from_input_arrays(arrayA, arrayB) + + def forward(self, arrayA, arrayB): + """Global redistribution from arrayA to arrayB + + Parameters + ---------- + arrayA : array + Array of shape subshapeA, containing data to be redistributed + arrayB : array + Array of shape subshapeB, for receiving data + """ + self._copy_to_input_arrays(arrayA, arrayB) + self.forward_graph = self.Alltoallw(self.arrayA, self._subtypesA, self.arrayB, self._subtypesB, graph=self.forward_graph) + self._retrieve_from_input_arrays(arrayA, arrayB) + + def Alltoallw(self, arrayA, subtypesA, arrayB, subtypesB, graph=None): + """ + Redistribute arrayA to arrayB. + """ + import cupy as cp + rank, size, comm = self.comm.rank, self.comm.size, self.comm_nccl + + # record to a graph if we haven't already done so + if graph is None: + stream = cp.cuda.Stream(non_blocking=True) + with stream: + stream.begin_capture() + + # initialize dictionaries required to overlap sends + recvbufs = {} + sendbufs = {} + sliceBs = {} + + # perform all sends and receives in a single kernel to allow overlap + cp.cuda.nccl.groupStart() + for i in range(1, size + 1): + + send_to = (rank + i) % size + recv_from = (rank -i + size) % size + + sliceA = get_slice(subtypesA[send_to]) + sliceBs[i] = get_slice(subtypesB[recv_from]) + + recvbufs[i] = cp.ascontiguousarray(arrayB[sliceBs[i]]) + sendbufs[i] = cp.ascontiguousarray(arrayA[sliceA]) + + comm.recv(recvbufs[i].data.ptr, recvbufs[i].size * self.count_modifier, self.NCCL_dtype, recv_from, stream.ptr) + comm.send(sendbufs[i].data.ptr, sendbufs[i].size * self.count_modifier, self.NCCL_dtype, send_to, stream.ptr) + cp.cuda.nccl.groupEnd() + + for i in recvbufs.keys(): + cp.copyto(arrayB[sliceBs[i]], recvbufs[i]) + + graph = stream.end_capture() + + graph.launch(stream=cp.cuda.get_current_stream()) + return graph + + def destroy(self): + del self.forward_graph + del self.backward_graph + self.comm_nccl.destroy() + super().destroy() + + class Pencil(object): """Class to represent a distributed array (pencil) @@ -274,6 +482,8 @@ class Pencil(object): aligned in axis 1. """ + backend = 'MPI' + def __init__(self, subcomm, shape, axis=-1): assert len(shape) >= 2 assert min(shape) >= 1 @@ -349,6 +559,15 @@ def transfer(self, pencil, dtype): shape = list(penA.subshape) shape[axis] = penA.shape[axis] - return Transfer(comm, shape, dtype, + if self.backend == 'MPI': + transfer_class = Transfer + elif self.backend == 'NCCL': + transfer_class = NCCLTransfer + elif self.backend == 'customMPI': + transfer_class = CustomMPITransfer + else: + raise NotImplementedError(f'Don\'t have a transfer class for backend \"{self.backend}\"!') + + return transfer_class(comm, shape, dtype, penA.subshape, penA.axis, penB.subshape, penB.axis) diff --git a/tests/test_PFFT_cupy_backend.py b/tests/test_PFFT_cupy_backend.py new file mode 100644 index 0000000..73a742f --- /dev/null +++ b/tests/test_PFFT_cupy_backend.py @@ -0,0 +1,36 @@ +def test_PFFT_cupy_backend(): + import numpy as np + import cupy as cp + from mpi4py import MPI + from mpi4py_fft import PFFT, newDistArray + + comm = MPI.COMM_WORLD + + # Set global size of the computational box + N = np.array([comm.size * 8] * 3, dtype=int) + expected_shape = (N[0] // comm.size, N[1], N[2]) + axes = ((0,), (1, 2)) + + backends = ['numpy', 'cupy'] + FFTs = {backend: PFFT(comm, N, axes=axes, grid=(-1,), backend=backend) for backend in backends} + assert FFTs['numpy'].axes == FFTs['cupy'].axes + + us = {backend: newDistArray(FFTs[backend], forward_output=False) for backend in backends} + us['numpy'][:] = np.random.random(us['numpy'].shape).astype(us['numpy'].dtype) + us['cupy'][:] = cp.array(us['numpy']) + + + + for backend, xp in zip(backends, [np, cp]): + us['hat_' + backend] = newDistArray(FFTs[backend], forward_output=True) + us['hat_' + backend] = FFTs[backend].forward(us[backend], us['hat_' + backend]) + us['back_and_forth_' + backend] = xp.zeros_like(us[backend]) + us['back_and_forth_' + backend] = FFTs[backend].backward(us['hat_' + backend], us['back_and_forth_' + backend]) + + assert xp.allclose(us[backend], us['back_and_forth_' + backend]), f'Got different values after back and forth transformation with {backend} backend' + assert np.allclose(us['back_and_forth_' + backend].shape, expected_shape), f"Got unexpected shape {us['back_and_forth_' + backend].shape} when expecting {expected_shape} with {backend} backend" + assert np.allclose(us['hat_cupy'].get(), us['hat_numpy']), 'Got different values in frequency space' + + +if __name__ == '__main__': + test_PFFT_cupy_backend() diff --git a/tests/test_darrayCuPy.py b/tests/test_darrayCuPy.py new file mode 100644 index 0000000..fb0e1b0 --- /dev/null +++ b/tests/test_darrayCuPy.py @@ -0,0 +1,155 @@ +import cupy as cp +from mpi4py import MPI +from mpi4py_fft import DistArrayCuPy, newDistArray, PFFT +from mpi4py_fft.pencil import Subcomm + +comm = MPI.COMM_WORLD + + +def test_1Darray(): + N = (8,) + z = DistArrayCuPy(N, val=2) + assert z[0] == 2 + assert z.shape == N + + +def test_2Darray(): + N = (8, 8) + for subcomm in ((0, 1), (1, 0), None, Subcomm(comm, (0, 1))): + for rank in (0, 1, 2): + M = (2,) * rank + N + alignment = None + if subcomm is None and rank == 1: + alignment = 1 + a = DistArrayCuPy(M, subcomm=subcomm, val=1, rank=rank, alignment=alignment) + assert a.rank == rank + assert a.global_shape == M + _ = a.substart + c = a.subcomm + z = a.commsizes + _ = a.pencil + assert cp.prod(cp.array(z)) == comm.Get_size() + if rank > 0: + a0 = a[0] + assert isinstance(a0, DistArrayCuPy) + assert a0.rank == rank - 1 + aa = a.v + assert isinstance(aa, cp.ndarray) + try: + k = a.get((0,) * rank + (0, slice(None))) + if comm.Get_rank() == 0: + assert len(k) == N[1] + assert cp.sum(k) == N[1] + k = a.get((0,) * rank + (slice(None), 0)) + if comm.Get_rank() == 0: + assert len(k) == N[0] + assert cp.sum(k) == N[0] + except ModuleNotFoundError: + pass + _ = a.local_slice() + newaxis = (a.alignment + 1) % 2 + _ = a.get_pencil_and_transfer(newaxis) + a[:] = MPI.COMM_WORLD.Get_rank() + b = a.redistribute(newaxis) + a = b.redistribute(out=a) + a = b.redistribute(a.alignment, out=a) + s0 = MPI.COMM_WORLD.reduce(cp.linalg.norm(a) ** 2) + s1 = MPI.COMM_WORLD.reduce(cp.linalg.norm(b) ** 2) + if MPI.COMM_WORLD.Get_rank() == 0: + assert abs(s0 - s1) < 1e-1 + c = a.redistribute(a.alignment) + assert c is a + + +def test_3Darray(): + N = (8, 8, 8) + for subcomm in ( + (0, 0, 1), + (0, 1, 0), + (1, 0, 0), + (0, 1, 1), + (1, 0, 1), + (1, 1, 0), + None, + Subcomm(comm, (0, 0, 1)), + ): + for rank in (0, 1, 2): + M = (3,) * rank + N + alignment = None + if subcomm is None and rank == 1: + alignment = 2 + a = DistArrayCuPy(M, subcomm=subcomm, val=1, rank=rank, alignment=alignment) + assert a.rank == rank + assert a.global_shape == M + _ = a.substart + _ = a.subcomm + z = a.commsizes + _ = a.pencil + assert cp.prod(cp.array(z)) == comm.Get_size() + if rank > 0: + a0 = a[0] + assert isinstance(a0, DistArrayCuPy) + assert a0.rank == rank - 1 + if rank == 2: + a0 = a[0, 1] + assert isinstance(a0, DistArrayCuPy) + assert a0.rank == 0 + aa = a.v + assert isinstance(aa, cp.ndarray) + try: + k = a.get((0,) * rank + (0, 0, slice(None))) + if comm.Get_rank() == 0: + assert len(k) == N[2] + assert cp.sum(k) == N[2] + k = a.get((0,) * rank + (slice(None), 0, 0)) + if comm.Get_rank() == 0: + assert len(k) == N[0] + assert cp.sum(k) == N[0] + except ModuleNotFoundError: + pass + _ = a.local_slice() + newaxis = (a.alignment + 1) % 3 + _ = a.get_pencil_and_transfer(newaxis) + a[:] = MPI.COMM_WORLD.Get_rank() + b = a.redistribute(newaxis) + a = b.redistribute(out=a) + s0 = MPI.COMM_WORLD.reduce(cp.linalg.norm(a) ** 2) + s1 = MPI.COMM_WORLD.reduce(cp.linalg.norm(b) ** 2) + if MPI.COMM_WORLD.Get_rank() == 0: + assert abs(s0 - s1) < 1e-1 + + +def test_newDistArray(): + N = (8, 8, 8) + pfft = PFFT(MPI.COMM_WORLD, N, backend='cupy') + for forward_output in (True, False): + for view in (True, False): + for rank in (0, 1, 2): + a = newDistArray( + pfft, + forward_output=forward_output, + rank=rank, + view=view, + ) + if view is False: + assert isinstance(a, DistArrayCuPy) + assert a.rank == rank + if rank == 0: + qfft = PFFT(MPI.COMM_WORLD, darray=a, backend='cupy') + elif rank == 1: + qfft = PFFT(MPI.COMM_WORLD, darray=a[0], backend='cupy') + else: + qfft = PFFT(MPI.COMM_WORLD, darray=a[0, 0], backend='cupy') + qfft.destroy() + + else: + assert isinstance(a, cp.ndarray) + assert a.base.rank == rank + pfft.destroy() + + +if __name__ == "__main__": + test_1Darray() + test_2Darray() + test_3Darray() + test_newDistArray() diff --git a/tests/test_libfft.py b/tests/test_libfft.py index 23c6c8e..180fc11 100644 --- a/tests/test_libfft.py +++ b/tests/test_libfft.py @@ -7,18 +7,32 @@ from mpi4py_fft.libfft import FFT has_backend = {'fftw': True} -for backend in ('pyfftw', 'mkl_fft', 'scipy', 'numpy'): +for backend in ('pyfftw', 'mkl_fft', 'scipy', 'numpy', 'cupy'): has_backend[backend] = True try: importlib.import_module(backend) except ImportError: has_backend[backend] = False +if has_backend['cupy']: + import cupy as cp + has_backend['cupyx-scipy'] = True + abstol = dict(f=5e-5, d=1e-14, g=1e-14) +def get_xpy(backend=None, array=None): + if backend in ['cupy', 'cupyx-scipy']: + return cp + if has_backend['cupy'] and array is not None: + if type(array) == cp.ndarray: + return cp + return np + + def allclose(a, b): atol = abstol[a.dtype.char.lower()] - return np.allclose(a, b, rtol=0, atol=atol) + xp = get_xpy(array=a) + return xp.allclose(a, b, rtol=0, atol=atol) def test_libfft(): from itertools import product @@ -30,9 +44,10 @@ def test_libfft(): if fftw.get_fftw_lib(t): types += t+t.upper() - for backend in ('pyfftw', 'mkl_fft', 'scipy', 'numpy', 'fftw'): + for backend in has_backend.keys(): if has_backend[backend] is False: continue + xp = get_xpy(backend=backend) t0 = 0 for typecode in types: for dim in dims: @@ -47,7 +62,7 @@ def test_libfft(): A = fft.forward.input_array B = fft.forward.output_array - A[...] = np.random.random(A.shape).astype(typecode) + A[...] = xp.random.random(A.shape).astype(typecode) X = A.copy() B.fill(0) @@ -59,14 +74,15 @@ def test_libfft(): t0 -= time() A = fft.backward(B, A) t0 += time() - assert allclose(A, X) + assert allclose(A, X), f'Failed with {backend} backend with type \"{typecode}\" in {dim} dims!' print('backend: ', backend, t0) # Padding is different because the physical space is padded and as such # difficult to initialize. We solve this problem by making one extra # transform - for backend in ('pyfftw', 'mkl_fft', 'scipy', 'numpy', 'fftw'): + for backend in has_backend.keys(): if has_backend[backend] is False: continue + xp = get_xpy(backend=backend) for padding in (1.5, 2.0): for typecode in types: for dim in dims: @@ -84,7 +100,7 @@ def test_libfft(): A = fft.forward.input_array B = fft.forward.output_array - A[...] = np.random.random(A.shape).astype(typecode) + A[...] = xp.random.random(A.shape).astype(typecode) B.fill(0) B = fft.forward(A, B) @@ -95,11 +111,12 @@ def test_libfft(): B.fill(0) B = fft.forward(A, B) - assert allclose(B, X), np.linalg.norm(B-X) + assert allclose(B, X), xp.linalg.norm(B-X) - for backend in ('pyfftw', 'mkl_fft', 'scipy', 'numpy', 'fftw'): + for backend in has_backend.keys(): if has_backend[backend] is False: continue + xp = get_xpy(backend=backend) if backend == 'fftw': dctn = functools.partial(fftw.dctn, type=3) @@ -113,6 +130,9 @@ def test_libfft(): elif backend == 'numpy': transforms = {(1,): (np.fft.rfftn, np.fft.irfftn), (0, 1): (np.fft.rfftn, np.fft.irfftn)} + elif backend == 'cupy': + transforms = {(1,): (cp.fft.rfftn, cp.fft.irfftn), + (0, 1): (cp.fft.rfftn, cp.fft.irfftn)} elif backend == 'mkl_fft': import mkl_fft transforms = {(1,): (mkl_fft._numpy_fft.rfftn, mkl_fft._numpy_fft.irfftn), @@ -121,12 +141,17 @@ def test_libfft(): from scipy.fftpack import fftn, ifftn transforms = {(1,): (fftn, ifftn), (0, 1): (fftn, ifftn)} + elif backend == 'cupyx-scipy': + from scipy.fftpack import fftn, ifftn + import cupyx.scipy.fft as fftlib + transforms = {(1,): (fftlib.fftn, fftlib.ifftn), + (0, 1): (fftlib.fftn, fftlib.ifftn)} for axis in ((1,), (0, 1)): fft = FFT(shape, axis, backend=backend, transforms=transforms) A = fft.forward.input_array B = fft.forward.output_array - A[...] = np.random.random(A.shape) + A[...] = xp.random.random(A.shape) X = A.copy() B.fill(0) B = fft.forward(A, B) diff --git a/tests/test_pencil.py b/tests/test_pencil.py index 9a74c40..6985e46 100644 --- a/tests/test_pencil.py +++ b/tests/test_pencil.py @@ -11,53 +11,74 @@ def test_pencil(): sizes = (7, 8, 9) types = 'fdFD' #'hilfdgFDG' + backends = ['MPI', 'customMPI'] + + xp = { + 'MPI': np, + 'customMPI': np, + } + + try: + import cupy as cp + from cupy.cuda import nccl + backends += ['NCCL'] + xp['NCCL'] = cp + except ImportError: + pass + for typecode in types: for dim in dims: for shape in product(*([sizes]*dim)): - axes = list(range(dim)) - for axis1, axis2, axis3 in product(axes, axes, axes): + for backend in backends: + + Pencil.backend = backend + axes = list(range(dim)) + + for axis1, axis2, axis3 in product(axes, axes, axes): + + if axis1 == axis2: continue + if axis2 == axis3: continue + axis3 -= len(shape) + #if comm.rank == 0: + # print(shape, axis1, axis2, axis3, typecode) - if axis1 == axis2: continue - if axis2 == axis3: continue - axis3 -= len(shape) - #if comm.rank == 0: - # print(shape, axis1, axis2, axis3, typecode) + for pdim in [None] + list(range(1, dim-1)): - for pdim in [None] + list(range(1, dim-1)): + subcomm = Subcomm(comm, pdim) + pencil0 = Pencil(subcomm, shape) - subcomm = Subcomm(comm, pdim) - pencil0 = Pencil(subcomm, shape) + pencilA = pencil0.pencil(axis1) + pencilB = pencilA.pencil(axis2) + pencilC = pencilB.pencil(axis3) - pencilA = pencil0.pencil(axis1) - pencilB = pencilA.pencil(axis2) - pencilC = pencilB.pencil(axis3) + assert pencilC.backend == backend - trans1 = Pencil.transfer(pencilA, pencilB, typecode) - trans2 = Pencil.transfer(pencilB, pencilC, typecode) + trans1 = Pencil.transfer(pencilA, pencilB, typecode) + trans2 = Pencil.transfer(pencilB, pencilC, typecode) - X = np.random.random(pencilA.subshape).astype(typecode) + X = xp[backend].random.random(pencilA.subshape).astype(typecode) - A = np.empty(pencilA.subshape, dtype=typecode) - B = np.empty(pencilB.subshape, dtype=typecode) - C = np.empty(pencilC.subshape, dtype=typecode) + A = xp[backend].empty(pencilA.subshape, dtype=typecode) + B = xp[backend].empty(pencilB.subshape, dtype=typecode) + C = xp[backend].empty(pencilC.subshape, dtype=typecode) - A[...] = X + A[...] = X - B.fill(0) - trans1.forward(A, B) - C.fill(0) - trans2.forward(B, C) + B.fill(0) + trans1.forward(A, B) + C.fill(0) + trans2.forward(B, C) - B.fill(0) - trans2.backward(C, B) - A.fill(0) - trans1.backward(B, A) + B.fill(0) + trans2.backward(C, B) + A.fill(0) + trans1.backward(B, A) - assert np.allclose(A, X) + assert xp[backend].allclose(A, X) - trans1.destroy() - trans2.destroy() - subcomm.destroy() + trans1.destroy() + trans2.destroy() + subcomm.destroy() if __name__ == '__main__': diff --git a/tests/test_transfer_classes.py b/tests/test_transfer_classes.py new file mode 100644 index 0000000..f979730 --- /dev/null +++ b/tests/test_transfer_classes.py @@ -0,0 +1,92 @@ +from mpi4py import MPI +from mpi4py_fft.pencil import Transfer, CustomMPITransfer, Pencil, Subcomm +import numpy as np + +transfer_classes = [CustomMPITransfer] +xps = {CustomMPITransfer: np} + +try: + import cupy as cp + from mpi4py_fft.pencil import NCCLTransfer + transfer_classes += [NCCLTransfer] + xps[NCCLTransfer] = cp +except ModuleNotFoundError: + pass + + +def get_args(comm, shape, dtype): + subcomm = Subcomm(comm=comm, dims=None) + pencilA = Pencil(subcomm, shape, 0) + pencilB = Pencil(subcomm, shape, 1) + + kwargs = { + 'comm': comm, + 'shape': shape, + 'subshapeA': pencilA.subshape, + 'subshapeB': pencilB.subshape, + 'axisA': 0, + 'axisB': 1, + 'dtype': dtype, + } + return kwargs + + +def get_arrays(kwargs, xp): + arrayA = xp.zeros(shape=kwargs['subshapeA'], dtype=kwargs['dtype']) + arrayB = xp.zeros(shape=kwargs['subshapeB'], dtype=kwargs['dtype']) + + arrayA[:] = xp.random.random(arrayA.shape).astype(arrayA.dtype) + return arrayA, arrayB + + +def single_test_all_to_allw(transfer_class, shape, dtype, comm=None, xp=None): + comm = comm if comm else MPI.COMM_WORLD + kwargs = get_args(comm, shape, dtype) + arrayA, arrayB = get_arrays(kwargs, xp) + arrayB_ref = arrayB.copy() + + transfer = transfer_class(**kwargs) + reference_transfer = Transfer(**kwargs) + + transfer.Alltoallw(arrayA, transfer._subtypesA, arrayB, transfer._subtypesB) + reference_transfer.Alltoallw(arrayA, transfer._subtypesA, arrayB_ref, transfer._subtypesB) + assert xp.allclose(arrayB, arrayB_ref), f'Did not get the same result from `alltoallw` with {transfer_class.__name__} transfer class as MPI implementation on rank {comm.rank}!' + + comm.Barrier() + if comm.rank == 0: + print(f'{transfer_class.__name__} passed alltoallw test with shape {shape} and dtype {dtype}') + + +def single_test_forward_backward(transfer_class, shape, dtype, comm=None, xp=None): + comm = comm if comm else MPI.COMM_WORLD + kwargs = get_args(comm, shape, dtype) + arrayA, arrayB = get_arrays(kwargs, xp) + arrayA_ref = arrayA.copy() + + transfer = transfer_class(**kwargs) + + transfer.forward(arrayA, arrayB) + transfer.backward(arrayB, arrayA) + assert xp.allclose(arrayA, arrayA_ref), f'Did not get the same result when transferring back and forth with {transfer_class.__name__} transfer class on rank {comm.rank}!' + + comm.Barrier() + if comm.rank == 0: + print(f'{transfer_class.__name__} passed forward/backward test with shape {shape} and dtype {dtype}') + + +def test_transfer_class(): + dims = (2, 3) + sizes = (7, 8, 9, 128) + dtypes = 'fFdD' + + shapes = [[size] * dim for size in sizes for dim in dims] + [[32, 256, 129]] + + for transfer_class in transfer_classes: + for shape in shapes: + for dtype in dtypes: + single_test_all_to_allw(transfer_class, shape, dtype, xp=xps[transfer_class]) + single_test_forward_backward(transfer_class, shape, dtype, xp=xps[transfer_class]) + + +if __name__ == '__main__': + test_transfer_class()