diff --git a/mpi4py_fft/__init__.py b/mpi4py_fft/__init__.py
index 16ae4c8..75442a7 100644
--- a/mpi4py_fft/__init__.py
+++ b/mpi4py_fft/__init__.py
@@ -24,3 +24,8 @@
 from . import fftw
 from .fftw import fftlib
 from .io import HDF5File, NCFile, generate_xdmf
+
+try:
+    from .distarrayCuPy import DistArrayCuPy
+except ModuleNotFoundError:
+    pass
diff --git a/mpi4py_fft/distarray.py b/mpi4py_fft/distarray.py
index 3a69e5c..ff92b5c 100644
--- a/mpi4py_fft/distarray.py
+++ b/mpi4py_fft/distarray.py
@@ -7,34 +7,13 @@
 
 comm = MPI.COMM_WORLD
 
-class DistArray(np.ndarray):
-    """Distributed Numpy array
-
-    This Numpy array is part of a larger global array. Information about the
-    distribution is contained in the attributes.
-
-    Parameters
-    ----------
-    global_shape : sequence of ints
-        Shape of non-distributed global array
-    subcomm : None, :class:`.Subcomm` object or sequence of ints, optional
-        Describes how to distribute the array
-    val : Number or None, optional
-        Initialize array with this number if buffer is not given
-    dtype : np.dtype, optional
-        Type of array
-    buffer : Numpy array, optional
-        Array of correct shape. The buffer owns the memory that is used for
-        this array.
-    alignment : None or int, optional
-        Make sure array is aligned in this direction. Note that alignment does
-        not take rank into consideration.
-    rank : int, optional
-        Rank of tensor (number of free indices, a scalar is zero, vector one,
-        matrix two)
 
+class DistArrayBase:
+    """
+    Abstract class for distributed arrays in numpy or cupy implementation
 
-    For more information, see `numpy.ndarray <https://docs.scipy.org/doc/numpy/reference/arrays.ndarray.html>`_
+    Attributes:
+        - xp: Numerical library, a.k.a. numpy or cupy as class attribute
 
     Note
     ----
@@ -53,58 +32,9 @@ class DistArray(np.ndarray):
     Also note that the ``alignment`` keyword does not take rank into
     consideration. Setting alignment=2 for the array above means that the last
     axis will be aligned, also when rank>0.
-
     """
-    def __new__(cls, global_shape, subcomm=None, val=None, dtype=float,
-                buffer=None, strides=None, alignment=None, rank=0):
-        if len(global_shape[rank:]) < 2: # 1D case
-            obj = np.ndarray.__new__(cls, global_shape, dtype=dtype, buffer=buffer, strides=strides)
-            if buffer is None and isinstance(val, Number):
-                obj.fill(val)
-            obj._rank = rank
-            obj._p0 = None
-            return obj
-
-        if isinstance(subcomm, Subcomm):
-            pass
-        else:
-            if isinstance(subcomm, (tuple, list)):
-                assert len(subcomm) == len(global_shape[rank:])
-                # Do nothing if already containing communicators. A tuple of subcommunicators is not necessarily a Subcomm
-                if not np.all([isinstance(s, MPI.Comm) for s in subcomm]):
-                    subcomm = Subcomm(comm, subcomm)
-            else:
-                assert subcomm is None
-                subcomm = [0] * len(global_shape[rank:])
-                if alignment is not None:
-                    subcomm[alignment] = 1
-                else:
-                    subcomm[-1] = 1
-                    alignment = len(subcomm)-1
-                subcomm = Subcomm(comm, subcomm)
-        sizes = [s.Get_size() for s in subcomm]
-        if alignment is not None:
-            assert isinstance(alignment, (int, np.integer))
-            assert sizes[alignment] == 1
-        else:
-            # Decide that alignment is the last axis with size 1
-            alignment = np.flatnonzero(np.array(sizes) == 1)[-1]
-        p0 = Pencil(subcomm, global_shape[rank:], axis=alignment)
-        subshape = p0.subshape
-        if rank > 0:
-            subshape = global_shape[:rank] + subshape
-        obj = np.ndarray.__new__(cls, subshape, dtype=dtype, buffer=buffer)
-        if buffer is None and isinstance(val, Number):
-            obj.fill(val)
-        obj._p0 = p0
-        obj._rank = rank
-        return obj
 
-    def __array_finalize__(self, obj):
-        if obj is None:
-            return
-        self._p0 = getattr(obj, '_p0', None)
-        self._rank = getattr(obj, '_rank', None)
+    xp = None
 
     @property
     def alignment(self):
@@ -152,32 +82,72 @@ def dimensions(self):
         """Return dimensions of array not including rank"""
         return len(self._p0.shape)
 
+    @staticmethod
+    def get_subcomm(subcomm, global_shape, rank, alignment):
+        if isinstance(subcomm, Subcomm):
+            pass
+        else:
+            if isinstance(subcomm, (tuple, list)):
+                assert len(subcomm) == len(global_shape[rank:])
+                # Do nothing if already containing communicators. A tuple of subcommunicators is not necessarily a Subcomm
+                if not np.all([isinstance(s, MPI.Comm) for s in subcomm]):
+                    subcomm = Subcomm(comm, subcomm)
+            else:
+                assert subcomm is None
+                subcomm = [0] * len(global_shape[rank:])
+                if alignment is not None:
+                    subcomm[alignment] = 1
+                else:
+                    subcomm[-1] = 1
+                    alignment = len(subcomm) - 1
+                subcomm = Subcomm(comm, subcomm)
+        return subcomm
+
+    @classmethod
+    def setup_pencil(cls, subcomm, rank, global_shape, alignment):
+        sizes = [s.Get_size() for s in subcomm]
+        if alignment is not None:
+            assert isinstance(alignment, (int, cls.xp.integer))
+            assert sizes[alignment] == 1
+        else:
+            # Decide that alignment is the last axis with size 1
+            alignment = np.flatnonzero(np.array(sizes) == 1)[-1]
+
+        p0 = Pencil(subcomm, global_shape[rank:], axis=alignment)
+
+        subshape = p0.subshape
+        if rank > 0:
+            subshape = global_shape[:rank] + subshape
+
+        return p0, subshape
+
+    def __array_finalize__(self, obj):
+        if obj is None:
+            return
+        self._p0 = getattr(obj, "_p0", None)
+        self._rank = getattr(obj, "_rank", None)
+
     def __getitem__(self, i):
         # Return DistArray if the result is a component of a tensor
         # Otherwise return ndarray view
         if self.ndim == 1:
-            return np.ndarray.__getitem__(self, i)
+            return self.xp.ndarray.__getitem__(self, i)
 
         if isinstance(i, (Integral, slice)) and self.rank > 0:
-            v0 = np.ndarray.__getitem__(self, i)
+            v0 = self.xp.ndarray.__getitem__(self, i)
             v0._rank = self.rank - (self.ndim - v0.ndim)
             return v0
 
         if isinstance(i, (Integral, slice)) and self.rank == 0:
-            return np.ndarray.__getitem__(self.v, i)
+            return self.xp.ndarray.__getitem__(self.v, i)
 
         assert isinstance(i, tuple)
         if len(i) <= self.rank:
-            v0 = np.ndarray.__getitem__(self, i)
+            v0 = self.xp.ndarray.__getitem__(self, i)
             v0._rank = self.rank - (self.ndim - v0.ndim)
             return v0
 
-        return np.ndarray.__getitem__(self.v, i)
-
-    @property
-    def v(self):
-        """ Return local ``self`` array as an ``ndarray`` object"""
-        return self.__array__()
+        return self.xp.ndarray.__getitem__(self.v, i)
 
     def get(self, gslice):
         """Return global slice of ``self``
@@ -215,6 +185,7 @@ def get(self, gslice):
         # global MPI. We create a global file with MPI, but then open it without
         # MPI and only on rank 0.
         import h5py
+        # TODO: can we use h5py to communicate the data without copying to cpu first when using cupy?
         f = h5py.File('tmp.h5', 'w', driver="mpio", comm=comm)
         s = self.local_slice()
         sp = np.nonzero([isinstance(x, slice) for x in gslice])[0]
@@ -230,7 +201,8 @@ def get(self, gslice):
             else:
                 on_this_proc = False
         if on_this_proc:
-            f["data"][sf] = self[tuple(gslice)]
+            data = self.asnumpy()
+            f["data"][sf] = data[tuple(gslice)]
         f.close()
         c = None
         if comm.Get_rank() == 0:
@@ -277,24 +249,6 @@ def local_slice(self):
                                                                self._p0.subshape)]
         return tuple([slice(0, s) for s in self.shape[:self.rank]] + v)
 
-    def get_pencil_and_transfer(self, axis):
-        """Return pencil and transfer objects for alignment along ``axis``
-
-        Parameters
-        ----------
-        axis : int
-            The new axis to align data with
-
-        Returns
-        -------
-        2-tuple
-            2-tuple where first item is a :class:`.Pencil` aligned in ``axis``.
-            Second item is a :class:`.Transfer` object for executing the
-            redistribution of data
-        """
-        p1 = self._p0.pencil(axis)
-        return p1, self._p0.transfer(p1, self.dtype)
-
     def redistribute(self, axis=None, out=None):
         """Global redistribution of local ``self`` array
 
@@ -327,7 +281,7 @@ def redistribute(self, axis=None, out=None):
                 return self
 
         if out is not None:
-            assert isinstance(out, DistArray)
+            assert issubclass(type(out), DistArrayBase)
             assert self.global_shape == out.global_shape
             axis = out.alignment
             if self.commsizes == out.commsizes:
@@ -343,11 +297,11 @@ def redistribute(self, axis=None, out=None):
 
         p1, transfer = self.get_pencil_and_transfer(axis)
         if out is None:
-            out = DistArray(self.global_shape,
-                            subcomm=p1.subcomm,
-                            dtype=self.dtype,
-                            alignment=axis,
-                            rank=self.rank)
+            out = type(self)(self.global_shape,
+                             subcomm=p1.subcomm,
+                             dtype=self.dtype,
+                             alignment=axis,
+                             rank=self.rank)
 
         if self.rank == 0:
             transfer.forward(self, out)
@@ -362,6 +316,24 @@ def redistribute(self, axis=None, out=None):
         transfer.destroy()
         return out
 
+    def get_pencil_and_transfer(self, axis):
+        """Return pencil and transfer objects for alignment along ``axis``
+
+        Parameters
+        ----------
+        axis : int
+            The new axis to align data with
+
+        Returns
+        -------
+        2-tuple
+            2-tuple where first item is a :class:`.Pencil` aligned in ``axis``.
+            Second item is a :class:`.Transfer` object for executing the
+            redistribution of data
+        """
+        p1 = self._p0.pencil(axis)
+        return p1, self._p0.transfer(p1, self.dtype)
+
     def write(self, filename, name='darray', step=0, global_slice=None,
               domain=None, as_scalar=False):
         """Write snapshot ``step`` of ``self`` to file ``filename``
@@ -439,6 +411,67 @@ def read(self, filename, name='darray', step=0):
         f.read(self, name, step=step)
 
 
+class DistArray(DistArrayBase, np.ndarray):
+    """Distributed Numpy array
+
+    This Numpy array is part of a larger global array. Information about the
+    distribution is contained in the attributes.
+
+    Parameters
+    ----------
+    global_shape : sequence of ints
+        Shape of non-distributed global array
+    subcomm : None, :class:`.Subcomm` object or sequence of ints, optional
+        Describes how to distribute the array
+    val : Number or None, optional
+        Initialize array with this number if buffer is not given
+    dtype : np.dtype, optional
+        Type of array
+    buffer : Numpy array, optional
+        Array of correct shape. The buffer owns the memory that is used for
+        this array.
+    alignment : None or int, optional
+        Make sure array is aligned in this direction. Note that alignment does
+        not take rank into consideration.
+    rank : int, optional
+        Rank of tensor (number of free indices, a scalar is zero, vector one,
+        matrix two)
+
+
+    For more information, see `numpy.ndarray <https://docs.scipy.org/doc/numpy/reference/arrays.ndarray.html>`_
+
+    """
+
+    xp = np
+
+    def __new__(cls, global_shape, subcomm=None, val=None, dtype=float,
+                buffer=None, strides=None, alignment=None, rank=0):
+        if len(global_shape[rank:]) < 2: # 1D case
+            obj = cls.xp.ndarray.__new__(cls, global_shape, dtype=dtype, buffer=buffer, strides=strides)
+            if buffer is None and isinstance(val, Number):
+                obj.fill(val)
+            obj._rank = rank
+            obj._p0 = None
+            return obj
+
+        subcomm = cls.get_subcomm(subcomm, global_shape, rank, alignment)
+        p0, subshape = cls.setup_pencil(subcomm, rank, global_shape, alignment)
+
+        obj = cls.xp.ndarray.__new__(cls, subshape, dtype=dtype, buffer=buffer)
+        if buffer is None and isinstance(val, Number):
+            obj.fill(val)
+        obj._p0 = p0
+        obj._rank = rank
+        return obj
+
+    @property
+    def v(self):
+        """Return local ``self`` array as an ``ndarray`` object"""
+        return self.__array__()
+
+    def asnumpy(self):
+        return self
+
 def newDistArray(pfft, forward_output=True, val=0, rank=0, view=False):
     """Return a new :class:`.DistArray` object for provided :class:`.PFFT` object
 
@@ -480,7 +513,13 @@ def newDistArray(pfft, forward_output=True, val=0, rank=0, view=False):
     else:
         dtype = pfft.forward.input_array.dtype
     global_shape = (len(global_shape),)*rank + global_shape
-    z = DistArray(global_shape, subcomm=p0.subcomm, val=val, dtype=dtype,
+
+    if pfft.xfftn[0].backend in ["cupy", "cupyx-scipy"]:
+        from mpi4py_fft.distarrayCuPy import DistArrayCuPy as darraycls
+    else:
+        darraycls = DistArray
+
+    z = darraycls(global_shape, subcomm=p0.subcomm, val=val, dtype=dtype,
                   alignment=p0.axis, rank=rank)
     return z.v if view else z
 
diff --git a/mpi4py_fft/distarrayCuPy.py b/mpi4py_fft/distarrayCuPy.py
new file mode 100644
index 0000000..2c243bb
--- /dev/null
+++ b/mpi4py_fft/distarrayCuPy.py
@@ -0,0 +1,86 @@
+from mpi4py_fft.distarray import DistArrayBase
+import cupy as cp
+from mpi4py import MPI
+from numbers import Number
+
+comm = MPI.COMM_WORLD
+
+
+class DistArrayCuPy(DistArrayBase, cp.ndarray):
+    """Distributed CuPy array
+
+    This CuPy array is part of a larger global array. Information about the
+    distribution is contained in the attributes.
+
+    Parameters
+    ----------
+    global_shape : sequence of ints
+        Shape of non-distributed global array
+    subcomm : None, :class:`.Subcomm` object or sequence of ints, optional
+        Describes how to distribute the array
+    val : Number or None, optional
+        Initialize array with this number if buffer is not given
+    dtype : cp.dtype, optional
+        Type of array
+    memptr : cupy.cuda.MemoryPointer, optional
+        Pointer to the array content head.
+    alignment : None or int, optional
+        Make sure array is aligned in this direction. Note that alignment does
+        not take rank into consideration.
+    rank : int, optional
+        Rank of tensor (number of free indices, a scalar is zero, vector one,
+        matrix two)
+
+
+    For more information, see `cupy.ndarray <https://docs.cupy.dev/en/stable/reference/generated/cupy.ndarray.html#cupy.ndarray>`_
+
+    """
+
+    xp = cp
+
+    def __new__(
+        cls,
+        global_shape,
+        subcomm=None,
+        val=None,
+        dtype=float,
+        memptr=None,
+        strides=None,
+        alignment=None,
+        rank=0,
+    ):
+        if len(global_shape[rank:]) < 2:  # 1D case
+            obj = cls.xp.ndarray.__new__(
+                cls, global_shape, dtype=dtype, memptr=memptr, strides=strides
+            )
+            if memptr is None and isinstance(val, Number):
+                obj.fill(val)
+            obj._rank = rank
+            obj._p0 = None
+            return obj
+
+        subcomm = cls.get_subcomm(subcomm, global_shape, rank, alignment)
+        p0, subshape = cls.setup_pencil(subcomm, rank, global_shape, alignment)
+
+        obj = cls.xp.ndarray.__new__(cls, subshape, dtype=dtype, memptr=memptr, strides=strides)
+        if memptr is None and isinstance(val, Number):
+            obj.fill(val)
+        obj._p0 = p0
+        obj._rank = rank
+        return obj
+
+    def get(self, *args, **kwargs):
+        """Untangle inheritance conflicts"""
+        if any(isinstance(me, tuple) for me in args):
+            return DistArrayBase.get(self, *args, **kwargs)
+        else:
+            return cp.ndarray.get(self, *args, **kwargs)
+
+    def asnumpy(self):
+        """Copy the array to CPU"""
+        return self.get()
+
+    @property
+    def v(self):
+        """Return local ``self`` array as an ``ndarray`` object"""
+        return cp.ndarray.__getitem__(self, slice(None, None, None))
diff --git a/mpi4py_fft/libfft.py b/mpi4py_fft/libfft.py
index 580bb9e..aca5265 100644
--- a/mpi4py_fft/libfft.py
+++ b/mpi4py_fft/libfft.py
@@ -78,6 +78,30 @@ def _Xfftn_plan_fftw(shape, axes, dtype, transforms, options):
     xfftn_bck = plan_bck(V, s=s, axes=axes, threads=threads, flags=flags, output_array=U)
     return (xfftn_fwd, xfftn_bck)
 
+def _Xfftn_plan_cupy(shape, axes, dtype, transforms, options):
+    import cupy as cp
+    cp.fft.config.enable_nd_planning = True
+
+    transforms = {} if transforms is None else transforms
+    if tuple(axes) in transforms:
+        plan_fwd, plan_bck = transforms[tuple(axes)]
+    else:
+        if cp.issubdtype(dtype, cp.floating):
+            plan_fwd = cp.fft.rfftn
+            plan_bck = cp.fft.irfftn
+        else:
+            plan_fwd = cp.fft.fftn
+            plan_bck = cp.fft.ifftn
+
+    s = tuple(np.take(shape, axes))
+    U = cp.empty(shape=shape, dtype=dtype)
+    V = cp.empty_like(plan_fwd(U, s=s, axes=axes))
+
+    return (
+        _Yfftn_wrap(plan_fwd, U, V, 1, {'s': s, 'axes': axes, 'norm': 'backward',}, xp=cp),
+        _Yfftn_wrap(plan_bck, V, U, 1, {'s': s, 'axes': axes, 'norm': 'forward',}, xp=cp),
+    )
+
 def _Xfftn_plan_numpy(shape, axes, dtype, transforms, options):
 
     transforms = {} if transforms is None else transforms
@@ -125,6 +149,25 @@ def _Xfftn_plan_mkl(shape, axes, dtype, transforms, options): #pragma: no cover
     return (_Yfftn_wrap(plan_fwd, U, V, 1, {'s': s, 'axes': axes}),
             _Yfftn_wrap(plan_bck, V, U, M, {'s': s, 'axes': axes}))
 
+def _Xfftn_plan_cupyx_scipy(shape, axes, dtype, transforms, options):
+    import cupy as cp
+    import cupyx.scipy.fft as cufft
+
+    transforms = {} if transforms is None else transforms
+    if tuple(axes) in transforms:
+        plan_fwd, plan_bck = transforms[tuple(axes)]
+    else:
+        plan_fwd = cufft.fftn
+        plan_bck = cufft.ifftn
+
+    s = tuple(np.take(shape, axes))
+    U = cp.empty(shape=shape, dtype=dtype)
+    V = plan_fwd(U, s=s, axes=axes)
+    V = cp.array(V)
+    M = np.prod(s)
+    return (_Yfftn_wrap(plan_fwd, U, V, 1, {'s': s, 'axes': axes, 'overwrite_x': True}, xp=cp),
+            _Yfftn_wrap(plan_bck, V, U, M, {'s': s, 'axes': axes, 'overwrite_x': True}, xp=cp))
+
 def _Xfftn_plan_scipy(shape, axes, dtype, transforms, options):
 
     transforms = {} if transforms is None else transforms
@@ -143,19 +186,23 @@ def _Xfftn_plan_scipy(shape, axes, dtype, transforms, options):
     return (_Yfftn_wrap(plan_fwd, U, V, 1, {'shape': s, 'axes': axes}),
             _Yfftn_wrap(plan_bck, V, U, M, {'shape': s, 'axes': axes}))
 
+def _copyto(dst, src, xp):
+    xp.copyto(dst, src, casting='unsafe')
+
 class _Yfftn_wrap(object):
     #Wraps numpy/scipy/mkl transforms to FFTW style
     # pylint: disable=too-few-public-methods
 
-    __slots__ = ('_xfftn', '_M', '_opt', '__doc__', '_input_array', '_output_array')
+    __slots__ = ('_xfftn', '_M', '_opt', '__doc__', '_input_array', '_output_array', 'xp')
 
-    def __init__(self, xfftn_obj, input_array, output_array, M, opt):
+    def __init__(self, xfftn_obj, input_array, output_array, M, opt, xp=np):
         object.__setattr__(self, '_xfftn', xfftn_obj)
         object.__setattr__(self, '_opt', opt)
         object.__setattr__(self, '_M', M)
         object.__setattr__(self, '_input_array', input_array)
         object.__setattr__(self, '_output_array', output_array)
         object.__setattr__(self, '__doc__', xfftn_obj.__doc__)
+        object.__setattr__(self, 'xp', xp)
 
     @property
     def input_array(self):
@@ -177,9 +224,12 @@ def opt(self):
     def M(self):
         return object.__getattribute__(self, '_M')
 
+    def copyto(self, dst, src):
+        _copyto(dst, src, self.xp)
+
     def __call__(self, *args, **kwargs):
         self.opt.update(kwargs)
-        self.output_array[...] = self.xfftn(self.input_array, **self.opt)
+        self.copyto(self._output_array, self.xfftn(self.input_array, **self.opt))
         if abs(self.M-1) > 1e-8:
             self._output_array *= self.M
         return self.output_array
@@ -188,13 +238,14 @@ class _Xfftn_wrap(object):
     #Common interface for all serial transforms
     # pylint: disable=too-few-public-methods
 
-    __slots__ = ('_xfftn', '__doc__', '_input_array', '_output_array')
+    __slots__ = ('_xfftn', '__doc__', '_input_array', '_output_array', 'xp')
 
-    def __init__(self, xfftn_obj, input_array, output_array):
+    def __init__(self, xfftn_obj, input_array, output_array, xp=np):
         object.__setattr__(self, '_xfftn', xfftn_obj)
         object.__setattr__(self, '_input_array', input_array)
         object.__setattr__(self, '_output_array', output_array)
         object.__setattr__(self, '__doc__', xfftn_obj.__doc__)
+        object.__setattr__(self, 'xp', xp)
 
     @property
     def input_array(self):
@@ -208,12 +259,15 @@ def output_array(self):
     def xfftn(self):
         return object.__getattribute__(self, '_xfftn')
 
+    def copyto(self, dst, src):
+        _copyto(dst, src, self.xp)
+
     def __call__(self, input_array=None, output_array=None, **options):
         if input_array is not None:
-            self.input_array[...] = input_array
+            self.copyto(self.input_array, input_array)
         self.xfftn(**options)
         if output_array is not None:
-            output_array[...] = self.output_array
+            self.copyto(output_array, self.output_array)
             return output_array
         else:
             return self.output_array
@@ -310,7 +364,6 @@ def _padding_backward(self, trunc_array, padded_array):
                     su[axis] = -(N//2)
                     padded_array[tuple(su)] *= 0.5
 
-
 class FFT(FFTBase):
     """Class for serial FFT transforms
 
@@ -373,6 +426,7 @@ class FFT(FFTBase):
         output_array : array
 
     """
+
     def __init__(self, shape, axes=None, dtype=float, padding=False,
                  backend='fftw', transforms=None, **kw):
         FFTBase.__init__(self, shape, axes, dtype, padding)
@@ -380,8 +434,10 @@ def __init__(self, shape, axes=None, dtype=float, padding=False,
             'pyfftw': _Xfftn_plan_pyfftw,
             'fftw': _Xfftn_plan_fftw,
             'numpy': _Xfftn_plan_numpy,
+            'cupy': _Xfftn_plan_cupy,
             'mkl_fft': _Xfftn_plan_mkl,
             'scipy': _Xfftn_plan_scipy,
+            'cupyx-scipy': _Xfftn_plan_cupyx_scipy,
         }[backend]
         self.backend = backend
         self.fwd, self.bck = plan(self.shape, self.axes, self.dtype, transforms, kw)
@@ -391,19 +447,27 @@ def __init__(self, shape, axes=None, dtype=float, padding=False,
             self.M = 1./np.prod(np.take(self.shape, self.axes))
         else:
             self.M = self.fwd.get_normalization()
-        if backend == 'scipy':
+        if backend in ['scipy', 'cupyx-scipy']:
             self.real_transform = False # No rfftn/irfftn methods
+
         self.padding_factor = 1.0
         if padding is not False:
             self.padding_factor = padding[self.axes[-1]] if np.ndim(padding) else padding
+        xp = np
+        if 'cupy' in self.backend:
+            import cupy as cp
+            xp = cp
+
         if abs(self.padding_factor-1.0) > 1e-8:
             assert len(self.axes) == 1
             trunc_array = self._get_truncarray(shape, V.dtype)
-            self.forward = _Xfftn_wrap(self._forward, U, trunc_array)
-            self.backward = _Xfftn_wrap(self._backward, trunc_array, U)
+            if 'cupy' in self.backend:  # TODO: Skip detour via CPU
+                trunc_array = cp.array(trunc_array)
+            self.forward = _Xfftn_wrap(self._forward, U, trunc_array, xp=xp)
+            self.backward = _Xfftn_wrap(self._backward, trunc_array, U, xp=xp)
         else:
-            self.forward = _Xfftn_wrap(self._forward, U, V)
-            self.backward = _Xfftn_wrap(self._backward, V, U)
+            self.forward = _Xfftn_wrap(self._forward, U, V, xp=xp)
+            self.backward = _Xfftn_wrap(self._backward, V, U, xp=xp)
 
     def _forward(self, **kw):
         normalize = kw.pop('normalize', True)
diff --git a/mpi4py_fft/mpifft.py b/mpi4py_fft/mpifft.py
index 4abe5d8..5b9a04b 100644
--- a/mpi4py_fft/mpifft.py
+++ b/mpi4py_fft/mpifft.py
@@ -63,17 +63,18 @@ def __call__(self, input_array=None, output_array=None, **kw):
 
         """
         if input_array is not None:
-            self.input_array[...] = input_array
+            self._xfftn[0].copyto(self.input_array, input_array)
 
         for i in range(len(self._transfer)):
             self._xfftn[i](**kw)
             arrayA = self._xfftn[i].output_array
             arrayB = self._xfftn[i+1].input_array
             self._transfer[i](arrayA, arrayB)
+
         self._xfftn[-1](**kw)
 
         if output_array is not None:
-            output_array[...] = self.output_array
+            self._xfftn[-1].copyto(output_array, self.output_array)
             return output_array
         else:
             return self.output_array
@@ -128,6 +129,11 @@ class PFFT(object):
         fftn/ifftn for complex input arrays. Real-to-real transforms can be
         configured using this dictionary and real-to-real transforms from the
         :mod:`.fftw.xfftn` module. See Examples.
+    comm_backend : str, optional
+        Choose backend for communication. When using GPU based serial backends,
+        the "NCCL" backend or a "customMPI" backend can be be used in `Alltoallw`
+        operations to speedup GPU to GPU transfer. Keep in mind that this is used
+        alongside MPI and assumes one GPU per MPI rank is used.
 
     Other Parameters
     ----------------
@@ -201,7 +207,7 @@ class PFFT(object):
     """
     def __init__(self, comm, shape=None, axes=None, dtype=float, grid=None,
                  padding=False, collapse=False, backend='fftw',
-                 transforms=None, darray=None, **kw):
+                 transforms=None, darray=None, comm_backend='MPI', **kw):
         # pylint: disable=too-many-locals
         # pylint: disable=too-many-branches
         # pylint: disable=too-many-statements
@@ -311,6 +317,7 @@ def __init__(self, comm, shape=None, axes=None, dtype=float, grid=None,
         self.pencil = [None, None]
 
         axes = self.axes[-1]
+        Pencil.backend = comm_backend
         pencil = Pencil(self.subcomm, shape, axes[-1])
         xfftn = FFT(pencil.subshape, axes, dtype, padding, backend=backend,
                     transforms=transforms, **kw)
diff --git a/mpi4py_fft/pencil.py b/mpi4py_fft/pencil.py
index 6671d8c..637646d 100644
--- a/mpi4py_fft/pencil.py
+++ b/mpi4py_fft/pencil.py
@@ -29,6 +29,17 @@ def _subarraytypes(comm, shape, axis, subshape, dtype):
     return tuple(datatypes)
 
 
+def get_slice(subtype):
+    """
+    Extract from the subtype object generated for MPI what shape the buffer
+    should have and what part of the array we want to send / receive.
+    """
+    decoded = subtype.decode()
+    subsizes = decoded[2]['subsizes']
+    starts = decoded[2]['starts']
+    return tuple(slice(starts[i], starts[i] + subsizes[i]) for i in range(len(starts)))
+
+
 class Subcomm(tuple):
     r"""Class returning a tuple of subcommunicators of any dimensionality
 
@@ -155,6 +166,7 @@ def __init__(self,
                  comm, shape, dtype,
                  subshapeA, axisA,
                  subshapeB, axisB):
+
         self.comm = comm
         self.shape = tuple(shape)
         self.dtype = dtype = np.dtype(dtype)
@@ -179,8 +191,15 @@ def forward(self, arrayA, arrayB):
         assert self.subshapeB == arrayB.shape
         assert self.dtype == arrayA.dtype
         assert self.dtype == arrayB.dtype
-        self.comm.Alltoallw([arrayA, self._counts_displs, self._subtypesA],
-                            [arrayB, self._counts_displs, self._subtypesB])
+        self.Alltoallw(arrayA, self._subtypesA, arrayB, self._subtypesB)
+
+    def Alltoallw(self, arrayA, subtypesA, arrayB, subtypesB):
+        """
+        Redistribute arrayA to arrayB
+        """
+        self.comm.Alltoallw([arrayA, self._counts_displs, subtypesA],
+                            [arrayB, self._counts_displs, subtypesB])
+
 
     def backward(self, arrayB, arrayA):
         """Global redistribution from arrayB to arrayA
@@ -197,8 +216,7 @@ def backward(self, arrayB, arrayA):
         assert self.subshapeB == arrayB.shape
         assert self.dtype == arrayA.dtype
         assert self.dtype == arrayB.dtype
-        self.comm.Alltoallw([arrayB, self._counts_displs, self._subtypesB],
-                            [arrayA, self._counts_displs, self._subtypesA])
+        self.Alltoallw(arrayB, self._subtypesB, arrayA, self._subtypesA)
 
     def destroy(self):
         for datatype in self._subtypesA:
@@ -209,6 +227,196 @@ def destroy(self):
                 datatype.Free()
 
 
+class CustomMPITransfer(Transfer):
+
+    def Alltoallw(self, arrayA, subtypesA, arrayB, subtypesB):
+        """
+        Redistribute arrayA to arrayB.
+        """
+        if type(arrayA) == np.ndarray:
+            xp = np
+            def synchronize_stream():
+                pass
+        else:
+            import cupy as xp
+            synchronize_stream = xp.cuda.get_current_stream().synchronize
+
+        rank, size, comm = self.comm.rank, self.comm.size, self.comm
+
+        for i in range(1, size + 1):
+            send_to = (rank + i) % size
+            recv_from = (rank -i + size) % size
+
+            sliceA = get_slice(subtypesA[send_to])
+            sliceB = get_slice(subtypesB[recv_from])
+
+            if send_to == rank:
+                arrayB[sliceB][:] = arrayA[sliceA][:]
+            else:
+                recvbuf = xp.ascontiguousarray(arrayB[sliceB])
+                sendbuf = xp.ascontiguousarray(arrayA[sliceA])
+
+                synchronize_stream()
+                comm.Sendrecv(sendbuf, send_to, recvbuf=recvbuf, source=recv_from)
+                arrayB[sliceB][:] = recvbuf[:]
+
+
+class NCCLTransfer(Transfer):
+    """
+    Transfer class which uses NCCL for `Alltoallw` operations. The NCCL
+    communicator will share rank and size attributes with the MPI communicator.
+    In particular, this assumes one GPU per MPI rank.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.forward_graph = None
+        self.backward_graph = None
+
+        from cupy.cuda import nccl
+        self.comm_nccl = nccl.NcclCommunicator(self.comm.size, self.comm.bcast(nccl.get_unique_id(), root=0), self.comm.rank)
+
+        # determine how to send the data. If we have complex numbers, we need to send two real numbers.
+        if self.dtype in [np.dtype('float32'), np.dtype('complex64')]:
+            self.NCCL_dtype = nccl.NCCL_FLOAT32
+        elif self.dtype in [np.dtype('float64'), np.dtype('complex128')]:
+            self.NCCL_dtype = nccl.NCCL_FLOAT64
+        elif self.dtype in [np.dtype('int32')]:
+            self.NCCL_dtype = nccl.NCCL_INT32
+        elif self.dtype in [np.dtype('int64')]:
+            self.NCCL_dtype = nccl.NCCL_INT64
+        else:
+            raise NotImplementedError(f'Don\'t know what NCCL dtype to use to send data of dtype {self.dtype}!')
+        self.count_modifier = 2 if 'complex' in str(self.dtype) else 1
+
+        self.arrayA = None
+        self.arrayB = None
+
+    def _copy_to_input_arrays(self, arrayA, arrayB):
+        """
+        This is needed to make the CUDA graph work with arbitrary input data.
+
+        Parameters
+        ----------
+        arrayA : array
+            Data to be copied to the graph input
+        arrayB : array
+            Data to be copied to the graph output
+        """
+        if self.arrayA is None:
+            self.arrayA = arrayA
+        elif self.arrayA.data.ptr == arrayA.data.ptr:
+            pass
+        else:
+            self.arrayA[...] = arrayA
+
+        if self.arrayB is None:
+            self.arrayB = arrayB
+        elif self.arrayB.data.ptr == arrayB.data.ptr:
+            pass
+        else:
+            self.arrayB[...] = arrayB
+
+    def _retrieve_from_input_arrays(self, arrayA, arrayB):
+        """
+        This is needed to make the CUDA graph work with arbitrary input data.
+
+        Parameters
+        ----------
+        arrayA : array
+            Data to be copied from the graph input
+        arrayB : array
+            Data to be copied from the graph output
+        """
+        if self.arrayA.data.ptr == arrayA.data.ptr:
+            pass
+        else:
+            arrayA[...] = self.arrayA
+
+        if self.arrayB.data.ptr == arrayB.data.ptr:
+            pass
+        else:
+            arrayB[...] = self.arrayB
+
+    def backward(self, arrayB, arrayA):
+        """Global redistribution from arrayB to arrayA
+
+        Parameters
+        ----------
+        arrayB : array
+            Array of shape subshapeB, containing data to be redistributed
+        arrayA : array
+            Array of shape subshapeA, for receiving data
+
+        """
+        self._copy_to_input_arrays(arrayA, arrayB)
+        self.backward_graph = self.Alltoallw(self.arrayB, self._subtypesB, self.arrayA, self._subtypesA, graph=self.backward_graph)
+        self._retrieve_from_input_arrays(arrayA, arrayB)
+
+    def forward(self, arrayA, arrayB):
+        """Global redistribution from arrayA to arrayB
+
+        Parameters
+        ----------
+        arrayA : array
+            Array of shape subshapeA, containing data to be redistributed
+        arrayB : array
+            Array of shape subshapeB, for receiving data
+        """
+        self._copy_to_input_arrays(arrayA, arrayB)
+        self.forward_graph = self.Alltoallw(self.arrayA, self._subtypesA, self.arrayB, self._subtypesB, graph=self.forward_graph)
+        self._retrieve_from_input_arrays(arrayA, arrayB)
+
+    def Alltoallw(self, arrayA, subtypesA, arrayB, subtypesB, graph=None):
+        """
+        Redistribute arrayA to arrayB.
+        """
+        import cupy as cp
+        rank, size, comm = self.comm.rank, self.comm.size, self.comm_nccl
+
+        # record to a graph if we haven't already done so
+        if graph is None:
+            stream = cp.cuda.Stream(non_blocking=True)
+            with stream:
+                stream.begin_capture()
+
+                # initialize dictionaries required to overlap sends
+                recvbufs = {}
+                sendbufs = {}
+                sliceBs = {}
+
+                # perform all sends and receives in a single kernel to allow overlap
+                cp.cuda.nccl.groupStart()
+                for i in range(1, size + 1):
+
+                    send_to = (rank + i) % size
+                    recv_from = (rank -i + size) % size
+
+                    sliceA = get_slice(subtypesA[send_to])
+                    sliceBs[i] = get_slice(subtypesB[recv_from])
+
+                    recvbufs[i] = cp.ascontiguousarray(arrayB[sliceBs[i]])
+                    sendbufs[i] = cp.ascontiguousarray(arrayA[sliceA])
+
+                    comm.recv(recvbufs[i].data.ptr, recvbufs[i].size * self.count_modifier, self.NCCL_dtype, recv_from, stream.ptr)
+                    comm.send(sendbufs[i].data.ptr, sendbufs[i].size * self.count_modifier, self.NCCL_dtype, send_to, stream.ptr)
+                cp.cuda.nccl.groupEnd()
+
+                for i in recvbufs.keys():
+                    cp.copyto(arrayB[sliceBs[i]], recvbufs[i])
+
+                graph = stream.end_capture()
+
+        graph.launch(stream=cp.cuda.get_current_stream())
+        return graph
+
+    def destroy(self):
+        del self.forward_graph
+        del self.backward_graph
+        self.comm_nccl.destroy()
+        super().destroy()
+
+
 class Pencil(object):
     """Class to represent a distributed array (pencil)
 
@@ -274,6 +482,8 @@ class Pencil(object):
     aligned in axis 1.
 
     """
+    backend = 'MPI'
+
     def __init__(self, subcomm, shape, axis=-1):
         assert len(shape) >= 2
         assert min(shape) >= 1
@@ -349,6 +559,15 @@ def transfer(self, pencil, dtype):
         shape = list(penA.subshape)
         shape[axis] = penA.shape[axis]
 
-        return Transfer(comm, shape, dtype,
+        if self.backend == 'MPI':
+            transfer_class = Transfer
+        elif self.backend == 'NCCL':
+            transfer_class = NCCLTransfer
+        elif self.backend == 'customMPI':
+            transfer_class = CustomMPITransfer
+        else:
+            raise NotImplementedError(f'Don\'t have a transfer class for backend \"{self.backend}\"!')
+
+        return transfer_class(comm, shape, dtype,
                         penA.subshape, penA.axis,
                         penB.subshape, penB.axis)
diff --git a/tests/test_PFFT_cupy_backend.py b/tests/test_PFFT_cupy_backend.py
new file mode 100644
index 0000000..73a742f
--- /dev/null
+++ b/tests/test_PFFT_cupy_backend.py
@@ -0,0 +1,36 @@
+def test_PFFT_cupy_backend():
+    import numpy as np
+    import cupy as cp
+    from mpi4py import MPI
+    from mpi4py_fft import PFFT, newDistArray
+    
+    comm = MPI.COMM_WORLD
+    
+    # Set global size of the computational box
+    N = np.array([comm.size * 8] * 3, dtype=int)
+    expected_shape = (N[0] // comm.size, N[1], N[2])
+    axes = ((0,), (1, 2))
+    
+    backends = ['numpy', 'cupy']
+    FFTs = {backend: PFFT(comm, N, axes=axes, grid=(-1,), backend=backend) for backend in backends}
+    assert FFTs['numpy'].axes == FFTs['cupy'].axes
+    
+    us = {backend: newDistArray(FFTs[backend], forward_output=False) for backend in backends}
+    us['numpy'][:] = np.random.random(us['numpy'].shape).astype(us['numpy'].dtype)
+    us['cupy'][:] = cp.array(us['numpy'])
+    
+    
+    
+    for backend, xp in zip(backends, [np, cp]):
+        us['hat_' + backend] = newDistArray(FFTs[backend], forward_output=True)
+        us['hat_' + backend] = FFTs[backend].forward(us[backend], us['hat_' + backend])
+        us['back_and_forth_' + backend] = xp.zeros_like(us[backend])
+        us['back_and_forth_' + backend] = FFTs[backend].backward(us['hat_' + backend], us['back_and_forth_' + backend])
+    
+        assert xp.allclose(us[backend], us['back_and_forth_' + backend]), f'Got different values after back and forth transformation with {backend} backend'
+        assert np.allclose(us['back_and_forth_' + backend].shape, expected_shape), f"Got unexpected shape {us['back_and_forth_' + backend].shape} when expecting {expected_shape} with {backend} backend"
+    assert np.allclose(us['hat_cupy'].get(), us['hat_numpy']), 'Got different values in frequency space'
+
+
+if __name__ == '__main__':
+    test_PFFT_cupy_backend()
diff --git a/tests/test_darrayCuPy.py b/tests/test_darrayCuPy.py
new file mode 100644
index 0000000..fb0e1b0
--- /dev/null
+++ b/tests/test_darrayCuPy.py
@@ -0,0 +1,155 @@
+import cupy as cp
+from mpi4py import MPI
+from mpi4py_fft import DistArrayCuPy, newDistArray, PFFT
+from mpi4py_fft.pencil import Subcomm
+
+comm = MPI.COMM_WORLD
+
+
+def test_1Darray():
+    N = (8,)
+    z = DistArrayCuPy(N, val=2)
+    assert z[0] == 2
+    assert z.shape == N
+
+
+def test_2Darray():
+    N = (8, 8)
+    for subcomm in ((0, 1), (1, 0), None, Subcomm(comm, (0, 1))):
+        for rank in (0, 1, 2):
+            M = (2,) * rank + N
+            alignment = None
+            if subcomm is None and rank == 1:
+                alignment = 1
+            a = DistArrayCuPy(M, subcomm=subcomm, val=1, rank=rank, alignment=alignment)
+            assert a.rank == rank
+            assert a.global_shape == M
+            _ = a.substart
+            c = a.subcomm
+            z = a.commsizes
+            _ = a.pencil
+            assert cp.prod(cp.array(z)) == comm.Get_size()
+            if rank > 0:
+                a0 = a[0]
+                assert isinstance(a0, DistArrayCuPy)
+                assert a0.rank == rank - 1
+            aa = a.v
+            assert isinstance(aa, cp.ndarray)
+            try:
+                k = a.get((0,) * rank + (0, slice(None)))
+                if comm.Get_rank() == 0:
+                    assert len(k) == N[1]
+                    assert cp.sum(k) == N[1]
+                k = a.get((0,) * rank + (slice(None), 0))
+                if comm.Get_rank() == 0:
+                    assert len(k) == N[0]
+                    assert cp.sum(k) == N[0]
+            except ModuleNotFoundError:
+                pass
+            _ = a.local_slice()
+            newaxis = (a.alignment + 1) % 2
+            _ = a.get_pencil_and_transfer(newaxis)
+            a[:] = MPI.COMM_WORLD.Get_rank()
+            b = a.redistribute(newaxis)
+            a = b.redistribute(out=a)
+            a = b.redistribute(a.alignment, out=a)
+            s0 = MPI.COMM_WORLD.reduce(cp.linalg.norm(a) ** 2)
+            s1 = MPI.COMM_WORLD.reduce(cp.linalg.norm(b) ** 2)
+            if MPI.COMM_WORLD.Get_rank() == 0:
+                assert abs(s0 - s1) < 1e-1
+            c = a.redistribute(a.alignment)
+            assert c is a
+
+
+def test_3Darray():
+    N = (8, 8, 8)
+    for subcomm in (
+        (0, 0, 1),
+        (0, 1, 0),
+        (1, 0, 0),
+        (0, 1, 1),
+        (1, 0, 1),
+        (1, 1, 0),
+        None,
+        Subcomm(comm, (0, 0, 1)),
+    ):
+        for rank in (0, 1, 2):
+            M = (3,) * rank + N
+            alignment = None
+            if subcomm is None and rank == 1:
+                alignment = 2
+            a = DistArrayCuPy(M, subcomm=subcomm, val=1, rank=rank, alignment=alignment)
+            assert a.rank == rank
+            assert a.global_shape == M
+            _ = a.substart
+            _ = a.subcomm
+            z = a.commsizes
+            _ = a.pencil
+            assert cp.prod(cp.array(z)) == comm.Get_size()
+            if rank > 0:
+                a0 = a[0]
+                assert isinstance(a0, DistArrayCuPy)
+                assert a0.rank == rank - 1
+            if rank == 2:
+                a0 = a[0, 1]
+                assert isinstance(a0, DistArrayCuPy)
+                assert a0.rank == 0
+            aa = a.v
+            assert isinstance(aa, cp.ndarray)
+            try:
+                k = a.get((0,) * rank + (0, 0, slice(None)))
+                if comm.Get_rank() == 0:
+                    assert len(k) == N[2]
+                    assert cp.sum(k) == N[2]
+                k = a.get((0,) * rank + (slice(None), 0, 0))
+                if comm.Get_rank() == 0:
+                    assert len(k) == N[0]
+                    assert cp.sum(k) == N[0]
+            except ModuleNotFoundError:
+                pass
+            _ = a.local_slice()
+            newaxis = (a.alignment + 1) % 3
+            _ = a.get_pencil_and_transfer(newaxis)
+            a[:] = MPI.COMM_WORLD.Get_rank()
+            b = a.redistribute(newaxis)
+            a = b.redistribute(out=a)
+            s0 = MPI.COMM_WORLD.reduce(cp.linalg.norm(a) ** 2)
+            s1 = MPI.COMM_WORLD.reduce(cp.linalg.norm(b) ** 2)
+            if MPI.COMM_WORLD.Get_rank() == 0:
+                assert abs(s0 - s1) < 1e-1
+
+
+def test_newDistArray():
+    N = (8, 8, 8)
+    pfft = PFFT(MPI.COMM_WORLD, N, backend='cupy')
+    for forward_output in (True, False):
+        for view in (True, False):
+            for rank in (0, 1, 2):
+                a = newDistArray(
+                    pfft,
+                    forward_output=forward_output,
+                    rank=rank,
+                    view=view,
+                )
+                if view is False:
+                    assert isinstance(a, DistArrayCuPy)
+                    assert a.rank == rank
+                    if rank == 0:
+                        qfft = PFFT(MPI.COMM_WORLD, darray=a, backend='cupy')
+                    elif rank == 1:
+                        qfft = PFFT(MPI.COMM_WORLD, darray=a[0], backend='cupy')
+                    else:
+                        qfft = PFFT(MPI.COMM_WORLD, darray=a[0, 0], backend='cupy')
+                    qfft.destroy()
+
+                else:
+                    assert isinstance(a, cp.ndarray)
+                    assert a.base.rank == rank
+    pfft.destroy()
+
+
+if __name__ == "__main__":
+    test_1Darray()
+    test_2Darray()
+    test_3Darray()
+    test_newDistArray()
diff --git a/tests/test_libfft.py b/tests/test_libfft.py
index 23c6c8e..180fc11 100644
--- a/tests/test_libfft.py
+++ b/tests/test_libfft.py
@@ -7,18 +7,32 @@
 from mpi4py_fft.libfft import FFT
 
 has_backend = {'fftw': True}
-for backend in ('pyfftw', 'mkl_fft', 'scipy', 'numpy'):
+for backend in ('pyfftw', 'mkl_fft', 'scipy', 'numpy', 'cupy'):
     has_backend[backend] = True
     try:
         importlib.import_module(backend)
     except ImportError:
         has_backend[backend] = False
 
+if has_backend['cupy']:
+    import cupy as cp
+    has_backend['cupyx-scipy'] = True
+
 abstol = dict(f=5e-5, d=1e-14, g=1e-14)
 
+def get_xpy(backend=None, array=None):
+    if backend in ['cupy', 'cupyx-scipy']:
+        return cp
+    if has_backend['cupy'] and array is not None:
+        if type(array) == cp.ndarray:
+            return cp
+    return np
+
+
 def allclose(a, b):
     atol = abstol[a.dtype.char.lower()]
-    return np.allclose(a, b, rtol=0, atol=atol)
+    xp = get_xpy(array=a)
+    return xp.allclose(a, b, rtol=0, atol=atol)
 
 def test_libfft():
     from itertools import product
@@ -30,9 +44,10 @@ def test_libfft():
         if fftw.get_fftw_lib(t):
             types += t+t.upper()
 
-    for backend in ('pyfftw', 'mkl_fft', 'scipy', 'numpy', 'fftw'):
+    for backend in has_backend.keys():
         if has_backend[backend] is False:
             continue
+        xp = get_xpy(backend=backend)
         t0 = 0
         for typecode in types:
             for dim in dims:
@@ -47,7 +62,7 @@ def test_libfft():
                                 A = fft.forward.input_array
                                 B = fft.forward.output_array
 
-                                A[...] = np.random.random(A.shape).astype(typecode)
+                                A[...] = xp.random.random(A.shape).astype(typecode)
                                 X = A.copy()
 
                                 B.fill(0)
@@ -59,14 +74,15 @@ def test_libfft():
                                 t0 -= time()
                                 A = fft.backward(B, A)
                                 t0 += time()
-                                assert allclose(A, X)
+                                assert allclose(A, X), f'Failed with {backend} backend with type \"{typecode}\" in {dim} dims!'
         print('backend: ', backend, t0)
     # Padding is different because the physical space is padded and as such
     # difficult to initialize. We solve this problem by making one extra
     # transform
-    for backend in ('pyfftw', 'mkl_fft', 'scipy', 'numpy', 'fftw'):
+    for backend in has_backend.keys():
         if has_backend[backend] is False:
             continue
+        xp = get_xpy(backend=backend)
         for padding in (1.5, 2.0):
             for typecode in types:
                 for dim in dims:
@@ -84,7 +100,7 @@ def test_libfft():
                             A = fft.forward.input_array
                             B = fft.forward.output_array
 
-                            A[...] = np.random.random(A.shape).astype(typecode)
+                            A[...] = xp.random.random(A.shape).astype(typecode)
 
                             B.fill(0)
                             B = fft.forward(A, B)
@@ -95,11 +111,12 @@ def test_libfft():
 
                             B.fill(0)
                             B = fft.forward(A, B)
-                            assert allclose(B, X), np.linalg.norm(B-X)
+                            assert allclose(B, X), xp.linalg.norm(B-X)
 
-    for backend in ('pyfftw', 'mkl_fft', 'scipy', 'numpy', 'fftw'):
+    for backend in has_backend.keys():
         if has_backend[backend] is False:
             continue
+        xp = get_xpy(backend=backend)
 
         if backend == 'fftw':
             dctn = functools.partial(fftw.dctn, type=3)
@@ -113,6 +130,9 @@ def test_libfft():
         elif backend == 'numpy':
             transforms = {(1,): (np.fft.rfftn, np.fft.irfftn),
                           (0, 1): (np.fft.rfftn, np.fft.irfftn)}
+        elif backend == 'cupy':
+            transforms = {(1,): (cp.fft.rfftn, cp.fft.irfftn),
+                          (0, 1): (cp.fft.rfftn, cp.fft.irfftn)}
         elif backend == 'mkl_fft':
             import mkl_fft
             transforms = {(1,): (mkl_fft._numpy_fft.rfftn, mkl_fft._numpy_fft.irfftn),
@@ -121,12 +141,17 @@ def test_libfft():
             from scipy.fftpack import fftn, ifftn
             transforms = {(1,): (fftn, ifftn),
                           (0, 1): (fftn, ifftn)}
+        elif backend == 'cupyx-scipy':
+            from scipy.fftpack import fftn, ifftn
+            import cupyx.scipy.fft as fftlib
+            transforms = {(1,): (fftlib.fftn, fftlib.ifftn),
+                          (0, 1): (fftlib.fftn, fftlib.ifftn)}
 
         for axis in ((1,), (0, 1)):
             fft = FFT(shape, axis, backend=backend, transforms=transforms)
             A = fft.forward.input_array
             B = fft.forward.output_array
-            A[...] = np.random.random(A.shape)
+            A[...] = xp.random.random(A.shape)
             X = A.copy()
             B.fill(0)
             B = fft.forward(A, B)
diff --git a/tests/test_pencil.py b/tests/test_pencil.py
index 9a74c40..6985e46 100644
--- a/tests/test_pencil.py
+++ b/tests/test_pencil.py
@@ -11,53 +11,74 @@ def test_pencil():
     sizes = (7, 8, 9)
     types = 'fdFD' #'hilfdgFDG'
 
+    backends = ['MPI', 'customMPI']
+
+    xp = {
+        'MPI': np,
+        'customMPI': np,
+    }
+
+    try:
+        import cupy as cp
+        from cupy.cuda import nccl
+        backends += ['NCCL']
+        xp['NCCL'] = cp
+    except ImportError:
+        pass
+
     for typecode in types:
         for dim in dims:
             for shape in product(*([sizes]*dim)):
-                axes = list(range(dim))
-                for axis1, axis2, axis3 in product(axes, axes, axes):
+                for backend in backends:
+                     
+                    Pencil.backend = backend
+                    axes = list(range(dim))
+                     
+                    for axis1, axis2, axis3 in product(axes, axes, axes):
+
+                        if axis1 == axis2: continue
+                        if axis2 == axis3: continue
+                        axis3 -= len(shape)
+                        #if comm.rank == 0:
+                        #    print(shape, axis1, axis2, axis3, typecode)
 
-                    if axis1 == axis2: continue
-                    if axis2 == axis3: continue
-                    axis3 -= len(shape)
-                    #if comm.rank == 0:
-                    #    print(shape, axis1, axis2, axis3, typecode)
+                        for pdim in [None] + list(range(1, dim-1)):
 
-                    for pdim in [None] + list(range(1, dim-1)):
+                            subcomm = Subcomm(comm, pdim)
+                            pencil0 = Pencil(subcomm, shape)
 
-                        subcomm = Subcomm(comm, pdim)
-                        pencil0 = Pencil(subcomm, shape)
+                            pencilA = pencil0.pencil(axis1)
+                            pencilB = pencilA.pencil(axis2)
+                            pencilC = pencilB.pencil(axis3)
 
-                        pencilA = pencil0.pencil(axis1)
-                        pencilB = pencilA.pencil(axis2)
-                        pencilC = pencilB.pencil(axis3)
+                            assert pencilC.backend == backend
 
-                        trans1 = Pencil.transfer(pencilA, pencilB, typecode)
-                        trans2 = Pencil.transfer(pencilB, pencilC, typecode)
+                            trans1 = Pencil.transfer(pencilA, pencilB, typecode)
+                            trans2 = Pencil.transfer(pencilB, pencilC, typecode)
 
-                        X = np.random.random(pencilA.subshape).astype(typecode)
+                            X = xp[backend].random.random(pencilA.subshape).astype(typecode)
 
-                        A = np.empty(pencilA.subshape, dtype=typecode)
-                        B = np.empty(pencilB.subshape, dtype=typecode)
-                        C = np.empty(pencilC.subshape, dtype=typecode)
+                            A = xp[backend].empty(pencilA.subshape, dtype=typecode)
+                            B = xp[backend].empty(pencilB.subshape, dtype=typecode)
+                            C = xp[backend].empty(pencilC.subshape, dtype=typecode)
 
-                        A[...] = X
+                            A[...] = X
 
-                        B.fill(0)
-                        trans1.forward(A, B)
-                        C.fill(0)
-                        trans2.forward(B, C)
+                            B.fill(0)
+                            trans1.forward(A, B)
+                            C.fill(0)
+                            trans2.forward(B, C)
 
-                        B.fill(0)
-                        trans2.backward(C, B)
-                        A.fill(0)
-                        trans1.backward(B, A)
+                            B.fill(0)
+                            trans2.backward(C, B)
+                            A.fill(0)
+                            trans1.backward(B, A)
 
-                        assert np.allclose(A, X)
+                            assert xp[backend].allclose(A, X)
 
-                        trans1.destroy()
-                        trans2.destroy()
-                        subcomm.destroy()
+                            trans1.destroy()
+                            trans2.destroy()
+                            subcomm.destroy()
 
 
 if __name__ == '__main__':
diff --git a/tests/test_transfer_classes.py b/tests/test_transfer_classes.py
new file mode 100644
index 0000000..f979730
--- /dev/null
+++ b/tests/test_transfer_classes.py
@@ -0,0 +1,92 @@
+from mpi4py import MPI
+from mpi4py_fft.pencil import Transfer, CustomMPITransfer, Pencil, Subcomm
+import numpy as np
+
+transfer_classes = [CustomMPITransfer]
+xps = {CustomMPITransfer: np}
+
+try:
+    import cupy as cp
+    from mpi4py_fft.pencil import NCCLTransfer
+    transfer_classes += [NCCLTransfer]
+    xps[NCCLTransfer] = cp
+except ModuleNotFoundError:
+    pass
+
+
+def get_args(comm, shape, dtype):
+    subcomm = Subcomm(comm=comm, dims=None)
+    pencilA = Pencil(subcomm, shape, 0)
+    pencilB = Pencil(subcomm, shape, 1)
+
+    kwargs = {
+        'comm': comm,
+        'shape': shape,
+        'subshapeA': pencilA.subshape,
+        'subshapeB': pencilB.subshape,
+        'axisA': 0,
+        'axisB': 1,
+        'dtype': dtype,
+    }
+    return kwargs
+
+
+def get_arrays(kwargs, xp):
+    arrayA = xp.zeros(shape=kwargs['subshapeA'], dtype=kwargs['dtype'])
+    arrayB = xp.zeros(shape=kwargs['subshapeB'], dtype=kwargs['dtype'])
+
+    arrayA[:] = xp.random.random(arrayA.shape).astype(arrayA.dtype)
+    return arrayA, arrayB
+ 
+
+def single_test_all_to_allw(transfer_class, shape, dtype, comm=None, xp=None):
+    comm = comm if comm else MPI.COMM_WORLD
+    kwargs = get_args(comm, shape, dtype)
+    arrayA, arrayB = get_arrays(kwargs, xp)
+    arrayB_ref = arrayB.copy()
+
+    transfer = transfer_class(**kwargs)
+    reference_transfer = Transfer(**kwargs)
+
+    transfer.Alltoallw(arrayA, transfer._subtypesA, arrayB, transfer._subtypesB)
+    reference_transfer.Alltoallw(arrayA, transfer._subtypesA, arrayB_ref, transfer._subtypesB)
+    assert xp.allclose(arrayB, arrayB_ref), f'Did not get the same result from `alltoallw` with {transfer_class.__name__} transfer class as MPI implementation on rank {comm.rank}!'
+
+    comm.Barrier()
+    if comm.rank == 0:
+        print(f'{transfer_class.__name__} passed alltoallw test with shape {shape} and dtype {dtype}')
+
+
+def single_test_forward_backward(transfer_class, shape, dtype, comm=None, xp=None):
+    comm = comm if comm else MPI.COMM_WORLD
+    kwargs = get_args(comm, shape, dtype)
+    arrayA, arrayB = get_arrays(kwargs, xp)
+    arrayA_ref = arrayA.copy()
+
+    transfer = transfer_class(**kwargs)
+
+    transfer.forward(arrayA, arrayB)
+    transfer.backward(arrayB, arrayA)
+    assert xp.allclose(arrayA, arrayA_ref), f'Did not get the same result when transferring back and forth with {transfer_class.__name__} transfer class on rank {comm.rank}!'
+
+    comm.Barrier()
+    if comm.rank == 0:
+        print(f'{transfer_class.__name__} passed forward/backward test with shape {shape} and dtype {dtype}')
+
+
+def test_transfer_class():
+    dims = (2, 3)
+    sizes = (7, 8, 9, 128)
+    dtypes = 'fFdD'
+
+    shapes = [[size] * dim for size in sizes for dim in dims] + [[32, 256, 129]]
+
+    for transfer_class in transfer_classes:
+        for shape in shapes:
+            for dtype in dtypes:
+                single_test_all_to_allw(transfer_class, shape, dtype, xp=xps[transfer_class])
+                single_test_forward_backward(transfer_class, shape, dtype, xp=xps[transfer_class])
+
+
+if __name__ == '__main__':
+    test_transfer_class()