diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml
index fd7cc8242..a097c606c 100644
--- a/.github/workflows/ci-additional.yaml
+++ b/.github/workflows/ci-additional.yaml
@@ -65,10 +65,6 @@ jobs:
       - name: Install flox
         run: |
           python -m pip install --no-deps -e .
-      - name: Version info
-        run: |
-          conda info -a
-          conda list
       - name: Run doctests
         run: |
           python -m pytest --doctest-modules flox --ignore flox/tests
@@ -97,22 +93,27 @@ jobs:
         uses: mamba-org/provision-with-micromamba@v14
         with:
           environment-file: ${{env.CONDA_ENV_FILE}}
-          environment-name: xarray-tests
+          environment-name: flox-tests
           extra-specs: |
             python=${{env.PYTHON_VERSION}}
+            lxml
           cache-env: true
           cache-env-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}"
-      - name: Install xarray
+      - name: Install flox
         run: |
           python -m pip install --no-deps -e .
-      - name: Version info
-        run: |
-          conda info -a
-          conda list
       - name: Install mypy
         run: |
           python -m pip install mypy
       - name: Run mypy
         run: |
-          python -m mypy --install-types --non-interactive
+          python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report
+      - name: Upload mypy coverage to Codecov
+        uses: codecov/codecov-action@v3.1.1
+        with:
+          file: mypy_report/cobertura.xml
+          flags: mypy
+          env_vars: PYTHON_VERSION
+          name: codecov-umbrella
+          fail_ci_if_error: false
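Two notes on the workflow change above: the removed "Version info" steps dropped the `conda info`/`conda list` debugging output, and the mypy step now writes a Cobertura-format type-coverage report that the new codecov-action step uploads under the `mypy` flag. mypy's XML report output requires `lxml`, which is presumably why `lxml` is added to the environment's `extra-specs`. To reproduce locally (assuming mypy and lxml are installed): `python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report`, which writes `mypy_report/cobertura.xml`.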
diff --git a/conftest.py b/conftest.py
new file mode 100644
index 000000000..c5c121d32
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,13 @@
+"""Configuration for pytest."""
+
+import pytest
+
+
+@pytest.fixture(scope="module", params=["flox", "numpy", "numba"])
+def engine(request):
+    if request.param == "numba":
+        try:
+            import numba  # noqa: F401
+        except ImportError:
+            pytest.xfail()
+    return request.param
diff --git a/tests/__init__.py b/flox/tests/__init__.py
similarity index 93%
rename from tests/__init__.py
rename to flox/tests/__init__.py
index b1a266652..e2b8d8584 100644
--- a/tests/__init__.py
+++ b/flox/tests/__init__.py
@@ -125,13 +125,3 @@ def assert_equal_tuple(a, b):
             np.testing.assert_array_equal(a_, b_)
         else:
             assert a_ == b_
-
-
-@pytest.fixture(scope="module", params=["flox", "numpy", "numba"])
-def engine(request):
-    if request.param == "numba":
-        try:
-            import numba
-        except ImportError:
-            pytest.xfail()
-    return request.param
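With the fixture moved to a repository-root `conftest.py`, pytest injects `engine` by argument name into any test that requests it, so the explicit `engine` imports removed below are no longer needed. A minimal sketch of a consuming test (hypothetical name, not part of the diff):

```python
# pytest resolves the `engine` argument against the fixture in conftest.py;
# no import is needed in the test module.
def test_runs_per_engine(engine):
    # Executes three times: engine == "flox", "numpy", "numba".
    # The numba run is marked xfail by the fixture when numba is missing.
    assert engine in ("flox", "numpy", "numba")
```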
diff --git a/tests/test_core.py b/flox/tests/test_core.py
similarity index 91%
rename from tests/test_core.py
rename to flox/tests/test_core.py
index 3270e6151..1921edaa7 100644
--- a/tests/test_core.py
+++ b/flox/tests/test_core.py
@@ -2,7 +2,7 @@
 
 import itertools
 from functools import partial, reduce
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 import pandas as pd
@@ -26,7 +26,6 @@
 from . import (
     assert_equal,
     assert_equal_tuple,
-    engine,
     has_dask,
     raise_if_dask_computes,
     requires_dask,
@@ -78,7 +77,11 @@ def dask_array_ones(*args):
 )
 
 if TYPE_CHECKING:
-    from flox.core import T_Engine, T_ExpectedGroupsOpt, T_Func2
+    from flox.core import T_Agg, T_Engine, T_ExpectedGroupsOpt, T_Method
+
+    # Let anything through in kwargs for code readability, will likely miss a lot of
+    # type errors within these dicts though:
+    T_Kwargs = dict[str, Any]
 
 
 def test_alignment_error():
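A word on why `T_Kwargs` is `dict[str, Any]`: for a heterogeneous `dict(...)` literal mypy infers a joined value type such as `dict[str, object]`, and `**`-unpacking that into a precisely typed signature then fails to type-check. Annotating the dicts with an `Any` value type trades that error away, as the comment in the hunk above concedes. A rough self-contained illustration (not code from this repository):

```python
from typing import Any

def reduce_it(func: str, fill_value: float = 0.0) -> None:
    ...

narrow = dict(func="sum", fill_value=123)  # mypy infers dict[str, object]
reduce_it(**narrow)  # error: expected "str" for func, got "object"

loose: dict[str, Any] = dict(func="sum", fill_value=123)
reduce_it(**loose)  # type-checks: Any is compatible with every parameter
```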
@@ -119,7 +122,7 @@
 )
 def test_groupby_reduce(
     engine: T_Engine,
-    func: T_Func2,
+    func: T_Agg,
     array: np.ndarray,
     by: np.ndarray,
     expected: list[float],
@@ -157,10 +160,12 @@
     assert_equal(expected_result, result)
 
 
-def gen_array_by(size, func):
+def gen_array_by(
+    size: tuple[int, ...], func: str
+) -> tuple[np.ndarray[Any, Any], np.ndarray[Any, Any]]:
     by = np.ones(size[-1])
     rng = np.random.default_rng(12345)
-    array = rng.random(size)
+    array: np.ndarray[Any, Any] = rng.random(size)
     if "nan" in func and "nanarg" not in func:
         array[[1, 4, 5], ...] = np.nan
     elif "nanarg" in func and len(size) > 1:
@@ -175,7 +180,9 @@ def gen_array_by(size, func):
 @pytest.mark.parametrize("size", ((12,), (12, 9)))
 @pytest.mark.parametrize("add_nan_by", [True, False])
 @pytest.mark.parametrize("func", ALL_FUNCS)
-def test_groupby_reduce_all(nby, size, chunks, func, add_nan_by, engine):
+def test_groupby_reduce_all(
+    nby, size: tuple[int, ...], chunks, func: str, add_nan_by: bool, engine: T_Engine
+) -> None:
     if chunks is not None and not has_dask:
         pytest.skip()
     if "arg" in func and engine == "flox":
@@ -184,15 +191,14 @@
     array, by = gen_array_by(size, func)
     if chunks:
         array = dask.array.from_array(array, chunks=chunks)
-    by = (by,) * nby
-    by = [b + idx for idx, b in enumerate(by)]
+    bys = [by] * nby
+    bys = [b + idx for idx, b in enumerate(bys)]
     if add_nan_by:
         for idx in range(nby):
-            by[idx][2 * idx : 2 * idx + 3] = np.nan
-    by = tuple(by)
-    nanmask = reduce(np.logical_or, (np.isnan(b) for b in by))
+            bys[idx][2 * idx : 2 * idx + 3] = np.nan
+    nanmask = reduce(np.logical_or, (np.isnan(b) for b in bys))
 
-    finalize_kwargs = [{}]
+    finalize_kwargs: list[T_Kwargs] = [{}]
     if "var" in func or "std" in func:
         finalize_kwargs = finalize_kwargs + [{"ddof": 1}, {"ddof": 0}]
     fill_value = np.nan
@@ -202,7 +208,9 @@
     tolerance = None
 
     for kwargs in finalize_kwargs:
-        flox_kwargs = dict(func=func, engine=engine, finalize_kwargs=kwargs, fill_value=fill_value)
+        flox_kwargs: T_Kwargs = dict(
+            func=func, engine=engine, finalize_kwargs=kwargs, fill_value=fill_value
+        )
         with np.errstate(invalid="ignore", divide="ignore"):
             if "arg" in func and add_nan_by:
                 array[..., nanmask] = np.nan
@@ -212,7 +220,7 @@
             for _ in range(nby):
                 expected = np.expand_dims(expected, -1)
 
-        actual, *groups = groupby_reduce(array, *by, **flox_kwargs)
+        actual, *groups = groupby_reduce(array, *bys, **flox_kwargs)
         assert actual.ndim == (array.ndim + nby - 1)
         assert expected.ndim == (array.ndim + nby - 1)
         expected_groups = tuple(np.array([idx + 1.0]) for idx in range(nby))
@@ -229,7 +237,7 @@
         params.extend(itertools.product(["cohorts"], [False, None]))
         for method, reindex in params:
             call = partial(
-                groupby_reduce, array, *by, method=method, reindex=reindex, **flox_kwargs
+                groupby_reduce, array, *bys, method=method, reindex=reindex, **flox_kwargs
             )
             if "arg" in func and reindex is True:
                 # simple_combine with argreductions not supported right now
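The `by` → `bys` rename in `test_groupby_reduce_all` is not only cosmetic: the old code rebound a single name first to one array, then to a tuple, then to a list, which gives a type checker a moving target even with mypy's `allow_redefinition` option, and the intermediate `by = tuple(by)` step disappears along with it. Keeping `by` for a single label array and `bys` for the collection leaves each name with one stable type.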
@@ -252,7 +260,7 @@
 @requires_dask
 @pytest.mark.parametrize("size", ((12,), (12, 5)))
 @pytest.mark.parametrize("func", ("argmax", "nanargmax", "argmin", "nanargmin"))
-def test_arg_reduction_dtype_is_int(size, func):
+def test_arg_reduction_dtype_is_int(size: tuple[int, ...], func: str) -> None:
     """avoid bugs being hidden by the xfail in the above test."""
 
     rng = np.random.default_rng(12345)
@@ -272,14 +280,14 @@
     assert actual.dtype.kind == "i"
 
 
-def test_groupby_reduce_count():
+def test_groupby_reduce_count() -> None:
     array = np.array([0, 0, np.nan, np.nan, np.nan, 1, 1])
     labels = np.array(["a", "b", "b", "b", "c", "c", "c"])
     result, _ = groupby_reduce(array, labels, func="count")
     assert_equal(result, np.array([1, 1, 2], dtype=np.int64))
 
 
-def test_func_is_aggregation():
+def test_func_is_aggregation() -> None:
     from flox.aggregations import mean
 
     array = np.array([0, 0, np.nan, np.nan, np.nan, 1, 1])
@@ -292,14 +300,14 @@
 @requires_dask
 @pytest.mark.parametrize("func", ("sum", "prod"))
 @pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
-def test_groupby_reduce_preserves_dtype(dtype, func):
+def test_groupby_reduce_preserves_dtype(dtype, func: str) -> None:
     array = np.ones((2, 12), dtype=dtype)
     by = np.array([labels] * 2)
     result, _ = groupby_reduce(from_array(array, chunks=(-1, 4)), by, func=func)
     assert result.dtype == array.dtype
 
 
-def test_numpy_reduce_nd_md():
+def test_numpy_reduce_nd_md() -> None:
     array = np.ones((2, 12))
     by = np.array([labels] * 2)
@@ -346,7 +354,16 @@
         ((10, 12), (3, 3), 3),  # form 3
     ],
 )
-def test_groupby_agg_dask(func, shape, array_chunks, group_chunks, add_nan, dtype, engine, reindex):
+def test_groupby_agg_dask(
+    func: str,
+    shape: tuple[int, ...],
+    array_chunks: tuple[int, ...],
+    group_chunks,
+    add_nan: bool,
+    dtype,
+    engine: T_Engine,
+    reindex: bool | None,
+) -> None:
     """Tests groupby_reduce with dask arrays against groupby_reduce with numpy arrays"""
 
     rng = np.random.default_rng(12345)
@@ -365,7 +382,7 @@
         labels[:3] = np.nan  # entire block is NaN when group_chunks=3
         labels[-2:] = np.nan
 
-    kwargs = dict(
+    kwargs: T_Kwargs = dict(
         func=func, expected_groups=[0, 1, 2], fill_value=False if func in ["all", "any"] else 123
     )
@@ -394,11 +411,11 @@
     assert_equal(expected, actual)
 
 
-def test_numpy_reduce_axis_subset(engine):
+def test_numpy_reduce_axis_subset(engine: T_Engine) -> None:
     # TODO: add NaNs
     by = labels2d
     array = np.ones_like(by, dtype=np.int64)
-    kwargs = dict(func="count", engine=engine, fill_value=0)
+    kwargs: T_Kwargs = dict(func="count", engine=engine, fill_value=0)
     result, _ = groupby_reduce(array, by, **kwargs, axis=1)
     assert_equal(result, np.array([[2, 3], [2, 3]], dtype=np.int64))
@@ -427,7 +444,7 @@
 
 
 @requires_dask
-def test_dask_reduce_axis_subset():
+def test_dask_reduce_axis_subset() -> None:
     by = labels2d
     array = np.ones_like(by, dtype=np.int64)
@@ -482,12 +499,13 @@
 @pytest.mark.parametrize(
     "axis", [None, (0, 1, 2), (0, 1), (0, 2), (1, 2), 0, 1, 2, (0,), (1,), (2,)]
 )
-def test_groupby_reduce_axis_subset_against_numpy(func, axis, engine):
+def test_groupby_reduce_axis_subset_against_numpy(func: str, axis, engine: T_Engine) -> None:
     if "arg" in func and engine == "flox":
         pytest.skip()
     if not isinstance(axis, int) and "arg" in func and (axis is None or len(axis) > 1):
         pytest.skip()
 
+    fill_value: bool | float
     if func in ["all", "any"]:
         fill_value = False
     else:
@@ -527,7 +545,9 @@
         (None, [0], (1,)),  # global reduction; 0 shaped group axis; 1 group
     ],
 )
-def test_groupby_reduce_nans(reindex, chunks, axis, groups, expected_shape, engine):
+def test_groupby_reduce_nans(
+    reindex: bool | None, chunks, axis, groups, expected_shape: tuple[int, ...], engine: T_Engine
+) -> None:
     def _maybe_chunk(arr):
         if chunks:
             if not has_dask:
@@ -566,7 +586,9 @@ def _maybe_chunk(arr):
 @pytest.mark.parametrize(
     "expected_groups, reindex", [(None, None), (None, False), ([0, 1, 2], True), ([0, 1, 2], False)]
 )
-def test_groupby_all_nan_blocks_dask(expected_groups, reindex, engine):
+def test_groupby_all_nan_blocks_dask(
+    expected_groups, reindex: bool | None, engine: T_Engine
+) -> None:
     labels = np.array([0, 0, 2, 2, 2, 1, 1, 2, 2, 1, 1, 0])
     nan_labels = labels.astype(float)  # copy
     nan_labels[:5] = np.nan
@@ -590,7 +612,7 @@
 
 
 @pytest.mark.parametrize("axis", (0, 1, 2, -1))
-def test_reindex(axis):
+def test_reindex(axis: int) -> None:
     shape = [2, 2, 2]
     fill_value = 0
@@ -610,7 +632,7 @@
 
 
 @pytest.mark.xfail
-def test_bad_npg_behaviour():
+def test_bad_npg_behaviour() -> None:
     labels = np.array([0, 0, 2, 2, 2, 1, 1, 2, 2, 1, 1, 0], dtype=int)
     # fmt: off
     array = np.array([[1] * 12, [1] * 12])
@@ -627,7 +649,7 @@
 
 @pytest.mark.xfail
 @pytest.mark.parametrize("func", ("nanargmax", "nanargmin"))
-def test_npg_nanarg_bug(func):
+def test_npg_nanarg_bug(func: str) -> None:
     array = np.array([1, 1, 2, 1, 1, np.nan, 6, 1])
     labels = np.array([1, 1, 1, 1, 1, 1, 1, 1]) - 1
@@ -639,7 +661,9 @@
 @pytest.mark.parametrize("method", ["cohorts", "map-reduce"])
 @pytest.mark.parametrize("chunk_labels", [False, True])
 @pytest.mark.parametrize("chunks", ((), (1,), (2,)))
-def test_groupby_bins(chunk_labels, chunks, engine, method) -> None:
+def test_groupby_bins(
+    chunk_labels: bool, chunks: tuple[int, ...], engine: T_Engine, method: T_Method
+) -> None:
     array = [1, 1, 1, 1, 1, 1]
     labels = [0.2, 1.5, 1.9, 2, 3, 20]
@@ -687,7 +711,7 @@
         [(10,), (10,)],
     ],
 )
-def test_rechunk_for_blockwise(inchunks, expected):
+def test_rechunk_for_blockwise(inchunks: tuple[int, ...], expected: tuple[int, ...]) -> None:
     labels = np.array([1, 1, 1, 2, 2, 3, 3, 5, 5, 5])
     assert _get_optimal_chunks_for_groups(inchunks, labels) == expected
@@ -710,7 +734,7 @@
         ],
     ],
 )
-def test_find_group_cohorts(expected, labels, chunks, merge):
+def test_find_group_cohorts(expected, labels, chunks: tuple[int, ...], merge: bool) -> None:
     actual = list(find_group_cohorts(labels, (chunks,), merge).values())
     assert actual == expected, (actual, expected)
@@ -724,7 +748,7 @@
         [3, ((3, 4, 3, 4, 3, 4, 3, 4, 2),)],
     ],
 )
-def test_rechunk_for_cohorts(chunk_at, expected):
+def test_rechunk_for_cohorts(chunk_at: int, expected) -> None:
     array = dask.array.ones((30,), chunks=7)
     labels = np.arange(0, 30) % 7
     rechunked = rechunk_for_cohorts(array, axis=-1, force_new_chunk_at=chunk_at, labels=labels)
@@ -734,7 +758,7 @@
 @pytest.mark.parametrize("chunks", [None, 3])
 @pytest.mark.parametrize("fill_value", [123, np.nan])
 @pytest.mark.parametrize("func", ALL_FUNCS)
-def test_fill_value_behaviour(func, chunks, fill_value, engine):
+def test_fill_value_behaviour(func: str, chunks, fill_value: float, engine: T_Engine) -> None:
     # fill_value = np.nan tests promotion of int counts to float
     # This is used by xarray
     if func in ["all", "any"] or "arg" in func:
@@ -765,7 +789,8 @@ def npfunc(x):
 @requires_dask
 @pytest.mark.parametrize("func", ["mean", "sum"])
 @pytest.mark.parametrize("dtype", ["float32", "float64", "int32", "int64"])
-def test_dtype_preservation(dtype, func, engine):
+def test_dtype_preservation(dtype: str, func: str, engine: T_Engine) -> None:
+    expected: np.typing.DTypeLike
     if func == "sum" or (func == "mean" and "float" in dtype):
         expected = np.dtype(dtype)
     elif func == "mean" and "int" in dtype:
@@ -786,7 +811,7 @@
     "labels_dtype", [pytest.param(np.int32, marks=pytest.mark.xfail), np.int64]
 )
 @pytest.mark.parametrize("method", ["map-reduce", "cohorts"])
-def test_cohorts_map_reduce_consistent_dtypes(method, dtype, labels_dtype):
+def test_cohorts_map_reduce_consistent_dtypes(method: T_Method, dtype, labels_dtype) -> None:
     repeats = np.array([4, 4, 12, 2, 3, 4], dtype=np.int32)
     labels = np.repeat(np.arange(6, dtype=labels_dtype), repeats)
     array = dask.array.from_array(labels.astype(dtype), chunks=(4, 8, 4, 9, 4))
@@ -804,7 +829,7 @@
 @pytest.mark.parametrize("func", ALL_FUNCS)
 @pytest.mark.parametrize("axis", (-1, None))
 @pytest.mark.parametrize("method", ["blockwise", "cohorts", "map-reduce", "split-reduce"])
-def test_cohorts_nd_by(func, method, axis, engine):
+def test_cohorts_nd_by(func: str, method: T_Method, axis: int | None, engine: T_Engine) -> None:
     o = dask.array.ones((3,), chunks=-1)
     o2 = dask.array.ones((2, 3), chunks=-1)
@@ -813,11 +838,12 @@
     by[0, 1] = 30
     by[2, 1] = 40
     by[0, 4] = 31
-    array = np.broadcast_to(array, (2, 3) + array.shape)
+    array = dask.array.broadcast_to(array, (2, 3) + array.shape)
 
     if "arg" in func and (axis is None or engine == "flox"):
         pytest.skip()
 
+    fill_value: bool | int
     if func in ["any", "all"]:
         fill_value = False
     else:
@@ -826,7 +852,9 @@
     if axis is not None and method != "map-reduce":
         pytest.xfail()
 
-    kwargs = dict(func=func, engine=engine, method=method, axis=axis, fill_value=fill_value)
+    kwargs: T_Kwargs = dict(
+        func=func, engine=engine, method=method, axis=axis, fill_value=fill_value
+    )
     actual, groups = groupby_reduce(array, by, **kwargs)
     expected, sorted_groups = groupby_reduce(array.compute(), by, **kwargs)
     assert_equal(groups, sorted_groups)
@@ -843,7 +871,9 @@
 @pytest.mark.parametrize("func", ["sum", "count"])
 @pytest.mark.parametrize("fill_value, expected", ((0, np.integer), (np.nan, np.floating)))
-def test_dtype_promotion(func, fill_value, expected, engine):
+def test_dtype_promotion(
+    func: str, fill_value: int, expected: np.typing.DTypeLike, engine: T_Engine
+) -> None:
     array = np.array([1, 1])
     by = [0, 1]
@@ -854,7 +884,7 @@
 
 
 @pytest.mark.parametrize("func", ["mean", "nanmean"])
-def test_empty_bins(func, engine):
+def test_empty_bins(func: str, engine: T_Engine) -> None:
     array = np.ones((2, 3, 2))
     by = np.broadcast_to([0, 1], array.shape)
@@ -871,7 +901,7 @@
     assert_equal(actual, expected)
 
 
-def test_datetime_binning():
+def test_datetime_binning() -> None:
     time_bins = pd.date_range(start="2010-08-01", end="2010-08-15", freq="24H")
     by = pd.date_range("2010-08-01", "2010-08-15", freq="15min")
@@ -887,7 +917,7 @@
 
 
 @pytest.mark.parametrize("func", ALL_FUNCS)
-def test_bool_reductions(func, engine):
+def test_bool_reductions(func: str, engine: T_Engine) -> None:
     if "arg" in func and engine == "flox":
         pytest.skip()
     groups = np.array([1, 1, 1])
@@ -905,7 +935,7 @@ def test_map_reduce_blockwise_mixed() -> None:
         dask.array.from_array(data.values, chunks=365),
         t.dt.month,
         func="mean",
-        method="split-reduce",
+        method="cohorts",
     )
     expected, _ = groupby_reduce(data, t.dt.month, func="mean")
     assert_equal(expected, actual)
@@ -913,8 +943,8 @@
 
 @requires_dask
 @pytest.mark.parametrize("method", ["split-reduce", "blockwise", "map-reduce", "cohorts"])
-def test_group_by_datetime(engine, method):
-    kwargs = dict(
+def test_group_by_datetime(engine: T_Engine, method: T_Method) -> None:
+    kwargs: T_Kwargs = dict(
         func="mean",
         method=method,
         engine=engine,
@@ -946,7 +976,7 @@
     assert_equal(expected, actual)
 
 
-def test_factorize_values_outside_bins():
+def test_factorize_values_outside_bins() -> None:
 
     vals = factorize_(
         (np.arange(10).reshape(5, 2), np.arange(10).reshape(5, 2)),
@@ -1033,7 +1063,7 @@ def test_multiple_groupers_errors() -> None:
 
 
 def test_factorize_reindex_sorting_strings():
-    kwargs = dict(
+    kwargs: T_Kwargs = dict(
         by=(np.array(["El-Nino", "La-Nina", "boo", "Neutral"]),),
         axis=-1,
         expected_groups=(np.array(["El-Nino", "Neutral", "foo", "La-Nina"]),),
@@ -1052,8 +1082,8 @@
     assert_equal(expected, np.array([0, 1, 3, 2], dtype=np.int64))
 
 
-def test_factorize_reindex_sorting_ints():
-    kwargs = dict(
+def test_factorize_reindex_sorting_ints() -> None:
+    kwargs: T_Kwargs = dict(
         by=(np.array([-10, 1, 10, 2, 3, 5]),),
         axis=-1,
         expected_groups=(np.array([0, 1, 2, 3, 4, 5], np.int64),),
@@ -1075,7 +1105,7 @@
 
 
 @requires_dask
-def test_custom_aggregation_blockwise():
+def test_custom_aggregation_blockwise() -> None:
     def grouped_median(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None):
         return aggregate(
             group_idx,
@@ -1098,7 +1128,8 @@ def grouped_median(group_idx, array, *, axis=-1, size=None, fill_value=None, dty
     expected = np.median(array, axis=-1, keepdims=True)
     assert_equal(expected, actual)
 
-    for method in ["map-reduce", "cohorts", "split-reduce"]:
+    methods: list[T_Method] = ["map-reduce", "cohorts"]
+    for method in methods:
         with pytest.raises(NotImplementedError):
             groupby_reduce(
                 dask.array.from_array(array, chunks=(1, -1)),
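Note the method-string cleanup running through this file: `method="split-reduce"` becomes `method="cohorts"`, and the now-typed list in `test_custom_aggregation_blockwise` is just `["map-reduce", "cohorts"]`. In flox, "split-reduce" is the legacy xarray-era name for the cohorts strategy, so these substitutions are behavior-preserving; presumably the `T_Method` literal does not admit the deprecated alias, which would also explain the explicit `list[T_Method]` annotation. The runtime parametrizations in `test_cohorts_nd_by` and `test_group_by_datetime` still pass `"split-reduce"` as a plain string.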
diff --git a/tests/test_xarray.py b/flox/tests/test_xarray.py
similarity index 84%
rename from tests/test_xarray.py
rename to flox/tests/test_xarray.py
index 0bee41c15..963fe9c1c 100644
--- a/tests/test_xarray.py
+++ b/flox/tests/test_xarray.py
@@ -1,3 +1,7 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 import numpy as np
 import pandas as pd
 import pytest
@@ -8,7 +12,7 @@
 
 from flox.xarray import rechunk_for_blockwise, resample_reduce, xarray_reduce
 
-from . import assert_equal, engine, has_dask, raise_if_dask_computes, requires_dask
+from . import assert_equal, has_dask, raise_if_dask_computes, requires_dask
 
 # isort: off
 if has_dask:
@@ -23,6 +27,9 @@
 except ValueError:
     pass
 
+if TYPE_CHECKING:
+    from flox.core import T_Engine
+
 tolerance64 = {"rtol": 1e-15, "atol": 1e-18}
 np.random.seed(123)
 
@@ -32,7 +39,9 @@
 @pytest.mark.parametrize("min_count", [None, 1, 3])
 @pytest.mark.parametrize("add_nan", [True, False])
 @pytest.mark.parametrize("skipna", [True, False])
-def test_xarray_reduce(skipna, add_nan, min_count, engine, reindex):
+def test_xarray_reduce(
+    skipna: bool, add_nan: bool, min_count: int | None, engine: T_Engine, reindex: bool | None
+) -> None:
     arr = np.ones((4, 12))
 
     if add_nan:
@@ -80,7 +89,9 @@
 # TODO: sort
 @pytest.mark.parametrize("pass_expected_groups", [True, False])
 @pytest.mark.parametrize("chunk", (True, False))
-def test_xarray_reduce_multiple_groupers(pass_expected_groups, chunk, engine):
+def test_xarray_reduce_multiple_groupers(
+    pass_expected_groups: bool, chunk: bool, engine: T_Engine
+) -> None:
     if not has_dask and chunk:
         pytest.skip()
 
@@ -104,32 +115,43 @@
         coords={"labels": ["a", "b", "c", "f"], "labels2": [1, 2]},
     ).expand_dims(z=4)
 
-    kwargs = dict(func="count", engine=engine)
+    func = "count"
+    expected_groups = None
     if pass_expected_groups:
-        kwargs["expected_groups"] = (expected.labels.data, expected.labels2.data)
+        expected_groups = (expected.labels.data, expected.labels2.data)
 
     with raise_if_dask_computes():
-        actual = xarray_reduce(da, da.labels, da.labels2, **kwargs)
+        actual = xarray_reduce(
+            da, da.labels, da.labels2, func=func, expected_groups=expected_groups, engine=engine
+        )
     xr.testing.assert_identical(expected, actual)
 
     with raise_if_dask_computes():
-        actual = xarray_reduce(da, "labels", da.labels2, **kwargs)
+        actual = xarray_reduce(
+            da, "labels", da.labels2, func=func, expected_groups=expected_groups, engine=engine
+        )
    xr.testing.assert_identical(expected, actual)
 
     with raise_if_dask_computes():
-        actual = xarray_reduce(da, "labels", "labels2", **kwargs)
+        actual = xarray_reduce(
+            da, "labels", "labels2", func=func, expected_groups=expected_groups, engine=engine
+        )
     xr.testing.assert_identical(expected, actual)
 
     if pass_expected_groups:
-        kwargs["expected_groups"] = (expected.labels2.data, expected.labels.data)
+        expected_groups = (expected.labels2.data, expected.labels.data)
 
     with raise_if_dask_computes():
-        actual = xarray_reduce(da, "labels2", "labels", **kwargs)
+        actual = xarray_reduce(
+            da, "labels2", "labels", func=func, expected_groups=expected_groups, engine=engine
+        )
     xr.testing.assert_identical(expected.transpose("z", "labels2", "labels"), actual)
 
 
 @pytest.mark.parametrize("pass_expected_groups", [True, False])
 @pytest.mark.parametrize("chunk", (True, False))
-def test_xarray_reduce_multiple_groupers_2(pass_expected_groups, chunk, engine):
+def test_xarray_reduce_multiple_groupers_2(
+    pass_expected_groups: bool, chunk: bool, engine: T_Engine
+) -> None:
     if not has_dask and chunk:
         pytest.skip()
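The hunk above shows the second strategy this PR uses to make kwargs type-checkable: instead of annotating a kwargs dict as `T_Kwargs` (the `test_core.py` approach), the call sites are unrolled so `func`, `expected_groups`, and `engine` are passed explicitly. More verbose, but mypy can then validate every argument against `xarray_reduce`'s signature rather than having the check silenced by `Any`.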
@@ -155,20 +177,31 @@ def test_xarray_reduce_multiple_groupers_2(pass_expected_groups, chunk, engine)
         },
     ).expand_dims(z=4, x=2)
 
-    kwargs = dict(func="count", engine=engine)
+    func = "count"
+    expected_groups = None
     if pass_expected_groups:
-        kwargs["expected_groups"] = (expected.labels.data, expected.labels.data)
+        expected_groups = (expected.labels.data, expected.labels.data)
 
     with raise_if_dask_computes():
-        actual = xarray_reduce(da, "labels", "labels2", **kwargs)
+        actual = xarray_reduce(
+            da, "labels", "labels2", func=func, expected_groups=expected_groups, engine=engine
+        )
     xr.testing.assert_identical(expected, actual)
 
     with pytest.raises(NotImplementedError):
-        xarray_reduce(da, "labels", "labels2", dim=..., **kwargs)
+        xarray_reduce(
+            da,
+            "labels",
+            "labels2",
+            dim=...,
+            func=func,
+            expected_groups=expected_groups,
+            engine=engine,
+        )
 
 
 @requires_dask
-def test_dask_groupers_error():
+def test_dask_groupers_error() -> None:
     da = xr.DataArray(
         [1.0, 2.0], dims="x", coords={"labels": ("x", [1, 2]), "labels2": ("x", [1, 2])}
     )
@@ -177,7 +210,7 @@
 
 
 @requires_dask
-def test_xarray_reduce_single_grouper(engine):
+def test_xarray_reduce_single_grouper(engine: T_Engine) -> None:
     # DataArray
     ds = xr.tutorial.open_dataset("rasm", chunks={"time": 9})
@@ -222,7 +255,7 @@
     xr.testing.assert_allclose(actual, expected)
 
 
-def test_xarray_reduce_errors():
+def test_xarray_reduce_errors() -> None:
     da = xr.DataArray(np.ones((12,)), dims="x")
     by = xr.DataArray(np.ones((12,)), dims="x")
@@ -242,7 +275,7 @@
 @pytest.mark.parametrize("isdask", [True, False])
 @pytest.mark.parametrize("dataarray", [True, False])
 @pytest.mark.parametrize("chunklen", [27, 4 * 31 + 1, 4 * 31 + 20])
-def test_xarray_resample(chunklen, isdask, dataarray, engine):
+def test_xarray_resample(chunklen: int, isdask: bool, dataarray: bool, engine: T_Engine) -> None:
     if isdask:
         if not has_dask:
             pytest.skip()
@@ -265,7 +298,7 @@
 
 
 @requires_dask
-def test_xarray_resample_dataset_multiple_arrays(engine):
+def test_xarray_resample_dataset_multiple_arrays(engine: T_Engine) -> None:
     # regression test for #35
     times = pd.date_range("2000", periods=5)
     foo = xr.DataArray(range(5), dims=["time"], coords=[times], name="foo")
dict(func="count", engine=engine) + func = "count" + expected_groups = None if pass_expected_groups: - kwargs["expected_groups"] = (expected.labels.data, expected.labels.data) + expected_groups = (expected.labels.data, expected.labels.data) with raise_if_dask_computes(): - actual = xarray_reduce(da, "labels", "labels2", **kwargs) + actual = xarray_reduce( + da, "labels", "labels2", func=func, expected_groups=expected_groups, engine=engine + ) xr.testing.assert_identical(expected, actual) with pytest.raises(NotImplementedError): - xarray_reduce(da, "labels", "labels2", dim=..., **kwargs) + xarray_reduce( + da, + "labels", + "labels2", + dim=..., + func=func, + expected_groups=expected_groups, + engine=engine, + ) @requires_dask -def test_dask_groupers_error(): +def test_dask_groupers_error() -> None: da = xr.DataArray( [1.0, 2.0], dims="x", coords={"labels": ("x", [1, 2]), "labels2": ("x", [1, 2])} ) @@ -177,7 +210,7 @@ def test_dask_groupers_error(): @requires_dask -def test_xarray_reduce_single_grouper(engine): +def test_xarray_reduce_single_grouper(engine: T_Engine) -> None: # DataArray ds = xr.tutorial.open_dataset("rasm", chunks={"time": 9}) @@ -222,7 +255,7 @@ def test_xarray_reduce_single_grouper(engine): xr.testing.assert_allclose(actual, expected) -def test_xarray_reduce_errors(): +def test_xarray_reduce_errors() -> None: da = xr.DataArray(np.ones((12,)), dims="x") by = xr.DataArray(np.ones((12,)), dims="x") @@ -242,7 +275,7 @@ def test_xarray_reduce_errors(): @pytest.mark.parametrize("isdask", [True, False]) @pytest.mark.parametrize("dataarray", [True, False]) @pytest.mark.parametrize("chunklen", [27, 4 * 31 + 1, 4 * 31 + 20]) -def test_xarray_resample(chunklen, isdask, dataarray, engine): +def test_xarray_resample(chunklen: int, isdask: bool, dataarray: bool, engine: T_Engine) -> None: if isdask: if not has_dask: pytest.skip() @@ -265,7 +298,7 @@ def test_xarray_resample(chunklen, isdask, dataarray, engine): @requires_dask -def test_xarray_resample_dataset_multiple_arrays(engine): +def test_xarray_resample_dataset_multiple_arrays(engine: T_Engine) -> None: # regression test for #35 times = pd.date_range("2000", periods=5) foo = xr.DataArray(range(5), dims=["time"], coords=[times], name="foo") @@ -298,7 +331,7 @@ def test_xarray_resample_dataset_multiple_arrays(engine): [(10,), (10,)], ], ) -def test_rechunk_for_blockwise(inchunks, expected): +def test_rechunk_for_blockwise(inchunks: tuple[int, ...], expected: tuple[int, ...]) -> None: labels = np.array([1, 1, 1, 2, 2, 3, 3, 5, 5, 5]) da = xr.DataArray(dask.array.ones((10,), chunks=inchunks), dims="x", name="foo") @@ -319,7 +352,7 @@ def test_rechunk_for_blockwise(inchunks, expected): # TODO: dim=None, dim=Ellipsis, groupby unindexed dim -def test_groupby_duplicate_coordinate_labels(engine): +def test_groupby_duplicate_coordinate_labels(engine: T_Engine) -> None: # fix for http://stackoverflow.com/questions/38065129 array = xr.DataArray([1, 2, 3], [("x", [1, 1, 2])]) expected = xr.DataArray([3, 3], [("x", [1, 2])]) @@ -327,7 +360,7 @@ def test_groupby_duplicate_coordinate_labels(engine): assert_equal(expected, actual) -def test_multi_index_groupby_sum(engine): +def test_multi_index_groupby_sum(engine: T_Engine) -> None: # regression test for xarray GH873 ds = xr.Dataset( {"foo": (("x", "y", "z"), np.ones((3, 4, 2)))}, @@ -351,7 +384,7 @@ def test_multi_index_groupby_sum(engine): @pytest.mark.parametrize("chunks", (None, 2)) -def test_xarray_groupby_bins(chunks, engine): +def test_xarray_groupby_bins(chunks, engine: T_Engine) -> 
diff --git a/flox/xarray.py b/flox/xarray.py
index 35d99b6fa..abe58602d 100644
--- a/flox/xarray.py
+++ b/flox/xarray.py
@@ -23,6 +23,8 @@
     from xarray.core.resample import Resample
     from xarray.core.types import T_DataArray, T_Dataset
 
+    from .core import T_Engine, T_Method
+
 Dims = Union[str, Iterable[Hashable], None]
 
 
@@ -70,8 +72,8 @@ def xarray_reduce(
     dim: Dims | ellipsis = None,
     fill_value=None,
     dtype: np.typing.DTypeLike = None,
-    method: str = "map-reduce",
-    engine: str = "numpy",
+    method: T_Method = "map-reduce",
+    engine: T_Engine = "numpy",
     keep_attrs: bool | None = True,
     skipna: bool | None = None,
     min_count: int | None = None,
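With `method` and `engine` typed as `T_Method`/`T_Engine` instead of bare `str`, invalid strings can be rejected statically at call sites. An illustrative sketch — it assumes the aliases are `Literal` types in `flox.core`, which this diff does not show:

```python
import xarray as xr
from flox.xarray import xarray_reduce

ds = xr.tutorial.open_dataset("rasm", chunks={"time": 9})

# Accepted: both strings are members of the assumed Literal aliases.
good = xarray_reduce(ds.Tair, ds.time.dt.month, func="mean", method="map-reduce", engine="numpy")

# Previously engine: str meant this typo only failed at runtime inside flox;
# with a Literal-typed parameter, mypy flags the call site.
bad = xarray_reduce(ds.Tair, ds.time.dt.month, func="mean", engine="numpee")
```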
diff --git a/pyproject.toml b/pyproject.toml
index 32e55d712..885626285 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,8 +33,8 @@ known_third_party = [
 
 [tool.mypy]
 allow_redefinition = true
-exclude = "properties|asv_bench|doc|tests|flycheck"
-files = "flox/*.py"
+exclude = "properties|doc|flycheck"
+files = "flox"
 show_error_codes = true
 
 [[tool.mypy.overrides]]
@@ -43,7 +43,8 @@ module=[
     "cftime",
     "dask.*",
     "importlib_metadata",
-    "numpy_groupies",
+    "numba.*",
+    "numpy_groupies.*",
     "matplotlib.*",
     "pandas",
     "setuptools",
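These configuration changes follow from the test relocation: with the suite now at `flox/tests`, `files = "flox"` points mypy at the whole package, subpackages included, rather than the old top-level-only `flox/*.py` glob, and `tests` no longer needs to be excluded. Widening the override entries to `numba.*` and `numpy_groupies.*` makes the per-module settings (presumably `ignore_missing_imports`) apply to submodule imports of those untyped packages as well.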