#!/usr/bin/env python3
"""ASV benchmarks for ``flox.core.factorize_``."""

import numpy as np
import pandas as pd
from asv_runner.benchmarks.mark import parameterize

import flox

# Number of distinct labels in the "small" and "large" groupers.
Nsmall = 4
Nlarge = 2000

# How many times each of the Nsmall labels is repeated in the small grouper.
_REPEATS = 250

# Parameter grid shared by every timing method below.
_FACTORIZE_PARAMS = {
    "expected": (None, (pd.Index([1, 3]),), (pd.RangeIndex(Nsmall),)),
    "reindex": [True, False],
    "sort": [True, False],
}


class Factorize:
    """Time the core factorize_ function.

    Subclasses must define ``setup`` and populate ``self.by_small`` and
    ``self.by_large``, each a 1-tuple holding the array of group labels.
    """

    def setup(self, *args, **kwargs):
        # Abstract base: raising NotImplementedError here keeps the base
        # class itself from being timed; only the subclasses provide data.
        raise NotImplementedError

    @parameterize(_FACTORIZE_PARAMS)
    def time_factorize_small(self, expected, reindex, sort):
        """Factorize a grouper with only ``Nsmall`` distinct labels."""
        flox.core.factorize_(
            self.by_small,
            axes=(-1,),
            expected_groups=expected,
            reindex=reindex,
            sort=sort,
        )

    @parameterize(_FACTORIZE_PARAMS)
    def time_factorize_large(self, expected, reindex, sort):
        """Factorize a grouper with ``Nlarge`` distinct labels."""
        # Forward ``expected``: it used to be replaced by a hard-coded None,
        # which made every "expected" variant time the exact same call.
        flox.core.factorize_(
            self.by_large,
            axes=(-1,),
            expected_groups=expected,
            reindex=reindex,
            sort=sort,
        )


class SingleGrouper1D(Factorize):
    """One grouper, one-dimensional label arrays."""

    def setup(self, *args, **kwargs):
        self.by_small = (np.repeat(np.arange(Nsmall), _REPEATS),)
        self.by_large = (np.random.permutation(np.arange(Nlarge)),)


class SingleGrouper3D(Factorize):
    """One grouper, label arrays broadcast to three dimensions."""

    def setup(self, *args, **kwargs):
        small = np.repeat(np.arange(Nsmall), _REPEATS)
        large = np.random.permutation(np.arange(Nlarge))
        # Take the trailing dimension from the 1D arrays instead of
        # hard-coding it, so changing Nsmall/_REPEATS cannot break the
        # broadcast.
        self.by_small = (np.broadcast_to(small, (5, 5, small.size)),)
        self.by_large = (np.broadcast_to(large, (5, 5, large.size)),)


# TODO: add benchmarks for multiple groupers (``class Multiple(Factorize)``)
# and for cftime groupers (``class CFTimeFactorize(Factorize)``).
""" + Faster version of numpy isin. + 1. Use pd.factorize instead of np.unique + 2. Skip a bunch of checks + """ + rev_idx, ar1 = pd.factorize(ar1, sort=False) + + ar = np.concatenate((ar1, ar2)) + # We need this to be a stable sort, so always use 'mergesort' + # here. The values from the first array should always come before + # the values from the second array. + order = ar.argsort(kind="mergesort") + sar = ar[order] + if invert: + bool_ar = sar[1:] != sar[:-1] + else: + bool_ar = sar[1:] == sar[:-1] + flag = np.concatenate((bool_ar, [invert])) + ret = np.empty(ar.shape, dtype=bool) + ret[order] = flag + + return ret[rev_idx] + + @overload def factorize_( by: T_Bys, @@ -826,12 +851,23 @@ def factorize_( if expect is not None and reindex: sorter = np.argsort(expect) groups = expect[(sorter,)] if sort else expect + idx = np.searchsorted(expect, flat, sorter=sorter) - mask = ~np.isin(flat, expect) | isnull(flat) | (idx == len(expect)) + mask = fast_isin(flat, expect, invert=True) + if not np.issubdtype(flat.dtype, np.integer): + mask |= isnull(flat) + outside_last_elem_mask = idx == len(expect) + mask |= outside_last_elem_mask + + # idx = np.full(flat.shape, -1) + # result = np.searchsorted(expect.values, flat[~mask], sorter=sorter) + # idx[~mask] = result + # idx = np.searchsorted(expect.values, flat, sorter=sorter) + # idx[mask] = -1 if not sort: # idx is the index in to the sorted array. # if we didn't want sorting, unsort it back - idx[(idx == len(expect),)] = -1 + idx[(outside_last_elem_mask)] = -1 idx = sorter[(idx,)] idx[mask] = -1 else: