Skip to content

Commit 8fa33c0

Browse files
authored
Merge pull request #118 from FAST-HEP/BK_add_observed_option
Add `observed` option for speed with many bins
2 parents e889c12 + 1afbc86 commit 8fa33c0

File tree

6 files changed

+44
-19
lines changed

6 files changed

+44
-19
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
66

77
## [Unreleased]
88

9+
## [0.17.5] - 2020-04-03
10+
### Added
11+
- Add `observed` option to BinnedDataframe for speed boost with many bins, PR #118 [@BenKrikler](https://github.com/benkrikler)
12+
13+
### Changed
14+
- Pin the version for the Mantichora package that AlphaTwirl depends on
15+
916
## [0.17.4] - 2020-03-12
1017
### Changed
1118
- `pad_missing` was replacing bin contents when set to True, PR #116 [@BenKrikler](https://github.com/benkrikler)

fast_carpenter/summary/binned_dataframe.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,11 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=False):
150150
excluded from the stored dataframe. Leaving this ``False`` can save
151151
some disk-space and improve processing time, particularly if the bins are
152152
only very sparsely filled.
153+
observed (bool): If ``True`` bins in the dataframe will only be filled
154+
if there are datapoints contained within them. Otherwise, depending on
155+
the binning specification for each dimension, all bins for that
156+
dimension will be present. Use `pad_missing: true` to force all bins
157+
to be present.
153158
154159
Other Parameters:
155160
name (str): The name of this stage (handled automatically by fast-flow)
@@ -161,7 +166,8 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=False):
161166
162167
"""
163168

164-
def __init__(self, name, out_dir, binning, weights=None, dataset_col=True, pad_missing=False, file_format=None):
169+
def __init__(self, name, out_dir, binning, weights=None, dataset_col=True,
170+
pad_missing=False, file_format=None, observed=False):
165171
self.name = name
166172
self.out_dir = out_dir
167173
ins, outs, binnings = cfg.create_binning_list(self.name, binning)
@@ -173,6 +179,7 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=True, pad_m
173179
self._weights = cfg.create_weights(self.name, weights)
174180
self._pad_missing = pad_missing
175181
self._file_format = cfg.create_file_format(self.name, file_format)
182+
self._observed = observed
176183
self.contents = None
177184

178185
def collector(self):
@@ -202,7 +209,8 @@ def event(self, chunk):
202209
binnings=self._binnings,
203210
weights=weights,
204211
out_weights=self._weights.keys(),
205-
out_dimensions=self._out_bin_dims)
212+
out_dimensions=self._out_bin_dims,
213+
observed=self._observed)
206214
if self.contents is None:
207215
self.contents = binned_values
208216
else:
@@ -228,7 +236,7 @@ def _make_column_labels(weights):
228236
return [count_label] + labels
229237

230238

231-
def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_weights=None):
239+
def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_weights=None, observed=True):
232240
if not out_dimensions:
233241
out_dimensions = dimensions
234242
if not out_weights:
@@ -247,7 +255,7 @@ def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_we
247255
weight_sq_dims = [w + "_squared" for w in weights]
248256
data[weight_sq_dims] = data[weights] ** 2
249257

250-
bins = data.groupby(final_bin_dims)
258+
bins = data.groupby(final_bin_dims, observed=observed)
251259
counts = bins[data.columns[0]].count()
252260

253261
if weights:

fast_carpenter/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@ def split_version(version):
1212
return tuple(result)
1313

1414

15-
__version__ = '0.17.4'
15+
__version__ = '0.17.5'
1616
version_info = split_version(__version__) # noqa

setup.cfg

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.17.4
2+
current_version = 0.17.5
33
commit = True
44
tag = False
55

@@ -18,3 +18,4 @@ test = pytest
1818

1919
[tool:pytest]
2020
collect_ignore = ['setup.py']
21+

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ def get_version():
2121
return _globals["__version__"]
2222

2323

24-
requirements = ['atuproot==0.1.13', 'atsge==0.2.1', 'fast-flow', 'fast-curator', 'awkward',
24+
requirements = ['atuproot==0.1.13', 'atsge==0.2.1', 'mantichora==0.9.7',
25+
'fast-flow', 'fast-curator', 'awkward',
2526
'pandas', 'numpy', 'numba', 'numexpr', 'uproot>=3']
2627
repositories = []
2728

tests/summary/test_binned_dataframe.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,14 @@ def test_BinnedDataframe_run_data(binned_df_2, tmpdir, infile):
8686
chunk = FakeBEEvent(infile, "data")
8787
binned_df_2.event(chunk)
8888

89+
collector = binned_df_2.collector()
90+
dataset_readers_list = (("test_dataset", (binned_df_2,)),)
91+
results = collector._prepare_output(dataset_readers_list)
92+
93+
totals = results.sum()
94+
# Based on: events->Draw("Jet_Py", "", "goff")
95+
assert totals["n"] == 4616
96+
8997

9098
def test_BinnedDataframe_run_twice(binned_df_1, tmpdir, infile):
9199
chunk = FakeBEEvent(infile, "mc")
@@ -108,9 +116,10 @@ def test_BinnedDataframe_run_twice(binned_df_1, tmpdir, infile):
108116

109117

110118
@pytest.fixture
111-
def run_twice_data_mc(config_1, infile):
119+
def run_twice_data_mc(config_1, infile, observed):
112120
chunk_mc = FakeBEEvent(infile, "mc")
113121
chunk_data = FakeBEEvent(infile, "data")
122+
config_1["observed"] = observed
114123

115124
binned_dfs = [make_binned_df_1(config_1) for _ in range(4)]
116125
binned_dfs[0].event(chunk_mc)
@@ -122,26 +131,25 @@ def run_twice_data_mc(config_1, infile):
122131
("test_data", (binned_dfs[2], binned_dfs[3])))
123132

124133

134+
@pytest.mark.skipif(int(pd.__version__.split(".")[0]) < 1, reason="requires Pandas 1.0 or higher")
125135
@pytest.mark.parametrize("dataset_col", [True, False])
126136
@pytest.mark.parametrize("pad_missing", [True, False])
127-
def test_binneddataframe_run_twice_data_mc(run_twice_data_mc, dataset_col, pad_missing):
137+
@pytest.mark.parametrize("observed", [True, False])
138+
def test_binneddataframe_run_twice_data_mc(run_twice_data_mc, dataset_col, pad_missing, observed):
128139
binned_df_1, dataset_readers_list = run_twice_data_mc
129140
binned_df_1._pad_missing = pad_missing
130141
binned_df_1._dataset_col = dataset_col
131142
collector = binned_df_1.collector()
132143
results = collector._prepare_output(dataset_readers_list)
133144

134145
assert results.index.nlevels == 2 + int(dataset_col)
135-
if tuple(map(int, pd.__version__.split("."))) >= (1, 0, 0):
136-
length = (4 * 31) * (1 + int(dataset_col))
137-
else:
138-
# Pre Pandas 1.0.0 the following lengths were needed.
139-
if pad_missing or not dataset_col:
140-
length = (4 * 31) * (1 + int(dataset_col))
141-
else:
142-
length = None
143-
if length:
144-
assert len(results) == length
146+
if pad_missing or not observed:
147+
length = (4 * 31)
148+
elif observed:
149+
length = 111
150+
151+
length *= 1 + int(dataset_col)
152+
assert len(results) == length
145153

146154
totals = results.sum()
147155
# Based on: events->Draw("Jet_Py", "", "goff")

0 commit comments

Comments
 (0)