Skip to content

Commit 8fa33c0

Browse files
authored
Merge pull request #118 from FAST-HEP/BK_add_observed_option
Add `observed` option for speed with many bins
2 parents e889c12 + 1afbc86 commit 8fa33c0

File tree

6 files changed

+44
-19
lines changed

6 files changed

+44
-19
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
66

77
## [Unreleased]
88

9+
## [0.17.5] - 2020-04-03
10+
### Added
11+
- Add `observed` option to BinnedDataframe for speed boost with many bins, PR #118 [@BenKrikler](https://github.com/benkrikler)
12+
13+
### Changed
14+
- Pin the version for the Mantichora package that AlphaTwirl depends on
15+
916
## [0.17.4] - 2020-03-12
1017
### Changed
1118
- `pad_missing` was replacing bin contents when set to True, PR #116 [@BenKrikler](https://github.com/benkrikler)

fast_carpenter/summary/binned_dataframe.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,11 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=False):
150150
excluded from the stored dataframe. Leaving this ``False`` can save
151151
some disk-space and improve processing time, particularly if the bins are
152152
only very sparsely filled.
153+
observed (bool): If ``True`` bins in the dataframe will only be filled
154+
if there are datapoints contained within them. Otherwise, depending on
155+
the binning specification for each dimension, all bins for that
156+
dimension will be present. Use `pad_missing: true` to force all bins
157+
to be present.
153158
154159
Other Parameters:
155160
name (str): The name of this stage (handled automatically by fast-flow)
@@ -161,7 +166,8 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=False):
161166
162167
"""
163168

164-
def __init__(self, name, out_dir, binning, weights=None, dataset_col=True, pad_missing=False, file_format=None):
169+
def __init__(self, name, out_dir, binning, weights=None, dataset_col=True,
170+
pad_missing=False, file_format=None, observed=False):
165171
self.name = name
166172
self.out_dir = out_dir
167173
ins, outs, binnings = cfg.create_binning_list(self.name, binning)
@@ -173,6 +179,7 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=True, pad_m
173179
self._weights = cfg.create_weights(self.name, weights)
174180
self._pad_missing = pad_missing
175181
self._file_format = cfg.create_file_format(self.name, file_format)
182+
self._observed = observed
176183
self.contents = None
177184

178185
def collector(self):
@@ -202,7 +209,8 @@ def event(self, chunk):
202209
binnings=self._binnings,
203210
weights=weights,
204211
out_weights=self._weights.keys(),
205-
out_dimensions=self._out_bin_dims)
212+
out_dimensions=self._out_bin_dims,
213+
observed=self._observed)
206214
if self.contents is None:
207215
self.contents = binned_values
208216
else:
@@ -228,7 +236,7 @@ def _make_column_labels(weights):
228236
return [count_label] + labels
229237

230238

231-
def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_weights=None):
239+
def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_weights=None, observed=True):
232240
if not out_dimensions:
233241
out_dimensions = dimensions
234242
if not out_weights:
@@ -247,7 +255,7 @@ def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_we
247255
weight_sq_dims = [w + "_squared" for w in weights]
248256
data[weight_sq_dims] = data[weights] ** 2
249257

250-
bins = data.groupby(final_bin_dims)
258+
bins = data.groupby(final_bin_dims, observed=observed)
251259
counts = bins[data.columns[0]].count()
252260

253261
if weights:

fast_carpenter/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@ def split_version(version):
1212
return tuple(result)
1313

1414

15-
__version__ = '0.17.4'
15+
__version__ = '0.17.5'
1616
version_info = split_version(__version__) # noqa

setup.cfg

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.17.4
2+
current_version = 0.17.5
33
commit = True
44
tag = False
55

@@ -18,3 +18,4 @@ test = pytest
1818

1919
[tool:pytest]
2020
collect_ignore = ['setup.py']
21+

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ def get_version():
2121
return _globals["__version__"]
2222

2323

24-
requirements = ['atuproot==0.1.13', 'atsge==0.2.1', 'fast-flow', 'fast-curator', 'awkward',
24+
requirements = ['atuproot==0.1.13', 'atsge==0.2.1', 'mantichora==0.9.7',
25+
'fast-flow', 'fast-curator', 'awkward',
2526
'pandas', 'numpy', 'numba', 'numexpr', 'uproot>=3']
2627
repositories = []
2728

tests/summary/test_binned_dataframe.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,14 @@ def test_BinnedDataframe_run_data(binned_df_2, tmpdir, infile):
8686
chunk = FakeBEEvent(infile, "data")
8787
binned_df_2.event(chunk)
8888

89+
collector = binned_df_2.collector()
90+
dataset_readers_list = (("test_dataset", (binned_df_2,)),)
91+
results = collector._prepare_output(dataset_readers_list)
92+
93+
totals = results.sum()
94+
# Based on: events->Draw("Jet_Py", "", "goff")
95+
assert totals["n"] == 4616
96+
8997

9098
def test_BinnedDataframe_run_twice(binned_df_1, tmpdir, infile):
9199
chunk = FakeBEEvent(infile, "mc")
@@ -108,9 +116,10 @@ def test_BinnedDataframe_run_twice(binned_df_1, tmpdir, infile):
108116

109117

110118
@pytest.fixture
111-
def run_twice_data_mc(config_1, infile):
119+
def run_twice_data_mc(config_1, infile, observed):
112120
chunk_mc = FakeBEEvent(infile, "mc")
113121
chunk_data = FakeBEEvent(infile, "data")
122+
config_1["observed"] = observed
114123

115124
binned_dfs = [make_binned_df_1(config_1) for _ in range(4)]
116125
binned_dfs[0].event(chunk_mc)
@@ -122,26 +131,25 @@ def run_twice_data_mc(config_1, infile):
122131
("test_data", (binned_dfs[2], binned_dfs[3])))
123132

124133

134+
@pytest.mark.skipif(int(pd.__version__.split(".")[0]) < 1, reason="requires Pandas 1.0 or higher")
125135
@pytest.mark.parametrize("dataset_col", [True, False])
126136
@pytest.mark.parametrize("pad_missing", [True, False])
127-
def test_binneddataframe_run_twice_data_mc(run_twice_data_mc, dataset_col, pad_missing):
137+
@pytest.mark.parametrize("observed", [True, False])
138+
def test_binneddataframe_run_twice_data_mc(run_twice_data_mc, dataset_col, pad_missing, observed):
128139
binned_df_1, dataset_readers_list = run_twice_data_mc
129140
binned_df_1._pad_missing = pad_missing
130141
binned_df_1._dataset_col = dataset_col
131142
collector = binned_df_1.collector()
132143
results = collector._prepare_output(dataset_readers_list)
133144

134145
assert results.index.nlevels == 2 + int(dataset_col)
135-
if tuple(map(int, pd.__version__.split("."))) >= (1, 0, 0):
136-
length = (4 * 31) * (1 + int(dataset_col))
137-
else:
138-
# Pre Pandas 1.0.0 the following lengths were needed.
139-
if pad_missing or not dataset_col:
140-
length = (4 * 31) * (1 + int(dataset_col))
141-
else:
142-
length = None
143-
if length:
144-
assert len(results) == length
146+
if pad_missing or not observed:
147+
length = (4 * 31)
148+
elif observed:
149+
length = 111
150+
151+
length *= 1 + int(dataset_col)
152+
assert len(results) == length
145153

146154
totals = results.sum()
147155
# Based on: events->Draw("Jet_Py", "", "goff")

0 commit comments

Comments
 (0)