Merge pull request #122 from FAST-HEP/BK_broadcast_expressions

Add variable broadcasting for expressions
FAST-HEP · Jun 16, 2020 · 7f887b9 · 7f887b9
2 parents 072cc2f + 4e8fe7d
commit 7f887b9
Show file tree

Hide file tree

Showing 6 changed files with 67 additions and 21 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+
+## [0.18.0] - 2020-06-17
+### Added
+- Add broadcasting between variables of different jaggedness in expressions, PR #122 [@BenKrikler](https://github.com/benkrikler)
+
 ### Removed
 - Testing against Python <= 3.5, PR #124
 

diff --git a/fast_carpenter/expressions.py b/fast_carpenter/expressions.py
@@ -3,11 +3,14 @@
 import numexpr
 import tokenize
 import awkward
+import logging
 try:
     from StringIO import StringIO
 except ImportError:
     from io import StringIO
 
+logger = logging.getLogger(__name__)
+
 
 __all__ = ["get_branches", "evaluate"]
 
@@ -58,35 +61,60 @@ class TreeToDictAdaptor():
     """
     Make an uproot tree look like a dict for numexpr
     """
-    def __init__(self, tree, alias_dict):
+    def __init__(self, tree, alias_dict, needed_variables):
         self.tree = tree
-        self.counts = None
         self.aliases = alias_dict
+        self.vars, self.counts = self.broadcast_variables(needed_variables)
+
+    def broadcast_variables(self, variables):
+        arrays = {}
+        most_jagged = (-1, None)
+        for var in variables:
+            if var in constants:
+                continue
+            array = self.get_raw(var)
+            contents, counts = deconstruct_jaggedness(array, counts=[])
+            arrays[var] = (contents, counts, array)
+            if len(counts) > most_jagged[0]:
+                most_jagged = (len(counts), var)
+        most_jagged = most_jagged[1]
+
+        broadcast_to = arrays[most_jagged][1]
+        broadcast_vars = {most_jagged: arrays[most_jagged]}
+        for var, (contents, counts, raw) in arrays.items():
+            if var == most_jagged:
+                continue
+
+            # Check broadcastable
+            for left, right in zip(broadcast_to, counts):
+                if not np.array_equal(left, right):
+                    raise ValueError("Unable to broadcast all values")
+            for copies in broadcast_to[len(counts):]:
+                contents = np.repeat(contents, copies)
+
+            broadcast_vars[var] = (contents, broadcast_to, raw)
+        return broadcast_vars, broadcast_to
 
     def __getitem__(self, item):
+        if item in constants:
+            return constants[item]
+        result = self.vars[item][0]
+        return result
+
+    def get_raw(self, item):
         if item in constants:
             return constants[item]
         full_item = self.aliases.get(item, item)
         array = self.tree.array(full_item)
-        array = self.strip_jaggedness(array)
         return array
 
     def __contains__(self, item):
-        return item in self.tree or item in self.aliases
+        return item in self.vars
 
     def __iter__(self):
-        for i in self.tree:
+        for i in self.vars:
             yield i
 
-    def strip_jaggedness(self, array):
-        array, new_counts = deconstruct_jaggedness(array, counts=[])
-        if self.counts is not None:
-            if not all(np.array_equal(c, n) for c, n in zip(self.counts, new_counts)):
-                raise RuntimeError("Operation using arrays with different jaggedness")
-        else:
-            self.counts = new_counts
-        return array
-
     def apply_jaggedness(self, array):
         if self.counts is None:
             return array
@@ -111,7 +139,14 @@ def preprocess_expression(expression):
 
 def evaluate(tree, expression):
     cleaned_expression, alias_dict = preprocess_expression(expression)
-    adaptor = TreeToDictAdaptor(tree, alias_dict)
+    context = numexpr.necompiler.getContext({}, frame_depth=1)
+    variables = numexpr.necompiler.getExprNames(cleaned_expression, context)[0]
+    try:
+        adaptor = TreeToDictAdaptor(tree, alias_dict, variables)
+    except ValueError:
+        msg = "Cannot broadcast all variables in expression: %s" % expression
+        logger.error(msg)
+        raise ValueError(msg)
     result = numexpr.evaluate(cleaned_expression, local_dict=adaptor)
     result = adaptor.apply_jaggedness(result)
     return result
diff --git a/fast_carpenter/version.py b/fast_carpenter/version.py
@@ -12,5 +12,5 @@ def split_version(version):
     return tuple(result)
 
 
-__version__ = '0.17.5'
+__version__ = '0.18.0'
 version_info = split_version(__version__) # noqa
diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.17.5
+current_version = 0.18.0
 commit = True
 tag = False
 
@@ -18,4 +18,3 @@ test = pytest
 
 [tool:pytest]
 collect_ignore = ['setup.py']
-
diff --git a/setup.py b/setup.py
@@ -21,7 +21,7 @@ def get_version():
     return _globals["__version__"]
 
 
-requirements = ['atuproot==0.1.13', 'atsge==0.2.1', 'mantichora==0.9.7',
+requirements = ['atuproot==0.1.13', 'atsge==0.2.1', 'atpbar==1.0.8', 'mantichora==0.9.7',
                 'fast-flow', 'fast-curator', 'awkward',
                 'pandas', 'numpy', 'numba', 'numexpr', 'uproot>=3']
 repositories = []

diff --git a/tests/test_expressions.py b/tests/test_expressions.py
@@ -88,9 +88,9 @@ def test_3D_jagged(wrapped_tree):
     fake_3d_2 = JaggedArray.fromiter(fake_3d_2)
     wrapped_tree.new_variable("SecondFake3D", fake_3d_2)
 
-    with pytest.raises(RuntimeError) as e:
+    with pytest.raises(ValueError) as e:
         expressions.evaluate(wrapped_tree, "SecondFake3D + Fake3D")
-    assert "different jaggedness" in str(e)
+    assert "Cannot broadcast" in str(e)
 
 
 @pytest.mark.parametrize('input, expected', [
@@ -107,3 +107,10 @@ def test_preprocess_expression(input, expected):
     clean_expr, alias_dict = expressions.preprocess_expression(input)
     assert clean_expr == expected[0]
     assert alias_dict == expected[1]
+
+
+def test_broadcast(wrapped_tree):
+    expressions.evaluate(wrapped_tree, "NJet * Jet_Py + NElectron * Jet_Px")
+
+    with pytest.raises(ValueError):
+        expressions.evaluate(wrapped_tree, "Jet_Py + Muon_Px")