Skip to content

Commit eb69a10

Browse files
committed
v0.0.73 Introduce the conditional processor
1 parent dff6a37 commit eb69a10

File tree

6 files changed

+78
-3
lines changed

6 files changed

+78
-3
lines changed

PROCESSORS.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,39 @@ def dump_to_sql(tables,
220220
- `batch_size` - Maximum amount of rows to write at the same time to the DB (default 1000)
221221
- `use_bloom_filter` - Preprocess existing DB data to improve update performance (default: True)
222222

223+
### Flow Control
224+
225+
#### conditional
226+
227+
Run parts of the flow based on the structure of the datapackage at this point.
228+
229+
```python
230+
def conditional(predicate, flow):
231+
pass
232+
```
233+
234+
- `predicate` - a boolean function, receiving a single parameter which is a `Package.datapackage` and returning true/false
235+
- `flow` - a `Flow` to chain to the processing pipeline if the predicate returns true.
236+
237+
Example - add a field if it doesn't exist in the first resource in the data package:
238+
239+
```python
240+
def no_such_field(field_name):
241+
def func(dp):
242+
return all(field_name != f.name for f in dp.resources[0].schema.fields)
243+
return func
244+
245+
Flow(
246+
# ...
247+
conditional(
248+
no_such_field('my-field'), Flow(
249+
add_field('my-field', 'string', 'default-value')
250+
)
251+
)
252+
# ...
253+
)
254+
```
255+
223256
#### checkpoint
224257

225258
Save results from running a series of steps, if checkpoint exists - loads from checkpoint instead of running the steps.

dataflows/VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.0.72
1+
0.0.73

dataflows/processors/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from .add_field import add_field
99
from .checkpoint import checkpoint
1010
from .concatenate import concatenate
11+
from .conditional import conditional
1112
from .delete_fields import delete_fields
1213
from .deduplicate import deduplicate
1314
from .duplicate import duplicate

dataflows/processors/conditional.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from .. import DataStreamProcessor
2+
3+
4+
class conditional(DataStreamProcessor):
5+
6+
def __init__(self, predicate, flow):
7+
super().__init__()
8+
self.predicate = predicate
9+
self.flow = flow
10+
11+
def _process(self):
12+
ds = self.source._process()
13+
if self.predicate(ds.dp):
14+
return self.flow.datastream(ds)
15+
else:
16+
return ds

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def read(*paths):
2424
'tabulator>=1.23.0',
2525
'datapackage>=1.5.0',
2626
'tableschema>=1.5',
27-
'kvfile>=0.0.6',
27+
'kvfile>=0.0.8',
2828
'click',
2929
'jinja2',
3030
'awesome-slugify',

tests/test_lib.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1532,7 +1532,6 @@ def test_force_temporal_format():
15321532
}
15331533
]]
15341534

1535-
15361535
# Extract missing values
15371536

15381537
def test_extract_missing_values():
@@ -1632,3 +1631,29 @@ def test_extract_missing_values_options_source_is_list():
16321631
{'col1': None, 'col2': None, 'missingValues': {'col1': 'mis1', 'col2': 'mis2'}},
16331632
{'col1': 7, 'col2': 7, 'missingValues': {}},
16341633
]]
1634+
1635+
1636+
def test_conditional():
1637+
from dataflows import Flow, conditional, add_field
1638+
1639+
tester = lambda dp: 'b' in [f.name for r in dp.resources for f in r.schema.fields]
1640+
1641+
data1 = [
1642+
dict(a=i, b=i) for i in range(3)
1643+
]
1644+
data2 = [
1645+
dict(a=i, c=i) for i in range(3)
1646+
]
1647+
1648+
result1, _, _ = Flow(
1649+
data1, conditional(tester, Flow(add_field('d', 'integer', lambda r: r['a'])))
1650+
).results()
1651+
result2, _, _ = Flow(
1652+
data2, conditional(tester, Flow(add_field('d', 'integer', lambda r: r['a'])))
1653+
).results()
1654+
assert result1[0] == [
1655+
dict(a=i, b=i, d=i) for i in range(3)
1656+
]
1657+
assert result2[0] == [
1658+
dict(a=i, c=i) for i in range(3)
1659+
]

0 commit comments

Comments
 (0)