
Commit 82d30a9

v0.5.5 performance improvements

1 parent 5d9d0ce

File tree

6 files changed: +53 -16 lines

dataflows/VERSION
dataflows/base/datastream_processor.py
dataflows/base/flow.py
dataflows/processors/load.py
setup.py
tests/test_lib.py

dataflows/VERSION (+1 -1)

@@ -1 +1 @@
-0.5.4
+0.5.5

dataflows/base/datastream_processor.py (+9 -8)

@@ -96,15 +96,18 @@ def raise_exception(self, cause):
             raise error from cause
         raise cause
 
-    def safe_process(self, on_error=None):
+    def safe_process(self, return_results=False, on_error=None):
         results = []
         try:
             ds = self._process()
             for res in ds.res_iter:
-                if on_error is not None:
-                    results.append(list(
-                        schema_validator(res.res, res, on_error=on_error)
-                    ))
+                if return_results:
+                    if on_error is not None:
+                        results.append(list(
+                            schema_validator(res.res, res, on_error=on_error)
+                        ))
+                    else:
+                        results.append(list(res))
                 else:
                     collections.deque(res, maxlen=0)
         except UniqueKeyError as e:
@@ -121,7 +124,5 @@ def process(self):
         return ds.dp, ds.merge_stats()
 
     def results(self, on_error=None):
-        if on_error is None:
-            on_error = raise_exception
-        ds, results = self.safe_process(on_error=on_error)
+        ds, results = self.safe_process(return_results=True, on_error=on_error)
         return results, ds.dp, ds.merge_stats()
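
Note on this change: previously, results(on_error=None) silently substituted raise_exception, so every row was always pushed through schema_validator. With the new return_results flag, on_error=None becomes a meaningful fast path that materializes rows with a plain list(res), trusting the casts already applied upstream. A rough sketch of the per-resource logic after this commit (function and argument names here are illustrative, not part of the codebase):

import collections

def collect(res, return_results, on_error, schema_validator):
    # Sketch of safe_process's handling of one resource after this commit.
    if return_results:
        if on_error is not None:
            # validated path: re-check every row against the schema;
            # on_error decides whether a bad row raises, drops, or passes
            return list(schema_validator(res.res, res, on_error=on_error))
        # fast path (new): just drain the iterator into a list
        return list(res)
    # rows not wanted at all: exhaust the stream in O(1) memory;
    # deque(maxlen=0) discards each element as it arrives
    collections.deque(res, maxlen=0)
    return None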

dataflows/base/flow.py (+2 -1)

@@ -2,13 +2,14 @@
 from collections.abc import Iterable
 
 from .datastream_processor import DataStreamProcessor
+from .schema_validator import raise_exception
 
 
 class Flow:
     def __init__(self, *args):
         self.chain = args
 
-    def results(self, on_error=None):
+    def results(self, on_error=raise_exception):
         return self._chain().results(on_error=on_error)
 
     def process(self):
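
Moving the raise_exception default from DataStreamProcessor.results into the Flow.results signature keeps the public behaviour intact (schema errors still raise by default) while letting callers pass on_error=None to opt into the new fast path. A usage sketch (the file name is illustrative):

from dataflows import Flow, load, schema_validator

# default: schema violations raise, exactly as before this release
results, dp, stats = Flow(load('data.csv')).results()

# opt out of per-row validation for speed (new meaning of None)
results, dp, stats = Flow(load('data.csv')).results(on_error=None)

# or keep validation but silently drop offending rows
results, dp, stats = Flow(load('data.csv')).results(on_error=schema_validator.drop)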

dataflows/processors/load.py (+11 -5)

@@ -175,9 +175,10 @@ def safe_process_datapackage(self, dp: Package):
             descriptor['encoding'] = self.options['encoding']
         self.options['custom_parsers'] = self.get_custom_parsers(self.options.get('custom_parsers'))
         self.options.setdefault('ignore_blank_headers', True)
+        if 'headers' not in self.options:
+            self.options.setdefault('skip_rows', [{'type': 'preset', 'value': 'auto'}])
         self.options.setdefault('headers', 1)
         self.options.setdefault('sample_size', 1000)
-        self.options.setdefault('skip_rows', [{'type': 'preset', 'value': 'auto'}])
         stream: Stream = Stream(self.load_source, **self.options).open()
         if len(stream.headers) != len(set(stream.headers)):
             if not self.deduplicate_headers:
@@ -215,11 +216,16 @@ def safe_process_datapackage(self, dp: Package):
         return dp
 
     def stripper(self, iterator):
+        whitespace = set(' \t\n\r')
         for r in iterator:
-            yield dict(
-                (k, v.strip()) if isinstance(v, str) else (k, v)
-                for k, v in r.items()
-            )
+            for k, v in r.items():
+                if v and isinstance(v, str) and (v[-1] in whitespace or v[0] in whitespace):
+                    r[k] = v.strip()
+            yield r
+            # yield dict(
+            #     (k, v.strip()) if isinstance(v, str) else (k, v)
+            #     for k, v in r.items()
+            # )
 
     def limiter(self, iterator):
         count = 0
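
Two separate optimizations here. First, the 'auto' skip_rows preset is now only defaulted when the caller did not pass explicit headers, which presumably avoids combining automatic row skipping with a header row that is already known. Second, stripper no longer builds a new dict for every row: it mutates the row in place and only pays for str.strip() when the value actually starts or ends with whitespace, so already-clean values cost just two character checks; replacing values under existing keys while iterating r.items() is safe since the dict's size never changes. A micro-benchmark sketch comparing the two strategies (the sample row is illustrative):

import timeit

whitespace = set(' \t\n\r')
row = {'id': '123', 'name': 'name is 123', 'age': '23', 'percent': '1.23'}

def old_style(r):
    # previous implementation: fresh dict per row, strip() on every str
    return dict((k, v.strip()) if isinstance(v, str) else (k, v)
                for k, v in r.items())

def new_style(r):
    # new implementation: in-place update, strip() only when needed
    for k, v in r.items():
        if v and isinstance(v, str) and (v[-1] in whitespace or v[0] in whitespace):
            r[k] = v.strip()
    return r

print('old:', timeit.timeit(lambda: old_style(row), number=1_000_000))
print('new:', timeit.timeit(lambda: new_style(row), number=1_000_000))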

setup.py (+1 -1)

@@ -23,7 +23,7 @@ def read(*paths):
 INSTALL_REQUIRES = [
     'dataflows-tabulator>=1.54.0',
     'datapackage>=1.15.4',
-    'tableschema>=1.20.10',
+    'tableschema>=1.20.11',
     'kvfile>=1.1.1',
     'click',
     'jinja2',
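
The only packaging change raises the tableschema floor by one patch release; presumably the faster cast path above relies on a fix or speedup shipped in 1.20.11. A quick environment check (not from the commit; assumes a plain X.Y.Z version string):

from importlib.metadata import version  # stdlib since Python 3.8

installed = tuple(int(p) for p in version('tableschema').split('.')[:3])
assert installed >= (1, 20, 11), f'tableschema too old: {installed}'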

tests/test_lib.py (+29)

@@ -2523,3 +2523,32 @@ def test_dump_to_json_objects():
     for x in ['a', 'b']:
         assert res[50][x] == data[50][x]
         assert res[50]['c'][x] == data[50][x]
+
+
+def aux_profile(filename, fast=False):
+    from dataflows import Flow, load, schema_validator
+    return Flow(
+        load(filename, cast_strategy=load.CAST_WITH_SCHEMA),
+    ).results(on_error=None if fast else schema_validator.raise_exception)[0][0]
+
+@pytest.mark.parametrize('fast', [False, True])
+def test_profile(fast):
+    import csv
+    import tempfile
+    import os
+    from decimal import Decimal
+
+    NUM = 1000000
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        filename = os.path.join(tmpdirname, 'test.csv')
+        with open(filename, 'w') as f:
+            writer = csv.writer(f)
+            writer.writerow(['id', 'name', 'age', 'percent'])
+            for i in range(NUM):
+                writer.writerow([str(i), 'name is ' + str(i), i % 100, i / 100])
+        res = aux_profile(filename, fast)
+        for i in range(NUM):
+            assert res[i]['id'] == i
+            assert res[i]['name'] == 'name is ' + str(i)
+            assert res[i]['age'] == i % 100
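
The new test doubles as a profiling harness: it writes a million-row CSV, then loads it once through the validated path (fast=False) and once through the new fast path (fast=True), asserting identical results either way. To see the actual speed difference rather than just correctness, one could time the two calls, e.g. (illustrative, not part of the commit):

import time

t0 = time.perf_counter()
aux_profile(filename, fast=False)  # validated path
t1 = time.perf_counter()
aux_profile(filename, fast=True)   # fast path added in v0.5.5
t2 = time.perf_counter()
print(f'validated: {t1 - t0:.2f}s, fast: {t2 - t1:.2f}s')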
