Skip to content

Commit 99f5215

Browse files
authored
Join on row number (#132)
* Implemented POC * Bump tabulator to fix tests * Upgrade to [email protected] * Fixed linting/restart Travis * Added to processors.md * Added an example
1 parent 3bdc37f commit 99f5215

File tree

7 files changed

+113
-8
lines changed

7 files changed

+113
-8
lines changed

PROCESSORS.md

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -779,7 +779,7 @@ def join_with_self(resource_name, join_key, fields):
779779
- `source_name` - name of the _source_ resource
780780
- `source_key` - One of
781781
- List of field names which should be used as the lookup key
782-
- String, which would be interpreted as a Python format string used to form the key (e.g. `{<field_name_1>}:{field_name_2}`)
782+
- String, which would be interpreted as a Python format string used to form the key (e.g. `{<field_name_1>}:{<field_name_2>}`). It's possible to use `#` as a special field name to include a row number (starting from the first row after the headers row) e.g. `{#}:{<field_name_2>}`.
783783
- `source_delete` - delete source from data-package after joining (`True` by default)
784784

785785
- `target_name` - name of the _target_ resource to hold the joined data.
@@ -934,3 +934,34 @@ age|first_name |last_name |the_house
934934
27|Tyrion |Lannister |Lannister
935935
5|Rickon |Stark |Stark
936936
16|Daenerys |Targaryen |Targaryen
937+
938+
*Joining using row numbers*:
939+
`source`:
940+
| values |
941+
|--------|
942+
| value1 |
943+
| value2 |
944+
945+
`target`:
946+
| id | names |
947+
|----|-------|
948+
| 01 | name1 |
949+
| 02 | name2 |
950+
951+
```python
952+
Flow(#...
953+
join(
954+
source_name='source',
955+
source_key=['#'],
956+
target_name='target',
957+
target_key=['#'],
958+
fields={'values': {'name': 'values'}}
959+
),
960+
)
961+
```
962+
963+
Output:
964+
| id | names | values |
965+
|----|-------|--------|
966+
| 01 | name1 | value1 |
967+
| 02 | name2 | value2 |

data/cities_comment.csv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
city,comment
2+
paris,city with population in row 2
3+
london,city with population in row 1
4+
rome,city with population in row 3

data/names.csv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
id,names
2+
01,name1
3+
02,name2

data/values.csv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
values
2+
value1
3+
value2

dataflows/processors/join.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ def __init__(self, key_spec):
2121
self.key_spec = key_spec
2222
self.key_list = key_list
2323

24-
def __call__(self, row):
25-
return self.key_spec.format(**row)
24+
def __call__(self, row, row_number):
25+
return self.key_spec.format(**{**row, '#': row_number})
2626

2727

2828
# Aggregator helpers
@@ -183,8 +183,8 @@ def join_aux(source_name, source_key, source_delete, # noqa: C901
183183

184184
# Indexes the source data
185185
def indexer(resource):
186-
for row in resource:
187-
key = source_key(row)
186+
for row_number, row in enumerate(resource, start=1):
187+
key = source_key(row, row_number)
188188
try:
189189
current = db.get(key)
190190
except KeyError:
@@ -223,8 +223,8 @@ def process_target(resource):
223223
))
224224
yield row
225225
else:
226-
for row in resource:
227-
key = target_key(row)
226+
for row_number, row in enumerate(resource, start=1):
227+
key = target_key(row, row_number)
228228
try:
229229
extra = create_extra_by_key(key)
230230
db_keys_usage.set(key, True)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def read(*paths):
2121
PACKAGE = 'dataflows'
2222
NAME = PACKAGE.replace('_', '-')
2323
INSTALL_REQUIRES = [
24-
'tabulator>=1.23.0',
24+
'tabulator>=1.38.4',
2525
'datapackage>=1.5.0',
2626
'tableschema>=1.5',
2727
'kvfile>=0.0.8',

tests/test_lib.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1420,6 +1420,7 @@ def test_load_override_schema_and_fields():
14201420
{'name': None, 'age': '22'},
14211421
]]
14221422

1423+
14231424
def test_delete_fields_regex():
14241425
from dataflows import load, delete_fields
14251426
flow = Flow(
@@ -1433,6 +1434,7 @@ def test_delete_fields_regex():
14331434
{'city': 'rome'},
14341435
]]
14351436

1437+
14361438
def test_join_full_outer():
14371439
from dataflows import load, set_type, join
14381440
flow = Flow(
@@ -1456,6 +1458,68 @@ def test_join_full_outer():
14561458
]]
14571459

14581460

1461+
def test_join_row_number():
1462+
from dataflows import load, set_type, join
1463+
flow = Flow(
1464+
load('data/population.csv'),
1465+
load('data/cities.csv'),
1466+
join(
1467+
source_name='population',
1468+
source_key=['#'],
1469+
target_name='cities',
1470+
target_key=['#'],
1471+
fields={'population': {'name': 'population'}}
1472+
),
1473+
)
1474+
data = flow.results()[0]
1475+
assert data == [[
1476+
{'id': 1, 'city': 'london', 'population': 8},
1477+
{'id': 2, 'city': 'paris', 'population': 2},
1478+
{'id': 3, 'city': 'rome', 'population': 3},
1479+
]]
1480+
1481+
1482+
def test_join_row_number_readme_example():
1483+
from dataflows import load, set_type, join
1484+
flow = Flow(
1485+
load('data/values.csv'),
1486+
load('data/names.csv'),
1487+
join(
1488+
source_name='values',
1489+
source_key=['#'],
1490+
target_name='names',
1491+
target_key=['#'],
1492+
fields={'values': {'name': 'values'}}
1493+
),
1494+
)
1495+
data = flow.results()[0]
1496+
assert data == [[
1497+
{'id': 1, 'names': 'name1', 'values': 'value1'},
1498+
{'id': 2, 'names': 'name2', 'values': 'value2'},
1499+
]]
1500+
1501+
1502+
def test_join_row_number_format_string():
1503+
from dataflows import load, set_type, join
1504+
flow = Flow(
1505+
load('data/population.csv'),
1506+
load('data/cities_comment.csv'),
1507+
join(
1508+
source_name='population',
1509+
source_key='city with population in row {#}',
1510+
target_name='cities_comment',
1511+
target_key='{comment}',
1512+
fields={'population': {'name': 'population'}}
1513+
),
1514+
)
1515+
data = flow.results()[0]
1516+
assert data == [[
1517+
{'city': 'paris', 'population': 2, 'comment': 'city with population in row 2'},
1518+
{'city': 'london', 'population': 8, 'comment': 'city with population in row 1'},
1519+
{'city': 'rome', 'population': 3, 'comment': 'city with population in row 3'},
1520+
]]
1521+
1522+
14591523
def test_load_duplicate_headers():
14601524
from dataflows import load
14611525
flow = Flow(

0 commit comments

Comments
 (0)