forked from IDEMSInternational/rapidpro-flow-toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrowdatasheet.py
96 lines (84 loc) · 3.87 KB
/
rowdatasheet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import networkx as nx
import tablib
class RowDataSheet:
    """Exports a list of row models to a spreadsheet through tablib."""

    def __init__(self, row_parser, rows, target_headers=None, excluded_headers=None):
        """
        Class to export list of models to spreadsheet.

        Note: If the model supports remapping where multiple fields of different
        types are mapped to the same column header, that column header needs to
        be specified in target_headers.

        Args:
            row_parser (RowParser): parser to use for converting rows to flat dicts.
            rows (list[ParserModel]): a list of RowModel instances
            target_headers (set[str] | None): Complex type fields (ParserModels,
                lists, dicts) whose content should be represented in the output
                as single columns. A trailing asterisk may be used to specify
                multiple fields at once, such as `list.*` and `field.*`.
                Defaults to an empty set.
            excluded_headers (set[str] | None): Fields to exclude from the
                output. Same format as target_headers. Defaults to an empty set.
        """
        self.row_parser = row_parser
        self.rows = rows
        # None sentinels replace the former `set()` defaults: a default set is
        # created once at definition time and would be shared (and mutable)
        # across every instance constructed without explicit arguments.
        self.target_headers = set() if target_headers is None else target_headers
        self.excluded_headers = (
            set() if excluded_headers is None else excluded_headers
        )

    def export(self, filename, file_format="csv"):
        """
        Export a list of RowModel instances to file.

        Args:
            filename: destination filename
            file_format: Export file format.
                Supported file formats as supported by tablib,
                see https://tablib.readthedocs.io/en/stable/formats.html
        """
        exported_data = self.convert_to_tablib().export(file_format)
        if isinstance(exported_data, str):
            # newline="" writes the exported text verbatim. Without it, the
            # "\r\n" row terminators of CSV output would be translated to
            # "\r\r\n" on Windows, producing blank lines between rows.
            with open(filename, "w", encoding="utf-8", newline="") as f:
                f.write(exported_data)
        else:
            # Binary formats are returned as bytes and written unchanged.
            with open(filename, "wb") as f:
                f.write(exported_data)

    def convert_to_tablib(self):
        """
        Convert a list of RowModel instances to tablib.Dataset.

        Return:
            A tablib.Dataset representation of the data.
        """
        # Unparse each row only once and reuse the result for both the header
        # computation and the cell values.
        row_dicts = self._unparse_rows()
        headers = self._get_headers(row_dicts)
        data = tablib.Dataset()
        data.headers = headers
        # Iterate the local list rather than data.headers: tablib stores an
        # empty header list as None, which cannot be iterated.
        for row_dict in row_dicts:
            data.append([row_dict.get(header, "") for header in headers])
        return data

    def _unparse_rows(self):
        """Convert every row into a flat dict using the row parser."""
        return [
            self.row_parser.unparse_row(
                row, self.target_headers, self.excluded_headers
            )
            for row in self.rows
        ]

    def _get_headers(self, row_dicts=None):
        """
        Get an ordered list of column headers.

        Each row contains a subset of the final set of column headers.
        These subsets need to be merged while respecting the relative order
        within each row. Note: The resulting set of headers is unique,
        however, their order is not guaranteed to be unique.

        TODO: A better approach would be to use the DataModel of the rows
        to uniquely infer the order of the headers.

        Args:
            row_dicts: Optional list of already unparsed rows (flat dicts).
                If omitted, the rows are unparsed with the row parser.

        Return:
            A list of strings representing the column headers of the sheet.

        Raises:
            ValueError: If the header orders of different rows contradict
                each other.
        """
        if row_dicts is None:
            row_dicts = self._unparse_rows()

        # Create a graph (representing a poset) whose nodes are the column headers,
        # and whose edges A -> B represent that column header A should come before
        # column header B.
        header_graph = nx.DiGraph()
        for row_dict in row_dicts:
            previous = None
            for header in row_dict:
                # Register the header even if it has no neighbour, so that a
                # header occurring only in single-column rows is not lost.
                header_graph.add_node(header)
                # For each pair of consecutive headers in this row, add an edge.
                # Compare against None so an empty-string header still counts
                # as a predecessor.
                if previous is not None:
                    header_graph.add_edge(previous, header)
                previous = header

        # We now get a linear order of our headers from this poset graph
        # by doing a topological sort.
        try:
            return list(nx.topological_sort(header_graph))
        except nx.exception.NetworkXUnfeasible as err:
            raise ValueError(
                "Inconsistent ordering of headers in provided rows."
            ) from err