-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
PERF: Cythonize from_nested_dict
#33485
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
6d44e55
78ae5ab
61c841d
1c7ffbb
94977b2
bcb25b9
b3ebae6
7dfbca8
c8e515f
4829f78
5faa02b
d349b4f
4f8afed
babaf64
e22fbda
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2526,3 +2526,20 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): | |
output[i] = default | ||
|
||
return maybe_convert_objects(output) | ||
|
||
|
||
@cython.wraparound(False) | ||
@cython.boundscheck(False) | ||
def from_nested_dict(dict data) -> dict: | ||
cdef: | ||
dict new_data = {} | ||
object index, column, value, dict_or_series | ||
|
||
for index, dict_or_series in data.items(): | ||
for column, value in dict_or_series.items(): | ||
if column in new_data: | ||
new_data[column].update(dict([(index, value)])) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe just do the loop more explicit: for index, dict_or_series in data.items():
for column, value in dict_or_series.items():
if column not in new_data:
new_data[column] = {index: value}
else:
new_data[column][index] = value ? Aparty from that I got no more comments and LGTM. |
||
else: | ||
new_data.setdefault(column, dict([(index, value)])) | ||
|
||
return new_data |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1265,8 +1265,13 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra | |
if orient == "index": | ||
if len(data) > 0: | ||
# TODO speed up Series case | ||
if isinstance(list(data.values())[0], (Series, dict)): | ||
data = _from_nested_dict(data) | ||
first_val = next(iter((data.values())), None) | ||
if isinstance(first_val, (Series, dict)): | ||
# If we are dealing with not a builtin dict, | ||
# `collections.defaultdict` for example, we need to convert it | ||
# to a regular dict so Cython will not raise. | ||
data = dict(data) if not type(data) is dict else data | ||
data = lib.from_nested_dict(data) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you change the above to data = dict(data) if not type(data) is dict else data # convert OrderedDict
data = lib.from_nested_dict(data) you can change the function interface in |
||
else: | ||
data, index = list(data.values()), list(data.keys()) | ||
elif orient == "columns": | ||
|
@@ -8817,12 +8822,3 @@ def isin(self, values) -> "DataFrame": | |
|
||
ops.add_flex_arithmetic_methods(DataFrame) | ||
ops.add_special_arithmetic_methods(DataFrame) | ||
|
||
|
||
def _from_nested_dict(data): | ||
# TODO: this should be seriously cythonized | ||
new_data = collections.defaultdict(dict) | ||
for index, s in data.items(): | ||
for col, v in s.items(): | ||
new_data[col][index] = v | ||
return new_data |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you keep with using the defaultdict? Should be more performant as it is implemented already in C
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, one thing I do have a concern and it's the return type, if it's a
defaultdict
, the return type must be anobject
, unless we return it as adict
, e.gThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does it make a difference whether typed as object or dict? I think both are just mapped to PyObject anyway so maybe doesn’t matter?