From 05141f091b27c2bce3772eb66eca7710b3fb67ec Mon Sep 17 00:00:00 2001 From: Ben Lindsay Date: Wed, 9 Sep 2020 23:28:57 -0500 Subject: [PATCH 1/3] initial addition of create_ecdf --- .../plotly/plotly/figure_factory/__init__.py | 4 + .../plotly/plotly/figure_factory/_ecdf.py | 146 ++++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 packages/python/plotly/plotly/figure_factory/_ecdf.py diff --git a/packages/python/plotly/plotly/figure_factory/__init__.py b/packages/python/plotly/plotly/figure_factory/__init__.py index 0a41dca1ba2..13cb7e335a1 100644 --- a/packages/python/plotly/plotly/figure_factory/__init__.py +++ b/packages/python/plotly/plotly/figure_factory/__init__.py @@ -29,12 +29,16 @@ if optional_imports.get_module("pandas") is not None: from plotly.figure_factory._county_choropleth import create_choropleth + from plotly.figure_factory._ecdf import create_ecdf from plotly.figure_factory._hexbin_mapbox import create_hexbin_mapbox else: def create_choropleth(*args, **kwargs): raise ImportError("Please install pandas to use `create_choropleth`") + def create_ecdf(*args, **kwargs): + raise ImportError("Please install pandas to use `create_ecdf`") + def create_hexbin_mapbox(*args, **kwargs): raise ImportError("Please install pandas to use `create_hexbin_mapbox`") diff --git a/packages/python/plotly/plotly/figure_factory/_ecdf.py b/packages/python/plotly/plotly/figure_factory/_ecdf.py new file mode 100644 index 00000000000..17031f0b406 --- /dev/null +++ b/packages/python/plotly/plotly/figure_factory/_ecdf.py @@ -0,0 +1,146 @@ +import pandas as pd +import plotly.express as px + + +def create_ecdf( + df: pd.DataFrame, + x: str, + weight: str = None, + color: str = None, + facet_col: str = None, + facet_row: str = None, + animation_frame: str = None, + normalized: bool = True, + mode: str = "lines+markers", + line_shape: str = "hv", + **kwargs, +): + """ + Returns figure for an ECDF plot. + + :param (pd.DataFrame) df: the input data to plot. 
+ :param (str) x: the name of the column containing the values whose + distribution you're trying to plot. + :param (str, optional) weight: the name of the column with weights to be + applied to the values + :param (str, optional) color: the name of the column to be used to split + values into color groups + :param (str, optional) facet_col: the name of the column to be used to + split values into facet column groups + :param (str, optional) facet_row: the name of the column to be used to + split values into facet row groups + :param (str, optional) animation_frame: the name of the column to be used + to split values into animation frame groups + :param (bool, optional) normalized: if `True`, ECDF ranges from 0 to 1. + Otherwise, ECDF ranges from 0 to the number of points in the + distribution, or the sum of the weight column in the distribution if + `weight` is defined. + :param (str, optional) mode: mode option to be used in traces. + Permitted options are any combination of "lines", "markers", and "text" + joined with "+" character + :param (str, optional) line_shape: the shape of the line to be used if + `mode` contains "lines". Permitted options are "linear", "spline", + "hv", "vh", "hvh", and "vhv". + + Example 1: Normal distribution ECDFs grouped and colored by category + ''' + >>> import pandas as pd + >>> import plotly.figure_factory as ff + >>> import numpy as np + + >>> df = pd.DataFrame( + ... { + ... "category": ["a"] * 100 + ["b"] * 30, + ... "value": np.concatenate( + ... [ + ... np.random.normal(0, size=100), + ... np.random.normal(5, size=30), + ... ], + ... ), + ... } + ... ) + + >>> fig = ff.create_ecdf(df, x="value", color="category") + >>> fig.show() + ''' + + Example 2: Animated, weighted ECDF + ''' + >>> import plotly.express as px + >>> import plotly.figure_factory as ff + + >>> df = px.data.gapminder() + >>> fig = ff.create_ecdf( + ... df, + ... x="pop", + ... color="continent", + ... hover_data=["continent", "country"], + ... 
animation_frame="year", + ... normalized=False, + ... range_x=[-50_000_000, 1_400_000_000], + ... range_y=[-100_000_000, 5_000_000_000], + ... weight="pop", + ... ) + + >>> fig.show() + ''' + """ + df = df.sort_values(x, ascending=True).copy() + + if weight is None: + weight_col = "_weight_" + df[weight_col] = 1 + else: + weight_col = weight + + groupby_cols = [] + if color is not None: + groupby_cols.append(color) + if facet_col is not None: + groupby_cols.append(facet_col) + if facet_row is not None: + groupby_cols.append(facet_row) + if animation_frame is not None: + groupby_cols.append(animation_frame) + + if normalized: + y = "Normalized " + else: + y = "Absolute " + + if weight is None: + y += "ECDF" + else: + y += f"{weight}-Weighted ECDF" + + if len(groupby_cols) == 0: + groupby_cols = ["_dum_"] + df["_dum_"] = 1 + + df_list = [] + for _, group in df.groupby(groupby_cols): + group = group.copy() + group[y] = group[weight_col].cumsum() + if normalized: + group[y] /= group[weight_col].sum() + df_list.append(group) + df = pd.concat(df_list, ignore_index=True) + + fig = px.line( + df, + x=x, + y=y, + color=color, + facet_col=facet_col, + facet_row=facet_row, + animation_frame=animation_frame, + **kwargs, + ) + + if animation_frame is not None: + for f in fig.frames: + for d in f.data: + d.mode = mode + fig.update_traces(line_shape=line_shape, mode=mode) + + return fig From d54be6a60c7eff2e5ebd8463bb7676016158f2fb Mon Sep 17 00:00:00 2001 From: Ben Lindsay Date: Sun, 13 Sep 2020 23:12:26 -0500 Subject: [PATCH 2/3] add ability to create ecdfs with array-like --- .../plotly/plotly/figure_factory/_ecdf.py | 279 +++++++++++++----- 1 file changed, 209 insertions(+), 70 deletions(-) diff --git a/packages/python/plotly/plotly/figure_factory/_ecdf.py b/packages/python/plotly/plotly/figure_factory/_ecdf.py index 17031f0b406..dcc302a45bd 100644 --- a/packages/python/plotly/plotly/figure_factory/_ecdf.py +++ b/packages/python/plotly/plotly/figure_factory/_ecdf.py @@ -3,42 
+3,89 @@ def create_ecdf( - df: pd.DataFrame, - x: str, - weight: str = None, - color: str = None, - facet_col: str = None, - facet_row: str = None, - animation_frame: str = None, - normalized: bool = True, - mode: str = "lines+markers", - line_shape: str = "hv", + data_frame=None, + x=None, + weight=None, + color=None, + line_dash=None, + line_group=None, + facet_row=None, + facet_col=None, + animation_frame=None, + animation_group=None, + hover_name=None, + hover_data=None, + normalized=True, + reverse_ecdf=False, + mode="lines+markers", + line_shape="hv", **kwargs, ): """ Returns figure for an ECDF plot. - :param (pd.DataFrame) df: the input data to plot. - :param (str) x: the name of the column containing the values whose - distribution you're trying to plot. - :param (str, optional) weight: the name of the column with weights to be - applied to the values - :param (str, optional) color: the name of the column to be used to split - values into color groups - :param (str, optional) facet_col: the name of the column to be used to - split values into facet column groups - :param (str, optional) facet_row: the name of the column to be used to - split values into facet row groups - :param (str, optional) animation_frame: the name of the column to be used - to split values into animation frame groups - :param (bool, optional) normalized: if `True`, ECDF ranges from 0 to 1. + :param (DataFrame or dict) data_frame: This argument needs to be passed + for column names (and not keyword names) to be used. Array-like and + dict are transformed internally to a pandas DataFrame. Optional: if + missing, a DataFrame gets constructed under the hood using the other + arguments. + :param (str or int or array-like) x: Either a name of a column in + `data_frame`, or a pandas Series or array_like object. Values from this + column or array_like are used to position marks along the x axis in + cartesian coordinates and correspond to the distribution that will be + plotted. 
+ :param (str or int or array-like, optional) weight: Either a name of a + column in `data_frame`, or a pandas Series or array_like object. Values + from this column or array_like are used to weight `x` values. If + `normalized` is `False`, the resulting `y` values correspond to the + cumulative sum of the `weight` values in each group. If `normalized` is + `True`, the `y` values are scaled to be in range (0, 1]. Optional: if + missing, all `x` values are given a weight of 1. + :param (str or int or array-like, optional) color: Either a name of a + column in data_frame, or a pandas Series or array_like object. Values + from this column or array_like are used to assign color to marks. + :param (str or int or array-like, optional) line_dash: Either a name of a + column in `data_frame`, or a pandas Series or array-like object. Values + from this column or array-like are used to assign dash-patterns to + lines. + :param (str or int or array-like, optional) line_group: Either a name of a + column in `data_frame`, or a pandas Series or array_like object. Values + from this column or array-like are used to group rows of data_frame + into lines. + :param (str or int or array-like, optional) facet_row: Either a name of a + column in data_frame, or a pandas Series or array_like object. Values + from this column or array-like are used to assign marks to facetted + subplots in the vertical direction. + :param (str or int or array-like, optional) facet_col: Either a name of a + column in data_frame, or a pandas Series or array_like object. Values + from this column or array_like are used to assign marks to facetted + subplots in the horizontal direction. + :param (str or int or array-like, optional) animation_frame: Either a name + of a column in `data_frame`, or a pandas Series or array-like object. + Values from this column or array-like are used to assign marks to + animation frames. 
+ :param (str or int or array-like, optional) animation_group: Either a name + of a column in `data_frame`, or a pandas Series or array-like object. + Values from this column or array-like are used to provide + object-constancy across animation frames: rows with matching + `animation_group`s will be treated as if they describe the same object + in each frame. + :param (str or int or array-like, optional) hover_name: Either a name of a + column in `data_frame`, or a pandas Series or array-like object. Values + from this column or array-like appear in bold in the hover tooltip. + :param (list of str or int, or array-like, optional) hover_data: Either a + list of `data_frame` column names or array-likes shown as extra hover data. + :param (bool) normalized: If `True`, ECDF ranges from 0 to 1. Otherwise, ECDF ranges from 0 to the number of points in the distribution, or the sum of the weight column in the distribution if - `weight` is defined. - :param (str, optional) mode: mode option to be used in traces. + `weight` is defined. Defaults to `True`. + :param (bool) reverse_ecdf: If `True`, ECDF values increase with decreasing + `x` values rather than the default of increasing with increasing `x` + values. + :param (str) mode: Mode option to be used in traces. Permitted options are any combination of "lines", "markers", and "text" - joined with "+" character - :param (str, optional) line_shape: the shape of the line to be used if + joined with "+" character. + :param (str) line_shape: the shape of the line to be used if `mode` contains "lines". Permitted options are "linear", "spline", "hv", "vh", "hvh", and "vhv". 
@@ -85,62 +132,154 @@ def create_ecdf( >>> fig.show() ''' """ - df = df.sort_values(x, ascending=True).copy() - - if weight is None: - weight_col = "_weight_" - df[weight_col] = 1 - else: - weight_col = weight - - groupby_cols = [] - if color is not None: - groupby_cols.append(color) - if facet_col is not None: - groupby_cols.append(facet_col) - if facet_row is not None: - groupby_cols.append(facet_row) - if animation_frame is not None: - groupby_cols.append(animation_frame) - - if normalized: - y = "Normalized " - else: - y = "Absolute " + col_dict = dict( + x=x, + weight=weight, + color=color, + line_group=line_group, + line_dash=line_dash, + facet_row=facet_row, + facet_col=facet_col, + animation_frame=animation_frame, + animation_group=animation_group, + hover_name=hover_name, + hover_data=hover_data, + ) + col_dict, data_frame = _prep_col_dict_and_data_frame( + col_dict, data_frame, normalized + ) - if weight is None: - y += "ECDF" - else: - y += f"{weight}-Weighted ECDF" + groupby_col_types = [ + "color", + "line_group", + "line_dash", + "facet_row", + "facet_col", + "animation_frame", + "animation_group", + ] + groupby_cols = [ + col_name + for col_type, col_name in col_dict.items() + if col_type in groupby_col_types + ] if len(groupby_cols) == 0: groupby_cols = ["_dum_"] - df["_dum_"] = 1 + data_frame["_dum_"] = 1 + + ascending = not reverse_ecdf + data_frame = data_frame.sort_values(col_dict["x"], ascending=ascending).copy() df_list = [] - for _, group in df.groupby(groupby_cols): + for _, group in data_frame.groupby(groupby_cols): group = group.copy() - group[y] = group[weight_col].cumsum() + group[col_dict["y"]] = group[col_dict["weight"]].cumsum() if normalized: - group[y] /= group[weight_col].sum() + group[col_dict["y"]] /= group[col_dict["weight"]].sum() df_list.append(group) - df = pd.concat(df_list, ignore_index=True) + data_frame = pd.concat(df_list, ignore_index=True) fig = px.line( - df, - x=x, - y=y, - color=color, - facet_col=facet_col, - 
facet_row=facet_row, - animation_frame=animation_frame, + data_frame, + x=col_dict.get("x"), + y=col_dict.get("y"), + color=col_dict.get("color"), + line_group=col_dict.get("line_group"), + line_dash=col_dict.get("line_dash"), + facet_row=col_dict.get("facet_row"), + facet_col=col_dict.get("facet_col"), + animation_frame=col_dict.get("animation_frame"), + animation_group=col_dict.get("animation_group"), + hover_name=col_dict.get("hover_name"), + hover_data=col_dict.get("hover_data"), **kwargs, ) - if animation_frame is not None: - for f in fig.frames: - for d in f.data: - d.mode = mode + if "animation_frame" in col_dict: + for frame in fig.frames: + for trace in frame.data: + trace.mode = mode fig.update_traces(line_shape=line_shape, mode=mode) return fig + + +def _prep_col_dict_and_data_frame(col_dict, data_frame, normalized): + """Prepare col_dict and data_frame prior to computing ECDF""" + col_dict = { + col_type: col_value + for col_type, col_value in col_dict.items() + if col_value is not None + } + if data_frame is None: + col_dict, data_frame = _handle_hover_data(col_dict) + + elif isinstance(data_frame, dict): + data_frame = pd.DataFrame(data_frame) + + if not isinstance(data_frame, pd.DataFrame): + raise TypeError("data_frame must be of type pd.DataFrame or dict") + for col_type, col_name in col_dict.items(): + if col_type == "hover_data": + for name in col_name: + _error_if_column_not_found(data_frame, col_type, name) + else: + _error_if_column_not_found(data_frame, col_type, col_name) + + col_dict["y"] = "" + if not normalized: + col_dict["y"] += "Unnormalized " + + if "weight" in col_dict: + col_dict["y"] += "{}-Weighted ".format(col_dict["weight"]) + else: + col_dict["weight"] = "_weight_" + data_frame[col_dict["weight"]] = 1 + + col_dict["y"] += "ECDF" + + return col_dict, data_frame + + +def _error_if_column_not_found(data_frame, col_type, col_name): + if col_name not in data_frame.columns: + raise ValueError( + "{} column '{}' not found in 
data_frame".format(col_type, col_name) + ) + + +def _handle_hover_data(col_dict): + """Convert col_dict with data into a data_frame, handling the complexities + of hover_data. If `"hover_data"` is part of `col_dict`, add the data as + new columns in `data_frame`. Return a new `col_dict` that maps column types + to column names, with `col_dict["hover_data"]` containing a list of + `hover_data` column names, as well as the `data_frame`""" + + if "hover_data" in col_dict: + hover_data = col_dict.pop("hover_data") + new_hover_data = [] + if isinstance(hover_data, dict): + for col_name, col_data in hover_data.items(): + if col_name in col_dict: + raise ValueError( + str(col_name) + + " from hover_data is already represented." + + " Choose a different column name." + ) + col_dict[col_name] = col_data + new_hover_data.append(col_name) + else: + for i, d in enumerate(hover_data): + col_name = "hover_data_" + str(i) + col_dict[col_name] = d + new_hover_data.append(col_name) + else: + hover_data = None + + data_frame = pd.DataFrame(col_dict) + col_dict = {col_type: col_type for col_type in col_dict.keys()} + + if hover_data is not None: + col_dict["hover_data"] = new_hover_data + return col_dict, data_frame From 810279734539e9e8f912d9c475d2a756df67b1ed Mon Sep 17 00:00:00 2001 From: Ben Lindsay Date: Mon, 14 Sep 2020 22:03:00 -0500 Subject: [PATCH 3/3] get rid of stray marks in ecdf docstring examples --- packages/python/plotly/plotly/figure_factory/_ecdf.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/packages/python/plotly/plotly/figure_factory/_ecdf.py b/packages/python/plotly/plotly/figure_factory/_ecdf.py index dcc302a45bd..b6fabbd333f 100644 --- a/packages/python/plotly/plotly/figure_factory/_ecdf.py +++ b/packages/python/plotly/plotly/figure_factory/_ecdf.py @@ -90,7 +90,7 @@ def create_ecdf( "hv", "vh", "hvh", and "vhv". 
Example 1: Normal distribution ECDFs grouped and colored by category - ''' + >>> import pandas as pd >>> import plotly.figure_factory as ff >>> import numpy as np @@ -106,13 +106,12 @@ def create_ecdf( ... ), ... } ... ) - >>> fig = ff.create_ecdf(df, x="value", color="category") >>> fig.show() - ''' + Example 2: Animated, weighted ECDF - ''' + >>> import plotly.express as px >>> import plotly.figure_factory as ff @@ -128,9 +127,7 @@ def create_ecdf( ... range_y=[-100_000_000, 5_000_000_000], ... weight="pop", ... ) - >>> fig.show() - ''' """ col_dict = dict( x=x,