From e8942ed194860f4c4aec5a63d70f4da4e8bf2df9 Mon Sep 17 00:00:00 2001 From: anomrake Date: Thu, 24 Apr 2014 11:26:06 -0400 Subject: [PATCH 1/2] BUG: fix handling of color argument for variety of plotting functions parallel_coordinates - fix reordering of class column (from set) causing possible color/class mismatch - deprecated use of argument colors in favor of color radviz - fix reordering of class column (from set) causing possible color/class mismatch - added explicit color keyword argument (avoids multiple values 'color' being passed to plotting method) andrews_curves - added explicit color keyword argument (avoids multiple values 'color' being passed to plotting method) --- doc/source/release.rst | 4 ++ pandas/tests/test_graphics.py | 50 +++++++++++-- pandas/tools/plotting.py | 131 +++++++++++++++++----------------- 3 files changed, 114 insertions(+), 71 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index a6aa842940bc0..88833f48f1659 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -229,6 +229,10 @@ Deprecations returned if possible, otherwise a copy will be made. Previously the user could think that ``copy=False`` would ALWAYS return a view. (:issue:`6894`) +- The :func:`parallel_coordinates` function now takes argument ``color`` + instead of ``colors``. 
A ``FutureWarning`` is raised to alert that + the old ``colors`` argument will not be supported in a future release + Prior Version Deprecations/Changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 829b2b296155f..629c011b4dbde 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -1220,11 +1220,26 @@ def scat2(x, y, by=None, ax=None, figsize=None): def test_andrews_curves(self): from pandas import read_csv from pandas.tools.plotting import andrews_curves - + from matplotlib import cm + path = os.path.join(curpath(), 'data', 'iris.csv') df = read_csv(path) _check_plot_works(andrews_curves, df, 'Name') + _check_plot_works(andrews_curves, df, 'Name', + color=('#556270', '#4ECDC4', '#C7F464')) + _check_plot_works(andrews_curves, df, 'Name', + color=['dodgerblue', 'aquamarine', 'seagreen']) + _check_plot_works(andrews_curves, df, 'Name', colormap=cm.jet) + + colors = ['b', 'g', 'r'] + df = DataFrame({"A": [1, 2, 3], + "B": [1, 2, 3], + "C": [1, 2, 3], + "Name": colors}) + ax = andrews_curves(df, 'Name', color=colors) + legend_colors = [l.get_color() for l in ax.legend().get_lines()] + self.assertEqual(colors, legend_colors) @slow def test_parallel_coordinates(self): @@ -1235,13 +1250,9 @@ def test_parallel_coordinates(self): df = read_csv(path) _check_plot_works(parallel_coordinates, df, 'Name') _check_plot_works(parallel_coordinates, df, 'Name', - colors=('#556270', '#4ECDC4', '#C7F464')) - _check_plot_works(parallel_coordinates, df, 'Name', - colors=['dodgerblue', 'aquamarine', 'seagreen']) + color=('#556270', '#4ECDC4', '#C7F464')) _check_plot_works(parallel_coordinates, df, 'Name', - colors=('#556270', '#4ECDC4', '#C7F464')) - _check_plot_works(parallel_coordinates, df, 'Name', - colors=['dodgerblue', 'aquamarine', 'seagreen']) + color=['dodgerblue', 'aquamarine', 'seagreen']) _check_plot_works(parallel_coordinates, df, 'Name', colormap=cm.jet) df = read_csv(path, 
header=None, skiprows=1, names=[1, 2, 4, 8, @@ -1249,6 +1260,15 @@ def test_parallel_coordinates(self): _check_plot_works(parallel_coordinates, df, 'Name', use_columns=True) _check_plot_works(parallel_coordinates, df, 'Name', xticks=[1, 5, 25, 125]) + + colors = ['b', 'g', 'r'] + df = DataFrame({"A": [1, 2, 3], + "B": [1, 2, 3], + "C": [1, 2, 3], + "Name": colors}) + ax = parallel_coordinates(df, 'Name', color=colors) + legend_colors = [l.get_color() for l in ax.legend().get_lines()] + self.assertEqual(colors, legend_colors) @slow def test_radviz(self): @@ -1259,8 +1279,24 @@ def test_radviz(self): path = os.path.join(curpath(), 'data', 'iris.csv') df = read_csv(path) _check_plot_works(radviz, df, 'Name') + _check_plot_works(radviz, df, 'Name', + color=('#556270', '#4ECDC4', '#C7F464')) + _check_plot_works(radviz, df, 'Name', + color=['dodgerblue', 'aquamarine', 'seagreen']) _check_plot_works(radviz, df, 'Name', colormap=cm.jet) + colors = [[0., 0., 1., 1.], + [0., 0.5, 1., 1.], + [1., 0., 0., 1.]] + df = DataFrame({"A": [1, 2, 3], + "B": [2, 1, 3], + "C": [3, 2, 1], + "Name": ['b', 'g', 'r']}) + ax = radviz(df, 'Name', color=colors) + legend_colors = [c.get_facecolor().squeeze().tolist() + for c in ax.collections] + self.assertEqual(colors, legend_colors) + @slow def test_plot_int_columns(self): df = DataFrame(randn(100, 4)).cumsum() diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 4453b1db359e9..a7628f759132f 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -8,7 +8,7 @@ import numpy as np -from pandas.util.decorators import cache_readonly +from pandas.util.decorators import cache_readonly, deprecate_kwarg import pandas.core.common as com from pandas.core.index import MultiIndex from pandas.core.series import Series, remove_na @@ -355,18 +355,22 @@ def _get_marker_compat(marker): return marker -def radviz(frame, class_column, ax=None, colormap=None, **kwds): +def radviz(frame, class_column, ax=None, color=None, 
colormap=None, **kwds): """RadViz - a multivariate data visualization algorithm Parameters: ----------- - frame: DataFrame object - class_column: Column name that contains information about class membership + frame: DataFrame + class_column: str + Column name containing class names ax: Matplotlib axis object, optional + color: list or tuple, optional + Colors to use for the different classes colormap : str or matplotlib colormap object, default None Colormap to select colors from. If string, load colormap with that name from matplotlib. - kwds: Matplotlib scatter method keyword arguments, optional + kwds: keywords + Options to pass to matplotlib scatter plotting method Returns: -------- @@ -380,44 +384,42 @@ def normalize(series): b = max(series) return (series - a) / (b - a) - column_names = [column_name for column_name in frame.columns - if column_name != class_column] - - df = frame[column_names].apply(normalize) + n = len(frame) + classes = frame[class_column].drop_duplicates() + class_col = frame[class_column] + df = frame.drop(class_column, axis=1).apply(normalize) if ax is None: ax = plt.gca(xlim=[-1, 1], ylim=[-1, 1]) - classes = set(frame[class_column]) to_plot = {} - colors = _get_standard_colors(num_colors=len(classes), colormap=colormap, - color_type='random', color=kwds.get('color')) + color_type='random', color=color) - for class_ in classes: - to_plot[class_] = [[], []] + for kls in classes: + to_plot[kls] = [[], []] n = len(frame.columns) - 1 s = np.array([(np.cos(t), np.sin(t)) for t in [2.0 * np.pi * (i / float(n)) for i in range(n)]]) - for i in range(len(frame)): - row = df.irow(i).values + for i in range(n): + row = df.iloc[i].values row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1) y = (s * row_).sum(axis=0) / row.sum() - class_name = frame[class_column].iget(i) - to_plot[class_name][0].append(y[0]) - to_plot[class_name][1].append(y[1]) + kls = class_col.iat[i] + to_plot[kls][0].append(y[0]) + to_plot[kls][1].append(y[1]) - for i, 
class_ in enumerate(classes): - ax.scatter(to_plot[class_][0], to_plot[class_][1], color=colors[i], - label=com.pprint_thing(class_), **kwds) + for i, kls in enumerate(classes): + ax.scatter(to_plot[kls][0], to_plot[kls][1], color=colors[i], + label=com.pprint_thing(kls), **kwds) ax.legend() ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor='none')) - for xy, name in zip(s, column_names): + for xy, name in zip(s, df.columns): ax.add_patch(patches.Circle(xy, radius=0.025, facecolor='gray')) @@ -438,20 +440,23 @@ def normalize(series): return ax -def andrews_curves(data, class_column, ax=None, samples=200, colormap=None, - **kwds): +def andrews_curves(frame, class_column, ax=None, samples=200, color=None, + colormap=None, **kwds): """ Parameters: ----------- - data : DataFrame + frame : DataFrame Data to be plotted, preferably normalized to (0.0, 1.0) class_column : Name of the column containing class names ax : matplotlib axes object, default None samples : Number of points to plot in each curve + color: list or tuple, optional + Colors to use for the different classes colormap : str or matplotlib colormap object, default None Colormap to select colors from. If string, load colormap with that name from matplotlib. 
- kwds : Optional plotting arguments to be passed to matplotlib + kwds: keywords + Options to pass to matplotlib plotting method Returns: -------- @@ -475,30 +480,31 @@ def f(x): return result return f - n = len(data) - class_col = data[class_column] - uniq_class = class_col.drop_duplicates() - columns = [data[col] for col in data.columns if (col != class_column)] + n = len(frame) + class_col = frame[class_column] + classes = frame[class_column].drop_duplicates() + df = frame.drop(class_column, axis=1) x = [-pi + 2.0 * pi * (t / float(samples)) for t in range(samples)] used_legends = set([]) - colors = _get_standard_colors(num_colors=len(uniq_class), colormap=colormap, - color_type='random', color=kwds.get('color')) - col_dict = dict([(klass, col) for klass, col in zip(uniq_class, colors)]) + color_values = _get_standard_colors(num_colors=len(classes), + colormap=colormap, color_type='random', + color=color) + colors = dict(zip(classes, color_values)) if ax is None: ax = plt.gca(xlim=(-pi, pi)) for i in range(n): - row = [columns[c][i] for c in range(len(columns))] + row = df.iloc[i].values f = function(row) y = [f(t) for t in x] - label = None - if com.pprint_thing(class_col[i]) not in used_legends: - label = com.pprint_thing(class_col[i]) + kls = class_col.iat[i] + label = com.pprint_thing(kls) + if label not in used_legends: used_legends.add(label) - ax.plot(x, y, color=col_dict[class_col[i]], label=label, **kwds) + ax.plot(x, y, color=colors[kls], label=label, **kwds) else: - ax.plot(x, y, color=col_dict[class_col[i]], **kwds) - + ax.plot(x, y, color=colors[kls], **kwds) + ax.legend(loc='upper right') ax.grid() return ax @@ -564,22 +570,22 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): plt.setp(axis.get_yticklabels(), fontsize=8) return fig - -def parallel_coordinates(data, class_column, cols=None, ax=None, colors=None, - use_columns=False, xticks=None, colormap=None, **kwds): +@deprecate_kwarg(old_arg_name='colors', 
new_arg_name='color') +def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, + use_columns=False, xticks=None, colormap=None, + **kwds): """Parallel coordinates plotting. Parameters ---------- - data: DataFrame - A DataFrame containing data to be plotted + frame: DataFrame class_column: str Column name containing class names cols: list, optional A list of column names to use ax: matplotlib.axis, optional matplotlib axis object - colors: list or tuple, optional + color: list or tuple, optional Colors to use for the different classes use_columns: bool, optional If true, columns will be used as xticks @@ -587,8 +593,8 @@ def parallel_coordinates(data, class_column, cols=None, ax=None, colors=None, A list of values to use for xticks colormap: str or matplotlib colormap, default None Colormap to use for line colors. - kwds: list, optional - A list of keywords for matplotlib plot method + kwds: keywords + Options to pass to matplotlib plotting method Returns ------- @@ -600,20 +606,19 @@ def parallel_coordinates(data, class_column, cols=None, ax=None, colors=None, >>> from pandas.tools.plotting import parallel_coordinates >>> from matplotlib import pyplot as plt >>> df = read_csv('https://raw.github.com/pydata/pandas/master/pandas/tests/data/iris.csv') - >>> parallel_coordinates(df, 'Name', colors=('#556270', '#4ECDC4', '#C7F464')) + >>> parallel_coordinates(df, 'Name', color=('#556270', '#4ECDC4', '#C7F464')) >>> plt.show() """ import matplotlib.pyplot as plt - - n = len(data) - classes = set(data[class_column]) - class_col = data[class_column] + n = len(frame) + classes = frame[class_column].drop_duplicates() + class_col = frame[class_column] if cols is None: - df = data.drop(class_column, axis=1) + df = frame.drop(class_column, axis=1) else: - df = data[cols] + df = frame[cols] used_legends = set([]) @@ -638,19 +643,17 @@ def parallel_coordinates(data, class_column, cols=None, ax=None, colors=None, color_values = 
_get_standard_colors(num_colors=len(classes), colormap=colormap, color_type='random', - color=colors) + color=color) colors = dict(zip(classes, color_values)) for i in range(n): - row = df.irow(i).values - y = row - kls = class_col.iget_value(i) - if com.pprint_thing(kls) not in used_legends: - label = com.pprint_thing(kls) + y = df.iloc[i].values + kls = class_col.iat[i] + label = com.pprint_thing(kls) + if label not in used_legends: used_legends.add(label) - ax.plot(x, y, color=colors[kls], - label=label, **kwds) + ax.plot(x, y, color=colors[kls], label=label, **kwds) else: ax.plot(x, y, color=colors[kls], **kwds) From 1980c7a804ff122072ffc594f106037f39e507fc Mon Sep 17 00:00:00 2001 From: anomrake Date: Thu, 1 May 2014 14:58:07 -0400 Subject: [PATCH 2/2] TST: add tests for deprecation warnings from plotting functions parallel_coordinates/andrews_curves - added deprecate_kwarg decorator for using frame argument instead of data - added tests to check that FutureWarning is raised properly --- doc/source/release.rst | 8 ++++++++ doc/source/v0.14.0.txt | 8 ++++++++ pandas/tests/test_graphics.py | 9 +++++++++ pandas/tools/plotting.py | 4 ++-- 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 88833f48f1659..4e4d61c3e971c 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -233,6 +233,10 @@ Deprecations instead of ``colors``. A ``FutureWarning`` is raised to alert that the old ``colors`` argument will not be supported in a future release +- The :func:`parallel_coordinates` and :func:`andrews_curves` functions now take + positional argument ``frame`` instead of ``data``. A ``FutureWarning`` is + raised if the old ``data`` argument is used by name. 
+ Prior Version Deprecations/Changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -464,6 +468,10 @@ Bug Fixes - Bug in timeseries-with-frequency plot cursor display (:issue:`5453`) - Bug surfaced in groupby.plot when using a ``Float64Index`` (:issue:`7025`) - Stopped tests from failing if options data isn't able to be downloaded from Yahoo (:issue:`7034`) +- Bug in ``parallel_coordinates`` and ``radviz`` where reordering of class column + caused possible color/class mismatch +- Bug in ``radviz`` and ``andrews_curves`` where multiple values of 'color' + were being passed to plotting method pandas 0.13.1 ------------- diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index b5df39df3b617..f5e018b6141fe 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -382,6 +382,14 @@ Plotting Because of the default `align` value changes, coordinates of bar plots are now located on integer values (0.0, 1.0, 2.0 ...). This is intended to make bar plot be located on the same coodinates as line plot. However, bar plot may differs unexpectedly when you manually adjust the bar location or drawing area, such as using `set_xlim`, `set_ylim`, etc. In this cases, please modify your script to meet with new coordinates. +- The :func:`parallel_coordinates` function now takes argument ``color`` + instead of ``colors``. A ``FutureWarning`` is raised to alert that + the old ``colors`` argument will not be supported in a future release. + +- The :func:`parallel_coordinates` and :func:`andrews_curves` functions now take + positional argument ``frame`` instead of ``data``. A ``FutureWarning`` is + raised if the old ``data`` argument is used by name. + .. 
_whatsnew_0140.prior_deprecations: Prior Version Deprecations/Changes diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 629c011b4dbde..e3f49e14400d1 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -1240,6 +1240,9 @@ def test_andrews_curves(self): ax = andrews_curves(df, 'Name', color=colors) legend_colors = [l.get_color() for l in ax.legend().get_lines()] self.assertEqual(colors, legend_colors) + + with tm.assert_produces_warning(FutureWarning): + andrews_curves(data=df, class_column='Name') @slow def test_parallel_coordinates(self): @@ -1269,6 +1272,12 @@ def test_parallel_coordinates(self): ax = parallel_coordinates(df, 'Name', color=colors) legend_colors = [l.get_color() for l in ax.legend().get_lines()] self.assertEqual(colors, legend_colors) + + with tm.assert_produces_warning(FutureWarning): + parallel_coordinates(df, 'Name', colors=colors) + + with tm.assert_produces_warning(FutureWarning): + parallel_coordinates(data=df, class_column='Name') @slow def test_radviz(self): diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index a7628f759132f..b11d71f48baf2 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -354,7 +354,6 @@ def _get_marker_compat(marker): return 'o' return marker - def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): """RadViz - a multivariate data visualization algorithm @@ -439,7 +438,7 @@ def normalize(series): ax.axis('equal') return ax - +@deprecate_kwarg(old_arg_name='data', new_arg_name='frame') def andrews_curves(frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwds): """ @@ -571,6 +570,7 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): return fig @deprecate_kwarg(old_arg_name='colors', new_arg_name='color') +@deprecate_kwarg(old_arg_name='data', new_arg_name='frame') def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, use_columns=False, 
xticks=None, colormap=None, **kwds):