From 2871b9b12555531cb4850f8a3c1ef4b7405d3be1 Mon Sep 17 00:00:00 2001 From: Andrew Schonfeld Date: Fri, 8 Nov 2019 14:12:01 -0500 Subject: [PATCH] conditional usage of correlation computation based on presence of NaNs (1.3.6) --- CHANGES.md | 6 ++ docs/source/conf.py | 4 +- dtale/views.py | 28 +++++--- package.json | 2 +- setup.py | 2 +- static/__tests__/data/correlations.json | 3 +- static/__tests__/popups/Correlations-test.jsx | 64 ++++++++++++++++--- static/dtale/DataViewerMenu.jsx | 4 +- static/popups/Correlations.jsx | 19 +++++- .../popups/correlations/correlationsUtils.jsx | 20 ------ tests/dtale/test_views.py | 35 ++++++++-- 11 files changed, 134 insertions(+), 53 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 69131540..b0792f55 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -53,3 +53,9 @@ Changelog * Bug fixes for: * duplicate loading of histogram data * string serialization failing when mixing `future.str` & `str` in scatter function + +### 1.3.6 (2019-11-08) + + * Bug fixes for: + * choose between `pandas.corr` & `numpy.corrcoef` depending on presence of NaNs + * hide timeseries correlations when date columns only contain one day diff --git a/docs/source/conf.py b/docs/source/conf.py index bc705229..76e0fcd1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -63,9 +63,9 @@ # built documents. # # The short X.Y version. -version = u'1.3.5' +version = u'1.3.6' # The full version, including alpha/beta/rc tags. -release = u'1.3.5' +release = u'1.3.6' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/dtale/views.py b/dtale/views.py index f6f2784b..b7206582 100644 --- a/dtale/views.py +++ b/dtale/views.py @@ -16,8 +16,8 @@ from dtale import dtale from dtale.cli.clickutils import retrieve_meta_info_and_version -from dtale.utils import (build_shutdown_url, build_url, dict_merge, - filter_df_for_grid, find_dtype_formatter, +from dtale.utils import (build_shutdown_url, build_url, classify_type, + dict_merge, filter_df_for_grid, find_dtype_formatter, find_selected_column, get_dtypes, get_int_arg, get_str_arg, grid_columns, grid_formatter, json_date, json_float, json_int, json_timestamp, jsonify, @@ -469,17 +469,29 @@ def get_correlations(): data = DATA[port] data = data.query(query) if query is not None else data - # using pandas.corr proved to be quite slow on large datasets so I moved to numpy: - # https://stackoverflow.com/questions/48270953/pandas-corr-and-corrwith-very-slow - valid_corr_cols = [c['name'] for c in DTYPES[port] if any((c['dtype'].startswith(s) for s in ['int', 'float']))] - data = np.corrcoef(data[valid_corr_cols].values, rowvar=False) - data = pd.DataFrame(data, columns=valid_corr_cols, index=valid_corr_cols) + valid_corr_cols = [] + valid_date_cols = [] + for col_info in DTYPES[port]: + name, dtype = map(col_info.get, ['name', 'dtype']) + dtype = classify_type(dtype) + if dtype in ['I', 'F']: + valid_corr_cols.append(name) + elif dtype == 'D' and len(data[name].dropna().unique()) > 1: + valid_date_cols.append(name) + + if data[valid_corr_cols].isnull().values.any(): + data = data.corr(method='pearson') + else: + # using pandas.corr proved to be quite slow on large datasets so I moved to numpy: + # https://stackoverflow.com/questions/48270953/pandas-corr-and-corrwith-very-slow + data = np.corrcoef(data[valid_corr_cols].values, rowvar=False) + data = pd.DataFrame(data, columns=valid_corr_cols, index=valid_corr_cols) data.index.name = str('column') data = data.reset_index() col_types = grid_columns(data) f = grid_formatter(col_types, nan_display=None) - return jsonify(data=f.format_dicts(data.itertuples())) + return jsonify(data=f.format_dicts(data.itertuples()), dates=valid_date_cols) except BaseException as e: return jsonify(dict(error=str(e), traceback=str(traceback.format_exc()))) diff --git a/package.json b/package.json index f49df61c..8c7d9eda 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "dtale", - "version": "1.3.5", + "version": "1.3.6", "description": "All-purpose Data Viewer", "main": "main.js", "directories": { diff --git a/setup.py b/setup.py index e8f743a0..0243bc7f 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ def run_tests(self): setup( name="dtale", - version="1.3.5", + version="1.3.6", author="MAN Alpha Technology", author_email="ManAlphaTech@man.com", description="Web Client for Visualizing Pandas Objects", diff --git a/static/__tests__/data/correlations.json b/static/__tests__/data/correlations.json index cf9439e3..714566b2 100644 --- a/static/__tests__/data/correlations.json +++ b/static/__tests__/data/correlations.json @@ -28,5 +28,6 @@ "col4": 1.0, "column": "col4" } - ] + ], + "dates": ["col4", "col5"] } diff --git a/static/__tests__/popups/Correlations-test.jsx b/static/__tests__/popups/Correlations-test.jsx index cb4f7eed..349f1a54 100644 --- a/static/__tests__/popups/Correlations-test.jsx +++ b/static/__tests__/popups/Correlations-test.jsx @@ -5,6 +5,7 @@ import _ from "lodash"; import React from "react"; import mockPopsicle from "../MockPopsicle"; +import correlationsData from "../data/correlations"; import * as t from "../jest-assertions"; import { buildInnerHTML, withGlobalJquery } from "../test-utils"; @@ -13,14 +14,6 @@ const chartData = { type: "correlations", title: "Correlations Test", query: "col == 3", - columns: [ - { name: "dtale_index", dtype: "int64" }, - { name: "col1", dtype: "int64" }, - { name: "col2", dtype: "float64" }, - { name: "col3", dtype: "object" }, - { name: "col4", dtype: "datetime64[ns]" }, - { name: "col5", dtype: "datetime64[ns]" }, - ], }; const originalOffsetHeight = Object.getOwnPropertyDescriptor(HTMLElement.prototype, "offsetHeight"); @@ -35,9 +28,15 @@ describe("Correlations tests", () => { mockPopsicle.mock(url => { if (url.startsWith("/dtale/correlations")) { const query = qs.parse(url.split("?")[1]).query; - if (query == "null") { + if (url.startsWith("/dtale/correlations?") && query == "null") { return { error: "No data found." }; } + if (url.startsWith("/dtale/correlations?") && query == "one-date") { + return { data: correlationsData.data, dates: ["col4"] }; + } + if (url.startsWith("/dtale/correlations?") && query == "no-date") { + return { data: correlationsData.data, dates: [] }; + } } const { urlFetcher } = require("../redux-test-utils").default; return urlFetcher(url); @@ -100,6 +99,53 @@ describe("Correlations tests", () => { }, 200); }); + test("Correlations rendering data w/ one date column", done => { + const Correlations = require("../../popups/Correlations").ReactCorrelations; + const TimeseriesChartBody = require("../../popups/TimeseriesChartBody").TimeseriesChartBody; + buildInnerHTML(""); + const result = mount(, { + attachTo: document.getElementById("content"), + }); + result.update(); + setTimeout(() => { + result.update(); + const corrGrid = result.first().find("div.ReactVirtualized__Grid__innerScrollContainer"); + corrGrid + .find("div.cell") + .at(1) + .simulate("click"); + setTimeout(() => { + result.update(); + t.equal(result.find(TimeseriesChartBody).length, 1, "should show correlation timeseries"); + t.ok(result.find("select.custom-select").length == 0, "should not render date options for timeseries"); + t.ok((result.state().selectedDate = "col5"), "should change timeseries date"); + done(); + }, 200); + }, 200); + }); + + test("Correlations rendering data w/ no date columns", done => { + const Correlations = require("../../popups/Correlations").ReactCorrelations; + buildInnerHTML(""); + const result = mount(, { + attachTo: document.getElementById("content"), + }); + result.update(); + setTimeout(() => { + result.update(); + const corrGrid = result.first().find("div.ReactVirtualized__Grid__innerScrollContainer"); + corrGrid + .find("div.cell") + .at(1) + .simulate("click"); + setTimeout(() => { + result.update(); + t.equal(result.find("#rawScatterChart").length, 1, "should show scatter chart"); + done(); + }, 200); + }, 200); + }); + test("Correlations missing data", done => { const Correlations = require("../../popups/Correlations").ReactCorrelations; const result = mount(); diff --git a/static/dtale/DataViewerMenu.jsx b/static/dtale/DataViewerMenu.jsx index 993ce054..80850cc8 100644 --- a/static/dtale/DataViewerMenu.jsx +++ b/static/dtale/DataViewerMenu.jsx @@ -43,7 +43,7 @@ class ReactDataViewerMenu extends React.Component { const col = _.head(this.props.selectedCols); this.props.openChart(_.assignIn({ type: "histogram", col, title: col }, this.props)); }; - const openScatter = () => { + const openCorrelations = () => { this.props.openChart(_.assignIn({ type: "correlations", title: "Correlations" }, this.props)); }; const openCoverage = () => { @@ -177,7 +177,7 @@ class ReactDataViewerMenu extends React.Component {
  • - diff --git a/static/popups/Correlations.jsx b/static/popups/Correlations.jsx index ead71ca4..f0c1e647 100644 --- a/static/popups/Correlations.jsx +++ b/static/popups/Correlations.jsx @@ -24,10 +24,22 @@ const BASE_SCATTER_URL = "/dtale/scatter?"; const BASE_CORRELATIONS_URL = "/dtale/correlations?"; const BASE_CORRELATIONS_TS_URL = "/dtale/correlations-ts?"; +function buildState() { + return { + chart: null, + error: null, + scatterError: null, + correlations: null, + selectedCols: [], + tsUrl: null, + selectedDate: null, + }; +} + class ReactCorrelations extends React.Component { constructor(props) { super(props); - this.state = corrUtils.buildState(props); + this.state = buildState(); _.forEach( ["buildTs", "buildScatter", "viewScatter", "_cellRenderer", "viewScatterRow", "changeDate"], f => (this[f] = this[f].bind(this)) @@ -65,7 +77,8 @@ class ReactCorrelations extends React.Component { this.setState({ error: }); return; } - this.setState({ correlations: gridData.data }); + const { data, dates } = gridData; + this.setState({ correlations: data, dates, hasDate: _.size(dates) > 0, selectedDate: _.get(dates, 0, null) }); }); } @@ -151,7 +164,7 @@ class ReactCorrelations extends React.Component { ); } - const { correlations, selectedCols, tsUrl, hasDate, selectedDate, dates } = this.state; + const { correlations, selectedCols, tsUrl, selectedDate, hasDate, dates } = this.state; return (
    diff --git a/static/popups/correlations/correlationsUtils.jsx b/static/popups/correlations/correlationsUtils.jsx index d5e86e2c..1a6fd348 100644 --- a/static/popups/correlations/correlationsUtils.jsx +++ b/static/popups/correlations/correlationsUtils.jsx @@ -10,7 +10,6 @@ function toggleBouncer() { $("#rawScatterChart").toggle(); } -const findDateCols = columns => _.map(_.filter(columns, ({ dtype }) => dtype.startsWith("datetime")), "name"); const pointFormatter = (xProp, yProp) => point => ({ x: point[xProp], y: point[yProp], index: point.index }); const colorScale = chroma.scale(["red", "yellow", "green"]).domain([-1, 0, 1]); const percent = num => (num === "N/A" ? num : `${_.round(num * 100, 2)}%`); @@ -63,29 +62,10 @@ function createScatter(ctx, chartData, xProp, yProp, label, onClick) { return chart; } -function buildState(props) { - const { columns } = props.chartData; - const dates = findDateCols(columns); - const hasDate = _.size(dates) > 0; - const selectedDate = hasDate ? _.head(dates) : null; - return { - chart: null, - error: null, - scatterError: null, - correlations: null, - selectedCols: [], - tsUrl: null, - dates, - hasDate, - selectedDate, - }; -} - export default { toggleBouncer, colorScale, createScatter, - buildState, percent, pointFormatter, }; diff --git a/tests/dtale/test_views.py b/tests/dtale/test_views.py index d80968f6..13d445cd 100644 --- a/tests/dtale/test_views.py +++ b/tests/dtale/test_views.py @@ -2,6 +2,7 @@ from builtins import str import mock +import numpy as np import pandas as pd import pandas.util.testing as pdt import pytest @@ -359,16 +360,18 @@ def test_get_correlations(unittest, test_data): stack.enter_context(mock.patch('dtale.views.DTYPES', {c.port: views.build_dtypes_state(test_data)})) response = c.get('/dtale/correlations') response_data = json.loads(response.data) - expected = dict(data=[ - dict(column='security_id', security_id=1.0, foo=None, bar=None), - dict(column='foo', security_id=None, foo=None, bar=None), - dict(column='bar', security_id=None, foo=None, bar=None) - ]) + expected = dict( + data=[ + dict(column='security_id', security_id=1.0, foo=None, bar=None), + dict(column='foo', security_id=None, foo=None, bar=None), + dict(column='bar', security_id=None, foo=None, bar=None) + ], + dates=[] + ) unittest.assertEqual(response_data, expected, 'should return correlations') with app.test_client() as c: with ExitStack() as stack: - test_data, _ = views.format_data(test_data) stack.enter_context(mock.patch('dtale.views.DATA', {c.port: test_data})) stack.enter_context(mock.patch('dtale.views.DTYPES', {c.port: views.build_dtypes_state(test_data)})) response = c.get('/dtale/correlations', query_string=dict(query="missing_col == 'blah'")) @@ -377,6 +380,26 @@ def test_get_correlations(unittest, test_data): response_data['error'], "name 'missing_col' is not defined", 'should handle correlations exception' ) + with app.test_client() as c: + with ExitStack() as stack: + test_data.loc[test_data.security_id == 1, 'bar'] = np.nan + test_data2 = test_data.copy() + test_data2.loc[:, 'date'] = pd.Timestamp('20000102') + test_data = pd.concat([test_data, test_data2], ignore_index=True) + stack.enter_context(mock.patch('dtale.views.DATA', {c.port: test_data})) + stack.enter_context(mock.patch('dtale.views.DTYPES', {c.port: views.build_dtypes_state(test_data)})) + response = c.get('/dtale/correlations') + response_data = json.loads(response.data) + expected = expected = dict( + data=[ + dict(column='security_id', security_id=1.0, foo=None, bar=None), + dict(column='foo', security_id=None, foo=None, bar=None), + dict(column='bar', security_id=None, foo=None, bar=None) + ], + dates=['date'] + ) + unittest.assertEqual(response_data, expected, 'should return correlations') + def build_ts_data(size=5, days=5): start = pd.Timestamp('20000101')