From 1edd227eb4acb2337bc586489132ec1da68acc70 Mon Sep 17 00:00:00 2001 From: Gaurav Gupta <47334368+gaugup@users.noreply.github.com> Date: Mon, 6 Jun 2022 16:10:52 -0700 Subject: [PATCH] Add more utilities into raiutils (#1295) * Add more utilities into raiutils Signed-off-by: Gaurav Gupta * Fix faulty file name Signed-off-by: Gaurav Gupta * fix heatmap bug (#1297) * Make "save and switch" work from cohort settings (#1276) * make save and switch work * fix naming * lintfix * adjustment according to Ilya's comment * lintfix * add retry logic to codecov step and only upload results for one python version (#1298) * add github action to release raiutils to pypi (#1294) * Add highchart for Dataset Explorer (#1286) * test * style * click * fix test * fix test * test * test * test * test * Update requirements-linting.txt to add flake8-pytest-style (#1296) * Fix sort abs (#1299) * Rename "base cohort" to "global cohort" (#1278) * change base cohort to global cohort * fix spelling * lintfix * fix codecov comment not appearing on PRs (#1302) * take absolute value of error calculation for regression scenario (#1301) * Limit individual feature importance selection to up to 5 (#1305) * update feature importance string * limit selection to up to 5 * add group count * remove message bar, show info icon instead * update e2e locator * fix E2E failure on feature importance * add ariaLabel for expand collapse button * add renderOnNewLayer props * Add error message for counterfactual panel (#1310) * add error message for counterfactual * update error message in camel case to fix build error * Add to_json() and from_json() methods to Cohort class (#1300) * Add to_json() and from_json() methods to Cohort class Signed-off-by: Gaurav Gupta * Address code review comments Signed-off-by: Gaurav Gupta * Fix linting Signed-off-by: Gaurav Gupta * Add a highchart heatmap helper class (#1307) * add highchart heatmap helper class * add erroneously deleted line back * Fix cohort setting string 
(#1304) * Fix string * remove none * name * test * Fix all data style (#1303) * Add a feature flag for the new model overview experience (#1306) * add feature flag for new model overview experience and turn it off by default * remove useless constructor * Clean up charts code (#1313) * clean up chart code * remove arg * Expand the counterfactual flyout to cover the full page (#1315) Signed-off-by: Gaurav Gupta * Bump minimist from 1.2.5 to 1.2.6 (#1292) * Bump minimist from 1.2.5 to 1.2.6 Bumps [minimist](https://github.com/substack/minimist) from 1.2.5 to 1.2.6. - [Release notes](https://github.com/substack/minimist/releases) - [Commits](https://github.com/substack/minimist/compare/1.2.5...1.2.6) --- updated-dependencies: - dependency-name: minimist dependency-type: indirect ... Signed-off-by: dependabot[bot] * minimist ^1.2.6 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: xuke444 <40614413+xuke444@users.noreply.github.com> Co-authored-by: Roman Lutz * fix random node download failures by upgrading to latest github action with retry logic (#1317) * Add dataset cohort table to new ModelOverview experience (#1314) * add only dataset cohort table, update wrapping code * lintfix * lintfix * build error fix * Add installation instructions for raiwidgets to README (#1320) * refactor RAIInsights into RAIInsightsBase class for basic functionality (#1284) * Fix what if counterfactual header and description text misaligned (#1316) * align * e2e * add clear temporary cohort button to error analysis (#1322) * Raise UserConfigValidationException in case no model but valid model serializer (#1325) Signed-off-by: Gaurav Gupta * Add test case for handling different types in causal (#1321) Signed-off-by: Gaurav Gupta * show shift to an empty cohort in tree view as an empty node (#1318) * Bug fixing (#1326) * Move chart description up and remove scroll bar * Change string * Add box outlier for dataset explorer (#1323) * add 
outlier for dataset explorer * name * update string when no datapoint selected (#1331) * Fix Big empty space for featureImportance chart (#1328) * legend * removed invalid test case * constant * Disable save as new cohort button if nothing is selected in error tree (#1327) * Add disaggregated analysis table/heatmap (#1332) * disaggregated analysis changes only * lintfix * Change warning message to user exception for model type and task type mismatch (#1330) * Change warning message to user exception for model type and task type mismatch Signed-off-by: Gaurav Gupta * Fix flake8 errors Signed-off-by: Gaurav Gupta * Change the counterfactual text color from black to grey (#1337) Signed-off-by: Gaurav Gupta * Limit each component description width up to 750px for readability (#1336) * limit description width up to 750px * export maxWidth from a common place * block empty cohort creation in RAI Dashboard (#1335) * Add warning message in cohort editor for invalid input value; Update 'Shift cohort' to 'Switch cohort' (#1339) * add error message for invalid value * update shift cohort to switch cohort * Rename counterfactual style files to confirm with *.styles.ts (#1338) Signed-off-by: Gaurav Gupta * Add disaggregated analysis table to Model Overview (#1341) * pull in changes for disaggregated analysis * add styles file * add textured NaN cells * module import for textured cells and grid y axis * lintfix * use combobox for dropdown rather than dropdown * lintfix * Rename causal style files to confirm with *.styles.ts (#1342) Signed-off-by: Gaurav Gupta * update responsibleai to interpret-community 0.25.0 (#1343) * All component title and descriptions should be aligned (#1346) * update Signed-off-by: vinutha karanth * lintfix Signed-off-by: vinutha karanth * Remove 5K limit blurb from local explanations tab (#1347) Signed-off-by: Gaurav Gupta * Sort features by default in counterfactual flyout (#1312) * Sort features by default in counterfactual flyout Signed-off-by: 
Gaurav Gupta * Fix failing tests Signed-off-by: Gaurav Gupta * attempt to fix test Signed-off-by: Gaurav Gupta * Remove check Signed-off-by: Gaurav Gupta * Bump moment from 2.28.0 to 2.29.2 (#1333) Bumps [moment](https://github.com/moment/moment) from 2.28.0 to 2.29.2. - [Release notes](https://github.com/moment/moment/releases) - [Changelog](https://github.com/moment/moment/blob/develop/CHANGELOG.md) - [Commits](https://github.com/moment/moment/compare/2.28.0...2.29.2) --- updated-dependencies: - dependency-name: moment dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Roman Lutz * Counterfactual flyout top section need to be moved to left & Error analysis move side content to align with description text (#1350) * update Signed-off-by: vinutha karanth * lintfix Signed-off-by: vinutha karanth * move the root all data statistics to ErrorReport and ErrorAnalysisData (#1344) * update error analysis documentation to clarify the error tree splits on errors even when other metrics are selected (#1349) Co-authored-by: Vinutha Karanth * update erroranalysis to 0.2.1 and remove some duplicate dependencies (#1334) * disable turbo checking for large amount of data (#1351) Signed-off-by: Ke Xu * force re-render when chart type changes (#1354) Signed-off-by: Ke Xu * move the root all data statistics to ErrorReport and ErrorAnalysisData (#1352) * Rename output column name in the counterfactual flyout (#1353) Signed-off-by: Gaurav Gupta * Show column chart for categorical feature in data explorer (#1355) * Show column chart for categorical feature in data explorer * address comments * update fluentui (#1356) Signed-off-by: Ke Xu * update code owner (#1308) * update code owner Signed-off-by: Ke Xu * remove dup Signed-off-by: Ke Xu Co-authored-by: Roman Lutz * update version to match studio (#1357) Signed-off-by: Ke Xu * alignment fixes (#1359) * Add charts for 
new model overview experience (#1348) * rename OverallTable to DisaggregatedMetricsTable and move to core-ui * Copy the ModelPerformanceTab into model-assessment and rename to ModelOverview * reference OverallTable again in fairness widget * refactor core chart component out into core-ui * refactor out core chart component into core-ui * lintfix * undo tsconfig.lib.json change * fix chartAndConfigsId in tests * lintfix * add table for cohort metrics and add dropdown metric selector, add new metrics * add new metrics * undo unwanted changes * fix casing * add superscript 2 for r-squared * update tests to reflect new metrics * lintfix * add feature flag * fix mae * fix mae calculation * first version of new model overview table * get probability distribution box plot to work * add feature flag for new model overview experience and turn it off by default * add highchart heatmap helper class * remove custom styling * add erroneously deleted line back * remove useless constructor * modularize model overview * show outliers in box plot, fix positioning * remove showmetricsummary * refactor heatmap code into a common class * add featureDropdownRef to allow focusing * add only dataset cohort table, update wrapping code * lintfix * lintfix * build error fix * add chart config flyout (in progress) * add chart config flyout (in progress) * address feedback, use finalized color * adjust feature selection to disable options if limit is reached, add axis config buttons * select all via dropdown * lintfix * refactor box plot calculations and rendering * add style file * textured NaN cells, grid labels on y axis * standardize box plots to use fences * fix merge issues * unify box plot tooltip formatting code, fix bar chart sizing issue * small fixes * rearrange feature dropdown * lintfix * remove commented out code * remove box plot tooltip customization * lintfix * add a few unit tests * unit tests for smaller utilities, localization fixes, consistent flyout flow with 
confirm/cancel buttons * lintfix * fix chart config flyout update * fix test case * rename files for lint * file rename for lint * release rai-core-flask 0.3.0 (#1361) * upgrade python version used with flask CI to fix segfault error (#1363) * release raiwidgets and responsibleai v0.18.0 (#1360) * fix two bugs (#1364) * Add pre-built cohort into adult census notebook (#1243) * [WIP] Add pre-built cohort into adult census notebook Signed-off-by: Gaurav Gupta * erroranalysis version bump in raiwidgets to 0.1.31 (#1245) * Make cohortData empty list in case no pre-defined cohorts are injected (#1247) Signed-off-by: Gaurav Gupta * Simplify the train pipeline responsibleaidashboard-census-classification-model-debugging.ipynb (#1195) * Simplify the train pipeline responsibleaidashboard-census-classification-model-debugging.ipynb Signed-off-by: Gaurav Gupta * Address code review comments * Update notebooks/responsibleaidashboard/responsibleaidashboard-census-classification-model-debugging.ipynb Co-authored-by: Roman Lutz Co-authored-by: Roman Lutz Signed-off-by: Gaurav Gupta * Add regression test for pre-defined cohorts in raiwidgets (#1249) Signed-off-by: Gaurav Gupta * color (#1248) * Add feature importance box & bar chart (#1241) * refactor * build * build * temp * temp * temp * temp * box * cache * e2e * e2e * fix * e2e fix * e2e * fix e2e * widget * widget * fix * widget * e2e * e2e * e2e * test * test * PreBuilt cohorts UX changes (#1242) * Initial SDK implementation cohorts Signed-off-by: Gaurav Gupta * Add basic validation for cohorts Signed-off-by: Gaurav Gupta * Add serialized version of cohort config to ResponsibleAiDashboard Signed-off-by: Gaurav Gupta * Add more tests cohorts Signed-off-by: Gaurav Gupta * fix broken builds due to pip upgrade which broke pip-tools (#1185) * refactor matrix filter and area state to be private static (#1179) * Change variable name Signed-off-by: Gaurav Gupta * Add more cohort filters Signed-off-by: Gaurav Gupta * Add cohort data 
to dashboard e2e Signed-off-by: Gaurav Gupta * Add more cohorts filters Signed-off-by: Gaurav Gupta * Document various data validation for cohorts Signed-off-by: Gaurav Gupta * Add new interfaces for pre-built cohort Signed-off-by: Gaurav Gupta * Add more cohort filters Signed-off-by: Gaurav Gupta * Add prebuilt cohort walking logic in UI and add more data validation scenarios Signed-off-by: Gaurav Gupta * Add basic data validation checks Signed-off-by: Gaurav Gupta * Add logic to translate the Index cohort filter Signed-off-by: Gaurav Gupta * Remove commented out code Signed-off-by: Gaurav Gupta * Add SDK validations for Index based cohort filter Signed-off-by: Gaurav Gupta * Add code for validating classification outcome Signed-off-by: Gaurav Gupta * Add error filter validations and add tests Signed-off-by: Gaurav Gupta * Add fake cohorts for regression dataset Signed-off-by: Gaurav Gupta * Add fake cohorts for multi-class classification dataset Signed-off-by: Gaurav Gupta * Add handling of regression filter Signed-off-by: Gaurav Gupta * Add support for classification outcome in UI Signed-off-by: Gaurav Gupta * Add validations for Predicted Y and True Y cohort filters Signed-off-by: Gaurav Gupta * Add UI code to handle predicted Y and true Y for pre-built cohort filters Signed-off-by: Gaurav Gupta * Add cohort validation with test data to raiwidgets Signed-off-by: Gaurav Gupta * Add tests for validating Predicted/True Y cohorts Signed-off-by: Gaurav Gupta * Add UI support for TrueY/PredictedY for classification Signed-off-by: Gaurav Gupta * Rename cohort_filter_list to cohort_list Signed-off-by: Gaurav Gupta * Rename UI variables to match SDK Signed-off-by: Gaurav Gupta * Fix duplicate cohort name Signed-off-by: Gaurav Gupta * Add SDK cohorts to notebook Signed-off-by: Gaurav Gupta * Add dataset validations and add categorical features Signed-off-by: Gaurav Gupta * Add validations for categorical_features Signed-off-by: Gaurav Gupta * Fix sorted imports 
Signed-off-by: Gaurav Gupta * Add code for translating categorical values Signed-off-by: Gaurav Gupta * Move cohort processing to a separate file Signed-off-by: Gaurav Gupta * Fix code review comments Signed-off-by: Gaurav Gupta * Refactor cohort translated function into different small functions Signed-off-by: Gaurav Gupta * Change to lowercase for outcome Signed-off-by: Gaurav Gupta * Fix code review comments Signed-off-by: Gaurav Gupta * Refactor cohort_list validations and converge pytest common functions into fixtures Signed-off-by: Gaurav Gupta * Add conftest into raiwidgets tests Signed-off-by: Gaurav Gupta * Add validations for cohort list Signed-off-by: Gaurav Gupta * Add cohortData test Signed-off-by: Gaurav Gupta * Fix sorted imports Signed-off-by: Gaurav Gupta * isort fix Signed-off-by: Gaurav Gupta * Add UI unit tests for cohort translation Signed-off-by: Gaurav Gupta * Add more checks in UI unit test Signed-off-by: Gaurav Gupta * Add UI tests for regression cohorts Signed-off-by: Gaurav Gupta * Remove notebook change Signed-off-by: Gaurav Gupta * Fix typescript build Signed-off-by: Gaurav Gupta * Change cohort filter values so that cohort filters non-zero points Signed-off-by: Gaurav Gupta * Fix for empty cohort list Signed-off-by: Gaurav Gupta * Simplify the train pipeline responsibleaidashboard-census-classification-model-debugging.ipynb (#1195) * Simplify the train pipeline responsibleaidashboard-census-classification-model-debugging.ipynb Signed-off-by: Gaurav Gupta * Address code review comments * Update notebooks/responsibleaidashboard/responsibleaidashboard-census-classification-model-debugging.ipynb Co-authored-by: Roman Lutz Co-authored-by: Roman Lutz * Propagate error strings instead of raising exceptions Signed-off-by: Gaurav Gupta * Fix code issues Signed-off-by: Gaurav Gupta * Fix code review comments Signed-off-by: Gaurav Gupta * Fix code review comments Signed-off-by: Gaurav Gupta Co-authored-by: Ilya Matiach Co-authored-by: Roman Lutz * 
Make _cohort.py module a public module (#1253) * Make _cohort.py a public module Signed-off-by: Gaurav Gupta * Add missing file Signed-off-by: Gaurav Gupta * fix notebook build failures due to pywinpty dependency release failing in python 3.6 (#1257) * fix notebook build failures due to pywinpty dependency release failing in python 3.6 * build pywinpty from conda instead * add lowerbound * fixup * fixup * Add supported models and data types to README.md responsibleai (#1259) Signed-off-by: Gaurav Gupta * make getting-started notebook a markdown file showing APIs (#1223) * refactor tabs out of RAI dashboard into a separate component (#1256) * Add individual causal scatter chart (#1258) * temp * refactor * test * style fix * comment * minor fix to url for responsibleai package in setup.py (#1260) * Fix UX e2e tests and address code review comments Signed-off-by: Gaurav Gupta * Fix eslint Signed-off-by: Gaurav Gupta * Address review comments Signed-off-by: Gaurav Gupta * Reset the number of samples in test dataset Signed-off-by: Gaurav Gupta Co-authored-by: Ilya Matiach Co-authored-by: Roman Lutz Co-authored-by: Bo Zhang <71688188+zhb000@users.noreply.github.com> * Add pre-defined cohorts in responsibleaidashboard-diabetes-decision-making.ipynb (#1252) * [WIP] Add pre-defined cohorts in responsibleaidashboard-diabetes-decision-making.ipynb Signed-off-by: Gaurav Gupta * Fix the e2e test for notebook Signed-off-by: Gaurav Gupta * Make _cohort.py module a public module (#1253) * Make _cohort.py a public module Signed-off-by: Gaurav Gupta * Add missing file Signed-off-by: Gaurav Gupta * Fix cohort namespace Signed-off-by: Gaurav Gupta * minor fix to url for responsibleai package in setup.py (#1260) * Counterfactual Chart: Correct target description according to task_type (#1261) * Counterfactual Chart: Correct target description according to task_type Signed-off-by: Gaurav Gupta * Change function name Signed-off-by: Gaurav Gupta * Make Range lowercase Signed-off-by: 
Gaurav Gupta * fix whitespace in values of adult census income dataset (#1263) * Add what-If scatter chart from highchart lib (#1262) * add whatIf scatter chart * widget test * what if local importance bar chart * fix * widget * fix tooltip * refactor * test * test * add a builddebug yarn command to build UX locally which can be debugged in browser e2e (#1265) * allow rai text insights to work with RAI dashboard (#1269) * remove duplicate code in explanation dashboard (#1266) * Individual causal style responsive (#1268) * add whatIf scatter chart * widget test * what if local importance bar chart * fix * widget * fix tooltip * refactor * test * test * Causal Style * Allow duplicating cohorts multiple times (#1274) * allow duplicating a cohort more than once * lintfix * Disable column header highlighting on hover in IndividualFeatureImportanceView (#1272) * disable column highlight on hover * lintfix * Rename new cohorts from "Unsaved" to "Temporary cohort" (#1273) * rename Unsaved to Temporary cohort * localize temp cohort * Counterfactual style refactor (#1275) * style refactor * test * test * test * fix * Don't change cursor on hover over cohort name * Fix (#1281) * fix cohort info styling (#1277) * fix readme link to fairness and interpretability example notebook (#1282) * add new RAI Utils package for common utilities shared across RAI packages (#1280) * Add ICE chart (#1283) * Fix * ice chart * ic * test * test * update docstring for explanation dashboard in regards to min number of rows (#1271) * make builds more reliable by adding retry logic to urlretrieve calls in notebooks (#1218) * upgrade pytest to 7.0.1, remove mock and upgrade pytest-mock to 3.6.1 (#1287) * remove deprecated codecov parameter (#1293) * Fix min/max special case in cohort filter creation with "in the range of" (#1279) * fix logic in the case that min or max are zero * lintfix * Rename 'Dashboard navigation' to 'Dashboard configuration' (#1291) * Rename 'Dashboard navigation' to 
'Dashboard configuration' Signed-off-by: Gaurav Gupta * Notebook change Signed-off-by: Gaurav Gupta * Add raiutils to PR template (#1290) * fix heatmap bug (#1297) * Make "save and switch" work from cohort settings (#1276) * make save and switch work * fix naming * lintfix * adjustment according to Ilya's comment * lintfix * add retry logic to codecov step and only upload results for one python version (#1298) * add github action to release raiutils to pypi (#1294) * Add highchart for Dataset Explorer (#1286) * test * style * click * fix test * fix test * test * test * test * test * Update requirements-linting.txt to add flake8-pytest-style (#1296) * Fix sort abs (#1299) * Rename "base cohort" to "global cohort" (#1278) * change base cohort to global cohort * fix spelling * lintfix * fix codecov comment not appearing on PRs (#1302) * take absolute value of error calculation for regression scenario (#1301) * Limit individual feature importance selection to up to 5 (#1305) * update feature importance string * limit selection to up to 5 * add group count * remove message bar, show info icon instead * update e2e locator * fix E2E failure on feature importance * add ariaLabel for expand collapse button * add renderOnNewLayer props * Add error message for counterfactual panel (#1310) * add error message for counterfactual * update error message in camel case to fix build error * Add to_json() and from_json() methods to Cohort class (#1300) * Add to_json() and from_json() methods to Cohort class Signed-off-by: Gaurav Gupta * Address code review comments Signed-off-by: Gaurav Gupta * Fix linting Signed-off-by: Gaurav Gupta * Add a highchart heatmap helper class (#1307) * add highchart heatmap helper class * add erroneously deleted line back * Fix cohort setting string (#1304) * Fix string * remove none * name * test * Fix all data style (#1303) * Add a feature flag for the new model overview experience (#1306) * add feature flag for new model overview experience and turn 
it off by default * remove useless constructor * Clean up charts code (#1313) * clean up chart code * remove arg * Expand the counterfactual flyout to cover the full page (#1315) Signed-off-by: Gaurav Gupta * Bump minimist from 1.2.5 to 1.2.6 (#1292) * Bump minimist from 1.2.5 to 1.2.6 Bumps [minimist](https://github.com/substack/minimist) from 1.2.5 to 1.2.6. - [Release notes](https://github.com/substack/minimist/releases) - [Commits](https://github.com/substack/minimist/compare/1.2.5...1.2.6) --- updated-dependencies: - dependency-name: minimist dependency-type: indirect ... Signed-off-by: dependabot[bot] * minimist ^1.2.6 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: xuke444 <40614413+xuke444@users.noreply.github.com> Co-authored-by: Roman Lutz * fix random node download failures by upgrading to latest github action with retry logic (#1317) * Add dataset cohort table to new ModelOverview experience (#1314) * add only dataset cohort table, update wrapping code * lintfix * lintfix * build error fix * Add installation instructions for raiwidgets to README (#1320) * refactor RAIInsights into RAIInsightsBase class for basic functionality (#1284) * Fix what if counterfactual header and description text misaligned (#1316) * align * e2e * add clear temporary cohort button to error analysis (#1322) * Raise UserConfigValidationException in case no model but valid model serializer (#1325) Signed-off-by: Gaurav Gupta * Add test case for handling different types in causal (#1321) Signed-off-by: Gaurav Gupta * show shift to an empty cohort in tree view as an empty node (#1318) * Bug fixing (#1326) * Move chart description up and remove scroll bar * Change string * Add box outlier for dataset explorer (#1323) * add outlier for dataset explorer * name * update string when no datapoint selected (#1331) * Fix Big empty space for featureImportance chart (#1328) * legend * removed invalid test case * constant * Disable save as 
new cohort button if nothing is selected in error tree (#1327) * Add disaggregated analysis table/heatmap (#1332) * disaggregated analysis changes only * lintfix * Change warning message to user exception for model type and task type mismatch (#1330) * Change warning message to user exception for model type and task type mismatch Signed-off-by: Gaurav Gupta * Fix flake8 errors Signed-off-by: Gaurav Gupta * Change the counterfactual text color from black to grey (#1337) Signed-off-by: Gaurav Gupta * Limit each component description width up to 750px for readability (#1336) * limit description width up to 750px * export maxWidth from a common place * block empty cohort creation in RAI Dashboard (#1335) * Add warning message in cohort editor for invalid input value; Update 'Shift cohort' to 'Switch cohort' (#1339) * add error message for invalid value * update shift cohort to switch cohort * Rename counterfactual style files to confirm with *.styles.ts (#1338) Signed-off-by: Gaurav Gupta * Add disaggregated analysis table to Model Overview (#1341) * pull in changes for disaggregated analysis * add styles file * add textured NaN cells * module import for textured cells and grid y axis * lintfix * use combobox for dropdown rather than dropdown * lintfix * Rename causal style files to confirm with *.styles.ts (#1342) Signed-off-by: Gaurav Gupta * update responsibleai to interpret-community 0.25.0 (#1343) * All component title and descriptions should be aligned (#1346) * update Signed-off-by: vinutha karanth * lintfix Signed-off-by: vinutha karanth * Remove 5K limit blurb from local explanations tab (#1347) Signed-off-by: Gaurav Gupta * Sort features by default in counterfactual flyout (#1312) * Sort features by default in counterfactual flyout Signed-off-by: Gaurav Gupta * Fix failing tests Signed-off-by: Gaurav Gupta * attempt to fix test Signed-off-by: Gaurav Gupta * Remove check Signed-off-by: Gaurav Gupta * Bump moment from 2.28.0 to 2.29.2 (#1333) Bumps 
[moment](https://github.com/moment/moment) from 2.28.0 to 2.29.2. - [Release notes](https://github.com/moment/moment/releases) - [Changelog](https://github.com/moment/moment/blob/develop/CHANGELOG.md) - [Commits](https://github.com/moment/moment/compare/2.28.0...2.29.2) --- updated-dependencies: - dependency-name: moment dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Roman Lutz * Counterfactual flyout top section need to be moved to left & Error analysis move side content to align with description text (#1350) * update Signed-off-by: vinutha karanth * lintfix Signed-off-by: vinutha karanth * move the root all data statistics to ErrorReport and ErrorAnalysisData (#1344) * update error analysis documentation to clarify the error tree splits on errors even when other metrics are selected (#1349) Co-authored-by: Vinutha Karanth * update erroranalysis to 0.2.1 and remove some duplicate dependencies (#1334) * disable turbo checking for large amount of data (#1351) Signed-off-by: Ke Xu * force re-render when chart type changes (#1354) Signed-off-by: Ke Xu * move the root all data statistics to ErrorReport and ErrorAnalysisData (#1352) * Rename output column name in the counterfactual flyout (#1353) Signed-off-by: Gaurav Gupta * Show column chart for categorical feature in data explorer (#1355) * Show column chart for categorical feature in data explorer * address comments * update fluentui (#1356) Signed-off-by: Ke Xu * update code owner (#1308) * update code owner Signed-off-by: Ke Xu * remove dup Signed-off-by: Ke Xu Co-authored-by: Roman Lutz * update version to match studio (#1357) Signed-off-by: Ke Xu * alignment fixes (#1359) * Add charts for new model overview experience (#1348) * rename OverallTable to DisaggregatedMetricsTable and move to core-ui * Copy the ModelPerformanceTab into model-assessment and rename to ModelOverview * reference 
OverallTable again in fairness widget * refactor core chart component out into core-ui * refactor out core chart component into core-ui * lintfix * undo tsconfig.lib.json change * fix chartAndConfigsId in tests * lintfix * add table for cohort metrics and add dropdown metric selector, add new metrics * add new metrics * undo unwanted changes * fix casing * add superscript 2 for r-squared * update tests to reflect new metrics * lintfix * add feature flag * fix mae * fix mae calculation * first version of new model overview table * get probability distribution box plot to work * add feature flag for new model overview experience and turn it off by default * add highchart heatmap helper class * remove custom styling * add erroneously deleted line back * remove useless constructor * modularize model overview * show outliers in box plot, fix positioning * remove showmetricsummary * refactor heatmap code into a common class * add featureDropdownRef to allow focusing * add only dataset cohort table, update wrapping code * lintfix * lintfix * build error fix * add chart config flyout (in progress) * add chart config flyout (in progress) * address feedback, use finalized color * adjust feature selection to disable options if limit is reached, add axis config buttons * select all via dropdown * lintfix * refactor box plot calculations and rendering * add style file * textured NaN cells, grid labels on y axis * standardize box plots to use fences * fix merge issues * unify box plot tooltip formatting code, fix bar chart sizing issue * small fixes * rearrange feature dropdown * lintfix * remove commented out code * remove box plot tooltip customization * lintfix * add a few unit tests * unit tests for smaller utilities, localization fixes, consistent flyout flow with confirm/cancel buttons * lintfix * fix chart config flyout update * fix test case * rename files for lint * file rename for lint * release rai-core-flask 0.3.0 (#1361) * upgrade python version used with flask CI 
to fix segfault error (#1363) * release raiwidgets and responsibleai v0.18.0 (#1360) * fix two bugs (#1364) * Add pre-built cohort into adult census notebook (#1243) * [WIP] Add pre-built cohort into adult census notebook Signed-off-by: Gaurav Gupta * erroranalysis version bump in raiwidgets to 0.1.31 (#1245) * Make cohortData empty list in case no pre-defined cohorts are injected (#1247) Signed-off-by: Gaurav Gupta * Simplify the train pipeline responsibleaidashboard-census-classification-model-debugging.ipynb (#1195) * Simplify the train pipeline responsibleaidashboard-census-classification-model-debugging.ipynb Signed-off-by: Gaurav Gupta * Address code review comments * Update notebooks/responsibleaidashboard/responsibleaidashboard-census-classification-model-debugging.ipynb Co-authored-by: Roman Lutz Co-authored-by: Roman Lutz Signed-off-by: Gaurav Gupta * Add regression test for pre-defined cohorts in raiwidgets (#1249) Signed-off-by: Gaurav Gupta * color (#1248) * Add feature importance box & bar chart (#1241) * refactor * build * build * temp * temp * temp * temp * box * cache * e2e * e2e * fix * e2e fix * e2e * fix e2e * widget * widget * fix * widget * e2e * e2e * e2e * test * test * PreBuilt cohorts UX changes (#1242) * Initial SDK implementation cohorts Signed-off-by: Gaurav Gupta * Add basic validation for cohorts Signed-off-by: Gaurav Gupta * Add serialized version of cohort config to ResponsibleAiDashboard Signed-off-by: Gaurav Gupta * Add more tests cohorts Signed-off-by: Gaurav Gupta * fix broken builds due to pip upgrade which broke pip-tools (#1185) * refactor matrix filter and area state to be private static (#1179) * Change variable name Signed-off-by: Gaurav Gupta * Add more cohort filters Signed-off-by: Gaurav Gupta * Add cohort data to dashboard e2e Signed-off-by: Gaurav Gupta * Add more cohorts filters Signed-off-by: Gaurav Gupta * Document various data validation for cohorts Signed-off-by: Gaurav Gupta * Add new interfaces for pre-built 
cohort Signed-off-by: Gaurav Gupta * Add more cohort filters Signed-off-by: Gaurav Gupta * Add prebuilt cohort walking logic in UI and add more data validation scenarios Signed-off-by: Gaurav Gupta * Add basic data validation checks Signed-off-by: Gaurav Gupta * Add logic to translate the Index cohort filter Signed-off-by: Gaurav Gupta * Remove commented out code Signed-off-by: Gaurav Gupta * Add SDK validations for Index based cohort filter Signed-off-by: Gaurav Gupta * Add code for validating classification outcome Signed-off-by: Gaurav Gupta * Add error filter validations and add tests Signed-off-by: Gaurav Gupta * Add fake cohorts for regression dataset Signed-off-by: Gaurav Gupta * Add fake cohorts for multi-class classification dataset Signed-off-by: Gaurav Gupta * Add handling of regression filter Signed-off-by: Gaurav Gupta * Add support for classification outcome in UI Signed-off-by: Gaurav Gupta * Add validations for Predicted Y and True Y cohort filters Signed-off-by: Gaurav Gupta * Add UI code to handle predicted Y and true Y for pre-built cohort filters Signed-off-by: Gaurav Gupta * Add cohort validation with test data to raiwidgets Signed-off-by: Gaurav Gupta * Add tests for validating Predicted/True Y cohorts Signed-off-by: Gaurav Gupta * Add UI support for TrueY/PredictedY for classification Signed-off-by: Gaurav Gupta * Rename cohort_filter_list to cohort_list Signed-off-by: Gaurav Gupta * Rename UI variables to match SDK Signed-off-by: Gaurav Gupta * Fix duplicate cohort name Signed-off-by: Gaurav Gupta * Add SDK cohorts to notebook Signed-off-by: Gaurav Gupta * Add dataset validations and add categorical features Signed-off-by: Gaurav Gupta * Add validations for categorical_features Signed-off-by: Gaurav Gupta * Fix sorted imports Signed-off-by: Gaurav Gupta * Add code for translating categorical values Signed-off-by: Gaurav Gupta * Move cohort processing to a separate file Signed-off-by: Gaurav Gupta * Fix code review comments Signed-off-by: 
Gaurav Gupta * Refactor cohort translated function into different small functions Signed-off-by: Gaurav Gupta * Change to lowercase for outcome Signed-off-by: Gaurav Gupta * Fix code review comments Signed-off-by: Gaurav Gupta * Refactor cohort_list validations and converge pytest common functions into fixtures Signed-off-by: Gaurav Gupta * Add conftest into raiwidgets tests Signed-off-by: Gaurav Gupta * Add validations for cohort list Signed-off-by: Gaurav Gupta * Add cohortData test Signed-off-by: Gaurav Gupta * Fix sorted imports Signed-off-by: Gaurav Gupta * isort fix Signed-off-by: Gaurav Gupta * Add UI unit tests for cohort translation Signed-off-by: Gaurav Gupta * Add more checks in UI unit test Signed-off-by: Gaurav Gupta * Add UI tests for regression cohorts Signed-off-by: Gaurav Gupta * Remove notebook change Signed-off-by: Gaurav Gupta * Fix typescript build Signed-off-by: Gaurav Gupta * Change cohort filter values so that cohort filters non-zero points Signed-off-by: Gaurav Gupta * Fix for empty cohort list Signed-off-by: Gaurav Gupta * Simplify the train pipeline responsibleaidashboard-census-classification-model-debugging.ipynb (#1195) * Simplify the train pipeline responsibleaidashboard-census-classification-model-debugging.ipynb Signed-off-by: Gaurav Gupta * Address code review comments * Update notebooks/responsibleaidashboard/responsibleaidashboard-census-classification-model-debugging.ipynb Co-authored-by: Roman Lutz Co-authored-by: Roman Lutz * Propagate error strings instead of raising exceptions Signed-off-by: Gaurav Gupta * Fix code issues Signed-off-by: Gaurav Gupta * Fix code review comments Signed-off-by: Gaurav Gupta * Fix code review comments Signed-off-by: Gaurav Gupta Co-authored-by: Ilya Matiach Co-authored-by: Roman Lutz * Make _cohort.py module a public module (#1253) * Make _cohort.py a public module Signed-off-by: Gaurav Gupta * Add missing file Signed-off-by: Gaurav Gupta * fix notebook build failures due to pywinpty dependency 
release failing in python 3.6 (#1257) * fix notebook build failures due to pywinpty dependency release failing in python 3.6 * build pywinpty from conda instead * add lowerbound * fixup * fixup * Add supported models and data types to README.md responsibleai (#1259) Signed-off-by: Gaurav Gupta * make getting-started notebook a markdown file showing APIs (#1223) * refactor tabs out of RAI dashboard into a separate component (#1256) * Add individual causal scatter chart (#1258) * temp * refactor * test * style fix * comment * minor fix to url for responsibleai package in setup.py (#1260) * Fix UX e2e tests and address code review comments Signed-off-by: Gaurav Gupta * Fix eslint Signed-off-by: Gaurav Gupta * Address review comments Signed-off-by: Gaurav Gupta * Reset the number of samples in test dataset Signed-off-by: Gaurav Gupta Co-authored-by: Ilya Matiach Co-authored-by: Roman Lutz Co-authored-by: Bo Zhang <71688188+zhb000@users.noreply.github.com> Signed-off-by: Gaurav Gupta * Change cohort name Signed-off-by: Gaurav Gupta Co-authored-by: Ilya Matiach Co-authored-by: Bo Zhang <71688188+zhb000@users.noreply.github.com> Co-authored-by: Roman Lutz Co-authored-by: tongy-msft <91754176+tongyu-microsoft@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: xuke444 <40614413+xuke444@users.noreply.github.com> Co-authored-by: Vinutha Karanth * Add more context to download print in fetch_dataset (#1368) * Add missing desired range to counterfactual chart (#1366) Signed-off-by: Gaurav Gupta * Add a summary for disaggregated analysis showing ratio and difference (#1367) * add fairness metric table, update existing tables (order, N/A values, grid axis label lengths) * lintfix * merge fixes * addressed eslint issue for null value by creating a dedicated file, localized N/A, updated tooltip text to be wrapped (+ associated unit tests) * keep feature flag * Update PR template to be more intuitive (#1116) * 
Updating PR template to make sure there's at least 1 box that can be ticked. * replace checkboxes with GitHub emojis for checkmark and x to prevent GitHub from interpreting it as a task list * update PR template based on feedback * lintfix * Update CODEOWNERS file (#1369) * Use global cohort for disaggregated analysis (#1370) * use global cohort for disaggregated analysis and warn user about it * lintfix * Change procedural loops to functions map() and filter() (#1255) * Change procedural loops to function map() and filter() Signed-off-by: Gaurav Gupta * Fix review comment Signed-off-by: Gaurav Gupta * Add accuracy as multiclass metric (#1371) * support accuracy for multiclass classification * lintfix * update code owner (#1372) Signed-off-by: Ke Xu Co-authored-by: Roman Lutz * Rename _validate_model_analysis_input_parameters() to _validate_rai_insights_input_parameters (#1373) Signed-off-by: Gaurav Gupta * Cache and serialize the counterfactual explainer (#1374) * Cache and serialize the counterfactual explainer Signed-off-by: Gaurav Gupta * Fix failing tests Signed-off-by: Gaurav Gupta * Add a top-level pivot to model overview (#1375) * show only disaggregated analysis OR only dataset cohorts * lintfix * basec -> basic typo * Add a toggle to model overview to turn heatmap colors on and off (#1376) * add visual display toggle * lintfix * simplify onVisualDisplayToggleChange method * color replacements * fluentui colors * localization, style file * lintfix * Create rai-e2e package for tests shared with AML (#1378) * update Signed-off-by: vinutha karanth * update Signed-off-by: vinutha karanth * index update Signed-off-by: vinutha karanth * remove files from wid-e2e Signed-off-by: vinutha karanth * update datashape Signed-off-by: vinutha karanth * lintfix Signed-off-by: vinutha karanth * add pyspark support to tree surrogate model in error analysis (#1251) * Create e2e package for tests shared with AML (#1383) * new buildable folder Signed-off-by: vinutha karanth * 
remove rai-e2e lib Signed-off-by: vinutha karanth * add license Signed-off-by: vinutha karanth * update Signed-off-by: vinutha karanth * Add feature and metric configuration flyouts to model overview (#1381) * show only disaggregated analysis OR only dataset cohorts * lintfix * basec -> basic typo * feature config * merge fixes * metric config * lintfix * wrap metric descriptions * lintfix * move constant to constants file to avoid circular dependency, lintfix * move inline code into functions * Add probability distribution line chart to model overview (#1382) * line chart for prob distribution * lintfix * fix race condition in tree view update code which can cause root stats to be ignored on selected cohort in RAI dashboard static view (#1386) * String updates for model overview (#1387) * string updates for model overview * lintfix * Handle the loading of causal models more gracefully (#1377) * Handle the loading of causal models more gracefully Signed-off-by: Gaurav Gupta * Address code review comments and fix test Signed-off-by: Gaurav Gupta * Fix code review comment Signed-off-by: Gaurav Gupta * Some fixes for data explorer, heatmap, counterfactual components (#1389) * add x axis and update heatmap * cf custom point color should be consistent * cf custom point name should be displayed * disable causal treatment whatif in static view (#1391) * fix big data set for counter factual chart (#1393) Signed-off-by: Ke Xu * alignment fixes for causal (#1392) * Fix feature selection state carry-over bug in model overview (#1390) * Provide feature flight support to dashboard, test environment, and Python widget (#1385) * Python part of exposing flights * add feature flighting * fix path for test case * update snapshot * add none constant, contrib guide update, and unit tests for feature_flights param * flake8 * isort * release raiwidgets and responsibleai v0.18.1 (#1413) * Add warning in counterfactual manager when unable to load explainer (#1412) * Add warning in 
counterfactual manager when unable to load explainer Signed-off-by: Gaurav Gupta * Fix linting Signed-off-by: Gaurav Gupta * Bug fixes on 'Set value' not copying over feature values correctly in what if counterfactual panel (#1416) * fix cf set value bug * remove comment * fix error on machines with pyspark installed where passed dataframe is not spark pandas (#1415) * add postbuild branch trigger (#1417) * Fix causal UI strings according to classification/regression tasks (#1419) * Fix causal UI strings according to classification/regression tasks Signed-off-by: Gaurav Gupta * Fix lint error Signed-off-by: Gaurav Gupta * Fix UI test Signed-off-by: Gaurav Gupta * Fix description for model overview (#1425) * fix description for model overview * keep new description for new model overview * fix failing to create error report when filter_features is empty list (#1421) * filter out missing values from what if dropdown to prevent explanation dashboard from crashing (#1418) * Remove |Set Value| blurb in case it is not available in counterfactual panel (#1426) Signed-off-by: Gaurav Gupta * Add y-axis description to counterfactual feature importance chart (#1423) Signed-off-by: Gaurav Gupta Co-authored-by: xuke444 <40614413+xuke444@users.noreply.github.com> * Add the user class name to causal UI strings (#1422) * Fix causal UI strings according to classification/regression tasks Signed-off-by: Gaurav Gupta * Fix lint error Signed-off-by: Gaurav Gupta * Fix UI test Signed-off-by: Gaurav Gupta * Add the user class name to causal UI strings Signed-off-by: Gaurav Gupta Co-authored-by: xuke444 <40614413+xuke444@users.noreply.github.com> * fix math.min / max for array size more than 10^7 (#1427) Signed-off-by: Ke Xu * upgrade pytest and lightgbm to try to fix random pytest segfault test failures (#1424) s * fix flaky notebook causing build failures by adding retry logic (#1431) * Upper bound SciKit-Learn to address freeze in causal (#1432) ## Description Replaces #1429 to address 
#1430 . Causal analysis is getting stuck with the latest release of SciKit-Learn. This contains: - Test case which gets stuck with SciKit-Learn 1.1.0 - Upper bound on SciKit-Learn in `requirements.txt` ## Checklist - [x] I have added screenshots above for all UI changes. - [x] Documentation was updated if it was needed. - [x] New tests were added or changes were manually verified. Signed-off-by: Richard Edgar * fix dependency chart axis updating with incorrect values in explanation dashboard (#1437) * fix codecov and widget test screenshot uploads (#1428) * release raiwidgets and responsibleai v0.18.2 (#1439) * fix (#1441) Signed-off-by: vinutha karanth * Fix cohort name conflict and not run few tests for AML (#1442) * fix Signed-off-by: vinutha karanth * lintfix Signed-off-by: vinutha karanth * Few e2e tests changes to accommodate AML static tests (#1445) * update Signed-off-by: vinutha karanth * update Signed-off-by: vinutha karanth * Fix locators logic for string features - data explorer and model statistics components (#1446) * update Signed-off-by: vinutha karanth * update Signed-off-by: vinutha karanth * fix Signed-off-by: vinutha karanth * update Signed-off-by: vinutha karanth * lintfix Signed-off-by: vinutha karanth * fix Signed-off-by: vinutha karanth * Add more unittests RAI dashboard input class (#1448) * Add unit tests for ResponsibleAIDashboardInput Signed-off-by: Gaurav Gupta * Add more tests Signed-off-by: Gaurav Gupta * Fix imports Signed-off-by: Gaurav Gupta * Address code review comments Signed-off-by: Gaurav Gupta * Address code review comments Signed-off-by: Gaurav Gupta Co-authored-by: Bo Zhang <71688188+zhb000@users.noreply.github.com> Co-authored-by: Roman Lutz Co-authored-by: Ilya Matiach Co-authored-by: tongy-msft <91754176+tongyu-microsoft@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: xuke444 <40614413+xuke444@users.noreply.github.com> Co-authored-by: Vinutha 
Karanth Co-authored-by: Richard Edgar --- raiutils/raiutils/data_processing/__init__.py | 11 + .../data_processing/data_processing_utils.py | 161 +++++++++++ raiutils/raiutils/models/__init__.py | 7 + raiutils/raiutils/models/model_utils.py | 23 ++ raiutils/requirements.txt | 6 +- raiutils/tests/test_data_processing_utils.py | 267 ++++++++++++++++++ raiutils/tests/test_model_utils.py | 27 ++ 7 files changed, 501 insertions(+), 1 deletion(-) create mode 100644 raiutils/raiutils/data_processing/__init__.py create mode 100644 raiutils/raiutils/data_processing/data_processing_utils.py create mode 100644 raiutils/raiutils/models/__init__.py create mode 100644 raiutils/raiutils/models/model_utils.py create mode 100644 raiutils/tests/test_data_processing_utils.py create mode 100644 raiutils/tests/test_model_utils.py diff --git a/raiutils/raiutils/data_processing/__init__.py b/raiutils/raiutils/data_processing/__init__.py new file mode 100644 index 0000000000..a1e1a58ba4 --- /dev/null +++ b/raiutils/raiutils/data_processing/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +"""Module for defining common utilities related to data processing.""" +from .data_processing_utils import (convert_to_list, + convert_to_string_list_dict, + serialize_json_safe) + +__all__ = ['convert_to_list', + 'convert_to_string_list_dict', + 'serialize_json_safe'] diff --git a/raiutils/raiutils/data_processing/data_processing_utils.py b/raiutils/raiutils/data_processing/data_processing_utils.py new file mode 100644 index 0000000000..3bbbf7b96b --- /dev/null +++ b/raiutils/raiutils/data_processing/data_processing_utils.py @@ -0,0 +1,161 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. 
+ +import datetime +import json +from typing import Any, Dict, List + +import numpy as np +import pandas as pd +from scipy.sparse import issparse +from sklearn.utils import check_consistent_length + +_DF_COLUMN_BAD_NAME = "DataFrame column names must be strings."\ + " Name '{0}' is of type {1}" +_LIST_NONSCALAR = "Lists must be of scalar types" +_TOO_MANY_DIMS = "Array must have at most two dimensions" + + +def convert_to_list(array, custom_err_msg=None): + """Convert an array to a list. + + :param array: An array like python object. + :type array: pd.DataFrame or pd.Series or np.ndarray or + pd.Index or scipy sparse array + :param custom_err_msg: A custom error message to use. + :type custom_err_msg: str + :return: Python List. + :rtype: list + """ + if issparse(array): + if array.shape[1] > 1000: + if custom_err_msg is None: + raise ValueError("Exceeds maximum number of features for " + "visualization (1000)") + else: + raise ValueError(custom_err_msg) + return array.toarray().tolist() + if isinstance(array, pd.DataFrame) or isinstance(array, pd.Series): + return array.values.tolist() + if isinstance(array, np.ndarray) or isinstance(array, pd.Index): + return array.tolist() + return array + + +def convert_to_string_list_dict( + base_name_format: str, + ys, + sample_array) -> Dict[str, List]: + """Convert the given input to a string-list dictionary. + + This function is used to convert arrays in a variety of types + into a dictionary mapping column names to regular Python lists + (in preparation for JSON serialization). It is a modification + of the feature processing code in :class:`fairlearn.metrics.MetricFrame`. + + The array to be converted is passed in :code:`ys`, and a variety + of types are supported. The :code:`sample_array` argument is + used in a call to :func:`sklearn.utils.check_consistent_length` + to ensure that the resultant lists are of the right length. 
+ Finally `base_name_format` is used to generate sequential + keys for the dictionary if none are in the supplied :code:`ys`. + It must be of the form :code:`'Base String {0}'`, with the + :code:`{0}` being replaced by a sequential integer. + + It is not possible to list out all the possible underlying types + for :code:`ys`. A brief summary: + - :class:`pd.Series` + - :class:`pd.DataFrame` + - A simple Python list + - A Python dictionary with string keys and values which are + convertible to lists + - Anything convertible to a :class:`np.ndarray` + + :param base_name_format: A custom name format to use. + :type base_name_format: str + :param ys: An array like python object. + :type ys: pd.DataFrame or pd.Series or list or dictionary + :param sample_array: An array like python object. + :type sample_array: pd.DataFrame or pd.Series or list or dictionary + :return: A dictionary of string and lists. + :rtype: Dict[str, List] + """ + result = {} + + if isinstance(ys, pd.Series): + check_consistent_length(ys, sample_array) + if ys.name is not None: + result[ys.name] = convert_to_list(ys) + else: + result[base_name_format.format(0)] = convert_to_list(ys) + elif isinstance(ys, pd.DataFrame): + for i in range(len(ys.columns)): + col_name = ys.columns[i] + if not isinstance(col_name, str): + msg = _DF_COLUMN_BAD_NAME.format(col_name, type(col_name)) + raise ValueError(msg) + column = ys.iloc[:, i] + check_consistent_length(column, sample_array) + result[col_name] = convert_to_list(column) + elif isinstance(ys, list): + if np.isscalar(ys[0]): + f_arr = np.atleast_1d(np.squeeze(np.asarray(ys))) + assert len(f_arr.shape) == 1 # Sanity check + check_consistent_length(f_arr, sample_array) + result[base_name_format.format(0)] = convert_to_list(f_arr) + else: + raise ValueError(_LIST_NONSCALAR) + elif isinstance(ys, dict): + for k, v in ys.items(): + result[k] = convert_to_list(v) + else: + # Assume it's something which can go into np.as_array + f_arr = np.squeeze(np.asarray(ys, 
dtype=object)) + if len(f_arr.shape) == 1: + check_consistent_length(f_arr, sample_array) + result[base_name_format.format(0)] = convert_to_list(f_arr) + elif len(f_arr.shape) == 2: + # Work similarly to pd.DataFrame(data=ndarray) + for i in range(f_arr.shape[1]): + col = f_arr[:, i] + check_consistent_length(col, sample_array) + result[base_name_format.format(i)] = convert_to_list(col) + else: + raise ValueError(_TOO_MANY_DIMS) + + return result + + +def serialize_json_safe(o: Any): + """ + Convert a value into something that is safe to parse as JSON. + + :param o: Object to make JSON safe. + :type o: Any + :return: Serialized object. + """ + if type(o) in {bool, int, float, str, type(None)}: + if isinstance(o, float): + if np.isinf(o) or np.isnan(o): + return 0 + # need to escape double quoted string values + # and other special characters for json + if isinstance(o, str): + return json.dumps(o)[1:-1] + return o + elif isinstance(o, datetime.datetime): + return o.__str__() + elif isinstance(o, dict): + return {k: serialize_json_safe(v, ) for k, v in o.items()} + elif isinstance(o, list): + return [serialize_json_safe(v) for v in o] + elif isinstance(o, tuple): + return tuple(serialize_json_safe(v) for v in o) + elif isinstance(o, np.ndarray): + return serialize_json_safe(o.tolist()) + elif hasattr(o, 'item'): + return o.item() # numpy types + elif hasattr(o, '__dict__'): + return serialize_json_safe(o.__dict__) # objects + else: + return o diff --git a/raiutils/raiutils/models/__init__.py b/raiutils/raiutils/models/__init__.py new file mode 100644 index 0000000000..f9cae2bbe1 --- /dev/null +++ b/raiutils/raiutils/models/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. 
+ +"""Module for defining common utilities related to models.""" +from .model_utils import SKLearn, is_classifier + +__all__ = ['is_classifier', 'SKLearn'] diff --git a/raiutils/raiutils/models/model_utils.py b/raiutils/raiutils/models/model_utils.py new file mode 100644 index 0000000000..eb499a4872 --- /dev/null +++ b/raiutils/raiutils/models/model_utils.py @@ -0,0 +1,23 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + + +class SKLearn(object): + """Provide scikit-learn related constants.""" + + EXAMPLES = 'examples' + LABELS = 'labels' + PREDICT = 'predict' + PREDICTIONS = 'predictions' + PREDICT_PROBA = 'predict_proba' + + +def is_classifier(model): + """Check if the model is a classifier. + + :return: True if the model is a classifier, False otherwise. + :rtype: bool + """ + return (model is not None and + hasattr(model, SKLearn.PREDICT_PROBA) and + model.predict_proba is not None) diff --git a/raiutils/requirements.txt b/raiutils/requirements.txt index fd7d3e06f0..d993101f8d 100644 --- a/raiutils/requirements.txt +++ b/raiutils/requirements.txt @@ -1 +1,5 @@ -requests==2.25.1 \ No newline at end of file +numpy +pandas +requests +scikit-learn +scipy diff --git a/raiutils/tests/test_data_processing_utils.py b/raiutils/tests/test_data_processing_utils.py new file mode 100644 index 0000000000..c70b3a0261 --- /dev/null +++ b/raiutils/tests/test_data_processing_utils.py @@ -0,0 +1,267 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. 
+ +import datetime +import json + +import numpy as np +import pandas as pd +import pytest +from scipy.sparse import csr_matrix + +from raiutils.data_processing import (convert_to_list, + convert_to_string_list_dict, + serialize_json_safe) + + +class TestConvertToStringListDict: + def test_unnamed_series(self): + input = pd.Series(data=[0, 1, 2]) + sample_array = [4, 5, 6] + result = convert_to_string_list_dict("Base {0}", input, sample_array) + assert isinstance(result, dict) + assert len(result) == 1 + assert "Base 0" in result + arr = result["Base 0"] + assert isinstance(arr, list) + assert np.array_equal(arr, [0, 1, 2]) + + def test_named_series(self): + input = pd.Series(data=[1, 3, 5], name="Something") + sample_array = [4, 5, 6] + result = convert_to_string_list_dict("Base {0}", input, sample_array) + assert isinstance(result, dict) + assert len(result) == 1 + assert "Something" in result + arr = result["Something"] + assert isinstance(arr, list) + assert np.array_equal(arr, [1, 3, 5]) + + def test_dataframe(self): + input = pd.DataFrame.from_dict({"a": [0, 1, 2], "b": [4, 5, 6]}) + sample_array = [3, 6, 9] + result = convert_to_string_list_dict("Base {0}", input, sample_array) + assert isinstance(result, dict) + assert len(result) == 2 + assert "a" in result + arr = result["a"] + assert isinstance(arr, list) + assert np.array_equal(arr, [0, 1, 2]) + assert "b" in result + arr = result["b"] + assert isinstance(arr, list) + assert np.array_equal(arr, [4, 5, 6]) + + def test_simplelist(self): + input = [0, 1, 4] + sample_array = [2, 3, 4] + result = convert_to_string_list_dict("Base {0}", input, sample_array) + assert isinstance(result, dict) + assert len(result) == 1 + assert "Base 0" in result + arr = result["Base 0"] + assert isinstance(arr, list) + assert np.array_equal(arr, [0, 1, 4]) + + def test_dict(self): + input = {"a": np.array([0, 1, 2]), "b": pd.Series(data=[3, 4, 5])} + sample_array = [2, 3, 4] + result = convert_to_string_list_dict("Base {0}", 
input, sample_array) + assert isinstance(result, dict) + assert len(result) == 2 + assert "a" in result + arr = result["a"] + assert isinstance(arr, list) + assert np.array_equal(arr, [0, 1, 2]) + assert "b" in result + arr = result["b"] + assert isinstance(arr, list) + assert np.array_equal(arr, [3, 4, 5]) + + def test_numpy1d(self): + input = np.array([0, 1, 4]) + sample_array = [2, 3, 4] + result = convert_to_string_list_dict("Base {0}", input, sample_array) + assert isinstance(result, dict) + assert len(result) == 1 + assert "Base 0" in result + arr = result["Base 0"] + assert isinstance(arr, list) + assert np.array_equal(arr, [0, 1, 4]) + + def test_numpy2d(self): + # Note transpose on the end + input = np.array([[0, 1, 4], [2, 6, 7]]).T + sample_array = [2, 3, 4] + result = convert_to_string_list_dict("Base {0}", input, sample_array) + assert isinstance(result, dict) + assert len(result) == 2 + assert "Base 0" in result + arr = result["Base 0"] + assert isinstance(arr, list) + assert np.array_equal(arr, [0, 1, 4]) + assert "Base 1" in result + arr = result["Base 1"] + assert isinstance(arr, list) + assert np.array_equal(arr, [2, 6, 7]) + + +class TestConvertToList: + def test_pandas_dataframe_to_list(self): + input_dataframe = pd.DataFrame.from_dict( + {"a": [0, 1, 2], "b": [4, 5, 6]} + ) + expected_list = [[0, 4], [1, 5], [2, 6]] + input_as_list = convert_to_list(input_dataframe) + + assert input_as_list is not None + assert input_as_list == expected_list + + def test_array_to_list(self): + input_array = np.array([[0, 4], [1, 5], [2, 6]]) + expected_list = [[0, 4], [1, 5], [2, 6]] + input_as_list = convert_to_list(input_array) + + assert input_as_list is not None + assert input_as_list == expected_list + + def test_list_to_list(self): + input_list = [[0, 4], [1, 5], [2, 6]] + expected_list = [[0, 4], [1, 5], [2, 6]] + input_as_list = convert_to_list(input_list) + + assert input_as_list is not None + assert input_as_list == expected_list + + def 
test_series_to_list(self): + input_series = pd.Series(data=[[0, 4], [1, 5], [2, 6]]) + expected_list = [[0, 4], [1, 5], [2, 6]] + input_as_list = convert_to_list(input_series) + + assert input_as_list is not None + assert input_as_list == expected_list + + def test_index_to_list(self): + input_index = pd.Index(data=[[0, 4], [1, 5], [2, 6]]) + expected_list = [[0, 4], [1, 5], [2, 6]] + input_as_list = convert_to_list(input_index) + + assert input_as_list is not None + assert input_as_list == expected_list + + def test_csr_matrix_to_list(self): + input_sparse_matrix = csr_matrix((3, 10000), + dtype=np.int8) + with pytest.raises(ValueError) as ve: + convert_to_list(input_sparse_matrix) + assert "Exceeds maximum number of features for " + \ + "visualization (1000)" in str(ve.value) + + with pytest.raises(ValueError) as ve: + convert_to_list(input_sparse_matrix, + custom_err_msg="Error occurred") + assert "Error occurred" in str(ve.value) + + row = np.array([0, 0, 1, 2, 2, 2]) + col = np.array([0, 2, 2, 0, 1, 2]) + data = np.array([1, 2, 3, 4, 5, 6]) + sparse_matrix = csr_matrix((data, (row, col)), shape=(3, 3)) + expected_list = [[1, 0, 2], + [0, 0, 3], + [4, 5, 6]] + input_as_list = convert_to_list(sparse_matrix) + + assert input_as_list is not None + assert input_as_list == expected_list + + +class TestSerializationUtilities: + + def test_embedded_object(self): + class A: + def __init__(self): + self.a_data = 'a' + + class B: + def __init__(self): + self.b_data = A() + + result = serialize_json_safe({'B': B()}) + assert result == {'B': {'b_data': {'a_data': 'a'}}} + + def test_numpy(self): + result = serialize_json_safe(np.array([1, 2, 3])) + assert result == [1, 2, 3] + + def test_unknown(self): + c = complex(1, 2) + result = serialize_json_safe([c, 42]) + assert result == [c, 42] + + def test_strings_with_special_chars(self): + special_chars_dict = {"hello": "world\"with\"quotes", + "hi": ["a", "list", "of", + "special\t\"\r\nblah", + "chars"]} + result = 
json.dumps(special_chars_dict, default=serialize_json_safe) + assert result == ("{\"hello\": \"world\\\"with\\\"quotes\", " + + "\"hi\": [\"a\", \"list\", \"of\", " + + "\"special\\t\\\"\\r\\nblah\", \"chars\"]}") + deserialized_special_chars_dict = json.loads(result) + assert special_chars_dict == deserialized_special_chars_dict + + def test_serialize_json_safe_basic(self): + values = [0, 1, 2, 3, 4, 5] + result = serialize_json_safe(values) + assert result == [0, 1, 2, 3, 4, 5] + + values = ['a', 'b', 'a', 'c', 'a', 'b'] + result = serialize_json_safe(values) + assert result == ['a', 'b', 'a', 'c', 'a', 'b'] + + def test_serialize_json_safe_missing(self): + values = [0, np.nan, 2, 3, 4, 5] + result = serialize_json_safe(values) + assert result == [0, 0, 2, 3, 4, 5] + + values = [0, np.inf, 2, 3, 4, 5] + result = serialize_json_safe(values) + assert result == [0, 0, 2, 3, 4, 5] + + values = ['a', 'b', 'a', np.nan, 'a', 'b'] + result = serialize_json_safe(values) + assert result == ['a', 'b', 'a', 0, 'a', 'b'] + + def test_serialize_json_safe_aggregate_types(self): + o = { + 'a': [1, 2, 3], + 'c': 'b' + } + result = serialize_json_safe(o) + assert result == o + + o = ('a', [1, 2, 3]) + result = serialize_json_safe(o) + assert result == o + + values = np.array([[1, 2, 3], [4, 5, 6]]) + result = serialize_json_safe(values) + assert result == values.tolist() + + def test_serialize_timestamp(self): + datetime_str = "2020-10-10" + datetime_object = datetime.datetime.strptime(datetime_str, "%Y-%m-%d") + result = serialize_json_safe(datetime_object) + assert datetime_str in result + + def test_serialize_via_json_timestamp(self): + timestamp_obj = pd.Timestamp(2020, 1, 1) + assert isinstance(timestamp_obj, pd.Timestamp) + result = json.dumps(serialize_json_safe(timestamp_obj)) + assert result is not None + assert "2020" in result + + timestamp_obj_array = np.array([pd.Timestamp(2020, 1, 1)]) + result = json.dumps(serialize_json_safe(timestamp_obj_array)) + assert result is 
not None + assert "2020" in result diff --git a/raiutils/tests/test_model_utils.py b/raiutils/tests/test_model_utils.py new file mode 100644 index 0000000000..c192b973d6 --- /dev/null +++ b/raiutils/tests/test_model_utils.py @@ -0,0 +1,27 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +from raiutils.models import is_classifier + + +class Classifier: + def predict_proba(self): + pass + + def predict(self): + pass + + +class Regressor: + def predict(self): + pass + + +class TestIsClassifier: + def test_classifier(self): + classifier = Classifier() + assert is_classifier(classifier) + + def test_regressor(self): + regressor = Regressor() + assert not is_classifier(regressor)