From 5163cec775bcc5397917331093506367d6afbaf7 Mon Sep 17 00:00:00 2001 From: Tommy Dang Date: Mon, 13 May 2024 22:01:28 -0700 Subject: [PATCH] 3 --- .env.dev | 5 +- .../overview/dashboard/block_layout.yaml | 47 ++-- .../dashboard/block_layout.yaml | 119 +++++++++ .../dashboard/block_layout.yaml | 24 +- .../unit_2_training/global_data_products.yaml | 2 +- mlops/unit_3_observability/charts/__init__.py | 0 ...pipeline_runs_daily_sklearn_training_a3.py | 4 + ...pipeline_runs_daily_sklearn_training_o5.py | 4 + .../charts/feature_profiles_for_ingest.py | 61 +++++ .../charts/ingest_time_series_bar_chart_y0.py | 0 .../charts/missing_values_for_ingest.py | 8 + .../charts/most_frequent_values_for_ingest.py | 21 ++ .../charts/prepare_histogram_k4.py | 16 ++ .../charts/runs_by_model.py | 0 .../charts/shap_values.py | 34 +++ .../charts/shap_values_bars.py | 34 +++ .../charts/shap_values_force_plot.py | 64 +++++ .../charts/summary_overview_for_ingest.py | 17 ++ .../charts/unique_values_for_ingest.py | 2 + .../charts/xgboost_metrics.py | 0 .../charts/xgboost_metrics_by_runs.py | 2 +- .../charts/xgboost_training_runs_hourly.py | 2 +- mlops/unit_3_observability/custom/__init__.py | 0 .../custom/dashboard_data_source.py | 23 ++ .../custom/load_models.py | 28 ++ .../data_exporters/__init__.py | 0 .../data_exporters/training.py | 29 +++ .../data_exporters/xgboost.py | 35 +++ .../global_data_products.yaml | 8 + mlops/unit_3_observability/io_config.yaml | 8 + .../pipelines/data_preparation/__init__.py | 0 .../pipelines/data_preparation/metadata.yaml | 242 ++++++++++++++++++ .../pipelines/sklearn_training/__init__.py | 0 .../sklearn_training/interactions.yaml | 2 + .../pipelines/sklearn_training/metadata.yaml | 106 ++++++++ .../pipelines/xgboost_training/__init__.py | 0 .../xgboost_training/interactions.yaml | 2 + .../pipelines/xgboost_training/metadata.yaml | 105 ++++++++ .../transformers/__init__.py | 0 .../hyperparameter_tuning/sklearn.py | 39 +++ .../hyperparameter_tuning/xgboost.py 
| 38 +++ 41 files changed, 1091 insertions(+), 40 deletions(-) rename mlops/{unit_5_running => }/presenters/overview/dashboard/block_layout.yaml (65%) create mode 100644 mlops/presenters/pipelines/sklearn_training/dashboard/block_layout.yaml rename mlops/{unit_5_running => }/presenters/pipelines/xgboost_training/dashboard/block_layout.yaml (72%) create mode 100644 mlops/unit_3_observability/charts/__init__.py create mode 100644 mlops/unit_3_observability/charts/completed_pipeline_runs_daily_sklearn_training_a3.py create mode 100644 mlops/unit_3_observability/charts/failed_pipeline_runs_daily_sklearn_training_o5.py create mode 100644 mlops/unit_3_observability/charts/feature_profiles_for_ingest.py create mode 100644 mlops/unit_3_observability/charts/ingest_time_series_bar_chart_y0.py create mode 100644 mlops/unit_3_observability/charts/missing_values_for_ingest.py create mode 100644 mlops/unit_3_observability/charts/most_frequent_values_for_ingest.py create mode 100644 mlops/unit_3_observability/charts/prepare_histogram_k4.py rename mlops/{unit_5_running => unit_3_observability}/charts/runs_by_model.py (100%) create mode 100644 mlops/unit_3_observability/charts/shap_values.py create mode 100644 mlops/unit_3_observability/charts/shap_values_bars.py create mode 100644 mlops/unit_3_observability/charts/shap_values_force_plot.py create mode 100644 mlops/unit_3_observability/charts/summary_overview_for_ingest.py create mode 100644 mlops/unit_3_observability/charts/unique_values_for_ingest.py rename mlops/{unit_5_running => unit_3_observability}/charts/xgboost_metrics.py (100%) rename mlops/{unit_5_running => unit_3_observability}/charts/xgboost_metrics_by_runs.py (98%) rename mlops/{unit_5_running => unit_3_observability}/charts/xgboost_training_runs_hourly.py (97%) create mode 100755 mlops/unit_3_observability/custom/__init__.py create mode 100644 mlops/unit_3_observability/custom/dashboard_data_source.py create mode 100644 
mlops/unit_3_observability/custom/load_models.py create mode 100755 mlops/unit_3_observability/data_exporters/__init__.py create mode 100644 mlops/unit_3_observability/data_exporters/training.py create mode 100644 mlops/unit_3_observability/data_exporters/xgboost.py create mode 100644 mlops/unit_3_observability/global_data_products.yaml create mode 100755 mlops/unit_3_observability/io_config.yaml create mode 100755 mlops/unit_3_observability/pipelines/data_preparation/__init__.py create mode 100755 mlops/unit_3_observability/pipelines/data_preparation/metadata.yaml create mode 100755 mlops/unit_3_observability/pipelines/sklearn_training/__init__.py create mode 100644 mlops/unit_3_observability/pipelines/sklearn_training/interactions.yaml create mode 100755 mlops/unit_3_observability/pipelines/sklearn_training/metadata.yaml create mode 100755 mlops/unit_3_observability/pipelines/xgboost_training/__init__.py create mode 100644 mlops/unit_3_observability/pipelines/xgboost_training/interactions.yaml create mode 100755 mlops/unit_3_observability/pipelines/xgboost_training/metadata.yaml create mode 100755 mlops/unit_3_observability/transformers/__init__.py create mode 100644 mlops/unit_3_observability/transformers/hyperparameter_tuning/sklearn.py create mode 100644 mlops/unit_3_observability/transformers/hyperparameter_tuning/xgboost.py diff --git a/.env.dev b/.env.dev index 4707197e4..bfe1a6daf 100644 --- a/.env.dev +++ b/.env.dev @@ -12,14 +12,15 @@ PYTHONPATH="${MAGE_CODE_PATH}/${PROJECT_NAME}:${PYTHONPATH}" MAGE_PRESENTERS_DIRECTORY="$PROJECT_NAME/presenters" # Database +POSTGRES_HOST=magic-database POSTGRES_DB=magic POSTGRES_PASSWORD=password POSTGRES_USER=postgres -MAGE_DATABASE_CONNECTION_URL="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@magic-database:5432/${POSTGRES_DB}" +MAGE_DATABASE_CONNECTION_URL="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:5432/${POSTGRES_DB}" # Experiments EXPERIMENTS_DB=experiments 
-EXPERIMENTS_TRACKING_URI="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@magic-database:5432/${EXPERIMENTS_DB}" +EXPERIMENTS_TRACKING_URI="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:5432/${EXPERIMENTS_DB}" # Alerts SMTP_EMAIL= diff --git a/mlops/unit_5_running/presenters/overview/dashboard/block_layout.yaml b/mlops/presenters/overview/dashboard/block_layout.yaml similarity index 65% rename from mlops/unit_5_running/presenters/overview/dashboard/block_layout.yaml rename to mlops/presenters/overview/dashboard/block_layout.yaml index 5cd2541b6..f45199676 100644 --- a/mlops/unit_5_running/presenters/overview/dashboard/block_layout.yaml +++ b/mlops/presenters/overview/dashboard/block_layout.yaml @@ -3,7 +3,7 @@ blocks: configuration: chart_type: pie chart group_by: - - model + - model data_source: type: chart_code error: null @@ -15,12 +15,12 @@ blocks: configuration: chart_type: time series line chart group_by: - - start_time + - start_time metrics: - - aggregation: average - column: mse - - aggregation: average - column: rmse + - aggregation: average + column: mse + - aggregation: average + column: rmse time_interval: original data_source: type: chart_code @@ -34,12 +34,12 @@ blocks: configuration: chart_type: bar chart group_by: - - run_id + - run_id metrics: - - aggregation: max - column: rmse - - aggregation: max - column: mse + - aggregation: max + column: rmse + - aggregation: max + column: mse data_source: type: chart_code error: null @@ -53,25 +53,26 @@ blocks: chart_style: horizontal chart_type: bar chart group_by: - - start_time_hour + - start_time_hour metrics: - - aggregation: count_distinct - column: run_id + - aggregation: count_distinct + column: run_id y_sort_order: descending data_source: - refresh_interval: '60000' + refresh_interval: "60000" type: chart_code error: null name: XGBoost training runs hourly + name_new: XGBoost training runs hourly skip_render: false type: chart uuid: 
xgboost_training_runs_hourly layout: -- - block_uuid: xgboost_metrics - width: 1 - - block_uuid: xgboost_metrics_by_runs - width: 1 -- - block_uuid: runs_by_model - width: 1 - - block_uuid: xgboost_training_runs_hourly - width: 1 + - - block_uuid: xgboost_metrics + width: 1 + - block_uuid: xgboost_metrics_by_runs + width: 1 + - - block_uuid: xgboost_training_runs_hourly + width: 1 + - block_uuid: runs_by_model + width: 1 diff --git a/mlops/presenters/pipelines/sklearn_training/dashboard/block_layout.yaml b/mlops/presenters/pipelines/sklearn_training/dashboard/block_layout.yaml new file mode 100644 index 000000000..2ca2d2a4c --- /dev/null +++ b/mlops/presenters/pipelines/sklearn_training/dashboard/block_layout.yaml @@ -0,0 +1,119 @@ +blocks: + completed_pipeline_runs_daily_sklearn_training_a3: + configuration: + chart_type: time series line chart + group_by: + - execution_date + metrics: + - aggregation: count_distinct + column: id + time_interval: day + data_source: + pipeline_uuid: sklearn_training + type: pipeline_runs + name: Completed pipeline runs daily + type: chart + uuid: completed_pipeline_runs_daily_sklearn_training_a3 + failed_pipeline_runs_daily_sklearn_training_o5: + configuration: + chart_type: time series line chart + group_by: + - execution_date + metrics: + - aggregation: count_distinct + column: id + time_interval: day + data_source: + pipeline_uuid: sklearn_training + type: pipeline_runs + name: Failed pipeline runs daily + type: chart + uuid: failed_pipeline_runs_daily_sklearn_training_o5 + pipeline_run_status_sklearn_training_n3: + configuration: + chart_style: horizontal + chart_type: bar chart + group_by: + - status + metrics: + - aggregation: count_distinct + column: id + y_sort_order: descending + data_source: + pipeline_uuid: sklearn_training + type: pipeline_runs + name: Pipeline run status + type: chart + uuid: pipeline_run_status_sklearn_training_n3 + pipeline_runs_daily_sklearn_training_p1: + configuration: + chart_type: time series 
line chart + group_by: + - execution_date + metrics: + - aggregation: count_distinct + column: id + time_interval: day + data_source: + pipeline_uuid: sklearn_training + type: pipeline_runs + name: Pipeline runs daily + type: chart + uuid: pipeline_runs_daily_sklearn_training_p1 + trigger_active_status_sklearn_training_r5: + configuration: + chart_type: bar chart + group_by: + - status + metrics: + - aggregation: count_distinct + column: id + y_sort_order: descending + data_source: + pipeline_uuid: sklearn_training + type: pipeline_schedules + name: Trigger active status + type: chart + uuid: trigger_active_status_sklearn_training_r5 + trigger_frequency_sklearn_training_o8: + configuration: + chart_style: horizontal + chart_type: bar chart + group_by: + - schedule_interval + metrics: + - aggregation: count_distinct + column: id + y_sort_order: descending + data_source: + pipeline_uuid: sklearn_training + type: pipeline_schedules + name: Trigger frequency + type: chart + uuid: trigger_frequency_sklearn_training_o8 + trigger_types_sklearn_training_s4: + configuration: + chart_type: pie chart + group_by: + - schedule_type + data_source: + pipeline_uuid: sklearn_training + type: pipeline_schedules + name: Trigger types + type: chart + uuid: trigger_types_sklearn_training_s4 +layout: +- - block_uuid: trigger_active_status_sklearn_training_r5 + width: 1 + - block_uuid: trigger_types_sklearn_training_s4 + width: 1 + - block_uuid: trigger_frequency_sklearn_training_o8 + width: 2 +- - block_uuid: pipeline_run_status_sklearn_training_n3 + width: 1 + - block_uuid: pipeline_runs_daily_sklearn_training_p1 + width: 2 +- - block_uuid: completed_pipeline_runs_daily_sklearn_training_a3 + width: 1 + - block_uuid: failed_pipeline_runs_daily_sklearn_training_o5 + width: 1 diff --git a/mlops/unit_5_running/presenters/pipelines/xgboost_training/dashboard/block_layout.yaml b/mlops/presenters/pipelines/xgboost_training/dashboard/block_layout.yaml similarity index 72% rename from 
mlops/unit_5_running/presenters/pipelines/xgboost_training/dashboard/block_layout.yaml rename to mlops/presenters/pipelines/xgboost_training/dashboard/block_layout.yaml index 71b9ae896..16efe627f 100644 --- a/mlops/unit_5_running/presenters/pipelines/xgboost_training/dashboard/block_layout.yaml +++ b/mlops/presenters/pipelines/xgboost_training/dashboard/block_layout.yaml @@ -3,10 +3,10 @@ blocks: configuration: chart_type: custom data_source: - block_uuid: chart_source + block_uuid: dashboard_data_source pipeline_schedule_id: null pipeline_uuid: xgboost_training - refresh_interval: '60000' + refresh_interval: "60000" type: block error: null name: SHAP values @@ -18,7 +18,7 @@ blocks: configuration: chart_type: custom data_source: - block_uuid: chart_source + block_uuid: dashboard_data_source pipeline_schedule_id: null pipeline_uuid: xgboost_training type: block @@ -32,7 +32,7 @@ blocks: configuration: chart_type: custom data_source: - block_uuid: chart_source + block_uuid: dashboard_data_source pipeline_schedule_id: null pipeline_uuid: xgboost_training type: block @@ -42,11 +42,11 @@ blocks: type: chart uuid: shap_values_force_plot layout: -- - block_uuid: shap_values - height: 500 - width: 1 - - block_uuid: shap_values_bars - height: 500 - width: 1 -- - block_uuid: shap_values_force_plot - width: 1 + - - block_uuid: shap_values + height: 500 + width: 1 + - block_uuid: shap_values_bars + height: 500 + width: 1 + - - block_uuid: shap_values_force_plot + width: 1 diff --git a/mlops/unit_2_training/global_data_products.yaml b/mlops/unit_2_training/global_data_products.yaml index b82f0d3aa..514db3f2d 100644 --- a/mlops/unit_2_training/global_data_products.yaml +++ b/mlops/unit_2_training/global_data_products.yaml @@ -2,7 +2,7 @@ training_set: object_type: pipeline object_uuid: data_preparation outdated_after: - seconds: 3600 + seconds: 600 settings: build: partitions: 1 diff --git a/mlops/unit_3_observability/charts/__init__.py 
b/mlops/unit_3_observability/charts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/mlops/unit_3_observability/charts/completed_pipeline_runs_daily_sklearn_training_a3.py b/mlops/unit_3_observability/charts/completed_pipeline_runs_daily_sklearn_training_a3.py new file mode 100644 index 000000000..db6dbba66 --- /dev/null +++ b/mlops/unit_3_observability/charts/completed_pipeline_runs_daily_sklearn_training_a3.py @@ -0,0 +1,4 @@ + +@data_source +def d(df): + return df[df['status'] == 'completed'] diff --git a/mlops/unit_3_observability/charts/failed_pipeline_runs_daily_sklearn_training_o5.py b/mlops/unit_3_observability/charts/failed_pipeline_runs_daily_sklearn_training_o5.py new file mode 100644 index 000000000..2d61fc8ce --- /dev/null +++ b/mlops/unit_3_observability/charts/failed_pipeline_runs_daily_sklearn_training_o5.py @@ -0,0 +1,4 @@ + +@data_source +def d(df): + return df[df['status'] == 'failed'] diff --git a/mlops/unit_3_observability/charts/feature_profiles_for_ingest.py b/mlops/unit_3_observability/charts/feature_profiles_for_ingest.py new file mode 100644 index 000000000..ad72b4884 --- /dev/null +++ b/mlops/unit_3_observability/charts/feature_profiles_for_ingest.py @@ -0,0 +1,61 @@ +import statistics +from mage_ai.data_cleaner.column_types.column_type_detector import infer_column_types +from mage_ai.data_preparation.models.constants import DATAFRAME_ANALYSIS_MAX_COLUMNS +from mage_ai.shared.parsers import convert_matrix_to_dataframe + + +df_1 = convert_matrix_to_dataframe(df_1) +df_1 = df_1.iloc[:, :DATAFRAME_ANALYSIS_MAX_COLUMNS] +columns_and_types = infer_column_types(df_1).items() +columns = [t[0] for t in columns_and_types] +stats = ['Type', 'Missing values', 'Unique values', 'Min', 'Max', 'Mean', 'Median', 'Mode'] +rows = [[] for _ in stats] + +for col, col_type in columns_and_types: + series = df_1[col] + + min_value = None + max_value = None + mean = None + median = None + + not_null = series[series.notnull()] + + if 
len(not_null) == 0: + continue + + if col_type.value in ['number', 'number_with_decimals']: + if str(series.dtype) == 'object': + if col_type.value == 'number_with_decimals': + series = series.astype('float64') + not_null = not_null.astype('float64') + else: + series = series.astype('int64') + not_null = not_null.astype('int64') + + count = len(not_null.index) + if count >= 1: + mean = round(not_null.sum() / count, 2) + median = sorted(not_null)[int(count / 2)] + min_value = round(series.min(), 2) + max_value = round(series.max(), 2) + else: + min_value = not_null.astype(str).min() + max_value = not_null.astype(str).max() + + _, mode = sorted( + [(v, k) for k, v in not_null.value_counts().items()], + reverse=True, + )[0] + + for idx, value in enumerate([ + col_type.value, + len(series[series.isna()].index), + len(series.unique()), + min_value, + max_value, + mean, + median, + mode, + ]): + rows[idx].append(value) diff --git a/mlops/unit_3_observability/charts/ingest_time_series_bar_chart_y0.py b/mlops/unit_3_observability/charts/ingest_time_series_bar_chart_y0.py new file mode 100644 index 000000000..e69de29bb diff --git a/mlops/unit_3_observability/charts/missing_values_for_ingest.py b/mlops/unit_3_observability/charts/missing_values_for_ingest.py new file mode 100644 index 000000000..91dddf5df --- /dev/null +++ b/mlops/unit_3_observability/charts/missing_values_for_ingest.py @@ -0,0 +1,8 @@ +number_of_rows = len(df_1.index) +columns_with_mising_values = [] +percentage_of_missing_values = [] +for col in df_1.columns: + missing = df_1[col].isna().sum() + if missing > 0: + columns_with_mising_values.append(col) + percentage_of_missing_values.append(100 * missing / number_of_rows) diff --git a/mlops/unit_3_observability/charts/most_frequent_values_for_ingest.py b/mlops/unit_3_observability/charts/most_frequent_values_for_ingest.py new file mode 100644 index 000000000..b482bb4b9 --- /dev/null +++ b/mlops/unit_3_observability/charts/most_frequent_values_for_ingest.py 
@@ -0,0 +1,21 @@ +from mage_ai.data_preparation.models.constants import DATAFRAME_ANALYSIS_MAX_COLUMNS +from mage_ai.shared.parsers import convert_matrix_to_dataframe + + +df_1 = convert_matrix_to_dataframe(df_1) +columns = ['mode value', 'frequency', '% of values'] +column_index = [] +rows = [] +for col in df_1.columns[:DATAFRAME_ANALYSIS_MAX_COLUMNS]: + value_counts = df_1[col].value_counts() + if len(value_counts.index) == 0: + continue + column_value = value_counts.index[0] + value = value_counts[column_value] + number_of_rows = df_1[col].count() + column_index.append(col) + rows.append([ + column_value, + f'{round(100 * value / number_of_rows, 2)}%', + value, + ]) diff --git a/mlops/unit_3_observability/charts/prepare_histogram_k4.py b/mlops/unit_3_observability/charts/prepare_histogram_k4.py new file mode 100644 index 000000000..a95db5c30 --- /dev/null +++ b/mlops/unit_3_observability/charts/prepare_histogram_k4.py @@ -0,0 +1,16 @@ +import pandas as pd + +from mage_ai.shared.parsers import convert_matrix_to_dataframe + + +if isinstance(df_1, list) and len(df_1) >= 1: + item = df_1[0] + if isinstance(item, pd.Series): + item = item.to_frame() + elif not isinstance(item, pd.DataFrame): + item = convert_matrix_to_dataframe(item) + df_1 = item + +columns = df_1.columns +col = 'trip_distance' +x = df_1[df_1[col] <= 20][col] diff --git a/mlops/unit_5_running/charts/runs_by_model.py b/mlops/unit_3_observability/charts/runs_by_model.py similarity index 100% rename from mlops/unit_5_running/charts/runs_by_model.py rename to mlops/unit_3_observability/charts/runs_by_model.py diff --git a/mlops/unit_3_observability/charts/shap_values.py b/mlops/unit_3_observability/charts/shap_values.py new file mode 100644 index 000000000..ece1cd41f --- /dev/null +++ b/mlops/unit_3_observability/charts/shap_values.py @@ -0,0 +1,34 @@ +import base64 +import io +from typing import Tuple + +import matplotlib.pyplot as plt +import numpy as np +import shap +from pandas import Series +from 
scipy.sparse._csr import csr_matrix +from xgboost import Booster + + +@render(render_type='jpeg') +def create_visualization(inputs: Tuple[Booster, csr_matrix, Series], *args, **kwargs): + model, X, _ = inputs + + # Random sampling - for example, 10% of the data + sample_indices = np.random.choice(X.shape[0], size=int(X.shape[0] * 0.1), replace=False) + X_sampled = X[sample_indices] + X_sampled = X[:1] + + # Now, use X_sampled instead of X for SHAP analysis + explainer = shap.TreeExplainer(model) + shap_values = explainer.shap_values(X_sampled) + shap.summary_plot(shap_values, X_sampled) + + my_stringIObytes = io.BytesIO() + plt.savefig(my_stringIObytes, format='jpg') + my_stringIObytes.seek(0) + my_base64_jpgData = base64.b64encode(my_stringIObytes.read()).decode() + + plt.close() + + return my_base64_jpgData diff --git a/mlops/unit_3_observability/charts/shap_values_bars.py b/mlops/unit_3_observability/charts/shap_values_bars.py new file mode 100644 index 000000000..31b75f488 --- /dev/null +++ b/mlops/unit_3_observability/charts/shap_values_bars.py @@ -0,0 +1,34 @@ +import base64 +import io +from typing import Tuple + +import matplotlib.pyplot as plt +import numpy as np +import shap +from pandas import Series +from scipy.sparse._csr import csr_matrix +from xgboost import Booster + + +@render(render_type='jpeg') +def create_visualization(inputs: Tuple[Booster, csr_matrix, Series], *args, **kwargs): + model, X, _ = inputs + + # Random sampling - for example, 10% of the data + sample_indices = np.random.choice(X.shape[0], size=int(X.shape[0] * 0.1), replace=False) + X_sampled = X[sample_indices] + X_sampled = X[:1] + + # Now, use X_sampled instead of X for SHAP analysis + explainer = shap.TreeExplainer(model) + shap_values = explainer.shap_values(X_sampled) + shap.summary_plot(shap_values, X_sampled, plot_type='bar') + + my_stringIObytes = io.BytesIO() + plt.savefig(my_stringIObytes, format='jpg') + my_stringIObytes.seek(0) + my_base64_jpgData = 
base64.b64encode(my_stringIObytes.read()).decode() + + plt.close() + + return my_base64_jpgData diff --git a/mlops/unit_3_observability/charts/shap_values_force_plot.py b/mlops/unit_3_observability/charts/shap_values_force_plot.py new file mode 100644 index 000000000..681a9e3a7 --- /dev/null +++ b/mlops/unit_3_observability/charts/shap_values_force_plot.py @@ -0,0 +1,64 @@ +import base64 +import io +from typing import Tuple + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import shap +from pandas import Series +from scipy.sparse._csr import csr_matrix +from xgboost import Booster + +from mage_ai.shared.parsers import convert_matrix_to_dataframe + + +@render(render_type='jpeg') +def create_visualization(inputs: Tuple[Booster, csr_matrix, Series], *args, **kwargs): + model, X, _ = inputs + + # Random sampling - for example, 10% of the data + sample_indices = np.random.choice(X.shape[0], size=int(X.shape[0] * 0.1), replace=False) + X_sampled = X[sample_indices] + X_sampled = X[:1] + + X_sampled = convert_matrix_to_dataframe(X_sampled) + + # Now, use X_sampled instead of X for SHAP analysis + explainer = shap.TreeExplainer(model) + shap_values = explainer.shap_values(X_sampled) + + # Calculate the mean absolute SHAP values for each feature + shap_sum = np.abs(shap_values).mean(axis=0) + + X = convert_matrix_to_dataframe(X) + + importance_df = pd.DataFrame([X.columns.tolist(), shap_sum.tolist()]).T + importance_df.columns = ['column_name', 'shap_importance'] + importance_df = importance_df.sort_values('shap_importance', ascending=False) + + # Get the names of the top 10 most important features + top_n_features = importance_df['column_name'].head(10).tolist() + + # Reduce the original X to these top 10 features + X_top_n = X[top_n_features] + + # If idx is not defined, assuming we're taking the first sample for demonstration + idx = 0 # Or any specific index of interest + + # Generate the force plot for this specific instance and only for the 
top N features + shap.force_plot( + explainer.expected_value, + shap_values[idx, :][np.newaxis, X.columns.get_indexer(top_n_features)], + X_top_n.iloc[idx, :], + matplotlib=True + ) + + string_bytes = io.BytesIO() + plt.savefig(string_bytes, format='png') + string_bytes.seek(0) + image_str = base64.b64encode(string_bytes.read()).decode() + + plt.close() + + return image_str diff --git a/mlops/unit_3_observability/charts/summary_overview_for_ingest.py b/mlops/unit_3_observability/charts/summary_overview_for_ingest.py new file mode 100644 index 000000000..769c5ab9f --- /dev/null +++ b/mlops/unit_3_observability/charts/summary_overview_for_ingest.py @@ -0,0 +1,17 @@ +from mage_ai.data_cleaner.column_types.column_type_detector import infer_column_types + + +headers = ['value'] +stats = ['Columns', 'Rows'] +rows = [[len(df_1.columns)], [len(df_1.index)]] + +col_counts = {} +for col, col_type in infer_column_types(df_1).items(): + col_type_name = col_type.value + if not col_counts.get(col_type_name): + col_counts[col_type_name] = 0 + col_counts[col_type_name] += 1 + +for col_type, count in sorted(col_counts.items()): + stats.append(f'# of {col_type}') + rows.append([count]) diff --git a/mlops/unit_3_observability/charts/unique_values_for_ingest.py b/mlops/unit_3_observability/charts/unique_values_for_ingest.py new file mode 100644 index 000000000..6ffb530dc --- /dev/null +++ b/mlops/unit_3_observability/charts/unique_values_for_ingest.py @@ -0,0 +1,2 @@ +columns = df_1.columns +number_of_unique_values = [df_1[col].nunique() for col in columns] diff --git a/mlops/unit_5_running/charts/xgboost_metrics.py b/mlops/unit_3_observability/charts/xgboost_metrics.py similarity index 100% rename from mlops/unit_5_running/charts/xgboost_metrics.py rename to mlops/unit_3_observability/charts/xgboost_metrics.py diff --git a/mlops/unit_5_running/charts/xgboost_metrics_by_runs.py b/mlops/unit_3_observability/charts/xgboost_metrics_by_runs.py similarity index 98% rename from 
mlops/unit_5_running/charts/xgboost_metrics_by_runs.py rename to mlops/unit_3_observability/charts/xgboost_metrics_by_runs.py index f71a79fcc..4520f50d3 100644 --- a/mlops/unit_5_running/charts/xgboost_metrics_by_runs.py +++ b/mlops/unit_3_observability/charts/xgboost_metrics_by_runs.py @@ -29,7 +29,7 @@ def experiments(*args, **kwargs): ON runs.run_uuid = metrics.run_uuid WHERE tags.key = 'model' -AND tags.value = 'xgboost' +AND tags.value = 'Booster' ORDER BY runs.start_time ASC """ diff --git a/mlops/unit_5_running/charts/xgboost_training_runs_hourly.py b/mlops/unit_3_observability/charts/xgboost_training_runs_hourly.py similarity index 97% rename from mlops/unit_5_running/charts/xgboost_training_runs_hourly.py rename to mlops/unit_3_observability/charts/xgboost_training_runs_hourly.py index 688669d93..e2c4ebcec 100644 --- a/mlops/unit_5_running/charts/xgboost_training_runs_hourly.py +++ b/mlops/unit_3_observability/charts/xgboost_training_runs_hourly.py @@ -31,7 +31,7 @@ def experiments(*args, **kwargs): ON runs.run_uuid = metrics.run_uuid WHERE tags.key = 'model' -AND tags.value = 'xgboost' +AND tags.value = 'Booster' ORDER BY runs.start_time ASC """ diff --git a/mlops/unit_3_observability/custom/__init__.py b/mlops/unit_3_observability/custom/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/unit_3_observability/custom/dashboard_data_source.py b/mlops/unit_3_observability/custom/dashboard_data_source.py new file mode 100644 index 000000000..d7b1a2238 --- /dev/null +++ b/mlops/unit_3_observability/custom/dashboard_data_source.py @@ -0,0 +1,23 @@ +from typing import Dict, Tuple, Union + +from pandas import Series +from scipy.sparse import csr_matrix +from xgboost import Booster + +if 'custom' not in globals(): + from mage_ai.data_preparation.decorators import custom + + +@custom +def source( + settings: Tuple[ + Dict[str, Union[bool, float, int, str]], + csr_matrix, + Series, + ], + model: Booster, + **kwargs, +) -> Tuple[Booster, 
csr_matrix, csr_matrix]: + _, X_train, y_train = settings + + return model, X_train, y_train diff --git a/mlops/unit_3_observability/custom/load_models.py b/mlops/unit_3_observability/custom/load_models.py new file mode 100644 index 000000000..c52316b15 --- /dev/null +++ b/mlops/unit_3_observability/custom/load_models.py @@ -0,0 +1,28 @@ +from typing import Dict, List, Tuple + +if 'custom' not in globals(): + from mage_ai.data_preparation.decorators import custom + + +@custom +def models(*args, **kwargs) -> Tuple[List[str], List[Dict[str, str]]]: + """ + models: comma separated strings + linear_model.Lasso + linear_model.LinearRegression + svm.LinearSVR + ensemble.ExtraTreesRegressor + ensemble.GradientBoostingRegressor + ensemble.RandomForestRegressor + """ + model_names: str = kwargs.get( + 'models', 'linear_model.LinearRegression,linear_model.Lasso' + ) + child_data: List[str] = [ + model_name.strip() for model_name in model_names.split(',') + ] + child_metadata: List[Dict] = [ + dict(block_uuid=model_name.split('.')[-1]) for model_name in child_data + ] + + return child_data, child_metadata diff --git a/mlops/unit_3_observability/data_exporters/__init__.py b/mlops/unit_3_observability/data_exporters/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/unit_3_observability/data_exporters/training.py b/mlops/unit_3_observability/data_exporters/training.py new file mode 100644 index 000000000..c97eaeb56 --- /dev/null +++ b/mlops/unit_3_observability/data_exporters/training.py @@ -0,0 +1,29 @@ +from typing import Callable, Dict, Tuple, Union + +from pandas import Series +from scipy.sparse._csr import csr_matrix +from sklearn.base import BaseEstimator + +from mlops.utils.models.sklearn import load_class, train_model + +if 'data_exporter' not in globals(): + from mage_ai.data_preparation.decorators import data_exporter + + +@data_exporter +def train( + settings: Tuple[ + Dict[str, Union[bool, float, int, str]], + csr_matrix, + Series, + 
Dict[str, Union[Callable[..., BaseEstimator], str]], + ], + **kwargs, +) -> Tuple[BaseEstimator, Dict[str, str]]: + hyperparameters, X, y, model_info = settings + + model_class = model_info['cls'] + model = model_class(**hyperparameters) + model.fit(X, y) + + return model, model_info diff --git a/mlops/unit_3_observability/data_exporters/xgboost.py b/mlops/unit_3_observability/data_exporters/xgboost.py new file mode 100644 index 000000000..74cfbf0c3 --- /dev/null +++ b/mlops/unit_3_observability/data_exporters/xgboost.py @@ -0,0 +1,35 @@ +from typing import Dict, Tuple, Union + +from pandas import Series +from scipy.sparse._csr import csr_matrix +from xgboost import Booster + +from utils.models.xgboost import build_data, fit_model + +if 'data_exporter' not in globals(): + from mage_ai.data_preparation.decorators import data_exporter + + +@data_exporter +def train( + settings: Tuple[ + Dict[str, Union[bool, float, int, str]], + csr_matrix, + Series, + ], + **kwargs, +) -> Tuple[Booster, csr_matrix, Series]: + hyperparameters, X, y = settings + + # Test training a model with low max depth + # so that the output renders a reasonably sized plot tree. 
+ if kwargs.get('max_depth'): + hyperparameters['max_depth'] = int(kwargs.get('max_depth')) + + model = fit_model( + build_data(X, y), + hyperparameters, + verbose_eval=kwargs.get('verbose_eval', 100), + ) + + return model \ No newline at end of file diff --git a/mlops/unit_3_observability/global_data_products.yaml b/mlops/unit_3_observability/global_data_products.yaml new file mode 100644 index 000000000..514db3f2d --- /dev/null +++ b/mlops/unit_3_observability/global_data_products.yaml @@ -0,0 +1,8 @@ +training_set: + object_type: pipeline + object_uuid: data_preparation + outdated_after: + seconds: 600 + settings: + build: + partitions: 1 diff --git a/mlops/unit_3_observability/io_config.yaml b/mlops/unit_3_observability/io_config.yaml new file mode 100755 index 000000000..0307fc20d --- /dev/null +++ b/mlops/unit_3_observability/io_config.yaml @@ -0,0 +1,8 @@ +version: 0.1.1 +default: + POSTGRES_CONNECT_TIMEOUT: 10 + POSTGRES_DBNAME: "{{ env_var('EXPERIMENTS_DB') }}" + POSTGRES_USER: "{{ env_var('POSTGRES_USER') }}" + POSTGRES_PASSWORD: "{{ env_var('POSTGRES_PASSWORD') }}" + POSTGRES_HOST: "{{ env_var('POSTGRES_HOST') }}" + POSTGRES_PORT: 5432 diff --git a/mlops/unit_3_observability/pipelines/data_preparation/__init__.py b/mlops/unit_3_observability/pipelines/data_preparation/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/unit_3_observability/pipelines/data_preparation/metadata.yaml b/mlops/unit_3_observability/pipelines/data_preparation/metadata.yaml new file mode 100755 index 000000000..91eccf01b --- /dev/null +++ b/mlops/unit_3_observability/pipelines/data_preparation/metadata.yaml @@ -0,0 +1,242 @@ +blocks: + - all_upstream_blocks_executed: true + color: null + configuration: + disable_output_preview: false + file_source: + path: unit_1_data_preparation/data_loaders/ingest.py + downstream_blocks: + - prepare + - ingest_time_series_bar_chart_y0 + - missing_values_for_ingest + - unique_values_for_ingest + - 
most_frequent_values_for_ingest + - summary_overview_for_ingest + - feature_profiles_for_ingest + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: Ingest + retry_config: {} + status: executed + timeout: null + type: data_loader + upstream_blocks: [] + uuid: ingest + - all_upstream_blocks_executed: true + color: null + configuration: + file_source: + path: unit_1_data_preparation/transformers/prepare.py + downstream_blocks: + - build + - prepare_histogram_k4 + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: Prepare + retry_config: null + status: executed + timeout: null + type: transformer + upstream_blocks: + - ingest + uuid: prepare + - all_upstream_blocks_executed: true + color: null + configuration: + file_source: + path: unit_1_data_preparation/data_exporters/build.py + downstream_blocks: [] + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: Build + retry_config: null + status: executed + timeout: null + type: data_exporter + upstream_blocks: + - prepare + uuid: build +cache_block_output_in_memory: false +callbacks: [] +concurrency_config: {} +conditionals: [] +created_at: "2024-05-05 05:35:38.032338+00:00" +data_integration: null +description: Collect data from various sources, generate additional training data + if needed, and perform feature engineering to transform the raw data into a set + of useful input features. 
+executor_config: {} +executor_count: 1 +executor_type: null +extensions: {} +name: Data preparation +notification_config: {} +remote_variables_dir: null +retry_config: {} +run_pipeline_in_one_process: false +settings: + triggers: null +spark_config: {} +tags: [] +type: python +uuid: data_preparation +variables: + split_on_feature: lpep_pickup_datetime + split_on_feature_value: "2024-02-01" + target: duration +variables_dir: /home/src/mage_data/unit_1_data_preparation +widgets: + - all_upstream_blocks_executed: true + color: null + configuration: + chart_type: time series bar chart + group_by: + - lpep_pickup_datetime + metrics: + - aggregation: count + column: lpep_pickup_datetime + time_interval: original + downstream_blocks: [] + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: ingest_time series bar chart_y0 + retry_config: null + status: executed + timeout: null + type: chart + upstream_blocks: + - ingest + uuid: ingest_time_series_bar_chart_y0 + - all_upstream_blocks_executed: true + color: null + configuration: + chart_style: horizontal + chart_type: bar chart + x: columns_with_mising_values + y: percentage_of_missing_values + y_sort_order: descending + downstream_blocks: [] + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: missing values for ingest + retry_config: null + status: executed + timeout: null + type: chart + upstream_blocks: + - ingest + uuid: missing_values_for_ingest + - all_upstream_blocks_executed: true + color: null + configuration: + chart_style: horizontal + chart_type: bar chart + x: columns + y: number_of_unique_values + y_sort_order: descending + downstream_blocks: [] + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: unique values for ingest + retry_config: null + status: executed + timeout: null + type: chart + upstream_blocks: + - ingest + uuid: unique_values_for_ingest + - 
all_upstream_blocks_executed: true + color: null + configuration: + chart_type: table + height: 3000 + index: column_index + x: columns + y: rows + downstream_blocks: [] + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: most frequent values for ingest + retry_config: null + status: executed + timeout: null + type: chart + upstream_blocks: + - ingest + uuid: most_frequent_values_for_ingest + - all_upstream_blocks_executed: true + color: null + configuration: + chart_type: table + height: 3000 + index: stats + x: headers + y: rows + downstream_blocks: [] + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: summary overview for ingest + retry_config: null + status: executed + timeout: null + type: chart + upstream_blocks: + - ingest + uuid: summary_overview_for_ingest + - all_upstream_blocks_executed: true + color: null + configuration: + chart_type: table + height: 3000 + index: stats + x: columns + y: rows + downstream_blocks: [] + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: feature profiles for ingest + retry_config: null + status: executed + timeout: null + type: chart + upstream_blocks: + - ingest + uuid: feature_profiles_for_ingest + - all_upstream_blocks_executed: true + color: null + configuration: + buckets: "20" + chart_type: histogram + group_by: [] + x: x + downstream_blocks: [] + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: prepare_histogram_k4 + retry_config: null + status: executed + timeout: null + type: chart + upstream_blocks: + - prepare + uuid: prepare_histogram_k4 diff --git a/mlops/unit_3_observability/pipelines/sklearn_training/__init__.py b/mlops/unit_3_observability/pipelines/sklearn_training/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git 
a/mlops/unit_3_observability/pipelines/sklearn_training/interactions.yaml b/mlops/unit_3_observability/pipelines/sklearn_training/interactions.yaml new file mode 100644 index 000000000..a1d40f831 --- /dev/null +++ b/mlops/unit_3_observability/pipelines/sklearn_training/interactions.yaml @@ -0,0 +1,2 @@ +blocks: {} +layout: [] diff --git a/mlops/unit_3_observability/pipelines/sklearn_training/metadata.yaml b/mlops/unit_3_observability/pipelines/sklearn_training/metadata.yaml new file mode 100755 index 000000000..620ba657c --- /dev/null +++ b/mlops/unit_3_observability/pipelines/sklearn_training/metadata.yaml @@ -0,0 +1,106 @@ +blocks: + - all_upstream_blocks_executed: true + color: null + configuration: + global_data_product: + uuid: training_set + downstream_blocks: + - hyperparameter_tuning/sklearn + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: Training set + retry_config: null + status: executed + timeout: null + type: global_data_product + upstream_blocks: [] + uuid: training_set + - all_upstream_blocks_executed: true + color: teal + configuration: + dynamic: true + file_source: + path: unit_2_training/custom/load_models.py + downstream_blocks: + - hyperparameter_tuning/sklearn + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: Load models + retry_config: null + status: executed + timeout: null + type: custom + upstream_blocks: [] + uuid: load_models + - all_upstream_blocks_executed: true + color: null + configuration: + file_source: + path: unit_2_training/transformers/hyperparameter_tuning/sklearn.py + downstream_blocks: + - training + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: Hyperparameter tuning + retry_config: null + status: executed + timeout: null + type: transformer + upstream_blocks: + - training_set + - load_models + uuid: hyperparameter_tuning/sklearn + - all_upstream_blocks_executed: 
true + color: null + configuration: + file_source: + path: unit_2_training/data_exporters/training.py + downstream_blocks: [] + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: Training + retry_config: null + status: executed + timeout: null + type: data_exporter + upstream_blocks: + - hyperparameter_tuning/sklearn + uuid: training +cache_block_output_in_memory: false +callbacks: [] +concurrency_config: + block_run_limit: 2 + on_pipeline_run_limit_reached: wait + pipeline_run_limit: 30 + pipeline_run_limit_all_triggers: 50 +conditionals: [] +created_at: "2024-05-07 13:38:01.412176+00:00" +data_integration: null +description: + Train models from the sklearn library (e.g. ExtraTreesRegressor, GradientBoostingRegressor, + Lasso, LinearRegression, LinearSVR, RandomForestRegressor). +executor_config: {} +executor_count: 1 +executor_type: null +extensions: {} +name: sklearn training +notification_config: {} +remote_variables_dir: null +retry_config: {} +run_pipeline_in_one_process: false +settings: + triggers: null +spark_config: {} +tags: [] +type: python +uuid: sklearn_training +variables_dir: /home/src/mage_data/unit_2_training +widgets: [] diff --git a/mlops/unit_3_observability/pipelines/xgboost_training/__init__.py b/mlops/unit_3_observability/pipelines/xgboost_training/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/unit_3_observability/pipelines/xgboost_training/interactions.yaml b/mlops/unit_3_observability/pipelines/xgboost_training/interactions.yaml new file mode 100644 index 000000000..a1d40f831 --- /dev/null +++ b/mlops/unit_3_observability/pipelines/xgboost_training/interactions.yaml @@ -0,0 +1,2 @@ +blocks: {} +layout: [] diff --git a/mlops/unit_3_observability/pipelines/xgboost_training/metadata.yaml b/mlops/unit_3_observability/pipelines/xgboost_training/metadata.yaml new file mode 100755 index 000000000..1a421ca5d --- /dev/null +++ 
b/mlops/unit_3_observability/pipelines/xgboost_training/metadata.yaml @@ -0,0 +1,105 @@ +blocks: +- all_upstream_blocks_executed: true + color: null + configuration: + global_data_product: + uuid: training_set + downstream_blocks: [] + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: Training set + retry_config: null + status: executed + timeout: null + type: global_data_product + upstream_blocks: [] + uuid: training_set +- all_upstream_blocks_executed: true + color: null + configuration: + file_source: + path: null + downstream_blocks: + - xgboost + - dashboard_data_source + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: xgboost Hyperparameter tuning + retry_config: null + status: executed + timeout: null + type: transformer + upstream_blocks: + - training_set + uuid: hyperparameter_tuning/xgboost +- all_upstream_blocks_executed: true + color: null + configuration: + file_source: + path: null + downstream_blocks: + - dashboard_data_source + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: XGBoost + retry_config: null + status: executed + timeout: null + type: data_exporter + upstream_blocks: + - hyperparameter_tuning/xgboost + uuid: xgboost +- all_upstream_blocks_executed: true + color: pink + configuration: + file_source: + path: null + downstream_blocks: [] + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: Dashboard data source + retry_config: null + status: executed + timeout: null + type: custom + upstream_blocks: + - hyperparameter_tuning/xgboost + - xgboost + uuid: dashboard_data_source +cache_block_output_in_memory: false +callbacks: [] +concurrency_config: {} +conditionals: [] +created_at: '2024-05-07 18:27:34.902705+00:00' +data_integration: null +description: 'XGBoost is a scalable and efficient implementation of gradient boosted + decision trees, 
@transformer
def hyperparameter_tuning(
    training_set: Dict[str, Union[Series, csr_matrix]],
    model_class_name: str,
    *args,
    **kwargs,
) -> Tuple[
    Dict[str, Union[bool, float, int, str]],
    csr_matrix,
    Series,
    Dict[str, Union[Callable[..., BaseEstimator], str]],
]:
    """Tune hyperparameters for one sklearn model class.

    Args:
        training_set: Mapping whose ``'build'`` entry unpacks into seven
            items: ``X, X_train, X_val, y, y_train, y_val`` and one trailing
            item that is ignored here.
        model_class_name: Name resolved to a model class via ``load_class``.
        *args: Accepted but unused.
        **kwargs: Runtime keyword arguments. ``max_evaluations`` (default
            ``50``) and ``random_state`` (default ``3``) are forwarded to
            ``tune_hyperparameters``.

    Returns:
        ``(hyperparameters, X, y, model_info)`` where ``model_info`` is a
        dict with keys ``'cls'`` (the loaded model class) and ``'name'``
        (``model_class_name``). The full ``X``/``y`` are returned, not the
        train/validation splits used for tuning.
    """
    X, X_train, X_val, y, y_train, y_val, _ = training_set['build']

    model_class = load_class(model_class_name)

    hyperparameters = tune_hyperparameters(
        model_class,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        max_evaluations=kwargs.get('max_evaluations', 50),
        random_state=kwargs.get('random_state', 3),
    )

    # The class and its name travel together so a consumer can both
    # instantiate the model (``model_info['cls']``) and label it.
    return hyperparameters, X, y, dict(cls=model_class, name=model_class_name)
@transformer
def hyperparameter_tuning(
    training_set: Dict[str, Union[Series, csr_matrix]],
    **kwargs,
) -> Tuple[
    Dict[str, Union[bool, float, int, str]],
    csr_matrix,
    Series,
]:
    """Search for the best XGBoost hyperparameters on the prepared data.

    Args:
        training_set: Mapping whose ``'build'`` entry unpacks into seven
            items; only the train/validation features and targets
            (positions 2, 3, 5 and 6) are used here.
        **kwargs: Runtime keyword arguments. They are forwarded unchanged to
            ``tune_hyperparameters`` and also merged into every
            ``track_experiment`` call made through the callback.

    Returns:
        ``(best_hyperparameters, X_train, y_train)``.
        NOTE(review): the training split is returned rather than the full
        ``X``/``y`` -- confirm this is what downstream blocks expect.
    """
    _, X_train, X_val, _, y_train, y_val, _ = training_set['build']

    def report_trial(**opts):
        # Block-level kwargs take precedence over per-trial options when the
        # same key appears in both.
        return track_experiment(**{**opts, **kwargs})

    best_hyperparameters = tune_hyperparameters(
        build_data(X_train, y_train),
        build_data(X_val, y_val),
        callback=report_trial,
        **kwargs,
    )

    return best_hyperparameters, X_train, y_train