Skip to content

Commit

Permalink
3
Browse files Browse the repository at this point in the history
  • Loading branch information
tommydangerous committed May 14, 2024
1 parent d0437b7 commit 5163cec
Show file tree
Hide file tree
Showing 41 changed files with 1,091 additions and 40 deletions.
5 changes: 3 additions & 2 deletions .env.dev
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@ PYTHONPATH="${MAGE_CODE_PATH}/${PROJECT_NAME}:${PYTHONPATH}"
MAGE_PRESENTERS_DIRECTORY="$PROJECT_NAME/presenters"

# Database
POSTGRES_HOST=magic-database
POSTGRES_DB=magic
POSTGRES_PASSWORD=password
POSTGRES_USER=postgres
MAGE_DATABASE_CONNECTION_URL="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@magic-database:5432/${POSTGRES_DB}"
MAGE_DATABASE_CONNECTION_URL="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:5432/${POSTGRES_DB}"

# Experiments
EXPERIMENTS_DB=experiments
EXPERIMENTS_TRACKING_URI="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@magic-database:5432/${EXPERIMENTS_DB}"
EXPERIMENTS_TRACKING_URI="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:5432/${EXPERIMENTS_DB}"

# Alerts
SMTP_EMAIL=
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ blocks:
configuration:
chart_type: pie chart
group_by:
- model
- model
data_source:
type: chart_code
error: null
Expand All @@ -15,12 +15,12 @@ blocks:
configuration:
chart_type: time series line chart
group_by:
- start_time
- start_time
metrics:
- aggregation: average
column: mse
- aggregation: average
column: rmse
- aggregation: average
column: mse
- aggregation: average
column: rmse
time_interval: original
data_source:
type: chart_code
Expand All @@ -34,12 +34,12 @@ blocks:
configuration:
chart_type: bar chart
group_by:
- run_id
- run_id
metrics:
- aggregation: max
column: rmse
- aggregation: max
column: mse
- aggregation: max
column: rmse
- aggregation: max
column: mse
data_source:
type: chart_code
error: null
Expand All @@ -53,25 +53,26 @@ blocks:
chart_style: horizontal
chart_type: bar chart
group_by:
- start_time_hour
- start_time_hour
metrics:
- aggregation: count_distinct
column: run_id
- aggregation: count_distinct
column: run_id
y_sort_order: descending
data_source:
refresh_interval: '60000'
refresh_interval: "60000"
type: chart_code
error: null
name: XGBoost training runs hourly
name_new: XGBoost training runs hourly
skip_render: false
type: chart
uuid: xgboost_training_runs_hourly
layout:
- - block_uuid: xgboost_metrics
width: 1
- block_uuid: xgboost_metrics_by_runs
width: 1
- - block_uuid: runs_by_model
width: 1
- block_uuid: xgboost_training_runs_hourly
width: 1
- - block_uuid: xgboost_metrics
width: 1
- block_uuid: xgboost_metrics_by_runs
width: 1
- - block_uuid: xgboost_training_runs_hourly
width: 1
- block_uuid: runs_by_model
width: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
blocks:
completed_pipeline_runs_daily_sklearn_training_a3:
configuration:
chart_type: time series line chart
group_by:
- execution_date
metrics:
- aggregation: count_distinct
column: id
time_interval: day
data_source:
pipeline_uuid: sklearn_training
type: pipeline_runs
name: Completed pipeline runs daily
type: chart
uuid: completed_pipeline_runs_daily_sklearn_training_a3
failed_pipeline_runs_daily_sklearn_training_o5:
configuration:
chart_type: time series line chart
group_by:
- execution_date
metrics:
- aggregation: count_distinct
column: id
time_interval: day
data_source:
pipeline_uuid: sklearn_training
type: pipeline_runs
name: Failed pipeline runs daily
type: chart
uuid: failed_pipeline_runs_daily_sklearn_training_o5
pipeline_run_status_sklearn_training_n3:
configuration:
chart_style: horizontal
chart_type: bar chart
group_by:
- status
metrics:
- aggregation: count_distinct
column: id
y_sort_order: descending
data_source:
pipeline_uuid: sklearn_training
type: pipeline_runs
name: Pipeline run status
type: chart
uuid: pipeline_run_status_sklearn_training_n3
pipeline_runs_daily_sklearn_training_p1:
configuration:
chart_type: time series line chart
group_by:
- execution_date
metrics:
- aggregation: count_distinct
column: id
time_interval: day
data_source:
pipeline_uuid: sklearn_training
type: pipeline_runs
name: Pipeline runs daily
type: chart
uuid: pipeline_runs_daily_sklearn_training_p1
trigger_active_status_sklearn_training_r5:
configuration:
chart_type: bar chart
group_by:
- status
metrics:
- aggregation: count_distinct
column: id
y_sort_order: descending
data_source:
pipeline_uuid: sklearn_training
type: pipeline_schedules
name: Trigger active status
type: chart
uuid: trigger_active_status_sklearn_training_r5
trigger_frequency_sklearn_training_o8:
configuration:
chart_style: horizontal
chart_type: bar chart
group_by:
- schedule_interval
metrics:
- aggregation: count_distinct
column: id
y_sort_order: descending
data_source:
pipeline_uuid: sklearn_training
type: pipeline_schedules
name: Trigger frequency
type: chart
uuid: trigger_frequency_sklearn_training_o8
trigger_types_sklearn_training_s4:
configuration:
chart_type: pie chart
group_by:
- schedule_type
data_source:
pipeline_uuid: sklearn_training
type: pipeline_schedules
name: Trigger types
type: chart
uuid: trigger_types_sklearn_training_s4
layout:
- - block_uuid: trigger_active_status_sklearn_training_r5
width: 1
- block_uuid: trigger_types_sklearn_training_s4
width: 1
- block_uuid: trigger_frequency_sklearn_training_o8
width: 2
- - block_uuid: pipeline_run_status_sklearn_training_n3
width: 1
- block_uuid: pipeline_runs_daily_sklearn_training_p1
width: 2
- - block_uuid: completed_pipeline_runs_daily_sklearn_training_a3
width: 1
- block_uuid: failed_pipeline_runs_daily_sklearn_training_o5
width: 1
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ blocks:
configuration:
chart_type: custom
data_source:
block_uuid: chart_source
block_uuid: dashboard_data_source
pipeline_schedule_id: null
pipeline_uuid: xgboost_training
refresh_interval: '60000'
refresh_interval: "60000"
type: block
error: null
name: SHAP values
Expand All @@ -18,7 +18,7 @@ blocks:
configuration:
chart_type: custom
data_source:
block_uuid: chart_source
block_uuid: dashboard_data_source
pipeline_schedule_id: null
pipeline_uuid: xgboost_training
type: block
Expand All @@ -32,7 +32,7 @@ blocks:
configuration:
chart_type: custom
data_source:
block_uuid: chart_source
block_uuid: dashboard_data_source
pipeline_schedule_id: null
pipeline_uuid: xgboost_training
type: block
Expand All @@ -42,11 +42,11 @@ blocks:
type: chart
uuid: shap_values_force_plot
layout:
- - block_uuid: shap_values
height: 500
width: 1
- block_uuid: shap_values_bars
height: 500
width: 1
- - block_uuid: shap_values_force_plot
width: 1
- - block_uuid: shap_values
height: 500
width: 1
- block_uuid: shap_values_bars
height: 500
width: 1
- - block_uuid: shap_values_force_plot
width: 1
2 changes: 1 addition & 1 deletion mlops/unit_2_training/global_data_products.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ training_set:
object_type: pipeline
object_uuid: data_preparation
outdated_after:
seconds: 3600
seconds: 600
settings:
build:
partitions: 1
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

@data_source
def d(df):
    """Keep only the pipeline runs whose status is 'completed'."""
    completed_mask = df['status'] == 'completed'
    return df.loc[completed_mask]
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

@data_source
def d(df):
    """Keep only the pipeline runs whose status is 'failed'."""
    failed_mask = df['status'] == 'failed'
    return df.loc[failed_mask]
61 changes: 61 additions & 0 deletions mlops/unit_3_observability/charts/feature_profiles_for_ingest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import statistics
from mage_ai.data_cleaner.column_types.column_type_detector import infer_column_types
from mage_ai.data_preparation.models.constants import DATAFRAME_ANALYSIS_MAX_COLUMNS
from mage_ai.shared.parsers import convert_matrix_to_dataframe


# Build a feature-profile table for the ingested data: one entry in `columns`
# per profiled feature and one list in `rows` per summary statistic.
# `df_1` is injected by the Mage chart runtime — presumably the upstream
# block's output matrix; TODO confirm against the pipeline.
df_1 = convert_matrix_to_dataframe(df_1)
df_1 = df_1.iloc[:, :DATAFRAME_ANALYSIS_MAX_COLUMNS]
columns_and_types = infer_column_types(df_1).items()
stats = ['Type', 'Missing values', 'Unique values', 'Min', 'Max', 'Mean', 'Median', 'Mode']
# Fix: build `columns` inside the loop. Previously it listed every inferred
# column up front, but fully-null columns are skipped below, which left the
# header out of sync with the per-stat rows.
columns = []
rows = [[] for _ in stats]

for col, col_type in columns_and_types:
    series = df_1[col]

    min_value = None
    max_value = None
    mean = None
    median = None

    not_null = series[series.notnull()]

    # A column with no non-null values is omitted entirely (no header entry,
    # no row entries) rather than rendered as a column of blanks.
    if len(not_null) == 0:
        continue

    columns.append(col)

    if col_type.value in ['number', 'number_with_decimals']:
        # Inferred-numeric columns stored as object dtype are cast so the
        # aggregations below operate on numbers, not strings.
        if str(series.dtype) == 'object':
            if col_type.value == 'number_with_decimals':
                series = series.astype('float64')
                not_null = not_null.astype('float64')
            else:
                series = series.astype('int64')
                not_null = not_null.astype('int64')

        count = len(not_null.index)
        if count >= 1:
            mean = round(not_null.sum() / count, 2)
            # Fix: use a true median. The previous index pick,
            # sorted(not_null)[int(count / 2)], returned the upper-middle
            # element for even-length columns instead of the midpoint average.
            median = statistics.median(not_null)
        min_value = round(series.min(), 2)
        max_value = round(series.max(), 2)
    else:
        # Non-numeric columns: lexicographic min/max of the string form.
        min_value = not_null.astype(str).min()
        max_value = not_null.astype(str).max()

    # Mode = most frequent non-null value; ties are broken by the larger
    # (count, value) pair, i.e. the greater value wins among equally
    # frequent candidates.
    _, mode = sorted(
        [(v, k) for k, v in not_null.value_counts().items()],
        reverse=True,
    )[0]

    # Append this column's value for each statistic, in `stats` order.
    for idx, value in enumerate([
        col_type.value,
        len(series[series.isna()].index),
        len(series.unique()),
        min_value,
        max_value,
        mean,
        median,
        mode,
    ]):
        rows[idx].append(value)
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Report, for every column that has at least one missing (NaN) value, the
# column name and the percentage of its rows that are missing.
# NOTE(review): the 'mising' typo is preserved deliberately — these module
# level variable names are likely referenced by the chart configuration, so
# renaming them could break the dashboard; confirm before fixing the spelling.
number_of_rows = len(df_1.index)
columns_with_mising_values = []
percentage_of_missing_values = []
missing_per_column = df_1.isna().sum()
for column_name, missing_count in missing_per_column.items():
    if missing_count > 0:
        columns_with_mising_values.append(column_name)
        percentage_of_missing_values.append(100 * missing_count / number_of_rows)
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from mage_ai.data_preparation.models.constants import DATAFRAME_ANALYSIS_MAX_COLUMNS
from mage_ai.shared.parsers import convert_matrix_to_dataframe


# For each column, report its most frequent (mode) value, how often it
# occurs, and that count as a percentage of the column's non-null values.
# `df_1` is injected by the Mage chart runtime.
df_1 = convert_matrix_to_dataframe(df_1)
columns = ['mode value', 'frequency', '% of values']
column_index = []
rows = []
for col in df_1.columns[:DATAFRAME_ANALYSIS_MAX_COLUMNS]:
    value_counts = df_1[col].value_counts()
    if len(value_counts.index) == 0:
        # Entirely-null column: nothing to report.
        continue
    column_value = value_counts.index[0]  # most frequent value
    value = value_counts[column_value]    # its occurrence count
    number_of_rows = df_1[col].count()    # non-null count, the % denominator
    column_index.append(col)
    # Fix: row element order now matches `columns`. Previously the percentage
    # string was appended under 'frequency' and the raw count under
    # '% of values', so the two table cells were swapped.
    rows.append([
        column_value,
        value,
        f'{round(100 * value / number_of_rows, 2)}%',
    ])
Loading

0 comments on commit 5163cec

Please sign in to comment.