# MAGIC %restart_python

# COMMAND ----------
+ import hashlib
import time

import mlflow
import pandas as pd
+ import requests
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import EndpointCoreConfigInput, ServedEntityInput
from lightgbm import LGBMRegressor
from mlflow import MlflowClient
from mlflow.models import infer_signature
+ from pyspark.dbutils import DBUtils
from pyspark.sql import SparkSession
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
- from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
- import hashlib
- import requests

from wine_quality.config import ProjectConfig
@@ -58,6 +58,10 @@
    "max_depth": ab_test_params["max_depth_b"],
}

+ # COMMAND ----------
+ spark = SparkSession.builder.getOrCreate()
+ dbutils = DBUtils(spark)
+
# COMMAND ----------

# MAGIC %md
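
A note on the two lines added above: SparkSession.builder.getOrCreate() attaches to the cluster's already-running Spark session instead of starting a new one, and DBUtils(spark) rebuilds the dbutils handle that interactive notebooks get injected automatically. That handle is what lets the API-token lookup near the end of this diff work when the file runs as a plain Python script. A minimal usage sketch (the token line mirrors the call further down):

spark = SparkSession.builder.getOrCreate()  # reuse the active session on the cluster
dbutils = DBUtils(spark)                    # programmatic stand-in for the notebook-injected dbutils
token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()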
@@ -108,7 +112,7 @@

# Train the model
pipeline.fit(X_train, y_train)
- y_pred = pipeline.predict(X_test)
+ y_pred = pipeline.predict(X_test)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
@@ -124,9 +128,7 @@
signature = infer_signature(model_input=X_train, model_output=y_pred)

# Log the input dataset for tracking reproducibility
- dataset = mlflow.data.from_spark(train_set_spark,
-                                  table_name=f"{catalog_name}.{schema_name}.train_set",
-                                  version="0")
+ dataset = mlflow.data.from_spark(train_set_spark, table_name=f"{catalog_name}.{schema_name}.train_set", version="0")
mlflow.log_input(dataset, context="training")

# Log the pipeline model in MLflow with a unique artifact path
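
The logged dataset becomes part of the run's recorded inputs, so the exact training table and version can be recovered later. A short sketch of reading it back, assuming a run id held in run_id:

run = mlflow.get_run(run_id)
for d in run.inputs.dataset_inputs:
    print(d.dataset.name, d.dataset.digest)  # logged table name and content digest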
@@ -178,8 +180,7 @@
mlflow.log_metric("r2_score", r2)
signature = infer_signature(model_input=X_train, model_output=y_pred)

- dataset = mlflow.data.from_spark(train_set_spark,
-                                  table_name=f"{catalog_name}.{schema_name}.train_set", version="0")
+ dataset = mlflow.data.from_spark(train_set_spark, table_name=f"{catalog_name}.{schema_name}.train_set", version="0")
mlflow.log_input(dataset, context="training")
mlflow.sklearn.log_model(sk_model=pipeline, artifact_path="lightgbm-pipeline-model", signature=signature)
@@ -233,16 +234,14 @@ def predict(self, context, model_input):

# COMMAND ----------
X_train = train_set[num_features + ["id"]]
- X_test = test_set[num_features + ["id"]]
+ X_test = test_set[num_features + ["id"]]


# COMMAND ----------
models = [model_A, model_B]
wrapped_model = WineQualityModelWrapper(models)  # pass the loaded models to the wrapper
example_input = X_test.iloc[0:1]  # select the first row as an example input for prediction
- example_prediction = wrapped_model.predict(
-     context=None,
-     model_input=example_input)
+ example_prediction = wrapped_model.predict(context=None, model_input=example_input)
print("Example Prediction:", example_prediction)

# COMMAND ----------
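
For context, a sketch of what a wrapper like WineQualityModelWrapper typically contains for deterministic A/B routing. Only the class name, the two-model list, the id column, and the predict signature come from this diff; the md5-bucketing body is an assumption suggested by the hashlib import:

class WineQualityModelWrapper(mlflow.pyfunc.PythonModel):
    def __init__(self, models):
        self.models = models  # [model_A, model_B]

    def predict(self, context, model_input):
        # Hash the row id so the same record always hits the same variant (assumed routing rule)
        record_id = str(model_input["id"].values[0])
        bucket = int(hashlib.md5(record_id.encode()).hexdigest(), 16) % 2
        prediction = self.models[bucket].predict(model_input.drop(columns=["id"]))
        return {"Prediction": prediction[0], "model": "Model A" if bucket == 0 else "Model B"}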
@@ -251,22 +250,16 @@ def predict(self, context, model_input):

with mlflow.start_run() as run:
    run_id = run.info.run_id
-     signature = infer_signature(model_input=X_train,
-                                 model_output={"Prediction": 1234.5,
-                                               "model": "Model B"})
-     dataset = mlflow.data.from_spark(train_set_spark,
-                                      table_name=f"{catalog_name}.{schema_name}.train_set",
-                                      version="0")
+     signature = infer_signature(model_input=X_train, model_output={"Prediction": 1234.5, "model": "Model B"})
+     dataset = mlflow.data.from_spark(train_set_spark, table_name=f"{catalog_name}.{schema_name}.train_set", version="0")
    mlflow.log_input(dataset, context="training")
    mlflow.pyfunc.log_model(
-         python_model=wrapped_model,# passing wrapped model here instead sklearn model
+         python_model=wrapped_model,  # passing the wrapped model here instead of the sklearn model
        artifact_path="pyfunc-wine-quality-model-ab",
-         signature=signature
+         signature=signature,
    )
    model_version = mlflow.register_model(
-         model_uri=f"runs:/{run_id}/pyfunc-wine-quality-model-ab",
-         name=model_name,
-         tags={"git_sha": f"{git_sha}"}
+         model_uri=f"runs:/{run_id}/pyfunc-wine-quality-model-ab", name=model_name, tags={"git_sha": f"{git_sha}"}
    )

# COMMAND ----------
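
Once registered, a given version can be loaded back as a generic pyfunc model; the models:/ URI scheme is standard MLflow, and model_name and model_version come from the registration above:

model = mlflow.pyfunc.load_model(f"models:/{model_name}/{model_version.version}")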
@@ -276,7 +269,7 @@ def predict(self, context, model_input):
predictions = model.predict(X_test.iloc[0:1])

# Display predictions
- predictions
+ # predictions

# COMMAND ----------
@@ -313,7 +306,6 @@ def predict(self, context, model_input):
# MAGIC ### Call the endpoint

# COMMAND ----------
-
token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
host = spark.conf.get("spark.databricks.workspaceUrl")
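
The token and host recovered here are all that is needed to authenticate REST calls against the workspace; spark.databricks.workspaceUrl returns the bare hostname, which is why the endpoint URL below prepends https://. A minimal sketch of the header the invocation needs:

headers = {"Authorization": f"Bearer {token}"}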
@@ -342,9 +334,7 @@ def predict(self, context, model_input):

start_time = time.time()

- model_serving_endpoint = (
-     f"https://{host}/serving-endpoints/wine-quality-model-serving-ab-test/invocations"
- )
+ model_serving_endpoint = f"https://{host}/serving-endpoints/wine-quality-model-serving-ab-test/invocations"

response = requests.post(
    f"{model_serving_endpoint}",