
Commit 296ae01

update samples - test
1 parent 8f4efe1 commit 296ae01

29 files changed (+1124, -161 lines)

configuration.ipynb (+1, -1)

@@ -103,7 +103,7 @@
 "source": [
 "import azureml.core\n",
 "\n",
-"print(\"This notebook was created using version 1.0.79 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.0.81 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
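The bump above records the SDK release these samples were validated against. As a hedged sketch (the `azureml-sdk` package name is standard, but the check itself is illustrative and not part of the commit), upgrading and verifying could look like:

    # Upgrade first, in a shell or notebook cell:
    #     pip install --upgrade azureml-sdk==1.0.81
    import azureml.core

    EXPECTED = "1.0.81"  # the version this commit pins in the notebook
    if azureml.core.VERSION != EXPECTED:
        print("Installed", azureml.core.VERSION, "but samples were tested with", EXPECTED)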

how-to-use-azureml/automated-machine-learning/automl_env.yml (+1, -1)

@@ -30,7 +30,7 @@ dependencies:
 - pytorch-transformers==1.0.0
 - spacy==2.1.8
 - joblib
-- onnxruntime==0.4.0
+- onnxruntime==1.0.0
 - https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz

 channels:

how-to-use-azureml/automated-machine-learning/automl_env_mac.yml (+1, -1)

@@ -31,7 +31,7 @@ dependencies:
 - pytorch-transformers==1.0.0
 - spacy==2.1.8
 - joblib
-- onnxruntime==0.4.0
+- onnxruntime==1.0.0
 - https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz

 channels:

how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb (+5, -5)

@@ -288,7 +288,7 @@
 "|**blacklist_models** | *List* of *strings* indicating machine learning algorithms for AutoML to avoid in this run. <br><br> Allowed values for **Classification**<br><i>LogisticRegression</i><br><i>SGD</i><br><i>MultinomialNaiveBayes</i><br><i>BernoulliNaiveBayes</i><br><i>SVM</i><br><i>LinearSVM</i><br><i>KNN</i><br><i>DecisionTree</i><br><i>RandomForest</i><br><i>ExtremeRandomTrees</i><br><i>LightGBM</i><br><i>GradientBoosting</i><br><i>TensorFlowDNN</i><br><i>TensorFlowLinearClassifier</i><br><br>Allowed values for **Regression**<br><i>ElasticNet</i><br><i>GradientBoosting</i><br><i>DecisionTree</i><br><i>KNN</i><br><i>LassoLars</i><br><i>SGD</i><br><i>RandomForest</i><br><i>ExtremeRandomTrees</i><br><i>LightGBM</i><br><i>TensorFlowLinearRegressor</i><br><i>TensorFlowDNN</i><br><br>Allowed values for **Forecasting**<br><i>ElasticNet</i><br><i>GradientBoosting</i><br><i>DecisionTree</i><br><i>KNN</i><br><i>LassoLars</i><br><i>SGD</i><br><i>RandomForest</i><br><i>ExtremeRandomTrees</i><br><i>LightGBM</i><br><i>TensorFlowLinearRegressor</i><br><i>TensorFlowDNN</i><br><i>Arima</i><br><i>Prophet</i>|\n",
 "| **whitelist_models** | *List* of *strings* indicating machine learning algorithms for AutoML to use in this run. Same values listed above for **blacklist_models** allowed for **whitelist_models**.|\n",
 "|**experiment_exit_score**| Value indicating the target for *primary_metric*. <br>Once the target is surpassed the run terminates.|\n",
-"|**experiment_timeout_minutes**| Maximum amount of time in minutes that all iterations combined can take before the experiment terminates.|\n",
+"|**experiment_timeout_hours**| Maximum amount of time in hours that all iterations combined can take before the experiment terminates.|\n",
 "|**enable_early_stopping**| Flag to enble early termination if the score is not improving in the short term.|\n",
 "|**featurization**| 'auto' / 'off' Indicator for whether featurization step should be done automatically or not. Note: If the input data is sparse, featurization cannot be turned on.|\n",
 "|**n_cross_validations**|Number of cross validation splits.|\n",

@@ -306,7 +306,7 @@
 "outputs": [],
 "source": [
 "automl_settings = {\n",
-"    \"experiment_timeout_minutes\" : 20,\n",
+"    \"experiment_timeout_hours\" : 0.3,\n",
 "    \"enable_early_stopping\" : True,\n",
 "    \"iteration_timeout_minutes\": 5,\n",
 "    \"max_concurrent_iterations\": 4,\n",

@@ -694,10 +694,10 @@
 "from azureml.core.webservice import AciWebservice\n",
 "from azureml.core.webservice import Webservice\n",
 "from azureml.core.model import Model\n",
+"from azureml.core.environment import Environment\n",
 "\n",
-"inference_config = InferenceConfig(runtime = \"python\", \n",
-"                                   entry_script = script_file_name,\n",
-"                                   conda_file = conda_env_file_name)\n",
+"myenv = Environment.from_conda_specification(name=\"myenv\", file_path=conda_env_file_name)\n",
+"inference_config = InferenceConfig(entry_script=script_file_name, environment=myenv)\n",
 "\n",
 "aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, \n",
 "                                               memory_gb = 1, \n",

how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.yml (+1, -1)

@@ -8,6 +8,6 @@ dependencies:
 - azureml-widgets
 - matplotlib
 - pandas_ml
-- onnxruntime==0.4.0
+- onnxruntime==1.0.0
 - azureml-explain-model
 - azureml-contrib-interpret

how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb (+3, -13)

@@ -213,7 +213,7 @@
 "    \"preprocess\": True,\n",
 "    \"enable_early_stopping\": True,\n",
 "    \"max_concurrent_iterations\": 2, # This is a limit for testing purpose, please increase it as per cluster size\n",
-"    \"experiment_timeout_minutes\": 10, # This is a time limit for testing purposes, remove it for real use cases, this will drastically limit ablity to find the best model possible\n",
+"    \"experiment_timeout_hours\": 0.2, # This is a time limit for testing purposes, remove it for real use cases, this will drastically limit ablity to find the best model possible\n",
 "    \"verbosity\": logging.INFO,\n",
 "}\n",
 "\n",

@@ -305,7 +305,7 @@
 "source": [
 "#### Explain model\n",
 "\n",
-"Automated ML models can be explained and visualized using the SDK Explainability library. [Learn how to use the explainer](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/model-explanation-remote-amlcompute/auto-ml-model-explanations-remote-compute.ipynb)."
+"Automated ML models can be explained and visualized using the SDK Explainability library. "
 ]
 },
 {

@@ -334,17 +334,7 @@
 "metadata": {},
 "source": [
 "#### Print the properties of the model\n",
-"The fitted_model is a python object and you can read the different properties of the object.\n",
-"See *Print the properties of the model* section in [this sample notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/classification/auto-ml-classification.ipynb)."
-]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"### Deploy\n",
-"\n",
-"To deploy the model into a web service endpoint, see _Deploy_ section in [this sample notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/classification-with-deployment/auto-ml-classification-with-deployment.ipynb)"
+"The fitted_model is a python object and you can read the different properties of the object.\n"
 ]
 },
 {
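This notebook makes the same settings rename as the bank-marketing one: `experiment_timeout_minutes` becomes `experiment_timeout_hours`, which takes a float. A small sketch of the equivalence (the surrounding keys are abbreviated from the diff):

    # "experiment_timeout_hours" replaces "experiment_timeout_minutes";
    # 0.2 hours is 12 minutes, close to the old 10-minute test limit.
    automl_settings = {
        "experiment_timeout_hours": 0.2,  # was: "experiment_timeout_minutes": 10
        "enable_early_stopping": True,
        "max_concurrent_iterations": 2,
    }
    print(automl_settings["experiment_timeout_hours"] * 60, "minutes")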

how-to-use-azureml/automated-machine-learning/continuous-retraining/auto-ml-continuous-retraining.ipynb (+40, -22)

@@ -210,7 +210,24 @@
 "metadata": {},
 "source": [
 "## Data Ingestion Pipeline \n",
-"For this demo, we will use NOAA weather data from [Azure Open Datasets](https://azure.microsoft.com/services/open-datasets/). You can replace this with your own dataset, or you can skip this pipeline if you already have a time-series based `TabularDataset`.\n",
+"For this demo, we will use NOAA weather data from [Azure Open Datasets](https://azure.microsoft.com/services/open-datasets/). You can replace this with your own dataset, or you can skip this pipeline if you already have a time-series based `TabularDataset`.\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# The name and target column of the Dataset to create \n",
+"dataset = \"NOAA-Weather-DS4\"\n",
+"target_column_name = \"temperature\""
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
 "\n",
 "### Upload Data Step\n",
 "The data ingestion pipeline has a single step with a script to query the latest weather data and upload it to the blob store. During the first run, the script will create and register a time-series based `TabularDataset` with the past one week of weather data. For each subsequent run, the script will create a partition in the blob store by querying NOAA for new weather data since the last modified time of the dataset (`dataset.data_changed_time`) and creating a data.csv file."

@@ -225,8 +242,6 @@
 "from azureml.pipeline.core import Pipeline, PipelineParameter\n",
 "from azureml.pipeline.steps import PythonScriptStep\n",
 "\n",
-"# The name of the Dataset to create \n",
-"dataset = \"NOAA-Weather-DS4\"\n",
 "ds_name = PipelineParameter(name=\"ds_name\", default_value=dataset)\n",
 "upload_data_step = PythonScriptStep(script_name=\"upload_weather_data.py\", \n",
 "                                    allow_reuse=False,\n",

@@ -272,7 +287,7 @@
 "## Training Pipeline\n",
 "### Prepare Training Data Step\n",
 "\n",
-"Script to bring data into common X,y format. We need to set allow_reuse flag to False to allow the pipeline to run even when inputs don't change. We also need the name of the model to check the time the model was last trained."
+"Script to check if new data is available since the model was last trained. If no new data is available, we cancel the remaining pipeline steps. We need to set allow_reuse flag to False to allow the pipeline to run even when inputs don't change. We also need the name of the model to check the time the model was last trained."
 ]
 },
 {

@@ -283,11 +298,8 @@
 "source": [
 "from azureml.pipeline.core import PipelineData\n",
 "\n",
-"target_column = PipelineParameter(\"target_column\", default_value=\"y\")\n",
 "# The model name with which to register the trained model in the workspace.\n",
-"model_name = PipelineParameter(\"model_name\", default_value=\"y\")\n",
-"output_x = PipelineData(\"output_x\", datastore=dstor)\n",
-"output_y = PipelineData(\"output_y\", datastore=dstor)"
+"model_name = PipelineParameter(\"model_name\", default_value=\"noaaweatherds\")"
 ]
 },
 {

@@ -299,16 +311,23 @@
 "data_prep_step = PythonScriptStep(script_name=\"check_data.py\", \n",
 "                                  allow_reuse=False,\n",
 "                                  name=\"check_data\",\n",
-"                                  arguments=[\"--target_column\", target_column,\n",
-"                                             \"--output_x\", output_x,\n",
-"                                             \"--output_y\", output_y,\n",
-"                                             \"--ds_name\", ds_name,\n",
-"                                             \"--model_name\", model_name],\n",
-"                                  outputs=[output_x, output_y], \n",
+"                                  arguments=[\"--ds_name\", ds_name,\n",
+"                                             \"--model_name\", model_name],\n",
 "                                  compute_target=compute_target, \n",
 "                                  runconfig=conda_run_config)"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"from azureml.core import Dataset\n",
+"train_ds = Dataset.get_by_name(ws, dataset)\n",
+"train_ds = train_ds.drop_columns([\"partition_date\"])"
+]
+},
 {
 "cell_type": "markdown",
 "metadata": {},

@@ -324,11 +343,11 @@
 "outputs": [],
 "source": [
 "from azureml.train.automl import AutoMLConfig\n",
-"from azureml.train.automl.runtime import AutoMLStep\n",
+"from azureml.train.automl import AutoMLStep\n",
 "\n",
 "automl_settings = {\n",
-"    \"iteration_timeout_minutes\": 20,\n",
-"    \"experiment_timeout_minutes\": 30,\n",
+"    \"iteration_timeout_minutes\": 10,\n",
+"    \"experiment_timeout_minutes\": 10,\n",
 "    \"n_cross_validations\": 3,\n",
 "    \"primary_metric\": 'r2_score',\n",
 "    \"preprocess\": True,\n",

@@ -342,8 +361,8 @@
 "                             debug_log = 'automl_errors.log',\n",
 "                             path = \".\",\n",
 "                             compute_target=compute_target,\n",
-"                             run_configuration=conda_run_config,\n",
-"                             data_script = \"get_data.py\",\n",
+"                             training_data = train_ds,\n",
+"                             label_column_name = target_column_name,\n",
 "                             **automl_settings\n",
 "                             )"

@@ -378,7 +397,6 @@
 "automl_step = AutoMLStep(\n",
 "    name='automl_module',\n",
 "    automl_config=automl_config,\n",
-"    inputs=[output_x, output_y],\n",
 "    outputs=[metirics_data, model_data],\n",
 "    allow_reuse=False)"

@@ -432,7 +450,7 @@
 "outputs": [],
 "source": [
 "training_pipeline_run = experiment.submit(training_pipeline, pipeline_parameters={\n",
-"    \"target_column\": \"temperature\", \"ds_name\": dataset, \"model_name\": \"noaaweatherds\"})"
+"    \"ds_name\": dataset, \"model_name\": \"noaaweatherds\"})"
 ]
 },
 {

@@ -475,7 +493,7 @@
 "source": [
 "from azureml.pipeline.core import Schedule\n",
 "schedule = Schedule.create(workspace=ws, name=\"RetrainingSchedule\",\n",
-"                           pipeline_parameters={\"target_column\": \"temperature\",\"ds_name\": dataset, \"model_name\": \"noaaweatherds\"},\n",
+"                           pipeline_parameters={\"ds_name\": dataset, \"model_name\": \"noaaweatherds\"},\n",
 "                           pipeline_id=published_pipeline.id, \n",
 "                           experiment_name=experiment_name, \n",
 "                           datastore=dstor,\n",

how-to-use-azureml/automated-machine-learning/continuous-retraining/check_data.py (+6, -35)

@@ -15,32 +15,16 @@
 else:
     ws = run.experiment.workspace

-
-def write_output(df, path):
-    os.makedirs(path, exist_ok=True)
-    print("%s created" % path)
-    df.to_csv(path + "/part-00000", index=False)
-
-
-print("Check for new data and prepare the data")
+print("Check for new data.")

 parser = argparse.ArgumentParser("split")
-parser.add_argument("--target_column", type=str, help="input split features")
 parser.add_argument("--ds_name", help="input dataset name")
 parser.add_argument("--model_name", help="name of the deployed model")
-parser.add_argument("--output_x", type=str,
-                    help="output features")
-parser.add_argument("--output_y", type=str,
-                    help="output labels")
-
 args = parser.parse_args()

 print("Argument 1(ds_name): %s" % args.ds_name)
-print("Argument 2(target_column): %s" % args.target_column)
-print("Argument 3(model_name): %s" % args.model_name)
-print("Argument 4(output_x): %s" % args.output_x)
-print("Argument 5(output_y): %s" % args.output_y)
+print("Argument 2(model_name): %s" % args.model_name)

 # Get the latest registered model
 try:

@@ -54,22 +38,9 @@ def write_output(df, path):
 train_ds = Dataset.get_by_name(ws, args.ds_name)
 dataset_changed_time = train_ds.data_changed_time

-if dataset_changed_time > last_train_time:
-    # New data is available since the model was last trained
-    print("Dataset was last updated on {0}. Retraining...".format(dataset_changed_time))
-    train_ds = train_ds.drop_columns(["partition_date"])
-    X_train = train_ds.drop_columns(
-        columns=[args.target_column]).to_pandas_dataframe()
-    y_train = train_ds.keep_columns(
-        columns=[args.target_column]).to_pandas_dataframe()
-
-    non_null = y_train[args.target_column].notnull()
-    y = y_train[non_null]
-    X = X_train[non_null]
-
-    if not (args.output_x is None and args.output_y is None):
-        write_output(X, args.output_x)
-        write_output(y, args.output_y)
-else:
+if not dataset_changed_time > last_train_time:
     print("Cancelling run since there is no new data.")
     run.parent.cancel()
+else:
+    # New data is available since the model was last trained
+    print("Dataset was last updated on {0}. Retraining...".format(dataset_changed_time))

how-to-use-azureml/automated-machine-learning/continuous-retraining/get_data.py (-15)

This file was deleted.

how-to-use-azureml/automated-machine-learning/continuous-retraining/upload_weather_data.py (+6, -6)

@@ -58,7 +58,7 @@ def get_noaa_data(start_time, end_time):
     print(traceback.format_exc())
     print("Dataset with name {0} not found, registering new dataset.".format(args.ds_name))
     register_dataset = True
-    end_time_last_slice = datetime.today() - relativedelta(weeks=1)
+    end_time_last_slice = datetime.today() - relativedelta(weeks=2)

 end_time = datetime.utcnow()
 train_df = get_noaa_data(end_time_last_slice, end_time)

@@ -80,10 +80,10 @@ def get_noaa_data(start_time, end_time):
                 target_path=folder_name,
                 overwrite=True,
                 show_progress=True)
-
-    if register_dataset:
-        ds = Dataset.Tabular.from_delimited_files(dstor.path("{}/**/*.csv".format(
-            args.ds_name)), partition_format='/{partition_date:yyyy/MM/dd/hh/mm/ss}/data.csv')
-        ds.register(ws, name=args.ds_name)
 else:
     print("No new data since {0}.".format(end_time_last_slice))
+
+if register_dataset:
+    ds = Dataset.Tabular.from_delimited_files(dstor.path("{}/**/*.csv".format(
+        args.ds_name)), partition_format='/{partition_date:yyyy/MM/dd/HH/mm/ss}/data.csv')
+    ds.register(ws, name=args.ds_name)
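Two things appear to change here: the registration block moves out of the upload branch so registration can still happen on a run that uploads nothing new, and the hour token in `partition_format` switches from `hh` to `HH`. Assuming the usual datetime-token convention (`HH` is the 24-hour clock, `hh` the 12-hour one), afternoon partitions would otherwise collide with morning ones; a small illustrative sketch using the analogous `strftime` codes:

    # %H is 24-hour (like HH); %I is 12-hour (like hh). Take 3:30 PM:
    from datetime import datetime

    ts = datetime(2020, 1, 8, 15, 30, 0)
    print(ts.strftime("%Y/%m/%d/%H/%M/%S"))  # 2020/01/08/15/30/00 -- unambiguous
    print(ts.strftime("%Y/%m/%d/%I/%M/%S"))  # 2020/01/08/03/30/00 -- collides with 3:30 AM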

how-to-use-azureml/automated-machine-learning/forecasting-grouping/build.py (+2, -2)

@@ -30,11 +30,11 @@ def _get_configs(automlconfig: AutoMLConfig,
     groups = _get_groups(data, group_column_names)
     configs = {}
     for i, group in groups.iterrows():
-        single = data
+        single = data._dataflow
         group_name = "#####".join(str(x) for x in group.values)
         group_name = valid_chars.sub('', group_name)
         for key in group.index:
-            single = single._dataflow.filter(data._dataflow[key] == group[key])
+            single = single.filter(data._dataflow[key] == group[key])
         t_dataset = TabularDataset._create(single)
         group_conf = copy.deepcopy(automlconfig)
         group_conf.user_settings['training_data'] = t_dataset
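The swap matters once a group spans more than one column: previously `single` became a Dataflow after the first `filter`, so the next pass through the inner loop would evaluate `single._dataflow` on an object that no longer has that attribute. Starting from `data._dataflow` and filtering the Dataflow directly composes cleanly for any number of group columns.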

how-to-use-azureml/automated-machine-learning/regression-hardware-performance-explanation-and-featurization/auto-ml-regression-hardware-performance-explanation-and-featurization.ipynb (+1, -12)

@@ -558,7 +558,6 @@
 "\n",
 "# specify CondaDependencies obj\n",
 "conda_run_config.environment.python.conda_dependencies = CondaDependencies.create(\n",
-"    conda_packages=['scikit-learn', 'numpy','py-xgboost<=0.80'],\n",
 "    pip_packages=azureml_pip_packages)"
 ]
 },

@@ -718,17 +717,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from azureml.core.conda_dependencies import CondaDependencies \n",
-"\n",
-"azureml_pip_packages = [\n",
-"    'azureml-explain-model', 'azureml-train-automl', 'azureml-defaults'\n",
-"]\n",
-" \n",
-"\n",
-"# specify CondaDependencies obj\n",
-"myenv = CondaDependencies.create(conda_packages=['scikit-learn', 'pandas', 'numpy', 'py-xgboost<=0.80'],\n",
-"                                 pip_packages=azureml_pip_packages,\n",
-"                                 pin_sdk_version=True)\n",
+"myenv = automl_run.get_environment().python.conda_dependencies\n",
 "\n",
 "with open(\"myenv.yml\",\"w\") as f:\n",
 "    f.write(myenv.serialize_to_string())\n",
