Commit 5fd14ba
Merge pull request Azure#199 from rastala/master
update automl databricks
2 parents 42f2822 + 3fa4095

File tree: 2 files changed, +63 −44 lines changed

how-to-use-azureml/azure-databricks/automl/automl-databricks-local-01.ipynb

Lines changed: 22 additions & 21 deletions
@@ -123,13 +123,6 @@
  "ws.get_details()"
  ]
 },
-{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
-},
 {
  "cell_type": "markdown",
  "metadata": {},
@@ -270,15 +263,14 @@
  "#If your data is in a dataframe, please use read_pandas_dataframe to convert a dataframe to dataflow before using dprep.\n",
  "\n",
  "import azureml.dataprep as dprep\n",
- "# You can use `auto_read_file` which intelligently figures out delimiters and datatypes of a file.\n",
+ "\n",
  "# The data referenced here was pulled from `sklearn.datasets.load_digits()`.\n",
  "simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'\n",
- "X_train = dprep.auto_read_file(simple_example_data_root + 'X.csv').skip(1) # Remove the header row.\n",
  "\n",
- "# You can also use `read_csv` and `to_*` transformations to read (with overridable delimiter)\n",
- "# and convert column types manually.\n",
- "# Here we read a comma delimited file and convert all columns to integers.\n",
- "y_train = dprep.read_csv(simple_example_data_root + 'y.csv').to_long(dprep.ColumnSelector(term='.*', use_regex = True))"
+ "#Convert Pandas DataFrame to DataFlow\n",
+ "#The read_pandas_dataframe reader can take a DataFrame and use it as the data source for a Dataflow.\n",
+ "X_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'X.csv'), temp_folder='/dbfs/dataset_dataflowX_train') \n",
+ "y_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'y.csv'), temp_folder='/dbfs/dataset_dataflowy_train').to_long(dprep.ColumnSelector(term='.*', use_regex = True))\n"
  ]
 },
 {
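
For reference, the pattern the new cells follow looks roughly like this — a minimal sketch, assuming `pandas` is imported as `pd` elsewhere in the notebook and using an illustrative DBFS temp folder (the actual cells use the paths shown in the diff):

import pandas as pd
import azureml.dataprep as dprep

# Read the CSV into a pandas DataFrame first...
simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'
X_df = pd.read_csv(simple_example_data_root + 'X.csv')

# ...then hand it to dataprep: read_pandas_dataframe caches the data under
# temp_folder (a DBFS path on Databricks) and returns a Dataflow.
X_train = dprep.read_pandas_dataframe(X_df, temp_folder='/dbfs/tmp/X_train')  # temp path is illustrative

# get_profile() summarizes each column (types, min/max, missing counts),
# replacing the old skip(1).head(5) preview used before this commit.
X_train.get_profile()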
@@ -295,7 +287,16 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "X_train.skip(1).head(5)"
+ "X_train.get_profile()"
+ ]
+},
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_train.get_profile()"
  ]
 },
 {
@@ -333,7 +334,8 @@
  " debug_log = 'automl_errors.log',\n",
  " primary_metric = 'AUC_weighted',\n",
  " iteration_timeout_minutes = 10,\n",
- " iterations = 30,\n",
+ " iterations = 5,\n",
+ " preprocess = True,\n",
  " n_cross_validations = 10,\n",
  " max_concurrent_iterations = 2, #change it based on number of worker nodes\n",
  " verbosity = logging.INFO,\n",
@@ -349,8 +351,7 @@
  "source": [
  "## Train the Models\n",
  "\n",
- "Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.\n",
- "In this example, we specify `show_output = True` to print currently running iterations to the console."
+ "Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while."
  ]
 },
 {
@@ -359,7 +360,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "local_run = experiment.submit(automl_config, show_output = True) # for higher runs please use show_output=False and use the below"
+ "local_run = experiment.submit(automl_config, show_output = False) # for higher runs please use show_output=False and use the below"
  ]
 },
 {
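
With show_output = False the submit call no longer streams each iteration to the console, so progress is checked on the run object instead. A minimal sketch of that kind of follow-up, assuming the standard azureml-sdk Run API (the comment's "use the below" likely refers to cells along these lines):

# Quiet submission: nothing is streamed to the console.
local_run = experiment.submit(automl_config, show_output = False)

# Block until all iterations finish, then inspect the outcome.
local_run.wait_for_completion(show_output = False)
print(local_run.get_status())
best_run, fitted_model = local_run.get_output()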
@@ -549,11 +550,11 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
- "version": "3.7.0"
+ "version": "3.6.5"
  },
  "name": "auto-ml-classification-local-adb",
- "notebookId": 817220787969977
+ "notebookId": 587284549713154
  },
  "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
 }

how-to-use-azureml/azure-databricks/automl/automl-databricks-local-with-deployment.ipynb

Lines changed: 41 additions & 23 deletions
@@ -99,10 +99,10 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "subscription_id = \"<Your SubscriptionId>\"\n",
- "resource_group = \"<Resource group - new or existing>\"\n",
- "workspace_name = \"<workspace to be created>\"\n",
- "workspace_region = \"<azureregion>\""
+ "subscription_id = \"<Your SubscriptionId>\" #you should be owner or contributor\n",
+ "resource_group = \"<Resource group - new or existing>\" #you should be owner or contributor\n",
+ "workspace_name = \"<workspace to be created>\" #your workspace name\n",
+ "workspace_region = \"<azureregion>\" #your region"
  ]
 },
 {
@@ -134,7 +134,7 @@
  "ws = Workspace.create(name = workspace_name,\n",
  " subscription_id = subscription_id,\n",
  " resource_group = resource_group, \n",
- " location = workspace_region,\n",
+ " location = workspace_region, \n",
  " exist_ok=True)\n",
  "ws.get_details()"
@@ -160,7 +160,8 @@
  " resource_group = resource_group)\n",
  "\n",
  "# Persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n",
- "ws.write_config()"
+ "ws.write_config()\n",
+ "write_config(path=\"/databricks/driver/aml_config/\",file_name=<alias_conf.cfg>)"
 ]
 },
 {
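
As committed, the added write_config(...) line lacks the ws. receiver and leaves the file name as an unquoted placeholder, so it would not run verbatim. A working form would look something like this (path as in the diff; the file name is just the diff's placeholder, quoted):

# Persist workspace details to an explicit driver-local path on Databricks.
ws.write_config(path="/databricks/driver/aml_config/",
                file_name="alias_conf.cfg")  # placeholder name from the diff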
@@ -262,6 +263,13 @@
  "set_diagnostics_collection(send_diagnostics = True)"
  ]
 },
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Convert Pandas DataFrame to DataFlow"
+ ]
+},
 {
  "cell_type": "markdown",
  "metadata": {},
@@ -276,15 +284,16 @@
  "outputs": [],
  "source": [
  "import azureml.dataprep as dprep\n",
- "# You can use `auto_read_file` which intelligently figures out delimiters and datatypes of a file.\n",
+ "\n",
  "# The data referenced here was pulled from `sklearn.datasets.load_digits()`.\n",
  "simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'\n",
- "X_train = dprep.auto_read_file(simple_example_data_root + 'X.csv').skip(1) # Remove the header row.\n",
  "\n",
- "# You can also use `read_csv` and `to_*` transformations to read (with overridable delimiter)\n",
- "# and convert column types manually.\n",
- "# Here we read a comma delimited file and convert all columns to integers.\n",
- "y_train = dprep.read_csv(simple_example_data_root + 'y.csv').to_long(dprep.ColumnSelector(term='.*', use_regex = True))"
+ "#Convert Pandas DataFrame to DataFlow\n",
+ "#The read_pandas_dataframe reader can take a DataFrame and use it as the data source for a Dataflow.\n",
+ "X_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'X.csv'), temp_folder='/dbfs/dataset_dataflowX_train') \n",
+ "y_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'y.csv'), temp_folder='/dbfs/dataset_dataflowy_train').to_long(dprep.ColumnSelector(term='.*', use_regex = True))\n",
+ "\n",
+ "\n"
  ]
 },
 {
@@ -301,7 +310,16 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "X_train.skip(1).head(5)"
+ "X_train.get_profile()"
+ ]
+},
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_train.get_profile()"
  ]
 },
 {
@@ -339,14 +357,14 @@
  " debug_log = 'automl_errors.log',\n",
  " primary_metric = 'AUC_weighted',\n",
  " iteration_timeout_minutes = 10,\n",
- " iterations = 5,\n",
- " n_cross_validations = 2,\n",
- " max_concurrent_iterations = 4, #change it based on number of worker nodes\n",
+ " iterations = 30,\n",
+ " preprocess = True,\n",
+ " n_cross_validations = 10,\n",
+ " max_concurrent_iterations = 2, #change it based on number of worker nodes\n",
  " verbosity = logging.INFO,\n",
  " spark_context=sc, #databricks/spark related\n",
  " X = X_train, \n",
  " y = y_train,\n",
- " enable_cache=False,\n",
  " path = project_folder)"
  ]
 },
@@ -356,8 +374,7 @@
  "source": [
  "## Train the Models\n",
  "\n",
- "Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.\n",
- "In this example, we specify `show_output = True` to print currently running iterations to the console."
+ "Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while."
  ]
 },
 {
@@ -366,7 +383,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "local_run = experiment.submit(automl_config, show_output = True) # for higher runs please use show_output=False and use the below"
+ "local_run = experiment.submit(automl_config, show_output = False) # for higher runs please use show_output=False and use the below"
  ]
 },
 {
@@ -419,6 +436,7 @@
  "metricslist = {}\n",
  "for run in children:\n",
  " properties = run.get_properties()\n",
+ " #print(properties)\n",
  " metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)} \n",
  " metricslist[int(properties['iteration'])] = metrics\n",
  "\n",
@@ -694,11 +712,11 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
- "version": "3.7.0"
+ "version": "3.6.5"
  },
  "name": "auto-ml-classification-local-adb",
- "notebookId": 3888835968049288
+ "notebookId": 2733885892129020
  },
  "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
 }
