Skip to content

Commit 54e05b4

Browse files
[Test/Azure] Fix the torch version in examples for smoke test and change the credential for Azure (skypilot-org#1330)
* Upgrade images for three clouds * Fix cuda version * pin cuda version for torch * Fix torch version * fix comments * Fix azure provider * fix credential * revert back to previous azure image * switch back to cuda 11.3 for pytorch due to azure's image * fix torch installation * increase the multi-node timeout * Update sky/clouds/azure.py Co-authored-by: Zongheng Yang <[email protected]> * revert aws image version * pin cu113 for huggingface * Add comment * format * Update sky/clouds/aws.py Co-authored-by: Zongheng Yang <[email protected]> * Update sky/clouds/gcp.py Co-authored-by: Zongheng Yang <[email protected]> * revert gcp image * Fix doc Co-authored-by: Zongheng Yang <[email protected]>
1 parent 1c88b13 commit 54e05b4

File tree

9 files changed

+42
-15
lines changed

9 files changed

+42
-15
lines changed

docs/source/examples/distributed-jobs.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,9 @@ For example, here is a simple PyTorch Distributed training example:
2020
setup: |
2121
pip3 install --upgrade pip
2222
git clone https://github.com/michaelzhiluo/pytorch-distributed-resnet
23+
cd pytorch-distributed-resnet
2324
# SkyPilot's default image on AWS/GCP has CUDA 11.6 (Azure 11.5).
24-
cd pytorch-distributed-resnet && pip3 install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
25+
pip3 install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
2526
mkdir -p data && mkdir -p saved_models && cd data && \
2627
wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
2728
tar -xvzf cifar-10-python.tar.gz

examples/huggingface_glue_imdb_app.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ setup: |
1313
git checkout v4.21.0
1414
pip3 install .
1515
cd examples/pytorch/text-classification
16-
pip3 install -r requirements.txt
16+
# SkyPilot's default image on AWS/GCP has CUDA 11.6 (Azure 11.5).
17+
pip3 install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
1718
1819
# The command to run. Will be run under the working directory.
1920
run: |

examples/resnet_distributed_torch.yaml

+3-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ num_nodes: 2
99
setup: |
1010
pip3 install --upgrade pip
1111
git clone https://github.com/michaelzhiluo/pytorch-distributed-resnet
12-
cd pytorch-distributed-resnet && pip3 install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
12+
cd pytorch-distributed-resnet
13+
# SkyPilot's default image on AWS/GCP has CUDA 11.6 (Azure 11.5).
14+
pip3 install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
1315
mkdir -p data && mkdir -p saved_models && cd data && \
1416
wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
1517
tar -xvzf cifar-10-python.tar.gz

examples/resnet_distributed_torch_app.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
setup = 'echo \"alias python=python3\" >> ~/.bashrc && pip3 install --upgrade pip && \
1010
[ -d pytorch-distributed-resnet ] || \
1111
(git clone https://github.com/michaelzhiluo/pytorch-distributed-resnet && \
12-
cd pytorch-distributed-resnet && pip3 install -r requirements.txt && \
12+
cd pytorch-distributed-resnet && pip3 install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 && \
1313
mkdir -p data && mkdir -p saved_models && cd data && \
1414
wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz && \
1515
tar -xvzf cifar-10-python.tar.gz)'

examples/resnet_distributed_torch_scripts/setup.sh

+3-2
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,11 @@ if [ $? -eq 0 ]; then
77
echo "conda env exists"
88
else
99
echo "conda env does not exist"
10-
conda create -n resnet python=3.6 -y
10+
conda create -n resnet python=3.7 -y
1111
conda activate resnet
1212
fi
13-
pip install -r requirements.txt
13+
# SkyPilot's default image on AWS/GCP has CUDA 11.6 (Azure 11.5).
14+
pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
1415

1516
mkdir -p data
1617
mkdir -p saved_models

sky/clouds/aws.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,20 @@ def get_default_ami(cls, region_name: str, instance_type: str) -> str:
110110
}
111111
assert region_name in amis, region_name
112112
return amis[region_name]
113-
# Deep Learning AMI GPU PyTorch 1.10.0 (Ubuntu 20.04) 20220308
114113
# https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#Images:visibility=public-images;v=3;search=:64,:Ubuntu%2020,:Deep%20Learning%20AMI%20GPU%20PyTorch # pylint: disable=line-too-long
115-
# Nvidia driver: 510.47.03, CUDA Version: 11.6
114+
115+
# Newer AMIs are listed (commented out) below. Because other clouds do not
116+
# support torch==1.13.0+cu117, we do not use them, to avoid frequent updates:
117+
# Deep Learning AMI GPU PyTorch 1.12.1 (Ubuntu 20.04) 20221025
118+
# Nvidia driver: 510.47.03, CUDA Version: 11.6 (supports torch==1.13.0+cu117)
119+
# 'us-east-1': 'ami-0eb1f91977a3fcc1b'
120+
# 'us-east-2': 'ami-0274a6db2e19b7cc6'
121+
# 'us-west-1': 'ami-0fb299af41d32cfd3'
122+
# 'us-west-2': 'ami-04ba15f9bd464eb20'
123+
#
124+
# Current AMIs:
125+
# Deep Learning AMI GPU PyTorch 1.10.0 (Ubuntu 20.04) 20220308
126+
# Nvidia driver: 510.47.03, CUDA Version: 11.6 (does not support torch==1.13.0+cu117)
116127
amis = {
117128
'us-east-1': 'ami-0729d913a335efca7',
118129
'us-east-2': 'ami-070f4af81c19b41bf',

sky/clouds/azure.py

+7
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,13 @@ def get_default_instance_type(cls) -> str:
8282
return 'Standard_D8_v4'
8383

8484
def _get_image_config(self, gen_version, instance_type):
85+
# az vm image list \
86+
# --publisher microsoft-dsvm --all --output table
87+
# nvidia-driver: 495.29.05, cuda: 11.5
88+
89+
# The latest images (2022.09.14/2022.08.11/22.06.10/22.05.11/
90+
# 22.04.27/22.04.05) have an even older nvidia driver, 470.57.02,
91+
# cuda: 11.4
8592
image_config = {
8693
'image_publisher': 'microsoft-dsvm',
8794
'image_offer': 'ubuntu-2004',

sky/clouds/gcp.py

+8
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,9 @@ def make_deploy_resources_variables(
214214
region_name = region.name
215215
zones = [zones[0].name]
216216

217+
# gcloud compute images list \
218+
# --project deeplearning-platform-release \
219+
# --no-standard-images
217220
image_id = _IMAGE_ID_PREFIX + 'common-cpu-v20220806'
218221

219222
r = resources
@@ -255,10 +258,15 @@ def make_deploy_resources_variables(
255258
acc.lower())
256259
resources_vars['gpu_count'] = acc_count
257260
if acc == 'K80':
261+
# Though the image is called cu113, it actually has later
262+
# versions of CUDA as noted below.
258263
# CUDA driver version 470.57.02, CUDA Library 11.4
259264
image_id = _IMAGE_ID_PREFIX + 'common-cu113-v20220701'
260265
else:
266+
# Though the image is called cu113, it actually has later
267+
# versions of CUDA as noted below.
261268
# CUDA driver version 510.47.03, CUDA Library 11.6
269+
# Does not support torch==1.13.0 with cu117
262270
image_id = _IMAGE_ID_PREFIX + 'common-cu113-v20220806'
263271

264272
if resources.image_id is not None:

sky/skylet/providers/azure/node_provider.py

+3-7
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from threading import RLock
55
from uuid import uuid4
66

7-
from azure.identity import DefaultAzureCredential
7+
from azure.identity import AzureCliCredential
88
from azure.mgmt.compute import ComputeManagementClient
99
from azure.mgmt.network import NetworkManagementClient
1010
from azure.mgmt.resource import ResourceManagementClient
@@ -65,12 +65,8 @@ def __init__(self, provider_config, cluster_name):
6565
_configure_resource_group({"provider": provider_config})
6666
subscription_id = provider_config["subscription_id"]
6767
self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True)
68-
# AWS provides managed identity for Azure, but it is not setup properly by
69-
# default. This interferes with azure-cli credentials and causes failures,
70-
# when using sky to launch Azure on AWS ec2 instances. We disable it to give
71-
# way to azure-cli credentials.
72-
credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True,
73-
exclude_managed_identity_credential=True)
68+
# Sky only supports Azure CLI credentials for now.
69+
credential = AzureCliCredential()
7470
self.compute_client = ComputeManagementClient(credential, subscription_id)
7571
self.network_client = NetworkManagementClient(credential, subscription_id)
7672
self.resource_client = ResourceManagementClient(credential, subscription_id)

0 commit comments

Comments
 (0)