Commit e536f7e (authored Oct 24, 2022)

[UX/Programmatic API] Support task in launch/exec/spot_launch (skypilot-org#1293)

Squashed commit messages:
* Support task in launch/exec/spot_launch
* Fix resnet_app
* fix spot launch and resnet_app
* address comments
* update examples
* copy when converting
* rename to task
* fix
* address comments

Parent: 561c21f
8 files changed: +325, -309 lines
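In short, sky.launch/sky.exec/sky.spot_launch now accept a sky.Task directly, where previously the task had to be wrapped in a sky.Dag. A minimal before/after sketch, distilled from the updated examples below (the echo command, resources, and cluster name are illustrative placeholders, not part of this commit):

    import sky

    # Old style: wrap the task in a Dag, then submit the Dag.
    # with sky.Dag() as dag:
    #     task = sky.Task(run='echo hello')
    #     task.set_resources(sky.Resources(accelerators={'V100': 1}))
    # sky.launch(dag, cluster_name='mycluster')

    # New style: pass the Task directly to launch/exec/spot_launch.
    task = sky.Task(run='echo hello')
    task.set_resources(sky.Resources(accelerators={'V100': 1}))
    sky.launch(task, cluster_name='mycluster')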

examples/huggingface_glue_imdb_grid_search_app.py (+21, -22)

@@ -17,30 +17,29 @@
 sky.launch(dag, cluster_name='hgs', detach_run=True)
 
 for lr in [1e-5, 2e-5, 3e-5, 4e-5]:
-    with sky.Dag() as dag:
-        # To be filled in: {lr}.
-        run_format = f"""\
-    cd transformers/examples/pytorch/text-classification
-    python3 run_glue.py
-        --learning_rate {lr}
-        --output_dir /tmp/imdb-{lr}/
-        --model_name_or_path bert-base-cased
-        --dataset_name imdb
-        --do_train
-        --max_seq_length 128
-        --per_device_train_batch_size 32
-        --max_steps 50
-        --fp16 --overwrite_output_dir 2>&1 | tee run-{lr}.log'
-        """
+    # To be filled in: {lr}.
+    run_format = f"""\
+cd transformers/examples/pytorch/text-classification
+python3 run_glue.py
+    --learning_rate {lr}
+    --output_dir /tmp/imdb-{lr}/
+    --model_name_or_path bert-base-cased
+    --dataset_name imdb
+    --do_train
+    --max_seq_length 128
+    --per_device_train_batch_size 32
+    --max_steps 50
+    --fp16 --overwrite_output_dir 2>&1 | tee run-{lr}.log'
+    """
 
-        per_trial_resources = sky.Resources(accelerators={'V100': 1})
+    per_trial_resources = sky.Resources(accelerators={'V100': 1})
 
-        task = sky.Task(
-            # A descriptive name.
-            f'task-{lr}',
-            # Run command for each task, with different lr.
-            run=run_format.format(lr=lr)).set_resources(per_trial_resources)
+    task = sky.Task(
+        # A descriptive name.
+        f'task-{lr}',
+        # Run command for each task, with different lr.
+        run=run_format.format(lr=lr)).set_resources(per_trial_resources)
 
     # Set 'stream_logs=False' to not mix all tasks' outputs together.
     # Each task's output is redirected to run-{lr}.log and can be tail-ed.
-    sky.exec(dag, cluster_name='hgs', stream_logs=False, detach_run=True)
+    sky.exec(task, cluster_name='hgs', stream_logs=False, detach_run=True)

examples/multi_echo.py (+4, -5)

@@ -27,11 +27,10 @@ def run(cluster: Optional[str] = None):
 
     # Submit multiple tasks in parallel to trigger queueing behaviors.
    def _exec(i):
-        with sky.Dag() as dag:
-            task = sky.Task(run=f'echo {i}; sleep 5')
-            resources = sky.Resources(accelerators={'K80': 0.5})
-            task.set_resources(resources)
-        sky.exec(dag, cluster_name=cluster, detach_run=True)
+        task = sky.Task(run=f'echo {i}; sleep 5')
+        resources = sky.Resources(accelerators={'K80': 0.5})
+        task.set_resources(resources)
+        sky.exec(task, cluster_name=cluster, detach_run=True)
 
    with pool.ThreadPool(8) as p:
        list(p.imap(_exec, range(32)))

examples/resnet_app.py (+74, -70)

@@ -2,83 +2,87 @@
 
 import sky
 
-with sky.Dag() as dag:
-    # The working directory contains all code and will be synced to remote.
-    workdir = '~/Downloads/tpu'
-    subprocess.run(f'cd {workdir} && git checkout 222cc86',
-                   shell=True,
-                   check=True)
+# The working directory contains all code and will be synced to remote.
+workdir = '~/Downloads/tpu'
+subprocess.run(f'cd {workdir} && git checkout 222cc86', shell=True, check=True)
 
-    # The setup command. Will be run under the working directory.
-    setup = 'pip install --upgrade pip && \
-        conda init bash && \
-        conda activate resnet || \
-        (conda create -n resnet python=3.7 -y && \
-        conda activate resnet && \
-        conda install cudatoolkit=11.0 -y && \
-        pip install tensorflow==2.4.0 pyyaml && \
-        pip install protobuf==3.20 && \
-        cd models && pip install -e .)'
+# The setup command. Will be run under the working directory.
+setup = """\
+set -e
+pip install --upgrade pip
+conda init bash
+conda activate resnet && exists=1 || exists=0
+if [ $exists -eq 0 ]; then
+    conda create -n resnet python=3.7 -y
+    conda activate resnet
+    conda install cudatoolkit=11.0 -y
+    pip install tensorflow==2.4.0 pyyaml
+    pip install protobuf==3.20
+    cd models && pip install -e .
+fi
+"""
 
-    # The command to run. Will be run under the working directory.
-    run = 'conda activate resnet && \
-        export XLA_FLAGS=\'--xla_gpu_cuda_data_dir=/usr/local/cuda/\' && \
-        python -u models/official/resnet/resnet_main.py --use_tpu=False \
+# The command to run. Will be run under the working directory.
+run = """\
+conda activate resnet
+export XLA_FLAGS=\'--xla_gpu_cuda_data_dir=/usr/local/cuda/\'
+python -u models/official/resnet/resnet_main.py --use_tpu=False \
     --mode=train --train_batch_size=256 --train_steps=250 \
     --iterations_per_loop=125 \
    --data_dir=gs://cloud-tpu-test-datasets/fake_imagenet \
    --model_dir=resnet-model-dir \
-    --amp --xla --loss_scale=128'
+    --amp --xla --loss_scale=128
+"""
 
-    ### Optional: download data to VM's local disks. ###
-    # Format: {VM paths: local paths / cloud URLs}.
-    file_mounts = {
-        # Download from GCS before training starts.
-        # '/tmp/fake_imagenet': 'gs://cloud-tpu-test-datasets/fake_imagenet',
-    }
-    # Refer to the VM local path.
-    # run = run.replace('gs://cloud-tpu-test-datasets/fake_imagenet',
-    #                   '/tmp/fake_imagenet')
-    ### Optional end ###
+### Optional: download data to VM's local disks. ###
+# Format: {VM paths: local paths / cloud URLs}.
+file_mounts = {
+    # Download from GCS before training starts.
+    # '/tmp/fake_imagenet': 'gs://cloud-tpu-test-datasets/fake_imagenet',
+}
+# Refer to the VM local path.
+# run = run.replace('gs://cloud-tpu-test-datasets/fake_imagenet',
+#                   '/tmp/fake_imagenet')
+### Optional end ###
 
-    train = sky.Task(
-        'train',
-        workdir=workdir,
-        setup=setup,
-        run=run,
-    )
-    train.set_file_mounts(file_mounts)
-    # TODO: allow option to say (or detect) no download/egress cost.
-    train.set_inputs('gs://cloud-tpu-test-datasets/fake_imagenet',
-                     estimated_size_gigabytes=70)
-    train.set_outputs('resnet-model-dir', estimated_size_gigabytes=0.1)
-    train.set_resources({
-        ##### Fully specified
-        # sky.Resources(sky.AWS(), 'p3.2xlarge'),
-        # sky.Resources(sky.GCP(), 'n1-standard-16'),
-        # sky.Resources(
-        #     sky.GCP(),
-        #     'n1-standard-8',
-        #     # Options: 'V100', {'V100': <num>}.
-        #     'V100',
-        # ),
-        ##### Partially specified
-        # sky.Resources(accelerators='T4'),
-        # sky.Resources(accelerators={'T4': 8}, use_spot=True),
-        # sky.Resources(sky.AWS(), accelerators={'T4': 8}, use_spot=True),
-        # sky.Resources(sky.AWS(), accelerators='K80'),
-        # sky.Resources(sky.AWS(), accelerators='K80', use_spot=True),
-        # sky.Resources(accelerators='tpu-v3-8'),
-        # sky.Resources(accelerators='V100', use_spot=True),
-        # sky.Resources(accelerators={'T4': 4}),
-        sky.Resources(sky.AWS(), accelerators='V100'),
-        # sky.Resources(sky.GCP(), accelerators={'V100': 4}),
-        # sky.Resources(sky.AWS(), accelerators='V100', use_spot=True),
-        # sky.Resources(sky.AWS(), accelerators={'V100': 8}),
-    })
+task = sky.Task(
+    'train',
+    workdir=workdir,
+    setup=setup,
+    run=run,
+)
+task.set_file_mounts(file_mounts)
+# TODO: allow option to say (or detect) no download/egress cost.
+task.set_inputs('gs://cloud-tpu-test-datasets/fake_imagenet',
+                estimated_size_gigabytes=70)
+task.set_outputs('resnet-model-dir', estimated_size_gigabytes=0.1)
+task.set_resources({
+    ##### Fully specified
+    # sky.Resources(sky.AWS(), 'p3.2xlarge'),
+    # sky.Resources(sky.GCP(), 'n1-standard-16'),
+    # sky.Resources(
+    #     sky.GCP(),
+    #     'n1-standard-8',
+    #     # Options: 'V100', {'V100': <num>}.
+    #     'V100',
+    # ),
+    ##### Partially specified
+    # sky.Resources(accelerators='T4'),
+    # sky.Resources(accelerators={'T4': 8}, use_spot=True),
+    # sky.Resources(sky.AWS(), accelerators={'T4': 8}, use_spot=True),
+    # sky.Resources(sky.AWS(), accelerators='K80'),
+    # sky.Resources(sky.AWS(), accelerators='K80', use_spot=True),
+    # sky.Resources(accelerators='tpu-v3-8'),
+    # sky.Resources(accelerators='V100', use_spot=True),
+    # sky.Resources(accelerators={'T4': 4}),
+    sky.Resources(sky.AWS(), accelerators='V100'),
+    # sky.Resources(sky.GCP(), accelerators={'V100': 4}),
+    # sky.Resources(sky.AWS(), accelerators='V100', use_spot=True),
+    # sky.Resources(sky.AWS(), accelerators={'V100': 8}),
+})
 
-    # Optionally, specify a time estimator: Resources -> time in seconds.
-    # train.set_time_estimator(time_estimators.resnet50_estimate_runtime)
+# Optionally, specify a time estimator: Resources -> time in seconds.
+# task.set_time_estimator(time_estimators.resnet50_estimate_runtime)
 
-# sky.launch(dag, dryrun=True)
-sky.launch(dag)
+# sky.launch(task, dryrun=True)
+sky.launch(task)

examples/resnet_distributed_torch_app.py (+50, -49)

@@ -2,52 +2,53 @@
 
 import sky
 
-with sky.Dag() as dag:
-    # Total Nodes, INCLUDING Head Node
-    num_nodes = 2
-
-    # The setup command. Will be run under the working directory.
-    setup = 'echo \"alias python=python3\" >> ~/.bashrc && pip3 install --upgrade pip && \
-        [ -d pytorch-distributed-resnet ] || \
-        (git clone https://github.com/michaelzhiluo/pytorch-distributed-resnet && \
-        cd pytorch-distributed-resnet && pip3 install -r requirements.txt && \
-        mkdir -p data && mkdir -p saved_models && cd data && \
-        wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz && \
-        tar -xvzf cifar-10-python.tar.gz)'
-
-    # The command to run. Will be run under the working directory.
-    def run_fn(node_rank: int, ip_list: List[str]) -> Optional[str]:
-        num_nodes = len(ip_list)
-        return f"""\
-        cd pytorch-distributed-resnet
-        python3 -m torch.distributed.launch --nproc_per_node=1 \
-        --nnodes={num_nodes} --node_rank={node_rank} --master_addr={ip_list[0]} \
-        --master_port=8008 resnet_ddp.py --num_epochs 20
-        """
-
-    train = sky.Task(
-        'train',
-        setup=setup,
-        num_nodes=num_nodes,
-        run=run_fn,
-    )
-
-    train.set_resources({
-        ##### Fully specified
-        sky.Resources(sky.AWS(), 'p3.2xlarge'),
-        # sky.Resources(sky.GCP(), 'n1-standard-16'),
-        #sky.Resources(
-        #    sky.GCP(),
-        #    'n1-standard-8',
-        #    # Options: 'V100', {'V100': <num>}.
-        #    'V100',
-        #),
-        ##### Partially specified
-        #sky.Resources(accelerators='V100'),
-        # sky.Resources(accelerators='tpu-v3-8'),
-        # sky.Resources(sky.AWS(), accelerators={'V100': 4}),
-        # sky.Resources(sky.AWS(), accelerators='V100'),
-    })
-
-    sky.launch(dag, cluster_name='dth')
-    # sky.exec(dag, cluster_name='dth')
+# Total Nodes, INCLUDING Head Node
+num_nodes = 2
+
+# The setup command. Will be run under the working directory.
+setup = 'echo \"alias python=python3\" >> ~/.bashrc && pip3 install --upgrade pip && \
+    [ -d pytorch-distributed-resnet ] || \
+    (git clone https://github.com/michaelzhiluo/pytorch-distributed-resnet && \
+    cd pytorch-distributed-resnet && pip3 install -r requirements.txt && \
+    mkdir -p data && mkdir -p saved_models && cd data && \
+    wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz && \
+    tar -xvzf cifar-10-python.tar.gz)'
+
+
+# The command to run. Will be run under the working directory.
+def run_fn(node_rank: int, ip_list: List[str]) -> Optional[str]:
+    num_nodes = len(ip_list)
+    return f"""\
+    cd pytorch-distributed-resnet
+    python3 -m torch.distributed.launch --nproc_per_node=1 \
+    --nnodes={num_nodes} --node_rank={node_rank} --master_addr={ip_list[0]} \
+    --master_port=8008 resnet_ddp.py --num_epochs 20
+    """
+
+
+train = sky.Task(
+    'train',
+    setup=setup,
+    num_nodes=num_nodes,
+    run=run_fn,
+)
+
+train.set_resources({
+    ##### Fully specified
+    sky.Resources(sky.AWS(), 'p3.2xlarge'),
+    # sky.Resources(sky.GCP(), 'n1-standard-16'),
+    #sky.Resources(
+    #    sky.GCP(),
+    #    'n1-standard-8',
+    #    # Options: 'V100', {'V100': <num>}.
+    #    'V100',
+    #),
+    ##### Partially specified
+    #sky.Resources(accelerators='V100'),
+    # sky.Resources(accelerators='tpu-v3-8'),
+    # sky.Resources(sky.AWS(), accelerators={'V100': 4}),
+    # sky.Resources(sky.AWS(), accelerators='V100'),
+})
+
+sky.launch(train, cluster_name='dth')
+# sky.exec(train, cluster_name='dth')

examples/resnet_distributed_torch_with_script.yaml (+3, -3)

@@ -7,10 +7,10 @@ resources:
 
 num_nodes: 2
 
-workdir: .
+workdir: ./examples/resnet_distributed_torch_scripts
 
 setup: |
-  bash examples/resnet_distributed_torch_scripts/setup.sh
+  bash ./setup.sh
 
 run: |
-  bash examples/resnet_distributed_torch_scripts/run.sh
+  bash ./run.sh

0 commit comments