@@ -2,83 +2,87 @@
 
 import sky
 
-with sky.Dag() as dag:
-    # The working directory contains all code and will be synced to remote.
-    workdir = '~/Downloads/tpu'
-    subprocess.run(f'cd {workdir} && git checkout 222cc86',
-                   shell=True,
-                   check=True)
+# The working directory contains all code and will be synced to remote.
+workdir = '~/Downloads/tpu'
+subprocess.run(f'cd {workdir} && git checkout 222cc86', shell=True, check=True)
 
-    # The setup command. Will be run under the working directory.
-    setup = 'pip install --upgrade pip && \
-        conda init bash && \
-        conda activate resnet || \
-          (conda create -n resnet python=3.7 -y && \
-           conda activate resnet && \
-           conda install cudatoolkit=11.0 -y && \
-           pip install tensorflow==2.4.0 pyyaml && \
-           pip install protobuf==3.20 && \
-           cd models && pip install -e .)'
+# The setup command. Will be run under the working directory.
+setup = """\
+    set -e
+    pip install --upgrade pip
+    conda init bash
+    conda activate resnet && exists=1 || exists=0
+    if [ $exists -eq 0 ]; then
+      conda create -n resnet python=3.7 -y
+      conda activate resnet
+      conda install cudatoolkit=11.0 -y
+      pip install tensorflow==2.4.0 pyyaml
+      pip install protobuf==3.20
+      cd models && pip install -e .
+    fi
+    """
 
-    # The command to run. Will be run under the working directory.
-    run = 'conda activate resnet && \
-        export XLA_FLAGS=\'--xla_gpu_cuda_data_dir=/usr/local/cuda/\' && \
-        python -u models/official/resnet/resnet_main.py --use_tpu=False \
+# The command to run. Will be run under the working directory.
+run = """\
+    conda activate resnet
+    export XLA_FLAGS=\'--xla_gpu_cuda_data_dir=/usr/local/cuda/\'
+    python -u models/official/resnet/resnet_main.py --use_tpu=False \
     --mode=train --train_batch_size=256 --train_steps=250 \
     --iterations_per_loop=125 \
     --data_dir=gs://cloud-tpu-test-datasets/fake_imagenet \
     --model_dir=resnet-model-dir \
-    --amp --xla --loss_scale=128'
+    --amp --xla --loss_scale=128
+    """
 
-    ### Optional: download data to VM's local disks. ###
-    # Format: {VM paths: local paths / cloud URLs}.
-    file_mounts = {
-        # Download from GCS before training starts.
-        # '/tmp/fake_imagenet': 'gs://cloud-tpu-test-datasets/fake_imagenet',
-    }
-    # Refer to the VM local path.
-    # run = run.replace('gs://cloud-tpu-test-datasets/fake_imagenet',
-    #                   '/tmp/fake_imagenet')
-    ### Optional end ###
+### Optional: download data to VM's local disks. ###
+# Format: {VM paths: local paths / cloud URLs}.
+file_mounts = {
+    # Download from GCS before training starts.
+    # '/tmp/fake_imagenet': 'gs://cloud-tpu-test-datasets/fake_imagenet',
+}
+# Refer to the VM local path.
+# run = run.replace('gs://cloud-tpu-test-datasets/fake_imagenet',
+#                   '/tmp/fake_imagenet')
+### Optional end ###
 
-    train = sky.Task(
-        'train',
-        workdir=workdir,
-        setup=setup,
-        run=run,
-    )
-    train.set_file_mounts(file_mounts)
-    # TODO: allow option to say (or detect) no download/egress cost.
-    train.set_inputs('gs://cloud-tpu-test-datasets/fake_imagenet',
-                     estimated_size_gigabytes=70)
-    train.set_outputs('resnet-model-dir', estimated_size_gigabytes=0.1)
-    train.set_resources({
-        ##### Fully specified
-        # sky.Resources(sky.AWS(), 'p3.2xlarge'),
-        # sky.Resources(sky.GCP(), 'n1-standard-16'),
-        # sky.Resources(
-        #     sky.GCP(),
-        #     'n1-standard-8',
-        #     # Options: 'V100', {'V100': <num>}.
-        #     'V100',
-        # ),
-        ##### Partially specified
-        # sky.Resources(accelerators='T4'),
-        # sky.Resources(accelerators={'T4': 8}, use_spot=True),
-        # sky.Resources(sky.AWS(), accelerators={'T4': 8}, use_spot=True),
-        # sky.Resources(sky.AWS(), accelerators='K80'),
-        # sky.Resources(sky.AWS(), accelerators='K80', use_spot=True),
-        # sky.Resources(accelerators='tpu-v3-8'),
-        # sky.Resources(accelerators='V100', use_spot=True),
-        # sky.Resources(accelerators={'T4': 4}),
-        sky.Resources(sky.AWS(), accelerators='V100'),
-        # sky.Resources(sky.GCP(), accelerators={'V100': 4}),
-        # sky.Resources(sky.AWS(), accelerators='V100', use_spot=True),
-        # sky.Resources(sky.AWS(), accelerators={'V100': 8}),
-    })
+task = sky.Task(
+    'train',
+    workdir=workdir,
+    setup=setup,
+    run=run,
+)
+task.set_file_mounts(file_mounts)
+# TODO: allow option to say (or detect) no download/egress cost.
+task.set_inputs('gs://cloud-tpu-test-datasets/fake_imagenet',
+                estimated_size_gigabytes=70)
+task.set_outputs('resnet-model-dir', estimated_size_gigabytes=0.1)
+task.set_resources({
+    ##### Fully specified
+    # sky.Resources(sky.AWS(), 'p3.2xlarge'),
+    # sky.Resources(sky.GCP(), 'n1-standard-16'),
+    # sky.Resources(
+    #     sky.GCP(),
+    #     'n1-standard-8',
+    #     # Options: 'V100', {'V100': <num>}.
+    #     'V100',
+    # ),
+    ##### Partially specified
+    # sky.Resources(accelerators='T4'),
+    # sky.Resources(accelerators={'T4': 8}, use_spot=True),
+    # sky.Resources(sky.AWS(), accelerators={'T4': 8}, use_spot=True),
+    # sky.Resources(sky.AWS(), accelerators='K80'),
+    # sky.Resources(sky.AWS(), accelerators='K80', use_spot=True),
+    # sky.Resources(accelerators='tpu-v3-8'),
+    # sky.Resources(accelerators='V100', use_spot=True),
+    # sky.Resources(accelerators={'T4': 4}),
+    sky.Resources(sky.AWS(), accelerators='V100'),
+    # sky.Resources(sky.GCP(), accelerators={'V100': 4}),
+    # sky.Resources(sky.AWS(), accelerators='V100', use_spot=True),
+    # sky.Resources(sky.AWS(), accelerators={'V100': 8}),
+})
 
-    # Optionally, specify a time estimator: Resources -> time in seconds.
-    # train.set_time_estimator(time_estimators.resnet50_estimate_runtime)
+# Optionally, specify a time estimator: Resources -> time in seconds.
+# task.set_time_estimator(time_estimators.resnet50_estimate_runtime)
 
-# sky.launch(dag, dryrun=True)
-sky.launch(dag)
+# sky.launch(task, dryrun=True)
+sky.launch(task)
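
One non-obvious detail in the new setup script: the `conda activate resnet && exists=1 || exists=0` probe cannot abort the script even though it runs under `set -e`, because a command that is not the last item of an `&&`/`||` list is exempt from errexit; a missing env simply leaves `exists=0` and the `if` branch creates it. A minimal standalone sketch of that behavior (hypothetical, not part of the commit):

    import subprocess

    # Mirrors the guard pattern in the new setup script: under `set -e`, the
    # failing `false` does not abort, because it is not the last command of the
    # &&/|| list; the script continues and reports exists=0 with exit code 0.
    probe = """\
    set -e
    false && exists=1 || exists=0
    echo "exists=$exists"
    """
    subprocess.run(probe, shell=True, check=True, executable='/bin/bash')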
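The optional block in the middle of the diff is kept entirely commented out. For reference, this is roughly what it would look like enabled; a sketch only, reusing just the names that already appear above (`file_mounts`, `run`, and the GCS path). It would need to run before the `sky.Task(...)` call so the task picks up the rewritten `run` string:

    # Stage the dataset on the VM's local disk before training starts, then
    # point the training command at the local copy instead of the GCS bucket.
    file_mounts = {
        '/tmp/fake_imagenet': 'gs://cloud-tpu-test-datasets/fake_imagenet',
    }
    run = run.replace('gs://cloud-tpu-test-datasets/fake_imagenet',
                      '/tmp/fake_imagenet')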