 
 import sky
 
-with sky.Dag() as dag:
-    # The working directory contains all code and will be synced to remote.
-    workdir = '~/Downloads/tpu'
-    subprocess.run(f'cd {workdir} && git checkout 222cc86',
-                   shell=True,
-                   check=True)
+# The working directory contains all code and will be synced to remote.
+workdir = '~/Downloads/tpu'
+subprocess.run(f'cd {workdir} && git checkout 222cc86', shell=True, check=True)
 
-    # The setup command. Will be run under the working directory.
-    setup = 'pip install --upgrade pip && \
-        conda init bash && \
-        conda activate resnet || \
-        (conda create -n resnet python=3.7 -y && \
-        conda activate resnet && \
-        conda install cudatoolkit=11.0 -y && \
-        pip install tensorflow==2.4.0 pyyaml && \
-        pip install protobuf==3.20 && \
-        cd models && pip install -e .)'
+# The setup command. Will be run under the working directory.
+setup = """\
+    set -e
+    pip install --upgrade pip
+    conda init bash
+    conda activate resnet && exists=1 || exists=0
+    if [ $exists -eq 0 ]; then
+        conda create -n resnet python=3.7 -y
+        conda activate resnet
+        conda install cudatoolkit=11.0 -y
+        pip install tensorflow==2.4.0 pyyaml
+        pip install protobuf==3.20
+        cd models && pip install -e .
+    fi
+    """
 
-    # The command to run. Will be run under the working directory.
-    run = 'conda activate resnet && \
-        export XLA_FLAGS=\'--xla_gpu_cuda_data_dir=/usr/local/cuda/\' && \
-        python -u models/official/resnet/resnet_main.py --use_tpu=False \
+# The command to run. Will be run under the working directory.
+run = """\
+    conda activate resnet
+    export XLA_FLAGS=\'--xla_gpu_cuda_data_dir=/usr/local/cuda/\'
+    python -u models/official/resnet/resnet_main.py --use_tpu=False \
         --mode=train --train_batch_size=256 --train_steps=250 \
         --iterations_per_loop=125 \
         --data_dir=gs://cloud-tpu-test-datasets/fake_imagenet \
         --model_dir=resnet-model-dir \
-        --amp --xla --loss_scale=128'
+        --amp --xla --loss_scale=128
+    """
 
-    ### Optional: download data to VM's local disks. ###
-    # Format: {VM paths: local paths / cloud URLs}.
-    file_mounts = {
-        # Download from GCS before training starts.
-        # '/tmp/fake_imagenet': 'gs://cloud-tpu-test-datasets/fake_imagenet',
-    }
-    # Refer to the VM local path.
-    # run = run.replace('gs://cloud-tpu-test-datasets/fake_imagenet',
-    #                   '/tmp/fake_imagenet')
-    ### Optional end ###
+### Optional: download data to VM's local disks. ###
+# Format: {VM paths: local paths / cloud URLs}.
+file_mounts = {
+    # Download from GCS before training starts.
+    # '/tmp/fake_imagenet': 'gs://cloud-tpu-test-datasets/fake_imagenet',
+}
+# Refer to the VM local path.
+# run = run.replace('gs://cloud-tpu-test-datasets/fake_imagenet',
+#                   '/tmp/fake_imagenet')
+### Optional end ###
 
-    train = sky.Task(
-        'train',
-        workdir=workdir,
-        setup=setup,
-        run=run,
-    )
-    train.set_file_mounts(file_mounts)
-    # TODO: allow option to say (or detect) no download/egress cost.
-    train.set_inputs('gs://cloud-tpu-test-datasets/fake_imagenet',
-                     estimated_size_gigabytes=70)
-    train.set_outputs('resnet-model-dir', estimated_size_gigabytes=0.1)
-    train.set_resources({
-        ##### Fully specified
-        # sky.Resources(sky.AWS(), 'p3.2xlarge'),
-        # sky.Resources(sky.GCP(), 'n1-standard-16'),
-        # sky.Resources(
-        #     sky.GCP(),
-        #     'n1-standard-8',
-        #     # Options: 'V100', {'V100': <num>}.
-        #     'V100',
-        # ),
-        ##### Partially specified
-        # sky.Resources(accelerators='T4'),
-        # sky.Resources(accelerators={'T4': 8}, use_spot=True),
-        # sky.Resources(sky.AWS(), accelerators={'T4': 8}, use_spot=True),
-        # sky.Resources(sky.AWS(), accelerators='K80'),
-        # sky.Resources(sky.AWS(), accelerators='K80', use_spot=True),
-        # sky.Resources(accelerators='tpu-v3-8'),
-        # sky.Resources(accelerators='V100', use_spot=True),
-        # sky.Resources(accelerators={'T4': 4}),
-        sky.Resources(sky.AWS(), accelerators='V100'),
-        # sky.Resources(sky.GCP(), accelerators={'V100': 4}),
-        # sky.Resources(sky.AWS(), accelerators='V100', use_spot=True),
-        # sky.Resources(sky.AWS(), accelerators={'V100': 8}),
-    })
+task = sky.Task(
+    'train',
+    workdir=workdir,
+    setup=setup,
+    run=run,
+)
+task.set_file_mounts(file_mounts)
+# TODO: allow option to say (or detect) no download/egress cost.
+task.set_inputs('gs://cloud-tpu-test-datasets/fake_imagenet',
+                estimated_size_gigabytes=70)
+task.set_outputs('resnet-model-dir', estimated_size_gigabytes=0.1)
+task.set_resources({
+    ##### Fully specified
+    # sky.Resources(sky.AWS(), 'p3.2xlarge'),
+    # sky.Resources(sky.GCP(), 'n1-standard-16'),
+    # sky.Resources(
+    #     sky.GCP(),
+    #     'n1-standard-8',
+    #     # Options: 'V100', {'V100': <num>}.
+    #     'V100',
+    # ),
+    ##### Partially specified
+    # sky.Resources(accelerators='T4'),
+    # sky.Resources(accelerators={'T4': 8}, use_spot=True),
+    # sky.Resources(sky.AWS(), accelerators={'T4': 8}, use_spot=True),
+    # sky.Resources(sky.AWS(), accelerators='K80'),
+    # sky.Resources(sky.AWS(), accelerators='K80', use_spot=True),
+    # sky.Resources(accelerators='tpu-v3-8'),
+    # sky.Resources(accelerators='V100', use_spot=True),
+    # sky.Resources(accelerators={'T4': 4}),
+    sky.Resources(sky.AWS(), accelerators='V100'),
+    # sky.Resources(sky.GCP(), accelerators={'V100': 4}),
+    # sky.Resources(sky.AWS(), accelerators='V100', use_spot=True),
+    # sky.Resources(sky.AWS(), accelerators={'V100': 8}),
+})
 
-    # Optionally, specify a time estimator: Resources -> time in seconds.
-    # train.set_time_estimator(time_estimators.resnet50_estimate_runtime)
+# Optionally, specify a time estimator: Resources -> time in seconds.
+# task.set_time_estimator(time_estimators.resnet50_estimate_runtime)
 
-# sky.launch(dag, dryrun=True)
-sky.launch(dag)
+# sky.launch(task, dryrun=True)
+sky.launch(task)
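
Note on the optional time estimator referenced above: set_time_estimator() takes a callable mapping a sky.Resources candidate to an estimated runtime in seconds (per the comment in the code), presumably so the optimizer can compare the candidates passed to set_resources(). Below is a minimal sketch of a custom estimator under that assumption; the one-hour constant is an illustrative placeholder, not a measured value.

    # Hypothetical estimator: callable(resources) -> estimated seconds.
    def constant_time_estimator(resources):
        # A real estimator would inspect `resources` (accelerator type and
        # count, spot vs. on-demand) and return per-configuration estimates;
        # this sketch simply returns one hour for every candidate.
        del resources  # unused in this constant-time sketch
        return 3600.0

    # task.set_time_estimator(constant_time_estimator)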