Skip to content

Commit 6defe8c

Browse files
Port .j2 provision speed optimizations to AWS. (skypilot-org#1111)
1 parent 4b67bca commit 6defe8c

File tree

2 files changed

+28
-10
lines changed

2 files changed

+28
-10
lines changed

sky/templates/aws-ray.yml.j2

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -74,18 +74,25 @@ rsync_exclude: []
7474
initialization_commands: []
7575

7676
# List of shell commands to run to set up nodes.
77+
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
78+
# connection, which is expensive. Try your best to co-locate commands into fewer
79+
# items!
80+
#
81+
# Update the following count whenever items are added or removed, so that performance bugs are easier to catch:
82+
# current num items (num SSH connections): 1
7783
setup_commands:
7884
# Create ~/.ssh/config file in case the file does not exist in the custom image.
7985
# Make sure python3 & pip3 are available on this image.
8086
# This also kills the service that is holding the lock on dpkg (problem only exists on aws/azure, not gcp)
81-
- mkdir -p ~/.ssh; touch ~/.ssh/config;
82-
pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc);
83-
(type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc;
8487
# We set auto_activate_base to be false for pre-installed conda.
85-
- (which conda > /dev/null 2>&1 && conda init > /dev/null && conda config --set auto_activate_base false) || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true);
8688
# This also kills the service that is holding the lock on dpkg (problem only exists on aws/azure, not gcp)
8789
# Patch the buggy ray files and enable the `-o allow_other` option for `goofys`.
88-
- (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;
90+
- mkdir -p ~/.ssh; touch ~/.ssh/config;
91+
pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc);
92+
(type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc;
93+
(type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;
94+
(which conda > /dev/null 2>&1 && conda init > /dev/null && conda config --set auto_activate_base false) || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true);
95+
source ~/.bashrc;
8996
(pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;
9097
pip3 uninstall skypilot -y &> /dev/null;
9198
pip3 install "$(echo {{sky_remote_path}}/skypilot-{{sky_version}}*.whl)[aws]";
@@ -98,20 +105,28 @@ setup_commands:
98105
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
99106

100107
# Command to start ray on the head node. You don't need to change this.
108+
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
109+
# connection, which is expensive. Try your best to co-locate commands into fewer
110+
# items! The same comment applies for worker_start_ray_commands.
111+
#
112+
# Update the following count whenever items are added or removed, so that performance bugs are easier to catch:
113+
# current num items (num SSH connections): 2
101114
head_start_ray_commands:
102115
# Set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/guide.html?highlight=ulimit#system-configuration
103116
# Solution from https://discuss.ray.io/t/setting-ulimits-on-ec2-instances/590
104117
# This item is intentionally kept separate from the next item so that the session is reloaded after the ulimit is set.
105118
- sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 65535" >> /etc/security/limits.conf; echo "* hard nofile 65535" >> /etc/security/limits.conf;';
106119
(grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
107-
- (ps aux | grep "-m sky.skylet.skylet" | grep -q python3) || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 & # Start skylet daemon. (Should not place it in the head_setup_commands, otherwise it will run before sky is installed.)
108-
- ray stop; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}}
120+
# Start the skylet daemon. (Do not place this in head_setup_commands; otherwise it will run before sky is installed.)
121+
# NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
122+
- ((ps aux | grep "-m sky.skylet.skylet" | grep -q python3) || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &);
123+
ray stop; ray start --disable-usage-stats --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}}
109124

110125
{%- if num_nodes > 1 %}
111126
worker_start_ray_commands:
112127
- sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 65535" >> /etc/security/limits.conf; echo "* hard nofile 65535" >> /etc/security/limits.conf;';
113128
(grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config
114-
- ray stop; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}}
129+
- ray stop; ray start --disable-usage-stats --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}}
115130
{%- else %}
116131
worker_start_ray_commands: []
117132
{%- endif %}

sky/templates/gcp-ray.yml.j2

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,13 +130,16 @@ setup_commands:
130130
# commands.
131131
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
132132
- mkdir -p ~/.ssh; touch ~/.ssh/config;
133-
pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc); (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc; (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;
133+
pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc);
134+
(type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc;
135+
(type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;
134136
which conda > /dev/null 2>&1 || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(/home/gcpuser/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true);
135137
source ~/.bashrc;
136138
(pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;
137139
pip3 uninstall skypilot -y &> /dev/null;
138140
pip3 install "$(echo {{sky_remote_path}}/skypilot-{{sky_version}}*.whl)[gcp]" || exit 1;
139-
python3 -c "from sky.skylet.ray_patches import patch; patch()"; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `gcsfuse`;
141+
python3 -c "from sky.skylet.ray_patches import patch; patch()";
142+
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `gcsfuse`;
140143
{%- if tpu_vm %}
141144
- pip3 install --upgrade google-api-python-client;
142145
{%- endif %}

0 commit comments

Comments
 (0)