Skip to content

Commit 1c88b13

Browse files
authored
[Image] Fix blocking by unattended-upgrade (skypilot-org#1347)
* Fix blocking by unattended-upgrade * adapt to gcp and azure
1 parent fca1824 commit 1c88b13

File tree

3 files changed

+27
-13
lines changed

3 files changed

+27
-13
lines changed

sky/templates/aws-ray.yml.j2

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ initialization_commands: []
8989
# Increment the following for catching performance bugs easier:
9090
# current num items (num SSH connections): 1
9191
setup_commands:
92+
# Disable `unattended-upgrades` to prevent apt-get from hanging. These commands should run at the very beginning of setup, before anything else, to avoid being blocked. (This is a temporary fix.)
9293
# Create ~/.ssh/config file in case the file does not exist in the custom image.
9394
# Make sure python3 & pip3 are available on this image.
9495
# We set auto_activate_base to be false for pre-installed conda.
@@ -97,19 +98,21 @@ setup_commands:
9798
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
9899
# Line 'mkdir -p ..': disable host key check
99100
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
100-
- mkdir -p ~/.ssh; touch ~/.ssh/config;
101+
- sudo systemctl stop unattended-upgrades || true;
102+
sudo systemctl disable unattended-upgrades || true;
103+
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
104+
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
105+
sudo pkill -9 apt-get;
106+
sudo pkill -9 dpkg;
107+
sudo dpkg --configure -a;
108+
mkdir -p ~/.ssh; touch ~/.ssh/config;
101109
pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc);
102110
(type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc;
103111
(type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;
104112
(which conda > /dev/null 2>&1 && conda init > /dev/null && conda config --set auto_activate_base false) || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true);
105113
source ~/.bashrc;
106114
(pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;
107115
(pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[aws]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
108-
sudo systemctl stop unattended-upgrades;
109-
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
110-
sudo pkill -9 apt-get;
111-
sudo pkill -9 dpkg;
112-
sudo dpkg --configure -a;
113116
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
114117
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
115118
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;

sky/templates/azure-ray.yml.j2

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,26 +88,29 @@ initialization_commands: []
8888
# Increment the following for catching performance bugs easier:
8989
# current num items (num SSH connections): 1
9090
setup_commands:
91+
# Disable `unattended-upgrades` to prevent apt-get from hanging. These commands should run at the very beginning of setup, before anything else, to avoid being blocked. (This is a temporary fix.)
9192
# Create ~/.ssh/config file in case the file does not exist in the image.
9293
# Make sure python3 & pip3 are available on this image.
9394
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
9495
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
9596
# Line 'mkdir -p ..': disable host key check
9697
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
9798
# This also kills the service that is holding the lock on dpkg (problem only exists on aws/azure, not gcp)
98-
- mkdir -p ~/.ssh; touch ~/.ssh/config;
99+
- sudo systemctl stop unattended-upgrades || true;
100+
sudo systemctl disable unattended-upgrades || true;
101+
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
102+
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
103+
sudo pkill -9 apt-get;
104+
sudo pkill -9 dpkg;
105+
sudo dpkg --configure -a;
106+
mkdir -p ~/.ssh; touch ~/.ssh/config;
99107
pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc);
100108
(type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc;
101109
(type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;
102110
which conda > /dev/null 2>&1 || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(/home/azureuser/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true);
103111
source ~/.bashrc;
104112
(pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful;
105113
(pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[azure]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
106-
sudo systemctl stop unattended-upgrades;
107-
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
108-
sudo pkill -9 apt-get;
109-
sudo pkill -9 dpkg;
110-
sudo dpkg --configure -a;
111114
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
112115
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
113116
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;

sky/templates/gcp-ray.yml.j2

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ initialization_commands: []
123123
# Increment the following for catching performance bugs easier:
124124
# current num items (num SSH connections): 1 (+1 if tpu_vm)
125125
setup_commands:
126+
# Disable `unattended-upgrades` to prevent apt-get from hanging. These commands should run at the very beginning of setup, before anything else, to avoid being blocked. (This is a temporary fix.)
126127
# Line 'mkdir -p ..': Create ~/.ssh/config file in case the file does not exist in the custom image.
127128
# Line 'pip3 --v ..': Make sure python3 & pip3 are available on this image.
128129
# Line 'which conda ..': some images (TPU VM) do not install conda by
@@ -132,7 +133,14 @@ setup_commands:
132133
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
133134
# Line 'mkdir -p ..': disable host key check
134135
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
135-
- mkdir -p ~/.ssh; touch ~/.ssh/config;
136+
- sudo systemctl stop unattended-upgrades || true;
137+
sudo systemctl disable unattended-upgrades || true;
138+
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
139+
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
140+
sudo pkill -9 apt-get;
141+
sudo pkill -9 dpkg;
142+
sudo dpkg --configure -a;
143+
mkdir -p ~/.ssh; touch ~/.ssh/config;
136144
pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc);
137145
(type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc;
138146
(type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;

0 commit comments

Comments
 (0)