Skip to content

Commit 9750679

Browse files
Docker example and fix goofys-docker mounting (skypilot-org#686)
* Fix docker killing * Add docker example * Fix docker example * Fix * Fix the docker example for pytorch installation * Use model caching * Mount output folder * Permission issue * remove useless lines * fix license * Add storage mounting for output and fix the goofys mounting * Minor touches * examples/docker_app.yaml -> examples/detectron2_docker.yaml * Minor * Fix gcp fuse.conf * simplify file_mount options * remove wait Co-authored-by: Zongheng Yang <[email protected]>
1 parent 1668954 commit 9750679

12 files changed

+156
-12
lines changed

LICENSE

+20
Original file line numberDiff line numberDiff line change
@@ -199,3 +199,23 @@
199199
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200200
See the License for the specific language governing permissions and
201201
limitations under the License.
202+
203+
--------------------------------------------------------------------------------
204+
205+
Code in examples/docker/detectron2/ from
206+
https://github.com/facebookresearch/detectron2/tree/main/docker/
207+
Git Revision: 0cebda53b71aead685627487d39dae4fa64017fb
208+
209+
Copyright 2019-2022 detectron2 developers
210+
211+
Licensed under the Apache License, Version 2.0 (the "License");
212+
you may not use this file except in compliance with the License.
213+
You may obtain a copy of the License at
214+
215+
https://www.apache.org/licenses/LICENSE-2.0
216+
217+
Unless required by applicable law or agreed to in writing, software
218+
distributed under the License is distributed on an "AS IS" BASIS,
219+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
220+
See the License for the specific language governing permissions and
221+
limitations under the License.

examples/detectron2_docker.yaml

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Steps to run this example:
2+
#
3+
# (1) Run the following commands locally to get the example input image.
4+
# mkdir -p ~/Downloads/detectron-inputs
5+
# wget http://images.cocodataset.org/val2017/000000439715.jpg -O ~/Downloads/detectron-inputs/input.jpg
6+
#
7+
# (2) Change the bucket name on line 18 of this file to a unique name; a private bucket with that name will be created.
8+
9+
resources:
10+
accelerators: V100:4
11+
12+
file_mounts:
13+
# TODO: run the download commands above first.
14+
/inputs: ~/Downloads/detectron-inputs
15+
/detectron2: ./examples/docker/detectron2
16+
/outputs:
17+
# TODO: Change the name to your own bucket name (e.g., append your user name).
18+
name: sky-detectron2-outputs
19+
mode: MOUNT
20+
21+
22+
setup: |
23+
# Build:
24+
sudo apt update
25+
docker build --build-arg USER_ID=$UID -t detectron2:v0 /detectron2
26+
27+
run: |
28+
# Launch (requires GPUs):
29+
docker run -a stdout -a stderr --gpus=all --pid=host --rm \
30+
--shm-size=8gb \
31+
--volume=$HOME/.torch/fvcore_cache:/tmp:rw \
32+
--volume="/inputs:/inputs:ro" \
33+
--volume="/outputs:/outputs:rw" \
34+
detectron2:v0 /bin/bash -c \
35+
"echo CUDA_VISIBLE_DEVICES $CUDA_VISIBLE_DEVICES && \
36+
sudo chmod -R 777 /tmp && \
37+
python3 demo/demo.py \
38+
--config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \
39+
--input /inputs/input.jpg --output /outputs \
40+
--opts MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl"

examples/docker/detectron2/Dockerfile

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
FROM nvidia/cuda:11.1.1-cudnn8-devel-ubuntu18.04
2+
# use an older system (18.04) to avoid opencv incompatibility (issue#3524)
3+
4+
ENV DEBIAN_FRONTEND noninteractive
5+
RUN apt-get update && apt-get install -y \
6+
python3-opencv ca-certificates python3-dev git wget sudo ninja-build
7+
RUN ln -sv /usr/bin/python3 /usr/bin/python
8+
9+
# create a non-root user
10+
ARG USER_ID=1000
11+
RUN useradd -m --no-log-init --system --uid ${USER_ID} appuser -g sudo
12+
RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
13+
USER appuser
14+
WORKDIR /home/appuser
15+
16+
ENV PATH="/home/appuser/.local/bin:${PATH}"
17+
RUN wget https://bootstrap.pypa.io/pip/3.6/get-pip.py && \
18+
python3 get-pip.py --user && \
19+
rm get-pip.py
20+
21+
# install dependencies
22+
# See https://pytorch.org/ for other options if you use a different version of CUDA
23+
RUN pip install --user tensorboard cmake # cmake from apt-get is too old
24+
RUN pip install --user torch==1.10 torchvision==0.11.1 --trusted-host download.pytorch.org -f http://download.pytorch.org/whl/cu111/torch_stable.html
25+
26+
RUN pip install --user 'git+https://github.com/facebookresearch/fvcore'
27+
# install detectron2
28+
RUN git clone https://github.com/facebookresearch/detectron2 detectron2_repo
29+
# set FORCE_CUDA because during `docker build` cuda is not accessible
30+
ENV FORCE_CUDA="1"
31+
# This will by default build detectron2 for all common cuda architectures and take a lot more time,
32+
# because inside `docker build`, there is no way to tell which architecture will be used.
33+
ARG TORCH_CUDA_ARCH_LIST="Kepler;Kepler+Tesla;Maxwell;Maxwell+Tegra;Pascal;Volta;Turing"
34+
ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"
35+
36+
RUN pip install --user -e detectron2_repo
37+
38+
# Set a fixed model cache directory.
39+
ENV FVCORE_CACHE="/tmp"
40+
WORKDIR /home/appuser/detectron2_repo
41+
42+
# run detectron2 under user "appuser":
43+
# wget http://images.cocodataset.org/val2017/000000439715.jpg -O input.jpg
44+
# python3 demo/demo.py \
45+
#--config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \
46+
#--input input.jpg --output outputs/ \
47+
#--opts MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Copyright (c) Facebook, Inc. and its affiliates.
2+
# This file defines a container that compiles the C++ examples of detectron2.
3+
# See docker/README.md for usage.
4+
5+
# Depends on the image produced by "./Dockerfile"
6+
FROM detectron2:v0
7+
8+
USER appuser
9+
ENV HOME=/home/appuser
10+
WORKDIR $HOME
11+
12+
# Let torchvision find libtorch
13+
ENV CMAKE_PREFIX_PATH=$HOME/.local/lib/python3.6/site-packages/torch/
14+
15+
RUN sudo apt-get update && sudo apt-get install libopencv-dev --yes
16+
17+
# install libtorchvision
18+
RUN git clone --branch v0.11.1 https://github.com/pytorch/vision/
19+
RUN mkdir vision/build && cd vision/build && \
20+
cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/.local -DCMAKE_BUILD_TYPE=Release -DWITH_CUDA=on -DTORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST && \
21+
make -j && make install
22+
23+
# make our installation take effect
24+
ENV CPATH=$HOME/.local/include \
25+
LIBRARY_PATH=$HOME/.local/lib \
26+
LD_LIBRARY_PATH=$HOME/.local/lib
27+
28+
29+
# build C++ examples of detectron2
30+
RUN cd detectron2_repo/tools/deploy && mkdir build && cd build && \
31+
cmake -DTORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST .. && make
32+
# binaries will be available under tools/deploy/build
File renamed without changes.
File renamed without changes.

sky/data/storage.py

+10-10
Original file line numberDiff line numberDiff line change
@@ -752,8 +752,8 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]:
752752
if self.source.startswith('s3://'):
753753
raise exceptions.StorageBucketGetError(
754754
'Attempted to connect to a non-existent bucket: '
755-
f'{self.source}. Consider using aws s3 ls '
756-
f'{self.source} to debug.') from e
755+
f'{self.source}. Consider using `aws s3 ls '
756+
f'{self.source}` to debug.') from e
757757
else:
758758
bucket = self._create_s3_bucket(self.name)
759759
return bucket, True
@@ -766,7 +766,7 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]:
766766
'Failed to connect to an existing bucket. \n'
767767
'Check if the 1) the bucket name is taken and/or '
768768
'2) the bucket permissions are not setup correctly. '
769-
f'Consider using aws s3 ls {self.name} to debug.')
769+
f'Consider using `aws s3 ls {self.name}` to debug.')
770770
logger.error(ex)
771771
raise ex from e
772772

@@ -792,19 +792,19 @@ def mount_command(self, mount_path: str) -> str:
792792
script = textwrap.dedent(f"""
793793
#!/usr/bin/env bash
794794
set -e
795-
795+
796796
S3_SOURCE={self.bucket.name}
797797
MOUNT_PATH={mount_path}
798798
STAT_CACHE_TTL={self._STAT_CACHE_TTL}
799799
TYPE_CACHE_TTL={self._TYPE_CACHE_TTL}
800-
800+
801801
# Check if path is already mounted
802802
if ! [ "$(grep -q $MOUNT_PATH /proc/mounts)" ] ; then
803803
echo "Path already mounted - unmounting..."
804804
fusermount -u "$MOUNT_PATH"
805805
echo "Successfully unmounted $MOUNT_PATH."
806806
fi
807-
807+
808808
# Install goofys if not already installed
809809
if ! [ -x "$(command -v goofys)" ]; then
810810
echo "Installing goofys..."
@@ -813,7 +813,7 @@ def mount_command(self, mount_path: str) -> str:
813813
else
814814
echo "Goofys already installed. Proceeding..."
815815
fi
816-
816+
817817
# Check if mount path exists
818818
if [ ! -d "$MOUNT_PATH" ]; then
819819
echo "Mount path $MOUNT_PATH does not exist. Creating..."
@@ -827,7 +827,7 @@ def mount_command(self, mount_path: str) -> str:
827827
fi
828828
fi
829829
echo "Mounting $S3_SOURCE to $MOUNT_PATH with goofys..."
830-
goofys --stat-cache-ttl $STAT_CACHE_TTL --type-cache-ttl $TYPE_CACHE_TTL $S3_SOURCE $MOUNT_PATH
830+
goofys -o allow_other --stat-cache-ttl $STAT_CACHE_TTL --type-cache-ttl $TYPE_CACHE_TTL $S3_SOURCE $MOUNT_PATH
831831
echo "Mounting done."
832832
""")
833833

@@ -1042,15 +1042,15 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]:
10421042
f'Failed to connect to external bucket {self.name} \n'
10431043
'Check if the 1) the bucket name is taken and/or '
10441044
'2) the bucket permissions are not setup correctly. '
1045-
f'Consider using gsutil ls gs://{self.name} to debug.')
1045+
f'Consider using `gsutil ls gs://{self.name}` to debug.')
10461046
logger.error(ex)
10471047
raise ex from e
10481048
except ValueError as e:
10491049
ex = exceptions.StorageBucketGetError(
10501050
f'Attempted to access a private external bucket {self.name}'
10511051
'\nCheck if the 1) the bucket name is taken and/or '
10521052
'2) the bucket permissions are not setup correctly. '
1053-
f'Consider using gsutil ls gs://{self.name} to debug.')
1053+
f'Consider using `gsutil ls gs://{self.name}` to debug.')
10541054
logger.error(ex)
10551055
raise ex from e
10561056

sky/skylet/subprocess_daemon.sh

+2
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,6 @@ proc_pid=$2
77
while kill -s 0 ${parent_pid}; do sleep 1; done
88

99
pkill -TERM -P ${proc_pid}
10+
# Wait for the processes to exit gracefully
11+
sleep 5
1012
kill -9 ${proc_pid}

sky/templates/aws-ray.yml.j2

+1
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ setup_commands:
8484
sudo pkill -9 apt-get;
8585
sudo pkill -9 dpkg;
8686
sudo dpkg --configure -a;
87+
sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf; # This is needed for `-o allow_other` option for `goofys`
8788
- pip3 uninstall sky -y &> /dev/null; pip3 install {{sky_remote_path}}/*.whl && python3 -c "from sky.skylet.ray_patches import patch; patch()" # patch the buggy ray file
8889

8990
# Command to start ray on the head node. You don't need to change this.

sky/templates/azure-ray.yml.j2

+2-1
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,8 @@ setup_commands:
9494
sudo pkill -9 apt-get;
9595
sudo pkill -9 dpkg;
9696
sudo dpkg --configure -a;
97-
which conda > /dev/null 2>&1 || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(/home/azureuser/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base false)
97+
which conda > /dev/null 2>&1 || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(/home/azureuser/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base false);
98+
sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf; # This is needed for `-o allow_other` option for `goofys`
9899
# We have to install azure-cli because the Azure cluster does not pre-install it.
99100
- pip3 uninstall sky -y &> /dev/null; pip3 install {{sky_remote_path}}/*.whl && pip3 install azure-cli==2.30.0 && python3 -c "from sky.skylet.ray_patches import patch; patch()" # patch the buggy ray file
100101

sky/templates/gcp-ray.yml.j2

+2-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,8 @@ initialization_commands: []
105105

106106
# List of shell commands to run to set up nodes.
107107
setup_commands:
108-
- pip3 install -U ray[default]=={{ray_version}} && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app
108+
- pip3 install -U ray[default]=={{ray_version}} && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;
109+
sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf; # This is needed for the `-o allow_other` option of `goofys`
109110
- pip3 uninstall sky -y &> /dev/null; pip3 install {{sky_remote_path}}/*.whl && python3 -c "from sky.skylet.ray_patches import patch; patch()" # patch the buggy ray file
110111

111112
# Command to start ray on the head node. You don't need to change this.

0 commit comments

Comments
 (0)