Skip to content

Commit 8e9ce20

Browse files
committed
fix on
1 parent 91d20b0 commit 8e9ce20

File tree

17 files changed

+3587
-0
lines changed

17 files changed

+3587
-0
lines changed

.ci/pytorch/build.sh

Lines changed: 412 additions & 0 deletions
Large diffs are not rendered by default.

.ci/pytorch/common-build.sh

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/bin/bash
# Common build-time setup shared by the CI build scripts: configures sccache
# and ccache (when present) and registers EXIT handlers that dump their stats.
# Required environment variables:
#   $BUILD_ENVIRONMENT (should be set by your Docker image)
# Relies on trap_add from common_utils.sh being in scope when this is sourced.

if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then
  # Save the absolute path in case later we chdir (as occurs in the gpu perf test)
  script_dir="$( cd "$(dirname "${BASH_SOURCE[0]}")" || exit ; pwd -P )"

  # `command -v` is the portable, POSIX way to probe for a binary.
  if command -v sccache > /dev/null; then
    # Restart from a clean server state and a fresh error log.
    sccache --stop-server > /dev/null 2>&1 || true
    rm -f ~/sccache_error.log || true

    # Dump the sccache error log and stats at exit so failed builds are
    # debuggable even when sccache itself is the culprit.
    function sccache_epilogue() {
      echo "::group::Sccache Compilation Log"
      echo '=================== sccache compilation log ==================='
      python "$script_dir/print_sccache_log.py" ~/sccache_error.log 2>/dev/null || true
      echo '=========== If your build fails, please take a look at the log above for possible reasons ==========='
      sccache --show-stats
      sccache --stop-server || true
      echo "::endgroup::"
    }

    # Register the function here so that the error log can be printed even when
    # sccache fails to start, i.e. timeout error
    trap_add sccache_epilogue EXIT

    if [[ -n "${SKIP_SCCACHE_INITIALIZATION:-}" ]]; then
      # sccache --start-server seems to hang forever on self hosted runners for GHA
      # so let's just go ahead and skip the --start-server altogether since it seems
      # as though sccache still gets used even when the sccache server isn't started
      # explicitly
      echo "Skipping sccache server initialization, setting environment variables"
      export SCCACHE_IDLE_TIMEOUT=0
      export SCCACHE_ERROR_LOG=~/sccache_error.log
      export RUST_LOG=sccache::server=error
    elif [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
      SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 sccache --start-server
    else
      # SCCACHE_IDLE_TIMEOUT=0 disables idle shutdown so long gaps (e.g. while
      # extension_backend_test.cpp builds) don't kill the server. See:
      # https://github.com/pytorch/pytorch/pull/16645
      SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 RUST_LOG=sccache::server=error sccache --start-server
    fi

    # Report sccache stats for easier debugging. It's ok if this command
    # times out and fails on MacOS
    sccache --zero-stats || true
  fi

  if command -v ccache > /dev/null; then
    # Report ccache stats for easier debugging
    ccache --zero-stats
    ccache --show-stats
    # Show final cache stats when the build exits.
    function ccache_epilogue() {
      ccache --show-stats
    }
    trap_add ccache_epilogue EXIT
  fi
fi

.ci/pytorch/common.sh

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/bash
2+
3+
# Common setup for all Jenkins scripts
4+
# shellcheck source=./common_utils.sh
5+
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
6+
set -ex
7+
8+
# Required environment variables:
9+
# $BUILD_ENVIRONMENT (should be set by your Docker image)
10+
11+
# Figure out which Python to use for ROCm
12+
if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
13+
# HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors
14+
unset HIP_PLATFORM
15+
export PYTORCH_TEST_WITH_ROCM=1
16+
# temporary to locate some kernel issues on the CI nodes
17+
export HSAKMT_DEBUG_LEVEL=4
18+
# improve rccl performance for distributed tests
19+
export HSA_FORCE_FINE_GRAIN_PCIE=1
20+
fi
21+
22+
# TODO: Renable libtorch testing for MacOS, see https://github.com/pytorch/pytorch/issues/62598
23+
# shellcheck disable=SC2034
24+
BUILD_TEST_LIBTORCH=0

.ci/pytorch/common_utils.sh

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
#!/bin/bash
2+
3+
# Common util **functions** that can be sourced in other scripts.
4+
5+
# note: printf is used instead of echo to avoid backslash
6+
# processing and to properly handle values that begin with a '-'.
7+
8+
# Print the argument list verbatim on stdout; printf (rather than echo)
# avoids backslash interpretation and copes with values starting with '-'.
log() { printf '%s\n' "$*"; }
# Same message shape as log, prefixed with "ERROR: " and sent to stderr.
error() { printf 'ERROR: %s\n' "$*" >&2; }
# Report the error, then abort the whole script with status 1.
fatal() { error "$@"; exit 1; }
11+
12+
retry () {
13+
"$@" || (sleep 10 && "$@") || (sleep 20 && "$@") || (sleep 40 && "$@")
14+
}
15+
16+
# compositional trap taken from https://stackoverflow.com/a/7287873/23845
# appends a command to a trap
#
# - 1st arg: code to add
# - remaining args: names of traps to modify
#
trap_add() {
  trap_add_cmd=$1; shift || fatal "${FUNCNAME[0]} usage error"
  for trap_add_name in "$@"; do
    trap -- "$(
      # helper fn to get existing trap command from output
      # of trap -p ("trap -- 'cmd' NAME" -- the command is word $3)
      extract_trap_cmd() { printf '%s\n' "$3"; }
      # print existing trap command with newline
      eval "extract_trap_cmd $(trap -p "${trap_add_name}")"
      # print the new trap command
      printf '%s\n' "${trap_add_cmd}"
    )" "${trap_add_name}" \
      || fatal "unable to add to trap ${trap_add_name}"
  done
}
# set the trace attribute for the above function. this is
# required to modify DEBUG or RETURN traps because functions don't
# inherit them unless the trace attribute is set
declare -f -t trap_add
41+
42+
function assert_git_not_dirty() {
  # Fail the build if it left modified tracked files in the checkout.
  # rocm and xla builds rewrite sources in-place, so they are exempt.
  # TODO: we should add an option to `build_amd.py` that reverts the repo to
  # an unmodified state.
  case "$BUILD_ENVIRONMENT" in
    *rocm*|*xla*) return 0 ;;
  esac
  # Untracked third_party entries are expected and ignored.
  git_status=$(git status --porcelain | grep -v '?? third_party' || true)
  if [[ -n "$git_status" ]]; then
    echo "Build left local git repository checkout dirty"
    echo "git status --porcelain:"
    echo "${git_status}"
    exit 1
  fi
}
55+
56+
function pip_install_whl() {
  # Install PyTorch and other build-artifact wheels from local paths without
  # using any network connection (--no-index/--no-deps).
  # Accepts separate path arguments, a single argument holding a
  # space-separated list of paths, or any mix of the two. (The original
  # implementation silently dropped extra arguments whenever the first
  # argument contained spaces.)
  local arg path paths
  for arg in "$@"; do
    # Split each argument on spaces so "a.whl b.whl" passed as one
    # argument still installs both wheels.
    IFS=' ' read -r -a paths <<< "$arg"
    for path in "${paths[@]}"; do
      echo "Installing $path"
      python3 -mpip install --no-index --no-deps "$path"
    done
  done
}
80+
81+
82+
function pip_install() {
  # Try up to three times with --progress-bar off (old versions of pip
  # don't have the "--progress-bar" flag), then up to three times without
  # it; preserve the last pip exit status if every attempt fails.
  local attempt rc=0
  for attempt in 1 2 3; do
    if pip install --progress-bar off "$@"; then return 0; else rc=$?; fi
  done
  for attempt in 1 2 3; do
    if pip install "$@"; then return 0; else rc=$?; fi
  done
  return "$rc"
}
88+
89+
function pip_uninstall() {
  # Attempt the uninstall twice, tolerating a flaky first try; preserve
  # the second attempt's exit status when both fail.
  local attempt rc=0
  for attempt in 1 2; do
    if pip uninstall -y "$@"; then return 0; else rc=$?; fi
  done
  return "$rc"
}
93+
94+
function get_exit_code() {
95+
set +e
96+
"$@"
97+
retcode=$?
98+
set -e
99+
return $retcode
100+
}
101+
102+
function get_bazel() {
103+
# Download and use the cross-platform, dependency-free Python
104+
# version of Bazelisk to fetch the platform specific version of
105+
# Bazel to use from .bazelversion.
106+
retry curl --location --output tools/bazel \
107+
https://raw.githubusercontent.com/bazelbuild/bazelisk/v1.16.0/bazelisk.py
108+
shasum --algorithm=1 --check \
109+
<(echo 'd4369c3d293814d3188019c9f7527a948972d9f8 tools/bazel')
110+
chmod u+x tools/bazel
111+
}
112+
113+
# This function is bazel specific because of the bug
114+
# in the bazel that requires some special paths massaging
115+
# as a workaround. See
116+
# https://github.com/bazelbuild/bazel/issues/10167
117+
function install_sccache_nvcc_for_bazel() {
118+
sudo mv /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc-real
119+
120+
# Write the `/usr/local/cuda/bin/nvcc`
121+
cat << EOF | sudo tee /usr/local/cuda/bin/nvcc
122+
#!/bin/sh
123+
if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
124+
exec sccache /usr/local/cuda/bin/nvcc "\$@"
125+
else
126+
exec external/local_cuda/cuda/bin/nvcc-real "\$@"
127+
fi
128+
EOF
129+
130+
sudo chmod +x /usr/local/cuda/bin/nvcc
131+
}
132+
133+
function install_monkeytype {
134+
# Install MonkeyType
135+
pip_install MonkeyType
136+
}
137+
138+
139+
function get_pinned_commit() {
  # Print the commit hash that CI has pinned for the dependency named $1
  # (read from .github/ci_commit_pins/<name>.txt).
  local dep=$1
  cat ".github/ci_commit_pins/${dep}.txt"
}
142+
143+
function install_torchaudio() {
  # Build and install torchaudio from source at the CI-pinned commit.
  # Passing "cuda" as $1 selects a fixed CUDA arch list for the build.
  local commit url
  commit=$(get_pinned_commit audio)
  url="git+https://github.com/pytorch/audio.git@${commit}"
  if [[ "$1" == "cuda" ]]; then
    # TODO: This is better to be passed as a parameter from _linux-test workflow
    # so that it can be consistent with what is set in build
    TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install --no-use-pep517 --user "$url"
  else
    pip_install --no-use-pep517 --user "$url"
  fi

}
155+
156+
function install_torchtext() {
  # torchtext depends on torchdata; build both from source at their
  # CI-pinned commits.
  local data_commit text_commit
  data_commit=$(get_pinned_commit data)
  text_commit=$(get_pinned_commit text)
  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/data.git@${data_commit}"
  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/text.git@${text_commit}"
}
164+
165+
function install_torchvision() {
166+
local orig_preload
167+
local commit
168+
commit=$(get_pinned_commit vision)
169+
orig_preload=${LD_PRELOAD}
170+
if [ -n "${LD_PRELOAD}" ]; then
171+
# Silence dlerror to work-around glibc ASAN bug, see https://sourceware.org/bugzilla/show_bug.cgi?id=27653#c9
172+
echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c -
173+
LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so
174+
fi
175+
pip_install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git@${commit}"
176+
if [ -n "${LD_PRELOAD}" ]; then
177+
LD_PRELOAD=${orig_preload}
178+
fi
179+
}
180+
181+
function install_tlparse() {
  # Install tlparse (structured-log parser), pinned for reproducibility.
  pip_install --user "tlparse==0.3.7"
  # A --user install lands under the user base; make its bin/ reachable.
  local user_base
  user_base="$(python -m site --user-base)"
  PATH="${user_base}/bin:$PATH"
}
185+
186+
function install_torchrec_and_fbgemm() {
  # Build torchrec and FBGEMM-GPU from source at their CI-pinned commits,
  # replacing any preinstalled nightly wheels.
  local torchrec_commit fbgemm_commit
  torchrec_commit=$(get_pinned_commit torchrec)
  fbgemm_commit=$(get_pinned_commit fbgemm)
  pip_uninstall torchrec-nightly
  pip_uninstall fbgemm-gpu-nightly
  # Build-time helpers required by both packages.
  pip_install setuptools-git-versioning scikit-build pyre-extensions
  # See https://github.com/pytorch/pytorch/issues/106971
  CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 --user "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu"
  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
}
198+
199+
function clone_pytorch_xla() {
200+
if [[ ! -d ./xla ]]; then
201+
git clone --recursive --quiet https://github.com/pytorch/xla.git
202+
pushd xla
203+
# pin the xla hash so that we don't get broken by changes to xla
204+
git checkout "$(cat ../.github/ci_commit_pins/xla.txt)"
205+
git submodule sync
206+
git submodule update --init --recursive
207+
popd
208+
fi
209+
}
210+
211+
function checkout_install_torchbench() {
  # Clone pytorch/benchmark at the CI-pinned commit and install its models.
  # With arguments: install only the named models; without: install all.
  local commit
  commit=$(get_pinned_commit torchbench)
  git clone https://github.com/pytorch/benchmark torchbench
  pushd torchbench
  git checkout "$commit"

  # `[[ -n "${1:-}" ]]` (rather than `[ "$1" ]`) stays safe if this file is
  # ever sourced under `set -u` and no model argument is given.
  if [[ -n "${1:-}" ]]; then
    python install.py --continue_on_fail models "$@"
  else
    # Occasionally the installation may fail on one model but it is ok to continue
    # to install and test other models
    python install.py --continue_on_fail
  fi
  echo "Print all dependencies after TorchBench is installed"
  python -mpip freeze
  popd
}
229+
230+
function print_sccache_stats() {
  # Print sccache statistics; when a GHA job id is available, also persist
  # them as JSON keyed by build environment and job id.
  echo 'PyTorch Build Statistics'
  sccache --show-stats

  if [[ -z "${OUR_GITHUB_JOB_ID}" ]]; then
    echo "env var OUR_GITHUB_JOB_ID not set, will not write sccache stats to json"
  else
    sccache --show-stats --stats-format json | jq .stats \
      > "sccache-stats-${BUILD_ENVIRONMENT}-${OUR_GITHUB_JOB_ID}.json"
  fi
}

0 commit comments

Comments
 (0)