Skip to content

Commit 419afeb

Browse files
author
hhy
committed
[OSS] fix fbgemm LD_LIBRARY_PATH
1 parent c39bd60 commit 419afeb

File tree

1 file changed

+44
-24
lines changed

1 file changed

+44
-24
lines changed

.github/scripts/validate_binaries.sh

+44-24
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@
77

88

99
export PYTORCH_CUDA_PKG=""
10+
export CONDA_ENV="build_binary"
1011

11-
conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}"
12+
conda create -y -n "${CONDA_ENV}" python="${MATRIX_PYTHON_VERSION}"
1213

1314
# Print the Python version of the environment named by CONDA_ENV, so the name
# is not hard-coded separately from the variable used to create the env.
conda run -n "${CONDA_ENV}" python --version
1415

@@ -49,41 +50,60 @@ elif [[ ${MATRIX_CHANNEL} = 'release' ]]; then
4950
export PYTORCH_URL="https://download.pytorch.org/whl/${CUDA_VERSION}"
5051
fi
5152

53+
54+
echo "CU_VERSION: ${CUDA_VERSION}"
55+
echo "MATRIX_CHANNEL: ${MATRIX_CHANNEL}"
56+
echo "CONDA_ENV: ${CONDA_ENV}"
57+
58+
# shellcheck disable=SC2155
59+
export CONDA_PREFIX=$(conda run -n "${CONDA_ENV}" printenv CONDA_PREFIX)
60+
61+
# Debug aid: list every path whose name contains "cuda". The pattern is quoted
# so find receives it literally; unquoted, the shell would first expand it
# against files in the current directory.
find / -name '*cuda*'
62+
63+
if [[ $CUDA_VERSION = cu* ]]; then
64+
# Setting LD_LIBRARY_PATH fixes the runtime error with fbgemm_gpu not
65+
# being able to locate libnvrtc.so
66+
echo "[NOVA] Setting LD_LIBRARY_PATH ..."
67+
# Store LD_LIBRARY_PATH as a config var of the conda env; the env name is
# quoted so it is passed as a single word.
conda env config vars set -n "${CONDA_ENV}" \
68+
LD_LIBRARY_PATH="/usr/local/lib:/usr/lib64:${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}"
69+
fi
70+
71+
5272
# install pytorch
5373
# switch back to conda once torch nightly is fixed
5474
# if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
5575
# export PYTORCH_CUDA_PKG="pytorch-cuda=${MATRIX_GPU_ARCH_VERSION}"
5676
# fi
57-
conda run -n build_binary pip install torch --index-url "$PYTORCH_URL"
77+
conda run -n "${CONDA_ENV}" pip install torch --index-url "$PYTORCH_URL"
5878

5979
# install fbgemm
60-
conda run -n build_binary pip install fbgemm-gpu --index-url "$PYTORCH_URL"
80+
conda run -n "${CONDA_ENV}" pip install fbgemm-gpu --index-url "$PYTORCH_URL"
6181

6282
# install requirements from pypi
63-
conda run -n build_binary pip install torchmetrics==1.0.3
83+
conda run -n "${CONDA_ENV}" pip install torchmetrics==1.0.3
6484

6585
# install torchrec
66-
conda run -n build_binary pip install torchrec --index-url "$PYTORCH_URL"
86+
conda run -n "${CONDA_ENV}" pip install torchrec --index-url "$PYTORCH_URL"
6787

6888
# Run small import test
69-
conda run -n build_binary python -c "import torch; import fbgemm_gpu; import torchrec"
89+
conda run -n "${CONDA_ENV}" python -c "import torch; import fbgemm_gpu; import torchrec"
7090

7191
# check directory
7292
ls -R
7393

7494
# check if cuda available
75-
conda run -n build_binary python -c "import torch; print(torch.cuda.is_available())"
95+
conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.cuda.is_available())"
7696

7797
# check cuda version
78-
conda run -n build_binary python -c "import torch; print(torch.version.cuda)"
98+
conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.version.cuda)"
7999

80100
# Finally run smoke test
81101
# python 3.11 needs torchx-nightly
82-
conda run -n build_binary pip install torchx-nightly iopath
102+
conda run -n "${CONDA_ENV}" pip install torchx-nightly iopath
83103
if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
84-
conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
104+
conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
85105
else
86-
conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only
106+
conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only
87107
fi
88108

89109

@@ -93,31 +113,31 @@ if [[ ${MATRIX_CHANNEL} != 'release' ]]; then
93113
exit 0
94114
else
95115
# Check version matches only for release binaries
96-
torchrec_version=$(conda run -n build_binary pip show torchrec | grep Version | cut -d' ' -f2)
97-
fbgemm_version=$(conda run -n build_binary pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
116+
torchrec_version=$(conda run -n "${CONDA_ENV}" pip show torchrec | grep Version | cut -d' ' -f2)
117+
fbgemm_version=$(conda run -n "${CONDA_ENV}" pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
98118

99119
if [ "$torchrec_version" != "$fbgemm_version" ]; then
100120
echo "Error: TorchRec package version does not match FBGEMM package version"
101121
exit 1
102122
fi
103123
fi
104124

105-
conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}"
125+
conda create -y -n "${CONDA_ENV}" python="${MATRIX_PYTHON_VERSION}"
106126

107-
conda run -n build_binary python --version
127+
conda run -n "${CONDA_ENV}" python --version
108128

109129
if [[ ${MATRIX_GPU_ARCH_VERSION} != '12.4' ]]; then
110130
exit 0
111131
fi
112132

113133
echo "checking pypi release"
114-
conda run -n build_binary pip install torch
115-
conda run -n build_binary pip install fbgemm-gpu
116-
conda run -n build_binary pip install torchrec
134+
conda run -n "${CONDA_ENV}" pip install torch
135+
conda run -n "${CONDA_ENV}" pip install fbgemm-gpu
136+
conda run -n "${CONDA_ENV}" pip install torchrec
117137

118138
# Check version matching again for PyPI
119-
torchrec_version=$(conda run -n build_binary pip show torchrec | grep Version | cut -d' ' -f2)
120-
fbgemm_version=$(conda run -n build_binary pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
139+
torchrec_version=$(conda run -n "${CONDA_ENV}" pip show torchrec | grep Version | cut -d' ' -f2)
140+
fbgemm_version=$(conda run -n "${CONDA_ENV}" pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
121141

122142
if [ "$torchrec_version" != "$fbgemm_version" ]; then
123143
echo "Error: TorchRec package version does not match FBGEMM package version"
@@ -128,13 +148,13 @@ fi
128148
ls -R
129149

130150
# check if cuda available
131-
conda run -n build_binary python -c "import torch; print(torch.cuda.is_available())"
151+
conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.cuda.is_available())"
132152

133153
# check cuda version
134-
conda run -n build_binary python -c "import torch; print(torch.version.cuda)"
154+
conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.version.cuda)"
135155

136156
# python 3.11 needs torchx-nightly
137-
conda run -n build_binary pip install torchx-nightly iopath
157+
conda run -n "${CONDA_ENV}" pip install torchx-nightly iopath
138158

139159
# Finally run smoke test
140-
conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
160+
conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py

0 commit comments

Comments
 (0)