Skip to content

Commit 34fa74c

Browse files
TroyGardenfacebook-github-bot
authored andcommitted
set LD_LIBRARY_PATH for fbgemm in validate_binaries.sh (#2696)
Summary: # context * to address the error when running github test ``` +++ conda run -n build_binary python -c 'import torch; import fbgemm_gpu; import torchrec' +++ local cmd=run +++ case "$cmd" in +++ __conda_exe run -n build_binary python -c 'import torch; import fbgemm_gpu; import torchrec' +++ /opt/conda/bin/conda run -n build_binary python -c 'import torch; import fbgemm_gpu; import torchrec' ERROR:root:Could not load the library 'fbgemm_gpu_tbe_index_select.so': /lib64/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/fbgemm_gpu_tbe_index_select.so) Traceback (most recent call last): File "<string>", line 1, in <module> File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 62, in <module> _load_library(f"{library}.so") File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 21, in _load_library raise error File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 17, in _load_library main() File "/home/ec2-user/actions-runner/_work/torchrec/torchrec/test-infra/.github/scripts/run_with_env_secrets.py", line 98, in main run_cmd_or_die(f"docker exec -t {container_name} /exec") File "/home/ec2-user/actions-runner/_work/torchrec/torchrec/test-infra/.github/scripts/run_with_env_secrets.py", line 39, in run_cmd_or_die raise RuntimeError(f"Command {cmd} failed with exit code {exit_code}") RuntimeError: Command docker exec -t d5cfe23625bf3b1538b808a1344090ae72ff3977990bc1f780c7a46435a384ec /exec failed with exit code 1 torch.ops.load_library(os.path.join(os.path.dirname(__file__), filename)) File "/opt/conda/envs/build_binary/lib/python3.10/site-packages/torch/_ops.py", line 1357, in load_library ctypes.CDLL(path) File "/opt/conda/envs/build_binary/lib/python3.10/ctypes/__init__.py", line 374, in __init__ self._handle = _dlopen(self._name, mode) OSError: 
/lib64/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /opt/conda/envs/build_binary/lib/python3.10/site-packages/fbgemm_gpu/fbgemm_gpu_tbe_index_select.so) ``` * the issue was previously fixed by D67949409 ([#2671](#2671)) for another test * this diff applies the same fix to the validate_binaries test. # details * previous failures {F1974496108} Differential Revision: D68511145
1 parent 519f193 commit 34fa74c

File tree

2 files changed

+54
-24
lines changed

2 files changed

+54
-24
lines changed

.github/scripts/validate_binaries.sh

+48-24
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@
77

88

99
export PYTORCH_CUDA_PKG=""
10+
export CONDA_ENV="build_binary"
1011

11-
conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}"
12+
conda create -y -n "${CONDA_ENV}" python="${MATRIX_PYTHON_VERSION}"
1213

1314
# Print the interpreter version of the freshly created env; use CONDA_ENV
# (exported above) rather than repeating the literal env name.
conda run -n "${CONDA_ENV}" python --version
1415

@@ -49,41 +50,64 @@ elif [[ ${MATRIX_CHANNEL} = 'release' ]]; then
4950
export PYTORCH_URL="https://download.pytorch.org/whl/${CUDA_VERSION}"
5051
fi
5152

53+
54+
echo "CU_VERSION: ${CUDA_VERSION}"
55+
echo "MATRIX_CHANNEL: ${MATRIX_CHANNEL}"
56+
echo "CONDA_ENV: ${CONDA_ENV}"
57+
58+
# shellcheck disable=SC2155
59+
export CONDA_PREFIX=$(conda run -n "${CONDA_ENV}" printenv CONDA_PREFIX)
60+
61+
# Debug aid: list every path whose name contains "cuda". The pattern is quoted
# so the shell hands it to find verbatim instead of expanding it against the
# current directory first.
find / -name '*cuda*'
62+
63+
if [[ $CUDA_VERSION = cu* ]]; then
64+
# Setting LD_LIBRARY_PATH fixes the runtime error with fbgemm_gpu not
65+
# being able to locate libnvrtc.so
66+
echo "[NOVA] Setting LD_LIBRARY_PATH ..."
67+
# Persist LD_LIBRARY_PATH on the named env; quote the name to avoid
# word-splitting/globbing and to match the other conda invocations.
conda env config vars set -n "${CONDA_ENV}" \
68+
LD_LIBRARY_PATH="/usr/local/lib:/usr/lib64:${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}"
69+
else
70+
echo "[NOVA] Setting LD_LIBRARY_PATH ..."
71+
# Select the env by name (-n): CONDA_ENV holds an env name, whereas -p expects
# a filesystem prefix and would resolve it relative to the current directory.
conda env config vars set -n "${CONDA_ENV}" \
72+
LD_LIBRARY_PATH="/usr/local/lib:/usr/lib64:${LD_LIBRARY_PATH}"
73+
fi
74+
75+
5276
# install pytorch
5377
# switch back to conda once torch nightly is fixed
5478
# if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
5579
# export PYTORCH_CUDA_PKG="pytorch-cuda=${MATRIX_GPU_ARCH_VERSION}"
5680
# fi
57-
conda run -n build_binary pip install torch --index-url "$PYTORCH_URL"
81+
conda run -n "${CONDA_ENV}" pip install torch --index-url "$PYTORCH_URL"
5882

5983
# install fbgemm
60-
conda run -n build_binary pip install fbgemm-gpu --index-url "$PYTORCH_URL"
84+
conda run -n "${CONDA_ENV}" pip install fbgemm-gpu --index-url "$PYTORCH_URL"
6185

6286
# install requirements from pypi
63-
conda run -n build_binary pip install torchmetrics==1.0.3
87+
conda run -n "${CONDA_ENV}" pip install torchmetrics==1.0.3
6488

6589
# install torchrec
66-
conda run -n build_binary pip install torchrec --index-url "$PYTORCH_URL"
90+
conda run -n "${CONDA_ENV}" pip install torchrec --index-url "$PYTORCH_URL"
6791

6892
# Run small import test
69-
conda run -n build_binary python -c "import torch; import fbgemm_gpu; import torchrec"
93+
conda run -n "${CONDA_ENV}" python -c "import torch; import fbgemm_gpu; import torchrec"
7094

7195
# check directory
7296
ls -R
7397

7498
# check if cuda available
75-
conda run -n build_binary python -c "import torch; print(torch.cuda.is_available())"
99+
conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.cuda.is_available())"
76100

77101
# check cuda version
78-
conda run -n build_binary python -c "import torch; print(torch.version.cuda)"
102+
conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.version.cuda)"
79103

80104
# Finally run smoke test
81105
# python 3.11 needs torchx-nightly
82-
conda run -n build_binary pip install torchx-nightly iopath
106+
conda run -n "${CONDA_ENV}" pip install torchx-nightly iopath
83107
if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then
84-
conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
108+
conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
85109
else
86-
conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only
110+
conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only
87111
fi
88112

89113

@@ -93,31 +117,31 @@ if [[ ${MATRIX_CHANNEL} != 'release' ]]; then
93117
exit 0
94118
else
95119
# Check version matches only for release binaries
96-
torchrec_version=$(conda run -n build_binary pip show torchrec | grep Version | cut -d' ' -f2)
97-
fbgemm_version=$(conda run -n build_binary pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
120+
torchrec_version=$(conda run -n "${CONDA_ENV}" pip show torchrec | grep Version | cut -d' ' -f2)
121+
fbgemm_version=$(conda run -n "${CONDA_ENV}" pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
98122

99123
if [ "$torchrec_version" != "$fbgemm_version" ]; then
100124
echo "Error: TorchRec package version does not match FBGEMM package version"
101125
exit 1
102126
fi
103127
fi
104128

105-
conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}"
129+
conda create -y -n "${CONDA_ENV}" python="${MATRIX_PYTHON_VERSION}"
106130

107-
conda run -n build_binary python --version
131+
conda run -n "${CONDA_ENV}" python --version
108132

109133
if [[ ${MATRIX_GPU_ARCH_VERSION} != '12.4' ]]; then
110134
exit 0
111135
fi
112136

113137
echo "checking pypi release"
114-
conda run -n build_binary pip install torch
115-
conda run -n build_binary pip install fbgemm-gpu
116-
conda run -n build_binary pip install torchrec
138+
conda run -n "${CONDA_ENV}" pip install torch
139+
conda run -n "${CONDA_ENV}" pip install fbgemm-gpu
140+
conda run -n "${CONDA_ENV}" pip install torchrec
117141

118142
# Check version matching again for PyPI
119-
torchrec_version=$(conda run -n build_binary pip show torchrec | grep Version | cut -d' ' -f2)
120-
fbgemm_version=$(conda run -n build_binary pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
143+
torchrec_version=$(conda run -n "${CONDA_ENV}" pip show torchrec | grep Version | cut -d' ' -f2)
144+
fbgemm_version=$(conda run -n "${CONDA_ENV}" pip show fbgemm_gpu | grep Version | cut -d' ' -f2)
121145

122146
if [ "$torchrec_version" != "$fbgemm_version" ]; then
123147
echo "Error: TorchRec package version does not match FBGEMM package version"
@@ -128,13 +152,13 @@ fi
128152
ls -R
129153

130154
# check if cuda available
131-
conda run -n build_binary python -c "import torch; print(torch.cuda.is_available())"
155+
conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.cuda.is_available())"
132156

133157
# check cuda version
134-
conda run -n build_binary python -c "import torch; print(torch.version.cuda)"
158+
conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.version.cuda)"
135159

136160
# python 3.11 needs torchx-nightly
137-
conda run -n build_binary pip install torchx-nightly iopath
161+
conda run -n "${CONDA_ENV}" pip install torchx-nightly iopath
138162

139163
# Finally run smoke test
140-
conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py
164+
conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py

.github/workflows/validate-binaries.yml

+6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
name: Validate binaries
22

33
on:
4+
pull_request:
5+
paths-ignore:
6+
- "docs/*"
7+
- "third_party/*"
8+
- .gitignore
9+
- "*.md"
410
workflow_call:
511
inputs:
612
channel:

0 commit comments

Comments
 (0)