## Global Args #################################################################
- ARG BASE_UBI_IMAGE_TAG=9.4-1181
- ARG PROTOC_VERSION=25.2
+ ARG BASE_UBI_IMAGE_TAG=latest
+ ARG PROTOC_VERSION=25.3
ARG PYTORCH_INDEX="https://download.pytorch.org/whl"
# ARG PYTORCH_INDEX="https://download.pytorch.org/whl/nightly"
+ ARG AUTO_GPTQ_VERSION=0.7.1

# match PyTorch version that was used to compile flash-attention v2 pre-built wheels
# e.g. flash-attn v2.5.2 => torch ['1.12.1', '1.13.1', '2.0.1', '2.1.2', '2.2.0', '2.3.0.dev20240126']
# https://github.com/Dao-AILab/flash-attention/blob/v2.5.2/.github/workflows/publish.yml#L47
# use nightly build index for torch .dev pre-release versions
- ARG PYTORCH_VERSION=2.2.0
+ ARG PYTORCH_VERSION=2.2.1

ARG PYTHON_VERSION=3.11

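Note: before bumping PYTORCH_VERSION, the torch/flash-attn pairing can be sanity-checked by listing the wheel names published for the target flash-attention tag, since each wheel name encodes the torch version it was compiled against. A minimal sketch, assuming the GitHub releases API and flash-attention's usual wheel naming:

# wheel names look like flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-...whl
curl -s https://api.github.com/repos/Dao-AILab/flash-attention/releases/tags/v2.5.6 \
  | grep -o 'flash_attn-[^"]*\.whl' | sort -u
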
@@ -35,18 +36,19 @@ ENV LANG=C.UTF-8 \
## CUDA Base ###################################################################
FROM base as cuda-base

- ENV CUDA_VERSION=11.8.0 \
-     NV_CUDA_LIB_VERSION=11.8.0-1 \
+ # Ref: https://docs.nvidia.com/cuda/archive/12.1.0/cuda-toolkit-release-notes/
+ ENV CUDA_VERSION=12.1.0 \
+     NV_CUDA_LIB_VERSION=12.1.0-1 \
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
-     NV_CUDA_CUDART_VERSION=11.8.89-1 \
-     NV_CUDA_COMPAT_VERSION=520.61.05-1
+     NV_CUDA_CUDART_VERSION=12.1.55-1 \
+     NV_CUDA_COMPAT_VERSION=530.30.02-1

RUN dnf config-manager \
    --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
    && dnf install -y \
-     cuda-cudart-11-8-${NV_CUDA_CUDART_VERSION} \
-     cuda-compat-11-8-${NV_CUDA_COMPAT_VERSION} \
+     cuda-cudart-12-1-${NV_CUDA_CUDART_VERSION} \
+     cuda-compat-12-1-${NV_CUDA_COMPAT_VERSION} \
    && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
    && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
    && dnf clean all
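Note: cuda-compat-12-1 supplies forward-compatibility libraries so the CUDA 12.1 userspace can run on hosts whose driver predates CUDA 12.1. A quick runtime check, as a sketch (assumes the NVIDIA container toolkit exposes nvidia-smi inside the container):

nvidia-smi --query-gpu=driver_version --format=csv,noheader   # host driver version
ldconfig -p | grep libcuda.so                                 # compat libcuda should resolve on the loader path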
@@ -56,53 +58,35 @@ ENV CUDA_HOME="/usr/local/cuda" \
    LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"


- ## CUDA Runtime ################################################################
- FROM cuda-base as cuda-runtime
-
- ENV NV_NVTX_VERSION=11.8.86-1 \
-     NV_LIBNPP_VERSION=11.8.0.86-1 \
-     NV_LIBCUBLAS_VERSION=11.11.3.6-1 \
-     NV_LIBNCCL_PACKAGE_VERSION=2.15.5-1+cuda11.8
-
- RUN dnf config-manager \
-     --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
-     && dnf install -y \
-     cuda-libraries-11-8-${NV_CUDA_LIB_VERSION} \
-     cuda-nvtx-11-8-${NV_NVTX_VERSION} \
-     libnpp-11-8-${NV_LIBNPP_VERSION} \
-     libcublas-11-8-${NV_LIBCUBLAS_VERSION} \
-     libnccl-${NV_LIBNCCL_PACKAGE_VERSION} \
-     && dnf clean all
-
-
## CUDA Development ############################################################
FROM cuda-base as cuda-devel

- ENV NV_CUDA_CUDART_DEV_VERSION=11.8.89-1 \
-     NV_NVML_DEV_VERSION=11.8.86-1 \
-     NV_LIBCUBLAS_DEV_VERSION=11.11.3.6-1 \
-     NV_LIBNPP_DEV_VERSION=11.8.0.86-1 \
-     NV_LIBNCCL_DEV_PACKAGE_VERSION=2.15.5-1+cuda11.8
+ # Ref: https://developer.nvidia.com/nccl/nccl-legacy-downloads
+ ENV NV_CUDA_CUDART_DEV_VERSION=12.1.55-1 \
+     NV_NVML_DEV_VERSION=12.1.55-1 \
+     NV_LIBCUBLAS_DEV_VERSION=12.1.0.26-1 \
+     NV_LIBNPP_DEV_VERSION=12.0.2.50-1 \
+     NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.3-1+cuda12.1

RUN dnf config-manager \
    --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
    && dnf install -y \
-     cuda-command-line-tools-11-8-${NV_CUDA_LIB_VERSION} \
-     cuda-libraries-devel-11-8-${NV_CUDA_LIB_VERSION} \
-     cuda-minimal-build-11-8-${NV_CUDA_LIB_VERSION} \
-     cuda-cudart-devel-11-8-${NV_CUDA_CUDART_DEV_VERSION} \
-     cuda-nvml-devel-11-8-${NV_NVML_DEV_VERSION} \
-     libcublas-devel-11-8-${NV_LIBCUBLAS_DEV_VERSION} \
-     libnpp-devel-11-8-${NV_LIBNPP_DEV_VERSION} \
+     cuda-command-line-tools-12-1-${NV_CUDA_LIB_VERSION} \
+     cuda-libraries-devel-12-1-${NV_CUDA_LIB_VERSION} \
+     cuda-minimal-build-12-1-${NV_CUDA_LIB_VERSION} \
+     cuda-cudart-devel-12-1-${NV_CUDA_CUDART_DEV_VERSION} \
+     cuda-nvml-devel-12-1-${NV_NVML_DEV_VERSION} \
+     libcublas-devel-12-1-${NV_LIBCUBLAS_DEV_VERSION} \
+     libnpp-devel-12-1-${NV_LIBNPP_DEV_VERSION} \
    libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
    && dnf clean all

ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"


## Rust builder ################################################################
- # Specific debian version so that compatible glibc version is used
- FROM rust:1.76-bullseye as rust-builder
+ # Using bookworm for compilation so the rust binaries get linked against libssl.so.3
+ FROM rust:1.78-bookworm as rust-builder
ARG PROTOC_VERSION

ENV CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
@@ -180,6 +164,9 @@ RUN cd server && \
    make gen-server && \
    pip install ".[accelerate]" --no-cache-dir

+ # temp: install a newer transformers lib than the one optimum pins
+ RUN pip install transformers==4.40.0 tokenizers==0.19.1 --no-cache-dir
+
# Patch codegen model changes into transformers
RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py

@@ -218,12 +205,12 @@ ENV PATH=/opt/tgis/bin/:$PATH
# Install specific version of torch
RUN pip install ninja==1.11.1.1 --no-cache-dir
RUN pip install packaging --no-cache-dir
- RUN pip install torch==$PYTORCH_VERSION+cu118 --index-url "${PYTORCH_INDEX}/cu118" --no-cache-dir
+ RUN pip install torch==$PYTORCH_VERSION+cu121 --index-url "${PYTORCH_INDEX}/cu121" --no-cache-dir


## Build flash attention v2 ####################################################
FROM python-builder as flash-att-v2-builder
- ARG FLASH_ATT_VERSION=v2.5.2
+ ARG FLASH_ATT_VERSION=v2.5.6

WORKDIR /usr/src/flash-attention-v2

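Note: a one-line smoke test of the pinned wheel in this builder stage, as a sketch:

# torch.version.cuda should report 12.1 for a +cu121 build
python -c "import torch; print(torch.__version__, torch.version.cuda)"
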
@@ -237,14 +224,15 @@ RUN MAX_JOBS=2 pip --verbose wheel --no-deps flash-attn==${FLASH_ATT_VERSION} \


## Install auto-gptq ###########################################################
- FROM python-builder as auto-gptq-installer
- ARG AUTO_GPTQ_REF=ccb6386ebfde63c17c45807d38779a93cd25846f
-
- WORKDIR /usr/src/auto-gptq-wheel
-
- # numpy is required to run auto-gptq's setup.py
- RUN pip install numpy
- RUN DISABLE_QIGEN=1 pip wheel git+https://github.com/AutoGPTQ/AutoGPTQ@${AUTO_GPTQ_REF} --no-cache-dir --no-deps --verbose
+ ## Uncomment if a custom autogptq build is required
+ # FROM python-builder as auto-gptq-installer
+ # ARG AUTO_GPTQ_REF=896d8204bc89a7cfbda42bf3314e13cf4ce20b02
+ #
+ # WORKDIR /usr/src/auto-gptq-wheel
+ #
+ # # numpy is required to run auto-gptq's setup.py
+ # RUN pip install numpy
+ # RUN DISABLE_QIGEN=1 pip wheel git+https://github.com/AutoGPTQ/AutoGPTQ@${AUTO_GPTQ_REF} --no-cache-dir --no-deps --verbose

## Build libraries #############################################################
FROM python-builder as build
@@ -254,75 +242,76 @@ COPY server/custom_kernels/ /usr/src/.
RUN cd /usr/src && python setup.py build_ext && python setup.py install


- ## Build transformers exllama kernels ##########################################
- FROM python-builder as exllama-kernels-builder
-
- WORKDIR /usr/src
-
- COPY server/exllama_kernels/ .
- RUN python setup.py build
-
-
- ## Build transformers exllamav2 kernels ########################################
- FROM python-builder as exllamav2-kernels-builder
-
- WORKDIR /usr/src
-
- COPY server/exllamav2_kernels/ .
- RUN python setup.py build
-
-
## Flash attention v2 cached build image #######################################
FROM base as flash-att-v2-cache

# Copy just the wheels we built for flash-attention
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2 /usr/src/flash-attention-v2


- ## Auto gptq cached build image
- FROM base as auto-gptq-cache
+ ## Auto gptq cached build image ################################################
+ ## Uncomment if a custom autogptq build is required
+ # FROM base as auto-gptq-cache
+ #
+ # # Copy just the wheel we built for auto-gptq
+ # COPY --from=auto-gptq-installer /usr/src/auto-gptq-wheel /usr/src/auto-gptq-wheel

- # Copy just the wheel we built for auto-gptq
- COPY --from=auto-gptq-installer /usr/src/auto-gptq-wheel /usr/src/auto-gptq-wheel
-
+
+ ## Full set of python installations for server release #########################
+
+ FROM python-builder as python-installations

- ## Final Inference Server image ################################################
- FROM cuda-runtime as server-release
ARG PYTHON_VERSION
+ ARG AUTO_GPTQ_VERSION
ARG SITE_PACKAGES=/opt/tgis/lib/python${PYTHON_VERSION}/site-packages

- # Install C++ compiler (required at runtime when PT2_COMPILE is enabled)
- RUN dnf install -y gcc-c++ git && dnf clean all \
-     && useradd -u 2000 tgis -m -g 0
-
- SHELL ["/bin/bash", "-c"]
-
COPY --from=build /opt/tgis /opt/tgis

+ # `pip` is installed in the venv here
ENV PATH=/opt/tgis/bin:$PATH

# Install flash attention v2 from the cache build
RUN --mount=type=bind,from=flash-att-v2-cache,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir

- # Copy build artifacts from exllama kernels builder
- COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
-
- # Copy build artifacts from exllamav2 kernels builder
- COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
-
# Copy over the auto-gptq wheel and install it
- RUN --mount=type=bind,from=auto-gptq-cache,src=/usr/src/auto-gptq-wheel,target=/usr/src/auto-gptq-wheel \
-     pip install /usr/src/auto-gptq-wheel/*.whl --no-cache-dir
+ # RUN --mount=type=bind,from=auto-gptq-cache,src=/usr/src/auto-gptq-wheel,target=/usr/src/auto-gptq-wheel \
+ #     pip install /usr/src/auto-gptq-wheel/*.whl --no-cache-dir
+
+ # We only need to install a custom-built auto-gptq version if we need a pre-release
+ # or are using a PyTorch nightly version
+ RUN pip install auto-gptq=="${AUTO_GPTQ_VERSION}" --no-cache-dir

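Note: with auto-gptq now coming from PyPI instead of a source build, the pin can be verified cheaply after install; a sketch:

python -c "from importlib.metadata import version; print(version('auto-gptq'))"   # expect 0.7.1
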
# Install server
+ # git is required to pull the fms-extras dependency
+ RUN dnf install -y git && dnf clean all
COPY proto proto
COPY server server
- RUN cd server && make gen-server && pip install ".[accelerate, ibm-fms, onnx-gpu, quantize]" --no-cache-dir
+ # Extra index url is required to install the cuda-12 version of onnxruntime-gpu
+ # Ref: https://onnxruntime.ai/docs/install/#install-onnx-runtime-gpu-cuda-12x
+ RUN cd server && make gen-server && pip install ".[accelerate, ibm-fms, onnx-gpu, quantize]" --no-cache-dir --extra-index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
+
+ # temp: install a newer transformers lib than the one optimum pins
+ RUN pip install transformers==4.40.0 tokenizers==0.19.1 --no-cache-dir

# Patch codegen model changes into transformers 4.35
RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py

+
+ ## Final Inference Server image ################################################
+ FROM base as server-release
+ ARG PYTHON_VERSION
+ ARG SITE_PACKAGES=/opt/tgis/lib/python${PYTHON_VERSION}/site-packages
+
+ # Install C++ compiler (required at runtime when PT2_COMPILE is enabled)
+ RUN dnf install -y gcc-c++ && dnf clean all \
+     && useradd -u 2000 tgis -m -g 0
+
+ # Copy in the full python environment
+ COPY --from=python-installations /opt/tgis /opt/tgis
+
+ ENV PATH=/opt/tgis/bin:$PATH
+
# Print a list of all installed packages and versions
RUN pip list -v --disable-pip-version-check --no-python-version-warning

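A local build and smoke test of the updated image might look like the following (a sketch; the tag is arbitrary and a CUDA 12-capable host with the NVIDIA container toolkit is assumed):

docker build -t tgis:cuda-12.1 .
docker run --rm --gpus all tgis:cuda-12.1 python -c "import torch; print(torch.version.cuda)"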