-
Notifications
You must be signed in to change notification settings - Fork 892
Expand file tree
/
Copy pathDockerfile.jetson
More file actions
340 lines (305 loc) · 14 KB
/
Dockerfile.jetson
File metadata and controls
340 lines (305 loc) · 14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
# =============================================================================
# ACE-Step 1.5 — NVIDIA Jetson Dockerfile
# =============================================================================
#
# Builds ACE-Step 1.5 with GPU acceleration for NVIDIA Jetson platforms.
#
# Supported hardware:
# - Jetson Orin Nano (4/8 GB unified memory)
# - Jetson Orin NX (8/16 GB)
# - Jetson AGX Orin (32/64 GB)
# - Jetson Xavier NX / AGX Xavier (JetPack 5.x — see JetPack 5 note below)
#
# Requirements:
# - JetPack 6.x installed on the Jetson (L4T R36.x)
# - NVIDIA Container Runtime (`nvidia-docker2` or `nvidia-container-toolkit`)
# - Docker with BuildKit (Docker >= 20.10)
#
# Build:
# docker build -f Dockerfile.jetson -t acestep-jetson .
#
# Run (Gradio UI — default, models pre-loaded at startup):
# docker run --runtime nvidia -it --rm \
# -p 7860:7860 \
# -v $(pwd)/checkpoints:/app/checkpoints \
# -v $(pwd)/gradio_outputs:/app/gradio_outputs \
# acestep-jetson
#
# Run (REST API server):
# docker run --runtime nvidia -it --rm \
# -p 8001:8001 \
# -v $(pwd)/checkpoints:/app/checkpoints \
# -e ACESTEP_MODE=api \
# acestep-jetson
#
# Run without pre-initialization (deferred to UI "Initialize" button):
# docker run --runtime nvidia -it --rm \
# -p 7860:7860 \
# -v $(pwd)/checkpoints:/app/checkpoints \
# -e ACESTEP_INIT_SERVICE=false \
# acestep-jetson
#
# ---- JetPack 5.x (Xavier) ----
# Override build args for JetPack 5:
# docker build -f Dockerfile.jetson \
# --build-arg L4T_VERSION=r35.5.0 \
# -t acestep-jetson-jp5 .
#
# =============================================================================
# ==================== Build arguments ====================
# L4T JetPack image tag — must match your Jetson's JetPack installation.
# JetPack 6.2 → r36.4.0 | JetPack 6.1 → r36.3.0 | JetPack 6.0 → r36.2.0
# The l4t-jetpack image includes CUDA toolkit, cuDNN and TensorRT.
ARG L4T_VERSION=r36.4.0
# ==================== Base image ====================
FROM nvcr.io/nvidia/l4t-jetpack:${L4T_VERSION}
# FIX: DEBIAN_FRONTEND is a build-time-only setting. Declaring it as a stage
# ARG (instead of ENV) still suppresses interactive debconf prompts in every
# RUN below, but no longer leaks into the runtime environment of containers
# started from this image.
ARG DEBIAN_FRONTEND=noninteractive
# Locale defaults — these ARE wanted at runtime (Python/Gradio text handling).
ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8
# ==================== System packages ====================
# NOTE: We use the system Python 3.10 (shipped with Ubuntu 22.04 / L4T) because
# NVIDIA's Jetson AI Lab only publishes PyTorch wheels for cp310.
# The ACE-Step codebase is compatible with Python 3.10.
#
# Package set (alphabetized for diffability) covers: general build tooling,
# audio processing (libsndfile), FFmpeg 7 source-build deps (nasm/yasm and the
# x264/x265/lame/opus/vorbis codec headers), and BLAS/LAPACK + gfortran for
# scipy/numpy on aarch64. Heredoc RUN requires BuildKit — already a stated
# requirement of this Dockerfile (and used again for the entrypoint below).
RUN <<EOF
set -e
apt-get update
apt-get install -y --no-install-recommends \
    build-essential \
    cmake \
    curl \
    gfortran \
    git \
    liblapack-dev \
    libmp3lame-dev \
    libopenblas-dev \
    libopus-dev \
    libsndfile1 \
    libsndfile1-dev \
    libvorbis-dev \
    libx264-dev \
    libx265-dev \
    nasm \
    pkg-config \
    python3-dev \
    python3-venv \
    software-properties-common \
    wget \
    yasm
rm -rf /var/lib/apt/lists/*
EOF
# ==================== FFmpeg 7 (from source) ====================
# torchcodec 0.10.0 requires FFmpeg 7 shared libraries (libavfilter.so.10,
# libavcodec.so.61, etc.). Ubuntu 22.04 ships FFmpeg 4.4 which is too old.
# We build a minimal FFmpeg 7.1 with shared libs and install to /usr/local.
ARG FFMPEG_VERSION=7.1
# FIX: the previous sanity check ran `ffmpeg -version`, but this build uses
# --disable-programs, so no `ffmpeg` binary is ever installed — the check
# always fell through to its harmless fallback echo and verified nothing.
# We now assert the shared library actually landed in /usr/local and FAIL
# the build if it is missing.
RUN cd /tmp \
    && curl -fSL "https://ffmpeg.org/releases/ffmpeg-${FFMPEG_VERSION}.tar.xz" -o ffmpeg.tar.xz \
    && tar xf ffmpeg.tar.xz \
    && cd ffmpeg-${FFMPEG_VERSION} \
    && ./configure \
        --prefix=/usr/local \
        --enable-shared \
        --disable-static \
        --enable-gpl \
        --enable-libx264 \
        --enable-libx265 \
        --enable-libmp3lame \
        --enable-libopus \
        --enable-libvorbis \
        --disable-doc \
        --disable-programs \
    && make -j$(nproc) \
    && make install \
    && ldconfig \
    && cd /tmp && rm -rf ffmpeg* \
    && test -e /usr/local/lib/libavcodec.so \
    && echo "FFmpeg ${FFMPEG_VERSION} shared libraries installed"
# Ensure 'python' -> python3 symlink exists.
RUN ln -sf /usr/bin/python3 /usr/bin/python
# Bootstrap pip and install a modern numpy (base image ships 1.21 for 3.10).
# FIX: the old form `curl -sS … | python3` had two failure-masking problems:
#   1. without -f, curl "succeeds" on an HTTP error and pipes the error page
#      into the interpreter;
#   2. without pipefail, a curl failure feeds python3 an empty script, which
#      exits 0 and silently skips the bootstrap.
# Downloading to a file with -f, then executing it, makes both failure modes
# abort the build.
RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
    && python3 /tmp/get-pip.py \
    && rm -f /tmp/get-pip.py \
    && pip install --no-cache-dir --upgrade pip setuptools wheel \
    && pip install --no-cache-dir "numpy>=1.24"
# ==================== PyTorch — Jetson-optimised wheels ====================
# Installed from NVIDIA's Jetson AI Lab pip index which provides aarch64
# wheels compiled specifically for Jetson GPUs (SM 8.7 / Orin architecture).
# Standard PyPI cu126 wheels do NOT include SM 8.7 kernels and will fail
# with "no kernel image is available for execution on the device".
ARG JETSON_PIP_INDEX=https://pypi.jetson-ai-lab.io/jp6/cu126/+simple/
# nvidia-cudss-cu12 provides libcudss.so.0 which torch 2.9+ requires at import.
# IMPORTANT: nvidia-cudss-cu12 from PyPI pulls in nvidia-cublas-cu12 and
# nvidia-cuda-runtime-cu12 for a *newer* CUDA (12.9). These conflict with the
# CUDA 12.6 system libs shipped in l4t-jetpack and cause CUBLAS_STATUS errors.
# We keep ONLY the cudss .so and remove the conflicting cublas/cuda-runtime pkgs
# so that torch uses the system CUDA 12.6 libraries at runtime.
ENV NVIDIA_PYTHON_LIBS=/usr/local/lib/python3.10/dist-packages/nvidia
ENV LD_LIBRARY_PATH="${NVIDIA_PYTHON_LIBS}/cu12/lib:${LD_LIBRARY_PATH}"
# FIX: with `&&`/`||` left-associativity, the old trailing `|| echo WARNING`
# rescued a failure of ANY earlier step — including the torch pip install
# itself — so a broken install still produced a "successful" build. Likewise
# the bare `pip uninstall … || true` could swallow a failed cudss install.
# Both best-effort fallbacks are now grouped with { … } so each applies ONLY
# to the step it was written for; a failed pip install aborts the build.
RUN pip install --no-cache-dir nvidia-cudss-cu12 \
    && { pip uninstall -y nvidia-cublas-cu12 nvidia-cuda-runtime-cu12 \
           nvidia-cusparse-cu12 nvidia-nvjitlink-cu12 2>/dev/null || true; } \
    && echo "${NVIDIA_PYTHON_LIBS}/cu12/lib" > /etc/ld.so.conf.d/nvidia-cudss.conf \
    && ldconfig \
    && pip install --no-cache-dir \
        "torch==2.9.1" "torchvision==0.24.1" "torchaudio==2.9.1" \
        --index-url ${JETSON_PIP_INDEX} \
    && { python -c "import torch; print(f'PyTorch {torch.__version__} CUDA avail: {torch.cuda.is_available()} Archs: {torch.cuda.get_arch_list()}')" \
         || echo "WARNING: torch import check failed (expected during build without GPU — will work at runtime with --runtime nvidia)"; }
# torchcodec — torchaudio 2.9+ uses it as the default audio decoder.
# The prebuilt Jetson AI Lab wheel has an ABI mismatch: it links against
# desktop NVDEC (libnvcuvid.so.1), which does not exist on Jetson, so it
# crashes at import. Instead we compile v0.10.0 from source with CUDA decode
# disabled (ENABLE_CUDA=0) — CPU FFmpeg decode is all we need for audio.
# pybind11 must be present at build time.
# HARD REQUIREMENT: the image build aborts if torchcodec cannot be compiled.
ARG TORCHCODEC_VERSION=v0.10.0
RUN <<EOF
set -e
pip install --no-cache-dir pybind11
cd /tmp
git clone --depth 1 --branch "${TORCHCODEC_VERSION}" https://github.com/pytorch/torchcodec.git
cd torchcodec
# Point CMake at pybind11 and pkg-config at our FFmpeg 7 build in /usr/local.
export CMAKE_PREFIX_PATH="$(python -c 'import pybind11; print(pybind11.get_cmake_dir())'):${CMAKE_PREFIX_PATH}"
export PKG_CONFIG_PATH="/usr/local/lib/pkgconfig:${PKG_CONFIG_PATH}"
export ENABLE_CUDA=0
export I_CONFIRM_THIS_IS_NOT_A_LICENSE_VIOLATION=1
pip install --no-cache-dir --no-build-isolation .
cd /tmp
rm -rf torchcodec
# Import check — fails the layer (and thus the build) if compilation broke.
python -c "import torchcodec; print(f'torchcodec {torchcodec.__version__}')"
EOF
# ==================== Project source ====================
WORKDIR /app
# NOTE(review): this copies the entire build context. Confirm a .dockerignore
# exists that excludes .git, local checkpoints/ and gradio_outputs/ — otherwise
# they bloat the image and bust the cache on every generation run.
COPY . /app/
# ==================== Python dependencies ====================
# Dependencies are listed explicitly instead of `pip install .` because the
# aarch64 markers in pyproject.toml select cu130 wheels (DGX Spark), which are
# incompatible with Jetson's CUDA 12.x stack.
#
# Deliberately NOT installed here (Jetson-incompatible):
#   - torch / torchvision / torchaudio → Jetson AI Lab wheels, installed above
#   - mlx / mlx-lm                     → Apple Silicon only
#   - torchcodec                       → handled separately above
# Core + training + API dependencies, alphabetized for easy diffing.
RUN pip install --no-cache-dir \
    "accelerate>=1.12.0" \
    "bitsandbytes>=0.49.0" \
    "diffusers" \
    "diskcache" \
    "einops>=0.8.1" \
    "fastapi>=0.110.0" \
    "gradio==6.2.0" \
    "lightning>=2.0.0" \
    "loguru>=0.7.3" \
    "lycoris-lora" \
    "matplotlib>=3.7.5" \
    "modelscope" \
    "numba>=0.63.1" \
    "peft>=0.18.0" \
    "pyyaml" \
    "safetensors" \
    "scipy>=1.10.1" \
    "soundfile>=0.13.1" \
    "tensorboard>=2.20.0" \
    "toml" \
    "transformers>=4.51.0,<4.58.0" \
    "typer-slim>=0.21.1" \
    "uvicorn[standard]>=0.27.0" \
    "vector-quantize-pytorch>=1.27.15" \
    "xxhash"
# torchao — DISABLED on Jetson.
# The original diffusers 0.36.0 logger bug is fixed in 0.37.0+, but torchao
# 0.16.0 skips its C++ extensions with torch 2.9.1 ("incompatible torch
# version") making quantization ops non-functional. Since ACE-Step does not
# use torchao quantization, installing it adds noise without benefit.
# Re-evaluate when Jetson AI Lab ships a torch build that torchao supports.
# ==================== Triton + nano-vllm ====================
# Triton aarch64 wheels are available on the Jetson AI Lab index.
# flash-attn is NOT installed: the Jetson AI Lab wheels are compiled against an
# older PyTorch ABI and crash on import with torch 2.9.x (undefined SymInt
# symbols). nano-vllm gracefully falls back to SDPA attention without flash-attn.
# nano-vllm is installed from the bundled source with --no-deps to avoid pulling
# x86-only flash-attn wheels from its pyproject.toml.
# HARD REQUIREMENT: build will fail if nano-vllm cannot be installed.
#
# Triton requires ptxas and cuda.h from the CUDA toolkit — we set the env var
# and create a symlink so triton's nvidia backend can find them.
# FIX: the cuda.h symlink target used to hardcode
# /usr/local/lib/python3.10/dist-packages; we now resolve triton's actual
# install directory from the interpreter, so the link lands correctly even if
# pip installs into a different site directory.
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
RUN pip install --no-cache-dir \
    "triton>=3.4.0" \
    --index-url ${JETSON_PIP_INDEX} \
    && TRITON_DIR="$(python -c 'import triton, os; print(os.path.dirname(triton.__file__))')" \
    && mkdir -p "${TRITON_DIR}/backends/nvidia/include" \
    && ln -sf /usr/local/cuda/include/cuda.h \
        "${TRITON_DIR}/backends/nvidia/include/cuda.h" \
    && pip install --no-cache-dir --no-deps /app/acestep/third_parts/nano-vllm \
    && python -c "import nanovllm; print('nano-vllm OK')"
# ==================== Runtime directories ====================
# Checkpoint and Gradio output dirs — typically bind-mounted at `docker run`.
RUN mkdir -p /app/checkpoints /app/gradio_outputs
# ==================== Jetson environment defaults ====================
# All runtime defaults in a single ENV instruction (one layer):
#   ACESTEP_LLM_BACKEND     "vllm" = nano-vllm with paged KV cache (recommended
#                           for ≥24GB VRAM with the 4B LM model). CUDA graph
#                           capture is automatically disabled on Jetson
#                           (enforce_eager) since SDPA paged-cache decode is
#                           incompatible with graph capture.
#   ACESTEP_API_HOST,
#   GRADIO_SERVER_NAME      bind all interfaces so Docker port-mapping works.
#   ACESTEP_MODE            "gradio" = web UI (default), "api" = REST server.
#   ACESTEP_INIT_SERVICE    "true" auto-initializes models at startup so users
#                           can generate immediately; "false" defers to the
#                           UI "Initialize" button.
#   ACESTEP_CONFIG_PATH     default DiT model (must exist in /app/checkpoints).
#   ACESTEP_LM_MODEL_PATH   default LM — 4B gives best quality on ≥24GB GPUs
#                           (see README); use "acestep-5Hz-lm-0.6B" or
#                           "acestep-5Hz-lm-1.7B" for lower VRAM.
#   TOKENIZERS_PARALLELISM  disabled to avoid fork warnings in containers.
ENV ACESTEP_LLM_BACKEND=vllm \
    ACESTEP_API_HOST=0.0.0.0 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    ACESTEP_MODE=gradio \
    ACESTEP_INIT_SERVICE=true \
    ACESTEP_CONFIG_PATH=acestep-v15-turbo \
    ACESTEP_LM_MODEL_PATH=acestep-5Hz-lm-4B \
    TOKENIZERS_PARALLELISM=false
# ==================== Ports ====================
# 7860 = Gradio web UI | 8001 = REST API server
# EXPOSE is documentation only — publish with `-p 7860:7860` / `-p 8001:8001`.
EXPOSE 7860 8001
# ==================== Health check ====================
# Lightweight probe: the Gradio or API server must be listening.
# Probes the Gradio root first, then the API /health endpoint, so one check
# covers both ACESTEP_MODE values. GRADIO_PORT / ACESTEP_API_PORT are expanded
# by the healthcheck shell at runtime (defaults 7860 / 8001). The generous
# --start-period allows for model pre-loading when ACESTEP_INIT_SERVICE=true.
HEALTHCHECK --interval=60s --timeout=10s --start-period=120s --retries=3 \
CMD curl -sf http://localhost:${GRADIO_PORT:-7860}/ > /dev/null 2>&1 \
|| curl -sf http://localhost:${ACESTEP_API_PORT:-8001}/health > /dev/null 2>&1 \
|| exit 1
# ==================== Entrypoint ====================
# Entrypoint script embedded via a BuildKit COPY heredoc. The quoted 'EOF'
# delimiter prevents any build-time expansion — every ${...} below is resolved
# by bash when the container starts.
COPY <<'EOF' /app/docker-entrypoint.sh
#!/usr/bin/env bash
set -e
# --- Startup banner: mode, interpreter, and GPU visibility ---
echo "==========================================="
echo " ACE-Step 1.5 — NVIDIA Jetson Container"
echo "==========================================="
echo "Mode : ${ACESTEP_MODE}"
echo "Python : $(python --version 2>&1)"
echo "PyTorch : $(python -c 'import torch; print(torch.__version__)' 2>/dev/null || echo 'N/A')"
# CUDA details are only reported when torch can see a GPU; otherwise warn that
# the container is likely missing `--runtime nvidia`.
if python -c 'import torch; assert torch.cuda.is_available()' 2>/dev/null; then
echo "CUDA : $(python -c 'import torch; print(torch.version.cuda)')"
echo "GPU : $(python -c 'import torch; print(torch.cuda.get_device_name(0))')"
echo "Memory : $(python -c 'import torch; p=torch.cuda.get_device_properties(0); print(f"{p.total_memory/1024**3:.1f} GB")')"
else
echo "CUDA : NOT AVAILABLE — running on CPU"
echo " (make sure you launched with --runtime nvidia)"
fi
echo "==========================================="
# Build --init_service flags when ACESTEP_INIT_SERVICE=true
INIT_ARGS=""
if [ "${ACESTEP_INIT_SERVICE:-true}" = "true" ]; then
INIT_ARGS="--init_service true"
[ -n "${ACESTEP_CONFIG_PATH:-}" ] && INIT_ARGS="${INIT_ARGS} --config_path ${ACESTEP_CONFIG_PATH}"
[ -n "${ACESTEP_LM_MODEL_PATH:-}" ] && INIT_ARGS="${INIT_ARGS} --init_llm true --lm_model_path ${ACESTEP_LM_MODEL_PATH}"
echo "Auto-init : DiT=${ACESTEP_CONFIG_PATH:-auto} LM=${ACESTEP_LM_MODEL_PATH:-none}"
fi
# NOTE(review): INIT_ARGS is only forwarded to the Gradio branch below — the
# api branch never receives it. Confirm acestep.api_server self-initializes,
# otherwise the "Auto-init" message above is misleading in api mode.
# `exec` replaces this shell so the python server becomes PID 1 and receives
# SIGTERM directly from `docker stop`. INIT_ARGS / ACESTEP_EXTRA_ARGS are
# deliberately left unquoted so they word-split into separate CLI arguments.
if [ "${ACESTEP_MODE}" = "api" ]; then
echo "Starting REST API server on 0.0.0.0:${ACESTEP_API_PORT:-8001} ..."
exec python -m acestep.api_server \
--host "${ACESTEP_API_HOST:-0.0.0.0}" \
--port "${ACESTEP_API_PORT:-8001}" \
${ACESTEP_EXTRA_ARGS:-}
else
echo "Starting Gradio UI on 0.0.0.0:${GRADIO_PORT:-7860} ..."
# NOTE(review): the fallback backend here is "pt" while the image ENV default
# is "vllm". Inside this image the ENV always wins; the mismatch only matters
# if this script is reused outside Docker — confirm which default is intended.
exec python -m acestep.acestep_v15_pipeline \
--server-name "${GRADIO_SERVER_NAME:-0.0.0.0}" \
--port "${GRADIO_PORT:-7860}" \
--backend "${ACESTEP_LLM_BACKEND:-pt}" \
${INIT_ARGS} \
${ACESTEP_EXTRA_ARGS:-}
fi
EOF
# COPY heredoc files are not executable by default — mark it before use.
RUN chmod +x /app/docker-entrypoint.sh
# Exec-form ENTRYPOINT: no /bin/sh -c wrapper, signals go straight to bash.
ENTRYPOINT ["/app/docker-entrypoint.sh"]