[docker] Build CUDA kernels in separate Docker stage for faster rebuilds (#29452)

Signed-off-by: Amr Mahdi <amrmahdi@meta.com>
Author: Amr Mahdi
Date: 2025-12-03 03:41:53 -08:00
Committed by: GitHub
parent 78f4bb0ba8
commit f5d3d93c40

4 changed files with 74 additions and 11 deletions


@@ -150,8 +150,8 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BASE BUILD IMAGE ####################

-#################### WHEEL BUILD IMAGE ####################
-FROM base AS build
+#################### CSRC BUILD IMAGE ####################
+FROM base AS csrc-build
 ARG TARGETPLATFORM
 ARG PIP_INDEX_URL UV_INDEX_URL
@@ -172,10 +172,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
+WORKDIR /workspace
+
+COPY pyproject.toml setup.py CMakeLists.txt ./
+COPY cmake cmake/
+COPY csrc csrc/
+COPY vllm/envs.py vllm/envs.py
+COPY vllm/__init__.py vllm/__init__.py

 # max jobs used by Ninja to build extensions
 ARG max_jobs=2
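
The selective COPY above is the core of the speedup: the csrc-build stage now depends only on the kernel sources and build scripts, so Docker's layer cache survives edits to the rest of the Python tree. As a rough illustration (not part of this commit), the stage can also be pre-built on its own; the `docker/Dockerfile` path below is assumed from vLLM's repo layout:

```python
import subprocess

# Hypothetical helper (not from this PR): pre-build only the csrc-build
# stage so subsequent full builds reuse the cached CUDA kernel layers.
# Stage name comes from this diff; adjust the Dockerfile path as needed.
subprocess.run(
    [
        "docker", "build",
        "--target", "csrc-build",   # stop after the kernel-build stage
        "-f", "docker/Dockerfile",
        ".",
    ],
    check=True,
)
```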
@@ -195,9 +198,11 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
 ARG VLLM_USE_PRECOMPILED=""
 ARG VLLM_MAIN_CUDA_VERSION=""

+# Use dummy version for csrc-build wheel (only .so files are extracted, version doesn't matter)
+ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=.git,target=.git \
     if [ "$USE_SCCACHE" = "1" ]; then \
         echo "Installing sccache..." \
         && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
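
Setting SETUPTOOLS_SCM_PRETEND_VERSION is what lets the next hunk drop the .git bind mount: setuptools_scm returns the pretended version without probing the VCS at all. A minimal sketch of that documented behavior (assuming setuptools_scm is installed):

```python
import os

# setuptools_scm documents SETUPTOOLS_SCM_PRETEND_VERSION as an override
# that bypasses VCS detection entirely, so no .git directory is required.
os.environ["SETUPTOOLS_SCM_PRETEND_VERSION"] = "0.0.0+csrc.build"

from setuptools_scm import get_version

print(get_version())  # -> "0.0.0+csrc.build", even outside a git repo
```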
@@ -223,7 +228,6 @@ ENV VLLM_TARGET_DEVICE=${vllm_target_device}
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=.git,target=.git \
     if [ "$USE_SCCACHE" != "1" ]; then \
         # Clean any existing CMake artifacts
         rm -rf .deps && \
@@ -232,6 +236,52 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         export VLLM_DOCKER_BUILD_CONTEXT=1 && \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
+#################### CSRC BUILD IMAGE ####################
+
+#################### WHEEL BUILD IMAGE ####################
+FROM base AS build
+ARG TARGETPLATFORM
+ARG PIP_INDEX_URL UV_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+
+# install build dependencies
+COPY requirements/build.txt requirements/build.txt
+
+# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
+# Reference: https://github.com/astral-sh/uv/pull/1694
+ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+# Use copy mode to avoid hardlink failures with Docker cache mounts
+ENV UV_LINK_MODE=copy
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
+    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+
+WORKDIR /workspace
+
+COPY --from=csrc-build /workspace/dist /precompiled-wheels
+
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
+
+ARG vllm_target_device="cuda"
+ENV VLLM_TARGET_DEVICE=${vllm_target_device}
+
+# Skip adding +precompiled suffix to version (preserves git-derived version)
+ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=.git,target=.git \
+    if [ "${vllm_target_device}" = "cuda" ]; then \
+        export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl); \
+    fi && \
+    python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
+
 # Install DeepGEMM from source
 ARG DEEPGEMM_GIT_REF
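
The final RUN is where the two stages meet: it points VLLM_PRECOMPILED_WHEEL_LOCATION at the wheel built by csrc-build, so setup.py extracts the already-compiled .so files instead of recompiling the CUDA kernels. Roughly, the shell one-liner does the equivalent of this Python sketch (paths and flags taken from the RUN step above):

```python
import glob
import os
import subprocess

# Sketch of the RUN step above: pick the single wheel produced by the
# csrc-build stage and hand it to setup.py via the environment.
wheels = glob.glob("/precompiled-wheels/*.whl")
if os.environ.get("VLLM_TARGET_DEVICE", "cuda") == "cuda" and wheels:
    os.environ["VLLM_PRECOMPILED_WHEEL_LOCATION"] = wheels[0]

subprocess.run(
    ["python3", "setup.py", "bdist_wheel",
     "--dist-dir=dist", "--py-limited-api=cp38"],
    check=True,
)
```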

Binary image file changed (not shown): 146 KiB before, 174 KiB after.


@@ -461,14 +461,22 @@ class precompiled_wheel_utils:
"vllm/cumem_allocator.abi3.so",
]
compiled_regex = re.compile(
flash_attn_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
triton_kernels_regex = re.compile(
r"vllm/third_party/triton_kernels/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
file_members = list(
filter(lambda x: x.filename in files_to_copy, wheel.filelist)
)
file_members += list(
filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
filter(lambda x: flash_attn_regex.match(x.filename), wheel.filelist)
)
file_members += list(
filter(
lambda x: triton_kernels_regex.match(x.filename), wheel.filelist
)
)
for file in file_members:
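
Both regexes intentionally skip dotted path segments, so only non-hidden .py files under the vendored vllm_flash_attn and triton_kernels trees are carried over from the precompiled wheel. A quick check of what the pattern accepts and rejects:

```python
import re

flash_attn_regex = re.compile(
    r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)

# Matches normal (possibly nested) Python sources...
assert flash_attn_regex.match("vllm/vllm_flash_attn/flash_attn_interface.py")
assert flash_attn_regex.match("vllm/vllm_flash_attn/layers/rotary.py")
# ...but rejects hidden files and files inside hidden directories.
assert not flash_attn_regex.match("vllm/vllm_flash_attn/.hidden.py")
assert not flash_attn_regex.match("vllm/vllm_flash_attn/.cache/x.py")
```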
@@ -648,7 +656,7 @@ def get_vllm_version() -> str:
     if envs.VLLM_TARGET_DEVICE == "empty":
         version += f"{sep}empty"
     elif _is_cuda():
-        if envs.VLLM_USE_PRECOMPILED:
+        if envs.VLLM_USE_PRECOMPILED and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
             version += f"{sep}precompiled"
         else:
             cuda_version = str(get_nvcc_cuda_version())
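
Net effect on version strings: a plain VLLM_USE_PRECOMPILED build is tagged +precompiled, while the Docker build sets the new skip flag so the wheel keeps its git-derived version despite reusing precompiled kernels. Condensed into a standalone sketch (the real function also appends a CUDA-version suffix in the else branch):

```python
def precompiled_suffix(use_precompiled: bool, skip_suffix: bool) -> str:
    # Mirrors the branch in get_vllm_version() above: the "+precompiled"
    # local-version tag is appended only when precompiled artifacts are
    # used AND the skip flag is unset.
    return "+precompiled" if use_precompiled and not skip_suffix else ""

assert precompiled_suffix(True, False) == "+precompiled"   # default precompiled build
assert precompiled_suffix(True, True) == ""                # Docker wheel build
assert precompiled_suffix(False, False) == ""              # full source build
```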


@@ -78,6 +78,7 @@ if TYPE_CHECKING:
     MAX_JOBS: str | None = None
     NVCC_THREADS: str | None = None
     VLLM_USE_PRECOMPILED: bool = False
+    VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX: bool = False
     VLLM_DOCKER_BUILD_CONTEXT: bool = False
     VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
@@ -462,6 +463,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     .lower()
     in ("1", "true")
     or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
+    # If set, skip adding +precompiled suffix to version string
+    "VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX": lambda: bool(
+        int(os.environ.get("VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX", "0"))
+    ),
     # Used to mark that setup.py is running in a Docker build context,
     # in order to force the use of precompiled binaries.
     "VLLM_DOCKER_BUILD_CONTEXT": lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "")