mirror of
https://github.com/vllm-project/vllm.git
synced 2025-12-06 15:04:47 +08:00
[docker] Build CUDA kernels in separate Docker stage for faster rebuilds (#29452)
Signed-off-by: Amr Mahdi <amrmahdi@meta.com>
This commit is contained in:
@@ -150,8 +150,8 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
|
||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||
#################### BASE BUILD IMAGE ####################
|
||||
|
||||
#################### WHEEL BUILD IMAGE ####################
|
||||
FROM base AS build
|
||||
#################### CSRC BUILD IMAGE ####################
|
||||
FROM base AS csrc-build
|
||||
ARG TARGETPLATFORM
|
||||
|
||||
ARG PIP_INDEX_URL UV_INDEX_URL
|
||||
@@ -172,10 +172,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY pyproject.toml setup.py CMakeLists.txt ./
|
||||
COPY cmake cmake/
|
||||
COPY csrc csrc/
|
||||
COPY vllm/envs.py vllm/envs.py
|
||||
COPY vllm/__init__.py vllm/__init__.py
|
||||
|
||||
# max jobs used by Ninja to build extensions
|
||||
ARG max_jobs=2
|
||||
@@ -195,9 +198,11 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||
ARG VLLM_USE_PRECOMPILED=""
|
||||
ARG VLLM_MAIN_CUDA_VERSION=""
|
||||
|
||||
# Use dummy version for csrc-build wheel (only .so files are extracted, version doesn't matter)
|
||||
ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
|
||||
|
||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
echo "Installing sccache..." \
|
||||
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
|
||||
@@ -223,7 +228,6 @@ ENV VLLM_TARGET_DEVICE=${vllm_target_device}
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
if [ "$USE_SCCACHE" != "1" ]; then \
|
||||
# Clean any existing CMake artifacts
|
||||
rm -rf .deps && \
|
||||
@@ -232,6 +236,52 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||
fi
|
||||
#################### CSRC BUILD IMAGE ####################
|
||||
|
||||
#################### WHEEL BUILD IMAGE ####################
|
||||
FROM base AS build
|
||||
ARG TARGETPLATFORM
|
||||
|
||||
ARG PIP_INDEX_URL UV_INDEX_URL
|
||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
|
||||
# install build dependencies
|
||||
COPY requirements/build.txt requirements/build.txt
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY --from=csrc-build /workspace/dist /precompiled-wheels
|
||||
|
||||
COPY . .
|
||||
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
ARG vllm_target_device="cuda"
|
||||
ENV VLLM_TARGET_DEVICE=${vllm_target_device}
|
||||
|
||||
# Skip adding +precompiled suffix to version (preserves git-derived version)
|
||||
ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
if [ "${vllm_target_device}" = "cuda" ]; then \
|
||||
export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl); \
|
||||
fi && \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
|
||||
|
||||
# Install DeepGEMM from source
|
||||
ARG DEEPGEMM_GIT_REF
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 146 KiB After Width: | Height: | Size: 174 KiB |
14
setup.py
14
setup.py
@@ -461,14 +461,22 @@ class precompiled_wheel_utils:
|
||||
"vllm/cumem_allocator.abi3.so",
|
||||
]
|
||||
|
||||
compiled_regex = re.compile(
|
||||
flash_attn_regex = re.compile(
|
||||
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
|
||||
)
|
||||
triton_kernels_regex = re.compile(
|
||||
r"vllm/third_party/triton_kernels/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
|
||||
)
|
||||
file_members = list(
|
||||
filter(lambda x: x.filename in files_to_copy, wheel.filelist)
|
||||
)
|
||||
file_members += list(
|
||||
filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
|
||||
filter(lambda x: flash_attn_regex.match(x.filename), wheel.filelist)
|
||||
)
|
||||
file_members += list(
|
||||
filter(
|
||||
lambda x: triton_kernels_regex.match(x.filename), wheel.filelist
|
||||
)
|
||||
)
|
||||
|
||||
for file in file_members:
|
||||
@@ -648,7 +656,7 @@ def get_vllm_version() -> str:
|
||||
if envs.VLLM_TARGET_DEVICE == "empty":
|
||||
version += f"{sep}empty"
|
||||
elif _is_cuda():
|
||||
if envs.VLLM_USE_PRECOMPILED:
|
||||
if envs.VLLM_USE_PRECOMPILED and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
|
||||
version += f"{sep}precompiled"
|
||||
else:
|
||||
cuda_version = str(get_nvcc_cuda_version())
|
||||
|
||||
@@ -78,6 +78,7 @@ if TYPE_CHECKING:
|
||||
MAX_JOBS: str | None = None
|
||||
NVCC_THREADS: str | None = None
|
||||
VLLM_USE_PRECOMPILED: bool = False
|
||||
VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX: bool = False
|
||||
VLLM_DOCKER_BUILD_CONTEXT: bool = False
|
||||
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
|
||||
VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
|
||||
@@ -462,6 +463,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
.lower()
|
||||
in ("1", "true")
|
||||
or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
|
||||
# If set, skip adding +precompiled suffix to version string
|
||||
"VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX": lambda: bool(
|
||||
int(os.environ.get("VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX", "0"))
|
||||
),
|
||||
# Used to mark that setup.py is running in a Docker build context,
|
||||
# in order to force the use of precompiled binaries.
|
||||
"VLLM_DOCKER_BUILD_CONTEXT": lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "")
|
||||
|
||||
Reference in New Issue
Block a user