# Copyright 1999-2026 Gentoo Authors # Distributed under the terms of the GNU General Public License v2 EAPI=8 DISTUTILS_USE_PEP517=setuptools PYTHON_COMPAT=( python3_{12..14} ) DISTUTILS_SINGLE_IMPL=1 ROCM_VERSION=7.2 inherit distutils-r1 pypi rocm # Commit pinned by cmake/external_projects/vllm_flash_attn.cmake (GIT_TAG). # Pre-staged so we can patch out FA3's unconditional-build quirk before # vllm's CMake FetchContent reaches it. Bump in lockstep with vllm # bumps that change the pin. VLLM_FA_COMMIT="f5bc33cfc02c744d24a2e9d50e6db656de40611c" DESCRIPTION="High-throughput, memory-efficient inference and serving engine for LLMs" HOMEPAGE=" https://github.com/vllm-project/vllm https://docs.vllm.ai/ https://pypi.org/project/vllm/ " SRC_URI+=" cuda? ( https://github.com/vllm-project/flash-attention/archive/${VLLM_FA_COMMIT}.tar.gz -> vllm-flash-attn-${VLLM_FA_COMMIT:0:7}.gh.tar.gz ) " LICENSE="Apache-2.0" SLOT="0" KEYWORDS="~amd64" IUSE="cpu cuda rocm" # VLLM_TARGET_DEVICE is single-valued; cpu, cuda, and rocm paths are # mutually exclusive. Default (none) → empty target. REQUIRED_USE=" ?? ( cpu cuda rocm ) rocm? ( || ( ${ROCM_REQUIRED_USE} ) ) " # USE=cpu (default off): build with VLLM_TARGET_DEVICE=cpu so the # Python entrypoints can actually drive inference on CPU hardware. # Pulls torchaudio + numba (vllm's cpu.txt also lists intel-openmp on # x86_64, but Intel ships it as a proprietary blob — we omit it; vllm # falls back to the pthreads OpenMP shipped with sci-libs/openblas etc.) # # CAVEAT (historical): ::gentoo sci-ml/pytorch's caffe2::mkl public # link interface used to drag MKL's MPI / cluster libs (scalapack, # cdft, blacs_intelmpi) and Intel-OpenMP threading (intel_thread) # into every consumer link, breaking the build on hosts without # Intel Cluster Edition + Compiler. We pin >=sci-ml/caffe2-2.11.0-r90 # below — this overlay's r90 fork ships a scrub patch on # cmake/public/mkl.cmake that filters those libs and forces # gnu_thread. 
Drop the pin once an equivalent upstream fix lands. # # USE=cuda: build with VLLM_TARGET_DEVICE=cuda. Pulls torchaudio + # torchvision + numba and the full Tier-0..5 CUDA stack (flashinfer # + tilelang + nvidia-cutlass-dsl + cuda-bindings + nvidia-cudnn- # frontend + ...). Compiles the _C / _moe_C / _vllm_fa* CUDA C++ # extensions in setup.py via nvcc and the system CUDA toolkit at # /opt/cuda. CMAKE_CUDA_HOST_COMPILER is pinned to the gcc-15 slot # below — CUDA 13.2's nvcc rejects __GNUC__>15 via host_config.h # (see feedback_cuda_13_host_compiler_gcc_15.md). FetchContent of # CUTLASS / spdlog / etc. happens during the vllm CMake build, so # RESTRICT="cuda? ( network-sandbox )" mirrors the cpu? pattern. # # CAVEAT (historical): same MKL-MPI link pollution as USE=cpu — # ::gentoo sci-ml/pytorch with USE=mkl exported MKL MPI / cluster # libs in its public link interface, breaking the cumem_allocator # extension's link step on partial-MKL hosts. Fixed by the # >=sci-ml/caffe2-2.11.0-r90 pin below: this overlay's r90 fork # scrubs those libs from caffe2::mkl. Without that pin, all 339 # CUDA-compiled objects (_C / _moe_C / _vllm_fa2/3 extensions) # would still build cleanly but the final cumem_allocator link # would fail with "cannot find -lmkl_scalapack_ilp64". # # USE=rocm: build with VLLM_TARGET_DEVICE=rocm. Pulls torchaudio + # torchvision + numba + the runai-streamer/tensorizer/conch-triton # trio from upstream's requirements/rocm.txt, plus the HIP libs that # vllm's CMake `enable_language(HIP)` and the linked libtorch_hip # resolve at link time (hipBLAS / hipBLASLt / hipFFT / hipRAND / # hipSOLVER / hipSPARSE / hipCUB). Compiles the _C / _moe_C / _rocm_C # extensions and csrc/rocm/*.cu via hipcc and the system ROCm # toolchain at /opt/rocm. Inherits sci-ml/caffe2's MKL-MPI scrub # (>=2.11.0-r90) — same link-pollution caveat as the cuda path. # PYTORCH_ROCM_ARCH is derived from AMDGPU_TARGETS via rocm.eclass's # get_amdgpu_flags. 
FetchContent of CK / spdlog / etc. happens during # the vllm CMake build, hence RESTRICT="rocm? ( network-sandbox )". # # amd-quark (in requirements/rocm.txt as "for Quark quantization on # ROCm") is deliberately omitted from RDEPEND: no direct `import` from # vllm core code, only used by vllm.model_executor.layers.quantization. # quark internals when Quark-quantized models are loaded. # dev-python/amd-quark-bin in this overlay caps PYTHON_COMPAT at # 3.{11,12}, which would block vllm on 3.13/3.14. Users wanting Quark # quantization install amd-quark-bin separately. # # Upstream requirements/cuda.txt pins nvidia-cutlass-dsl==4.4.2 and # tilelang==0.1.9 exactly; flashinfer-python-0.6.8_p1 in this overlay # already enforces ~4.4.2 transitively but we restate it on the vllm # side so portage can't pick a newer cutlass-dsl if the flashinfer # pin is ever relaxed. nvidia-cudnn-frontend cap (>=1.13.0,<1.19.0) # lives on the flashinfer-python ebuild — vllm has zero direct # cudnn_frontend imports; the cap is for flashinfer's internal use. # # verified 2026-05-16 against vllm-0.21.0 cuda.txt. # # tokenspeed-mla (in requirements/cuda.txt at ==0.1.2 with the comment # "for faster mla with spec decode") is deliberately omitted from # cuda?'s RDEPEND for similar reasons: all imports in vllm core are # lazy and gated by try/except with a clear pip-install hint, the # kernels are Blackwell SM100/SM103-only (irrelevant on Ampere/Hopper # hosts), and the package transitively pulls tokenspeed-triton — a # Triton vendor-fork we'd otherwise have to package as a hard build # dep for a backend most users never enable. Users on Blackwell with # DeepSeek R1 + spec decode install tokenspeed-mla separately. # # verified 2026-05-16: vllm imports clean without it. # gfx1150 (Strix Point iGPU) rocm build verified on # caffe2[rocm,amdgpu_targets_gfx1150,-nccl,-cusparselt] with # AMDGPU_TARGETS=gfx1150. 
Both runs produced four working HIP # extensions (_C, _moe_C, _rocm_C, cumem_allocator) and a clean # `import vllm` from the install tree. # # verified 2026-05-08 for 0.20.1, 2026-05-16 for 0.21.0. # # RTX A4500 Laptop (sm_86 Ampere) cuda build verified on # caffe2-2.11.0-r90 + CUDA-13.2 + CUDAHOSTCXX=g++-15 + MAX_JOBS=4. # Pre-FA3-skip baseline: ~2h30m wallclock, 339 CUDA template files # (FA3 .cu compiled at nvcc's default arch — wasted on Ampere). # Post-FA3-skip (next commit, files/vllm-flash-attn-...-fa3-only- # when-archs.patch): ~1h35m wallclock, 144 CUDA template files. # Peak ~14 GiB RSS in either case (16 GiB free headroom on 31 GiB # host). Smoke test in both shapes: `from vllm import LLM` # succeeds, torch.cuda.is_available() True, torch reports "NVIDIA # RTX A4500 Laptop GPU"; FA2 kernels build for sm_80+PTX (forward- # compat with sm_86); FA3 (Hopper) does NOT build on sm_86 in the # post-patch shape (FA3_AVAILABLE=False at runtime, vllm picks FA2). # # verified 2026-05-17 for 0.21.0 on sm_86 + CUDA 13.2 (both shapes). # # USE=-cpu -cuda -rocm (default): build with VLLM_TARGET_DEVICE=empty # — Python entrypoints import cleanly, backend kernels fail at first # model-load. Useful if you only want the API surface for development. # # media-libs/opencv lower bound: upstream requirements/common.txt says # opencv-python-headless >=4.13.0, ::gentoo tops at 4.12.0. The full # cv2 surface vllm imports — resize, cvtColor, COLOR_BGR2RGB, # CAP_PROP_FRAME_COUNT/FPS/FRAME_WIDTH/FRAME_HEIGHT, VideoCapture incl. # the 3-arg bytes+backend form, VideoWriter, VideoWriter_fourcc, # videoio_registry submodule — is present in 4.12.0; the 4.13 lower # bound upstream is wheel-publication churn, not an API extension. # # verified 2026-05-16 against media-libs/opencv-4.12.0-r1[python]. 
RDEPEND=" ~sci-ml/pytorch-2.11.0[${PYTHON_SINGLE_USEDEP}] >=sci-ml/transformers-4.56.0[${PYTHON_SINGLE_USEDEP}] >=sci-ml/tokenizers-0.21.1[${PYTHON_SINGLE_USEDEP}] >=dev-python/xgrammar-0.2.0[${PYTHON_SINGLE_USEDEP}] =dev-python/requests-2.26.0[${PYTHON_USEDEP}] dev-python/tqdm[${PYTHON_USEDEP}] dev-python/blake3[${PYTHON_USEDEP}] dev-python/py-cpuinfo[${PYTHON_USEDEP}] >=dev-python/protobuf-5.29.6[${PYTHON_USEDEP}] >=dev-python/fastapi-0.115.0[${PYTHON_USEDEP}] >=dev-python/aiohttp-3.13.3[${PYTHON_USEDEP}] >=dev-python/openai-2.0.0[${PYTHON_USEDEP}] >=dev-python/pydantic-2.12.0[${PYTHON_USEDEP}] >=dev-python/prometheus-client-0.18.0[${PYTHON_USEDEP}] dev-python/pillow[${PYTHON_USEDEP}] >=dev-python/prometheus-fastapi-instrumentator-7.0.0[${PYTHON_USEDEP}] >=dev-python/tiktoken-0.6.0[${PYTHON_USEDEP}] ~dev-python/lm-format-enforcer-0.11.3[${PYTHON_USEDEP}] >=dev-python/llguidance-1.3.0[${PYTHON_USEDEP}] =dev-python/diskcache-5.6.3[${PYTHON_USEDEP}] >=dev-python/lark-1.2.2[${PYTHON_USEDEP}] >=dev-python/typing-extensions-4.10[${PYTHON_USEDEP}] >=dev-python/filelock-3.16.1[${PYTHON_USEDEP}] dev-python/partial-json-parser[${PYTHON_USEDEP}] >=dev-python/pyzmq-25.0.0[${PYTHON_USEDEP}] dev-python/msgspec[${PYTHON_USEDEP}] >=dev-python/gguf-0.17.0[${PYTHON_USEDEP}] >=dev-python/mistral-common-1.11.2[${PYTHON_USEDEP},image] >=media-libs/opencv-4.12.0[python,${PYTHON_USEDEP}] dev-python/pyyaml[${PYTHON_USEDEP}] dev-python/six[${PYTHON_USEDEP}] dev-python/einops[${PYTHON_USEDEP}] ~dev-python/depyf-0.20.0[${PYTHON_USEDEP}] dev-python/cloudpickle[${PYTHON_USEDEP}] dev-python/uvloop[${PYTHON_USEDEP}] dev-python/watchfiles[${PYTHON_USEDEP}] dev-python/python-json-logger[${PYTHON_USEDEP}] dev-python/pybase64[${PYTHON_USEDEP}] dev-python/cbor2[${PYTHON_USEDEP}] dev-python/ijson[${PYTHON_USEDEP}] dev-python/setproctitle[${PYTHON_USEDEP}] >=dev-python/openai-harmony-0.0.3[${PYTHON_USEDEP}] >=dev-python/anthropic-0.71.0[${PYTHON_USEDEP}] 
>=dev-python/model-hosting-container-standards-0.1.14[${PYTHON_USEDEP}] =dev-python/opentelemetry-sdk-1.27.0[${PYTHON_USEDEP}] >=dev-python/opentelemetry-api-1.27.0[${PYTHON_USEDEP}] >=dev-python/opentelemetry-exporter-otlp-1.27.0[${PYTHON_USEDEP}] >=dev-python/opentelemetry-semantic-conventions-ai-0.4.1[${PYTHON_USEDEP}] ') cpu? ( >=sci-ml/caffe2-2.11.0-r90 ~sci-ml/torchaudio-2.11.0 $(python_gen_cond_dep ' >=dev-python/numba-0.65.0[${PYTHON_USEDEP}] ') ) cuda? ( >=sci-ml/caffe2-2.11.0-r90 ~sci-ml/torchaudio-2.11.0 ~sci-ml/torchvision-0.26.0[${PYTHON_SINGLE_USEDEP}] ~dev-python/flashinfer-python-0.6.8_p1[${PYTHON_SINGLE_USEDEP}] ~dev-python/tilelang-0.1.9[${PYTHON_SINGLE_USEDEP}] >=dev-python/quack-kernels-0.3.3[${PYTHON_SINGLE_USEDEP}] $(python_gen_cond_dep ' >=dev-python/numba-0.65.0[${PYTHON_USEDEP}] >=dev-python/fastsafetensors-0.2.2[${PYTHON_USEDEP}] ~dev-python/nvidia-cutlass-dsl-4.4.2[${PYTHON_USEDEP}] ') dev-util/nvidia-cuda-toolkit:= ) rocm? ( >=sci-ml/caffe2-2.11.0-r90 ~sci-ml/torchaudio-2.11.0 ~sci-ml/torchvision-0.26.0[${PYTHON_SINGLE_USEDEP}] >=dev-python/runai-model-streamer-bin-0.15.7[${PYTHON_SINGLE_USEDEP}] ~dev-python/tensorizer-2.10.1[${PYTHON_SINGLE_USEDEP}] dev-python/tilelang[${PYTHON_SINGLE_USEDEP}] $(python_gen_cond_dep ' >=dev-python/numba-0.65.0[${PYTHON_USEDEP}] ~dev-python/conch-triton-kernels-1.2.1[${PYTHON_USEDEP}] >=dev-util/amdsmi-7.0.2[${PYTHON_USEDEP}] ') >=dev-util/hip-7.2:= >=sci-libs/hipBLAS-7.2:= >=sci-libs/hipBLASLt-7.2:= >=sci-libs/hipFFT-7.2:= >=sci-libs/hipRAND-7.2:= >=sci-libs/hipSOLVER-7.2:= >=sci-libs/hipSPARSE-7.2:= >=sci-libs/hipCUB-7.2:= ) " # Upstream pyproject.toml caps setuptools at <81.0.0; dropped from # BDEPEND because (a) gentoo only ships 79.0.1 + 82.0.1 (nothing in # the 80.x/81.x line), and downgrading to 79.0.1 fights pkg-resources- # 81.0.0 (which has !/dev/null || die eapply -p0 \ "${FILESDIR}/vllm-flash-attn-${VLLM_FA_COMMIT:0:7}-fa3-only-when-archs.patch" popd >/dev/null || die fi } src_configure() { if 
use cuda; then
		# Exactly one VLLM_TARGET_DEVICE is exported per build: cuda, cpu,
		# rocm, or "empty" when none of the three USE flags is enabled.
		export VLLM_TARGET_DEVICE=cuda

		# Point vllm's cmake FetchContent at our pre-staged + patched
		# flash-attention source instead of re-fetching from github.
		export VLLM_FLASH_ATTN_SRC_DIR="${WORKDIR}/flash-attention-${VLLM_FA_COMMIT}"

		# CUDA 13.2's nvcc rejects gcc>15 via crt/host_config.h; this
		# host's active gcc is 16. Pin nvcc's host compiler to the
		# gcc-15 slot. See feedback_cuda_13_host_compiler_gcc_15.md
		# for the rationale and broader applicability.
		export CUDAHOSTCXX=/usr/bin/x86_64-pc-linux-gnu-g++-15
		export CMAKE_ARGS+=" -DCMAKE_CUDA_HOST_COMPILER=${CUDAHOSTCXX}"

		# vllm's heavy CUDA template instantiations
		# (paged_attention_v*, layernorm_quant_kernels, w8a8/fp8/...)
		# can each peak at 3-4 GiB during cudafe++. With ninja's
		# default 24-way parallelism this OOM-kills on a 31 GiB host
		# (cudafe++ dies with SIGKILL, "[code=9]"). MAX_JOBS is the
		# env var vllm's setup.py reads to throttle the CMake build;
		# CMAKE_BUILD_PARALLEL_LEVEL backs it up for direct cmake
		# --build invocations. Tune this per-host: 31 GiB → 4-6,
		# 54 GiB → 8-10, 128 GiB → ~16. The OOM threshold was measured
		# against 0.20.1; 0.21.0's CUDA template set wasn't re-profiled
		# at bump time but the heavy instantiations (paged_attention,
		# layernorm_quant, w8a8/fp8) are unchanged, so MAX_JOBS=4 stays
		# a conservative default.
		# verified 2026-05-07 against 0.20.1.
		#
		# Caller-overridable so users on smaller/larger hosts can adjust
		# without ebuild-edit (e.g. MAX_JOBS=2 emerge … on a 16 GiB
		# host).
		export MAX_JOBS="${MAX_JOBS:-4}"
		export CMAKE_BUILD_PARALLEL_LEVEL="${CMAKE_BUILD_PARALLEL_LEVEL:-${MAX_JOBS}}"
	elif use cpu; then
		export VLLM_TARGET_DEVICE=cpu
	elif use rocm; then
		export VLLM_TARGET_DEVICE=rocm

		# rocm.eclass turns AMDGPU_TARGETS into a semicolon-joined
		# list. vllm's CMakeLists reads PYTORCH_ROCM_ARCH and feeds
		# it to enable_language(HIP).
		export PYTORCH_ROCM_ARCH="$(get_amdgpu_flags)"

		# Same MAX_JOBS throttle as the cuda branch — HIP template
		# instantiation in csrc/rocm/ (skinny_gemms, attention) hits
		# comparable peak RSS. As in the cuda branch, 4 is only the
		# default: a MAX_JOBS / CMAKE_BUILD_PARALLEL_LEVEL already set
		# in the calling environment is honoured instead of being
		# overwritten.
		export MAX_JOBS="${MAX_JOBS:-4}"
		export CMAKE_BUILD_PARALLEL_LEVEL="${CMAKE_BUILD_PARALLEL_LEVEL:-${MAX_JOBS}}"
	else
		# No backend selected: API surface only, no compiled kernels.
		export VLLM_TARGET_DEVICE=empty
	fi

	distutils-r1_src_configure
}