# Copyright 1999-2026 Gentoo Authors # Distributed under the terms of the GNU General Public License v2 EAPI=8 DISTUTILS_USE_PEP517=setuptools DISTUTILS_EXT=1 PYTHON_COMPAT=( python3_{12..14} ) DISTUTILS_SINGLE_IMPL=1 ROCM_VERSION=7.2 RUST_MIN_VER="1.89.0" # vllm 0.22.0 ships a Rust frontend binary (vllm-rs) built via # setuptools-rust from the bundled rust/ workspace. Vendor its crate # dependencies (generated from rust/Cargo.lock) rather than relying on a # network-sandbox bypass, per the overlay's Rust+Python convention. The # frontend is opt-in at runtime (VLLM_USE_RUST_FRONTEND=1, default off); # vllm's Python API server stays the default, so the binary is a # performance option, not load-bearing. CRATES=" adler2@2.0.1 ahash@0.8.12 aho-corasick@1.1.4 aligned-vec@0.6.4 aligned@0.4.3 android_system_properties@0.1.5 anes@0.1.6 anstream@0.6.21 anstream@1.0.0 anstyle-parse@0.2.7 anstyle-parse@1.0.0 anstyle-query@1.1.5 anstyle-wincon@3.0.11 anstyle@1.0.13 anyhow@1.0.102 arbitrary@1.4.2 arc-swap@1.9.0 arg_enum_proc_macro@0.3.4 arrayref@0.3.9 arrayvec@0.7.6 as-slice@0.2.1 async-io@2.6.0 async-openai-macros@0.1.1 async-openai@0.33.1 async-trait@0.1.89 asynchronous-codec@0.7.0 asynk-strim-attr-macro@0.1.0 asynk-strim-attr@0.1.0 asynk-strim@0.1.5 atomic-waker@1.1.2 autocfg@1.5.0 av-scenechange@0.14.1 av1-grain@0.2.5 avif-serialize@0.8.8 axum-core@0.5.6 axum@0.8.8 backoff@0.4.0 base64@0.13.1 base64@0.22.1 base64ct@1.8.3 bit-set@0.5.3 bit-set@0.8.0 bit-vec@0.6.3 bit-vec@0.8.0 bit_field@0.10.3 bitflags@2.11.0 bitstream-io@4.10.0 blake3@1.8.5 block-buffer@0.10.4 bstr@1.12.1 built@0.8.0 bumpalo@3.20.2 bytemuck@1.25.0 bytemuck_derive@1.10.2 byteorder-lite@0.1.0 byteorder@1.5.0 bytes@1.11.1 cast@0.3.0 castaway@0.2.4 cc@1.2.56 cfg-if@1.0.4 cfg_aliases@0.2.1 chrono@0.4.44 ciborium-io@0.2.2 ciborium-ll@0.2.2 ciborium@0.2.2 clap@4.5.60 clap_builder@4.5.60 clap_derive@4.5.55 clap_lex@1.0.0 color_quant@1.1.0 colorchoice@1.0.4 compact_str@0.9.0 concurrent-queue@2.5.0 console@0.15.11 console@0.16.2 constant_time_eq@0.4.2 cookie@0.18.1 cookie_store@0.22.1 core-foundation-sys@0.8.7 core-foundation@0.10.1 core-foundation@0.9.4 cpufeatures@0.2.17 cpufeatures@0.3.0 crc32fast@1.5.0 criterion-plot@0.5.0 criterion@0.5.1 crossbeam-deque@0.8.6 crossbeam-epoch@0.9.18 crossbeam-queue@0.3.12 crossbeam-utils@0.8.21 crunchy@0.2.4 crypto-common@0.1.7 daachorse@1.0.0 darling@0.20.11 darling@0.23.0 darling_core@0.20.11 darling_core@0.23.0 darling_macro@0.20.11 darling_macro@0.23.0 dary_heap@0.3.8 der@0.8.0 deranged@0.5.8 derive_builder@0.20.2 derive_builder_core@0.20.2 derive_builder_macro@0.20.2 derive_more-impl@1.0.0 derive_more@1.0.0 digest@0.10.7 dirs-sys@0.5.0 dirs@6.0.0 displaydoc@0.2.5 dissimilar@1.0.11 document-features@0.2.12 dtoa@1.0.11 dyn-clone@1.0.20 easy-ext@1.0.3 educe@0.6.0 either@1.15.0 encode_unicode@1.0.0 encoding_rs@0.8.35 enum-as-inner@0.7.0 enum-ordinalize-derive@4.3.2 enum-ordinalize@4.3.2 env_filter@1.0.1 env_logger@0.11.10 equator-macro@0.4.2 equator@0.4.2 equivalent@1.0.2 errno@0.3.14 esaxx-rs@0.1.10 eventsource-stream@0.2.3 expect-test@1.5.1 exr@1.74.0 fancy-regex@0.13.0 fancy-regex@0.17.0 fast_image_resize@6.0.0 fastokens@0.2.0 fastrand@2.3.0 fax@0.2.6 fax_derive@0.2.0 fdeflate@0.3.7 find-msvc-tools@0.1.9 fixedbitset@0.5.7 flate2@1.1.9 fnv@1.0.7 foldhash@0.1.5 foreign-types-shared@0.1.1 foreign-types@0.3.2 form_urlencoded@1.2.2 fslock@0.2.1 futures-channel@0.3.32 futures-core@0.3.32 futures-executor@0.3.32 futures-io@0.3.32 futures-lite@2.6.1 futures-macro@0.3.32 futures-sink@0.3.32 futures-task@0.3.32 futures-timer@3.0.3 futures-util@0.3.32 futures@0.3.32 generic-array@0.14.7 getopts@0.2.24 getrandom@0.2.17 getrandom@0.3.4 getrandom@0.4.2 gif@0.14.2 h2@0.4.13 half@2.7.1 hashbrown@0.12.3 hashbrown@0.14.5 hashbrown@0.15.5 hashbrown@0.16.1 heck@0.5.0 hermit-abi@0.5.2 hex@0.4.3 hf-hub@0.4.3 hf-hub@0.5.0 hmac@0.12.1 hound@3.5.1 http-body-util@0.1.3 http-body@1.0.1 http@1.4.0 httparse@1.10.1 httpdate@1.0.3 hyper-rustls@0.27.7 hyper-timeout@0.5.2 hyper-tls@0.6.0 hyper-util@0.1.20 hyper@1.8.1 iana-time-zone-haiku@0.1.2 iana-time-zone@0.1.65 icu_collections@2.1.1 icu_locale_core@2.1.1 icu_normalizer@2.1.1 icu_normalizer_data@2.1.1 icu_properties@2.1.2 icu_properties_data@2.1.2 icu_provider@2.1.1 id-arena@2.3.0 ident_case@1.0.1 idna@1.1.0 idna_adapter@1.2.1 image-webp@0.2.4 image@0.25.10 imgref@1.12.0 indexmap@1.9.3 indexmap@2.13.0 indicatif@0.17.11 indicatif@0.18.4 instant@0.1.13 interpolate_name@0.2.4 ipnet@2.12.0 iri-string@0.7.10 is-macro@0.3.7 is-terminal@0.4.17 is_terminal_polyfill@1.70.2 itertools@0.10.5 itertools@0.11.0 itertools@0.14.0 itoa@1.0.17 jiff-static@0.2.23 jiff@0.2.23 jobserver@0.1.34 js-sys@0.3.91 lalrpop-util@0.20.2 lazy_static@1.5.0 leb128fmt@0.1.0 lebe@0.5.3 libc@0.2.183 libfuzzer-sys@0.4.12 libm@0.2.16 libmimalloc-sys@0.1.49 libredox@0.1.14 linux-raw-sys@0.12.1 litemap@0.8.1 litrs@1.0.0 lock_api@0.4.14 log@0.4.29 loop9@0.1.5 lru-slab@0.1.2 macro_rules_attribute-proc_macro@0.2.2 macro_rules_attribute@0.2.2 malachite-base@0.4.22 malachite-bigint@0.2.3 malachite-nz@0.4.22 malachite-q@0.4.22 malachite@0.4.22 matchers@0.2.0 matchit@0.8.4 matrixmultiply@0.3.10 maybe-rayon@0.1.1 memchr@2.8.0 memo-map@0.3.3 mimalloc@0.1.52 mime@0.3.17 mime_guess@2.0.5 minijinja-contrib@2.18.0 minijinja@2.18.0 minimal-lexical@0.2.1 miniz_oxide@0.8.9 mio@1.1.1 monostate-impl@0.1.18 monostate@0.1.18 moxcms@0.8.1 multimap@0.10.1 native-tls@0.2.18 ndarray@0.16.1 ndarray@0.17.2 new_debug_unreachable@1.0.6 no_std_io2@0.9.3 nom@7.1.3 nom@8.0.0 noop_proc_macro@0.3.0 nu-ansi-term@0.50.3 num-bigint@0.4.6 num-complex@0.4.6 num-conv@0.2.0 num-derive@0.4.2 num-integer@0.1.46 num-rational@0.4.2 num-traits@0.2.19 num_cpus@1.17.0 num_threads@0.1.7 number_prefix@0.4.0 once_cell@1.21.3 once_cell_polyfill@1.70.2 onig@6.5.1 onig_sys@69.9.1 oorandom@11.1.5 openai-harmony@0.0.8 openai-protocol@1.6.0 openssl-macros@0.1.1 openssl-probe@0.2.1 openssl-src@300.5.5+3.5.5 openssl-sys@0.9.112 openssl@0.10.76 option-ext@0.2.0 parking@2.2.1 parking_lot@0.12.5 parking_lot_core@0.9.12 paste@1.0.15 pastey@0.1.1 pcre2-sys@0.2.10 pcre2@0.2.11 pem-rfc7468@1.0.0 percent-encoding@2.3.2 petgraph@0.8.3 phf@0.11.3 phf_codegen@0.11.3 phf_generator@0.11.3 phf_shared@0.11.3 pin-project-internal@1.1.11 pin-project-lite@0.2.17 pin-project@1.1.11 pin-utils@0.1.0 pkg-config@0.3.32 plotters-backend@0.3.7 plotters-svg@0.3.7 plotters@0.3.7 png@0.18.1 polling@3.11.0 portable-atomic-util@0.2.6 portable-atomic@1.13.1 potential_utf@0.1.4 powerfmt@0.2.0 ppv-lite86@0.2.21 prettyplease@0.2.37 primal-check@0.3.4 proc-macro-crate@3.5.0 proc-macro-error-attr2@2.0.0 proc-macro-error2@2.0.1 proc-macro2@1.0.106 profiling-procmacros@1.0.17 profiling@1.0.17 prometheus-client-derive-encode@0.5.0 prometheus-client@0.24.0 prost-build@0.14.3 prost-derive@0.14.3 prost-types@0.14.3 prost@0.14.3 pulldown-cmark-to-cmark@22.0.0 pulldown-cmark@0.13.3 pxfm@0.1.29 qoi@0.4.1 quick-error@2.0.1 quinn-proto@0.11.14 quinn-udp@0.5.14 quinn@0.11.9 quote@1.0.45 r-efi@5.3.0 r-efi@6.0.0 rand@0.8.5 rand@0.9.2 rand_chacha@0.3.1 rand_chacha@0.9.0 rand_core@0.6.4 rand_core@0.9.5 rav1e@0.8.1 ravif@0.13.0 rawpointer@0.2.1 rayon-cond@0.4.0 rayon-core@1.13.0 rayon@1.11.0 realfft@3.5.0 redox_syscall@0.5.18 redox_users@0.5.2 ref-cast-impl@1.0.25 ref-cast@1.0.25 regex-automata@0.4.14 regex-syntax@0.8.10 regex@1.12.3 reqwest-eventsource@0.6.0 reqwest@0.12.28 rgb@0.8.53 ring@0.17.14 riptoken@0.3.0 rmp-serde@1.3.1 rmp@0.8.15 rmpv@1.3.1 rubato@0.16.2 rustc-hash@1.1.0 rustc-hash@2.1.1 rustfft@6.4.1 rustix@1.1.4 rustls-native-certs@0.8.3 rustls-pki-types@1.14.0 rustls-webpki@0.103.9 rustls@0.23.37 rustpython-ast@0.4.0 rustpython-parser-core@0.4.0 rustpython-parser-vendored@0.4.0 rustpython-parser@0.4.0 rustversion@1.0.22 ryu@1.0.23 saa@5.5.0 same-file@1.0.6 scc@2.4.0 scc@3.6.9 schannel@0.1.29 schemars@0.8.22 schemars@0.9.0 schemars@1.2.1 schemars_derive@0.8.22 scopeguard@1.2.0 sdd@3.0.10 sdd@4.7.3 secrecy@0.10.3 security-framework-sys@2.17.0 security-framework@3.7.0 semver@1.0.27 serde-json-fmt@0.1.0 serde@1.0.228 serde_bytes@0.11.19 serde_core@1.0.228 serde_default@0.2.0 serde_derive@1.0.228 serde_derive_internals@0.29.1 serde_json@1.0.149 serde_path_to_error@0.1.20 serde_repr@0.1.20 serde_tuple@1.1.3 serde_tuple_macros@1.1.3 serde_urlencoded@0.7.1 serde_with@3.18.0 serde_with_macros@3.18.0 serial_test@3.4.0 serial_test_derive@3.4.0 sha1@0.10.6 sha2@0.10.9 sharded-slab@0.1.7 shlex@1.3.0 signal-hook-registry@1.4.8 simd-adler32@0.3.8 simd_helpers@0.1.0 siphasher@1.0.2 slab@0.4.12 smallvec@1.15.1 smartstring@1.0.1 socket2@0.6.3 socks@0.3.4 spm_precompiled@0.1.4 stable_deref_trait@1.2.1 static_assertions@1.1.0 strength_reduce@0.2.4 strsim@0.11.1 strum@0.27.2 strum_macros@0.27.2 subenum@1.1.3 subtle@2.6.1 syn@1.0.109 syn@2.0.117 sync_wrapper@1.0.2 synstructure@0.13.2 system-configuration-sys@0.6.0 system-configuration@0.7.0 task-local@0.1.1 tekken-rs@0.1.1 tempfile@3.27.0 thiserror-ext-derive@0.3.0 thiserror-ext@0.3.0 thiserror-impl@1.0.69 thiserror-impl@2.0.18 thiserror@1.0.69 thiserror@2.0.18 thread_local@1.1.9 tiff@0.11.3 tiktoken-rs@0.7.0 tiktoken-rs@0.9.1 time-core@0.1.8 time-macros@0.2.27 time@0.3.47 tiny-keccak@2.0.2 tinystr@0.8.2 tinytemplate@1.2.1 tinyvec@1.11.0 tinyvec_macros@0.1.1 tokenizers@0.22.2 tokio-macros@2.6.1 tokio-native-tls@0.3.1 tokio-rustls@0.26.4 tokio-stream@0.1.18 tokio-tungstenite@0.28.0 tokio-util@0.7.18 tokio@1.50.0 toml_datetime@1.1.1+spec-1.1.0 toml_edit@0.25.11+spec-1.1.0 toml_parser@1.1.2+spec-1.1.0 tonic-build@0.14.5 tonic-prost-build@0.14.5 tonic-prost@0.14.5 tonic@0.14.5 tool-parser@1.2.0 tower-http@0.6.8 tower-layer@0.3.3 tower-service@0.3.3 tower@0.5.3 tracing-attributes@0.1.31 tracing-core@0.1.36 tracing-futures@0.2.5 tracing-log@0.2.0 tracing-subscriber@0.3.22 tracing@0.1.44 trait-set@0.3.0 transpose@0.2.3 try-lock@0.2.5 tungstenite@0.28.0 typenum@1.19.0 unic-char-property@0.9.0 unic-char-range@0.9.0 unic-common@0.9.0 unic-emoji-char@0.9.0 unic-ucd-ident@0.9.0 unic-ucd-version@0.9.0 unicase@2.9.0 unicode-ident@1.0.24 unicode-normalization-alignments@0.1.12 unicode-segmentation@1.13.1 unicode-width@0.2.2 unicode-xid@0.2.6 unicode_categories@0.1.1 unicode_names2@1.3.0 unicode_names2_generator@1.3.0 unit-prefix@0.5.2 untrusted@0.9.0 ureq-proto@0.6.0 ureq@2.12.1 ureq@3.3.0 url@2.5.8 utf-8@0.7.6 utf16_iter@1.0.5 utf8-zero@0.8.1 utf8_iter@1.0.4 utf8parse@0.2.2 uuid@1.22.0 v_frame@0.3.9 validator@0.20.0 validator_derive@0.20.0 valuable@0.1.1 vcpkg@0.2.15 version_check@0.9.5 walkdir@2.5.0 want@0.3.1 wasi@0.11.1+wasi-snapshot-preview1 wasip2@1.0.2+wasi-0.2.9 wasip3@0.4.0+wasi-0.3.0-rc-2026-01-06 wasm-bindgen-futures@0.4.64 wasm-bindgen-macro-support@0.2.114 wasm-bindgen-macro@0.2.114 wasm-bindgen-shared@0.2.114 wasm-bindgen@0.2.114 wasm-encoder@0.244.0 wasm-metadata@0.244.0 wasm-streams@0.4.2 wasmparser@0.244.0 web-sys@0.3.91 web-time@1.1.0 webpki-root-certs@1.0.6 webpki-roots@0.26.11 webpki-roots@1.0.6 weezl@0.1.12 win_uds@0.2.2 winapi-i686-pc-windows-gnu@0.4.0 winapi-util@0.1.11 winapi-x86_64-pc-windows-gnu@0.4.0 winapi@0.3.9 windows-core@0.62.2 windows-implement@0.60.2 windows-interface@0.59.3 windows-link@0.2.1 windows-registry@0.6.1 windows-result@0.4.1 windows-strings@0.5.1 windows-sys@0.52.0 windows-sys@0.59.0 windows-sys@0.60.2 windows-sys@0.61.2 windows-targets@0.52.6 windows-targets@0.53.5 windows_aarch64_gnullvm@0.52.6 windows_aarch64_gnullvm@0.53.1 windows_aarch64_msvc@0.52.6 windows_aarch64_msvc@0.53.1 windows_i686_gnu@0.52.6 windows_i686_gnu@0.53.1 windows_i686_gnullvm@0.52.6 windows_i686_gnullvm@0.53.1 windows_i686_msvc@0.52.6 windows_i686_msvc@0.53.1 windows_x86_64_gnu@0.52.6 windows_x86_64_gnu@0.53.1 windows_x86_64_gnullvm@0.52.6 windows_x86_64_gnullvm@0.53.1 windows_x86_64_msvc@0.52.6 windows_x86_64_msvc@0.53.1 winnow@1.0.2 wit-bindgen-core@0.51.0 wit-bindgen-rust-macro@0.51.0 wit-bindgen-rust@0.51.0 wit-bindgen@0.51.0 wit-component@0.244.0 wit-parser@0.244.0 write16@1.0.0 writeable@0.6.2 y4m@0.8.0 yoke-derive@0.8.1 yoke@0.8.1 zerocopy-derive@0.8.42 zerocopy@0.8.42 zerofrom-derive@0.1.6 zerofrom@0.1.6 zeroize@1.8.2 zeromq@0.6.0 zerotrie@0.2.3 zerovec-derive@0.11.2 zerovec@0.11.5 zmij@1.0.21 zune-core@0.5.1 zune-inflate@0.2.54 zune-jpeg@0.5.15 " declare -A GIT_CRATES=( [llm-multimodal]='https://github.com/vllm-project/llm-multimodal;5b558989844d1c7af3e43d0f604069ffd9c06320;llm-multimodal-%commit%' ) # The Rust frontend (vllm-rs) is opt-in at runtime (VLLM_USE_RUST_FRONTEND=1, # default off) and a heavy 600+-crate build, so gate it behind USE=rust rather # than building it for every install. CARGO_OPTIONAL stops the cargo eclass from # auto-adding its BDEPEND/SRC_URI/phase functions; we wire those under rust? # below and call cargo_src_unpack manually. CARGO_OPTIONAL=1 inherit cargo distutils-r1 pypi rocm toolchain-funcs # Commit pinned by cmake/external_projects/vllm_flash_attn.cmake (GIT_TAG). # Pre-staged so we can patch out FA3's unconditional-build quirk before # vllm's CMake FetchContent reaches it. Bump in lockstep with vllm # bumps that change the pin. VLLM_FA_COMMIT="dd62dac706b1cf7895bd99b18c6cb7e7e117ee25" DESCRIPTION="High-throughput, memory-efficient inference and serving engine for LLMs" HOMEPAGE=" https://github.com/vllm-project/vllm https://docs.vllm.ai/ https://pypi.org/project/vllm/ " SRC_URI+=" rust? ( ${CARGO_CRATE_URIS} ) cuda? ( https://github.com/vllm-project/flash-attention/archive/${VLLM_FA_COMMIT}.tar.gz -> vllm-flash-attn-${VLLM_FA_COMMIT:0:7}.gh.tar.gz ) " LICENSE="Apache-2.0" # Dependent crate licenses LICENSE+=" Apache-2.0 BSD-2 BSD CC0-1.0 CDLA-Permissive-2.0 ISC LGPL-3 MIT MPL-2.0 MPL-2.0 UoI-NCSA Unicode-3.0 Unicode-DFS-2016 Unlicense ZLIB " SLOT="0" KEYWORDS="~amd64" IUSE="cpu cuda humming rocm rust" # VLLM_TARGET_DEVICE is single-valued; cpu, cuda, and rocm paths are # mutually exclusive. Default (none) → empty target. USE=rust is # orthogonal — it builds the optional vllm-rs Rust serving frontend # (opt-in at runtime via VLLM_USE_RUST_FRONTEND=1) and combines with any # target. REQUIRED_USE=" ?? ( cpu cuda rocm ) rocm? ( || ( ${ROCM_REQUIRED_USE} ) ) humming? ( cuda ) " # USE=cpu (default off): build with VLLM_TARGET_DEVICE=cpu so the # Python entrypoints can actually drive inference on CPU hardware. # Pulls torchaudio + numba (vllm's cpu.txt also lists intel-openmp on # x86_64, but Intel ships it as a proprietary blob — we omit it; vllm # falls back to the pthreads OpenMP shipped with sci-libs/openblas etc.) # # CAVEAT (historical): ::gentoo sci-ml/pytorch's caffe2::mkl public # link interface used to drag MKL's MPI / cluster libs (scalapack, # cdft, blacs_intelmpi) and Intel-OpenMP threading (intel_thread) # into every consumer link, breaking the build on hosts without # Intel Cluster Edition + Compiler. We pin >=sci-ml/caffe2-2.11.0-r90 # below — this overlay's r90 fork ships a scrub patch on # cmake/public/mkl.cmake that filters those libs and forces # gnu_thread. Drop the pin once an equivalent upstream fix lands. # # USE=cuda: build with VLLM_TARGET_DEVICE=cuda. Pulls torchaudio + # torchvision + numba and the full Tier-0..5 CUDA stack (flashinfer # + tilelang + nvidia-cutlass-dsl + cuda-bindings + nvidia-cudnn- # frontend + ...). Compiles the _C / _moe_C / _vllm_fa* CUDA C++ # extensions in setup.py via nvcc and the system CUDA toolkit at # /opt/cuda. CMAKE_CUDA_HOST_COMPILER is pinned to the gcc-15 slot # below — CUDA 13.2's nvcc rejects __GNUC__>15 via host_config.h. # FetchContent of # CUTLASS / spdlog / etc. happens during the vllm CMake build, so # RESTRICT="cuda? ( network-sandbox )" mirrors the cpu? pattern. # # CAVEAT (historical): same MKL-MPI link pollution as USE=cpu — # ::gentoo sci-ml/pytorch with USE=mkl exported MKL MPI / cluster # libs in its public link interface, breaking the cumem_allocator # extension's link step on partial-MKL hosts. Fixed by the # >=sci-ml/caffe2-2.11.0-r90 pin below: this overlay's r90 fork # scrubs those libs from caffe2::mkl. Without that pin, all 339 # CUDA-compiled objects (_C / _moe_C / _vllm_fa2/3 extensions) # would still build cleanly but the final cumem_allocator link # would fail with "cannot find -lmkl_scalapack_ilp64". # # USE=rocm: build with VLLM_TARGET_DEVICE=rocm. Pulls torchaudio + # torchvision + numba + the runai-streamer/tensorizer/conch-triton # trio from upstream's requirements/rocm.txt, plus the HIP libs that # vllm's CMake `enable_language(HIP)` and the linked libtorch_hip # resolve at link time (hipBLAS / hipBLASLt / hipFFT / hipRAND / # hipSOLVER / hipSPARSE / hipCUB). Compiles the _C / _moe_C / _rocm_C # extensions and csrc/rocm/*.cu via hipcc and the system ROCm # toolchain at /opt/rocm. Inherits sci-ml/caffe2's MKL-MPI scrub # (>=2.11.0-r90) — same link-pollution caveat as the cuda path. # PYTORCH_ROCM_ARCH is derived from AMDGPU_TARGETS via rocm.eclass's # get_amdgpu_flags. FetchContent of CK / spdlog / etc. happens during # the vllm CMake build, hence RESTRICT="rocm? ( network-sandbox )". # # amd-quark (in requirements/rocm.txt as "for Quark quantization on # ROCm") is deliberately omitted from RDEPEND: no direct `import` from # vllm core code, only used by vllm.model_executor.layers.quantization. # quark internals when Quark-quantized models are loaded. # dev-python/amd-quark-bin in this overlay caps PYTHON_COMPAT at # 3.{11,12}, which would block vllm on 3.13/3.14. Users wanting Quark # quantization install amd-quark-bin separately. # # Upstream requirements/cuda.txt pins nvidia-cutlass-dsl[cu13]==4.5.2, # tilelang==0.1.9 and flashinfer-python==0.6.12 exactly; we pin # ~nvidia-cutlass-dsl-4.5.2 and ~flashinfer-python-0.6.12 to match. # The cutlass-dsl metapackage pulls nvidia-cutlass-dsl-libs-cu13 # transitively, so it already covers the [cu13] extra. 0.23.0 raises the # nvidia-cudnn-frontend floor to >=1.19.1 (0.22.x wanted <1.19.0); that # dep lives on the flashinfer-python ebuild — vllm has zero direct # cudnn_frontend imports; it is for flashinfer's internal use. fastsafetensors # floor rose 0.2.2 -> 0.3.2. # # static cuda.txt audit 2026-06-13 against vllm-0.23.0 (rocm gfx1150 + # # cpu + empty + USE=rust build-verified 2026-06-13; cuda sm_86 GPU # # build re-verification still pending). # # tokenspeed-mla (in requirements/cuda.txt at ==0.1.2 with the comment # "for faster mla with spec decode") is deliberately omitted from # cuda?'s RDEPEND for similar reasons: all imports in vllm core are # lazy and gated by try/except with a clear pip-install hint, the # kernels are Blackwell SM100/SM103-only (irrelevant on Ampere/Hopper # hosts), and the package transitively pulls tokenspeed-triton — a # Triton vendor-fork we'd otherwise have to package as a hard build # dep for a backend most users never enable. Users on Blackwell with # DeepSeek R1 + spec decode install tokenspeed-mla separately. # # verified 2026-05-16: vllm imports clean without it. # # humming-kernels[cu13] (requirements/cuda.txt, ==0.1.4 "for quantization # gemm") provides the optional `humming` quant backend -- pulled only # under USE=humming. vllm's quant registry imports `.humming` for any # quant method, and humming.py imports the external `humming` package # under `if current_platform.is_cuda():` with no fallback, so a cuda # build without it aborts on every quantized model load. The # ${P}-humming-import-optional.patch guards that import so the other # quant methods still work with USE=-humming; upstream makes it lazy in # vllm > 0.23.0 (vllm-project/vllm#44921). # 2026-06-15 # gfx1150 (Strix Point iGPU) rocm build verified on # caffe2[rocm,amdgpu_targets_gfx1150,-nccl,-cusparselt] with # AMDGPU_TARGETS=gfx1150. Produces the HIP extensions (_C, # _C_stable_libtorch, _moe_C, _rocm_C, cumem_allocator, spinloop) and # installs cleanly. # # verified 2026-05-08 for 0.20.1, 2026-05-16 for 0.21.0, 2026-06-13 for # # 0.23.0 (with pytorch/caffe2 2.11.0; cpu + empty + USE=rust also OK). # # RTX A4500 Laptop (sm_86 Ampere) cuda build verified on # caffe2-2.11.0-r90 + CUDA-13.2 + CUDAHOSTCXX=g++-15 + MAX_JOBS=4. # Pre-FA3-skip baseline: ~2h30m wallclock, 339 CUDA template files # (FA3 .cu compiled at nvcc's default arch — wasted on Ampere). # Post-FA3-skip (next commit, files/vllm-flash-attn-...-fa3-only- # when-archs.patch): ~1h35m wallclock, 144 CUDA template files. # Peak ~14 GiB RSS in either case (16 GiB free headroom on 31 GiB # host). Smoke test in both shapes: `from vllm import LLM` # succeeds, torch.cuda.is_available() True, torch reports "NVIDIA # RTX A4500 Laptop GPU"; FA2 kernels build for sm_80+PTX (forward- # compat with sm_86); FA3 (Hopper) does NOT build on sm_86 in the # post-patch shape (FA3_AVAILABLE=False at runtime, vllm picks FA2). # # verified 2026-05-17 for 0.21.0 on sm_86 + CUDA 13.2 (both shapes). # # USE=-cpu -cuda -rocm (default): build with VLLM_TARGET_DEVICE=empty # — Python entrypoints import cleanly, backend kernels fail at first # model-load. Useful if you only want the API surface for development. # # media-libs/opencv lower bound: upstream requirements/common.txt says # opencv-python-headless >=4.13.0, ::gentoo tops at 4.12.0. The full # cv2 surface vllm imports — resize, cvtColor, COLOR_BGR2RGB, # CAP_PROP_FRAME_COUNT/FPS/FRAME_WIDTH/FRAME_HEIGHT, VideoCapture incl. # the 3-arg bytes+backend form, VideoWriter, VideoWriter_fourcc, # videoio_registry submodule — is present in 4.12.0; the 4.13 lower # bound upstream is wheel-publication churn, not an API extension. # # verified 2026-05-16 against media-libs/opencv-4.12.0-r1[python]. # # vllm resolves its runtime platform from the host hardware (not the # VLLM_TARGET_DEVICE built below). platforms/cuda.py / rocm.py import # torch.distributed.PrefixStore + ProcessGroup unconditionally at module # load (needs USE=distributed), and at engine init vllm builds a CPU # coordination group on the gloo backend. Since our caffe2 builds CUDA # with USE_NCCL=OFF, vllm's nccl device group also falls back to gloo, so # USE=gloo is required too. Both flags are default-off: without # caffe2[distributed,gloo] vllm ImportErrors at startup, or # AssertionErrors ("Fallback Gloo backend is not available") at engine # init. verified 2026-06-14, bug #274 # # vllm's GPU kernels (slot mapping, attention, sampling, and the # torch.compile/inductor path) are @triton.jit on both the cuda and # rocm targets -- on ROCm, vllm's custom paged-attention also falls # back to a Triton kernel on gfx targets without it (e.g. gfx1150). # Gentoo's source-built torch does not pull Triton the way upstream's # PyPI wheels do, so the cuda? and rocm? targets require # dev-python/triton-bin or vllm dies at first GPU inference with # "'function' object is not subscriptable". torch-2.11.0 pairs with # triton 3.6.0; its AMD backend JITs gfx kernels via hipcc. cuda # verified 2026-06-14 (bug #274); rocm gfx1150 verified 2026-06-14 # (opt-125m generated, inductor path + Triton _fwd_kernel). RDEPEND=" ~sci-ml/pytorch-2.11.0[${PYTHON_SINGLE_USEDEP}] sci-ml/caffe2[distributed,gloo] >=sci-ml/transformers-4.56.0[${PYTHON_SINGLE_USEDEP}] >=sci-ml/tokenizers-0.21.1[${PYTHON_SINGLE_USEDEP}] >=dev-python/xgrammar-0.2.0[${PYTHON_SINGLE_USEDEP}] =sci-ml/safetensors-0.6.2[${PYTHON_USEDEP}] dev-python/numpy[${PYTHON_USEDEP}] >=dev-python/requests-2.26.0[${PYTHON_USEDEP}] dev-python/tqdm[${PYTHON_USEDEP}] dev-python/blake3[${PYTHON_USEDEP}] dev-python/py-cpuinfo[${PYTHON_USEDEP}] >=dev-python/protobuf-5.29.6[${PYTHON_USEDEP}] >=dev-python/fastapi-0.115.0[${PYTHON_USEDEP}] >=dev-python/aiohttp-3.13.3[${PYTHON_USEDEP}] >=dev-python/openai-2.0.0[${PYTHON_USEDEP}] >=dev-python/pydantic-2.12.0[${PYTHON_USEDEP}] >=dev-python/prometheus-client-0.18.0[${PYTHON_USEDEP}] dev-python/pillow[${PYTHON_USEDEP}] >=dev-python/prometheus-fastapi-instrumentator-7.0.0[${PYTHON_USEDEP}] >=dev-python/tiktoken-0.6.0[${PYTHON_USEDEP}] ~dev-python/lm-format-enforcer-0.11.3[${PYTHON_USEDEP}] >=dev-python/llguidance-1.7.0[${PYTHON_USEDEP}] =dev-python/diskcache-5.6.3[${PYTHON_USEDEP}] >=dev-python/lark-1.2.2[${PYTHON_USEDEP}] >=dev-python/typing-extensions-4.10[${PYTHON_USEDEP}] >=dev-python/filelock-3.16.1[${PYTHON_USEDEP}] dev-python/partial-json-parser[${PYTHON_USEDEP}] >=dev-python/pyzmq-25.0.0[${PYTHON_USEDEP}] dev-python/msgspec[${PYTHON_USEDEP}] >=dev-python/gguf-0.17.0[${PYTHON_USEDEP}] >=dev-python/mistral-common-1.11.3[${PYTHON_USEDEP},image] >=media-libs/opencv-4.12.0[python,${PYTHON_USEDEP}] dev-python/pyyaml[${PYTHON_USEDEP}] dev-python/six[${PYTHON_USEDEP}] dev-python/einops[${PYTHON_USEDEP}] ~dev-python/depyf-0.20.0[${PYTHON_USEDEP}] dev-python/cloudpickle[${PYTHON_USEDEP}] dev-python/uvloop[${PYTHON_USEDEP}] dev-python/watchfiles[${PYTHON_USEDEP}] dev-python/python-json-logger[${PYTHON_USEDEP}] dev-python/pybase64[${PYTHON_USEDEP}] dev-python/cbor2[${PYTHON_USEDEP}] dev-python/ijson[${PYTHON_USEDEP}] dev-python/setproctitle[${PYTHON_USEDEP}] >=dev-python/openai-harmony-0.0.3[${PYTHON_USEDEP}] >=dev-python/anthropic-0.71.0[${PYTHON_USEDEP}] >=dev-python/model-hosting-container-standards-0.1.14[${PYTHON_USEDEP}] =dev-python/opentelemetry-sdk-1.27.0[${PYTHON_USEDEP}] >=dev-python/opentelemetry-api-1.27.0[${PYTHON_USEDEP}] >=dev-python/opentelemetry-exporter-otlp-1.27.0[${PYTHON_USEDEP}] >=dev-python/opentelemetry-semantic-conventions-ai-0.4.1[${PYTHON_USEDEP}] ') cpu? ( >=sci-ml/caffe2-2.11.0-r90 ~sci-ml/torchaudio-2.11.0 $(python_gen_cond_dep ' >=dev-python/numba-0.65.0[${PYTHON_USEDEP}] ') ) cuda? ( >=sci-ml/caffe2-2.11.0-r90 ~sci-ml/torchaudio-2.11.0 ~sci-ml/torchvision-0.26.0[${PYTHON_SINGLE_USEDEP}] ~dev-python/flashinfer-python-0.6.12[${PYTHON_SINGLE_USEDEP}] ~dev-python/tilelang-0.1.9[${PYTHON_SINGLE_USEDEP}] >=dev-python/quack-kernels-0.3.3[${PYTHON_SINGLE_USEDEP}] humming? ( ~dev-python/humming-kernels-0.1.4[${PYTHON_SINGLE_USEDEP}] ) $(python_gen_cond_dep ' >=dev-python/numba-0.65.0[${PYTHON_USEDEP}] >=dev-python/fastsafetensors-0.3.2[${PYTHON_USEDEP}] ~dev-python/nvidia-cutlass-dsl-4.5.2[${PYTHON_USEDEP}] ~dev-python/triton-bin-3.6.0[${PYTHON_USEDEP}] ') dev-util/nvidia-cuda-toolkit:= ) rocm? ( >=sci-ml/caffe2-2.11.0-r90 ~sci-ml/torchaudio-2.11.0 ~sci-ml/torchvision-0.26.0[${PYTHON_SINGLE_USEDEP}] >=dev-python/runai-model-streamer-bin-0.15.7[${PYTHON_SINGLE_USEDEP}] ~dev-python/tensorizer-2.10.1[${PYTHON_SINGLE_USEDEP}] ~dev-python/tilelang-0.1.10[${PYTHON_SINGLE_USEDEP}] $(python_gen_cond_dep ' >=dev-python/numba-0.65.0[${PYTHON_USEDEP}] ~dev-python/conch-triton-kernels-1.2.1[${PYTHON_USEDEP}] ~dev-python/triton-bin-3.6.0[${PYTHON_USEDEP}] >=dev-util/amdsmi-7.0.2[${PYTHON_USEDEP}] ') >=dev-util/hip-7.2:= >=sci-libs/hipBLAS-7.2:= >=sci-libs/hipBLASLt-7.2:= >=sci-libs/hipFFT-7.2:= >=sci-libs/hipRAND-7.2:= >=sci-libs/hipSOLVER-7.2:= >=sci-libs/hipSPARSE-7.2:= >=sci-libs/hipCUB-7.2:= ) " # Upstream pyproject.toml caps setuptools at <81.0.0; dropped from # BDEPEND because (a) gentoo only ships 79.0.1 + 82.0.1 (nothing in # the 80.x/81.x line), and downgrading to 79.0.1 fights pkg-resources- # 81.0.0 (which has !/dev/null || die # Skip the FA3 (Hopper) target body when no Hopper arch is in # CUDA_ARCHS so Ampere/Ada builds don't compile unrunnable kernels. eapply -p0 \ "${FILESDIR}/vllm-flash-attn-${VLLM_FA_COMMIT:0:7}-fa3-only-when-archs.patch" # vllm's PYTHON_COMPAT allows python3_14, but flash-attn's # CMakeLists hard-codes a supported-Python whitelist and # FATAL_ERRORs on 3.14 at configure. The extension is abi3 # (USE_SABI 3), so widening that whitelist is safe. bug #274 eapply -p0 \ "${FILESDIR}/vllm-flash-attn-${VLLM_FA_COMMIT:0:7}-py314.patch" popd >/dev/null || die fi } src_configure() { # When the Rust frontend is requested, make its build mandatory so a # failure errors out instead of setuptools-rust silently skipping the # optional extension. use rust && export VLLM_REQUIRE_RUST_FRONTEND=1 if use cuda; then export VLLM_TARGET_DEVICE=cuda # Point vllm's cmake FetchContent at our pre-staged + patched # flash-attention source instead of re-fetching from github. export VLLM_FLASH_ATTN_SRC_DIR="${WORKDIR}/flash-attention-${VLLM_FA_COMMIT}" # CUDA 13.2's nvcc rejects gcc>15 via crt/host_config.h. Pin # nvcc's host compiler to the gcc-15 slot when the active # system gcc is newer. export CUDAHOSTCXX=/usr/bin/x86_64-pc-linux-gnu-g++-15 export CMAKE_ARGS+=" -DCMAKE_CUDA_HOST_COMPILER=${CUDAHOSTCXX}" # vllm's heavy CUDA template instantiations # (paged_attention_v*, layernorm_quant_kernels, w8a8/fp8/...) # can each peak at 3-4 GiB during cudafe++. With ninja's # default 24-way parallelism this OOM-kills on a 31 GiB host # (cudafe++ dies with SIGKILL, "[code=9]"). MAX_JOBS is the # env var vllm's setup.py reads to throttle the CMake build; # CMAKE_BUILD_PARALLEL_LEVEL backs it up for direct cmake # --build invocations. Tune this per-host: 31 GiB → 4-6, # 54 GiB → 8-10, 128 GiB → ~16. The OOM threshold was measured # against 0.20.1; 0.21.0's CUDA template set wasn't re-profiled # at bump time but the heavy instantiations (paged_attention, # layernorm_quant, w8a8/fp8) are unchanged, so MAX_JOBS=4 stays # a conservative default. # verified 2026-05-07 against 0.20.1. # # Caller-overridable so users on smaller/larger hosts can adjust # without ebuild-edit (e.g. MAX_JOBS=2 emerge … on a 16 GiB # host). export MAX_JOBS="${MAX_JOBS:-4}" export CMAKE_BUILD_PARALLEL_LEVEL="${CMAKE_BUILD_PARALLEL_LEVEL:-${MAX_JOBS}}" elif use cpu; then export VLLM_TARGET_DEVICE=cpu # vllm 0.22.x cpu_extension.cmake locates OpenMP via # vllm_prepare_torch_gomp_shim(), which expects a libgomp vendored # inside PyTorch (torch.libs/libgomp-*.so — a PyPI-wheel artifact). # Our source-built sci-ml/caffe2 ships none, so cmake falls back to # find_library(NAMES gomp), which misses Gentoo's libgomp under the # gcc-internal dir. Point CMAKE_LIBRARY_PATH at the toolchain's # libgomp so the fallback resolves. # verified 2026-06-05 (0.22.1) local gomp_dir gomp_dir=$(dirname "$($(tc-getCC) -print-file-name=libgomp.so)") export CMAKE_ARGS+=" -DCMAKE_LIBRARY_PATH=${gomp_dir}" elif use rocm; then export VLLM_TARGET_DEVICE=rocm # rocm.eclass turns AMDGPU_TARGETS into a semicolon-joined # list. vllm's CMakeLists reads PYTORCH_ROCM_ARCH and feeds # it to enable_language(HIP). Same MAX_JOBS throttle as the # cuda branch — HIP template instantiation in csrc/rocm/ # (skinny_gemms, attention) hits comparable peak RSS. export PYTORCH_ROCM_ARCH=$(get_amdgpu_flags) export MAX_JOBS="${MAX_JOBS:-4}" export CMAKE_BUILD_PARALLEL_LEVEL="${CMAKE_BUILD_PARALLEL_LEVEL:-${MAX_JOBS}}" else export VLLM_TARGET_DEVICE=empty fi distutils-r1_src_configure } pkg_postinst() { if use cuda; then elog "vllm's CUDA path pulls dev-python/flashinfer-python, which" elog "JIT-compiles GPU kernels with nvcc on first inference. CUDA" elog "13.x nvcc rejects host compilers newer than gcc 15, so if the" elog "active gcc is newer, vllm aborts at first run with a" elog "'Ninja build failed ... unsupported GNU version' error." elog "" elog "Pin nvcc's host compiler to a gcc <= 15 when launching vllm:" elog "" elog " NVCC_PREPEND_FLAGS=\"-ccbin /usr/bin/${CHOST}-g++-15\" vllm serve ..." elog "" elog "or switch the system compiler via 'eselect gcc'." fi if use cuda && ! use humming; then elog "" elog "The optional 'humming' MXFP4 quantization backend is off by" elog "default. Enable USE=humming to pull dev-python/humming-kernels" elog "if you serve humming-quantized models." fi if use rocm; then elog "vllm initializes a torch.distributed process group at engine" elog "start (a TCPStore rendezvous) even for single-GPU inference." elog "Since torch 2.4 the TCPStore defaults to the libuv backend," elog "but sci-ml/pytorch's ROCm build ships no libuv -- it rides in" elog "via tensorpipe, which is disabled for ROCm. Without it vllm" elog "aborts at engine init with:" elog "" elog " DistStoreError: use_libuv was requested but PyTorch was" elog " built without libuv support" elog "" elog "Launch vllm with USE_LIBUV=0 to use the legacy socket store:" elog "" elog " USE_LIBUV=0 vllm serve ..." fi }