# Copyright 2026 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

EAPI=8

ROCM_VERSION="6.3"

inherit cmake cuda flag-o-matic rocm linux-info systemd

DESCRIPTION="llama.cpp fork with additional SOTA quants and improved performance"
HOMEPAGE="https://github.com/ikawrakow/ik_llama.cpp"

# Pinned to commit d1339249d708159bd43aadfef089d461f53019f1 (BUILD_NUMBER=4582, 2026-06-12)
SRC_URI="https://github.com/ikawrakow/ik_llama.cpp/archive/d1339249d708159bd43aadfef089d461f53019f1.tar.gz -> ${P}.gh.tar.gz"
S="${WORKDIR}/ik_llama.cpp-d1339249d708159bd43aadfef089d461f53019f1"
KEYWORDS="~amd64"

LICENSE="MIT"
SLOT="0"

# wmma: rocWMMA flash-attention on RDNA3+/CDNA GPUs
# see https://github.com/ikawrakow/ik_llama.cpp/blob/master/docs/build.md
IUSE="
	curl openblas +openmp blis rocm cuda vulkan flexiblas wmma
	cpu_flags_x86_avx
	cpu_flags_x86_avx2
	cpu_flags_x86_fma3
	cpu_flags_x86_f16c
	cpu_flags_x86_bmi2
	cpu_flags_x86_avx512f
	cpu_flags_x86_avx512vbmi
	cpu_flags_x86_avx512_vnni
	cpu_flags_x86_avx512_bf16
"

# At most one BLAS provider may be selected; wmma only makes sense with
# ROCm; every wider SIMD level implies the narrower one it builds on.
REQUIRED_USE="
	?? (
		openblas
		blis
		flexiblas
	)
	wmma? ( rocm )
	cpu_flags_x86_avx2? ( cpu_flags_x86_avx )
	cpu_flags_x86_avx512f? ( cpu_flags_x86_avx2 )
	cpu_flags_x86_avx512vbmi? ( cpu_flags_x86_avx512f )
	cpu_flags_x86_avx512_vnni? ( cpu_flags_x86_avx512f )
	cpu_flags_x86_avx512_bf16? ( cpu_flags_x86_avx512f )
"

# curl: needed for pulling models from huggingface
CDEPEND="
	curl? ( net-misc/curl:= )
	openblas? ( sci-libs/openblas:= )
	openmp? ( llvm-runtimes/openmp:= )
	blis? ( sci-libs/blis:= )
	flexiblas? ( sci-libs/flexiblas:= )
	rocm? (
		>=dev-util/hip-${ROCM_VERSION}:=
		>=sci-libs/hipBLAS-${ROCM_VERSION}:=
		wmma? (
			>=sci-libs/rocWMMA-${ROCM_VERSION}:=
		)
	)
	cuda? ( dev-util/nvidia-cuda-toolkit:= )
"
DEPEND="${CDEPEND}
	vulkan? ( dev-util/vulkan-headers )
"
RDEPEND="${CDEPEND}
	vulkan? ( media-libs/vulkan-loader )
	acct-user/ik-llama
	acct-group/ik-llama
"
# shaderc is only pulled in for the Vulkan backend, so do not force it on
# CPU-only / CUDA / ROCm builds.
BDEPEND="vulkan? ( media-libs/shaderc )"

# Warn (non-fatally) when building with USE=rocm against a kernel whose
# configuration is available and lacks HSA_AMD_SVM.
pkg_setup() {
	if use rocm; then
		linux-info_pkg_setup
		if linux-info_get_any_version && linux_config_exists; then
			if ! linux_chkconfig_present HSA_AMD_SVM; then
				ewarn "ROCm/HIP requires HSA_AMD_SVM enabled in your kernel config."
			fi
		fi
	fi
}

# Run the CUDA eclass preparation step when USE=cuda, then the standard
# CMake preparation.
src_prepare() {
	use cuda && cuda_src_prepare
	cmake_src_prepare
}

# Translate USE flags into CMake options and run the CMake configure step.
src_configure() {
	local mycmakeargs=(
		# -- Build options --
		-DCMAKE_INSTALL_INCLUDEDIR=include/ik_llama.cpp
		-DLLAMA_BUILD_TESTS=OFF
		-DLLAMA_BUILD_EXAMPLES=ON
		-DLLAMA_BUILD_SERVER=ON
		-DCMAKE_SKIP_BUILD_RPATH=ON
		-DGGML_RPC=ON
		-DLLAMA_CURL=$(usex curl)
		-DBUILD_NUMBER="4582"
		-DGENTOO_REMOVE_CMAKE_BLAS_HACK=ON

		# -- CPU feature flags --
		# No -march=native: explicit mapping from CPU_FLAGS_X86 for
		# reproducible/portable builds. SSE4.2 is the baseline.
		# NOTE: This fork has no GGML_AVX_VNNI flag (upstream-only).
		-DGGML_NATIVE=0
		-DGGML_SSE42=ON
		-DGGML_AVX=$(usex cpu_flags_x86_avx)
		-DGGML_AVX2=$(usex cpu_flags_x86_avx2)
		-DGGML_BMI2=$(usex cpu_flags_x86_bmi2)
		-DGGML_FMA=$(usex cpu_flags_x86_fma3)
		-DGGML_F16C=$(usex cpu_flags_x86_f16c)
		-DGGML_AVX512=$(usex cpu_flags_x86_avx512f)
		-DGGML_AVX512_VBMI=$(usex cpu_flags_x86_avx512vbmi)
		-DGGML_AVX512_VNNI=$(usex cpu_flags_x86_avx512_vnni)
		-DGGML_AVX512_BF16=$(usex cpu_flags_x86_avx512_bf16)

		# -- Backends --
		-DGGML_CUDA=$(usex cuda)
		-DGGML_OPENMP=$(usex openmp)
		-DGGML_VULKAN=$(usex vulkan)

		# -- Install paths (avoid clashing with whisper.cpp / upstream llama.cpp) --
		-DCMAKE_INSTALL_LIBDIR="${EPREFIX}/usr/$(get_libdir)/ik_llama.cpp"
		-DCMAKE_INSTALL_RPATH="${EPREFIX}/usr/$(get_libdir)/ik_llama.cpp"
	)

	# HAVE_FANCY_SIMD (IQK kernel activation) needs AVX-512F + VNNI + VL +
	# BW + DQ. GGML_AVX512=ON only passes -mavx512f -mavx512bw to the
	# compiler -- VL and DQ are MISSING, so the IQK kernels silently fall
	# back to AVX2. Inject the missing sub-flags via compiler flags.
	# NOTE(review): these flags are added whenever avx512f is set, without
	# checking for VL/DQ support on the target CPU -- confirm this is
	# acceptable for AVX-512F-only hardware.
	if use cpu_flags_x86_avx512f; then
		append-cflags -mavx512vl -mavx512dq
		append-cxxflags -mavx512vl -mavx512dq
	fi

	# BLAS vendor selection (mutually exclusive via REQUIRED_USE)
	if use openblas; then
		mycmakeargs+=(
			-DGGML_BLAS=ON
			-DGGML_BLAS_VENDOR=OpenBLAS
		)
	fi
	if use blis; then
		mycmakeargs+=(
			-DGGML_BLAS=ON
			-DGGML_BLAS_VENDOR=FLAME
		)
	fi
	if use flexiblas; then
		mycmakeargs+=(
			-DGGML_BLAS=ON
			-DGGML_BLAS_VENDOR=FlexiBLAS
		)
	fi

	# CUDA: set host compiler and sandbox for device node symlinks
	if use cuda; then
		local -x CUDAHOSTCXX="$(cuda_gccdir)"
		cuda_add_sandbox
		addpredict "/dev/char/"
	fi

	# ROCm/HIP: use hipcc and set GPU architecture targets
	if use rocm; then
		rocm_use_hipcc
		mycmakeargs+=(
			-DGGML_HIP=ON
			# Quoted so the target list reaches CMake as a single argument.
			-DAMDGPU_TARGETS="$(get_amdgpu_flags)"
			-DGGML_HIP_ROCWMMA_FATTN=$(usex wmma)
		)
	fi

	cmake_src_configure
}

# Install the CMake build, strip headers, rename binaries with an ik_
# prefix, and install the service unit, its configuration and the model
# directory.
src_install() {
	cmake_src_install

	# Remove installed headers (avoid clashing with whisper.cpp / upstream)
	rm -rf "${ED}/usr/include" || die

	# Prefix all binaries with ik_ to coexist with upstream llama.cpp.
	# ${f##*/} yields the file name without forking basename and is safe
	# for paths containing whitespace.
	local f
	for f in "${ED}"/usr/bin/*; do
		mv "${f}" "${ED}/usr/bin/ik_${f##*/}" || die
	done

	# Systemd service unit and environment configuration
	insinto /etc/ik-llama
	doins "${FILESDIR}"/ik-llama-server.conf
	systemd_dounit "${FILESDIR}"/ik-llama-server.service

	# State directory for model storage
	keepdir /var/lib/ik-llama/models
}

# Give the state directories to the ik-llama user/group and restrict their
# permissions.
# NOTE(review): keepdir/fowners/fperms are install helpers usually called
# from src_install; they are used here while acct-user/acct-group are only
# listed in RDEPEND -- confirm this is intentional.
pkg_preinst() {
	keepdir /var/lib/ik-llama/models
	fowners ik-llama:ik-llama /var/lib/ik-llama /var/lib/ik-llama/models
	fperms 0750 /var/lib/ik-llama /var/lib/ik-llama/models
}

# Print usage notes for the renamed binaries and the systemd service, and
# warn about the conservative defaults in the shipped configuration file.
pkg_postinst() {
	elog "All binaries are ik_-prefixed to coexist with upstream llama.cpp:"
	elog " ik_llama-server, ik_llama-cli, ik_llama-quantize, ..."
	elog ""
	elog "Running as a systemd service:"
	elog " 1. Place a GGUF model in /var/lib/ik-llama/models/"
	elog " 2. Edit /etc/ik-llama/ik-llama-server.conf (set LLAMA_MODEL, LLAMA_THREADS)"
	elog " 3. systemctl enable --now ik-llama-server"
	elog " The API is then available at http://\${LLAMA_HOST}:\${LLAMA_PORT} (OpenAI-compatible)"
	elog ""
	ewarn "Defaults in /etc/ik-llama/ik-llama-server.conf are CONSERVATIVE:"
	ewarn " LLAMA_THREADS=2 -- adjust to your physical core count (not SMT threads)!"
	ewarn " LLAMA_MODEL=... -- must point to an actual GGUF file!"
	ewarn "Without adjustment the service runs on 2 threads or fails to find a model."
}