# Copyright 2026 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

EAPI=8

# Lower bound used for the versioned ROCm dependencies further down.
# It is assigned ahead of `inherit` so that it is already defined when the
# rocm eclass is sourced.
ROCM_VERSION="6.3"

inherit cmake cuda flag-o-matic rocm linux-info systemd

DESCRIPTION="llama.cpp fork with additional SOTA quants and improved performance"
HOMEPAGE="https://github.com/ikawrakow/ik_llama.cpp"

# Snapshot ebuild: pinned to commit d1339249d708159bd43aadfef089d461f53019f1
# (BUILD_NUMBER=4582, 2026-06-12).  When bumping, update the hash in both
# SRC_URI and S, and the BUILD_NUMBER passed to CMake in src_configure.
SRC_URI="https://github.com/ikawrakow/ik_llama.cpp/archive/d1339249d708159bd43aadfef089d461f53019f1.tar.gz -> ${P}.gh.tar.gz"
S="${WORKDIR}/ik_llama.cpp-d1339249d708159bd43aadfef089d461f53019f1"
KEYWORDS="~amd64"

LICENSE="MIT"
SLOT="0"

# wmma: rocWMMA flash-attention on RDNA3+/CDNA GPUs
# see https://github.com/ikawrakow/ik_llama.cpp/blob/master/docs/build.md
# cpu_flags_x86_*: each flag is forwarded to a GGML_* CMake option in
# src_configure instead of relying on -march=native.
IUSE="
    curl openblas +openmp blis rocm cuda vulkan flexiblas wmma
    cpu_flags_x86_avx cpu_flags_x86_avx2 cpu_flags_x86_fma3
    cpu_flags_x86_f16c cpu_flags_x86_bmi2 cpu_flags_x86_avx512f
    cpu_flags_x86_avx512vbmi cpu_flags_x86_avx512_vnni
    cpu_flags_x86_avx512_bf16
"

# Constraints:
#  - at most one BLAS provider may be selected;
#  - rocm needs at least one AMDGPU target, which is what ROCM_REQUIRED_USE
#    (provided by rocm.eclass) enforces -- without it the target list passed
#    to CMake in src_configure could be empty;
#  - wmma is a ROCm-only feature;
#  - the x86 SIMD levels build on each other (AVX -> AVX2 -> AVX-512F ->
#    AVX-512 sub-extensions).
REQUIRED_USE="
    ?? ( openblas blis flexiblas )
    rocm? ( ${ROCM_REQUIRED_USE} )
    wmma? ( rocm )
    cpu_flags_x86_avx2? ( cpu_flags_x86_avx )
    cpu_flags_x86_avx512f? ( cpu_flags_x86_avx2 )
    cpu_flags_x86_avx512vbmi? ( cpu_flags_x86_avx512f )
    cpu_flags_x86_avx512_vnni? ( cpu_flags_x86_avx512f )
    cpu_flags_x86_avx512_bf16? ( cpu_flags_x86_avx512f )
"

# CDEPEND: libraries required both while building (headers/linking) and at
# runtime.
# curl: needed for pulling models from huggingface
# vulkan-loader: the Vulkan backend links against libvulkan, so the loader
# has to be present at build time as well, not only at runtime.
CDEPEND="
    curl? ( net-misc/curl:= )
    openblas? ( sci-libs/openblas:= )
    openmp? ( llvm-runtimes/openmp:= )
    blis? ( sci-libs/blis:= )
    flexiblas? ( sci-libs/flexiblas:= )
    rocm? (
        >=dev-util/hip-${ROCM_VERSION}:=
        >=sci-libs/hipBLAS-${ROCM_VERSION}:=
        wmma? ( >=sci-libs/rocWMMA-${ROCM_VERSION}:= )
    )
    cuda? ( dev-util/nvidia-cuda-toolkit:= )
    vulkan? ( media-libs/vulkan-loader )
"
DEPEND="${CDEPEND}
    vulkan? ( dev-util/vulkan-headers )
"
# acct-user/acct-group: account that owns /var/lib/ik-llama (see pkg_preinst).
RDEPEND="${CDEPEND}
    acct-user/ik-llama
    acct-group/ik-llama
"
# shaderc provides the glslc shader compiler, which is only needed when the
# Vulkan backend is built.
BDEPEND="vulkan? ( media-libs/shaderc )"

# Pre-build sanity check, only relevant for USE=rocm: runs the linux-info
# setup and, when a kernel version and a kernel config can both be found,
# warns (non-fatally) if HSA_AMD_SVM is not enabled in that config.
pkg_setup() {
    # Nothing to verify for non-ROCm builds.
    use rocm || return 0

    linux-info_pkg_setup

    # Without kernel sources/version or a readable config there is nothing
    # to inspect; stay silent rather than guessing.
    linux-info_get_any_version || return 0
    linux_config_exists || return 0

    linux_chkconfig_present HSA_AMD_SVM \
        || ewarn "ROCm/HIP requires HSA_AMD_SVM enabled in your kernel config."
}

# Prepares the unpacked sources: runs the CUDA eclass preparation step first
# when USE=cuda is set, then the regular CMake preparation.
src_prepare() {
    if use cuda; then
        cuda_src_prepare
    fi

    cmake_src_prepare
}

# Translates USE flags into CMake options and compiler flags, then runs the
# CMake configure step.  The argument list is assembled in stages: fixed
# build options, CPU feature switches, backend switches, install paths,
# optional BLAS vendor, and finally the CUDA / ROCm specifics.
src_configure() {
    # Private library directory (install location and runtime search path);
    # kept separate to avoid clashing with whisper.cpp / upstream llama.cpp.
    local private_libdir="${EPREFIX}/usr/$(get_libdir)/ik_llama.cpp"

    local mycmakeargs=(
        # -- Build options --
        -DCMAKE_INSTALL_INCLUDEDIR=include/ik_llama.cpp
        -DLLAMA_BUILD_TESTS=OFF
        -DLLAMA_BUILD_EXAMPLES=ON
        -DLLAMA_BUILD_SERVER=ON
        -DCMAKE_SKIP_BUILD_RPATH=ON
        -DGGML_RPC=ON
        -DLLAMA_CURL=$(usex curl)
        -DBUILD_NUMBER="4582"
        -DGENTOO_REMOVE_CMAKE_BLAS_HACK=ON

        # -- CPU baseline --
        # -march=native is deliberately not used: features are mapped
        # explicitly from CPU_FLAGS_X86 so that builds stay reproducible
        # and portable.  SSE4.2 is always on.
        -DGGML_NATIVE=0
        -DGGML_SSE42=ON
    )

    # -- CPU feature flags --
    # Each entry is "<cpu_flags_x86 suffix>:<GGML option suffix>".
    # NOTE: This fork has no GGML_AVX_VNNI flag (upstream-only).
    local cpu_feature
    for cpu_feature in \
        avx:AVX avx2:AVX2 bmi2:BMI2 fma3:FMA f16c:F16C \
        avx512f:AVX512 avx512vbmi:AVX512_VBMI \
        avx512_vnni:AVX512_VNNI avx512_bf16:AVX512_BF16
    do
        mycmakeargs+=(
            "-DGGML_${cpu_feature#*:}=$(usex "cpu_flags_x86_${cpu_feature%%:*}")"
        )
    done

    mycmakeargs+=(
        # -- Backends --
        -DGGML_CUDA=$(usex cuda)
        -DGGML_OPENMP=$(usex openmp)
        -DGGML_VULKAN=$(usex vulkan)

        # -- Install paths --
        -DCMAKE_INSTALL_LIBDIR="${private_libdir}"
        -DCMAKE_INSTALL_RPATH="${private_libdir}"
    )

    # The IQK kernels are gated on HAVE_FANCY_SIMD, which needs AVX-512F +
    # VNNI + VL + BW + DQ.  GGML_AVX512=ON by itself only adds -mavx512f
    # -mavx512bw, so VL and DQ are supplied here; otherwise the kernels
    # silently fall back to the AVX2 path.
    # NOTE(review): this enables VL/DQ for every avx512f build; a CPU with
    # AVX-512F but without VL/DQ would hit illegal instructions.  Consider
    # gating on the corresponding CPU_FLAGS_X86 entries -- TODO confirm.
    if use cpu_flags_x86_avx512f; then
        local avx512_extra=( -mavx512vl -mavx512dq )
        append-cflags "${avx512_extra[@]}"
        append-cxxflags "${avx512_extra[@]}"
    fi

    # BLAS vendor selection; REQUIRED_USE allows at most one of these.
    # Each entry is "<USE flag>:<GGML_BLAS_VENDOR value>".
    local blas_choice
    for blas_choice in openblas:OpenBLAS blis:FLAME flexiblas:FlexiBLAS; do
        if use "${blas_choice%%:*}"; then
            mycmakeargs+=(
                -DGGML_BLAS=ON "-DGGML_BLAS_VENDOR=${blas_choice#*:}"
            )
        fi
    done

    # CUDA: export the host compiler location for nvcc and relax the
    # sandbox, since the toolchain touches device nodes / /dev/char symlinks.
    if use cuda; then
        local -x CUDAHOSTCXX="$(cuda_gccdir)"
        cuda_add_sandbox
        addpredict "/dev/char/"
    fi

    # ROCm/HIP: build with hipcc for the selected AMDGPU targets; rocWMMA
    # flash-attention follows USE=wmma.
    if use rocm; then
        rocm_use_hipcc
        mycmakeargs+=(
            -DGGML_HIP=ON
            -DAMDGPU_TARGETS=$(get_amdgpu_flags)
            -DGGML_HIP_ROCWMMA_FATTN=$(usex wmma)
        )
    fi

    cmake_src_configure
}

# Installs the CMake build into the image and then adapts it for
# side-by-side installation with upstream llama.cpp:
#  - drops the installed headers,
#  - renames every executable in /usr/bin to carry an "ik_" prefix,
#  - installs the server configuration file and systemd unit from FILESDIR,
#  - keeps the (empty) model directory under /var/lib/ik-llama.
src_install() {
    cmake_src_install

    # Remove installed headers (avoid clashing with whisper.cpp / upstream)
    rm -rf "${ED}/usr/include" || die

    # Prefix all binaries with ik_ to coexist with upstream llama.cpp.
    # The basename is derived with parameter expansion so that unusual file
    # names (whitespace, glob characters) are renamed correctly.
    local bin
    for bin in "${ED}"/usr/bin/*; do
        # An unmatched glob stays literal; fail with a clear message
        # instead of a confusing mv error.
        [[ -e ${bin} ]] || die "no binaries were installed to ${ED}/usr/bin"
        mv -- "${bin}" "${bin%/*}/ik_${bin##*/}" || die
    done

    # Systemd service unit and environment configuration
    insinto /etc/ik-llama
    doins "${FILESDIR}"/ik-llama-server.conf
    systemd_dounit "${FILESDIR}"/ik-llama-server.service

    # State directory for model storage
    keepdir /var/lib/ik-llama/models
}

# Sets ownership and permissions on the state directories before the image
# is merged: both /var/lib/ik-llama and its models/ subdirectory end up
# owned by ik-llama:ik-llama with mode 0750.
# NOTE(review): presumably done here rather than in src_install because the
# ik-llama account is only a runtime dependency and may not exist while
# src_install runs -- confirm.
pkg_preinst() {
    local state_root=/var/lib/ik-llama
    local -a state_dirs=( "${state_root}" "${state_root}/models" )

    keepdir "${state_root}/models"
    fowners ik-llama:ik-llama "${state_dirs[@]}"
    fperms 0750 "${state_dirs[@]}"
}

# Prints post-install guidance: informational notes (elog) about the ik_
# binary prefix and the systemd service setup, followed by warnings (ewarn)
# about the conservative defaults shipped in the configuration file.
pkg_postinst() {
    # ${LLAMA_HOST} / ${LLAMA_PORT} are meant to be shown literally, hence
    # the single quotes.
    local -a usage_notes=(
        'All binaries are ik_-prefixed to coexist with upstream llama.cpp:'
        '  ik_llama-server, ik_llama-cli, ik_llama-quantize, ...'
        ''
        'Running as a systemd service:'
        '  1. Place a GGUF model in /var/lib/ik-llama/models/'
        '  2. Edit /etc/ik-llama/ik-llama-server.conf (set LLAMA_MODEL, LLAMA_THREADS)'
        '  3. systemctl enable --now ik-llama-server'
        '  The API is then available at http://${LLAMA_HOST}:${LLAMA_PORT} (OpenAI-compatible)'
        ''
    )
    local -a default_warnings=(
        'Defaults in /etc/ik-llama/ik-llama-server.conf are CONSERVATIVE:'
        '  LLAMA_THREADS=2  -- adjust to your physical core count (not SMT threads)!'
        '  LLAMA_MODEL=...  -- must point to an actual GGUF file!'
        'Without adjustment the service runs on 2 threads or fails to find a model.'
    )

    local msg
    for msg in "${usage_notes[@]}"; do
        elog "${msg}"
    done
    for msg in "${default_warnings[@]}"; do
        ewarn "${msg}"
    done
}