# Copyright 2024-2026 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

EAPI=8

# Version floor used for the ROCm/HIP dependency atoms of this ebuild.
ROCM_VERSION=7.1

inherit cmake cuda rocm linux-info systemd

# Pinned revision of the ggml-org/tiny-llamas repository on Hugging Face.
TINY_LLAMAS_COMMIT="99dd1a73db5a37100bd4ae633f4cfce6560e1567"

DESCRIPTION="LLM inference in C/C++ (GGML/GGUF) — CPU + optional GPU backends"
HOMEPAGE="https://github.com/ggml-org/llama.cpp"

# Upstream release tarball (tags are named b<PV>), plus the stories15M
# model which is fetched only with USE=examples.
SRC_URI="
    https://github.com/ggml-org/llama.cpp/archive/refs/tags/b${PV}.tar.gz -> ${P}.gh.tar.gz
    examples? (
        https://huggingface.co/ggml-org/tiny-llamas/resolve/${TINY_LLAMAS_COMMIT}/stories15M-q4_0.gguf
            -> ggml-org_models_tinyllamas_stories15M-q4_0-${TINY_LLAMAS_COMMIT}.gguf
    )
"
# The tarball unpacks to llama.cpp-b<PV>, not to ${P}.
S="${WORKDIR}/llama.cpp-b${PV}"

LICENSE="MIT"
SLOT="0"
KEYWORDS="~amd64"


# Feature and backend flags.
# wmma: rocWMMA flash-attention on RDNA3+/CDNA GPUs
# see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip
IUSE="curl openblas +openmp blis rocm cuda opencl openssl vulkan flexiblas wmma"
IUSE+=" examples"
# x86 SIMD levels; each one is mapped to a GGML_* CMake switch in src_configure.
IUSE+=" cpu_flags_x86_avx cpu_flags_x86_avx2 cpu_flags_x86_fma3"
IUSE+=" cpu_flags_x86_f16c cpu_flags_x86_bmi2 cpu_flags_x86_avx_vnni"
IUSE+=" cpu_flags_x86_avx512f cpu_flags_x86_avx512vbmi cpu_flags_x86_avx512_vnni"
IUSE+=" cpu_flags_x86_avx512_bf16"

# - at most one BLAS provider may be selected
# - wmma is only meaningful together with rocm
# - every SIMD level requires the level it builds upon
REQUIRED_USE="
    ?? ( openblas blis flexiblas )
    wmma? ( rocm )
    cpu_flags_x86_avx2? ( cpu_flags_x86_avx )
    cpu_flags_x86_avx512f? ( cpu_flags_x86_avx2 )
    cpu_flags_x86_avx512vbmi? ( cpu_flags_x86_avx512f )
    cpu_flags_x86_avx512_vnni? ( cpu_flags_x86_avx512f )
    cpu_flags_x86_avx512_bf16? ( cpu_flags_x86_avx512f )
"

# Libraries that are linked against: needed both at build time and at runtime.
# curl: needed for pulling models from huggingface
CDEPEND="
    curl? ( net-misc/curl:= )
    openblas? ( sci-libs/openblas:= )
    openmp? ( llvm-runtimes/openmp:= )
    blis? ( sci-libs/blis:= )
    flexiblas? ( sci-libs/flexiblas:= )
    rocm? (
        >=dev-util/hip-${ROCM_VERSION}:=
        >=sci-libs/hipBLAS-${ROCM_VERSION}:=
        wmma? ( >=sci-libs/rocWMMA-${ROCM_VERSION}:= )
    )
    cuda? ( dev-util/nvidia-cuda-toolkit:= )
    openssl? ( dev-libs/openssl:= )
"
# Header-only packages are build-time dependencies only.
DEPEND="${CDEPEND}
    opencl? ( dev-util/opencl-headers )
    vulkan? (
        dev-util/spirv-headers
        dev-util/vulkan-headers
    )
"
# numpy: used by convert_hf_to_gguf.py
# acct-user/acct-group: owner of the state directory set up in pkg_preinst
RDEPEND="${CDEPEND}
    dev-python/numpy
    opencl? ( dev-libs/opencl-icd-loader )
    vulkan? ( media-libs/vulkan-loader )
    acct-user/llama-cpp
    acct-group/llama-cpp
"
# shaderc (glslc) is only needed to compile the Vulkan backend's shaders,
# so it is not pulled in for builds without USE=vulkan.
BDEPEND="vulkan? ( media-libs/shaderc )"

# With USE=rocm, inspect the kernel configuration (when one can be found)
# and warn if HSA_AMD_SVM is not enabled. Does nothing without USE=rocm.
pkg_setup() {
    use rocm || return 0

    linux-info_pkg_setup

    # No kernel version or no readable config: nothing to check against.
    linux-info_get_any_version || return 0
    linux_config_exists || return 0

    linux_chkconfig_present HSA_AMD_SVM && return 0
    ewarn "ROCm/HIP requires HSA_AMD_SVM enabled in your kernel config."
}

# Run the CUDA (only with USE=cuda) and CMake prepare steps; with
# USE=examples additionally copy the downloaded stories15M model from
# DISTDIR into ${BUILD_DIR}/tinyllamas under its upstream file name.
src_prepare() {
    if use cuda; then
        cuda_src_prepare
    fi
    cmake_src_prepare

    use examples || return 0

    local model_name="stories15M-q4_0.gguf"
    local model_distfile="ggml-org_models_tinyllamas_stories15M-q4_0-${TINY_LLAMAS_COMMIT}.gguf"
    local model_dir="${BUILD_DIR}/tinyllamas"

    mkdir -p "${model_dir}" || die
    cp "${DISTDIR}/${model_distfile}" "${model_dir}/${model_name}" || die
}

# Translate USE flags into CMake arguments and run the CMake configure step.
# The argument list is assembled section by section: general build options,
# CPU feature switches, backends, install paths, BLAS vendor, then the
# CUDA and ROCm specific setup.
src_configure() {
    # General build options
    local mycmakeargs=(
        -DLLAMA_BUILD_TESTS=OFF
        -DLLAMA_BUILD_EXAMPLES=$(usex examples)
        -DLLAMA_BUILD_SERVER=ON
        -DCMAKE_SKIP_BUILD_RPATH=ON
        -DGGML_RPC=ON
        -DLLAMA_CURL=$(usex curl)
        -DLLAMA_OPENSSL=$(usex openssl)
        -DBUILD_NUMBER="1"
        -DGENTOO_REMOVE_CMAKE_BLAS_HACK=ON
    )

    # CPU features: never -march=native; every switch is derived from
    # CPU_FLAGS_X86 so the build is reproducible/portable. SSE4.2 is the
    # baseline and always enabled.
    mycmakeargs+=( -DGGML_NATIVE=0 -DGGML_SSE42=ON )
    local simd_pair
    # Each entry is <cpu_flags_x86 suffix>:<GGML option suffix>.
    for simd_pair in \
        avx:AVX avx2:AVX2 bmi2:BMI2 fma3:FMA f16c:F16C avx_vnni:AVX_VNNI \
        avx512f:AVX512 avx512vbmi:AVX512_VBMI avx512_vnni:AVX512_VNNI \
        avx512_bf16:AVX512_BF16
    do
        mycmakeargs+=(
            "-DGGML_${simd_pair#*:}=$(usex "cpu_flags_x86_${simd_pair%%:*}")"
        )
    done

    # Backends that are plain on/off toggles
    mycmakeargs+=(
        -DGGML_CUDA=$(usex cuda)
        -DGGML_OPENCL=$(usex opencl)
        -DGGML_OPENMP=$(usex openmp)
        -DGGML_VULKAN=$(usex vulkan)
    )

    # Private library directory (avoid clashing with whisper.cpp); the
    # install RPATH points at the same location.
    local llama_libdir="${EPREFIX}/usr/$(get_libdir)/llama.cpp"
    mycmakeargs+=(
        -DCMAKE_INSTALL_LIBDIR="${llama_libdir}"
        -DCMAKE_INSTALL_RPATH="${llama_libdir}"
    )

    # BLAS vendor: REQUIRED_USE allows at most one of these flags.
    # Each entry is <USE flag>:<GGML_BLAS_VENDOR value>.
    local blas_pair
    for blas_pair in openblas:OpenBLAS blis:FLAME flexiblas:FlexiBLAS; do
        if use "${blas_pair%%:*}"; then
            mycmakeargs+=(
                -DGGML_BLAS=ON "-DGGML_BLAS_VENDOR=${blas_pair#*:}"
            )
        fi
    done

    # CUDA: export the host compiler location and open up the sandbox
    # for the device node symlinks.
    if use cuda; then
        local -x CUDAHOSTCXX="$(cuda_gccdir)"
        cuda_add_sandbox
        addpredict "/dev/char/"
    fi

    # ROCm/HIP: switch to hipcc and pass the selected GPU targets.
    if use rocm; then
        rocm_use_hipcc
        mycmakeargs+=(
            -DGGML_HIP=ON
            "-DAMDGPU_TARGETS=$(get_amdgpu_flags)"
            -DGGML_HIP_ROCWMMA_FATTN=$(usex wmma)
        )
    fi

    cmake_src_configure
}

# Install the CMake-built tree, then add the pieces handled by hand:
# the rpc-server binary, the server configuration file, the systemd unit
# and the (kept-empty) model directory. Installed headers are dropped.
src_install() {
    cmake_src_install
    # rpc-server is installed explicitly from the build tree.
    dobin "${BUILD_DIR}/bin/rpc-server"

    # Remove installed headers to avoid clashing with whisper.cpp.
    # rm is an external command, so a failure must be made fatal explicitly.
    rm -rf "${ED}/usr/include" || die "failed to remove installed headers"

    # Systemd service unit and environment configuration
    insinto /etc/llama-cpp
    doins "${FILESDIR}"/llama-server.conf
    systemd_dounit "${FILESDIR}"/llama-server.service

    # State directory for model storage
    keepdir /var/lib/llama-cpp/models
}

# Make sure the model directory is kept, then hand the state directory and
# the model directory to llama-cpp:llama-cpp with mode 0750.
pkg_preinst() {
    local state_root=/var/lib/llama-cpp
    local model_store="${state_root}/models"

    keepdir "${model_store}"
    fowners llama-cpp:llama-cpp "${state_root}" "${model_store}"
    fperms 0750 "${state_root}" "${model_store}"
}

# Print post-install guidance: informational usage notes via elog,
# followed by warnings about the shipped default configuration via ewarn.
pkg_postinst() {
    local msg

    # How to get the systemd service running.
    local -a usage_notes=(
        "Installed binaries: llama-server, llama-cli, llama-quantize, rpc-server, ..."
        ""
        "Running as a systemd service:"
        "  1. Place a GGUF model in /var/lib/llama-cpp/models/"
        "  2. Edit /etc/llama-cpp/llama-server.conf (set LLAMA_MODEL, LLAMA_THREADS)"
        "  3. systemctl enable --now llama-server"
        "  The API is then available at http://\${LLAMA_HOST}:\${LLAMA_PORT} (OpenAI-compatible)"
        ""
    )
    # Settings that need adjusting before the service is useful.
    local -a default_warnings=(
        "Defaults in /etc/llama-cpp/llama-server.conf are CONSERVATIVE:"
        "  LLAMA_THREADS=2  -- adjust to your physical core count (not SMT threads)!"
        "  LLAMA_MODEL=...  -- must point to an actual GGUF file!"
        "Without adjustment the service runs on 2 threads or fails to find a model."
    )

    for msg in "${usage_notes[@]}"; do
        elog "${msg}"
    done
    for msg in "${default_warnings[@]}"; do
        ewarn "${msg}"
    done
}