# Copyright 2022-2023 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

EAPI=8

PYTHON_COMPAT=( python3_{10..12} )
CUDA_TARGETS_COMPAT=( sm_50 sm_52 sm_53 sm_60 sm_61 sm_62 sm_70 sm_72 sm_75 sm_80 sm_86 sm_87 sm_89 sm_90 )
ROCM_VERSION="5.7.1"
AMDGPU_TARGETS_COMPAT=( gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 gfx1100 gfx1101 gfx1102 gfx1103 )

inherit python-single-r1 cmake cuda llvm rocm

DESCRIPTION="Cross-platform inference and training machine-learning accelerator"
HOMEPAGE="https://onnxruntime.ai"

SAFEINT_COMMIT=ff15c6ada150a5018c5ef2172401cb4529eac9c0
FLATBUFFERS_PV=1.12.0
DATE_PV=2.4.1
SRC_URI="
	https://github.com/microsoft/${PN}/archive/refs/tags/v${PV}.tar.gz -> ${P}.tar.gz
	https://github.com/dcleblanc/SafeInt/archive/${SAFEINT_COMMIT}.tar.gz -> SafeInt-${SAFEINT_COMMIT:0:10}.tar.gz
	https://github.com/google/flatbuffers/archive/v${FLATBUFFERS_PV}.tar.gz -> flatbuffers-${FLATBUFFERS_PV}.tar.gz
	https://github.com/HowardHinnant/date/archive/v${DATE_PV}.tar.gz -> hhdate-${DATE_PV}.tar.gz
"

LICENSE="MIT"
SLOT="0"
KEYWORDS="~amd64"
RESTRICT="mirror test"

CPU_FLAGS="cpu_flags_x86_avx cpu_flags_x86_avx2 cpu_flags_x86_avx512"
IUSE="benchmark cuda cudnn debug hip javascript llvm lto migraphx mimalloc +mpi onednn +python tensorrt test xnnpack
	${CPU_FLAGS}
	${CUDA_TARGETS_COMPAT[@]/#/cuda_targets_}
	${AMDGPU_TARGETS_COMPAT[@]/#/amdgpu_targets_}"

REQUIRED_USE="
	python? ( ${PYTHON_REQUIRED_USE} )
	cuda? ( cudnn )
	hip? ( migraphx )
	|| ( cudnn migraphx onednn tensorrt )
"

RDEPEND="
	dev-libs/protobuf:=
"
BDEPEND="
	${PYTHON_DEPS}
	app-admin/chrpath
	dev-cpp/eigen:=[cuda?]
	dev-cpp/ms-gsl:=
	dev-cpp/nlohmann_json
	dev-libs/FP16
	dev-libs/FXdiv
	dev-libs/cpuinfo:=
	dev-libs/date:=
	dev-libs/flatbuffers:=
	dev-libs/nsync:=
	dev-libs/protobuf:=
	dev-libs/re2
	dev-python/numpy
	sci-libs/onnx:=
	sci-libs/pytorch
	sys-cluster/openmpi:=[cuda?]
	benchmark? ( dev-cpp/benchmark )
	cuda? ( dev-util/nvidia-cuda-toolkit:= )
	cudnn? ( dev-libs/cudnn:= )
	javascript? ( dev-util/emscripten )
	migraphx? ( >=sci-libs/MIGraphX-${ROCM_VERSION}:= )
	onednn? ( dev-libs/oneDNN:= )
	hip? (
		sci-libs/hipFFT:=
		sci-libs/hipCUB:=
		>=dev-libs/rocr-runtime-${ROCM_VERSION}:=
		>=dev-util/hip-${ROCM_VERSION}:=
	)
	xnnpack? ( sci-libs/XNNPACK )
	tensorrt? ( sci-libs/tensorboard:= )
	python? (
		$(python_gen_cond_dep '
			dev-python/cerberus[${PYTHON_USEDEP}]
			dev-python/coloredlogs[${PYTHON_USEDEP}]
			dev-python/flatbuffers[${PYTHON_USEDEP}]
			dev-python/h5py[${PYTHON_USEDEP}]
			dev-python/numpy[${PYTHON_USEDEP}]
			dev-python/psutil[${PYTHON_USEDEP}]
			dev-python/py-cpuinfo[${PYTHON_USEDEP}]
			dev-python/sympy[${PYTHON_USEDEP}]
		')
	)
"

PATCHES=(
	"${FILESDIR}/${PN}-system-dnnl.patch"
	"${FILESDIR}/re2-pkg-config-r1.patch"
	"${FILESDIR}/system-onnx-r1.patch"
	"${FILESDIR}/system-cpuinfo.patch"
	"${FILESDIR}/system-nsync.patch"
	"${FILESDIR}/system-composable_kernel.patch"
	"${FILESDIR}/system-protobuf.patch"
	"${FILESDIR}/system-mp11.patch"
	"${FILESDIR}/system-gsl.patch"
	"${FILESDIR}/rocm-version-override-r2.patch"
	"${FILESDIR}/hip-gentoo.patch"
	"${FILESDIR}/shared-build-fix.patch"
	"${FILESDIR}/hip-libdir.patch"
)

pkg_setup() {
	use python && python-single-r1_pkg_setup
}

src_prepare() {
	CMAKE_USE_DIR="${S}/cmake"

	use python && python_setup
	use cuda && cuda_src_prepare

	# Workaround for binary drivers.
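	# addpredict makes the Portage sandbox silently tolerate write attempts to
	# these device nodes instead of aborting the build with an access violation;
	# the proprietary GPU userspace may probe them while the toolchain runs.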
	addpredict /dev/ati
	addpredict /dev/dri
	addpredict /dev/nvidiactl

	# fix build with gcc12(?), take idea from
	# https://github.com/microsoft/onnxruntime/pull/11667 and
	# https://github.com/microsoft/onnxruntime/pull/10014
	sed 's|dims)|TensorShape(dims))|g' \
		-i onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_qdq.cc || die "Sed failed"

	# fix missing #include
	sed '11a#include ' -i orttraining/orttraining/test/training_api/trainer/trainer.cc || die "Sed failed"

	sed 's/\"-mavx512f\"/\"-mavx512f -Wno-error\"/g' -i cmake/onnxruntime_mlas.cmake || die "Sed failed"

	if use tensorrt; then
		# TensorRT 8.6 EA
		eapply "${FILESDIR}/15089.diff"

		# Update Tensorboard to 00d59e65d866a6d4b9fe855dce81ee6ba8b40c4f
		sed -e 's|373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81|00d59e65d866a6d4b9fe855dce81ee6ba8b40c4f|g' \
			-e 's|67b833913605a4f3f499894ab11528a702c2b381|ff427b6a135344d86b65fa2928fbd29886eefaec|g' \
			-i cmake/deps.txt || die "Sed failed"

		# Update onnx_tensorrt to 6872a9473391a73b96741711d52b98c2c3e25146
		sed -e 's|369d6676423c2a6dbf4a5665c4b5010240d99d3c|6872a9473391a73b96741711d52b98c2c3e25146|g' \
			-e 's|62119892edfb78689061790140c439b111491275|75462057c95f7fdbc256179f0a0e9e4b7be28ae3|g' \
			-i cmake/deps.txt || die "Sed failed"
	fi

	strip-unsupported-flags
	append-cppflags "-I/usr/include/eigen3"
	append-flags "-Wno-error=deprecated-declarations"
	#append-flags "-Wno-error=instantiation-after-specialization" "-Wno-error=shorten-64-to-32" "-Wno-error=unused-private-field"

	cmake_src_prepare
}

src_configure() {
	export ROCM_PATH=/usr MIOPEN_PATH=/usr
	export ROCM_VERSION="${ROCM_VERSION}"

	use python && python_setup

	CMAKE_BUILD_TYPE=$(usex debug RelWithDebInfo Release)
	CMAKE_INSTALL_PREFIX="${EPREFIX}/usr"
	CMAKE_TLS_VERIFY=ON
	PYTHON_EXECUTABLE="/usr/bin/${EPYTHON}"
	PYTHON_INCLUDE_DIR="$(python_get_includedir)"
	PYTHON_LIBRARY="$(python_get_library_path)"

	append-cxxflags -Wno-dangling-reference -Wno-c++20-compat

	local mycmakeargs=(
		-DCMAKE_INSTALL_INCLUDEDIR="include/${PN}"
		-Donnxruntime_REQUIRE_PYTHON_EMBED_LIB=OFF
		-Donnxruntime_USE_FULL_PROTOBUF=OFF
		-Donnxruntime_ENABLE_LANGUAGE_INTEROP_OPS=OFF
		-Donnxruntime_BUILD_SHARED_LIB=ON
		-Donnxruntime_ENABLE_PYTHON=$(usex python)
		-Donnxruntime_BUILD_BENCHMARKS=$(usex benchmark)
		-Donnxruntime_BUILD_UNIT_TESTS=$(usex test)
		-Donnxruntime_RUN_ONNX_TESTS=$(usex test)
		-Donnxruntime_ENABLE_LAZY_TENSOR=OFF
		-Donnxruntime_USE_MPI=$(usex mpi)
		-Donnxruntime_USE_PREINSTALLED_EIGEN=ON
		-Donnxruntime_USE_DNNL=$(usex onednn)
		-Donnxruntime_USE_CUDA=$(usex cuda)
		-Donnxruntime_USE_ROCM=$(usex hip)
		-Donnxruntime_USE_AVX=$(usex cpu_flags_x86_avx)
		-Donnxruntime_USE_AVX2=$(usex cpu_flags_x86_avx2)
		-Donnxruntime_USE_AVX512=$(usex cpu_flags_x86_avx512)
		-Donnxruntime_USE_MIMALLOC=$(usex mimalloc)
		-Donnxruntime_USE_XNNPACK=$(usex xnnpack)
		-Donnxruntime_USE_NCCL=OFF # Multi GPU CUDA
		-Donnxruntime_ENABLE_LTO=$(usex lto)
		-Donnxruntime_CUDA_HOME=/opt/cuda
		-Donnxruntime_CUDNN_HOME=/usr
		-Deigen_SOURCE_PATH=/usr/include/eigen3
		-DFETCHCONTENT_TRY_FIND_PACKAGE_MODE=ALWAYS
		-DFETCHCONTENT_FULLY_DISCONNECTED=ON
		-DFETCHCONTENT_QUIET=OFF
		-DFETCHCONTENT_SOURCE_DIR_SAFEINT="${WORKDIR}/SafeInt-${SAFEINT_COMMIT}"
		-DFETCHCONTENT_SOURCE_DIR_FLATBUFFERS="${WORKDIR}/flatbuffers-${FLATBUFFERS_PV}"
		-DFETCHCONTENT_SOURCE_DIR_DATE="${WORKDIR}/date-${DATE_PV}"
		-Donnxruntime_USE_TENSORRT=$(usex tensorrt)
		-Donnxruntime_USE_JSEP=$(usex javascript)
		-Donnxruntime_ENABLE_MEMORY_PROFILE=OFF
		-Donnxruntime_DISABLE_ABSEIL=ON
		-Donnxruntime_BUILD_FOR_NATIVE_MACHINE=ON
		-Donnxruntime_BUILD_CSHARP=OFF
		-Donnxruntime_BUILD_JAVA=OFF
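		# Most of the remaining switches hard-disable bindings, vendor
		# execution providers (NNAPI, RKNPU, VitisAI, DML, ACL, ArmNN, CANN),
		# TVM, training and WebAssembly/minimal-build modes that this ebuild
		# does not package; llvm and migraphx follow their USE flags.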
		-Donnxruntime_BUILD_NODEJS=OFF
		-Donnxruntime_BUILD_OBJC=OFF
		-Donnxruntime_BUILD_APPLE_FRAMEWORK=OFF
		-Donnxruntime_USE_NNAPI_BUILTIN=OFF
		-Donnxruntime_USE_RKNPU=OFF
		-Donnxruntime_USE_LLVM=$(usex llvm)
		-Donnxruntime_ENABLE_MICROSOFT_INTERNAL=OFF
		-Donnxruntime_USE_VITISAI=OFF
		-Donnxruntime_USE_TENSORRT_BUILTIN_PARSER=OFF
		-Donnxruntime_USE_TVM=OFF
		-Donnxruntime_TVM_CUDA_RUNTIME=OFF
		-Donnxruntime_TVM_USE_HASH=OFF
		-Donnxruntime_USE_MIGRAPHX=$(usex migraphx)
		-Donnxruntime_CROSS_COMPILING=$(tc-is-cross-compiler && echo ON || echo OFF)
		-Donnxruntime_DISABLE_CONTRIB_OPS=ON
		-Donnxruntime_DISABLE_ML_OPS=ON
		-Donnxruntime_DISABLE_RTTI=OFF
		-Donnxruntime_DISABLE_EXCEPTIONS=$(usex !debug)
		-Donnxruntime_MINIMAL_BUILD=OFF
		-Donnxruntime_EXTENDED_MINIMAL_BUILD=OFF
		-Donnxruntime_MINIMAL_BUILD_CUSTOM_OPS=OFF
		-Donnxruntime_REDUCED_OPS_BUILD=OFF
		-Donnxruntime_USE_DML=OFF
		-Donnxruntime_USE_WINML=OFF
		-Donnxruntime_BUILD_MS_EXPERIMENTAL_OPS=OFF
		-Donnxruntime_USE_TELEMETRY=OFF
		-Donnxruntime_USE_ACL=OFF
		-Donnxruntime_USE_ACL_1902=OFF
		-Donnxruntime_USE_ACL_1905=OFF
		-Donnxruntime_USE_ACL_1908=OFF
		-Donnxruntime_USE_ACL_2002=OFF
		-Donnxruntime_USE_ARMNN=OFF
		-Donnxruntime_ARMNN_RELU_USE_CPU=ON
		-Donnxruntime_ARMNN_BN_USE_CPU=ON
		-Donnxruntime_ENABLE_NVTX_PROFILE=OFF
		-Donnxruntime_ENABLE_TRAINING=OFF
		-Donnxruntime_ENABLE_TRAINING_OPS=OFF
		-Donnxruntime_ENABLE_TRAINING_APIS=OFF
		-Donnxruntime_ENABLE_CPU_FP16_OPS=OFF
		-DOnnxruntime_GCOV_COVERAGE=OFF
		-Donnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO=OFF
		-Donnxruntime_BUILD_WEBASSEMBLY_STATIC_LIB=OFF
		-Donnxruntime_ENABLE_WEBASSEMBLY_EXCEPTION_CATCHING=ON
		-Donnxruntime_ENABLE_WEBASSEMBLY_API_EXCEPTION_CATCHING=OFF
		-Donnxruntime_ENABLE_WEBASSEMBLY_EXCEPTION_THROWING=ON
		-Donnxruntime_WEBASSEMBLY_RUN_TESTS_IN_BROWSER=OFF
		-Donnxruntime_ENABLE_WEBASSEMBLY_THREADS=OFF
		-Donnxruntime_ENABLE_WEBASSEMBLY_DEBUG_INFO=OFF
		-Donnxruntime_ENABLE_WEBASSEMBLY_PROFILING=OFF
		-Donnxruntime_ENABLE_EAGER_MODE=OFF
		-Donnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS=OFF
		-Donnxruntime_ENABLE_CUDA_PROFILING=OFF
		-Donnxruntime_ENABLE_ROCM_PROFILING=OFF
		-Donnxruntime_USE_CANN=OFF
		-Donnxruntime_PYBIND_EXPORT_OPSCHEMA=OFF
		-Donnxruntime_ENABLE_MEMLEAK_CHECKER=ON
	)

	if use cuda; then
		local CUDA_ARCH
		for CA in ${CUDA_TARGETS_COMPAT[*]}; do
			use ${CA/#/cuda_targets_} && CUDA_ARCH+="${CA#sm_*}-real;"
		done
		mycmakeargs+=(
			-DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCH%%;}"
			-DCMAKE_CUDA_HOST_COMPILER="$(cuda_gccdir)"
			-DCMAKE_CUDA_FLAGS="-forward-unknown-opts -fno-lto ${NVCCFLAGS}"
			-DCMAKE_CUDA_STANDARD_REQUIRED=ON
			-DCMAKE_CXX_STANDARD_REQUIRED=ON
			-Donnxruntime_NVCC_THREADS=1
		)
	fi

	if use hip; then
		mycmakeargs+=(
			-DCMAKE_HIP_COMPILER="$(get_llvm_prefix)/bin/clang++"
			-DCMAKE_HIP_ARCHITECTURES="$(get_amdgpu_flags)"
		)
	fi

	cmake_src_configure
}

src_compile() {
	cmake_src_compile
}

src_install() {
	cmake_src_install

	if use python; then
		# python-single-r1 provides no src_compile/src_install phase helpers,
		# so install the Python package that the CMake build stages in the
		# build directory. NOTE: the staging path below is an assumption and
		# may need adjusting to match the actual build layout.
		python_domodule "${BUILD_DIR}/onnxruntime"
		python_optimize
	fi
}