# Copyright 2023-2025 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

# Autogenerated by pycargoebuild 0.13.3

EAPI=8

# Eclass control variables; they are set before the `inherit` line further
# down so that the eclasses see them.
# Build through the maturin PEP 517 backend.
DISTUTILS_USE_PEP517=maturin
PYTHON_COMPAT=( python3_{10..13} )
# The package ships a compiled extension module (see QA_FLAGS_IGNORED).
DISTUTILS_EXT=1
# Single-implementation mode; pkg_setup calls python-single-r1_pkg_setup.
DISTUTILS_SINGLE_IMPL=1
# Rust crates (name@version) needed by the build. The list is autogenerated
# by pycargoebuild (see the header of this file); regenerate it rather than
# editing by hand. The matching download URIs are pulled into SRC_URI via
# ${CARGO_CRATE_URIS}.
CRATES="
	aho-corasick@1.1.3
	autocfg@1.4.0
	base64@0.13.1
	bit-set@0.8.0
	bit-vec@0.8.0
	bitflags@1.3.2
	cc@1.2.24
	cfg-if@1.0.0
	console@0.15.11
	crossbeam-deque@0.8.6
	crossbeam-epoch@0.9.18
	crossbeam-utils@0.8.21
	darling@0.20.11
	darling_core@0.20.11
	darling_macro@0.20.11
	derive_builder@0.20.2
	derive_builder_core@0.20.2
	derive_builder_macro@0.20.2
	either@1.15.0
	esaxx-rs@0.1.10
	fancy-regex@0.14.0
	fnv@1.0.7
	getrandom@0.2.16
	heck@0.5.0
	ident_case@1.0.1
	indicatif@0.17.11
	indoc@2.0.6
	itertools@0.11.0
	itertools@0.12.1
	itertools@0.13.0
	itoa@1.0.15
	lazy_static@1.5.0
	libc@0.2.172
	log@0.4.27
	macro_rules_attribute-proc_macro@0.2.0
	macro_rules_attribute@0.2.0
	matrixmultiply@0.3.10
	memchr@2.7.4
	memoffset@0.9.1
	minimal-lexical@0.2.1
	monostate-impl@0.1.14
	monostate@0.1.14
	ndarray@0.16.1
	nom@7.1.3
	num-complex@0.4.6
	num-integer@0.1.46
	num-traits@0.2.19
	number_prefix@0.4.0
	numpy@0.25.0
	once_cell@1.21.3
	onig@6.4.0
	onig_sys@69.8.1
	paste@1.0.15
	pkg-config@0.3.32
	portable-atomic@1.11.0
	ppv-lite86@0.2.21
	proc-macro2@1.0.95
	pyo3-build-config@0.25.0
	pyo3-ffi@0.25.0
	pyo3-macros-backend@0.25.0
	pyo3-macros@0.25.0
	pyo3@0.25.0
	quote@1.0.40
	rand@0.8.5
	rand_chacha@0.3.1
	rand_core@0.6.4
	rawpointer@0.2.1
	rayon-cond@0.3.0
	rayon-core@1.12.1
	rayon@1.10.0
	regex-automata@0.4.9
	regex-syntax@0.8.5
	regex@1.11.1
	rustc-hash@2.1.1
	ryu@1.0.20
	serde@1.0.219
	serde_derive@1.0.219
	serde_json@1.0.140
	shlex@1.3.0
	smallvec@1.15.0
	spm_precompiled@0.1.4
	strsim@0.11.1
	syn@2.0.101
	target-lexicon@0.13.2
	thiserror-impl@2.0.12
	thiserror@2.0.12
	unicode-ident@1.0.18
	unicode-normalization-alignments@0.1.12
	unicode-segmentation@1.12.0
	unicode-width@0.2.0
	unicode_categories@0.1.1
	unindent@0.2.4
	zerocopy@0.8.25
"

inherit cargo distutils-r1

DESCRIPTION="Implementation of today's most used tokenizers"
HOMEPAGE="https://github.com/huggingface/tokenizers"
# Upstream tag tarball (saved under a distinct distfile name) plus the
# crate downloads generated from CRATES.
SRC_URI="
	https://github.com/huggingface/${PN}/archive/refs/tags/v${PV}.tar.gz
	-> ${P}.gh.tar.gz
	${CARGO_CRATE_URIS}
"

# The Python bindings live in a subdirectory of the upstream source tree;
# the build phases operate from there.
S="${WORKDIR}"/${P}/bindings/python

# License of the package itself.
LICENSE="Apache-2.0"
# Dependent crate licenses
LICENSE+="
	Apache-2.0 Apache-2.0-with-LLVM-exceptions BSD-2 BSD ISC MIT MPL-2.0
	Unicode-DFS-2016
"
SLOT="0"
KEYWORDS="~amd64"

# The build links against the system oniguruma (src_compile exports
# RUSTONIG_SYSTEM_LIBONIG=1), so the library is needed both at run time and,
# for its headers and link step, at build time.
RDEPEND="
	dev-libs/oniguruma
"
DEPEND="${RDEPEND}"
# NOTE(review): a dev-python/numpy[${PYTHON_USEDEP}] entry used to be sketched
# here in commented-out form for both RDEPEND and BDEPEND; confirm whether the
# bindings actually need it before re-adding it.
BDEPEND="
	test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
	$(python_gen_cond_dep '
		dev-python/setuptools-rust[${PYTHON_USEDEP}]
	')
"

# Wire up the pytest-based test phase; src_test below narrows what is run.
distutils_enable_tests pytest

# Regular expression of installed files to exempt from the package manager's
# "flags ignored" QA check; it matches the shared objects installed under
# site-packages/tokenizers/.
QA_FLAGS_IGNORED=".*/site-packages/tokenizers/.*so"

# Unpack phase: delegate entirely to the cargo eclass, which handles both
# the upstream tarball and the crates listed in CRATES.
src_unpack() {
	cargo_src_unpack
}

# Setup phase: both toolchains need their own setup hook, so call the
# Python (single-impl) one first and the Rust one second.
pkg_setup() {
	python-single-r1_pkg_setup
	rust_pkg_setup
}

# Prepare phase.
#
# Runs the standard distutils-r1 preparation, applies the local patches and
# then overwrites several Cargo.toml manifests (the tokenizers library, the
# Python bindings and a handful of vendored crates) with replacements shipped
# in FILESDIR, in order to swap out dependencies.
#
# Every copy is checked with `|| die` so a missing replacement manifest
# aborts the build instead of silently leaving the upstream file in place.
src_prepare() {
	local vendor_dir="${WORKDIR}/cargo_home/gentoo"
	local crate

	distutils-r1_src_prepare
	eapply "${FILESDIR}/${PN}-0.15.2-test.patch"

	# - replace dependencies.
	cp "${FILESDIR}/Cargo-${PVR}.toml" \
		"${WORKDIR}/${P}/${PN}/Cargo.toml" || die
	cp "${FILESDIR}/Cargo-${PN}-python-${PVR}.toml" \
		"${WORKDIR}/${P}/bindings/python/Cargo.toml" || die

	# Vendored crates that are part of CRATES: the manifest must be replaceable.
	# ${crate%-*} strips the trailing "-<version>" to get the crate name.
	for crate in console-0.15.11 zerocopy-0.8.25 getrandom-0.2.16; do
		cp "${FILESDIR}/Cargo-${crate%-*}-${PVR}.toml" \
			"${vendor_dir}/${crate}/Cargo.toml" || die
	done

	# NOTE(review): these crates are not listed in CRATES, so their vendor
	# directories are normally absent and the unchecked copies used to fail
	# silently. Only replace the manifest when the crate is really vendored;
	# drop the entries once confirmed obsolete.
	for crate in anstyle-query-1.1.2 jiff-0.2.14; do
		[[ -d ${vendor_dir}/${crate} ]] || continue
		cp "${FILESDIR}/Cargo-${crate%-*}-${PVR}.toml" \
			"${vendor_dir}/${crate}/Cargo.toml" || die
	done

	eapply "${FILESDIR}/env_logger_disable-${PVR}.patch"
	cp "${FILESDIR}/Cargo-ndarray-${PVR}.toml" \
		"${vendor_dir}/ndarray-0.16.1/Cargo.toml" || die
}

# Configure phase: run the cargo eclass configuration first, then the
# distutils-r1 one, both from the bindings directory (${S}).
src_configure() {
	cargo_src_configure
	distutils-r1_src_configure
}

# Per-implementation compile hook: build the Rust side with cargo before
# letting distutils-r1 perform its regular compile step.
python_compile() {
	cargo_src_compile
	distutils-r1_python_compile
}


# Compile phase: export the switch for using the system oniguruma
# (dev-libs/oniguruma from RDEPEND; presumably read by the onig_sys build
# script), then run the distutils-r1 compile phase.
src_compile() {
	export RUSTONIG_SYSTEM_LIBONIG=1
	distutils-r1_src_compile
}

# Test phase: run the Python test suite through distutils-r1, skipping the
# benchmark directory and a list of deselected tests.
#
# The Rust-side tests (cargo_src_test) are intentionally not run: they do
# not work.
#
# The EPYTEST_* arrays are plain locals: bash cannot export arrays, and the
# functions called from here see them through dynamic scoping anyway.
src_test() {
	local EPYTEST_IGNORE=( benches/ )
	local EPYTEST_DESELECT=(
		tests/bindings/test_encoding.py::TestEncoding::test_sequence_ids
		tests/bindings/test_encoding.py::TestEncoding::test_n_sequences
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_tokens
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_sequence
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_token
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_truncation
		tests/bindings/test_encoding.py::TestEncoding::test_invalid_truncate_direction
		tests/bindings/test_models.py::TestBPE::test_instantiate
		tests/bindings/test_models.py::TestWordLevel::test_instantiate
		tests/bindings/test_models.py::TestWordPiece::test_instantiate
		tests/bindings/test_processors.py::TestByteLevelProcessing::test_processing
		tests/bindings/test_trainers.py::TestUnigram::test_continuing_prefix_trainer_mismatch
		tests/bindings/test_trainers.py::TestUnigram::test_train
		tests/bindings/test_trainers.py::TestUnigram::test_train_parallelism_with_custom_pretokenizer
		tests/documentation/test_pipeline.py::TestPipeline::test_pipeline
		tests/documentation/test_pipeline.py::TestPipeline::test_bert_example
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_basic_encode
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_lowercase
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_decoding
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_multiprocessing_with_parallelism
		tests/test_serialization.py::TestSerialization::test_full_serialization_albert
		tests/test_serialization.py::TestSerialization::test_str_big
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_formats
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_add_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained_revision
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_splitting
		tests/documentation/test_quicktour.py::TestQuicktour::test_quicktour
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_datasets
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_gzip
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_basic_encode
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_multiprocessing_with_parallelism
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_basic_encode
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_add_prefix_space
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_lowerspace
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_multiprocessing_with_parallelism
	)
	distutils-r1_src_test
}

# Install phase: nothing package-specific, the distutils-r1 install phase
# does all the work.
src_install() {
	distutils-r1_src_install
}