# Copyright 2023-2025 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

# Autogenerated by pycargoebuild 0.13.3

EAPI=8

DISTUTILS_USE_PEP517=maturin
PYTHON_COMPAT=( python3_{10..13} )
DISTUTILS_EXT=1
DISTUTILS_SINGLE_IMPL=1

CRATES="
	aho-corasick@1.1.3
	autocfg@1.4.0
	base64@0.13.1
	bit-set@0.8.0
	bit-vec@0.8.0
	bitflags@1.3.2
	cc@1.2.24
	cfg-if@1.0.0
	console@0.15.11
	crossbeam-deque@0.8.6
	crossbeam-epoch@0.9.18
	crossbeam-utils@0.8.21
	darling@0.20.11
	darling_core@0.20.11
	darling_macro@0.20.11
	derive_builder@0.20.2
	derive_builder_core@0.20.2
	derive_builder_macro@0.20.2
	either@1.15.0
	esaxx-rs@0.1.10
	fancy-regex@0.14.0
	fnv@1.0.7
	getrandom@0.2.16
	heck@0.5.0
	ident_case@1.0.1
	indicatif@0.17.11
	indoc@2.0.6
	itertools@0.11.0
	itertools@0.12.1
	itertools@0.13.0
	itoa@1.0.15
	lazy_static@1.5.0
	libc@0.2.172
	log@0.4.27
	macro_rules_attribute-proc_macro@0.2.0
	macro_rules_attribute@0.2.0
	matrixmultiply@0.3.10
	memchr@2.7.4
	memoffset@0.9.1
	minimal-lexical@0.2.1
	monostate-impl@0.1.14
	monostate@0.1.14
	ndarray@0.16.1
	nom@7.1.3
	num-complex@0.4.6
	num-integer@0.1.46
	num-traits@0.2.19
	number_prefix@0.4.0
	numpy@0.25.0
	once_cell@1.21.3
	onig@6.4.0
	onig_sys@69.8.1
	paste@1.0.15
	pkg-config@0.3.32
	portable-atomic@1.11.0
	ppv-lite86@0.2.21
	proc-macro2@1.0.95
	pyo3-build-config@0.25.0
	pyo3-ffi@0.25.0
	pyo3-macros-backend@0.25.0
	pyo3-macros@0.25.0
	pyo3@0.25.0
	quote@1.0.40
	rand@0.8.5
	rand_chacha@0.3.1
	rand_core@0.6.4
	rawpointer@0.2.1
	rayon-cond@0.3.0
	rayon-core@1.12.1
	rayon@1.10.0
	regex-automata@0.4.9
	regex-syntax@0.8.5
	regex@1.11.1
	rustc-hash@2.1.1
	ryu@1.0.20
	serde@1.0.219
	serde_derive@1.0.219
	serde_json@1.0.140
	shlex@1.3.0
	smallvec@1.15.0
	spm_precompiled@0.1.4
	strsim@0.11.1
	syn@2.0.101
	target-lexicon@0.13.2
	thiserror-impl@2.0.12
	thiserror@2.0.12
	unicode-ident@1.0.18
	unicode-normalization-alignments@0.1.12
	unicode-segmentation@1.12.0
	unicode-width@0.2.0
	unicode_categories@0.1.1
	unindent@0.2.4
	zerocopy@0.8.25
"

inherit cargo distutils-r1

DESCRIPTION="Implementation of today's most used tokenizers"
HOMEPAGE="https://github.com/huggingface/tokenizers"
SRC_URI="
	https://github.com/huggingface/${PN}/archive/refs/tags/v${PV}.tar.gz
		-> ${P}.gh.tar.gz
	${CARGO_CRATE_URIS}
"
# Build from the Python bindings subdirectory of the unpacked source tree.
S="${WORKDIR}"/${P}/bindings/python

LICENSE="Apache-2.0"
# Dependent crate licenses
LICENSE+="
	Apache-2.0 Apache-2.0-with-LLVM-exceptions BSD-2 BSD ISC MIT MPL-2.0
	Unicode-DFS-2016
"
SLOT="0"
KEYWORDS="~amd64"

# src_compile exports RUSTONIG_SYSTEM_LIBONIG=1, so the extension links against
# the system oniguruma: it is needed at build time as well as at runtime.
# NOTE: a dev-python/numpy[${PYTHON_USEDEP}] dependency is currently disabled.
RDEPEND="
	dev-libs/oniguruma:=
"
DEPEND="${RDEPEND}"
BDEPEND="
	virtual/pkgconfig
	test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
	$(python_gen_cond_dep '
		dev-python/setuptools-rust[${PYTHON_USEDEP}]
	')
"

distutils_enable_tests pytest

QA_FLAGS_IGNORED=".*/site-packages/tokenizers/.*so"

# Unpack the source tarball and the vendored crates via the cargo eclass.
src_unpack() {
	cargo_src_unpack
}

# Both inherited eclass families provide pkg_setup; run both explicitly.
pkg_setup() {
	python-single-r1_pkg_setup
	rust_pkg_setup
}

# Apply patches and replace Cargo manifests with the copies kept in FILESDIR.
src_prepare() {
	distutils-r1_src_prepare

	eapply "${FILESDIR}"/${PN}-0.15.2-test.patch

	# Replace the manifests of the core library and of the Python bindings.
	cp "${FILESDIR}/Cargo-${PVR}.toml" \
		"${WORKDIR}/${P}/${PN}/Cargo.toml" || die
	cp "${FILESDIR}/Cargo-${PN}-python-${PVR}.toml" \
		"${WORKDIR}/${P}/bindings/python/Cargo.toml" || die

	# Replace the manifests of selected vendored crates.  Entries that are not
	# part of CRATES are never unpacked; skip those instead of failing, but
	# die on any real copy error.
	local vendor_dir="${WORKDIR}/cargo_home/gentoo"
	local crate_dir crate_name
	for crate_dir in \
		console-0.15.11 \
		zerocopy-0.8.25 \
		getrandom-0.2.16 \
		anstyle-query-1.1.2 \
		jiff-0.2.14 \
		ndarray-0.16.1
	do
		# Strip the trailing "-<version>" to get the crate name.
		crate_name=${crate_dir%-*}
		if [[ ! -d ${vendor_dir}/${crate_dir} ]]; then
			einfo "Crate ${crate_dir} is not vendored, skipping manifest override"
			continue
		fi
		cp "${FILESDIR}/Cargo-${crate_name}-${PVR}.toml" \
			"${vendor_dir}/${crate_dir}/Cargo.toml" || die
	done

	eapply "${FILESDIR}/env_logger_disable-${PVR}.patch"
}

# Configure cargo first, then the Python build.
src_configure() {
	cargo_src_configure
	distutils-r1_src_configure
}

# Per-implementation compile step: cargo build followed by the PEP 517 build.
python_compile() {
	cargo_src_compile
	distutils-r1_python_compile
}

src_compile() {
	# Make the onig crates use the system library instead of a bundled copy.
	export RUSTONIG_SYSTEM_LIBONIG=1
	distutils-r1_src_compile
}

# Run the Python test suite only; the Rust tests (cargo_src_test) do not work.
src_test() {
	local EPYTEST_IGNORE=(
		benches/
	)
	# NOTE(review): presumably these need network access or data files that
	# are not available during the build -- confirm before re-enabling any.
	local EPYTEST_DESELECT=(
		tests/bindings/test_encoding.py::TestEncoding::test_sequence_ids
		tests/bindings/test_encoding.py::TestEncoding::test_n_sequences
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_tokens
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_sequence
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_token
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_truncation
		tests/bindings/test_encoding.py::TestEncoding::test_invalid_truncate_direction
		tests/bindings/test_models.py::TestBPE::test_instantiate
		tests/bindings/test_models.py::TestWordLevel::test_instantiate
		tests/bindings/test_models.py::TestWordPiece::test_instantiate
		tests/bindings/test_processors.py::TestByteLevelProcessing::test_processing
		tests/bindings/test_trainers.py::TestUnigram::test_continuing_prefix_trainer_mismatch
		tests/bindings/test_trainers.py::TestUnigram::test_train
		tests/bindings/test_trainers.py::TestUnigram::test_train_parallelism_with_custom_pretokenizer
		tests/documentation/test_pipeline.py::TestPipeline::test_pipeline
		tests/documentation/test_pipeline.py::TestPipeline::test_bert_example
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_basic_encode
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_lowercase
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_decoding
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_multiprocessing_with_parallelism
		tests/test_serialization.py::TestSerialization::test_full_serialization_albert
		tests/test_serialization.py::TestSerialization::test_str_big
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_formats
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_add_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained_revision
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_splitting
		tests/documentation/test_quicktour.py::TestQuicktour::test_quicktour
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_datasets
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_gzip
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_basic_encode
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_multiprocessing_with_parallelism
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_basic_encode
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_add_prefix_space
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_lowerspace
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_multiprocessing_with_parallelism
	)
	distutils-r1_src_test
}

# Install through distutils-r1 rather than the cargo eclass.
src_install() {
	distutils-r1_src_install
}