# Copyright 2020-2022 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

EAPI=8

inherit check-reqs

DESCRIPTION="Data files for NLTK"
HOMEPAGE="https://www.nltk.org/nltk_data/"

# at least some of the files have poorly documented licenses
# https://github.com/nltk/nltk_data/issues/102
# TODO: create a USE flag for free-ish subset
LICENSE="all-rights-reserved"
SLOT="0"
KEYWORDS="amd64 ~ppc64 ~riscv x86"
RESTRICT="bindist mirror"

BDEPEND="app-arch/unzip"

# https://github.com/nltk/nltk_data/commits/gh-pages

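# Packages listed upstream with unzip=0 stay as .zip archives (NLTK is
# expected to read these in place); packages with unzip=1 are extracted
# and installed as plain directories.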
PACKAGES_ZIP=(
	# wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=0]' -v @subdir -o "/" -v @id -n - | sort
	corpora/bcp47
	corpora/comtrans
	corpora/conll2007
	corpora/extended_omw
	corpora/jeita
	corpora/knbc
	corpora/machado
	corpora/masc_tagged
	corpora/nombank.1.0
	corpora/omw
	corpora/omw-1.4
	corpora/panlex_swadesh
	corpora/propbank
	corpora/reuters
	corpora/semcor
	corpora/universal_treebanks_v20
	corpora/wordnet
	corpora/wordnet2021
	corpora/wordnet31
	sentiment/vader_lexicon
	stemmers/snowball_data
)

PACKAGES_UNPACK=(
	# wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=1]' -v @subdir -o "/" -v @id -n - | sort
	chunkers/maxent_ne_chunker
	corpora/abc
	corpora/alpino
	corpora/biocreative_ppi
	corpora/brown
	corpora/brown_tei
	corpora/cess_cat
	corpora/cess_esp
	corpora/chat80
	corpora/city_database
	corpora/cmudict
	corpora/comparative_sentences
	corpora/conll2000
	corpora/conll2002
	corpora/crubadan
	corpora/dependency_treebank
	corpora/dolch
	corpora/europarl_raw
	corpora/floresta
	corpora/framenet_v15
	corpora/framenet_v17
	corpora/gazetteers
	corpora/genesis
	corpora/gutenberg
	corpora/ieer
	corpora/inaugural
	corpora/indian
	corpora/kimmo
	corpora/lin_thesaurus
	corpora/mac_morpho
	corpora/movie_reviews
	corpora/mte_teip5
	corpora/names
	corpora/nonbreaking_prefixes
	corpora/nps_chat
	corpora/opinion_lexicon
	corpora/paradigms
	corpora/pe08
	corpora/pil
	corpora/pl196x
	corpora/ppattach
	corpora/problem_reports
	corpora/product_reviews_1
	corpora/product_reviews_2
	corpora/pros_cons
	corpora/ptb
	corpora/qc
	corpora/rte
	corpora/senseval
	corpora/sentence_polarity
	corpora/sentiwordnet
	corpora/shakespeare
	corpora/sinica_treebank
	corpora/smultron
	corpora/state_union
	corpora/stopwords
	corpora/subjectivity
	corpora/swadesh
	corpora/switchboard
	corpora/timit
	corpora/toolbox
	corpora/treebank
	corpora/twitter_samples
	corpora/udhr
	corpora/udhr2
	corpora/unicode_samples
	corpora/verbnet
	corpora/verbnet3
	corpora/webtext
	corpora/wordnet2022
	corpora/wordnet_ic
	corpora/words
	corpora/ycoe
	grammars/basque_grammars
	grammars/book_grammars
	grammars/large_grammars
	grammars/sample_grammars
	grammars/spanish_grammars
	help/tagsets
	misc/mwa_ppdb
	misc/perluniprops
	models/bllip_wsj_no_aux
	models/moses_sample
	models/wmt15_eval
	models/word2vec_sample
	stemmers/porter_test
	stemmers/rslp
	taggers/averaged_perceptron_tagger
	taggers/averaged_perceptron_tagger_ru
	taggers/maxent_treebank_pos_tagger
	taggers/universal_tagset
	tokenizers/punkt
)

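# add_data <subdir/package>...
# Append a SRC_URI entry for every given data package, renaming each
# distfile to nltk-<package>-${PV}.zip.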
add_data() {
	local data

	for data; do
		SRC_URI+="
			https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/${data}.zip
				-> nltk-${data#*/}-${PV}.zip"
	done
}

add_data "${PACKAGES_ZIP[@]}" "${PACKAGES_UNPACK[@]}"

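# The corpora are large; require enough free space both in the build
# directory and under /usr.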
CHECKREQS_DISK_USR=3G
CHECKREQS_DISK_BUILD=${CHECKREQS_DISK_USR}

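# unpack_data <subdir/package>...
# Extract each listed archive into its category directory under ${S}.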
unpack_data() {
	local data

	for data; do
		local cat=${data%/*}
		local pkg=${data#*/}

		mkdir -p "${S}/${cat}" || die
		cd "${S}/${cat}" || die
		unpack "nltk-${pkg}-${PV}.zip"
	done
}

src_unpack() {
	unpack_data "${PACKAGES_UNPACK[@]}"
}

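# install_zips <subdir/package>...
# Install the fetched archives unmodified as <package>.zip under
# /usr/share/nltk_data/<category>.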
install_zips() {
	local data

	for data; do
		local cat=${data%/*}
		local pkg=${data#*/}

		insinto "/usr/share/nltk_data/${cat}"
		newins "${DISTDIR}/nltk-${pkg}-${PV}.zip" "${pkg}.zip"
	done
}

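# Move the unpacked packages into place, then install the ones that
# stay zipped.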
src_install() {
	dodir /usr/share/nltk_data
	mv * "${ED}/usr/share/nltk_data/" || die

	install_zips "${PACKAGES_ZIP[@]}"
}