mirror of
https://github.com/gentoo-mirror/guru.git
synced 2026-04-07 03:10:15 -04:00
108 lines
2.4 KiB
Bash
108 lines
2.4 KiB
Bash
# Copyright 2026 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

EAPI=8

# Settings consumed by the eclasses named on the inherit line further down;
# they are assigned before that line.
DISTUTILS_EXT=1
DISTUTILS_USE_PEP517=setuptools
PYTHON_COMPAT=( python3_{11..14} )

RUST_MIN_VER="1.85.0"

# Rust crates (name@version) needed to build the extension, one per line.
# Defined ahead of the inherit line so that ${CARGO_CRATE_URIS}, used in
# SRC_URI below, can be derived from it.
CRATES="
	aho-corasick@1.1.4
	autocfg@1.5.0
	bit-set@0.5.3
	bit-vec@0.6.3
	bstr@1.12.1
	fancy-regex@0.13.0
	heck@0.5.0
	indoc@2.0.7
	libc@0.2.183
	memchr@2.8.0
	memoffset@0.9.1
	once_cell@1.21.4
	portable-atomic@1.13.1
	proc-macro2@1.0.106
	pyo3-build-config@0.26.0
	pyo3-ffi@0.26.0
	pyo3-macros-backend@0.26.0
	pyo3-macros@0.26.0
	pyo3@0.26.0
	quote@1.0.45
	regex-automata@0.4.14
	regex-syntax@0.8.10
	regex@1.12.3
	rustc-hash@2.1.1
	rustversion@1.0.22
	serde@1.0.228
	serde_core@1.0.228
	serde_derive@1.0.228
	syn@2.0.117
	target-lexicon@0.13.5
	unicode-ident@1.0.24
	unindent@0.2.4
"

# Eclass list and order kept exactly as in the original.
inherit cargo distutils-r1 optfeature pypi

DESCRIPTION="A fast BPE tokeniser for use with OpenAI's models"
# Upstream repository and the PyPI project page.
HOMEPAGE="
	https://github.com/openai/tiktoken
	https://pypi.org/project/tiktoken/
"
# Release tag and download base of the pre-fetched encodings tarball.
TTE_TAG=2026.03.26.0
TTE_BASE_URI="https://github.com/falbrechtskirchinger/overlay-assets/releases/download"
# Appended (+=) to whatever SRC_URI already holds at this point.
# ${TTE_TAG%.*} drops the last dot-separated component of the tag, so the
# tarball name carries "2026.03.26" while the release path uses the full tag.
SRC_URI+="
	${CARGO_CRATE_URIS}
	test? (
		${TTE_BASE_URI}/v${TTE_TAG}/tiktoken-encodings-v${TTE_TAG%.*}.tar.xz
	)
"
# The encodings cache (tiktoken-encodings-*.tar.xz) holds files named after
# the SHA-1 of their URL. It can be generated from the source directory via:
#   grep -Eo 'https://openaipublic.blob[^"]+' tiktoken_ext/openai_public.py | \
#     sort -u | while read u; do h=$(echo -n "$u" | sha1sum | awk '{print $1}'); \
#     wget -O "$h" "$u" ; done
# Include the license file from the source repo:
# https://github.com/openai/tiktoken/issues/92

# License of the package itself.
LICENSE="MIT"
# Dependent crate licenses
LICENSE+=" Apache-2.0-with-LLVM-exceptions MIT Unicode-3.0"
SLOT="0"
KEYWORDS="~amd64 ~arm64"

# Runtime dependencies.
RDEPEND="
	dev-python/regex[${PYTHON_USEDEP}]
	dev-python/requests[${PYTHON_USEDEP}]
"
# Build-time dependencies; blobfile is only pulled in when USE=test is set.
BDEPEND="
	test? (
		dev-python/blobfile[${PYTHON_USEDEP}]
	)
"

PATCHES=(
	# test_encoding.py::test_hyp_roundtrip throws ValueError for special tokens
	# NOTE(review): the "roudtrip" spelling below is part of the file name and
	# has to match the patch shipped in files/ -- do not correct it here alone.
	"${FILESDIR}/tiktoken-0.12.0-special-token-roudtrip.patch"
)

# pytest plugins used by the test suite; the brace expression expands to
# pytest-asyncio and pytest-timeout.
EPYTEST_PLUGINS=(
	hypothesis
	pytest-{asyncio,timeout}
)
distutils_enable_tests pytest

# Test phase body: prepares the environment and runs the suite through epytest.
python_test() {
	# Put the scripts of the freshly built install tree first in PATH.
	# ${EPREFIX} is part of the path so it also resolves on prefixed installs,
	# where the tree is install${EPREFIX}/usr rather than install/usr; on a
	# regular system ${EPREFIX} is empty and the value is unchanged.
	local -x PATH="${BUILD_DIR}/install${EPREFIX}/usr/bin:${PATH}"
	# Cache directory handed to tiktoken through the environment.
	# NOTE(review): presumably populated from the test-only
	# tiktoken-encodings tarball listed in SRC_URI -- confirm.
	local -x TIKTOKEN_CACHE_DIR="${WORKDIR}/tiktoken-encodings"

	# Remove the in-tree package directory before the tests run.
	# NOTE(review): presumably so the suite imports the built package
	# (with its compiled extension) instead of the bare sources -- confirm.
	rm -rf tiktoken || die

	epytest
}

# Post-install hook: advertises the optional blobfile integration.
pkg_postinst() {
	# First argument is the feature description shown to the user, second is
	# the package that provides it.
	optfeature "reading GCS, ABS files" dev-python/blobfile
}