python3Packages.tokenizers: 0.8.1 -> 0.9.2

Changelog:

https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.0
https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.1
https://github.com/huggingface/tokenizers/releases/tag/python-v0.9.2

Changes in the derivation:

* Tokenizers uses a newer version of pyo3 that no longer requires Rust
  nightly, so the workarounds for it (setting `RUSTC_BOOTSTRAP` and
  patching out pyo3's rustc version check) are no longer needed.
* Tokenizers is now a mixed Rust/Python project. The way it is set
  up does not work with Maturin, so switch to setuptools-rust instead.
* Add additional data files needed for tests.
* Use `pytestCheckHook`.
This commit is contained in:
Daniël de Kok 2020-10-16 11:40:35 +02:00 committed by Jonathan Ringer
parent 93b3fa6088
commit d2e918cc12
2 changed files with 25 additions and 112 deletions

View File

@ -2,10 +2,12 @@
, rustPlatform
, fetchFromGitHub
, fetchurl
, maturin
, pipInstallHook
, pytest
, setuptools-rust
, wheel
, numpy
, python
, pytestCheckHook
, requests
}:
@ -18,10 +20,18 @@ let
url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
};
albertVocab = fetchurl {
url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
};
bertVocab = fetchurl {
url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
};
norvigBig = fetchurl {
url = "https://norvig.com/big.txt";
sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
};
openaiVocab = fetchurl {
url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
@ -32,38 +42,34 @@ let
};
in rustPlatform.buildRustPackage rec {
pname = "tokenizers";
version = "0.8.1";
version = "0.9.2";
src = fetchFromGitHub {
owner = "huggingface";
repo = pname;
rev = "python-v${version}";
sha256 = "0sxdwx05hr87j2z32rk4rgwn6a26w9r7m5fgj6ah1sgagiiyxbjw";
sha256 = "0rsm1g5zfq3ygdb3s8v9xqqpgfzvvkc4n5ik3ahy8sw7pyjljb4m";
};
# Update parking_lot to be compatible with recent Rust versions, that
# replace asm! by llvm_asm!:
#
# https://github.com/Amanieu/parking_lot/pull/223
#
# Remove once upstream updates this dependency.
cargoPatches = [ ./update-parking-lot.diff ];
cargoSha256 = "0cdkxmj8z2wdspn6r62lqlpvd0sj1z0cmb1zpqaajxvr0b2kjlj8";
cargoSha256 = "0yn699dq9hdjh7fyci99ni8mmd5qdhzrsi80grzgf5cch8g38rbi";
sourceRoot = "source/bindings/python";
nativeBuildInputs = [
maturin
pipInstallHook
setuptools-rust
wheel
];
propagatedBuildInputs = [
numpy
python
];
# tokenizers uses pyo3, which requires Rust nightly.
RUSTC_BOOTSTRAP = 1;
installCheckInputs = [
pytestCheckHook
requests
];
doCheck = false;
doInstallCheck = true;
@ -74,51 +80,21 @@ in rustPlatform.buildRustPackage rec {
( cd $sourceRoot/tests/data
ln -s ${robertaVocab} roberta-base-vocab.json
ln -s ${robertaMerges} roberta-base-merges.txt
ln -s ${albertVocab} albert-base-v1-tokenizer.json
ln -s ${bertVocab} bert-base-uncased-vocab.txt
ln -s ${norvigBig} big.txt
ln -s ${openaiVocab} openai-gpt-vocab.json
ln -s ${openaiMerges} openai-gpt-merges.txt )
'';
postPatch = ''
# pyo3's build check verifies that Rust is a nightly
# version. Disable this check.
substituteInPlace $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/build.rs \
--replace "check_rustc_version()?;" ""
# Patching the vendored dependency invalidates the file
# checksums, so remove them. This should be safe, since
# this is just a copy of the vendored dependencies and
# the integrity of the vendored dependencies is validated
# by cargoSha256.
sed -r -i 's|"files":\{[^}]+\}|"files":{}|' \
$NIX_BUILD_TOP/$cargoDepsCopy/pyo3/.cargo-checksum.json
# Maturin uses the crate name as the wheel name.
substituteInPlace Cargo.toml \
--replace "tokenizers-python" "tokenizers"
'';
buildPhase = ''
maturin build --release --manylinux off
${python.interpreter} setup.py bdist_wheel
'';
installPhase = ''
# Put the wheels where the pip install hook can find them.
install -Dm644 -t dist target/wheels/*.whl
pipInstallPhase
'';
installCheckInputs = [
pytest
requests
];
installCheckPhase = ''
# Append paths, or the binding's tokenizer module will be
# used, since the test directories have __init__.py
pytest --import-mode=append
'';
meta = with stdenv.lib; {
homepage = "https://github.com/huggingface/tokenizers";
description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";

View File

@ -1,63 +0,0 @@
diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock
index f50db71..ea71817 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -269,7 +269,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "lock_api"
-version = "0.3.3"
+version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"scopeguard 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -337,16 +337,16 @@ dependencies = [
[[package]]
name = "parking_lot"
-version = "0.10.0"
+version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
- "lock_api 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
- "parking_lot_core 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "lock_api 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
+ "parking_lot_core 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "parking_lot_core"
-version = "0.7.0"
+version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -409,7 +409,7 @@ dependencies = [
"inventory 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
"num-traits 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
- "parking_lot 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "parking_lot 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)",
"paste 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
"pyo3cls 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -768,7 +768,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum itoa 0.4.5 (registry+https://github.com/rust-lang/crates.io-index)" = "b8b7a7c0c47db5545ed3fef7468ee7bb5b74691498139e4b3f6a20685dc6dd8e"
"checksum lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
"checksum libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)" = "dea0c0405123bba743ee3f91f49b1c7cfb684eef0da0a50110f758ccf24cdff0"
-"checksum lock_api 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "79b2de95ecb4691949fea4716ca53cdbcfccb2c612e19644a8bad05edcf9f47b"
+"checksum lock_api 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "c4da24a77a3d8a6d4862d95f72e6fdb9c09a643ecdb402d754004a557f2bec75"
"checksum maybe-uninit 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
"checksum memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400"
"checksum memoffset 0.5.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b4fc2c02a7e374099d4ee95a193111f72d2110197fe200272371758f6c3643d8"
@@ -777,8 +777,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum number_prefix 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
"checksum onig 6.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bd91ccd8a02fce2f7e8a86655aec67bc6c171e6f8e704118a0e8c4b866a05a8a"
"checksum onig_sys 69.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3814583fad89f3c60ae0701d80e87e1fd3028741723deda72d0d4a0ecf0cb0db"
-"checksum parking_lot 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "92e98c49ab0b7ce5b222f2cc9193fc4efe11c6d0bd4f648e374684a6857b1cfc"
-"checksum parking_lot_core 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7582838484df45743c8434fbff785e8edf260c28748353d44bc0da32e0ceabf1"
+"checksum parking_lot 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d3a704eb390aafdc107b0e392f56a82b668e3a71366993b5340f5833fd62505e"
+"checksum parking_lot_core 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d58c7c768d4ba344e3e8d72518ac13e259d7c7ade24167003b8488e10b6740a3"
"checksum paste 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "ab4fb1930692d1b6a9cfabdde3d06ea0a7d186518e2f4d67660d8970e2fa647a"
"checksum paste-impl 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "a62486e111e571b1e93b710b61e8f493c0013be39629b714cb166bdb06aa5a8a"
"checksum pkg-config 0.3.17 (registry+https://github.com/rust-lang/crates.io-index)" = "05da548ad6865900e60eaba7f589cc0783590a92e940c26953ff81ddbab2d677"