start work on tokenizer model artifact

This commit is contained in:
austinvhuang 2021-01-10 12:26:37 -05:00 committed by Torsten Scholak
parent a54aef4247
commit 193bfaacfd
No known key found for this signature in database
GPG Key ID: EF135E6C40866D80
3 changed files with 31 additions and 9 deletions

View File

@ -3,6 +3,7 @@ all: download run
clean:
rm -f roberta-base-vocab.json*
rm -f roberta-base-merges.txt*
rm -f roberta-base-tokenizer.json*
download: clean
./get_roberta.sh

View File

@ -2,5 +2,6 @@
# https://github.com/huggingface/transformers/issues/1083
# https://huggingface.co/transformers/v1.1.0/_modules/pytorch_transformers/tokenization_roberta.html
wget "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json"
wget "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt"
wget https://huggingface.co/roberta-base/resolve/main/vocab.json -O roberta-base-vocab.json
wget https://huggingface.co/roberta-base/resolve/main/merges.txt -O roberta-base-merges.txt
wget https://huggingface.co/roberta-base/resolve/main/tokenizer.json -O roberta-base-tokenizer.json

View File

@ -1,21 +1,41 @@
use std::ffi::CStr;
use std::os::raw::c_char;
use tokenizers::
use tokenizers::models::bpe::BpeBuilder;
use tokenizers::models::bpe::BPE;
// use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::tokenizer::{Tokenizer};
use tokenizers::tokenizer::Tokenizer;
/// Exported C-ABI stub; presumably intended to construct a RoBERTa tokenizer
/// (inferred from the name only — TODO confirm).
///
/// The body is currently empty: the function takes no arguments, performs no
/// work, and returns nothing.
#[no_mangle]
pub extern "C" fn mk_roberta_tokenizer() {
}
/// Creates a heap-allocated [`BpeBuilder`] from a vocabulary file and a merges file.
///
/// `cvocab` and `cmerges` are C strings that are decoded as UTF-8 and passed,
/// in that order, to `BPE::from_file`. The resulting builder is boxed and
/// handed back as a raw pointer via `Box::into_raw`, so Rust will not drop it
/// automatically once this function returns.
///
/// # Panics
///
/// Panics with "Unable to read parameters." when either pointer is null or
/// when either string is not valid UTF-8.
///
/// # Safety
///
/// Every non-null pointer must reference a NUL-terminated string that stays
/// valid and unmodified for the duration of the call.
#[no_mangle]
pub extern "C" fn mk_bpe_builder_from_files(
    cvocab: *const c_char,
    cmerges: *const c_char,
) -> *mut BpeBuilder {
    // `CStr::from_ptr` on a null pointer is undefined behaviour, so null
    // arguments are rejected through the same failure path as undecodable ones.
    if cvocab.is_null() || cmerges.is_null() {
        panic!("Unable to read parameters.");
    }

    // SAFETY: both pointers were checked for null above; the caller guarantees
    // they point to NUL-terminated strings that remain valid during this call.
    let (vocab, merges) = unsafe { (CStr::from_ptr(cvocab), CStr::from_ptr(cmerges)) };

    match (vocab.to_str(), merges.to_str()) {
        (Ok(vocab_file), Ok(merges_file)) => {
            Box::into_raw(Box::new(BPE::from_file(vocab_file, merges_file)))
        }
        _ => panic!("Unable to read parameters."),
    }
}
#[no_mangle]
pub extern "C" fn mk_tokenizer(cvocab: *const c_char, cmerges: *const c_char) -> *mut Tokenizer {
unsafe {
let vocab = CStr::from_ptr(cvocab);
let merges = CStr::from_ptr(cmerges);
if let (Ok(vocab_file), Ok(merges_file)) = (vocab.to_str(), merges.to_str()) {
let bpe_builder = BPE::from_file(vocab_file, merges_file);
let bpe = bpe_builder
.dropout(0.1)
.build().unwrap();
return Box::into_raw(Box::new(Tokenizer::new(bpe)))
if let (Ok(vocab_file), Ok(merges_file)) = (vocab.to_str(), merges.to_str()) {
let bpe_builder = BPE::from_file(vocab_file, merges_file);
let bpe = bpe_builder.build().unwrap();
return Box::into_raw(Box::new(Tokenizer::new(bpe)));
} else {
panic!("Unable to read parameters.");
}