Python - Add bert wordpiece training example

This commit is contained in:
Anthony MOI 2020-01-03 16:51:39 -05:00
parent 6e3efe8954
commit fab4e96b51
No known key found for this signature in database
GPG Key ID: CB646B1164C636A0
3 changed files with 65 additions and 1 deletions

View File

@ -0,0 +1,57 @@
import argparse
import glob
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, normalizers
# Example script: train a WordPiece vocabulary with BERT-style normalization
# and pre-tokenization, then write the resulting model files to disk.

# --- Command-line interface -------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument("--files",
                    metavar="path",
                    type=str,
                    required=True,
                    help="The files to use as training; accept '**/*.txt' type of patterns "
                         "if enclosed in quotes")
parser.add_argument("--out",
                    default="./",
                    type=str,
                    help="Path to the output directory, where the files will be saved")
parser.add_argument("--name",
                    default="bert-wordpiece",
                    type=str,
                    help="The name of the output vocab files")
args = parser.parse_args()

# Expand the pattern ourselves (the user is told to quote it so the shell does
# not). `recursive=True` is required for `**` to descend into nested
# directories; without it `**` behaves like a plain `*`, contradicting the
# `--files` help text.
files = glob.glob(args.files, recursive=True)
if not files:
    print(f"File does not exist: {args.files}")
    # `SystemExit` is a builtin; the bare `exit()` helper is only injected by
    # the `site` module and is not guaranteed to exist in every interpreter.
    raise SystemExit(1)

# --- Tokenizer pipeline -----------------------------------------------------
# Initialize an empty tokenizer
tokenizer = Tokenizer(models.WordPiece.empty())

# Customize all the steps
tokenizer.with_normalizer(normalizers.BertNormalizer.new(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
))
tokenizer.with_pre_tokenizer(pre_tokenizers.BertPreTokenizer.new())
tokenizer.with_decoder(decoders.WordPiece.new())

# --- Training ---------------------------------------------------------------
# NOTE(review): confirm these special tokens include whatever the WordPiece
# model uses as its unknown token -- that cannot be verified from this script.
trainer = trainers.WordPieceTrainer.new(
    vocab_size=50000,
    min_frequency=2,
    show_progress=True,
    special_tokens=[ "<s>", "<unk>", "<pad>", "</s>" ],
    limit_alphabet=1000,
    continuing_subword_prefix="##"
)
tokenizer.train(trainer, files)

# Save the files
tokenizer.model.save(args.out, args.name)

View File

@ -40,7 +40,7 @@ trainer = trainers.BpeTrainer.new(
vocab_size=50000,
min_frequency=2,
show_progress=True,
special_tokens=[ "<s>", "<pad>", "</s" ],
special_tokens=[ "<s>", "<pad>", "</s>" ],
initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
)
tokenizer.train(trainer, files)

View File

@ -133,4 +133,11 @@ impl WordPiece {
}),
}
}
/// Creates a `Model` backed by a default-constructed WordPiece model.
///
/// The inner `tk::models::wordpiece::WordPiece` is boxed and stored in the
/// `Owned` variant of `Container`.
#[staticmethod]
fn empty() -> Model {
    let wordpiece = tk::models::wordpiece::WordPiece::default();
    Model {
        model: Container::Owned(Box::new(wordpiece)),
    }
}
}