Python - Add component getters/setters to BaseTokenizer

Anthony MOI 2021-01-11 14:33:13 -05:00 committed by Anthony MOI
parent 65b91966f7
commit 49d11b1f69
2 changed files with 77 additions and 0 deletions


@@ -1,4 +1,9 @@
from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
from tokenizers.models import Model
from tokenizers.normalizers import Normalizer
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.processors import PostProcessor
from tokenizers.decoders import Decoder
from typing import List, Union, Tuple, Optional, Dict
@@ -366,3 +371,43 @@ class BaseTokenizer:
            The resulting Encoding
        """
        return self._tokenizer.post_process(encoding, pair, add_special_tokens)

    @property
    def model(self) -> Model:
        return self._tokenizer.model

    @model.setter
    def model(self, model: Model):
        self._tokenizer.model = model

    @property
    def normalizer(self) -> Normalizer:
        return self._tokenizer.normalizer

    @normalizer.setter
    def normalizer(self, normalizer: Normalizer):
        self._tokenizer.normalizer = normalizer

    @property
    def pre_tokenizer(self) -> PreTokenizer:
        return self._tokenizer.pre_tokenizer

    @pre_tokenizer.setter
    def pre_tokenizer(self, pre_tokenizer: PreTokenizer):
        self._tokenizer.pre_tokenizer = pre_tokenizer

    @property
    def post_processor(self) -> PostProcessor:
        return self._tokenizer.post_processor

    @post_processor.setter
    def post_processor(self, post_processor: PostProcessor):
        self._tokenizer.post_processor = post_processor

    @property
    def decoder(self) -> Decoder:
        return self._tokenizer.decoder

    @decoder.setter
    def decoder(self, decoder: Decoder):
        self._tokenizer.decoder = decoder
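Taken together, these properties expose each pipeline component of the wrapped Tokenizer for both reading and in-place replacement. A minimal usage sketch (illustration only, not part of this commit; it assumes only the accessors added above plus the existing normalizers.Lowercase and pre_tokenizers.Whitespace classes):

# Usage sketch for the new accessors (not from this diff).
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers
from tokenizers.implementations import BaseTokenizer

tokenizer = BaseTokenizer(Tokenizer(models.BPE()))

# Read a component through its property...
print(tokenizer.model)

# ...or swap components without rebuilding the whole tokenizer.
tokenizer.normalizer = normalizers.Lowercase()
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

Each setter simply forwards the assignment to the underlying Tokenizer, so the change takes effect on the next encode call.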


@@ -0,0 +1,32 @@
import pytest

from tokenizers.implementations import BaseTokenizer
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, processors, decoders


class TestBaseTokenizer:
    def test_get_set_components(self):
        toki = Tokenizer(models.BPE())
        toki.normalizer = normalizers.NFC()
        toki.pre_tokenizer = pre_tokenizers.ByteLevel()
        toki.post_processor = processors.BertProcessing(("A", 0), ("B", 1))
        toki.decoder = decoders.ByteLevel()
        tokenizer = BaseTokenizer(toki)

        assert isinstance(tokenizer.model, models.BPE)
        assert isinstance(tokenizer.normalizer, normalizers.NFC)
        assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.ByteLevel)
        assert isinstance(tokenizer.post_processor, processors.BertProcessing)
        assert isinstance(tokenizer.decoder, decoders.ByteLevel)

        tokenizer.model = models.Unigram()
        assert isinstance(tokenizer.model, models.Unigram)
        tokenizer.normalizer = normalizers.NFD()
        assert isinstance(tokenizer.normalizer, normalizers.NFD)
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.Whitespace)
        tokenizer.post_processor = processors.ByteLevel()
        assert isinstance(tokenizer.post_processor, processors.ByteLevel)
        tokenizer.decoder = decoders.WordPiece()
        assert isinstance(tokenizer.decoder, decoders.WordPiece)
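The test only checks types; to see that a setter actually changes behavior, here is a sketch outside the test suite. It assumes a WordLevel model built from a tiny inline vocabulary (the vocab, model choice, and example string are not from the commit):

# Sketch: assigning a new pre_tokenizer through the property changes
# encoding behavior immediately.
from tokenizers import Tokenizer, models, pre_tokenizers
from tokenizers.implementations import BaseTokenizer

vocab = {"hello": 0, "world": 1, "[UNK]": 2}
tokenizer = BaseTokenizer(Tokenizer(models.WordLevel(vocab, unk_token="[UNK]")))

tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
print(tokenizer.encode("hello world").tokens)  # ['hello', 'world']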