mirror of https://github.com/hasktorch/tokenizers.git
synced 2024-10-26 14:09:09 +03:00

Python - Add components getter/setters to BaseTokenizer

commit 49d11b1f69
parent 65b91966f7
@@ -1,4 +1,9 @@
from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
from tokenizers.models import Model
from tokenizers.normalizers import Normalizer
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.processors import PostProcessor
from tokenizers.decoders import Decoder

from typing import List, Union, Tuple, Optional, Dict
@@ -366,3 +371,43 @@ class BaseTokenizer:
            The resulting Encoding
        """
        return self._tokenizer.post_process(encoding, pair, add_special_tokens)

    @property
    def model(self) -> Model:
        return self._tokenizer.model

    @model.setter
    def model(self, model: Model):
        self._tokenizer.model = model

    @property
    def normalizer(self) -> Normalizer:
        return self._tokenizer.normalizer

    @normalizer.setter
    def normalizer(self, normalizer: Normalizer):
        self._tokenizer.normalizer = normalizer

    @property
    def pre_tokenizer(self) -> PreTokenizer:
        return self._tokenizer.pre_tokenizer

    @pre_tokenizer.setter
    def pre_tokenizer(self, pre_tokenizer: PreTokenizer):
        self._tokenizer.pre_tokenizer = pre_tokenizer

    @property
    def post_processor(self) -> PostProcessor:
        return self._tokenizer.post_processor

    @post_processor.setter
    def post_processor(self, post_processor: PostProcessor):
        self._tokenizer.post_processor = post_processor

    @property
    def decoder(self) -> Decoder:
        return self._tokenizer.decoder

    @decoder.setter
    def decoder(self, decoder: Decoder):
        self._tokenizer.decoder = decoder
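A minimal usage sketch of the new accessors (not part of the commit; the component classes used here are the ones exercised in the test below):

    from tokenizers import Tokenizer, models, normalizers, decoders
    from tokenizers.implementations import BaseTokenizer

    # Wrap a bare Tokenizer; BaseTokenizer now exposes its components directly.
    tokenizer = BaseTokenizer(Tokenizer(models.BPE()))

    # Getters read through to the wrapped Tokenizer...
    assert isinstance(tokenizer.model, models.BPE)

    # ...and setters swap a component in place, without rebuilding the tokenizer.
    tokenizer.normalizer = normalizers.NFC()
    tokenizer.decoder = decoders.ByteLevel()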
bindings/python/tests/implementations/test_base_tokenizer.py (new file, 32 lines)
@@ -0,0 +1,32 @@
import pytest

from tokenizers.implementations import BaseTokenizer
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, processors, decoders


class TestBaseTokenizer:
    def test_get_set_components(self):
        toki = Tokenizer(models.BPE())
        toki.normalizer = normalizers.NFC()
        toki.pre_tokenizer = pre_tokenizers.ByteLevel()
        toki.post_processor = processors.BertProcessing(("A", 0), ("B", 1))
        toki.decoder = decoders.ByteLevel()

        tokenizer = BaseTokenizer(toki)

        assert isinstance(tokenizer.model, models.BPE)
        assert isinstance(tokenizer.normalizer, normalizers.NFC)
        assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.ByteLevel)
        assert isinstance(tokenizer.post_processor, processors.BertProcessing)
        assert isinstance(tokenizer.decoder, decoders.ByteLevel)

        tokenizer.model = models.Unigram()
        assert isinstance(tokenizer.model, models.Unigram)
        tokenizer.normalizer = normalizers.NFD()
        assert isinstance(tokenizer.normalizer, normalizers.NFD)
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.Whitespace)
        tokenizer.post_processor = processors.ByteLevel()
        assert isinstance(tokenizer.post_processor, processors.ByteLevel)
        tokenizer.decoder = decoders.WordPiece()
        assert isinstance(tokenizer.decoder, decoders.WordPiece)
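One design note, with a small sketch (not part of the commit): because each property delegates to the wrapped Tokenizer via self._tokenizer rather than copying state, a component set through the BaseTokenizer wrapper is also visible on the underlying Tokenizer instance:

    from tokenizers import Tokenizer, models, normalizers
    from tokenizers.implementations import BaseTokenizer

    raw = Tokenizer(models.BPE())
    wrapped = BaseTokenizer(raw)

    # The setter assigns onto raw through self._tokenizer, so both handles agree.
    wrapped.normalizer = normalizers.NFC()
    assert isinstance(raw.normalizer, normalizers.NFC)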