From 49d11b1f6924beefdae6e8aadfedf274c7617589 Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Mon, 11 Jan 2021 14:33:13 -0500
Subject: [PATCH] Python - Add components getter/setters to BaseTokenizer

---
 .../implementations/base_tokenizer.py         | 45 +++++++++++++++++++
 .../implementations/test_base_tokenizer.py    | 32 +++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 bindings/python/tests/implementations/test_base_tokenizer.py

diff --git a/bindings/python/py_src/tokenizers/implementations/base_tokenizer.py b/bindings/python/py_src/tokenizers/implementations/base_tokenizer.py
index 2fdb490..dc5ea11 100644
--- a/bindings/python/py_src/tokenizers/implementations/base_tokenizer.py
+++ b/bindings/python/py_src/tokenizers/implementations/base_tokenizer.py
@@ -1,4 +1,9 @@
 from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
+from tokenizers.models import Model
+from tokenizers.normalizers import Normalizer
+from tokenizers.pre_tokenizers import PreTokenizer
+from tokenizers.processors import PostProcessor
+from tokenizers.decoders import Decoder
 from typing import List, Union, Tuple, Optional, Dict
 
 
@@ -366,3 +371,43 @@ class BaseTokenizer:
             The resulting Encoding
         """
         return self._tokenizer.post_process(encoding, pair, add_special_tokens)
+
+    @property
+    def model(self) -> Model:
+        return self._tokenizer.model
+
+    @model.setter
+    def model(self, model: Model):
+        self._tokenizer.model = model
+
+    @property
+    def normalizer(self) -> Normalizer:
+        return self._tokenizer.normalizer
+
+    @normalizer.setter
+    def normalizer(self, normalizer: Normalizer):
+        self._tokenizer.normalizer = normalizer
+
+    @property
+    def pre_tokenizer(self) -> PreTokenizer:
+        return self._tokenizer.pre_tokenizer
+
+    @pre_tokenizer.setter
+    def pre_tokenizer(self, pre_tokenizer: PreTokenizer):
+        self._tokenizer.pre_tokenizer = pre_tokenizer
+
+    @property
+    def post_processor(self) -> PostProcessor:
+        return self._tokenizer.post_processor
+
+    @post_processor.setter
+    def post_processor(self, post_processor: PostProcessor):
+        self._tokenizer.post_processor = post_processor
+
+    @property
+    def decoder(self) -> Decoder:
+        return self._tokenizer.decoder
+
+    @decoder.setter
+    def decoder(self, decoder: Decoder):
+        self._tokenizer.decoder = decoder
diff --git a/bindings/python/tests/implementations/test_base_tokenizer.py b/bindings/python/tests/implementations/test_base_tokenizer.py
new file mode 100644
index 0000000..02f66b4
--- /dev/null
+++ b/bindings/python/tests/implementations/test_base_tokenizer.py
@@ -0,0 +1,32 @@
+import pytest
+
+from tokenizers.implementations import BaseTokenizer
+from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, processors, decoders
+
+
+class TestBaseTokenizer:
+    def test_get_set_components(self):
+        toki = Tokenizer(models.BPE())
+        toki.normalizer = normalizers.NFC()
+        toki.pre_tokenizer = pre_tokenizers.ByteLevel()
+        toki.post_processor = processors.BertProcessing(("A", 0), ("B", 1))
+        toki.decoder = decoders.ByteLevel()
+
+        tokenizer = BaseTokenizer(toki)
+
+        assert isinstance(tokenizer.model, models.BPE)
+        assert isinstance(tokenizer.normalizer, normalizers.NFC)
+        assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.ByteLevel)
+        assert isinstance(tokenizer.post_processor, processors.BertProcessing)
+        assert isinstance(tokenizer.decoder, decoders.ByteLevel)
+
+        tokenizer.model = models.Unigram()
+        assert isinstance(tokenizer.model, models.Unigram)
+        tokenizer.normalizer = normalizers.NFD()
+        assert isinstance(tokenizer.normalizer, normalizers.NFD)
+        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
+        assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.Whitespace)
+        tokenizer.post_processor = processors.ByteLevel()
+        assert isinstance(tokenizer.post_processor, processors.ByteLevel)
+        tokenizer.decoder = decoders.WordPiece()
+        assert isinstance(tokenizer.decoder, decoders.WordPiece)
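
Note: for reference, a minimal sketch of how the new accessors are used once
this patch is applied. The setup below is illustrative and not part of the
patch; it assumes a tokenizers install that includes this change.

    from tokenizers import Tokenizer, models, normalizers
    from tokenizers.implementations import BaseTokenizer

    # Wrap a bare Tokenizer in BaseTokenizer, the class the concrete
    # implementations (e.g. ByteLevelBPETokenizer) inherit from.
    tokenizer = BaseTokenizer(Tokenizer(models.BPE()))

    # Read a component through the new property; it is None until one is set.
    print(tokenizer.normalizer)

    # Swap a component through the new setter, without rebuilding
    # the whole tokenizer.
    tokenizer.normalizer = normalizers.NFKC()
    assert isinstance(tokenizer.normalizer, normalizers.NFKC)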