mirror of
https://github.com/hasktorch/tokenizers.git
synced 2024-10-26 14:09:09 +03:00
Python - Update tests for new encode
This commit is contained in:
parent
2e105c4258
commit
dbc8e68c68
@@ -19,7 +19,7 @@ class TestBertProcessing:
|
||||
tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
|
||||
tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))
|
||||
|
||||
output = tokenizer.encode("my name", "pair")
|
||||
output = tokenizer.encode(("my name", "pair"))
|
||||
assert output.tokens == ["[CLS]", "my", "name", "[SEP]", "pair", "[SEP]"]
|
||||
assert output.ids == [1, 2, 3, 0, 6, 0]
|
||||
|
||||
@@ -37,7 +37,7 @@ class TestRobertaProcessing:
|
||||
tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
|
||||
tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0))
|
||||
|
||||
output = tokenizer.encode("my name", "pair")
|
||||
output = tokenizer.encode(("my name", "pair"))
|
||||
assert output.tokens == ["<s>", "my", "name", "</s>", "</s>", "pair", "</s>"]
|
||||
assert output.ids == [0, 2, 3, 1, 1, 6, 1]
|
||||
|
||||
|
@@ -102,9 +102,13 @@ class TestTokenizer:
|
||||
assert type(output.overflowing) == list
|
||||
|
||||
# Can encode a pair of sequences
|
||||
output = tokenizer.encode("my name is john", "pair")
|
||||
output = tokenizer.encode(("my name is john", "pair"))
|
||||
assert output.tokens == ["my", "name", "is", "john", "pair"]
|
||||
|
||||
# Can encode a single pre-tokenized sequence
|
||||
output = tokenizer.encode(["my", "name", "is", "john"])
|
||||
assert output.tokens == ["my", "name", "is", "john"]
|
||||
|
||||
# Can encode a batch with both a single sequence and a pair of sequences
|
||||
output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
|
||||
assert len(output) == 2
|
||||
@@ -136,7 +140,7 @@ class TestTokenizer:
|
||||
assert output.tokens == ["my", "name"]
|
||||
|
||||
# Can truncate pair sequences as well
|
||||
output = tokenizer.encode("my name is john", "pair")
|
||||
output = tokenizer.encode(("my name is john", "pair"))
|
||||
assert output.tokens == ["my", "pair"]
|
||||
|
||||
def test_padding(self):
|
||||
@@ -156,7 +160,7 @@ class TestTokenizer:
|
||||
tokenizer.enable_padding(max_length=4)
|
||||
output = tokenizer.encode("my name")
|
||||
assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
|
||||
output = tokenizer.encode("my name", "pair")
|
||||
output = tokenizer.encode(("my name", "pair"))
|
||||
assert output.tokens == ["my", "name", "pair", "[PAD]"]
|
||||
|
||||
def test_decode(self):
|
||||
|
@@ -7,14 +7,14 @@ class TestBertWordPieceBPE:
|
||||
tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
|
||||
|
||||
# Encode with special tokens by default
|
||||
output = tokenizer.encode("My name is John", "pair")
|
||||
output = tokenizer.encode(("My name is John", "pair"))
|
||||
assert output.ids == [101, 2026, 2171, 2003, 2198, 102, 3940, 102]
|
||||
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
|
||||
assert output.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 15), (0, 0), (0, 4), (0, 0)]
|
||||
assert output.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]
|
||||
|
||||
# Can encode without the special tokens
|
||||
output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
|
||||
output = tokenizer.encode(("My name is John", "pair"), add_special_tokens=False)
|
||||
assert output.ids == [2026, 2171, 2003, 2198, 3940]
|
||||
assert output.tokens == ["my", "name", "is", "john", "pair"]
|
||||
assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
|
||||
|
@@ -6,7 +6,7 @@ class TestBertWordPieceBPE:
|
||||
def test_basic_encode(self, openai_files):
|
||||
tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"])
|
||||
|
||||
output = tokenizer.encode("My name is John", "pair")
|
||||
output = tokenizer.encode(("My name is John", "pair"))
|
||||
assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688]
|
||||
assert output.tokens == [
|
||||
"<unk>",
|
||||
@@ -32,7 +32,7 @@ class TestBertWordPieceBPE:
|
||||
|
||||
def test_lowercase(self, openai_files):
|
||||
tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
|
||||
output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
|
||||
output = tokenizer.encode(("My name is John", "pair"), add_special_tokens=False)
|
||||
assert output.ids == [547, 1362, 544, 2476, 2688]
|
||||
assert output.tokens == ["my</w>", "name</w>", "is</w>", "john</w>", "pair</w>"]
|
||||
assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
|
||||
|
Loading…
Reference in New Issue
Block a user